summaryrefslogtreecommitdiffstats
path: root/src/os/bluestore/BlueStore.cc
diff options
context:
space:
mode:
Diffstat (limited to 'src/os/bluestore/BlueStore.cc')
-rw-r--r--src/os/bluestore/BlueStore.cc19631
1 files changed, 19631 insertions, 0 deletions
diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc
new file mode 100644
index 000000000..aa14d0204
--- /dev/null
+++ b/src/os/bluestore/BlueStore.cc
@@ -0,0 +1,19631 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <bit>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <algorithm>
+
+#include <boost/container/flat_set.hpp>
+#include <boost/algorithm/string.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real.hpp>
+
+#include "include/cpp-btree/btree_set.h"
+
+#include "BlueStore.h"
+#include "bluestore_common.h"
+#include "simple_bitmap.h"
+#include "os/kv.h"
+#include "include/compat.h"
+#include "include/intarith.h"
+#include "include/stringify.h"
+#include "include/str_map.h"
+#include "include/util.h"
+#include "common/errno.h"
+#include "common/safe_io.h"
+#include "common/PriorityCache.h"
+#include "common/url_escape.h"
+#include "Allocator.h"
+#include "FreelistManager.h"
+#include "BlueFS.h"
+#include "BlueRocksEnv.h"
+#include "auth/Crypto.h"
+#include "common/EventTrace.h"
+#include "perfglue/heap_profiler.h"
+#include "common/blkdev.h"
+#include "common/numa.h"
+#include "common/pretty_binary.h"
+#include "kv/KeyValueHistogram.h"
+
+#ifdef HAVE_LIBZBD
+#include "ZonedAllocator.h"
+#include "ZonedFreelistManager.h"
+#endif
+
+#if defined(WITH_LTTNG)
+#define TRACEPOINT_DEFINE
+#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#include "tracing/bluestore.h"
+#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#undef TRACEPOINT_DEFINE
+#else
+#define tracepoint(...)
+#endif
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_bluestore
+
+using bid_t = decltype(BlueStore::Blob::id);
+
+// bluestore_cache_onode
+MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Onode, bluestore_onode,
+ bluestore_cache_onode);
+
+MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Buffer, bluestore_buffer,
+ bluestore_cache_buffer);
+MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Extent, bluestore_extent,
+ bluestore_extent);
+MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Blob, bluestore_blob,
+ bluestore_blob);
+MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::SharedBlob, bluestore_shared_blob,
+ bluestore_shared_blob);
+
+// bluestore_txc
+MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::TransContext, bluestore_transcontext,
+ bluestore_txc);
+using std::byte;
+using std::deque;
+using std::min;
+using std::make_pair;
+using std::numeric_limits;
+using std::pair;
+using std::less;
+using std::list;
+using std::make_unique;
+using std::map;
+using std::max;
+using std::ostream;
+using std::ostringstream;
+using std::set;
+using std::string;
+using std::stringstream;
+using std::unique_ptr;
+using std::vector;
+
+using ceph::bufferlist;
+using ceph::bufferptr;
+using ceph::coarse_mono_clock;
+using ceph::decode;
+using ceph::encode;
+using ceph::Formatter;
+using ceph::JSONFormatter;
+using ceph::make_timespan;
+using ceph::mono_clock;
+using ceph::mono_time;
+using ceph::timespan_str;
+
+// kv store prefixes
+const string PREFIX_SUPER = "S"; // field -> value
+const string PREFIX_STAT = "T"; // field -> value(int64 array)
+const string PREFIX_COLL = "C"; // collection name -> cnode_t
+const string PREFIX_OBJ = "O"; // object name -> onode_t
+const string PREFIX_OMAP = "M"; // u64 + keyname -> value
+const string PREFIX_PGMETA_OMAP = "P"; // u64 + keyname -> value(for meta coll)
+const string PREFIX_PERPOOL_OMAP = "m"; // s64 + u64 + keyname -> value
+const string PREFIX_PERPG_OMAP = "p"; // u64(pool) + u32(hash) + u64(id) + keyname -> value
+const string PREFIX_DEFERRED = "L"; // id -> deferred_transaction_t
+const string PREFIX_ALLOC = "B"; // u64 offset -> u64 length (freelist)
+const string PREFIX_ALLOC_BITMAP = "b";// (see BitmapFreelistManager)
+const string PREFIX_SHARED_BLOB = "X"; // u64 SB id -> shared_blob_t
+
+#ifdef HAVE_LIBZBD
+const string PREFIX_ZONED_FM_META = "Z"; // (see ZonedFreelistManager)
+const string PREFIX_ZONED_FM_INFO = "z"; // (see ZonedFreelistManager)
+const string PREFIX_ZONED_CL_INFO = "G"; // (per-zone cleaner metadata)
+#endif
+
+const string BLUESTORE_GLOBAL_STATFS_KEY = "bluestore_statfs";
+
+// write a label in the first block. always use this size. note that
+// bluefs makes a matching assumption about the location of its
+// superblock (always the second block of the device).
+#define BDEV_LABEL_BLOCK_SIZE 4096
+
+// reserve: label (4k) + bluefs super (4k), which means we start at 8k.
+#define SUPER_RESERVED 8192
+
+#define OBJECT_MAX_SIZE 0xffffffff // 32 bits
+
+
+/*
+ * extent map blob encoding
+ *
+ * we use the low bits of the blobid field to indicate some common scenarios
+ * and spanning vs local ids. See ExtentMap::{encode,decode}_some().
+ */
+#define BLOBID_FLAG_CONTIGUOUS 0x1 // this extent starts at end of previous
+#define BLOBID_FLAG_ZEROOFFSET 0x2 // blob_offset is 0
+#define BLOBID_FLAG_SAMELENGTH 0x4 // length matches previous extent
+#define BLOBID_FLAG_SPANNING 0x8 // has spanning blob id
+#define BLOBID_SHIFT_BITS 4
+
+/*
+ * object name key structure
+ *
+ * encoded u8: shard + 2^7 (so that it sorts properly)
+ * encoded u64: poolid + 2^63 (so that it sorts properly)
+ * encoded u32: hash (bit reversed)
+ *
+ * escaped string: namespace
+ *
+ * escaped string: key or object name
+ * 1 char: '<', '=', or '>'. if =, then object key == object name, and
+ * we are done. otherwise, we are followed by the object name.
+ * escaped string: object name (unless '=' above)
+ *
+ * encoded u64: snap
+ * encoded u64: generation
+ * 'o'
+ */
+#define ONODE_KEY_SUFFIX 'o'
+
+/*
+ * extent shard key
+ *
+ * object prefix key
+ * u32
+ * 'x'
+ */
+#define EXTENT_SHARD_KEY_SUFFIX 'x'
+
+/*
+ * string encoding in the key
+ *
+ * The key string needs to lexicographically sort the same way that
+ * ghobject_t does. We do this by escaping anything <= to '#' with #
+ * plus a 2 digit hex string, and anything >= '~' with ~ plus the two
+ * hex digits.
+ *
+ * We use ! as a terminator for strings; this works because it is < #
+ * and will get escaped if it is present in the string.
+ *
+ * NOTE: There is a bug in this implementation: due to implicit
+ * character type conversion in comparison it may produce unexpected
+ * ordering. Unfortunately fixing the bug would mean invalidating the
+ * keys in existing deployments. Instead we do additional sorting
+ * where it is needed.
+ */
+template<typename S>
+static void append_escaped(const string &in, S *out)
+{
+ char hexbyte[in.length() * 3 + 1];
+ char* ptr = &hexbyte[0];
+ for (string::const_iterator i = in.begin(); i != in.end(); ++i) {
+ if (*i <= '#') { // bug: unexpected result for *i > 0x7f
+ *ptr++ = '#';
+ *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
+ *ptr++ = "0123456789abcdef"[*i & 0x0f];
+ } else if (*i >= '~') { // bug: unexpected result for *i > 0x7f
+ *ptr++ = '~';
+ *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
+ *ptr++ = "0123456789abcdef"[*i & 0x0f];
+ } else {
+ *ptr++ = *i;
+ }
+ }
+ *ptr++ = '!';
+ out->append(hexbyte, ptr - &hexbyte[0]);
+}
+
+inline unsigned h2i(char c)
+{
+ if ((c >= '0') && (c <= '9')) {
+ return c - 0x30;
+ } else if ((c >= 'a') && (c <= 'f')) {
+ return c - 'a' + 10;
+ } else if ((c >= 'A') && (c <= 'F')) {
+ return c - 'A' + 10;
+ } else {
+ return 256; // make it always larger than 255
+ }
+}
+
+static int decode_escaped(const char *p, string *out)
+{
+ char buff[256];
+ char* ptr = &buff[0];
+ char* max = &buff[252];
+ const char *orig_p = p;
+ while (*p && *p != '!') {
+ if (*p == '#' || *p == '~') {
+ unsigned hex = 0;
+ p++;
+ hex = h2i(*p++) << 4;
+ if (hex > 255) {
+ return -EINVAL;
+ }
+ hex |= h2i(*p++);
+ if (hex > 255) {
+ return -EINVAL;
+ }
+ *ptr++ = hex;
+ } else {
+ *ptr++ = *p++;
+ }
+ if (ptr > max) {
+ out->append(buff, ptr-buff);
+ ptr = &buff[0];
+ }
+ }
+ if (ptr != buff) {
+ out->append(buff, ptr-buff);
+ }
+ return p - orig_p;
+}
+
+template<typename T>
+static void _key_encode_shard(shard_id_t shard, T *key)
+{
+ key->push_back((char)((uint8_t)shard.id + (uint8_t)0x80));
+}
+
+static const char *_key_decode_shard(const char *key, shard_id_t *pshard)
+{
+ pshard->id = (uint8_t)*key - (uint8_t)0x80;
+ return key + 1;
+}
+
+static void get_coll_range(const coll_t& cid, int bits,
+ ghobject_t *temp_start, ghobject_t *temp_end,
+ ghobject_t *start, ghobject_t *end, bool legacy)
+{
+ spg_t pgid;
+ constexpr uint32_t MAX_HASH = std::numeric_limits<uint32_t>::max();
+ // use different nspaces due to we use different schemes when encoding
+ // keys for listing objects
+ const std::string_view MAX_NSPACE = legacy ? "\x7f" : "\xff";
+ if (cid.is_pg(&pgid)) {
+ start->shard_id = pgid.shard;
+ *temp_start = *start;
+
+ start->hobj.pool = pgid.pool();
+ temp_start->hobj.pool = -2ll - pgid.pool();
+
+ *end = *start;
+ *temp_end = *temp_start;
+
+ uint32_t reverse_hash = hobject_t::_reverse_bits(pgid.ps());
+ start->hobj.set_bitwise_key_u32(reverse_hash);
+ temp_start->hobj.set_bitwise_key_u32(reverse_hash);
+
+ uint64_t end_hash = reverse_hash + (1ull << (32 - bits));
+ if (end_hash > MAX_HASH) {
+ // make sure end hobj is even greater than the maximum possible hobj
+ end->hobj.set_bitwise_key_u32(MAX_HASH);
+ temp_end->hobj.set_bitwise_key_u32(MAX_HASH);
+ end->hobj.nspace = MAX_NSPACE;
+ } else {
+ end->hobj.set_bitwise_key_u32(end_hash);
+ temp_end->hobj.set_bitwise_key_u32(end_hash);
+ }
+ } else {
+ start->shard_id = shard_id_t::NO_SHARD;
+ start->hobj.pool = -1ull;
+
+ *end = *start;
+ start->hobj.set_bitwise_key_u32(0);
+ end->hobj.set_bitwise_key_u32(MAX_HASH);
+ end->hobj.nspace = MAX_NSPACE;
+ // no separate temp section
+ *temp_start = *end;
+ *temp_end = *end;
+ }
+
+ start->generation = 0;
+ end->generation = 0;
+ temp_start->generation = 0;
+ temp_end->generation = 0;
+}
+
+static void get_shared_blob_key(uint64_t sbid, string *key)
+{
+ key->clear();
+ _key_encode_u64(sbid, key);
+}
+
+static int get_key_shared_blob(const string& key, uint64_t *sbid)
+{
+ const char *p = key.c_str();
+ if (key.length() < sizeof(uint64_t))
+ return -1;
+ _key_decode_u64(p, sbid);
+ return 0;
+}
+
+template<typename S>
+static void _key_encode_prefix(const ghobject_t& oid, S *key)
+{
+ _key_encode_shard(oid.shard_id, key);
+ _key_encode_u64(oid.hobj.pool + 0x8000000000000000ull, key);
+ _key_encode_u32(oid.hobj.get_bitwise_key_u32(), key);
+}
+
+static const char *_key_decode_prefix(const char *p, ghobject_t *oid)
+{
+ p = _key_decode_shard(p, &oid->shard_id);
+
+ uint64_t pool;
+ p = _key_decode_u64(p, &pool);
+ oid->hobj.pool = pool - 0x8000000000000000ull;
+
+ unsigned hash;
+ p = _key_decode_u32(p, &hash);
+
+ oid->hobj.set_bitwise_key_u32(hash);
+
+ return p;
+}
+
+
+#define ENCODED_KEY_PREFIX_LEN (1 + 8 + 4)
+
+static int _get_key_object(const char *p, ghobject_t *oid)
+{
+ int r;
+
+ p = _key_decode_prefix(p, oid);
+
+ r = decode_escaped(p, &oid->hobj.nspace);
+ if (r < 0)
+ return -2;
+ p += r + 1;
+
+ string k;
+ r = decode_escaped(p, &k);
+ if (r < 0)
+ return -3;
+ p += r + 1;
+ if (*p == '=') {
+ // no key
+ ++p;
+ oid->hobj.oid.name = k;
+ } else if (*p == '<' || *p == '>') {
+ // key + name
+ ++p;
+ r = decode_escaped(p, &oid->hobj.oid.name);
+ if (r < 0)
+ return -5;
+ p += r + 1;
+ oid->hobj.set_key(k);
+ } else {
+ // malformed
+ return -6;
+ }
+
+ p = _key_decode_u64(p, &oid->hobj.snap.val);
+ p = _key_decode_u64(p, &oid->generation);
+
+ if (*p != ONODE_KEY_SUFFIX) {
+ return -7;
+ }
+ p++;
+ if (*p) {
+ // if we get something other than a null terminator here,
+ // something goes wrong.
+ return -8;
+ }
+
+ return 0;
+}
+
+template<typename S>
+static int get_key_object(const S& key, ghobject_t *oid)
+{
+ if (key.length() < ENCODED_KEY_PREFIX_LEN)
+ return -1;
+ if (key.length() == ENCODED_KEY_PREFIX_LEN)
+ return -2;
+ const char *p = key.c_str();
+ return _get_key_object(p, oid);
+}
+
+template<typename S>
+static void _get_object_key(const ghobject_t& oid, S *key)
+{
+ size_t max_len = ENCODED_KEY_PREFIX_LEN +
+ (oid.hobj.nspace.length() * 3 + 1) +
+ (oid.hobj.get_key().length() * 3 + 1) +
+ 1 + // for '<', '=', or '>'
+ (oid.hobj.oid.name.length() * 3 + 1) +
+ 8 + 8 + 1;
+ key->reserve(max_len);
+
+ _key_encode_prefix(oid, key);
+
+ append_escaped(oid.hobj.nspace, key);
+
+ if (oid.hobj.get_key().length()) {
+ // is a key... could be < = or >.
+ append_escaped(oid.hobj.get_key(), key);
+ // (ASCII chars < = and > sort in that order, yay)
+ int r = oid.hobj.get_key().compare(oid.hobj.oid.name);
+ if (r) {
+ key->append(r > 0 ? ">" : "<");
+ append_escaped(oid.hobj.oid.name, key);
+ } else {
+ // same as no key
+ key->append("=");
+ }
+ } else {
+ // no key
+ append_escaped(oid.hobj.oid.name, key);
+ key->append("=");
+ }
+
+ _key_encode_u64(oid.hobj.snap, key);
+ _key_encode_u64(oid.generation, key);
+
+ key->push_back(ONODE_KEY_SUFFIX);
+}
+
+template<typename S>
+static void get_object_key(CephContext *cct, const ghobject_t& oid, S *key)
+{
+ key->clear();
+ _get_object_key(oid, key);
+
+ // sanity check
+ if (true) {
+ ghobject_t t;
+ int r = get_key_object(*key, &t);
+ if (r || t != oid) {
+ derr << " r " << r << dendl;
+ derr << "key " << pretty_binary_string(*key) << dendl;
+ derr << "oid " << oid << dendl;
+ derr << " t " << t << dendl;
+ ceph_assert(r == 0 && t == oid);
+ }
+ }
+}
+
+// extent shard keys are the onode key, plus a u32, plus 'x'. the trailing
+// char lets us quickly test whether it is a shard key without decoding any
+// of the prefix bytes.
+template<typename S>
+static void get_extent_shard_key(const S& onode_key, uint32_t offset,
+ string *key)
+{
+ key->clear();
+ key->reserve(onode_key.length() + 4 + 1);
+ key->append(onode_key.c_str(), onode_key.size());
+ _key_encode_u32(offset, key);
+ key->push_back(EXTENT_SHARD_KEY_SUFFIX);
+}
+
+static void rewrite_extent_shard_key(uint32_t offset, string *key)
+{
+ ceph_assert(key->size() > sizeof(uint32_t) + 1);
+ ceph_assert(*key->rbegin() == EXTENT_SHARD_KEY_SUFFIX);
+ _key_encode_u32(offset, key->size() - sizeof(uint32_t) - 1, key);
+}
+
+template<typename S>
+static void generate_extent_shard_key_and_apply(
+ const S& onode_key,
+ uint32_t offset,
+ string *key,
+ std::function<void(const string& final_key)> apply)
+{
+ if (key->empty()) { // make full key
+ ceph_assert(!onode_key.empty());
+ get_extent_shard_key(onode_key, offset, key);
+ } else {
+ rewrite_extent_shard_key(offset, key);
+ }
+ apply(*key);
+}
+
+int get_key_extent_shard(const string& key, string *onode_key, uint32_t *offset)
+{
+ ceph_assert(key.size() > sizeof(uint32_t) + 1);
+ ceph_assert(*key.rbegin() == EXTENT_SHARD_KEY_SUFFIX);
+ int okey_len = key.size() - sizeof(uint32_t) - 1;
+ *onode_key = key.substr(0, okey_len);
+ const char *p = key.data() + okey_len;
+ _key_decode_u32(p, offset);
+ return 0;
+}
+
+static bool is_extent_shard_key(const string& key)
+{
+ return *key.rbegin() == EXTENT_SHARD_KEY_SUFFIX;
+}
+
+static void get_deferred_key(uint64_t seq, string *out)
+{
+ _key_encode_u64(seq, out);
+}
+
+static void get_pool_stat_key(int64_t pool_id, string *key)
+{
+ key->clear();
+ _key_encode_u64(pool_id, key);
+}
+
+static int get_key_pool_stat(const string& key, uint64_t* pool_id)
+{
+ const char *p = key.c_str();
+ if (key.length() < sizeof(uint64_t))
+ return -1;
+ _key_decode_u64(p, pool_id);
+ return 0;
+}
+
+#ifdef HAVE_LIBZBD
+static void get_zone_offset_object_key(
+ uint32_t zone,
+ uint64_t offset,
+ ghobject_t oid,
+ std::string *key)
+{
+ key->clear();
+ _key_encode_u32(zone, key);
+ _key_encode_u64(offset, key);
+ _get_object_key(oid, key);
+}
+
+static int get_key_zone_offset_object(
+ const string& key,
+ uint32_t *zone,
+ uint64_t *offset,
+ ghobject_t *oid)
+{
+ const char *p = key.c_str();
+ if (key.length() < sizeof(uint64_t) + sizeof(uint32_t) + ENCODED_KEY_PREFIX_LEN + 1)
+ return -1;
+ p = _key_decode_u32(p, zone);
+ p = _key_decode_u64(p, offset);
+ int r = _get_key_object(p, oid);
+ if (r < 0) {
+ return r;
+ }
+ return 0;
+}
+#endif
+
+template <int LogLevelV>
+void _dump_extent_map(CephContext *cct, const BlueStore::ExtentMap &em)
+{
+ uint64_t pos = 0;
+ for (auto& s : em.shards) {
+ dout(LogLevelV) << __func__ << " shard " << *s.shard_info
+ << (s.loaded ? " (loaded)" : "")
+ << (s.dirty ? " (dirty)" : "")
+ << dendl;
+ }
+ for (auto& e : em.extent_map) {
+ dout(LogLevelV) << __func__ << " " << e << dendl;
+ ceph_assert(e.logical_offset >= pos);
+ pos = e.logical_offset + e.length;
+ const bluestore_blob_t& blob = e.blob->get_blob();
+ if (blob.has_csum()) {
+ vector<uint64_t> v;
+ unsigned n = blob.get_csum_count();
+ for (unsigned i = 0; i < n; ++i)
+ v.push_back(blob.get_csum_item(i));
+ dout(LogLevelV) << __func__ << " csum: " << std::hex << v << std::dec
+ << dendl;
+ }
+ std::lock_guard l(e.blob->shared_blob->get_cache()->lock);
+ for (auto& i : e.blob->shared_blob->bc.buffer_map) {
+ dout(LogLevelV) << __func__ << " 0x" << std::hex << i.first
+ << "~" << i.second->length << std::dec
+ << " " << *i.second << dendl;
+ }
+ }
+}
+
+template <int LogLevelV>
+void _dump_onode(CephContext *cct, const BlueStore::Onode& o)
+{
+ if (!cct->_conf->subsys.should_gather<ceph_subsys_bluestore, LogLevelV>())
+ return;
+ dout(LogLevelV) << __func__ << " " << &o << " " << o.oid
+ << " nid " << o.onode.nid
+ << " size 0x" << std::hex << o.onode.size
+ << " (" << std::dec << o.onode.size << ")"
+ << " expected_object_size " << o.onode.expected_object_size
+ << " expected_write_size " << o.onode.expected_write_size
+ << " in " << o.onode.extent_map_shards.size() << " shards"
+ << ", " << o.extent_map.spanning_blob_map.size()
+ << " spanning blobs"
+ << dendl;
+ for (auto& [zone, offset] : o.onode.zone_offset_refs) {
+ dout(LogLevelV) << __func__ << " zone ref 0x" << std::hex << zone
+ << " offset 0x" << offset << std::dec << dendl;
+ }
+ for (auto p = o.onode.attrs.begin();
+ p != o.onode.attrs.end();
+ ++p) {
+ dout(LogLevelV) << __func__ << " attr " << p->first
+ << " len " << p->second.length() << dendl;
+ }
+ _dump_extent_map<LogLevelV>(cct, o.extent_map);
+}
+
+template <int LogLevelV>
+void _dump_transaction(CephContext *cct, ObjectStore::Transaction *t)
+{
+ dout(LogLevelV) << __func__ << " transaction dump:\n";
+ JSONFormatter f(true);
+ f.open_object_section("transaction");
+ t->dump(&f);
+ f.close_section();
+ f.flush(*_dout);
+ *_dout << dendl;
+}
+
+// Buffer
+
+ostream& operator<<(ostream& out, const BlueStore::Buffer& b)
+{
+ out << "buffer(" << &b << " space " << b.space << " 0x" << std::hex
+ << b.offset << "~" << b.length << std::dec
+ << " " << BlueStore::Buffer::get_state_name(b.state);
+ if (b.flags)
+ out << " " << BlueStore::Buffer::get_flag_name(b.flags);
+ return out << ")";
+}
+
+namespace {
+
+/*
+ * Due to a bug in key string encoding (see a comment for append_escaped)
+ * the KeyValueDB iterator does not lexicographically sort the same
+ * way that ghobject_t does: objects with the same hash may have wrong order.
+ *
+ * This is the iterator wrapper that fixes the keys order.
+ */
+
+class CollectionListIterator {
+public:
+ CollectionListIterator(const KeyValueDB::Iterator &it)
+ : m_it(it) {
+ }
+ virtual ~CollectionListIterator() {
+ }
+
+ virtual bool valid() const = 0;
+ virtual const ghobject_t &oid() const = 0;
+ virtual void lower_bound(const ghobject_t &oid) = 0;
+ virtual void upper_bound(const ghobject_t &oid) = 0;
+ virtual void next() = 0;
+
+ virtual int cmp(const ghobject_t &oid) const = 0;
+
+ bool is_ge(const ghobject_t &oid) const {
+ return cmp(oid) >= 0;
+ }
+
+ bool is_lt(const ghobject_t &oid) const {
+ return cmp(oid) < 0;
+ }
+
+protected:
+ KeyValueDB::Iterator m_it;
+};
+
+class SimpleCollectionListIterator : public CollectionListIterator {
+public:
+ SimpleCollectionListIterator(CephContext *cct, const KeyValueDB::Iterator &it)
+ : CollectionListIterator(it), m_cct(cct) {
+ }
+
+ bool valid() const override {
+ return m_it->valid();
+ }
+
+ const ghobject_t &oid() const override {
+ ceph_assert(valid());
+
+ return m_oid;
+ }
+
+ void lower_bound(const ghobject_t &oid) override {
+ string key;
+ get_object_key(m_cct, oid, &key);
+
+ m_it->lower_bound(key);
+ get_oid();
+ }
+
+ void upper_bound(const ghobject_t &oid) override {
+ string key;
+ get_object_key(m_cct, oid, &key);
+
+ m_it->upper_bound(key);
+ get_oid();
+ }
+
+ void next() override {
+ ceph_assert(valid());
+
+ m_it->next();
+ get_oid();
+ }
+
+ int cmp(const ghobject_t &oid) const override {
+ ceph_assert(valid());
+
+ string key;
+ get_object_key(m_cct, oid, &key);
+
+ return m_it->key().compare(key);
+ }
+
+private:
+ CephContext *m_cct;
+ ghobject_t m_oid;
+
+ void get_oid() {
+ m_oid = ghobject_t();
+ while (m_it->valid() && is_extent_shard_key(m_it->key())) {
+ m_it->next();
+ }
+ if (!valid()) {
+ return;
+ }
+
+ int r = get_key_object(m_it->key(), &m_oid);
+ ceph_assert(r == 0);
+ }
+};
+
+class SortedCollectionListIterator : public CollectionListIterator {
+public:
+ SortedCollectionListIterator(const KeyValueDB::Iterator &it)
+ : CollectionListIterator(it), m_chunk_iter(m_chunk.end()) {
+ }
+
+ bool valid() const override {
+ return m_chunk_iter != m_chunk.end();
+ }
+
+ const ghobject_t &oid() const override {
+ ceph_assert(valid());
+
+ return m_chunk_iter->first;
+ }
+
+ void lower_bound(const ghobject_t &oid) override {
+ std::string key;
+ _key_encode_prefix(oid, &key);
+
+ m_it->lower_bound(key);
+ m_chunk_iter = m_chunk.end();
+ if (!get_next_chunk()) {
+ return;
+ }
+
+ if (this->oid().shard_id != oid.shard_id ||
+ this->oid().hobj.pool != oid.hobj.pool ||
+ this->oid().hobj.get_bitwise_key_u32() != oid.hobj.get_bitwise_key_u32()) {
+ return;
+ }
+
+ m_chunk_iter = m_chunk.lower_bound(oid);
+ if (m_chunk_iter == m_chunk.end()) {
+ get_next_chunk();
+ }
+ }
+
+ void upper_bound(const ghobject_t &oid) override {
+ lower_bound(oid);
+
+ if (valid() && this->oid() == oid) {
+ next();
+ }
+ }
+
+ void next() override {
+ ceph_assert(valid());
+
+ m_chunk_iter++;
+ if (m_chunk_iter == m_chunk.end()) {
+ get_next_chunk();
+ }
+ }
+
+ int cmp(const ghobject_t &oid) const override {
+ ceph_assert(valid());
+
+ if (this->oid() < oid) {
+ return -1;
+ }
+ if (this->oid() > oid) {
+ return 1;
+ }
+ return 0;
+ }
+
+private:
+ std::map<ghobject_t, std::string> m_chunk;
+ std::map<ghobject_t, std::string>::iterator m_chunk_iter;
+
+ bool get_next_chunk() {
+ while (m_it->valid() && is_extent_shard_key(m_it->key())) {
+ m_it->next();
+ }
+
+ if (!m_it->valid()) {
+ return false;
+ }
+
+ ghobject_t oid;
+ int r = get_key_object(m_it->key(), &oid);
+ ceph_assert(r == 0);
+
+ m_chunk.clear();
+ while (true) {
+ m_chunk.insert({oid, m_it->key()});
+
+ do {
+ m_it->next();
+ } while (m_it->valid() && is_extent_shard_key(m_it->key()));
+
+ if (!m_it->valid()) {
+ break;
+ }
+
+ ghobject_t next;
+ r = get_key_object(m_it->key(), &next);
+ ceph_assert(r == 0);
+ if (next.shard_id != oid.shard_id ||
+ next.hobj.pool != oid.hobj.pool ||
+ next.hobj.get_bitwise_key_u32() != oid.hobj.get_bitwise_key_u32()) {
+ break;
+ }
+ oid = next;
+ }
+
+ m_chunk_iter = m_chunk.begin();
+ return true;
+ }
+};
+
+} // anonymous namespace
+
+// Garbage Collector
+
+void BlueStore::GarbageCollector::process_protrusive_extents(
+ const BlueStore::ExtentMap& extent_map,
+ uint64_t start_offset,
+ uint64_t end_offset,
+ uint64_t start_touch_offset,
+ uint64_t end_touch_offset,
+ uint64_t min_alloc_size)
+{
+ ceph_assert(start_offset <= start_touch_offset && end_offset>= end_touch_offset);
+
+ uint64_t lookup_start_offset = p2align(start_offset, min_alloc_size);
+ uint64_t lookup_end_offset = round_up_to(end_offset, min_alloc_size);
+
+ dout(30) << __func__ << " (hex): [" << std::hex
+ << lookup_start_offset << ", " << lookup_end_offset
+ << ")" << std::dec << dendl;
+
+ for (auto it = extent_map.seek_lextent(lookup_start_offset);
+ it != extent_map.extent_map.end() &&
+ it->logical_offset < lookup_end_offset;
+ ++it) {
+ uint64_t alloc_unit_start = it->logical_offset / min_alloc_size;
+ uint64_t alloc_unit_end = (it->logical_end() - 1) / min_alloc_size;
+
+ dout(30) << __func__ << " " << *it
+ << "alloc_units: " << alloc_unit_start << ".." << alloc_unit_end
+ << dendl;
+
+ Blob* b = it->blob.get();
+
+ if (it->logical_offset >=start_touch_offset &&
+ it->logical_end() <= end_touch_offset) {
+ // Process extents within the range affected by
+ // the current write request.
+ // Need to take into account if existing extents
+ // can be merged with them (uncompressed case)
+ if (!b->get_blob().is_compressed()) {
+ if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
+ --blob_info_counted->expected_allocations; // don't need to allocate
+ // new AU for compressed
+ // data since another
+ // collocated uncompressed
+ // blob already exists
+ dout(30) << __func__ << " --expected:"
+ << alloc_unit_start << dendl;
+ }
+ used_alloc_unit = alloc_unit_end;
+ blob_info_counted = nullptr;
+ }
+ } else if (b->get_blob().is_compressed()) {
+
+ // additionally we take compressed blobs that were not impacted
+ // by the write into account too
+ BlobInfo& bi =
+ affected_blobs.emplace(
+ b, BlobInfo(b->get_referenced_bytes())).first->second;
+
+ int adjust =
+ (used_alloc_unit && used_alloc_unit == alloc_unit_start) ? 0 : 1;
+ bi.expected_allocations += alloc_unit_end - alloc_unit_start + adjust;
+ dout(30) << __func__ << " expected_allocations="
+ << bi.expected_allocations << " end_au:"
+ << alloc_unit_end << dendl;
+
+ blob_info_counted = &bi;
+ used_alloc_unit = alloc_unit_end;
+
+ ceph_assert(it->length <= bi.referenced_bytes);
+ bi.referenced_bytes -= it->length;
+ dout(30) << __func__ << " affected_blob:" << *b
+ << " unref 0x" << std::hex << it->length
+ << " referenced = 0x" << bi.referenced_bytes
+ << std::dec << dendl;
+ // NOTE: we can't move specific blob to resulting GC list here
+ // when reference counter == 0 since subsequent extents might
+ // decrement its expected_allocation.
+ // Hence need to enumerate all the extents first.
+ if (!bi.collect_candidate) {
+ bi.first_lextent = it;
+ bi.collect_candidate = true;
+ }
+ bi.last_lextent = it;
+ } else {
+ if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
+ // don't need to allocate new AU for compressed data since another
+ // collocated uncompressed blob already exists
+ --blob_info_counted->expected_allocations;
+ dout(30) << __func__ << " --expected_allocations:"
+ << alloc_unit_start << dendl;
+ }
+ used_alloc_unit = alloc_unit_end;
+ blob_info_counted = nullptr;
+ }
+ }
+
+ for (auto b_it = affected_blobs.begin();
+ b_it != affected_blobs.end();
+ ++b_it) {
+ Blob* b = b_it->first;
+ BlobInfo& bi = b_it->second;
+ if (bi.referenced_bytes == 0) {
+ uint64_t len_on_disk = b_it->first->get_blob().get_ondisk_length();
+ int64_t blob_expected_for_release =
+ round_up_to(len_on_disk, min_alloc_size) / min_alloc_size;
+
+ dout(30) << __func__ << " " << *(b_it->first)
+ << " expected4release=" << blob_expected_for_release
+ << " expected_allocations=" << bi.expected_allocations
+ << dendl;
+ int64_t benefit = blob_expected_for_release - bi.expected_allocations;
+ if (benefit >= g_conf()->bluestore_gc_enable_blob_threshold) {
+ if (bi.collect_candidate) {
+ auto it = bi.first_lextent;
+ bool bExit = false;
+ do {
+ if (it->blob.get() == b) {
+ extents_to_collect.insert(it->logical_offset, it->length);
+ }
+ bExit = it == bi.last_lextent;
+ ++it;
+ } while (!bExit);
+ }
+ expected_for_release += blob_expected_for_release;
+ expected_allocations += bi.expected_allocations;
+ }
+ }
+ }
+}
+
+int64_t BlueStore::GarbageCollector::estimate(
+ uint64_t start_offset,
+ uint64_t length,
+ const BlueStore::ExtentMap& extent_map,
+ const BlueStore::old_extent_map_t& old_extents,
+ uint64_t min_alloc_size)
+{
+
+ affected_blobs.clear();
+ extents_to_collect.clear();
+ used_alloc_unit = boost::optional<uint64_t >();
+ blob_info_counted = nullptr;
+
+ uint64_t gc_start_offset = start_offset;
+ uint64_t gc_end_offset = start_offset + length;
+
+ uint64_t end_offset = start_offset + length;
+
+ for (auto it = old_extents.begin(); it != old_extents.end(); ++it) {
+ Blob* b = it->e.blob.get();
+ if (b->get_blob().is_compressed()) {
+
+ // update gc_start_offset/gc_end_offset if needed
+ gc_start_offset = min(gc_start_offset, (uint64_t)it->e.blob_start());
+ gc_end_offset = std::max(gc_end_offset, (uint64_t)it->e.blob_end());
+
+ auto o = it->e.logical_offset;
+ auto l = it->e.length;
+
+ uint64_t ref_bytes = b->get_referenced_bytes();
+ // micro optimization to bypass blobs that have no more references
+ if (ref_bytes != 0) {
+ dout(30) << __func__ << " affected_blob:" << *b
+ << " unref 0x" << std::hex << o << "~" << l
+ << std::dec << dendl;
+ affected_blobs.emplace(b, BlobInfo(ref_bytes));
+ }
+ }
+ }
+ dout(30) << __func__ << " gc range(hex): [" << std::hex
+ << gc_start_offset << ", " << gc_end_offset
+ << ")" << std::dec << dendl;
+
+ // enumerate preceeding extents to check if they reference affected blobs
+ if (gc_start_offset < start_offset || gc_end_offset > end_offset) {
+ process_protrusive_extents(extent_map,
+ gc_start_offset,
+ gc_end_offset,
+ start_offset,
+ end_offset,
+ min_alloc_size);
+ }
+ return expected_for_release - expected_allocations;
+}
+
+// LruOnodeCacheShard
+struct LruOnodeCacheShard : public BlueStore::OnodeCacheShard {
+ typedef boost::intrusive::list<
+ BlueStore::Onode,
+ boost::intrusive::member_hook<
+ BlueStore::Onode,
+ boost::intrusive::list_member_hook<>,
+ &BlueStore::Onode::lru_item> > list_t;
+
+ list_t lru;
+
+ explicit LruOnodeCacheShard(CephContext *cct) : BlueStore::OnodeCacheShard(cct) {}
+
+ void _add(BlueStore::Onode* o, int level) override
+ {
+ o->set_cached();
+ if (o->pin_nref == 1) {
+ (level > 0) ? lru.push_front(*o) : lru.push_back(*o);
+ o->cache_age_bin = age_bins.front();
+ *(o->cache_age_bin) += 1;
+ }
+ ++num; // we count both pinned and unpinned entries
+ dout(20) << __func__ << " " << this << " " << o->oid << " added, num="
+ << num << dendl;
+ }
+ void _rm(BlueStore::Onode* o) override
+ {
+ o->clear_cached();
+ if (o->lru_item.is_linked()) {
+ *(o->cache_age_bin) -= 1;
+ lru.erase(lru.iterator_to(*o));
+ }
+ ceph_assert(num);
+ --num;
+ dout(20) << __func__ << " " << this << " " << " " << o->oid << " removed, num=" << num << dendl;
+ }
+
+ void maybe_unpin(BlueStore::Onode* o) override
+ {
+ OnodeCacheShard* ocs = this;
+ ocs->lock.lock();
+ // It is possible that during waiting split_cache moved us to different OnodeCacheShard.
+ while (ocs != o->c->get_onode_cache()) {
+ ocs->lock.unlock();
+ ocs = o->c->get_onode_cache();
+ ocs->lock.lock();
+ }
+ if (o->is_cached() && o->pin_nref == 1) {
+ if(!o->lru_item.is_linked()) {
+ if (o->exists) {
+ lru.push_front(*o);
+ o->cache_age_bin = age_bins.front();
+ *(o->cache_age_bin) += 1;
+ dout(20) << __func__ << " " << this << " " << o->oid << " unpinned"
+ << dendl;
+ } else {
+ ceph_assert(num);
+ --num;
+ o->clear_cached();
+ dout(20) << __func__ << " " << this << " " << o->oid << " removed"
+ << dendl;
+ // remove will also decrement nref
+ o->c->onode_space._remove(o->oid);
+ }
+ } else if (o->exists) {
+ // move onode within LRU
+ lru.erase(lru.iterator_to(*o));
+ lru.push_front(*o);
+ if (o->cache_age_bin != age_bins.front()) {
+ *(o->cache_age_bin) -= 1;
+ o->cache_age_bin = age_bins.front();
+ *(o->cache_age_bin) += 1;
+ }
+ dout(20) << __func__ << " " << this << " " << o->oid << " touched"
+ << dendl;
+ }
+ }
+ ocs->lock.unlock();
+ }
+
+ void _trim_to(uint64_t new_size) override
+ {
+ if (new_size >= lru.size()) {
+ return; // don't even try
+ }
+ uint64_t n = num - new_size; // note: we might get empty LRU
+ // before n == 0 due to pinned
+ // entries. And hence being unable
+ // to reach new_size target.
+ while (n-- > 0 && lru.size() > 0) {
+ BlueStore::Onode *o = &lru.back();
+ lru.pop_back();
+
+ dout(20) << __func__ << " rm " << o->oid << " "
+ << o->nref << " " << o->cached << dendl;
+
+ *(o->cache_age_bin) -= 1;
+ if (o->pin_nref > 1) {
+ dout(20) << __func__ << " " << this << " " << " " << " " << o->oid << dendl;
+ } else {
+ ceph_assert(num);
+ --num;
+ o->clear_cached();
+ o->c->onode_space._remove(o->oid);
+ }
+ }
+ }
+ void _move_pinned(OnodeCacheShard *to, BlueStore::Onode *o) override
+ {
+ if (to == this) {
+ return;
+ }
+ _rm(o);
+ ceph_assert(o->nref > 1);
+ to->_add(o, 0);
+ }
+ void add_stats(uint64_t *onodes, uint64_t *pinned_onodes) override
+ {
+ std::lock_guard l(lock);
+ *onodes += num;
+ *pinned_onodes += num - lru.size();
+ }
+};
+
+// OnodeCacheShard
+BlueStore::OnodeCacheShard *BlueStore::OnodeCacheShard::create(
+ CephContext* cct,
+ string type,
+ PerfCounters *logger)
+{
+ BlueStore::OnodeCacheShard *c = nullptr;
+ // Currently we only implement an LRU cache for onodes
+ c = new LruOnodeCacheShard(cct);
+ c->logger = logger;
+ return c;
+}
+
+// LruBufferCacheShard
+struct LruBufferCacheShard : public BlueStore::BufferCacheShard {
+ typedef boost::intrusive::list<
+ BlueStore::Buffer,
+ boost::intrusive::member_hook<
+ BlueStore::Buffer,
+ boost::intrusive::list_member_hook<>,
+ &BlueStore::Buffer::lru_item> > list_t;
+ list_t lru;
+
+ explicit LruBufferCacheShard(CephContext *cct) : BlueStore::BufferCacheShard(cct) {}
+
+ void _add(BlueStore::Buffer *b, int level, BlueStore::Buffer *near) override {
+ if (near) {
+ auto q = lru.iterator_to(*near);
+ lru.insert(q, *b);
+ } else if (level > 0) {
+ lru.push_front(*b);
+ } else {
+ lru.push_back(*b);
+ }
+ buffer_bytes += b->length;
+ b->cache_age_bin = age_bins.front();
+ *(b->cache_age_bin) += b->length;
+ num = lru.size();
+ }
+ void _rm(BlueStore::Buffer *b) override {
+ ceph_assert(buffer_bytes >= b->length);
+ buffer_bytes -= b->length;
+ assert(*(b->cache_age_bin) >= b->length);
+ *(b->cache_age_bin) -= b->length;
+ auto q = lru.iterator_to(*b);
+ lru.erase(q);
+ num = lru.size();
+ }
+ void _move(BlueStore::BufferCacheShard *src, BlueStore::Buffer *b) override {
+ src->_rm(b);
+ _add(b, 0, nullptr);
+ }
+ void _adjust_size(BlueStore::Buffer *b, int64_t delta) override {
+ ceph_assert((int64_t)buffer_bytes + delta >= 0);
+ buffer_bytes += delta;
+ assert(*(b->cache_age_bin) + delta >= 0);
+ *(b->cache_age_bin) += delta;
+ }
+ void _touch(BlueStore::Buffer *b) override {
+ auto p = lru.iterator_to(*b);
+ lru.erase(p);
+ lru.push_front(*b);
+ *(b->cache_age_bin) -= b->length;
+ b->cache_age_bin = age_bins.front();
+ *(b->cache_age_bin) += b->length;
+ num = lru.size();
+ _audit("_touch_buffer end");
+ }
+
+ void _trim_to(uint64_t max) override
+ {
+ while (buffer_bytes > max) {
+ auto i = lru.rbegin();
+ if (i == lru.rend()) {
+ // stop if lru is now empty
+ break;
+ }
+
+ BlueStore::Buffer *b = &*i;
+ ceph_assert(b->is_clean());
+ dout(20) << __func__ << " rm " << *b << dendl;
+ assert(*(b->cache_age_bin) >= b->length);
+ *(b->cache_age_bin) -= b->length;
+ b->space->_rm_buffer(this, b);
+ }
+ num = lru.size();
+ }
+
+ void add_stats(uint64_t *extents,
+ uint64_t *blobs,
+ uint64_t *buffers,
+ uint64_t *bytes) override {
+ *extents += num_extents;
+ *blobs += num_blobs;
+ *buffers += num;
+ *bytes += buffer_bytes;
+ }
+#ifdef DEBUG_CACHE
+ void _audit(const char *s) override
+ {
+ dout(10) << __func__ << " " << when << " start" << dendl;
+ uint64_t s = 0;
+ for (auto i = lru.begin(); i != lru.end(); ++i) {
+ s += i->length;
+ }
+ if (s != buffer_bytes) {
+ derr << __func__ << " buffer_size " << buffer_bytes << " actual " << s
+ << dendl;
+ for (auto i = lru.begin(); i != lru.end(); ++i) {
+ derr << __func__ << " " << *i << dendl;
+ }
+ ceph_assert(s == buffer_bytes);
+ }
+ dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes
+ << " ok" << dendl;
+ }
+#endif
+};
+
+// TwoQBufferCacheShard
+
+struct TwoQBufferCacheShard : public BlueStore::BufferCacheShard {
+ typedef boost::intrusive::list<
+ BlueStore::Buffer,
+ boost::intrusive::member_hook<
+ BlueStore::Buffer,
+ boost::intrusive::list_member_hook<>,
+ &BlueStore::Buffer::lru_item> > list_t;
+ list_t hot; ///< "Am" hot buffers
+ list_t warm_in; ///< "A1in" newly warm buffers
+ list_t warm_out; ///< "A1out" empty buffers we've evicted
+
+ enum {
+ BUFFER_NEW = 0,
+ BUFFER_WARM_IN, ///< in warm_in
+ BUFFER_WARM_OUT, ///< in warm_out
+ BUFFER_HOT, ///< in hot
+ BUFFER_TYPE_MAX
+ };
+
+ uint64_t list_bytes[BUFFER_TYPE_MAX] = {0}; ///< bytes per type
+
+public:
+ explicit TwoQBufferCacheShard(CephContext *cct) : BufferCacheShard(cct) {}
+
+ void _add(BlueStore::Buffer *b, int level, BlueStore::Buffer *near) override
+ {
+ dout(20) << __func__ << " level " << level << " near " << near
+ << " on " << *b
+ << " which has cache_private " << b->cache_private << dendl;
+ if (near) {
+ b->cache_private = near->cache_private;
+ switch (b->cache_private) {
+ case BUFFER_WARM_IN:
+ warm_in.insert(warm_in.iterator_to(*near), *b);
+ break;
+ case BUFFER_WARM_OUT:
+ ceph_assert(b->is_empty());
+ warm_out.insert(warm_out.iterator_to(*near), *b);
+ break;
+ case BUFFER_HOT:
+ hot.insert(hot.iterator_to(*near), *b);
+ break;
+ default:
+ ceph_abort_msg("bad cache_private");
+ }
+ } else if (b->cache_private == BUFFER_NEW) {
+ b->cache_private = BUFFER_WARM_IN;
+ if (level > 0) {
+ warm_in.push_front(*b);
+ } else {
+ // take caller hint to start at the back of the warm queue
+ warm_in.push_back(*b);
+ }
+ } else {
+ // we got a hint from discard
+ switch (b->cache_private) {
+ case BUFFER_WARM_IN:
+ // stay in warm_in. move to front, even though 2Q doesn't actually
+ // do this.
+ dout(20) << __func__ << " move to front of warm " << *b << dendl;
+ warm_in.push_front(*b);
+ break;
+ case BUFFER_WARM_OUT:
+ b->cache_private = BUFFER_HOT;
+ // move to hot. fall-thru
+ case BUFFER_HOT:
+ dout(20) << __func__ << " move to front of hot " << *b << dendl;
+ hot.push_front(*b);
+ break;
+ default:
+ ceph_abort_msg("bad cache_private");
+ }
+ }
+ b->cache_age_bin = age_bins.front();
+ if (!b->is_empty()) {
+ buffer_bytes += b->length;
+ list_bytes[b->cache_private] += b->length;
+ *(b->cache_age_bin) += b->length;
+ }
+ num = hot.size() + warm_in.size();
+ }
+
+ void _rm(BlueStore::Buffer *b) override
+ {
+ dout(20) << __func__ << " " << *b << dendl;
+ if (!b->is_empty()) {
+ ceph_assert(buffer_bytes >= b->length);
+ buffer_bytes -= b->length;
+ ceph_assert(list_bytes[b->cache_private] >= b->length);
+ list_bytes[b->cache_private] -= b->length;
+ assert(*(b->cache_age_bin) >= b->length);
+ *(b->cache_age_bin) -= b->length;
+ }
+ switch (b->cache_private) {
+ case BUFFER_WARM_IN:
+ warm_in.erase(warm_in.iterator_to(*b));
+ break;
+ case BUFFER_WARM_OUT:
+ warm_out.erase(warm_out.iterator_to(*b));
+ break;
+ case BUFFER_HOT:
+ hot.erase(hot.iterator_to(*b));
+ break;
+ default:
+ ceph_abort_msg("bad cache_private");
+ }
+ num = hot.size() + warm_in.size();
+ }
+
+ void _move(BlueStore::BufferCacheShard *srcc, BlueStore::Buffer *b) override
+ {
+ TwoQBufferCacheShard *src = static_cast<TwoQBufferCacheShard*>(srcc);
+ src->_rm(b);
+
+ // preserve which list we're on (even if we can't preserve the order!)
+ switch (b->cache_private) {
+ case BUFFER_WARM_IN:
+ ceph_assert(!b->is_empty());
+ warm_in.push_back(*b);
+ break;
+ case BUFFER_WARM_OUT:
+ ceph_assert(b->is_empty());
+ warm_out.push_back(*b);
+ break;
+ case BUFFER_HOT:
+ ceph_assert(!b->is_empty());
+ hot.push_back(*b);
+ break;
+ default:
+ ceph_abort_msg("bad cache_private");
+ }
+ if (!b->is_empty()) {
+ buffer_bytes += b->length;
+ list_bytes[b->cache_private] += b->length;
+ *(b->cache_age_bin) += b->length;
+ }
+ num = hot.size() + warm_in.size();
+ }
+
+ void _adjust_size(BlueStore::Buffer *b, int64_t delta) override
+ {
+ dout(20) << __func__ << " delta " << delta << " on " << *b << dendl;
+ if (!b->is_empty()) {
+ ceph_assert((int64_t)buffer_bytes + delta >= 0);
+ buffer_bytes += delta;
+ ceph_assert((int64_t)list_bytes[b->cache_private] + delta >= 0);
+ list_bytes[b->cache_private] += delta;
+ assert(*(b->cache_age_bin) + delta >= 0);
+ *(b->cache_age_bin) += delta;
+ }
+ }
+
+ void _touch(BlueStore::Buffer *b) override {
+ switch (b->cache_private) {
+ case BUFFER_WARM_IN:
+ // do nothing (somewhat counter-intuitively!)
+ break;
+ case BUFFER_WARM_OUT:
+ // move from warm_out to hot LRU
+ ceph_abort_msg("this happens via discard hint");
+ break;
+ case BUFFER_HOT:
+ // move to front of hot LRU
+ hot.erase(hot.iterator_to(*b));
+ hot.push_front(*b);
+ break;
+ }
+ *(b->cache_age_bin) -= b->length;
+ b->cache_age_bin = age_bins.front();
+ *(b->cache_age_bin) += b->length;
+ num = hot.size() + warm_in.size();
+ _audit("_touch_buffer end");
+ }
+
+ void _trim_to(uint64_t max) override
+ {
+ if (buffer_bytes > max) {
+ uint64_t kin = max * cct->_conf->bluestore_2q_cache_kin_ratio;
+ uint64_t khot = max - kin;
+
+ // pre-calculate kout based on average buffer size too,
+ // which is typical(the warm_in and hot lists may change later)
+ uint64_t kout = 0;
+ uint64_t buffer_num = hot.size() + warm_in.size();
+ if (buffer_num) {
+ uint64_t avg_size = buffer_bytes / buffer_num;
+ ceph_assert(avg_size);
+ uint64_t calculated_num = max / avg_size;
+ kout = calculated_num * cct->_conf->bluestore_2q_cache_kout_ratio;
+ }
+
+ if (list_bytes[BUFFER_HOT] < khot) {
+ // hot is small, give slack to warm_in
+ kin += khot - list_bytes[BUFFER_HOT];
+ } else if (list_bytes[BUFFER_WARM_IN] < kin) {
+ // warm_in is small, give slack to hot
+ khot += kin - list_bytes[BUFFER_WARM_IN];
+ }
+
+ // adjust warm_in list
+ int64_t to_evict_bytes = list_bytes[BUFFER_WARM_IN] - kin;
+ uint64_t evicted = 0;
+
+ while (to_evict_bytes > 0) {
+ auto p = warm_in.rbegin();
+ if (p == warm_in.rend()) {
+ // stop if warm_in list is now empty
+ break;
+ }
+
+ BlueStore::Buffer *b = &*p;
+ ceph_assert(b->is_clean());
+ dout(20) << __func__ << " buffer_warm_in -> out " << *b << dendl;
+ ceph_assert(buffer_bytes >= b->length);
+ buffer_bytes -= b->length;
+ ceph_assert(list_bytes[BUFFER_WARM_IN] >= b->length);
+ list_bytes[BUFFER_WARM_IN] -= b->length;
+ assert(*(b->cache_age_bin) >= b->length);
+ *(b->cache_age_bin) -= b->length;
+ to_evict_bytes -= b->length;
+ evicted += b->length;
+ b->state = BlueStore::Buffer::STATE_EMPTY;
+ b->data.clear();
+ warm_in.erase(warm_in.iterator_to(*b));
+ warm_out.push_front(*b);
+ b->cache_private = BUFFER_WARM_OUT;
+ }
+
+ if (evicted > 0) {
+ dout(20) << __func__ << " evicted " << byte_u_t(evicted)
+ << " from warm_in list, done evicting warm_in buffers"
+ << dendl;
+ }
+
+ // adjust hot list
+ to_evict_bytes = list_bytes[BUFFER_HOT] - khot;
+ evicted = 0;
+
+ while (to_evict_bytes > 0) {
+ auto p = hot.rbegin();
+ if (p == hot.rend()) {
+ // stop if hot list is now empty
+ break;
+ }
+
+ BlueStore::Buffer *b = &*p;
+ dout(20) << __func__ << " buffer_hot rm " << *b << dendl;
+ ceph_assert(b->is_clean());
+ // adjust evict size before buffer goes invalid
+ to_evict_bytes -= b->length;
+ evicted += b->length;
+ b->space->_rm_buffer(this, b);
+ }
+
+ if (evicted > 0) {
+ dout(20) << __func__ << " evicted " << byte_u_t(evicted)
+ << " from hot list, done evicting hot buffers"
+ << dendl;
+ }
+
+ // adjust warm out list too, if necessary
+ int64_t n = warm_out.size() - kout;
+ while (n-- > 0) {
+ BlueStore::Buffer *b = &*warm_out.rbegin();
+ ceph_assert(b->is_empty());
+ dout(20) << __func__ << " buffer_warm_out rm " << *b << dendl;
+ b->space->_rm_buffer(this, b);
+ }
+ }
+ num = hot.size() + warm_in.size();
+ }
+
+ void add_stats(uint64_t *extents,
+ uint64_t *blobs,
+ uint64_t *buffers,
+ uint64_t *bytes) override {
+ *extents += num_extents;
+ *blobs += num_blobs;
+ *buffers += num;
+ *bytes += buffer_bytes;
+ }
+
+#ifdef DEBUG_CACHE
+ void _audit(const char *s) override
+ {
+ dout(10) << __func__ << " " << when << " start" << dendl;
+ uint64_t s = 0;
+ for (auto i = hot.begin(); i != hot.end(); ++i) {
+ s += i->length;
+ }
+
+ uint64_t hot_bytes = s;
+ if (hot_bytes != list_bytes[BUFFER_HOT]) {
+ derr << __func__ << " hot_list_bytes "
+ << list_bytes[BUFFER_HOT]
+ << " != actual " << hot_bytes
+ << dendl;
+ ceph_assert(hot_bytes == list_bytes[BUFFER_HOT]);
+ }
+
+ for (auto i = warm_in.begin(); i != warm_in.end(); ++i) {
+ s += i->length;
+ }
+
+ uint64_t warm_in_bytes = s - hot_bytes;
+ if (warm_in_bytes != list_bytes[BUFFER_WARM_IN]) {
+ derr << __func__ << " warm_in_list_bytes "
+ << list_bytes[BUFFER_WARM_IN]
+ << " != actual " << warm_in_bytes
+ << dendl;
+ ceph_assert(warm_in_bytes == list_bytes[BUFFER_WARM_IN]);
+ }
+
+ if (s != buffer_bytes) {
+ derr << __func__ << " buffer_bytes " << buffer_bytes << " actual " << s
+ << dendl;
+ ceph_assert(s == buffer_bytes);
+ }
+
+ dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes
+ << " ok" << dendl;
+ }
+#endif
+};
+
+// BuferCacheShard
+
+BlueStore::BufferCacheShard *BlueStore::BufferCacheShard::create(
+ CephContext* cct,
+ string type,
+ PerfCounters *logger)
+{
+ BufferCacheShard *c = nullptr;
+ if (type == "lru")
+ c = new LruBufferCacheShard(cct);
+ else if (type == "2q")
+ c = new TwoQBufferCacheShard(cct);
+ else
+ ceph_abort_msg("unrecognized cache type");
+ c->logger = logger;
+ return c;
+}
+
+// BufferSpace
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.BufferSpace(" << this << " in " << cache << ") "
+
+void BlueStore::BufferSpace::_clear(BufferCacheShard* cache)
+{
+ // note: we already hold cache->lock
+ ldout(cache->cct, 20) << __func__ << dendl;
+ while (!buffer_map.empty()) {
+ _rm_buffer(cache, buffer_map.begin());
+ }
+}
+
+int BlueStore::BufferSpace::_discard(BufferCacheShard* cache, uint32_t offset, uint32_t length)
+{
+ // note: we already hold cache->lock
+ ldout(cache->cct, 20) << __func__ << std::hex << " 0x" << offset << "~" << length
+ << std::dec << dendl;
+ int cache_private = 0;
+ cache->_audit("discard start");
+ auto i = _data_lower_bound(offset);
+ uint32_t end = offset + length;
+ while (i != buffer_map.end()) {
+ Buffer *b = i->second.get();
+ if (b->offset >= end) {
+ break;
+ }
+ if (b->cache_private > cache_private) {
+ cache_private = b->cache_private;
+ }
+ if (b->offset < offset) {
+ int64_t front = offset - b->offset;
+ if (b->end() > end) {
+ // drop middle (split)
+ uint32_t tail = b->end() - end;
+ if (b->data.length()) {
+ bufferlist bl;
+ bl.substr_of(b->data, b->length - tail, tail);
+ Buffer *nb = new Buffer(this, b->state, b->seq, end, bl, b->flags);
+ nb->maybe_rebuild();
+ _add_buffer(cache, nb, 0, b);
+ } else {
+ _add_buffer(cache, new Buffer(this, b->state, b->seq, end, tail,
+ b->flags),
+ 0, b);
+ }
+ if (!b->is_writing()) {
+ cache->_adjust_size(b, front - (int64_t)b->length);
+ }
+ b->truncate(front);
+ b->maybe_rebuild();
+ cache->_audit("discard end 1");
+ break;
+ } else {
+ // drop tail
+ if (!b->is_writing()) {
+ cache->_adjust_size(b, front - (int64_t)b->length);
+ }
+ b->truncate(front);
+ b->maybe_rebuild();
+ ++i;
+ continue;
+ }
+ }
+ if (b->end() <= end) {
+ // drop entire buffer
+ _rm_buffer(cache, i++);
+ continue;
+ }
+ // drop front
+ uint32_t keep = b->end() - end;
+ if (b->data.length()) {
+ bufferlist bl;
+ bl.substr_of(b->data, b->length - keep, keep);
+ Buffer *nb = new Buffer(this, b->state, b->seq, end, bl, b->flags);
+ nb->maybe_rebuild();
+ _add_buffer(cache, nb, 0, b);
+ } else {
+ _add_buffer(cache, new Buffer(this, b->state, b->seq, end, keep,
+ b->flags),
+ 0, b);
+ }
+ _rm_buffer(cache, i);
+ cache->_audit("discard end 2");
+ break;
+ }
+ return cache_private;
+}
+
+void BlueStore::BufferSpace::read(
+ BufferCacheShard* cache,
+ uint32_t offset,
+ uint32_t length,
+ BlueStore::ready_regions_t& res,
+ interval_set<uint32_t>& res_intervals,
+ int flags)
+{
+ res.clear();
+ res_intervals.clear();
+ uint32_t want_bytes = length;
+ uint32_t end = offset + length;
+
+ {
+ std::lock_guard l(cache->lock);
+ for (auto i = _data_lower_bound(offset);
+ i != buffer_map.end() && offset < end && i->first < end;
+ ++i) {
+ Buffer *b = i->second.get();
+ ceph_assert(b->end() > offset);
+
+ bool val = false;
+ if (flags & BYPASS_CLEAN_CACHE)
+ val = b->is_writing();
+ else
+ val = b->is_writing() || b->is_clean();
+ if (val) {
+ if (b->offset < offset) {
+ uint32_t skip = offset - b->offset;
+ uint32_t l = min(length, b->length - skip);
+ res[offset].substr_of(b->data, skip, l);
+ res_intervals.insert(offset, l);
+ offset += l;
+ length -= l;
+ if (!b->is_writing()) {
+ cache->_touch(b);
+ }
+ continue;
+ }
+ if (b->offset > offset) {
+ uint32_t gap = b->offset - offset;
+ if (length <= gap) {
+ break;
+ }
+ offset += gap;
+ length -= gap;
+ }
+ if (!b->is_writing()) {
+ cache->_touch(b);
+ }
+ if (b->length > length) {
+ res[offset].substr_of(b->data, 0, length);
+ res_intervals.insert(offset, length);
+ break;
+ } else {
+ res[offset].append(b->data);
+ res_intervals.insert(offset, b->length);
+ if (b->length == length)
+ break;
+ offset += b->length;
+ length -= b->length;
+ }
+ }
+ }
+ }
+
+ uint64_t hit_bytes = res_intervals.size();
+ ceph_assert(hit_bytes <= want_bytes);
+ uint64_t miss_bytes = want_bytes - hit_bytes;
+ cache->logger->inc(l_bluestore_buffer_hit_bytes, hit_bytes);
+ cache->logger->inc(l_bluestore_buffer_miss_bytes, miss_bytes);
+}
+
+void BlueStore::BufferSpace::_finish_write(BufferCacheShard* cache, uint64_t seq)
+{
+ auto i = writing.begin();
+ while (i != writing.end()) {
+ if (i->seq > seq) {
+ break;
+ }
+ if (i->seq < seq) {
+ ++i;
+ continue;
+ }
+
+ Buffer *b = &*i;
+ ceph_assert(b->is_writing());
+
+ if (b->flags & Buffer::FLAG_NOCACHE) {
+ writing.erase(i++);
+ ldout(cache->cct, 20) << __func__ << " discard " << *b << dendl;
+ buffer_map.erase(b->offset);
+ } else {
+ b->state = Buffer::STATE_CLEAN;
+ writing.erase(i++);
+ b->maybe_rebuild();
+ b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data);
+ cache->_add(b, 1, nullptr);
+ ldout(cache->cct, 20) << __func__ << " added " << *b << dendl;
+ }
+ }
+ cache->_trim();
+ cache->_audit("finish_write end");
+}
+
+void BlueStore::BufferSpace::split(BufferCacheShard* cache, size_t pos, BlueStore::BufferSpace &r)
+{
+ std::lock_guard lk(cache->lock);
+ if (buffer_map.empty())
+ return;
+
+ auto p = --buffer_map.end();
+ while (true) {
+ if (p->second->end() <= pos)
+ break;
+
+ if (p->second->offset < pos) {
+ ldout(cache->cct, 30) << __func__ << " cut " << *p->second << dendl;
+ size_t left = pos - p->second->offset;
+ size_t right = p->second->length - left;
+ if (p->second->data.length()) {
+ bufferlist bl;
+ bl.substr_of(p->second->data, left, right);
+ r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
+ 0, bl, p->second->flags),
+ 0, p->second.get());
+ } else {
+ r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
+ 0, right, p->second->flags),
+ 0, p->second.get());
+ }
+ cache->_adjust_size(p->second.get(), -right);
+ p->second->truncate(left);
+ break;
+ }
+
+ ceph_assert(p->second->end() > pos);
+ ldout(cache->cct, 30) << __func__ << " move " << *p->second << dendl;
+ if (p->second->data.length()) {
+ r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
+ p->second->offset - pos, p->second->data, p->second->flags),
+ 0, p->second.get());
+ } else {
+ r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
+ p->second->offset - pos, p->second->length, p->second->flags),
+ 0, p->second.get());
+ }
+ if (p == buffer_map.begin()) {
+ _rm_buffer(cache, p);
+ break;
+ } else {
+ _rm_buffer(cache, p--);
+ }
+ }
+ ceph_assert(writing.empty());
+ cache->_trim();
+}
+
+// OnodeSpace
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.OnodeSpace(" << this << " in " << cache << ") "
+
+BlueStore::OnodeRef BlueStore::OnodeSpace::add_onode(const ghobject_t& oid,
+ OnodeRef& o)
+{
+ std::lock_guard l(cache->lock);
+ // add entry or return existing one
+ auto p = onode_map.emplace(oid, o);
+ if (!p.second) {
+ ldout(cache->cct, 30) << __func__ << " " << oid << " " << o
+ << " raced, returning existing " << p.first->second
+ << dendl;
+ return p.first->second;
+ }
+ ldout(cache->cct, 20) << __func__ << " " << oid << " " << o << dendl;
+ cache->_add(o.get(), 1);
+ cache->_trim();
+ return o;
+}
+
+void BlueStore::OnodeSpace::_remove(const ghobject_t& oid)
+{
+ ldout(cache->cct, 20) << __func__ << " " << oid << " " << dendl;
+ onode_map.erase(oid);
+}
+
+BlueStore::OnodeRef BlueStore::OnodeSpace::lookup(const ghobject_t& oid)
+{
+ ldout(cache->cct, 30) << __func__ << dendl;
+ OnodeRef o;
+
+ {
+ std::lock_guard l(cache->lock);
+ ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.find(oid);
+ if (p == onode_map.end()) {
+ ldout(cache->cct, 30) << __func__ << " " << oid << " miss" << dendl;
+ cache->logger->inc(l_bluestore_onode_misses);
+ } else {
+ ldout(cache->cct, 30) << __func__ << " " << oid << " hit " << p->second
+ << " " << p->second->nref
+ << " " << p->second->cached
+ << dendl;
+ // This will pin onode and implicitly touch the cache when Onode
+ // eventually will become unpinned
+ o = p->second;
+
+ cache->logger->inc(l_bluestore_onode_hits);
+ }
+ }
+
+ return o;
+}
+
+void BlueStore::OnodeSpace::clear()
+{
+ std::lock_guard l(cache->lock);
+ ldout(cache->cct, 10) << __func__ << " " << onode_map.size()<< dendl;
+ for (auto &p : onode_map) {
+ cache->_rm(p.second.get());
+ }
+ onode_map.clear();
+}
+
+bool BlueStore::OnodeSpace::empty()
+{
+ std::lock_guard l(cache->lock);
+ return onode_map.empty();
+}
+
+void BlueStore::OnodeSpace::rename(
+ OnodeRef& oldo,
+ const ghobject_t& old_oid,
+ const ghobject_t& new_oid,
+ const mempool::bluestore_cache_meta::string& new_okey)
+{
+ std::lock_guard l(cache->lock);
+ ldout(cache->cct, 30) << __func__ << " " << old_oid << " -> " << new_oid
+ << dendl;
+ ceph::unordered_map<ghobject_t,OnodeRef>::iterator po, pn;
+ po = onode_map.find(old_oid);
+ pn = onode_map.find(new_oid);
+ ceph_assert(po != pn);
+
+ ceph_assert(po != onode_map.end());
+ if (pn != onode_map.end()) {
+ ldout(cache->cct, 30) << __func__ << " removing target " << pn->second
+ << dendl;
+ cache->_rm(pn->second.get());
+ onode_map.erase(pn);
+ }
+ OnodeRef o = po->second;
+
+ // install a non-existent onode at old location
+ oldo.reset(new Onode(o->c, old_oid, o->key));
+ po->second = oldo;
+ cache->_add(oldo.get(), 1);
+ // add at new position and fix oid, key.
+ // This will pin 'o' and implicitly touch cache
+ // when it will eventually become unpinned
+ onode_map.insert(make_pair(new_oid, o));
+
+ o->oid = new_oid;
+ o->key = new_okey;
+ cache->_trim();
+}
+
+bool BlueStore::OnodeSpace::map_any(std::function<bool(Onode*)> f)
+{
+ std::lock_guard l(cache->lock);
+ ldout(cache->cct, 20) << __func__ << dendl;
+ for (auto& i : onode_map) {
+ if (f(i.second.get())) {
+ return true;
+ }
+ }
+ return false;
+}
+
+template <int LogLevelV = 30>
+void BlueStore::OnodeSpace::dump(CephContext *cct)
+{
+ for (auto& i : onode_map) {
+ ldout(cct, LogLevelV) << i.first << " : " << i.second
+ << " " << i.second->nref
+ << " " << i.second->cached
+ << dendl;
+ }
+}
+
+// SharedBlob
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.sharedblob(" << this << ") "
+#undef dout_context
+#define dout_context coll->store->cct
+
+void BlueStore::SharedBlob::dump(Formatter* f) const
+{
+ f->dump_bool("loaded", loaded);
+ if (loaded) {
+ persistent->dump(f);
+ } else {
+ f->dump_unsigned("sbid_unloaded", sbid_unloaded);
+ }
+}
+
+ostream& operator<<(ostream& out, const BlueStore::SharedBlob& sb)
+{
+ out << "SharedBlob(" << &sb;
+
+ if (sb.loaded) {
+ out << " loaded " << *sb.persistent;
+ } else {
+ out << " sbid 0x" << std::hex << sb.sbid_unloaded << std::dec;
+ }
+ return out << ")";
+}
+
+BlueStore::SharedBlob::SharedBlob(uint64_t i, Collection *_coll)
+ : coll(_coll), sbid_unloaded(i)
+{
+ ceph_assert(sbid_unloaded > 0);
+ if (get_cache()) {
+ get_cache()->add_blob();
+ }
+}
+
+BlueStore::SharedBlob::~SharedBlob()
+{
+ if (loaded && persistent) {
+ delete persistent;
+ }
+}
+
+void BlueStore::SharedBlob::put()
+{
+ if (--nref == 0) {
+ dout(20) << __func__ << " " << this
+ << " removing self from set " << get_parent()
+ << dendl;
+ again:
+ auto coll_snap = coll;
+ if (coll_snap) {
+ std::lock_guard l(coll_snap->cache->lock);
+ if (coll_snap != coll) {
+ goto again;
+ }
+ if (!coll_snap->shared_blob_set.remove(this, true)) {
+ // race with lookup
+ return;
+ }
+ bc._clear(coll_snap->cache);
+ coll_snap->cache->rm_blob();
+ }
+ delete this;
+ }
+}
+
+void BlueStore::SharedBlob::get_ref(uint64_t offset, uint32_t length)
+{
+ ceph_assert(persistent);
+ persistent->ref_map.get(offset, length);
+}
+
+void BlueStore::SharedBlob::put_ref(uint64_t offset, uint32_t length,
+ PExtentVector *r,
+ bool *unshare)
+{
+ ceph_assert(persistent);
+ persistent->ref_map.put(offset, length, r,
+ unshare && !*unshare ? unshare : nullptr);
+}
+
+void BlueStore::SharedBlob::finish_write(uint64_t seq)
+{
+ while (true) {
+ BufferCacheShard *cache = coll->cache;
+ std::lock_guard l(cache->lock);
+ if (coll->cache != cache) {
+ dout(20) << __func__
+ << " raced with sb cache update, was " << cache
+ << ", now " << coll->cache << ", retrying"
+ << dendl;
+ continue;
+ }
+ bc._finish_write(cache, seq);
+ break;
+ }
+}
+
+// SharedBlobSet
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.sharedblobset(" << this << ") "
+
+template <int LogLevelV = 30>
+void BlueStore::SharedBlobSet::dump(CephContext *cct)
+{
+ std::lock_guard l(lock);
+ for (auto& i : sb_map) {
+ ldout(cct, LogLevelV) << i.first << " : " << *i.second << dendl;
+ }
+}
+
+// Blob
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.blob(" << this << ") "
+
+void BlueStore::Blob::dump(Formatter* f) const
+{
+ if (is_spanning()) {
+ f->dump_unsigned("spanning_id ", id);
+ }
+ blob.dump(f);
+ if (shared_blob) {
+ f->dump_object("shared", *shared_blob);
+ }
+}
+
+ostream& operator<<(ostream& out, const BlueStore::Blob& b)
+{
+ out << "Blob(" << &b;
+ if (b.is_spanning()) {
+ out << " spanning " << b.id;
+ }
+ out << " " << b.get_blob() << " " << b.get_blob_use_tracker();
+ if (b.shared_blob) {
+ out << " " << *b.shared_blob;
+ } else {
+ out << " (shared_blob=NULL)";
+ }
+ out << ")";
+ return out;
+}
+
+void BlueStore::Blob::discard_unallocated(Collection *coll)
+{
+ if (get_blob().is_shared()) {
+ return;
+ }
+ if (get_blob().is_compressed()) {
+ bool discard = false;
+ bool all_invalid = true;
+ for (auto e : get_blob().get_extents()) {
+ if (!e.is_valid()) {
+ discard = true;
+ } else {
+ all_invalid = false;
+ }
+ }
+ ceph_assert(discard == all_invalid); // in case of compressed blob all
+ // or none pextents are invalid.
+ if (discard) {
+ shared_blob->bc.discard(shared_blob->get_cache(), 0,
+ get_blob().get_logical_length());
+ }
+ } else {
+ size_t pos = 0;
+ for (auto e : get_blob().get_extents()) {
+ if (!e.is_valid()) {
+ dout(20) << __func__ << " 0x" << std::hex << pos
+ << "~" << e.length
+ << std::dec << dendl;
+ shared_blob->bc.discard(shared_blob->get_cache(), pos, e.length);
+ }
+ pos += e.length;
+ }
+ if (get_blob().can_prune_tail()) {
+ dirty_blob().prune_tail();
+ used_in_blob.prune_tail(get_blob().get_ondisk_length());
+ dout(20) << __func__ << " pruned tail, now " << get_blob() << dendl;
+ }
+ }
+}
+
+void BlueStore::Blob::get_ref(
+ Collection *coll,
+ uint32_t offset,
+ uint32_t length)
+{
+ // Caller has to initialize Blob's logical length prior to increment
+ // references. Otherwise one is neither unable to determine required
+ // amount of counters in case of per-au tracking nor obtain min_release_size
+ // for single counter mode.
+ ceph_assert(get_blob().get_logical_length() != 0);
+ dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
+ << std::dec << " " << *this << dendl;
+
+ if (used_in_blob.is_empty()) {
+ uint32_t min_release_size =
+ get_blob().get_release_size(coll->store->min_alloc_size);
+ uint64_t l = get_blob().get_logical_length();
+ dout(20) << __func__ << " init 0x" << std::hex << l << ", "
+ << min_release_size << std::dec << dendl;
+ used_in_blob.init(l, min_release_size);
+ }
+ used_in_blob.get(
+ offset,
+ length);
+}
+
+bool BlueStore::Blob::put_ref(
+ Collection *coll,
+ uint32_t offset,
+ uint32_t length,
+ PExtentVector *r)
+{
+ PExtentVector logical;
+
+ dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
+ << std::dec << " " << *this << dendl;
+
+ bool empty = used_in_blob.put(
+ offset,
+ length,
+ &logical);
+ r->clear();
+ // nothing to release
+ if (!empty && logical.empty()) {
+ return false;
+ }
+
+ bluestore_blob_t& b = dirty_blob();
+ return b.release_extents(empty, logical, r);
+}
+
+bool BlueStore::Blob::can_reuse_blob(uint32_t min_alloc_size,
+ uint32_t target_blob_size,
+ uint32_t b_offset,
+ uint32_t *length0) {
+ ceph_assert(min_alloc_size);
+ ceph_assert(target_blob_size);
+ if (!get_blob().is_mutable()) {
+ return false;
+ }
+
+ uint32_t length = *length0;
+ uint32_t end = b_offset + length;
+
+ // Currently for the sake of simplicity we omit blob reuse if data is
+ // unaligned with csum chunk. Later we can perform padding if needed.
+ if (get_blob().has_csum() &&
+ ((b_offset % get_blob().get_csum_chunk_size()) != 0 ||
+ (end % get_blob().get_csum_chunk_size()) != 0)) {
+ return false;
+ }
+
+ auto blen = get_blob().get_logical_length();
+ uint32_t new_blen = blen;
+
+ // make sure target_blob_size isn't less than current blob len
+ target_blob_size = std::max(blen, target_blob_size);
+
+ if (b_offset >= blen) {
+ // new data totally stands out of the existing blob
+ new_blen = end;
+ } else {
+ // new data overlaps with the existing blob
+ new_blen = std::max(blen, end);
+
+ uint32_t overlap = 0;
+ if (new_blen > blen) {
+ overlap = blen - b_offset;
+ } else {
+ overlap = length;
+ }
+
+ if (!get_blob().is_unallocated(b_offset, overlap)) {
+ // abort if any piece of the overlap has already been allocated
+ return false;
+ }
+ }
+
+ if (new_blen > blen) {
+ int64_t overflow = int64_t(new_blen) - target_blob_size;
+ // Unable to decrease the provided length to fit into max_blob_size
+ if (overflow >= length) {
+ return false;
+ }
+
+ // FIXME: in some cases we could reduce unused resolution
+ if (get_blob().has_unused()) {
+ return false;
+ }
+
+ if (overflow > 0) {
+ new_blen -= overflow;
+ length -= overflow;
+ *length0 = length;
+ }
+
+ if (new_blen > blen) {
+ dirty_blob().add_tail(new_blen);
+ used_in_blob.add_tail(new_blen,
+ get_blob().get_release_size(min_alloc_size));
+ }
+ }
+ return true;
+}
+
+void BlueStore::Blob::split(Collection *coll, uint32_t blob_offset, Blob *r)
+{
+ dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
+ << " start " << *this << dendl;
+ ceph_assert(blob.can_split());
+ ceph_assert(used_in_blob.can_split());
+ bluestore_blob_t &lb = dirty_blob();
+ bluestore_blob_t &rb = r->dirty_blob();
+
+ used_in_blob.split(
+ blob_offset,
+ &(r->used_in_blob));
+
+ lb.split(blob_offset, rb);
+ shared_blob->bc.split(shared_blob->get_cache(), blob_offset, r->shared_blob->bc);
+
+ dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
+ << " finish " << *this << dendl;
+ dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
+ << " and " << *r << dendl;
+}
+
+#ifndef CACHE_BLOB_BL
+void BlueStore::Blob::decode(
+ bufferptr::const_iterator& p,
+ uint64_t struct_v,
+ uint64_t* sbid,
+ bool include_ref_map,
+ Collection *coll)
+{
+ denc(blob, p, struct_v);
+ if (blob.is_shared()) {
+ denc(*sbid, p);
+ }
+ if (include_ref_map) {
+ if (struct_v > 1) {
+ used_in_blob.decode(p);
+ } else {
+ used_in_blob.clear();
+ bluestore_extent_ref_map_t legacy_ref_map;
+ legacy_ref_map.decode(p);
+ if (coll) {
+ for (auto r : legacy_ref_map.ref_map) {
+ get_ref(
+ coll,
+ r.first,
+ r.second.refs * r.second.length);
+ }
+ }
+ }
+ }
+}
+#endif
+
+// Extent
+
+void BlueStore::Extent::dump(Formatter* f) const
+{
+ f->dump_unsigned("logical_offset", logical_offset);
+ f->dump_unsigned("length", length);
+ f->dump_unsigned("blob_offset", blob_offset);
+ f->dump_object("blob", *blob);
+}
+
+ostream& operator<<(ostream& out, const BlueStore::Extent& e)
+{
+ return out << std::hex << "0x" << e.logical_offset << "~" << e.length
+ << ": 0x" << e.blob_offset << "~" << e.length << std::dec
+ << " " << *e.blob;
+}
+
+// OldExtent
+BlueStore::OldExtent* BlueStore::OldExtent::create(CollectionRef c,
+ uint32_t lo,
+ uint32_t o,
+ uint32_t l,
+ BlobRef& b) {
+ OldExtent* oe = new OldExtent(lo, o, l, b);
+ b->put_ref(c.get(), o, l, &(oe->r));
+ oe->blob_empty = !b->is_referenced();
+ return oe;
+}
+
+// ExtentMap
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.extentmap(" << this << ") "
+#undef dout_context
+#define dout_context onode->c->store->cct
+
+BlueStore::ExtentMap::ExtentMap(Onode *o, size_t inline_shard_prealloc_size)
+ : onode(o),
+ inline_bl(inline_shard_prealloc_size) {
+}
+
+void BlueStore::ExtentMap::dump(Formatter* f) const
+{
+ f->open_array_section("extents");
+
+ for (auto& e : extent_map) {
+ f->dump_object("extent", e);
+ }
+ f->close_section();
+}
+
+void BlueStore::ExtentMap::dup(BlueStore* b, TransContext* txc,
+ CollectionRef& c, OnodeRef& oldo, OnodeRef& newo, uint64_t& srcoff,
+ uint64_t& length, uint64_t& dstoff) {
+
+ auto cct = onode->c->store->cct;
+ bool inject_21040 =
+ cct->_conf->bluestore_debug_inject_bug21040;
+ vector<BlobRef> id_to_blob(oldo->extent_map.extent_map.size());
+ for (auto& e : oldo->extent_map.extent_map) {
+ e.blob->last_encoded_id = -1;
+ }
+
+ int n = 0;
+ uint64_t end = srcoff + length;
+ uint32_t dirty_range_begin = 0;
+ uint32_t dirty_range_end = 0;
+ bool src_dirty = false;
+ for (auto ep = oldo->extent_map.seek_lextent(srcoff);
+ ep != oldo->extent_map.extent_map.end();
+ ++ep) {
+ auto& e = *ep;
+ if (e.logical_offset >= end) {
+ break;
+ }
+ dout(20) << __func__ << " src " << e << dendl;
+ BlobRef cb;
+ bool blob_duped = true;
+ if (e.blob->last_encoded_id >= 0) {
+ cb = id_to_blob[e.blob->last_encoded_id];
+ blob_duped = false;
+ } else {
+ // dup the blob
+ const bluestore_blob_t& blob = e.blob->get_blob();
+ // make sure it is shared
+ if (!blob.is_shared()) {
+ c->make_blob_shared(b->_assign_blobid(txc), e.blob);
+ if (!inject_21040 && !src_dirty) {
+ src_dirty = true;
+ dirty_range_begin = e.logical_offset;
+ } else if (inject_21040 &&
+ dirty_range_begin == 0 && dirty_range_end == 0) {
+ dirty_range_begin = e.logical_offset;
+ }
+ ceph_assert(e.logical_end() > 0);
+ // -1 to exclude next potential shard
+ dirty_range_end = e.logical_end() - 1;
+ } else {
+ c->load_shared_blob(e.blob->shared_blob);
+ }
+ cb = new Blob();
+ e.blob->last_encoded_id = n;
+ id_to_blob[n] = cb;
+ e.blob->dup(*cb);
+ // bump the extent refs on the copied blob's extents
+ for (auto p : blob.get_extents()) {
+ if (p.is_valid()) {
+ e.blob->shared_blob->get_ref(p.offset, p.length);
+ }
+ }
+ txc->write_shared_blob(e.blob->shared_blob);
+ dout(20) << __func__ << " new " << *cb << dendl;
+ }
+
+ int skip_front, skip_back;
+ if (e.logical_offset < srcoff) {
+ skip_front = srcoff - e.logical_offset;
+ } else {
+ skip_front = 0;
+ }
+ if (e.logical_end() > end) {
+ skip_back = e.logical_end() - end;
+ } else {
+ skip_back = 0;
+ }
+
+ Extent* ne = new Extent(e.logical_offset + skip_front + dstoff - srcoff,
+ e.blob_offset + skip_front, e.length - skip_front - skip_back, cb);
+ newo->extent_map.extent_map.insert(*ne);
+ ne->blob->get_ref(c.get(), ne->blob_offset, ne->length);
+ // fixme: we may leave parts of new blob unreferenced that could
+ // be freed (relative to the shared_blob).
+ txc->statfs_delta.stored() += ne->length;
+ if (e.blob->get_blob().is_compressed()) {
+ txc->statfs_delta.compressed_original() += ne->length;
+ if (blob_duped) {
+ txc->statfs_delta.compressed() +=
+ cb->get_blob().get_compressed_payload_length();
+ }
+ }
+ dout(20) << __func__ << " dst " << *ne << dendl;
+ ++n;
+ }
+ if ((!inject_21040 && src_dirty) ||
+ (inject_21040 && dirty_range_end > dirty_range_begin)) {
+ oldo->extent_map.dirty_range(dirty_range_begin,
+ dirty_range_end - dirty_range_begin);
+ txc->write_onode(oldo);
+ }
+ txc->write_onode(newo);
+
+ if (dstoff + length > newo->onode.size) {
+ newo->onode.size = dstoff + length;
+ }
+ newo->extent_map.dirty_range(dstoff, length);
+}
+void BlueStore::ExtentMap::update(KeyValueDB::Transaction t,
+ bool force)
+{
+ auto cct = onode->c->store->cct; //used by dout
+ dout(20) << __func__ << " " << onode->oid << (force ? " force" : "") << dendl;
+ if (onode->onode.extent_map_shards.empty()) {
+ if (inline_bl.length() == 0) {
+ unsigned n;
+ // we need to encode inline_bl to measure encoded length
+ bool never_happen = encode_some(0, OBJECT_MAX_SIZE, inline_bl, &n);
+ inline_bl.reassign_to_mempool(mempool::mempool_bluestore_inline_bl);
+ ceph_assert(!never_happen);
+ size_t len = inline_bl.length();
+ dout(20) << __func__ << " inline shard " << len << " bytes from " << n
+ << " extents" << dendl;
+ if (!force && len > cct->_conf->bluestore_extent_map_shard_max_size) {
+ request_reshard(0, OBJECT_MAX_SIZE);
+ return;
+ }
+ }
+ // will persist in the onode key.
+ } else {
+ // pending shard update
+ struct dirty_shard_t {
+ Shard *shard;
+ bufferlist bl;
+ dirty_shard_t(Shard *s) : shard(s) {}
+ };
+ vector<dirty_shard_t> encoded_shards;
+ // allocate slots for all shards in a single call instead of
+ // doing multiple allocations - one per each dirty shard
+ encoded_shards.reserve(shards.size());
+
+ auto p = shards.begin();
+ auto prev_p = p;
+ while (p != shards.end()) {
+ ceph_assert(p->shard_info->offset >= prev_p->shard_info->offset);
+ auto n = p;
+ ++n;
+ if (p->dirty) {
+ uint32_t endoff;
+ if (n == shards.end()) {
+ endoff = OBJECT_MAX_SIZE;
+ } else {
+ endoff = n->shard_info->offset;
+ }
+ encoded_shards.emplace_back(dirty_shard_t(&(*p)));
+ bufferlist& bl = encoded_shards.back().bl;
+ if (encode_some(p->shard_info->offset, endoff - p->shard_info->offset,
+ bl, &p->extents)) {
+ if (force) {
+ derr << __func__ << " encode_some needs reshard" << dendl;
+ ceph_assert(!force);
+ }
+ }
+ size_t len = bl.length();
+
+ dout(20) << __func__ << " shard 0x" << std::hex
+ << p->shard_info->offset << std::dec << " is " << len
+ << " bytes (was " << p->shard_info->bytes << ") from "
+ << p->extents << " extents" << dendl;
+
+ if (!force) {
+ if (len > cct->_conf->bluestore_extent_map_shard_max_size) {
+ // we are big; reshard ourselves
+ request_reshard(p->shard_info->offset, endoff);
+ }
+ // avoid resharding the trailing shard, even if it is small
+ else if (n != shards.end() &&
+ len < g_conf()->bluestore_extent_map_shard_min_size) {
+ ceph_assert(endoff != OBJECT_MAX_SIZE);
+ if (p == shards.begin()) {
+ // we are the first shard, combine with next shard
+ request_reshard(p->shard_info->offset, endoff + 1);
+ } else {
+ // combine either with the previous shard or the next,
+ // whichever is smaller
+ if (prev_p->shard_info->bytes > n->shard_info->bytes) {
+ request_reshard(p->shard_info->offset, endoff + 1);
+ } else {
+ request_reshard(prev_p->shard_info->offset, endoff);
+ }
+ }
+ }
+ }
+ }
+ prev_p = p;
+ p = n;
+ }
+ if (needs_reshard()) {
+ return;
+ }
+
+ // schedule DB update for dirty shards
+ string key;
+ for (auto& it : encoded_shards) {
+ dout(20) << __func__ << " encoding key for shard 0x" << std::hex
+ << it.shard->shard_info->offset << std::dec << dendl;
+ it.shard->dirty = false;
+ it.shard->shard_info->bytes = it.bl.length();
+ generate_extent_shard_key_and_apply(
+ onode->key,
+ it.shard->shard_info->offset,
+ &key,
+ [&](const string& final_key) {
+ t->set(PREFIX_OBJ, final_key, it.bl);
+ }
+ );
+ }
+ }
+}
+
+bid_t BlueStore::ExtentMap::allocate_spanning_blob_id()
+{
+ if (spanning_blob_map.empty())
+ return 0;
+ bid_t bid = spanning_blob_map.rbegin()->first + 1;
+ // bid is valid and available.
+ if (bid >= 0)
+ return bid;
+ // Find next unused bid;
+ bid = rand() % (numeric_limits<bid_t>::max() + 1);
+ const auto begin_bid = bid;
+ do {
+ if (!spanning_blob_map.count(bid))
+ return bid;
+ else {
+ bid++;
+ if (bid < 0) bid = 0;
+ }
+ } while (bid != begin_bid);
+ auto cct = onode->c->store->cct; // used by dout
+ _dump_onode<0>(cct, *onode);
+ ceph_abort_msg("no available blob id");
+}
+
+void BlueStore::ExtentMap::reshard(
+ KeyValueDB *db,
+ KeyValueDB::Transaction t)
+{
+ auto cct = onode->c->store->cct; // used by dout
+
+ dout(10) << __func__ << " 0x[" << std::hex << needs_reshard_begin << ","
+ << needs_reshard_end << ")" << std::dec
+ << " of " << onode->onode.extent_map_shards.size()
+ << " shards on " << onode->oid << dendl;
+ for (auto& p : spanning_blob_map) {
+ dout(20) << __func__ << " spanning blob " << p.first << " " << *p.second
+ << dendl;
+ }
+ // determine shard index range
+ unsigned si_begin = 0, si_end = 0;
+ if (!shards.empty()) {
+ while (si_begin + 1 < shards.size() &&
+ shards[si_begin + 1].shard_info->offset <= needs_reshard_begin) {
+ ++si_begin;
+ }
+ needs_reshard_begin = shards[si_begin].shard_info->offset;
+ for (si_end = si_begin; si_end < shards.size(); ++si_end) {
+ if (shards[si_end].shard_info->offset >= needs_reshard_end) {
+ needs_reshard_end = shards[si_end].shard_info->offset;
+ break;
+ }
+ }
+ if (si_end == shards.size()) {
+ needs_reshard_end = OBJECT_MAX_SIZE;
+ }
+ dout(20) << __func__ << " shards [" << si_begin << "," << si_end << ")"
+ << " over 0x[" << std::hex << needs_reshard_begin << ","
+ << needs_reshard_end << ")" << std::dec << dendl;
+ }
+
+ fault_range(db, needs_reshard_begin, (needs_reshard_end - needs_reshard_begin));
+
+ // we may need to fault in a larger interval later must have all
+ // referring extents for spanning blobs loaded in order to have
+ // accurate use_tracker values.
+ uint32_t spanning_scan_begin = needs_reshard_begin;
+ uint32_t spanning_scan_end = needs_reshard_end;
+
+ // remove old keys
+ string key;
+ for (unsigned i = si_begin; i < si_end; ++i) {
+ generate_extent_shard_key_and_apply(
+ onode->key, shards[i].shard_info->offset, &key,
+ [&](const string& final_key) {
+ t->rmkey(PREFIX_OBJ, final_key);
+ }
+ );
+ }
+
+ // calculate average extent size
+ unsigned bytes = 0;
+ unsigned extents = 0;
+ if (onode->onode.extent_map_shards.empty()) {
+ bytes = inline_bl.length();
+ extents = extent_map.size();
+ } else {
+ for (unsigned i = si_begin; i < si_end; ++i) {
+ bytes += shards[i].shard_info->bytes;
+ extents += shards[i].extents;
+ }
+ }
+ unsigned target = cct->_conf->bluestore_extent_map_shard_target_size;
+ unsigned slop = target *
+ cct->_conf->bluestore_extent_map_shard_target_size_slop;
+ unsigned extent_avg = bytes / std::max(1u, extents);
+ dout(20) << __func__ << " extent_avg " << extent_avg << ", target " << target
+ << ", slop " << slop << dendl;
+
+ // reshard
+ unsigned estimate = 0;
+ unsigned offset = needs_reshard_begin;
+ vector<bluestore_onode_t::shard_info> new_shard_info;
+ unsigned max_blob_end = 0;
+ Extent dummy(needs_reshard_begin);
+ for (auto e = extent_map.lower_bound(dummy);
+ e != extent_map.end();
+ ++e) {
+ if (e->logical_offset >= needs_reshard_end) {
+ break;
+ }
+ dout(30) << " extent " << *e << dendl;
+
+ // disfavor shard boundaries that span a blob
+ bool would_span = (e->logical_offset < max_blob_end) || e->blob_offset;
+ if (estimate &&
+ estimate + extent_avg > target + (would_span ? slop : 0)) {
+ // new shard
+ if (offset == needs_reshard_begin) {
+ new_shard_info.emplace_back(bluestore_onode_t::shard_info());
+ new_shard_info.back().offset = offset;
+ dout(20) << __func__ << " new shard 0x" << std::hex << offset
+ << std::dec << dendl;
+ }
+ offset = e->logical_offset;
+ new_shard_info.emplace_back(bluestore_onode_t::shard_info());
+ new_shard_info.back().offset = offset;
+ dout(20) << __func__ << " new shard 0x" << std::hex << offset
+ << std::dec << dendl;
+ estimate = 0;
+ }
+ estimate += extent_avg;
+ unsigned bs = e->blob_start();
+ if (bs < spanning_scan_begin) {
+ spanning_scan_begin = bs;
+ }
+ uint32_t be = e->blob_end();
+ if (be > max_blob_end) {
+ max_blob_end = be;
+ }
+ if (be > spanning_scan_end) {
+ spanning_scan_end = be;
+ }
+ }
+ if (new_shard_info.empty() && (si_begin > 0 ||
+ si_end < shards.size())) {
+ // we resharded a partial range; we must produce at least one output
+ // shard
+ new_shard_info.emplace_back(bluestore_onode_t::shard_info());
+ new_shard_info.back().offset = needs_reshard_begin;
+ dout(20) << __func__ << " new shard 0x" << std::hex << needs_reshard_begin
+ << std::dec << " (singleton degenerate case)" << dendl;
+ }
+
+ auto& sv = onode->onode.extent_map_shards;
+ dout(20) << __func__ << " new " << new_shard_info << dendl;
+ dout(20) << __func__ << " old " << sv << dendl;
+ if (sv.empty()) {
+ // no old shards to keep
+ sv.swap(new_shard_info);
+ init_shards(true, true);
+ } else {
+ // splice in new shards
+ sv.erase(sv.begin() + si_begin, sv.begin() + si_end);
+ shards.erase(shards.begin() + si_begin, shards.begin() + si_end);
+ sv.insert(
+ sv.begin() + si_begin,
+ new_shard_info.begin(),
+ new_shard_info.end());
+ shards.insert(shards.begin() + si_begin, new_shard_info.size(), Shard());
+ si_end = si_begin + new_shard_info.size();
+
+ ceph_assert(sv.size() == shards.size());
+
+ // note that we need to update every shard_info of shards here,
+ // as sv might have been totally re-allocated above
+ for (unsigned i = 0; i < shards.size(); i++) {
+ shards[i].shard_info = &sv[i];
+ }
+
+ // mark newly added shards as dirty
+ for (unsigned i = si_begin; i < si_end; ++i) {
+ shards[i].loaded = true;
+ shards[i].dirty = true;
+ }
+ }
+ dout(20) << __func__ << " fin " << sv << dendl;
+ inline_bl.clear();
+
+ if (sv.empty()) {
+ // no more shards; unspan all previously spanning blobs
+ auto p = spanning_blob_map.begin();
+ while (p != spanning_blob_map.end()) {
+ p->second->id = -1;
+ dout(30) << __func__ << " un-spanning " << *p->second << dendl;
+ p = spanning_blob_map.erase(p);
+ }
+ } else {
+ // identify new spanning blobs
+ dout(20) << __func__ << " checking spanning blobs 0x[" << std::hex
+ << spanning_scan_begin << "," << spanning_scan_end << ")" << dendl;
+ if (spanning_scan_begin < needs_reshard_begin) {
+ fault_range(db, spanning_scan_begin,
+ needs_reshard_begin - spanning_scan_begin);
+ }
+ if (spanning_scan_end > needs_reshard_end) {
+ fault_range(db, needs_reshard_end,
+ spanning_scan_end - needs_reshard_end);
+ }
+ auto sp = sv.begin() + si_begin;
+ auto esp = sv.end();
+ unsigned shard_start = sp->offset;
+ unsigned shard_end;
+ ++sp;
+ if (sp == esp) {
+ shard_end = OBJECT_MAX_SIZE;
+ } else {
+ shard_end = sp->offset;
+ }
+ Extent dummy(needs_reshard_begin);
+
+ bool was_too_many_blobs_check = false;
+ auto too_many_blobs_threshold =
+ g_conf()->bluestore_debug_too_many_blobs_threshold;
+ auto& dumped_onodes = onode->c->onode_space.cache->dumped_onodes;
+ decltype(onode->c->onode_space.cache->dumped_onodes)::value_type* oid_slot = nullptr;
+ decltype(onode->c->onode_space.cache->dumped_onodes)::value_type* oldest_slot = nullptr;
+
+ for (auto e = extent_map.lower_bound(dummy); e != extent_map.end(); ++e) {
+ if (e->logical_offset >= needs_reshard_end) {
+ break;
+ }
+ dout(30) << " extent " << *e << dendl;
+ while (e->logical_offset >= shard_end) {
+ shard_start = shard_end;
+ ceph_assert(sp != esp);
+ ++sp;
+ if (sp == esp) {
+ shard_end = OBJECT_MAX_SIZE;
+ } else {
+ shard_end = sp->offset;
+ }
+ dout(30) << __func__ << " shard 0x" << std::hex << shard_start
+ << " to 0x" << shard_end << std::dec << dendl;
+ }
+
+ if (e->blob_escapes_range(shard_start, shard_end - shard_start)) {
+ if (!e->blob->is_spanning()) {
+ // We have two options: (1) split the blob into pieces at the
+ // shard boundaries (and adjust extents accordingly), or (2)
+ // mark it spanning. We prefer to cut the blob if we can. Note that
+ // we may have to split it multiple times--potentially at every
+ // shard boundary.
+ bool must_span = false;
+ BlobRef b = e->blob;
+ if (b->can_split()) {
+ uint32_t bstart = e->blob_start();
+ uint32_t bend = e->blob_end();
+ for (const auto& sh : shards) {
+ if (bstart < sh.shard_info->offset &&
+ bend > sh.shard_info->offset) {
+ uint32_t blob_offset = sh.shard_info->offset - bstart;
+ if (b->can_split_at(blob_offset)) {
+ dout(20) << __func__ << " splitting blob, bstart 0x"
+ << std::hex << bstart << " blob_offset 0x"
+ << blob_offset << std::dec << " " << *b << dendl;
+ b = split_blob(b, blob_offset, sh.shard_info->offset);
+ // switch b to the new right-hand side, in case it
+ // *also* has to get split.
+ bstart += blob_offset;
+ onode->c->store->logger->inc(l_bluestore_blob_split);
+ } else {
+ must_span = true;
+ break;
+ }
+ }
+ }
+ } else {
+ must_span = true;
+ }
+ if (must_span) {
+ auto bid = allocate_spanning_blob_id();
+ b->id = bid;
+ spanning_blob_map[b->id] = b;
+ dout(20) << __func__ << " adding spanning " << *b << dendl;
+ if (!was_too_many_blobs_check &&
+ too_many_blobs_threshold &&
+ spanning_blob_map.size() >= size_t(too_many_blobs_threshold)) {
+
+ was_too_many_blobs_check = true;
+ for (size_t i = 0; i < dumped_onodes.size(); ++i) {
+ if (dumped_onodes[i].first == onode->oid) {
+ oid_slot = &dumped_onodes[i];
+ break;
+ }
+ if (!oldest_slot || (oldest_slot &&
+ dumped_onodes[i].second < oldest_slot->second)) {
+ oldest_slot = &dumped_onodes[i];
+ }
+ }
+ }
+ }
+ }
+ } else {
+ if (e->blob->is_spanning()) {
+ spanning_blob_map.erase(e->blob->id);
+ e->blob->id = -1;
+ dout(30) << __func__ << " un-spanning " << *e->blob << dendl;
+ }
+ }
+ }
+ bool do_dump = (!oid_slot && was_too_many_blobs_check) ||
+ (oid_slot &&
+ (mono_clock::now() - oid_slot->second >= make_timespan(5 * 60)));
+ if (do_dump) {
+ dout(0) << __func__
+ << " spanning blob count exceeds threshold, "
+ << spanning_blob_map.size() << " spanning blobs"
+ << dendl;
+ _dump_onode<0>(cct, *onode);
+ if (oid_slot) {
+ oid_slot->second = mono_clock::now();
+ } else {
+ ceph_assert(oldest_slot);
+ oldest_slot->first = onode->oid;
+ oldest_slot->second = mono_clock::now();
+ }
+ }
+ }
+
+ clear_needs_reshard();
+}
+
+bool BlueStore::ExtentMap::encode_some(
+ uint32_t offset,
+ uint32_t length,
+ bufferlist& bl,
+ unsigned *pn)
+{
+ Extent dummy(offset);
+ auto start = extent_map.lower_bound(dummy);
+ uint32_t end = offset + length;
+
+ __u8 struct_v = 2; // Version 2 differs from v1 in blob's ref_map
+ // serialization only. Hence there is no specific
+ // handling at ExtentMap level.
+
+ unsigned n = 0;
+ size_t bound = 0;
+ bool must_reshard = false;
+ for (auto p = start;
+ p != extent_map.end() && p->logical_offset < end;
+ ++p, ++n) {
+ ceph_assert(p->logical_offset >= offset);
+ p->blob->last_encoded_id = -1;
+ if (!p->blob->is_spanning() && p->blob_escapes_range(offset, length)) {
+ dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
+ << std::dec << " hit new spanning blob " << *p << dendl;
+ request_reshard(p->blob_start(), p->blob_end());
+ must_reshard = true;
+ }
+ if (!must_reshard) {
+ denc_varint(0, bound); // blobid
+ denc_varint(0, bound); // logical_offset
+ denc_varint(0, bound); // len
+ denc_varint(0, bound); // blob_offset
+
+ p->blob->bound_encode(
+ bound,
+ struct_v,
+ p->blob->shared_blob->get_sbid(),
+ false);
+ }
+ }
+ if (must_reshard) {
+ return true;
+ }
+
+ denc(struct_v, bound);
+ denc_varint(0, bound); // number of extents
+
+ {
+ auto app = bl.get_contiguous_appender(bound);
+ denc(struct_v, app);
+ denc_varint(n, app);
+ if (pn) {
+ *pn = n;
+ }
+
+ n = 0;
+ uint64_t pos = 0;
+ uint64_t prev_len = 0;
+ for (auto p = start;
+ p != extent_map.end() && p->logical_offset < end;
+ ++p, ++n) {
+ unsigned blobid;
+ bool include_blob = false;
+ if (p->blob->is_spanning()) {
+ blobid = p->blob->id << BLOBID_SHIFT_BITS;
+ blobid |= BLOBID_FLAG_SPANNING;
+ } else if (p->blob->last_encoded_id < 0) {
+ p->blob->last_encoded_id = n + 1; // so it is always non-zero
+ include_blob = true;
+ blobid = 0; // the decoder will infer the id from n
+ } else {
+ blobid = p->blob->last_encoded_id << BLOBID_SHIFT_BITS;
+ }
+ if (p->logical_offset == pos) {
+ blobid |= BLOBID_FLAG_CONTIGUOUS;
+ }
+ if (p->blob_offset == 0) {
+ blobid |= BLOBID_FLAG_ZEROOFFSET;
+ }
+ if (p->length == prev_len) {
+ blobid |= BLOBID_FLAG_SAMELENGTH;
+ } else {
+ prev_len = p->length;
+ }
+ denc_varint(blobid, app);
+ if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
+ denc_varint_lowz(p->logical_offset - pos, app);
+ }
+ if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
+ denc_varint_lowz(p->blob_offset, app);
+ }
+ if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
+ denc_varint_lowz(p->length, app);
+ }
+ pos = p->logical_end();
+ if (include_blob) {
+ p->blob->encode(app, struct_v, p->blob->shared_blob->get_sbid(), false);
+ }
+ }
+ }
+ /*derr << __func__ << bl << dendl;
+ derr << __func__ << ":";
+ bl.hexdump(*_dout);
+ *_dout << dendl;
+ */
+ return false;
+}
+
+/////////////////// BlueStore::ExtentMap::DecoderExtent ///////////
+void BlueStore::ExtentMap::ExtentDecoder::decode_extent(
+ Extent* le,
+ __u8 struct_v,
+ bptr_c_it_t& p,
+ Collection* c)
+{
+ uint64_t blobid;
+ denc_varint(blobid, p);
+ if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
+ uint64_t gap;
+ denc_varint_lowz(gap, p);
+ pos += gap;
+ }
+ le->logical_offset = pos;
+ if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
+ denc_varint_lowz(le->blob_offset, p);
+ } else {
+ le->blob_offset = 0;
+ }
+ if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
+ denc_varint_lowz(prev_len, p);
+ }
+ le->length = prev_len;
+ if (blobid & BLOBID_FLAG_SPANNING) {
+ consume_blobid(le, true, blobid >> BLOBID_SHIFT_BITS);
+ } else {
+ blobid >>= BLOBID_SHIFT_BITS;
+ if (blobid) {
+ consume_blobid(le, false, blobid - 1);
+ } else {
+ Blob *b = new Blob();
+ uint64_t sbid = 0;
+ b->decode(p, struct_v, &sbid, false, c);
+ consume_blob(le, extent_pos, sbid, b);
+ }
+ }
+ pos += prev_len;
+ ++extent_pos;
+}
+
+unsigned BlueStore::ExtentMap::ExtentDecoder::decode_some(
+ const bufferlist& bl, Collection* c)
+{
+ __u8 struct_v;
+ uint32_t num;
+
+ ceph_assert(bl.get_num_buffers() <= 1);
+ auto p = bl.front().begin_deep();
+ denc(struct_v, p);
+ // Version 2 differs from v1 in blob's ref_map
+ // serialization only. Hence there is no specific
+ // handling at ExtentMap level below.
+ ceph_assert(struct_v == 1 || struct_v == 2);
+ denc_varint(num, p);
+
+ extent_pos = 0;
+ while (!p.end()) {
+ Extent* le = get_next_extent();
+ decode_extent(le, struct_v, p, c);
+ add_extent(le);
+ }
+ ceph_assert(extent_pos == num);
+ return num;
+}
+
+void BlueStore::ExtentMap::ExtentDecoder::decode_spanning_blobs(
+ bptr_c_it_t& p, Collection* c)
+{
+ __u8 struct_v;
+ denc(struct_v, p);
+ // Version 2 differs from v1 in blob's ref_map
+ // serialization only. Hence there is no specific
+ // handling at ExtentMap level.
+ ceph_assert(struct_v == 1 || struct_v == 2);
+
+ unsigned n;
+ denc_varint(n, p);
+ while (n--) {
+ BlueStore::BlobRef b(new Blob());
+ denc_varint(b->id, p);
+ uint64_t sbid = 0;
+ b->decode(p, struct_v, &sbid, true, c);
+ consume_spanning_blob(sbid, b);
+ }
+}
+
+/////////////////// BlueStore::ExtentMap::DecoderExtentFull ///////////
+void BlueStore::ExtentMap::ExtentDecoderFull::consume_blobid(
+ BlueStore::Extent* le, bool spanning, uint64_t blobid) {
+ ceph_assert(le);
+ if (spanning) {
+ le->assign_blob(extent_map.get_spanning_blob(blobid));
+ } else {
+ ceph_assert(blobid < blobs.size());
+ le->assign_blob(blobs[blobid]);
+ // we build ref_map dynamically for non-spanning blobs
+ le->blob->get_ref(
+ extent_map.onode->c,
+ le->blob_offset,
+ le->length);
+ }
+}
+
+void BlueStore::ExtentMap::ExtentDecoderFull::consume_blob(
+ BlueStore::Extent* le, uint64_t extent_no, uint64_t sbid, BlobRef b) {
+ ceph_assert(le);
+ blobs.resize(extent_no + 1);
+ blobs[extent_no] = b;
+ extent_map.onode->c->open_shared_blob(sbid, b);
+ le->assign_blob(b);
+ le->blob->get_ref(
+ extent_map.onode->c,
+ le->blob_offset,
+ le->length);
+}
+
+void BlueStore::ExtentMap::ExtentDecoderFull::consume_spanning_blob(
+ uint64_t sbid, BlueStore::BlobRef b) {
+ extent_map.spanning_blob_map[b->id] = b;
+ extent_map.onode->c->open_shared_blob(sbid, b);
+}
+
+BlueStore::Extent* BlueStore::ExtentMap::ExtentDecoderFull::get_next_extent()
+{
+ return new Extent();
+}
+
+void BlueStore::ExtentMap::ExtentDecoderFull::add_extent(BlueStore::Extent* le)
+{
+ extent_map.extent_map.insert(*le);
+}
+
+unsigned BlueStore::ExtentMap::decode_some(bufferlist& bl)
+{
+ ExtentDecoderFull edecoder(*this);
+ unsigned n = edecoder.decode_some(bl, onode->c);
+ return n;
+}
+
+void BlueStore::ExtentMap::bound_encode_spanning_blobs(size_t& p)
+{
+ // Version 2 differs from v1 in blob's ref_map
+ // serialization only. Hence there is no specific
+ // handling at ExtentMap level.
+ __u8 struct_v = 2;
+
+ denc(struct_v, p);
+ denc_varint((uint32_t)0, p);
+ size_t key_size = 0;
+ denc_varint((uint32_t)0, key_size);
+ p += spanning_blob_map.size() * key_size;
+ for (const auto& i : spanning_blob_map) {
+ i.second->bound_encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
+ }
+}
+
+void BlueStore::ExtentMap::encode_spanning_blobs(
+ bufferlist::contiguous_appender& p)
+{
+ // Version 2 differs from v1 in blob's ref_map
+ // serialization only. Hence there is no specific
+ // handling at ExtentMap level.
+ __u8 struct_v = 2;
+
+ denc(struct_v, p);
+ denc_varint(spanning_blob_map.size(), p);
+ for (auto& i : spanning_blob_map) {
+ denc_varint(i.second->id, p);
+ i.second->encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
+ }
+}
+
+void BlueStore::ExtentMap::init_shards(bool loaded, bool dirty)
+{
+ shards.resize(onode->onode.extent_map_shards.size());
+ unsigned i = 0;
+ for (auto &s : onode->onode.extent_map_shards) {
+ shards[i].shard_info = &s;
+ shards[i].loaded = loaded;
+ shards[i].dirty = dirty;
+ ++i;
+ }
+}
+
+void BlueStore::ExtentMap::fault_range(
+ KeyValueDB *db,
+ uint32_t offset,
+ uint32_t length)
+{
+ dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
+ << std::dec << dendl;
+ auto start = seek_shard(offset);
+ auto last = seek_shard(offset + length);
+
+ if (start < 0)
+ return;
+
+ ceph_assert(last >= start);
+ string key;
+ while (start <= last) {
+ ceph_assert((size_t)start < shards.size());
+ auto p = &shards[start];
+ if (!p->loaded) {
+ dout(30) << __func__ << " opening shard 0x" << std::hex
+ << p->shard_info->offset << std::dec << dendl;
+ bufferlist v;
+ generate_extent_shard_key_and_apply(
+ onode->key, p->shard_info->offset, &key,
+ [&](const string& final_key) {
+ int r = db->get(PREFIX_OBJ, final_key, &v);
+ if (r < 0) {
+ derr << __func__ << " missing shard 0x" << std::hex
+ << p->shard_info->offset << std::dec << " for " << onode->oid
+ << dendl;
+ ceph_assert(r >= 0);
+ }
+ }
+ );
+ p->extents = decode_some(v);
+ p->loaded = true;
+ dout(20) << __func__ << " open shard 0x" << std::hex
+ << p->shard_info->offset
+ << " for range 0x" << offset << "~" << length << std::dec
+ << " (" << v.length() << " bytes)" << dendl;
+ ceph_assert(p->dirty == false);
+ ceph_assert(v.length() == p->shard_info->bytes);
+ onode->c->store->logger->inc(l_bluestore_onode_shard_misses);
+ } else {
+ onode->c->store->logger->inc(l_bluestore_onode_shard_hits);
+ }
+ ++start;
+ }
+}
+
+void BlueStore::ExtentMap::dirty_range(
+ uint32_t offset,
+ uint32_t length)
+{
+ dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
+ << std::dec << dendl;
+ if (shards.empty()) {
+ dout(20) << __func__ << " mark inline shard dirty" << dendl;
+ inline_bl.clear();
+ return;
+ }
+ auto start = seek_shard(offset);
+ if (length == 0) {
+ length = 1;
+ }
+ auto last = seek_shard(offset + length - 1);
+ if (start < 0)
+ return;
+
+ ceph_assert(last >= start);
+ while (start <= last) {
+ ceph_assert((size_t)start < shards.size());
+ auto p = &shards[start];
+ if (!p->loaded) {
+ derr << __func__ << "on write 0x" << std::hex << offset
+ << "~" << length << " shard 0x" << p->shard_info->offset
+ << std::dec << " is not loaded, can't mark dirty" << dendl;
+ ceph_abort_msg("can't mark unloaded shard dirty");
+ }
+ if (!p->dirty) {
+ dout(20) << __func__ << " mark shard 0x" << std::hex
+ << p->shard_info->offset << std::dec << " dirty" << dendl;
+ p->dirty = true;
+ }
+ ++start;
+ }
+}
+
+BlueStore::extent_map_t::iterator BlueStore::ExtentMap::find(
+ uint64_t offset)
+{
+ Extent dummy(offset);
+ return extent_map.find(dummy);
+}
+
+BlueStore::extent_map_t::iterator BlueStore::ExtentMap::seek_lextent(
+ uint64_t offset)
+{
+ Extent dummy(offset);
+ auto fp = extent_map.lower_bound(dummy);
+ if (fp != extent_map.begin()) {
+ --fp;
+ if (fp->logical_end() <= offset) {
+ ++fp;
+ }
+ }
+ return fp;
+}
+
+BlueStore::extent_map_t::const_iterator BlueStore::ExtentMap::seek_lextent(
+ uint64_t offset) const
+{
+ Extent dummy(offset);
+ auto fp = extent_map.lower_bound(dummy);
+ if (fp != extent_map.begin()) {
+ --fp;
+ if (fp->logical_end() <= offset) {
+ ++fp;
+ }
+ }
+ return fp;
+}
+
+bool BlueStore::ExtentMap::has_any_lextents(uint64_t offset, uint64_t length)
+{
+ auto fp = seek_lextent(offset);
+ if (fp == extent_map.end() || fp->logical_offset >= offset + length) {
+ return false;
+ }
+ return true;
+}
+
+int BlueStore::ExtentMap::compress_extent_map(
+ uint64_t offset,
+ uint64_t length)
+{
+ if (extent_map.empty())
+ return 0;
+ int removed = 0;
+ auto p = seek_lextent(offset);
+ if (p != extent_map.begin()) {
+ --p; // start to the left of offset
+ }
+ // the caller should have just written to this region
+ ceph_assert(p != extent_map.end());
+
+ // identify the *next* shard
+ auto pshard = shards.begin();
+ while (pshard != shards.end() &&
+ p->logical_offset >= pshard->shard_info->offset) {
+ ++pshard;
+ }
+ uint64_t shard_end;
+ if (pshard != shards.end()) {
+ shard_end = pshard->shard_info->offset;
+ } else {
+ shard_end = OBJECT_MAX_SIZE;
+ }
+
+ auto n = p;
+ for (++n; n != extent_map.end(); p = n++) {
+ if (n->logical_offset > offset + length) {
+ break; // stop after end
+ }
+ while (n != extent_map.end() &&
+ p->logical_end() == n->logical_offset &&
+ p->blob == n->blob &&
+ p->blob_offset + p->length == n->blob_offset &&
+ n->logical_offset < shard_end) {
+ dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
+ << " next shard 0x" << shard_end << std::dec
+ << " merging " << *p << " and " << *n << dendl;
+ p->length += n->length;
+ rm(n++);
+ ++removed;
+ }
+ if (n == extent_map.end()) {
+ break;
+ }
+ if (n->logical_offset >= shard_end) {
+ ceph_assert(pshard != shards.end());
+ ++pshard;
+ if (pshard != shards.end()) {
+ shard_end = pshard->shard_info->offset;
+ } else {
+ shard_end = OBJECT_MAX_SIZE;
+ }
+ }
+ }
+ if (removed) {
+ onode->c->store->logger->inc(l_bluestore_extent_compress, removed);
+ }
+ return removed;
+}
+
+void BlueStore::ExtentMap::punch_hole(
+ CollectionRef &c,
+ uint64_t offset,
+ uint64_t length,
+ old_extent_map_t *old_extents)
+{
+ auto p = seek_lextent(offset);
+ uint64_t end = offset + length;
+ while (p != extent_map.end()) {
+ if (p->logical_offset >= end) {
+ break;
+ }
+ if (p->logical_offset < offset) {
+ if (p->logical_end() > end) {
+ // split and deref middle
+ uint64_t front = offset - p->logical_offset;
+ OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + front,
+ length, p->blob);
+ old_extents->push_back(*oe);
+ add(end,
+ p->blob_offset + front + length,
+ p->length - front - length,
+ p->blob);
+ p->length = front;
+ break;
+ } else {
+ // deref tail
+ ceph_assert(p->logical_end() > offset); // else seek_lextent bug
+ uint64_t keep = offset - p->logical_offset;
+ OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + keep,
+ p->length - keep, p->blob);
+ old_extents->push_back(*oe);
+ p->length = keep;
+ ++p;
+ continue;
+ }
+ }
+ if (p->logical_offset + p->length <= end) {
+ // deref whole lextent
+ OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
+ p->length, p->blob);
+ old_extents->push_back(*oe);
+ rm(p++);
+ continue;
+ }
+ // deref head
+ uint64_t keep = p->logical_end() - end;
+ OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
+ p->length - keep, p->blob);
+ old_extents->push_back(*oe);
+
+ add(end, p->blob_offset + p->length - keep, keep, p->blob);
+ rm(p);
+ break;
+ }
+}
+
+BlueStore::Extent *BlueStore::ExtentMap::set_lextent(
+ CollectionRef &c,
+ uint64_t logical_offset,
+ uint64_t blob_offset, uint64_t length, BlobRef b,
+ old_extent_map_t *old_extents)
+{
+ // We need to have completely initialized Blob to increment its ref counters.
+ ceph_assert(b->get_blob().get_logical_length() != 0);
+
+ // Do get_ref prior to punch_hole to prevent from putting reused blob into
+ // old_extents list if we overwre the blob totally
+ // This might happen during WAL overwrite.
+ b->get_ref(onode->c, blob_offset, length);
+
+ if (old_extents) {
+ punch_hole(c, logical_offset, length, old_extents);
+ }
+
+ Extent *le = new Extent(logical_offset, blob_offset, length, b);
+ extent_map.insert(*le);
+ if (spans_shard(logical_offset, length)) {
+ request_reshard(logical_offset, logical_offset + length);
+ }
+ return le;
+}
+
+BlueStore::BlobRef BlueStore::ExtentMap::split_blob(
+ BlobRef lb,
+ uint32_t blob_offset,
+ uint32_t pos)
+{
+ uint32_t end_pos = pos + lb->get_blob().get_logical_length() - blob_offset;
+ dout(20) << __func__ << " 0x" << std::hex << pos << " end 0x" << end_pos
+ << " blob_offset 0x" << blob_offset << std::dec << " " << *lb
+ << dendl;
+ BlobRef rb = onode->c->new_blob();
+ lb->split(onode->c, blob_offset, rb.get());
+
+ for (auto ep = seek_lextent(pos);
+ ep != extent_map.end() && ep->logical_offset < end_pos;
+ ++ep) {
+ if (ep->blob != lb) {
+ continue;
+ }
+ if (ep->logical_offset < pos) {
+ // split extent
+ size_t left = pos - ep->logical_offset;
+ Extent *ne = new Extent(pos, 0, ep->length - left, rb);
+ extent_map.insert(*ne);
+ ep->length = left;
+ dout(30) << __func__ << " split " << *ep << dendl;
+ dout(30) << __func__ << " to " << *ne << dendl;
+ } else {
+ // switch blob
+ ceph_assert(ep->blob_offset >= blob_offset);
+
+ ep->blob = rb;
+ ep->blob_offset -= blob_offset;
+ dout(30) << __func__ << " adjusted " << *ep << dendl;
+ }
+ }
+ return rb;
+}
+
+// Onode
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.onode(" << this << ")." << __func__ << " "
+
+const std::string& BlueStore::Onode::calc_omap_prefix(uint8_t flags)
+{
+ if (bluestore_onode_t::is_pgmeta_omap(flags)) {
+ return PREFIX_PGMETA_OMAP;
+ }
+ if (bluestore_onode_t::is_perpg_omap(flags)) {
+ return PREFIX_PERPG_OMAP;
+ }
+ if (bluestore_onode_t::is_perpool_omap(flags)) {
+ return PREFIX_PERPOOL_OMAP;
+ }
+ return PREFIX_OMAP;
+}
+
+// '-' < '.' < '~'
+void BlueStore::Onode::calc_omap_header(
+ uint8_t flags,
+ const Onode* o,
+ std::string* out)
+{
+ if (!bluestore_onode_t::is_pgmeta_omap(flags)) {
+ if (bluestore_onode_t::is_perpg_omap(flags)) {
+ _key_encode_u64(o->c->pool(), out);
+ _key_encode_u32(o->oid.hobj.get_bitwise_key_u32(), out);
+ } else if (bluestore_onode_t::is_perpool_omap(flags)) {
+ _key_encode_u64(o->c->pool(), out);
+ }
+ }
+ _key_encode_u64(o->onode.nid, out);
+ out->push_back('-');
+}
+
+void BlueStore::Onode::calc_omap_key(uint8_t flags,
+ const Onode* o,
+ const std::string& key,
+ std::string* out)
+{
+ if (!bluestore_onode_t::is_pgmeta_omap(flags)) {
+ if (bluestore_onode_t::is_perpg_omap(flags)) {
+ _key_encode_u64(o->c->pool(), out);
+ _key_encode_u32(o->oid.hobj.get_bitwise_key_u32(), out);
+ } else if (bluestore_onode_t::is_perpool_omap(flags)) {
+ _key_encode_u64(o->c->pool(), out);
+ }
+ }
+ _key_encode_u64(o->onode.nid, out);
+ out->push_back('.');
+ out->append(key);
+}
+
+void BlueStore::Onode::calc_omap_tail(
+ uint8_t flags,
+ const Onode* o,
+ std::string* out)
+{
+ if (!bluestore_onode_t::is_pgmeta_omap(flags)) {
+ if (bluestore_onode_t::is_perpg_omap(flags)) {
+ _key_encode_u64(o->c->pool(), out);
+ _key_encode_u32(o->oid.hobj.get_bitwise_key_u32(), out);
+ } else if (bluestore_onode_t::is_perpool_omap(flags)) {
+ _key_encode_u64(o->c->pool(), out);
+ }
+ }
+ _key_encode_u64(o->onode.nid, out);
+ out->push_back('~');
+}
+
+void BlueStore::Onode::get()
+{
+ ++nref;
+ ++pin_nref;
+}
+void BlueStore::Onode::put()
+{
+ if (--pin_nref == 1) {
+ c->get_onode_cache()->maybe_unpin(this);
+ }
+ if (--nref == 0) {
+ delete this;
+ }
+}
+
+void BlueStore::Onode::decode_raw(
+ BlueStore::Onode* on,
+ const bufferlist& v,
+ BlueStore::ExtentMap::ExtentDecoder& edecoder)
+{
+ on->exists = true;
+ auto p = v.front().begin_deep();
+ on->onode.decode(p);
+
+ // initialize extent_map
+ edecoder.decode_spanning_blobs(p, on->c);
+ if (on->onode.extent_map_shards.empty()) {
+ denc(on->extent_map.inline_bl, p);
+ edecoder.decode_some(on->extent_map.inline_bl, on->c);
+ }
+}
+
+BlueStore::Onode* BlueStore::Onode::create_decode(
+ CollectionRef c,
+ const ghobject_t& oid,
+ const string& key,
+ const bufferlist& v,
+ bool allow_empty)
+{
+ ceph_assert(v.length() || allow_empty);
+ Onode* on = new Onode(c.get(), oid, key);
+
+ if (v.length()) {
+ ExtentMap::ExtentDecoderFull edecoder(on->extent_map);
+ decode_raw(on, v, edecoder);
+
+ for (auto& i : on->onode.attrs) {
+ i.second.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
+ }
+
+ // initialize extent_map
+ if (on->onode.extent_map_shards.empty()) {
+ on->extent_map.inline_bl.reassign_to_mempool(
+ mempool::mempool_bluestore_cache_data);
+ } else {
+ on->extent_map.init_shards(false, false);
+ }
+ }
+ return on;
+}
+
+void BlueStore::Onode::flush()
+{
+ if (flushing_count.load()) {
+ ldout(c->store->cct, 20) << __func__ << " cnt:" << flushing_count << dendl;
+ waiting_count++;
+ std::unique_lock l(flush_lock);
+ while (flushing_count.load()) {
+ flush_cond.wait(l);
+ }
+ waiting_count--;
+ }
+ ldout(c->store->cct, 20) << __func__ << " done" << dendl;
+}
+
+void BlueStore::Onode::dump(Formatter* f) const
+{
+ onode.dump(f);
+ extent_map.dump(f);
+}
+
+void BlueStore::Onode::rewrite_omap_key(const string& old, string *out)
+{
+ if (!onode.is_pgmeta_omap()) {
+ if (onode.is_perpg_omap()) {
+ _key_encode_u64(c->pool(), out);
+ _key_encode_u32(oid.hobj.get_bitwise_key_u32(), out);
+ } else if (onode.is_perpool_omap()) {
+ _key_encode_u64(c->pool(), out);
+ }
+ }
+ _key_encode_u64(onode.nid, out);
+ out->append(old.c_str() + out->length(), old.size() - out->length());
+}
+
+void BlueStore::Onode::decode_omap_key(const string& key, string *user_key)
+{
+ size_t pos = sizeof(uint64_t) + 1;
+ if (!onode.is_pgmeta_omap()) {
+ if (onode.is_perpg_omap()) {
+ pos += sizeof(uint64_t) + sizeof(uint32_t);
+ } else if (onode.is_perpool_omap()) {
+ pos += sizeof(uint64_t);
+ }
+ }
+ *user_key = key.substr(pos);
+}
+
+// =======================================================
+// WriteContext
+
+/// Checks for writes to the same pextent within a blob
+bool BlueStore::WriteContext::has_conflict(
+ BlobRef b,
+ uint64_t loffs,
+ uint64_t loffs_end,
+ uint64_t min_alloc_size)
+{
+ ceph_assert((loffs % min_alloc_size) == 0);
+ ceph_assert((loffs_end % min_alloc_size) == 0);
+ for (auto w : writes) {
+ if (b == w.b) {
+ auto loffs2 = p2align(w.logical_offset, min_alloc_size);
+ auto loffs2_end = p2roundup(w.logical_offset + w.length0, min_alloc_size);
+ if ((loffs <= loffs2 && loffs_end > loffs2) ||
+ (loffs >= loffs2 && loffs < loffs2_end)) {
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+// =======================================================
+
+// DeferredBatch
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.DeferredBatch(" << this << ") "
+#undef dout_context
+#define dout_context cct
+
+void BlueStore::DeferredBatch::prepare_write(
+ CephContext *cct,
+ uint64_t seq, uint64_t offset, uint64_t length,
+ bufferlist::const_iterator& blp)
+{
+ _discard(cct, offset, length);
+ auto i = iomap.insert(make_pair(offset, deferred_io()));
+ ceph_assert(i.second); // this should be a new insertion
+ i.first->second.seq = seq;
+ blp.copy(length, i.first->second.bl);
+ i.first->second.bl.reassign_to_mempool(
+ mempool::mempool_bluestore_writing_deferred);
+ dout(20) << __func__ << " seq " << seq
+ << " 0x" << std::hex << offset << "~" << length
+ << " crc " << i.first->second.bl.crc32c(-1)
+ << std::dec << dendl;
+ seq_bytes[seq] += length;
+#ifdef DEBUG_DEFERRED
+ _audit(cct);
+#endif
+}
+
+void BlueStore::DeferredBatch::_discard(
+ CephContext *cct, uint64_t offset, uint64_t length)
+{
+ generic_dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
+ << std::dec << dendl;
+ auto p = iomap.lower_bound(offset);
+ if (p != iomap.begin()) {
+ --p;
+ auto end = p->first + p->second.bl.length();
+ if (end > offset) {
+ bufferlist head;
+ head.substr_of(p->second.bl, 0, offset - p->first);
+ dout(20) << __func__ << " keep head " << p->second.seq
+ << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
+ << " -> 0x" << head.length() << std::dec << dendl;
+ auto i = seq_bytes.find(p->second.seq);
+ ceph_assert(i != seq_bytes.end());
+ if (end > offset + length) {
+ bufferlist tail;
+ tail.substr_of(p->second.bl, offset + length - p->first,
+ end - (offset + length));
+ dout(20) << __func__ << " keep tail " << p->second.seq
+ << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
+ << " -> 0x" << tail.length() << std::dec << dendl;
+ auto &n = iomap[offset + length];
+ n.bl.swap(tail);
+ n.seq = p->second.seq;
+ i->second -= length;
+ } else {
+ i->second -= end - offset;
+ }
+ ceph_assert(i->second >= 0);
+ p->second.bl.swap(head);
+ }
+ ++p;
+ }
+ while (p != iomap.end()) {
+ if (p->first >= offset + length) {
+ break;
+ }
+ auto i = seq_bytes.find(p->second.seq);
+ ceph_assert(i != seq_bytes.end());
+ auto end = p->first + p->second.bl.length();
+ if (end > offset + length) {
+ unsigned drop_front = offset + length - p->first;
+ unsigned keep_tail = end - (offset + length);
+ dout(20) << __func__ << " truncate front " << p->second.seq
+ << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
+ << " drop_front 0x" << drop_front << " keep_tail 0x" << keep_tail
+ << " to 0x" << (offset + length) << "~" << keep_tail
+ << std::dec << dendl;
+ auto &s = iomap[offset + length];
+ s.seq = p->second.seq;
+ s.bl.substr_of(p->second.bl, drop_front, keep_tail);
+ i->second -= drop_front;
+ } else {
+ dout(20) << __func__ << " drop " << p->second.seq
+ << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
+ << std::dec << dendl;
+ i->second -= p->second.bl.length();
+ }
+ ceph_assert(i->second >= 0);
+ p = iomap.erase(p);
+ }
+}
+
+void BlueStore::DeferredBatch::_audit(CephContext *cct)
+{
+ map<uint64_t,int> sb;
+ for (auto p : seq_bytes) {
+ sb[p.first] = 0; // make sure we have the same set of keys
+ }
+ uint64_t pos = 0;
+ for (auto& p : iomap) {
+ ceph_assert(p.first >= pos);
+ sb[p.second.seq] += p.second.bl.length();
+ pos = p.first + p.second.bl.length();
+ }
+ ceph_assert(sb == seq_bytes);
+}
+
+
+// Collection
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore(" << store->path << ").collection(" << cid << " " << this << ") "
+
+BlueStore::Collection::Collection(BlueStore *store_, OnodeCacheShard *oc, BufferCacheShard *bc, coll_t cid)
+ : CollectionImpl(store_->cct, cid),
+ store(store_),
+ cache(bc),
+ exists(true),
+ onode_space(oc),
+ commit_queue(nullptr)
+{
+}
+
+bool BlueStore::Collection::flush_commit(Context *c)
+{
+ return osr->flush_commit(c);
+}
+
+void BlueStore::Collection::flush()
+{
+ osr->flush();
+}
+
+void BlueStore::Collection::flush_all_but_last()
+{
+ osr->flush_all_but_last();
+}
+
+void BlueStore::Collection::open_shared_blob(uint64_t sbid, BlobRef b)
+{
+ ceph_assert(!b->shared_blob);
+ const bluestore_blob_t& blob = b->get_blob();
+ if (!blob.is_shared()) {
+ b->shared_blob = new SharedBlob(this);
+ return;
+ }
+
+ b->shared_blob = shared_blob_set.lookup(sbid);
+ if (b->shared_blob) {
+ ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
+ << std::dec << " had " << *b->shared_blob << dendl;
+ } else {
+ b->shared_blob = new SharedBlob(sbid, this);
+ shared_blob_set.add(this, b->shared_blob.get());
+ ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
+ << std::dec << " opened " << *b->shared_blob
+ << dendl;
+ }
+}
+
+void BlueStore::Collection::load_shared_blob(SharedBlobRef sb)
+{
+ if (!sb->is_loaded()) {
+
+ bufferlist v;
+ string key;
+ auto sbid = sb->get_sbid();
+ get_shared_blob_key(sbid, &key);
+ int r = store->db->get(PREFIX_SHARED_BLOB, key, &v);
+ if (r < 0) {
+ lderr(store->cct) << __func__ << " sbid 0x" << std::hex << sbid
+ << std::dec << " not found at key "
+ << pretty_binary_string(key) << dendl;
+ ceph_abort_msg("uh oh, missing shared_blob");
+ }
+
+ sb->loaded = true;
+ sb->persistent = new bluestore_shared_blob_t(sbid);
+ auto p = v.cbegin();
+ decode(*(sb->persistent), p);
+ ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
+ << std::dec << " loaded shared_blob " << *sb << dendl;
+ }
+}
+
+void BlueStore::Collection::make_blob_shared(uint64_t sbid, BlobRef b)
+{
+ ldout(store->cct, 10) << __func__ << " " << *b << dendl;
+ ceph_assert(!b->shared_blob->is_loaded());
+
+ // update blob
+ bluestore_blob_t& blob = b->dirty_blob();
+ blob.set_flag(bluestore_blob_t::FLAG_SHARED);
+
+ // update shared blob
+ b->shared_blob->loaded = true;
+ b->shared_blob->persistent = new bluestore_shared_blob_t(sbid);
+ shared_blob_set.add(this, b->shared_blob.get());
+ for (auto p : blob.get_extents()) {
+ if (p.is_valid()) {
+ b->shared_blob->get_ref(
+ p.offset,
+ p.length);
+ }
+ }
+ ldout(store->cct, 20) << __func__ << " now " << *b << dendl;
+}
+
+uint64_t BlueStore::Collection::make_blob_unshared(SharedBlob *sb)
+{
+ ldout(store->cct, 10) << __func__ << " " << *sb << dendl;
+ ceph_assert(sb->is_loaded());
+
+ uint64_t sbid = sb->get_sbid();
+ shared_blob_set.remove(sb);
+ sb->loaded = false;
+ delete sb->persistent;
+ sb->sbid_unloaded = 0;
+ ldout(store->cct, 20) << __func__ << " now " << *sb << dendl;
+ return sbid;
+}
+
+BlueStore::OnodeRef BlueStore::Collection::get_onode(
+ const ghobject_t& oid,
+ bool create,
+ bool is_createop)
+{
+ ceph_assert(create ? ceph_mutex_is_wlocked(lock) : ceph_mutex_is_locked(lock));
+
+ spg_t pgid;
+ if (cid.is_pg(&pgid)) {
+ if (!oid.match(cnode.bits, pgid.ps())) {
+ lderr(store->cct) << __func__ << " oid " << oid << " not part of "
+ << pgid << " bits " << cnode.bits << dendl;
+ ceph_abort();
+ }
+ }
+
+ OnodeRef o = onode_space.lookup(oid);
+ if (o)
+ return o;
+
+ string key;
+ get_object_key(store->cct, oid, &key);
+
+ ldout(store->cct, 20) << __func__ << " oid " << oid << " key "
+ << pretty_binary_string(key) << dendl;
+
+ bufferlist v;
+ int r = -ENOENT;
+ Onode *on;
+ if (!is_createop) {
+ r = store->db->get(PREFIX_OBJ, key.c_str(), key.size(), &v);
+ ldout(store->cct, 20) << " r " << r << " v.len " << v.length() << dendl;
+ }
+ if (v.length() == 0) {
+ ceph_assert(r == -ENOENT);
+ if (!create)
+ return OnodeRef();
+ } else {
+ ceph_assert(r >= 0);
+ }
+
+ // new object, load onode if available
+ on = Onode::create_decode(this, oid, key, v, true);
+ o.reset(on);
+ return onode_space.add_onode(oid, o);
+}
+
+void BlueStore::Collection::split_cache(
+ Collection *dest)
+{
+ ldout(store->cct, 10) << __func__ << " to " << dest << dendl;
+
+ auto *ocache = get_onode_cache();
+ auto *ocache_dest = dest->get_onode_cache();
+
+ // lock cache shards
+ std::lock(ocache->lock, ocache_dest->lock, cache->lock, dest->cache->lock);
+ std::lock_guard l(ocache->lock, std::adopt_lock);
+ std::lock_guard l2(ocache_dest->lock, std::adopt_lock);
+ std::lock_guard l3(cache->lock, std::adopt_lock);
+ std::lock_guard l4(dest->cache->lock, std::adopt_lock);
+
+ int destbits = dest->cnode.bits;
+ spg_t destpg;
+ bool is_pg = dest->cid.is_pg(&destpg);
+ ceph_assert(is_pg);
+
+ auto p = onode_space.onode_map.begin();
+ while (p != onode_space.onode_map.end()) {
+ OnodeRef o = p->second;
+ if (!p->second->oid.match(destbits, destpg.pgid.ps())) {
+ // onode does not belong to this child
+ ldout(store->cct, 20) << __func__ << " not moving " << o << " " << o->oid
+ << dendl;
+ ++p;
+ } else {
+ ldout(store->cct, 20) << __func__ << " moving " << o << " " << o->oid
+ << dendl;
+
+ // ensuring that nref is always >= 2 and hence onode is pinned
+ OnodeRef o_pin = o;
+
+ p = onode_space.onode_map.erase(p);
+ dest->onode_space.onode_map[o->oid] = o;
+ if (o->cached) {
+ get_onode_cache()->_move_pinned(dest->get_onode_cache(), o.get());
+ }
+ o->c = dest;
+
+ // move over shared blobs and buffers. cover shared blobs from
+ // both extent map and spanning blob map (the full extent map
+ // may not be faulted in)
+ vector<SharedBlob*> sbvec;
+ for (auto& e : o->extent_map.extent_map) {
+ sbvec.push_back(e.blob->shared_blob.get());
+ }
+ for (auto& b : o->extent_map.spanning_blob_map) {
+ sbvec.push_back(b.second->shared_blob.get());
+ }
+ for (auto sb : sbvec) {
+ if (sb->coll == dest) {
+ ldout(store->cct, 20) << __func__ << " already moved " << *sb
+ << dendl;
+ continue;
+ }
+ ldout(store->cct, 20) << __func__ << " moving " << *sb << dendl;
+ if (sb->get_sbid()) {
+ ldout(store->cct, 20) << __func__
+ << " moving registration " << *sb << dendl;
+ shared_blob_set.remove(sb);
+ dest->shared_blob_set.add(dest, sb);
+ }
+ sb->coll = dest;
+ if (dest->cache != cache) {
+ for (auto& i : sb->bc.buffer_map) {
+ if (!i.second->is_writing()) {
+ ldout(store->cct, 20) << __func__ << " moving " << *i.second
+ << dendl;
+ dest->cache->_move(cache, i.second.get());
+ }
+ }
+ }
+ }
+ }
+ }
+ dest->cache->_trim();
+}
+
+// =======================================================
+
+// MempoolThread
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.MempoolThread(" << this << ") "
+#undef dout_context
+#define dout_context store->cct
+
+void *BlueStore::MempoolThread::entry()
+{
+ std::unique_lock l{lock};
+
+ uint32_t prev_config_change = store->config_changed.load();
+ uint64_t base = store->osd_memory_base;
+ double fragmentation = store->osd_memory_expected_fragmentation;
+ uint64_t target = store->osd_memory_target;
+ uint64_t min = store->osd_memory_cache_min;
+ uint64_t max = min;
+
+ // When setting the maximum amount of memory to use for cache, first
+ // assume some base amount of memory for the OSD and then fudge in
+ // some overhead for fragmentation that scales with cache usage.
+ uint64_t ltarget = (1.0 - fragmentation) * target;
+ if (ltarget > base + min) {
+ max = ltarget - base;
+ }
+
+ binned_kv_cache = store->db->get_priority_cache();
+ binned_kv_onode_cache = store->db->get_priority_cache(PREFIX_OBJ);
+ if (store->cache_autotune && binned_kv_cache != nullptr) {
+ pcm = std::make_shared<PriorityCache::Manager>(
+ store->cct, min, max, target, true, "bluestore-pricache");
+ pcm->insert("kv", binned_kv_cache, true);
+ pcm->insert("meta", meta_cache, true);
+ pcm->insert("data", data_cache, true);
+ if (binned_kv_onode_cache != nullptr) {
+ pcm->insert("kv_onode", binned_kv_onode_cache, true);
+ }
+ }
+
+ utime_t next_balance = ceph_clock_now();
+ utime_t next_resize = ceph_clock_now();
+ utime_t next_bin_rotation = ceph_clock_now();
+ utime_t next_deferred_force_submit = ceph_clock_now();
+ utime_t alloc_stats_dump_clock = ceph_clock_now();
+
+ bool interval_stats_trim = false;
+ while (!stop) {
+ // Update pcm cache settings if related configuration was changed
+ uint32_t cur_config_change = store->config_changed.load();
+ if (cur_config_change != prev_config_change) {
+ _update_cache_settings();
+ prev_config_change = cur_config_change;
+ }
+
+ // define various intervals for background work
+ double age_bin_interval = store->cache_age_bin_interval;
+ double autotune_interval = store->cache_autotune_interval;
+ double resize_interval = store->osd_memory_cache_resize_interval;
+ double max_defer_interval = store->max_defer_interval;
+ double alloc_stats_dump_interval =
+ store->cct->_conf->bluestore_alloc_stats_dump_interval;
+
+ // alloc stats dump
+ if (alloc_stats_dump_interval > 0 &&
+ alloc_stats_dump_clock + alloc_stats_dump_interval < ceph_clock_now()) {
+ store->_record_allocation_stats();
+ alloc_stats_dump_clock = ceph_clock_now();
+ }
+ // cache age binning
+ if (age_bin_interval > 0 && next_bin_rotation < ceph_clock_now()) {
+ if (binned_kv_cache != nullptr) {
+ binned_kv_cache->import_bins(store->kv_bins);
+ }
+ if (binned_kv_onode_cache != nullptr) {
+ binned_kv_onode_cache->import_bins(store->kv_onode_bins);
+ }
+ meta_cache->import_bins(store->meta_bins);
+ data_cache->import_bins(store->data_bins);
+
+ if (pcm != nullptr) {
+ pcm->shift_bins();
+ }
+ next_bin_rotation = ceph_clock_now();
+ next_bin_rotation += age_bin_interval;
+ }
+ // cache balancing
+ if (autotune_interval > 0 && next_balance < ceph_clock_now()) {
+ if (binned_kv_cache != nullptr) {
+ binned_kv_cache->set_cache_ratio(store->cache_kv_ratio);
+ }
+ if (binned_kv_onode_cache != nullptr) {
+ binned_kv_onode_cache->set_cache_ratio(store->cache_kv_onode_ratio);
+ }
+ meta_cache->set_cache_ratio(store->cache_meta_ratio);
+ data_cache->set_cache_ratio(store->cache_data_ratio);
+
+ // Log events at 5 instead of 20 when balance happens.
+ interval_stats_trim = true;
+
+ if (pcm != nullptr) {
+ pcm->balance();
+ }
+
+ next_balance = ceph_clock_now();
+ next_balance += autotune_interval;
+ }
+ // memory resizing (ie autotuning)
+ if (resize_interval > 0 && next_resize < ceph_clock_now()) {
+ if (ceph_using_tcmalloc() && pcm != nullptr) {
+ pcm->tune_memory();
+ }
+ next_resize = ceph_clock_now();
+ next_resize += resize_interval;
+ }
+ // deferred force submit
+ if (max_defer_interval > 0 &&
+ next_deferred_force_submit < ceph_clock_now()) {
+ if (store->get_deferred_last_submitted() + max_defer_interval <
+ ceph_clock_now()) {
+ store->deferred_try_submit();
+ }
+ next_deferred_force_submit = ceph_clock_now();
+ next_deferred_force_submit += max_defer_interval/3;
+ }
+
+ // Now Resize the shards
+ _resize_shards(interval_stats_trim);
+ interval_stats_trim = false;
+
+ store->_update_logger();
+ auto wait = ceph::make_timespan(
+ store->cct->_conf->bluestore_cache_trim_interval);
+ cond.wait_for(l, wait);
+ }
+ // do final dump
+ store->_record_allocation_stats();
+ stop = false;
+ pcm = nullptr;
+ return NULL;
+}
+
+void BlueStore::MempoolThread::_resize_shards(bool interval_stats)
+{
+ size_t onode_shards = store->onode_cache_shards.size();
+ size_t buffer_shards = store->buffer_cache_shards.size();
+ int64_t kv_used = store->db->get_cache_usage();
+ int64_t kv_onode_used = store->db->get_cache_usage(PREFIX_OBJ);
+ int64_t meta_used = meta_cache->_get_used_bytes();
+ int64_t data_used = data_cache->_get_used_bytes();
+
+ uint64_t cache_size = store->cache_size;
+ int64_t kv_alloc =
+ static_cast<int64_t>(store->cache_kv_ratio * cache_size);
+ int64_t kv_onode_alloc =
+ static_cast<int64_t>(store->cache_kv_onode_ratio * cache_size);
+ int64_t meta_alloc =
+ static_cast<int64_t>(store->cache_meta_ratio * cache_size);
+ int64_t data_alloc =
+ static_cast<int64_t>(store->cache_data_ratio * cache_size);
+
+ if (pcm != nullptr && binned_kv_cache != nullptr) {
+ cache_size = pcm->get_tuned_mem();
+ kv_alloc = binned_kv_cache->get_committed_size();
+ meta_alloc = meta_cache->get_committed_size();
+ data_alloc = data_cache->get_committed_size();
+ if (binned_kv_onode_cache != nullptr) {
+ kv_onode_alloc = binned_kv_onode_cache->get_committed_size();
+ }
+ }
+
+ if (interval_stats) {
+ dout(5) << __func__ << " cache_size: " << cache_size
+ << " kv_alloc: " << kv_alloc
+ << " kv_used: " << kv_used
+ << " kv_onode_alloc: " << kv_onode_alloc
+ << " kv_onode_used: " << kv_onode_used
+ << " meta_alloc: " << meta_alloc
+ << " meta_used: " << meta_used
+ << " data_alloc: " << data_alloc
+ << " data_used: " << data_used << dendl;
+ } else {
+ dout(20) << __func__ << " cache_size: " << cache_size
+ << " kv_alloc: " << kv_alloc
+ << " kv_used: " << kv_used
+ << " kv_onode_alloc: " << kv_onode_alloc
+ << " kv_onode_used: " << kv_onode_used
+ << " meta_alloc: " << meta_alloc
+ << " meta_used: " << meta_used
+ << " data_alloc: " << data_alloc
+ << " data_used: " << data_used << dendl;
+ }
+
+ uint64_t max_shard_onodes = static_cast<uint64_t>(
+ (meta_alloc / (double) onode_shards) / meta_cache->get_bytes_per_onode());
+ uint64_t max_shard_buffer = static_cast<uint64_t>(data_alloc / buffer_shards);
+
+ dout(30) << __func__ << " max_shard_onodes: " << max_shard_onodes
+ << " max_shard_buffer: " << max_shard_buffer << dendl;
+
+ for (auto i : store->onode_cache_shards) {
+ i->set_max(max_shard_onodes);
+ }
+ for (auto i : store->buffer_cache_shards) {
+ i->set_max(max_shard_buffer);
+ }
+}
+
+void BlueStore::MempoolThread::_update_cache_settings()
+{
+ // Nothing to do if pcm is not used.
+ if (pcm == nullptr) {
+ return;
+ }
+
+ uint64_t target = store->osd_memory_target;
+ uint64_t base = store->osd_memory_base;
+ uint64_t min = store->osd_memory_cache_min;
+ uint64_t max = min;
+ double fragmentation = store->osd_memory_expected_fragmentation;
+
+ uint64_t ltarget = (1.0 - fragmentation) * target;
+ if (ltarget > base + min) {
+ max = ltarget - base;
+ }
+
+ // set pcm cache levels
+ pcm->set_target_memory(target);
+ pcm->set_min_memory(min);
+ pcm->set_max_memory(max);
+
+ dout(5) << __func__ << " updated pcm target: " << target
+ << " pcm min: " << min
+ << " pcm max: " << max
+ << dendl;
+}
+
+// =======================================================
+
+// OmapIteratorImpl
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.OmapIteratorImpl(" << this << ") "
+
+BlueStore::OmapIteratorImpl::OmapIteratorImpl(
+ PerfCounters* _logger, CollectionRef c, OnodeRef& o, KeyValueDB::Iterator it)
+ : logger(_logger), c(c), o(o), it(it)
+{
+ logger->inc(l_bluestore_omap_iterator_count);
+ std::shared_lock l(c->lock);
+ if (o->onode.has_omap()) {
+ o->get_omap_key(string(), &head);
+ o->get_omap_tail(&tail);
+ it->lower_bound(head);
+ }
+}
+BlueStore::OmapIteratorImpl::~OmapIteratorImpl()
+{
+ logger->dec(l_bluestore_omap_iterator_count);
+}
+
+string BlueStore::OmapIteratorImpl::_stringify() const
+{
+ stringstream s;
+ s << " omap_iterator(cid = " << c->cid
+ <<", oid = " << o->oid << ")";
+ return s.str();
+}
+
+int BlueStore::OmapIteratorImpl::seek_to_first()
+{
+ std::shared_lock l(c->lock);
+ auto start1 = mono_clock::now();
+ if (o->onode.has_omap()) {
+ it->lower_bound(head);
+ } else {
+ it = KeyValueDB::Iterator();
+ }
+ c->store->log_latency(
+ __func__,
+ l_bluestore_omap_seek_to_first_lat,
+ mono_clock::now() - start1,
+ c->store->cct->_conf->bluestore_log_omap_iterator_age);
+
+ return 0;
+}
+
+int BlueStore::OmapIteratorImpl::upper_bound(const string& after)
+{
+ std::shared_lock l(c->lock);
+ auto start1 = mono_clock::now();
+ if (o->onode.has_omap()) {
+ string key;
+ o->get_omap_key(after, &key);
+ ldout(c->store->cct,20) << __func__ << " after " << after << " key "
+ << pretty_binary_string(key) << dendl;
+ it->upper_bound(key);
+ } else {
+ it = KeyValueDB::Iterator();
+ }
+ c->store->log_latency_fn(
+ __func__,
+ l_bluestore_omap_upper_bound_lat,
+ mono_clock::now() - start1,
+ c->store->cct->_conf->bluestore_log_omap_iterator_age,
+ [&] (const ceph::timespan& lat) {
+ return ", after = " + after +
+ _stringify();
+ }
+ );
+ return 0;
+}
+
+int BlueStore::OmapIteratorImpl::lower_bound(const string& to)
+{
+ std::shared_lock l(c->lock);
+ auto start1 = mono_clock::now();
+ if (o->onode.has_omap()) {
+ string key;
+ o->get_omap_key(to, &key);
+ ldout(c->store->cct,20) << __func__ << " to " << to << " key "
+ << pretty_binary_string(key) << dendl;
+ it->lower_bound(key);
+ } else {
+ it = KeyValueDB::Iterator();
+ }
+ c->store->log_latency_fn(
+ __func__,
+ l_bluestore_omap_lower_bound_lat,
+ mono_clock::now() - start1,
+ c->store->cct->_conf->bluestore_log_omap_iterator_age,
+ [&] (const ceph::timespan& lat) {
+ return ", to = " + to +
+ _stringify();
+ }
+ );
+ return 0;
+}
+
+bool BlueStore::OmapIteratorImpl::valid()
+{
+ std::shared_lock l(c->lock);
+ bool r = o->onode.has_omap() && it && it->valid() &&
+ it->raw_key().second < tail;
+ if (it && it->valid()) {
+ ldout(c->store->cct,20) << __func__ << " is at "
+ << pretty_binary_string(it->raw_key().second)
+ << dendl;
+ }
+ return r;
+}
+
+int BlueStore::OmapIteratorImpl::next()
+{
+ int r = -1;
+ std::shared_lock l(c->lock);
+ auto start1 = mono_clock::now();
+ if (o->onode.has_omap()) {
+ it->next();
+ r = 0;
+ }
+ c->store->log_latency(
+ __func__,
+ l_bluestore_omap_next_lat,
+ mono_clock::now() - start1,
+ c->store->cct->_conf->bluestore_log_omap_iterator_age);
+
+ return r;
+}
+
+string BlueStore::OmapIteratorImpl::key()
+{
+ std::shared_lock l(c->lock);
+ ceph_assert(it->valid());
+ string db_key = it->raw_key().second;
+ string user_key;
+ o->decode_omap_key(db_key, &user_key);
+
+ return user_key;
+}
+
+bufferlist BlueStore::OmapIteratorImpl::value()
+{
+ std::shared_lock l(c->lock);
+ ceph_assert(it->valid());
+ return it->value();
+}
+
+
+// =====================================
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore(" << path << ") "
+#undef dout_context
+#define dout_context cct
+
+
+static void aio_cb(void *priv, void *priv2)
+{
+ BlueStore *store = static_cast<BlueStore*>(priv);
+ BlueStore::AioContext *c = static_cast<BlueStore::AioContext*>(priv2);
+ c->aio_finish(store);
+}
+
+static void discard_cb(void *priv, void *priv2)
+{
+ BlueStore *store = static_cast<BlueStore*>(priv);
+ interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
+ store->handle_discard(*tmp);
+}
+
+void BlueStore::handle_discard(interval_set<uint64_t>& to_release)
+{
+ dout(10) << __func__ << dendl;
+ ceph_assert(alloc);
+ alloc->release(to_release);
+}
+
+BlueStore::BlueStore(CephContext *cct, const string& path)
+ : BlueStore(cct, path, 0) {}
+
+BlueStore::BlueStore(CephContext *cct,
+ const string& path,
+ uint64_t _min_alloc_size)
+ : ObjectStore(cct, path),
+ throttle(cct),
+ finisher(cct, "commit_finisher", "cfin"),
+ kv_sync_thread(this),
+ kv_finalize_thread(this),
+#ifdef HAVE_LIBZBD
+ zoned_cleaner_thread(this),
+#endif
+ min_alloc_size(_min_alloc_size),
+ min_alloc_size_order(std::countr_zero(_min_alloc_size)),
+ mempool_thread(this)
+{
+ _init_logger();
+ cct->_conf.add_observer(this);
+ set_cache_shards(1);
+}
+
+BlueStore::~BlueStore()
+{
+ cct->_conf.remove_observer(this);
+ _shutdown_logger();
+ ceph_assert(!mounted);
+ ceph_assert(db == NULL);
+ ceph_assert(bluefs == NULL);
+ ceph_assert(fsid_fd < 0);
+ ceph_assert(path_fd < 0);
+ for (auto i : onode_cache_shards) {
+ delete i;
+ }
+ for (auto i : buffer_cache_shards) {
+ delete i;
+ }
+ onode_cache_shards.clear();
+ buffer_cache_shards.clear();
+}
+
+const char **BlueStore::get_tracked_conf_keys() const
+{
+ static const char* KEYS[] = {
+ "bluestore_csum_type",
+ "bluestore_compression_mode",
+ "bluestore_compression_algorithm",
+ "bluestore_compression_min_blob_size",
+ "bluestore_compression_min_blob_size_ssd",
+ "bluestore_compression_min_blob_size_hdd",
+ "bluestore_compression_max_blob_size",
+ "bluestore_compression_max_blob_size_ssd",
+ "bluestore_compression_max_blob_size_hdd",
+ "bluestore_compression_required_ratio",
+ "bluestore_max_alloc_size",
+ "bluestore_prefer_deferred_size",
+ "bluestore_prefer_deferred_size_hdd",
+ "bluestore_prefer_deferred_size_ssd",
+ "bluestore_deferred_batch_ops",
+ "bluestore_deferred_batch_ops_hdd",
+ "bluestore_deferred_batch_ops_ssd",
+ "bluestore_throttle_bytes",
+ "bluestore_throttle_deferred_bytes",
+ "bluestore_throttle_cost_per_io_hdd",
+ "bluestore_throttle_cost_per_io_ssd",
+ "bluestore_throttle_cost_per_io",
+ "bluestore_max_blob_size",
+ "bluestore_max_blob_size_ssd",
+ "bluestore_max_blob_size_hdd",
+ "osd_memory_target",
+ "osd_memory_target_cgroup_limit_ratio",
+ "osd_memory_base",
+ "osd_memory_cache_min",
+ "osd_memory_expected_fragmentation",
+ "bluestore_cache_autotune",
+ "bluestore_cache_autotune_interval",
+ "bluestore_cache_age_bin_interval",
+ "bluestore_cache_kv_age_bins",
+ "bluestore_cache_kv_onode_age_bins",
+ "bluestore_cache_meta_age_bins",
+ "bluestore_cache_data_age_bins",
+ "bluestore_warn_on_legacy_statfs",
+ "bluestore_warn_on_no_per_pool_omap",
+ "bluestore_warn_on_no_per_pg_omap",
+ "bluestore_max_defer_interval",
+ NULL
+ };
+ return KEYS;
+}
+
+void BlueStore::handle_conf_change(const ConfigProxy& conf,
+ const std::set<std::string> &changed)
+{
+ if (changed.count("bluestore_warn_on_legacy_statfs")) {
+ _check_legacy_statfs_alert();
+ }
+ if (changed.count("bluestore_warn_on_no_per_pool_omap") ||
+ changed.count("bluestore_warn_on_no_per_pg_omap")) {
+ _check_no_per_pg_or_pool_omap_alert();
+ }
+
+ if (changed.count("bluestore_csum_type")) {
+ _set_csum();
+ }
+ if (changed.count("bluestore_compression_mode") ||
+ changed.count("bluestore_compression_algorithm") ||
+ changed.count("bluestore_compression_min_blob_size") ||
+ changed.count("bluestore_compression_max_blob_size")) {
+ if (bdev) {
+ _set_compression();
+ }
+ }
+ if (changed.count("bluestore_max_blob_size") ||
+ changed.count("bluestore_max_blob_size_ssd") ||
+ changed.count("bluestore_max_blob_size_hdd")) {
+ if (bdev) {
+ // only after startup
+ _set_blob_size();
+ }
+ }
+ if (changed.count("bluestore_prefer_deferred_size") ||
+ changed.count("bluestore_prefer_deferred_size_hdd") ||
+ changed.count("bluestore_prefer_deferred_size_ssd") ||
+ changed.count("bluestore_max_alloc_size") ||
+ changed.count("bluestore_deferred_batch_ops") ||
+ changed.count("bluestore_deferred_batch_ops_hdd") ||
+ changed.count("bluestore_deferred_batch_ops_ssd")) {
+ if (bdev) {
+ // only after startup
+ _set_alloc_sizes();
+ }
+ }
+ if (changed.count("bluestore_throttle_cost_per_io") ||
+ changed.count("bluestore_throttle_cost_per_io_hdd") ||
+ changed.count("bluestore_throttle_cost_per_io_ssd")) {
+ if (bdev) {
+ _set_throttle_params();
+ }
+ }
+ if (changed.count("bluestore_throttle_bytes") ||
+ changed.count("bluestore_throttle_deferred_bytes") ||
+ changed.count("bluestore_throttle_trace_rate")) {
+ throttle.reset_throttle(conf);
+ }
+ if (changed.count("bluestore_max_defer_interval")) {
+ if (bdev) {
+ _set_max_defer_interval();
+ }
+ }
+ if (changed.count("osd_memory_target") ||
+ changed.count("osd_memory_base") ||
+ changed.count("osd_memory_cache_min") ||
+ changed.count("osd_memory_expected_fragmentation")) {
+ _update_osd_memory_options();
+ }
+}
+
+void BlueStore::_set_compression()
+{
+ auto m = Compressor::get_comp_mode_type(cct->_conf->bluestore_compression_mode);
+ if (m) {
+ _clear_compression_alert();
+ comp_mode = *m;
+ } else {
+ derr << __func__ << " unrecognized value '"
+ << cct->_conf->bluestore_compression_mode
+ << "' for bluestore_compression_mode, reverting to 'none'"
+ << dendl;
+ comp_mode = Compressor::COMP_NONE;
+ string s("unknown mode: ");
+ s += cct->_conf->bluestore_compression_mode;
+ _set_compression_alert(true, s.c_str());
+ }
+
+ compressor = nullptr;
+
+ if (cct->_conf->bluestore_compression_min_blob_size) {
+ comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size;
+ } else {
+ ceph_assert(bdev);
+ if (_use_rotational_settings()) {
+ comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_hdd;
+ } else {
+ comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_ssd;
+ }
+ }
+
+ if (cct->_conf->bluestore_compression_max_blob_size) {
+ comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size;
+ } else {
+ ceph_assert(bdev);
+ if (_use_rotational_settings()) {
+ comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_hdd;
+ } else {
+ comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_ssd;
+ }
+ }
+
+ auto& alg_name = cct->_conf->bluestore_compression_algorithm;
+ if (!alg_name.empty()) {
+ compressor = Compressor::create(cct, alg_name);
+ if (!compressor) {
+ derr << __func__ << " unable to initialize " << alg_name.c_str() << " compressor"
+ << dendl;
+ _set_compression_alert(false, alg_name.c_str());
+ }
+ }
+
+ dout(10) << __func__ << " mode " << Compressor::get_comp_mode_name(comp_mode)
+ << " alg " << (compressor ? compressor->get_type_name() : "(none)")
+ << " min_blob " << comp_min_blob_size
+ << " max_blob " << comp_max_blob_size
+ << dendl;
+}
+
+void BlueStore::_set_csum()
+{
+ csum_type = Checksummer::CSUM_NONE;
+ int t = Checksummer::get_csum_string_type(cct->_conf->bluestore_csum_type);
+ if (t > Checksummer::CSUM_NONE)
+ csum_type = t;
+
+ dout(10) << __func__ << " csum_type "
+ << Checksummer::get_csum_type_string(csum_type)
+ << dendl;
+}
+
+void BlueStore::_set_throttle_params()
+{
+ if (cct->_conf->bluestore_throttle_cost_per_io) {
+ throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io;
+ } else {
+ ceph_assert(bdev);
+ if (_use_rotational_settings()) {
+ throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_hdd;
+ } else {
+ throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_ssd;
+ }
+ }
+
+ dout(10) << __func__ << " throttle_cost_per_io " << throttle_cost_per_io
+ << dendl;
+}
+void BlueStore::_set_blob_size()
+{
+ if (cct->_conf->bluestore_max_blob_size) {
+ max_blob_size = cct->_conf->bluestore_max_blob_size;
+ } else {
+ ceph_assert(bdev);
+ if (_use_rotational_settings()) {
+ max_blob_size = cct->_conf->bluestore_max_blob_size_hdd;
+ } else {
+ max_blob_size = cct->_conf->bluestore_max_blob_size_ssd;
+ }
+ }
+ dout(10) << __func__ << " max_blob_size 0x" << std::hex << max_blob_size
+ << std::dec << dendl;
+}
+
+void BlueStore::_update_osd_memory_options()
+{
+ osd_memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
+ osd_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
+ osd_memory_expected_fragmentation = cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
+ osd_memory_cache_min = cct->_conf.get_val<Option::size_t>("osd_memory_cache_min");
+ config_changed++;
+ dout(10) << __func__
+ << " osd_memory_target " << osd_memory_target
+ << " osd_memory_base " << osd_memory_base
+ << " osd_memory_expected_fragmentation " << osd_memory_expected_fragmentation
+ << " osd_memory_cache_min " << osd_memory_cache_min
+ << dendl;
+}
+
+int BlueStore::_set_cache_sizes()
+{
+ ceph_assert(bdev);
+ cache_autotune = cct->_conf.get_val<bool>("bluestore_cache_autotune");
+ cache_autotune_interval =
+ cct->_conf.get_val<double>("bluestore_cache_autotune_interval");
+ cache_age_bin_interval =
+ cct->_conf.get_val<double>("bluestore_cache_age_bin_interval");
+ auto _set_bin = [&](std::string conf_name, std::vector<uint64_t>* intervals)
+ {
+ std::string intervals_str = cct->_conf.get_val<std::string>(conf_name);
+ std::istringstream interval_stream(intervals_str);
+ std::copy(
+ std::istream_iterator<uint64_t>(interval_stream),
+ std::istream_iterator<uint64_t>(),
+ std::back_inserter(*intervals));
+ };
+ _set_bin("bluestore_cache_age_bins_kv", &kv_bins);
+ _set_bin("bluestore_cache_age_bins_kv_onode", &kv_onode_bins);
+ _set_bin("bluestore_cache_age_bins_meta", &meta_bins);
+ _set_bin("bluestore_cache_age_bins_data", &data_bins);
+
+ osd_memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
+ osd_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
+ osd_memory_expected_fragmentation =
+ cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
+ osd_memory_cache_min = cct->_conf.get_val<Option::size_t>("osd_memory_cache_min");
+ osd_memory_cache_resize_interval =
+ cct->_conf.get_val<double>("osd_memory_cache_resize_interval");
+
+ if (cct->_conf->bluestore_cache_size) {
+ cache_size = cct->_conf->bluestore_cache_size;
+ } else {
+ // choose global cache size based on backend type
+ if (_use_rotational_settings()) {
+ cache_size = cct->_conf->bluestore_cache_size_hdd;
+ } else {
+ cache_size = cct->_conf->bluestore_cache_size_ssd;
+ }
+ }
+
+ cache_meta_ratio = cct->_conf.get_val<double>("bluestore_cache_meta_ratio");
+ if (cache_meta_ratio < 0 || cache_meta_ratio > 1.0) {
+ derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
+ << ") must be in range [0,1.0]" << dendl;
+ return -EINVAL;
+ }
+
+ cache_kv_ratio = cct->_conf.get_val<double>("bluestore_cache_kv_ratio");
+ if (cache_kv_ratio < 0 || cache_kv_ratio > 1.0) {
+ derr << __func__ << " bluestore_cache_kv_ratio (" << cache_kv_ratio
+ << ") must be in range [0,1.0]" << dendl;
+ return -EINVAL;
+ }
+
+ cache_kv_onode_ratio = cct->_conf.get_val<double>("bluestore_cache_kv_onode_ratio");
+ if (cache_kv_onode_ratio < 0 || cache_kv_onode_ratio > 1.0) {
+ derr << __func__ << " bluestore_cache_kv_onode_ratio (" << cache_kv_onode_ratio
+ << ") must be in range [0,1.0]" << dendl;
+ return -EINVAL;
+ }
+
+ if (cache_meta_ratio + cache_kv_ratio > 1.0) {
+ derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
+ << ") + bluestore_cache_kv_ratio (" << cache_kv_ratio
+ << ") = " << cache_meta_ratio + cache_kv_ratio << "; must be <= 1.0"
+ << dendl;
+ return -EINVAL;
+ }
+
+ cache_data_ratio = (double)1.0 -
+ (double)cache_meta_ratio -
+ (double)cache_kv_ratio -
+ (double)cache_kv_onode_ratio;
+ if (cache_data_ratio < 0) {
+ // deal with floating point imprecision
+ cache_data_ratio = 0;
+ }
+
+ dout(1) << __func__ << " cache_size " << cache_size
+ << " meta " << cache_meta_ratio
+ << " kv " << cache_kv_ratio
+ << " data " << cache_data_ratio
+ << dendl;
+ return 0;
+}
+
+int BlueStore::write_meta(const std::string& key, const std::string& value)
+{
+ bluestore_bdev_label_t label;
+ string p = path + "/block";
+ int r = _read_bdev_label(cct, p, &label);
+ if (r < 0) {
+ return ObjectStore::write_meta(key, value);
+ }
+ label.meta[key] = value;
+ r = _write_bdev_label(cct, p, label);
+ ceph_assert(r == 0);
+ return ObjectStore::write_meta(key, value);
+}
+
+int BlueStore::read_meta(const std::string& key, std::string *value)
+{
+ bluestore_bdev_label_t label;
+ string p = path + "/block";
+ int r = _read_bdev_label(cct, p, &label);
+ if (r < 0) {
+ return ObjectStore::read_meta(key, value);
+ }
+ auto i = label.meta.find(key);
+ if (i == label.meta.end()) {
+ return ObjectStore::read_meta(key, value);
+ }
+ *value = i->second;
+ return 0;
+}
+
+void BlueStore::_init_logger()
+{
+ PerfCountersBuilder b(cct, "bluestore",
+ l_bluestore_first, l_bluestore_last);
+
+ // space utilization stats
+ //****************************************
+ b.add_u64(l_bluestore_allocated, "allocated",
+ "Sum for allocated bytes",
+ "al_b",
+ PerfCountersBuilder::PRIO_CRITICAL,
+ unit_t(UNIT_BYTES));
+ b.add_u64(l_bluestore_stored, "stored",
+ "Sum for stored bytes",
+ "st_b",
+ PerfCountersBuilder::PRIO_CRITICAL,
+ unit_t(UNIT_BYTES));
+ b.add_u64(l_bluestore_fragmentation, "fragmentation_micros",
+ "How fragmented bluestore free space is (free extents / max possible number of free extents) * 1000");
+ b.add_u64(l_bluestore_alloc_unit, "alloc_unit",
+ "allocation unit size in bytes",
+ "au_b",
+ PerfCountersBuilder::PRIO_CRITICAL,
+ unit_t(UNIT_BYTES));
+ //****************************************
+
+ // Update op processing state latencies
+ //****************************************
+ b.add_time_avg(l_bluestore_state_prepare_lat, "state_prepare_lat",
+ "Average prepare state latency",
+ "sprl", PerfCountersBuilder::PRIO_USEFUL);
+ b.add_time_avg(l_bluestore_state_aio_wait_lat, "state_aio_wait_lat",
+ "Average aio_wait state latency",
+ "sawl", PerfCountersBuilder::PRIO_INTERESTING);
+ b.add_time_avg(l_bluestore_state_io_done_lat, "state_io_done_lat",
+ "Average io_done state latency",
+ "sidl", PerfCountersBuilder::PRIO_USEFUL);
+ b.add_time_avg(l_bluestore_state_kv_queued_lat, "state_kv_queued_lat",
+ "Average kv_queued state latency",
+ "skql", PerfCountersBuilder::PRIO_USEFUL);
+ b.add_time_avg(l_bluestore_state_kv_committing_lat, "state_kv_commiting_lat",
+ "Average kv_commiting state latency",
+ "skcl", PerfCountersBuilder::PRIO_USEFUL);
+ b.add_time_avg(l_bluestore_state_kv_done_lat, "state_kv_done_lat",
+ "Average kv_done state latency",
+ "skdl", PerfCountersBuilder::PRIO_USEFUL);
+ b.add_time_avg(l_bluestore_state_finishing_lat, "state_finishing_lat",
+ "Average finishing state latency",
+ "sfnl", PerfCountersBuilder::PRIO_USEFUL);
+ b.add_time_avg(l_bluestore_state_done_lat, "state_done_lat",
+ "Average done state latency",
+ "sdnl", PerfCountersBuilder::PRIO_USEFUL);
+ b.add_time_avg(l_bluestore_state_deferred_queued_lat, "state_deferred_queued_lat",
+ "Average deferred_queued state latency",
+ "sdql", PerfCountersBuilder::PRIO_USEFUL);
+ b.add_time_avg(l_bluestore_state_deferred_aio_wait_lat, "state_deferred_aio_wait_lat",
+ "Average aio_wait state latency",
+ "sdal", PerfCountersBuilder::PRIO_USEFUL);
+ b.add_time_avg(l_bluestore_state_deferred_cleanup_lat, "state_deferred_cleanup_lat",
+ "Average cleanup state latency",
+ "sdcl", PerfCountersBuilder::PRIO_USEFUL);
+ //****************************************
+
+ // Update Transaction stats
+ //****************************************
+ b.add_time_avg(l_bluestore_throttle_lat, "txc_throttle_lat",
+ "Average submit throttle latency",
+ "th_l", PerfCountersBuilder::PRIO_CRITICAL);
+ b.add_time_avg(l_bluestore_submit_lat, "txc_submit_lat",
+ "Average submit latency",
+ "s_l", PerfCountersBuilder::PRIO_CRITICAL);
+ b.add_time_avg(l_bluestore_commit_lat, "txc_commit_lat",
+ "Average commit latency",
+ "c_l", PerfCountersBuilder::PRIO_CRITICAL);
+ b.add_u64_counter(l_bluestore_txc, "txc_count", "Transactions committed");
+ //****************************************
+
+ // Read op stats
+ //****************************************
+ b.add_time_avg(l_bluestore_read_onode_meta_lat, "read_onode_meta_lat",
+ "Average read onode metadata latency",
+ "roml", PerfCountersBuilder::PRIO_USEFUL);
+ b.add_time_avg(l_bluestore_read_wait_aio_lat, "read_wait_aio_lat",
+ "Average read I/O waiting latency",
+ "rwal", PerfCountersBuilder::PRIO_USEFUL);
+ b.add_time_avg(l_bluestore_csum_lat, "csum_lat",
+ "Average checksum latency",
+ "csml", PerfCountersBuilder::PRIO_USEFUL);
+ b.add_u64_counter(l_bluestore_read_eio, "read_eio",
+ "Read EIO errors propagated to high level callers");
+ b.add_u64_counter(l_bluestore_reads_with_retries, "reads_with_retries",
+ "Read operations that required at least one retry due to failed checksum validation",
+ "rd_r", PerfCountersBuilder::PRIO_USEFUL);
+ b.add_time_avg(l_bluestore_read_lat, "read_lat",
+ "Average read latency",
+ "r_l", PerfCountersBuilder::PRIO_CRITICAL);
+ //****************************************
+
+ // kv_thread latencies
+ //****************************************
+ b.add_time_avg(l_bluestore_kv_flush_lat, "kv_flush_lat",
+ "Average kv_thread flush latency",
+ "kfsl", PerfCountersBuilder::PRIO_INTERESTING);
+ b.add_time_avg(l_bluestore_kv_commit_lat, "kv_commit_lat",
+ "Average kv_thread commit latency",
+ "kcol", PerfCountersBuilder::PRIO_USEFUL);
+ b.add_time_avg(l_bluestore_kv_sync_lat, "kv_sync_lat",
+ "Average kv_sync thread latency",
+ "kscl", PerfCountersBuilder::PRIO_INTERESTING);
+ b.add_time_avg(l_bluestore_kv_final_lat, "kv_final_lat",
+ "Average kv_finalize thread latency",
+ "kfll", PerfCountersBuilder::PRIO_INTERESTING);
+ //****************************************
+
+ // write op stats
+ //****************************************
+ b.add_u64_counter(l_bluestore_write_big, "write_big",
+ "Large aligned writes into fresh blobs");
+ b.add_u64_counter(l_bluestore_write_big_bytes, "write_big_bytes",
+ "Large aligned writes into fresh blobs (bytes)",
+ NULL,
+ PerfCountersBuilder::PRIO_DEBUGONLY,
+ unit_t(UNIT_BYTES));
+ b.add_u64_counter(l_bluestore_write_big_blobs, "write_big_blobs",
+ "Large aligned writes into fresh blobs (blobs)");
+ b.add_u64_counter(l_bluestore_write_big_deferred,
+ "write_big_deferred",
+ "Big overwrites using deferred");
+
+ b.add_u64_counter(l_bluestore_write_small, "write_small",
+ "Small writes into existing or sparse small blobs");
+ b.add_u64_counter(l_bluestore_write_small_bytes, "write_small_bytes",
+ "Small writes into existing or sparse small blobs (bytes)",
+ NULL,
+ PerfCountersBuilder::PRIO_DEBUGONLY,
+ unit_t(UNIT_BYTES));
+ b.add_u64_counter(l_bluestore_write_small_unused,
+ "write_small_unused",
+ "Small writes into unused portion of existing blob");
+ b.add_u64_counter(l_bluestore_write_small_pre_read,
+ "write_small_pre_read",
+ "Small writes that required we read some data (possibly "
+ "cached) to fill out the block");
+
+ b.add_u64_counter(l_bluestore_write_pad_bytes, "write_pad_bytes",
+ "Sum for write-op padded bytes",
+ NULL,
+ PerfCountersBuilder::PRIO_DEBUGONLY,
+ unit_t(UNIT_BYTES));
+ b.add_u64_counter(l_bluestore_write_penalty_read_ops, "write_penalty_read_ops",
+ "Sum for write penalty read ops");
+ b.add_u64_counter(l_bluestore_write_new, "write_new",
+ "Write into new blob");
+
+ b.add_u64_counter(l_bluestore_issued_deferred_writes,
+ "issued_deferred_writes",
+ "Total deferred writes issued");
+ b.add_u64_counter(l_bluestore_issued_deferred_write_bytes,
+ "issued_deferred_write_bytes",
+ "Total bytes in issued deferred writes",
+ NULL,
+ PerfCountersBuilder::PRIO_DEBUGONLY,
+ unit_t(UNIT_BYTES));
+ b.add_u64_counter(l_bluestore_submitted_deferred_writes,
+ "submitted_deferred_writes",
+ "Total deferred writes submitted to disk");
+ b.add_u64_counter(l_bluestore_submitted_deferred_write_bytes,
+ "submitted_deferred_write_bytes",
+ "Total bytes submitted to disk by deferred writes",
+ NULL,
+ PerfCountersBuilder::PRIO_DEBUGONLY,
+ unit_t(UNIT_BYTES));
+
+ b.add_u64_counter(l_bluestore_write_big_skipped_blobs,
+ "write_big_skipped_blobs",
+ "Large aligned writes into fresh blobs skipped due to zero detection (blobs)");
+ b.add_u64_counter(l_bluestore_write_big_skipped_bytes,
+ "write_big_skipped_bytes",
+ "Large aligned writes into fresh blobs skipped due to zero detection (bytes)");
+ b.add_u64_counter(l_bluestore_write_small_skipped,
+ "write_small_skipped",
+ "Small writes into existing or sparse small blobs skipped due to zero detection");
+ b.add_u64_counter(l_bluestore_write_small_skipped_bytes,
+ "write_small_skipped_bytes",
+ "Small writes into existing or sparse small blobs skipped due to zero detection (bytes)");
+ //****************************************
+
+ // compressions stats
+ //****************************************
+ b.add_u64(l_bluestore_compressed, "compressed",
+ "Sum for stored compressed bytes",
+ "c", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+ b.add_u64(l_bluestore_compressed_allocated, "compressed_allocated",
+ "Sum for bytes allocated for compressed data",
+ "c_a", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+ b.add_u64(l_bluestore_compressed_original, "compressed_original",
+ "Sum for original bytes that were compressed",
+ "c_o", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+ b.add_time_avg(l_bluestore_compress_lat, "compress_lat",
+ "Average compress latency",
+ "_cpl", PerfCountersBuilder::PRIO_USEFUL);
+ b.add_time_avg(l_bluestore_decompress_lat, "decompress_lat",
+ "Average decompress latency",
+ "dcpl", PerfCountersBuilder::PRIO_USEFUL);
+ b.add_u64_counter(l_bluestore_compress_success_count, "compress_success_count",
+ "Sum for beneficial compress ops");
+ b.add_u64_counter(l_bluestore_compress_rejected_count, "compress_rejected_count",
+ "Sum for compress ops rejected due to low net gain of space");
+ //****************************************
+
+ // onode cache stats
+ //****************************************
+ b.add_u64(l_bluestore_onodes, "onodes",
+ "Number of onodes in cache");
+ b.add_u64(l_bluestore_pinned_onodes, "onodes_pinned",
+ "Number of pinned onodes in cache");
+ b.add_u64_counter(l_bluestore_onode_hits, "onode_hits",
+ "Count of onode cache lookup hits",
+ "o_ht", PerfCountersBuilder::PRIO_USEFUL);
+ b.add_u64_counter(l_bluestore_onode_misses, "onode_misses",
+ "Count of onode cache lookup misses",
+ "o_ms", PerfCountersBuilder::PRIO_USEFUL);
+ b.add_u64_counter(l_bluestore_onode_shard_hits, "onode_shard_hits",
+ "Count of onode shard cache lookups hits");
+ b.add_u64_counter(l_bluestore_onode_shard_misses,
+ "onode_shard_misses",
+ "Count of onode shard cache lookups misses");
+ b.add_u64(l_bluestore_extents, "onode_extents",
+ "Number of extents in cache");
+ b.add_u64(l_bluestore_blobs, "onode_blobs",
+ "Number of blobs in cache");
+ //****************************************
+
+ // buffer cache stats
+ //****************************************
+ b.add_u64(l_bluestore_buffers, "buffers",
+ "Number of buffers in cache");
+ b.add_u64(l_bluestore_buffer_bytes, "buffer_bytes",
+ "Number of buffer bytes in cache",
+ NULL,
+ PerfCountersBuilder::PRIO_DEBUGONLY,
+ unit_t(UNIT_BYTES));
+ b.add_u64_counter(l_bluestore_buffer_hit_bytes, "buffer_hit_bytes",
+ "Sum for bytes of read hit in the cache",
+ NULL,
+ PerfCountersBuilder::PRIO_DEBUGONLY,
+ unit_t(UNIT_BYTES));
+ b.add_u64_counter(l_bluestore_buffer_miss_bytes, "buffer_miss_bytes",
+ "Sum for bytes of read missed in the cache",
+ NULL,
+ PerfCountersBuilder::PRIO_DEBUGONLY,
+ unit_t(UNIT_BYTES));
+ //****************************************
+
+ // internal stats
+ //****************************************
+ b.add_u64_counter(l_bluestore_onode_reshard, "onode_reshard",
+ "Onode extent map reshard events");
+ b.add_u64_counter(l_bluestore_blob_split, "blob_split",
+ "Sum for blob splitting due to resharding");
+ b.add_u64_counter(l_bluestore_extent_compress, "extent_compress",
+ "Sum for extents that have been removed due to compression");
+ b.add_u64_counter(l_bluestore_gc_merged, "gc_merged",
+ "Sum for extents that have been merged due to garbage "
+ "collection");
+ //****************************************
+ // misc
+ //****************************************
+ b.add_u64_counter(l_bluestore_omap_iterator_count, "omap_iterator_count",
+ "Open omap iterators count");
+ b.add_u64_counter(l_bluestore_omap_rmkeys_count, "omap_rmkeys_count",
+ "amount of omap keys removed via rmkeys");
+ b.add_u64_counter(l_bluestore_omap_rmkey_ranges_count, "omap_rmkey_range_count",
+ "amount of omap key ranges removed via rmkeys");
+ //****************************************
+ // other client ops latencies
+ //****************************************
+ b.add_time_avg(l_bluestore_omap_seek_to_first_lat, "omap_seek_to_first_lat",
+ "Average omap iterator seek_to_first call latency",
+ "osfl", PerfCountersBuilder::PRIO_USEFUL);
+ b.add_time_avg(l_bluestore_omap_upper_bound_lat, "omap_upper_bound_lat",
+ "Average omap iterator upper_bound call latency",
+ "oubl", PerfCountersBuilder::PRIO_USEFUL);
+ b.add_time_avg(l_bluestore_omap_lower_bound_lat, "omap_lower_bound_lat",
+ "Average omap iterator lower_bound call latency",
+ "olbl", PerfCountersBuilder::PRIO_USEFUL);
+ b.add_time_avg(l_bluestore_omap_next_lat, "omap_next_lat",
+ "Average omap iterator next call latency",
+ "onxl", PerfCountersBuilder::PRIO_USEFUL);
+ b.add_time_avg(l_bluestore_omap_get_keys_lat, "omap_get_keys_lat",
+ "Average omap get_keys call latency",
+ "ogkl", PerfCountersBuilder::PRIO_USEFUL);
+ b.add_time_avg(l_bluestore_omap_get_values_lat, "omap_get_values_lat",
+ "Average omap get_values call latency",
+ "ogvl", PerfCountersBuilder::PRIO_USEFUL);
+ b.add_time_avg(l_bluestore_omap_clear_lat, "omap_clear_lat",
+ "Average omap clear call latency");
+ b.add_time_avg(l_bluestore_clist_lat, "clist_lat",
+ "Average collection listing latency",
+ "cl_l", PerfCountersBuilder::PRIO_USEFUL);
+ b.add_time_avg(l_bluestore_remove_lat, "remove_lat",
+ "Average removal latency",
+ "rm_l", PerfCountersBuilder::PRIO_USEFUL);
+ b.add_time_avg(l_bluestore_truncate_lat, "truncate_lat",
+ "Average truncate latency",
+ "tr_l", PerfCountersBuilder::PRIO_USEFUL);
+ //****************************************
+
+ // Resulting size axis configuration for op histograms, values are in bytes
+ PerfHistogramCommon::axis_config_d alloc_hist_x_axis_config{
+ "Given size (bytes)",
+ PerfHistogramCommon::SCALE_LOG2, ///< Request size in logarithmic scale
+ 0, ///< Start at 0
+ 4096, ///< Quantization unit
+ 13, ///< Enough to cover 4+M requests
+ };
+ // Req size axis configuration for op histograms, values are in bytes
+ PerfHistogramCommon::axis_config_d alloc_hist_y_axis_config{
+ "Request size (bytes)",
+ PerfHistogramCommon::SCALE_LOG2, ///< Request size in logarithmic scale
+ 0, ///< Start at 0
+ 4096, ///< Quantization unit
+ 13, ///< Enough to cover 4+M requests
+ };
+ b.add_u64_counter_histogram(
+ l_bluestore_allocate_hist, "allocate_histogram",
+ alloc_hist_x_axis_config, alloc_hist_y_axis_config,
+ "Histogram of requested block allocations vs. given ones");
+
+ logger = b.create_perf_counters();
+ cct->get_perfcounters_collection()->add(logger);
+}
+
+int BlueStore::_reload_logger()
+{
+ struct store_statfs_t store_statfs;
+ int r = statfs(&store_statfs);
+ if (r >= 0) {
+ logger->set(l_bluestore_allocated, store_statfs.allocated);
+ logger->set(l_bluestore_stored, store_statfs.data_stored);
+ logger->set(l_bluestore_compressed, store_statfs.data_compressed);
+ logger->set(l_bluestore_compressed_allocated, store_statfs.data_compressed_allocated);
+ logger->set(l_bluestore_compressed_original, store_statfs.data_compressed_original);
+ }
+ return r;
+}
+
+void BlueStore::_shutdown_logger()
+{
+ cct->get_perfcounters_collection()->remove(logger);
+ delete logger;
+}
+
+int BlueStore::get_block_device_fsid(CephContext* cct, const string& path,
+ uuid_d *fsid)
+{
+ bluestore_bdev_label_t label;
+ int r = _read_bdev_label(cct, path, &label);
+ if (r < 0)
+ return r;
+ *fsid = label.osd_uuid;
+ return 0;
+}
+
+int BlueStore::_open_path()
+{
+ // sanity check(s)
+ ceph_assert(path_fd < 0);
+ path_fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_DIRECTORY|O_CLOEXEC));
+ if (path_fd < 0) {
+ int r = -errno;
+ derr << __func__ << " unable to open " << path << ": " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+ return 0;
+}
+
+void BlueStore::_close_path()
+{
+ VOID_TEMP_FAILURE_RETRY(::close(path_fd));
+ path_fd = -1;
+}
+
+int BlueStore::_write_bdev_label(CephContext *cct,
+ const string &path, bluestore_bdev_label_t label)
+{
+ dout(10) << __func__ << " path " << path << " label " << label << dendl;
+ bufferlist bl;
+ encode(label, bl);
+ uint32_t crc = bl.crc32c(-1);
+ encode(crc, bl);
+ ceph_assert(bl.length() <= BDEV_LABEL_BLOCK_SIZE);
+ bufferptr z(BDEV_LABEL_BLOCK_SIZE - bl.length());
+ z.zero();
+ bl.append(std::move(z));
+
+ int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_WRONLY|O_CLOEXEC|O_DIRECT));
+ if (fd < 0) {
+ fd = -errno;
+ derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
+ << dendl;
+ return fd;
+ }
+ bl.rebuild_aligned_size_and_memory(BDEV_LABEL_BLOCK_SIZE, BDEV_LABEL_BLOCK_SIZE, IOV_MAX);
+ int r = bl.write_fd(fd);
+ if (r < 0) {
+ derr << __func__ << " failed to write to " << path
+ << ": " << cpp_strerror(r) << dendl;
+ goto out;
+ }
+ r = ::fsync(fd);
+ if (r < 0) {
+ derr << __func__ << " failed to fsync " << path
+ << ": " << cpp_strerror(r) << dendl;
+ }
+out:
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ return r;
+}
+
+int BlueStore::_read_bdev_label(CephContext* cct, const string &path,
+ bluestore_bdev_label_t *label)
+{
+ dout(10) << __func__ << dendl;
+ int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_RDONLY|O_CLOEXEC));
+ if (fd < 0) {
+ fd = -errno;
+ derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
+ << dendl;
+ return fd;
+ }
+ bufferlist bl;
+ int r = bl.read_fd(fd, BDEV_LABEL_BLOCK_SIZE);
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ if (r < 0) {
+ derr << __func__ << " failed to read from " << path
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ uint32_t crc, expected_crc;
+ auto p = bl.cbegin();
+ try {
+ decode(*label, p);
+ bufferlist t;
+ t.substr_of(bl, 0, p.get_off());
+ crc = t.crc32c(-1);
+ decode(expected_crc, p);
+ }
+ catch (ceph::buffer::error& e) {
+ derr << __func__ << " unable to decode label at offset " << p.get_off()
+ << ": " << e.what()
+ << dendl;
+ return -ENOENT;
+ }
+ if (crc != expected_crc) {
+ derr << __func__ << " bad crc on label, expected " << expected_crc
+ << " != actual " << crc << dendl;
+ return -EIO;
+ }
+ dout(10) << __func__ << " got " << *label << dendl;
+ return 0;
+}
+
+int BlueStore::_check_or_set_bdev_label(
+ string path, uint64_t size, string desc, bool create)
+{
+ bluestore_bdev_label_t label;
+ if (create) {
+ label.osd_uuid = fsid;
+ label.size = size;
+ label.btime = ceph_clock_now();
+ label.description = desc;
+ int r = _write_bdev_label(cct, path, label);
+ if (r < 0)
+ return r;
+ } else {
+ int r = _read_bdev_label(cct, path, &label);
+ if (r < 0)
+ return r;
+ if (cct->_conf->bluestore_debug_permit_any_bdev_label) {
+ dout(20) << __func__ << " bdev " << path << " fsid " << label.osd_uuid
+ << " and fsid " << fsid << " check bypassed" << dendl;
+ } else if (label.osd_uuid != fsid) {
+ derr << __func__ << " bdev " << path << " fsid " << label.osd_uuid
+ << " does not match our fsid " << fsid << dendl;
+ return -EIO;
+ }
+ }
+ return 0;
+}
+
+void BlueStore::_set_alloc_sizes(void)
+{
+ max_alloc_size = cct->_conf->bluestore_max_alloc_size;
+
+#ifdef HAVE_LIBZBD
+ ceph_assert(bdev);
+ if (bdev->is_smr()) {
+ prefer_deferred_size = 0;
+ } else
+#endif
+ if (cct->_conf->bluestore_prefer_deferred_size) {
+ prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size;
+ } else {
+ if (_use_rotational_settings()) {
+ prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_hdd;
+ } else {
+ prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_ssd;
+ }
+ }
+
+ if (cct->_conf->bluestore_deferred_batch_ops) {
+ deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops;
+ } else {
+ if (_use_rotational_settings()) {
+ deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_hdd;
+ } else {
+ deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_ssd;
+ }
+ }
+
+ dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
+ << std::dec << " order " << (int)min_alloc_size_order
+ << " max_alloc_size 0x" << std::hex << max_alloc_size
+ << " prefer_deferred_size 0x" << prefer_deferred_size
+ << std::dec
+ << " deferred_batch_ops " << deferred_batch_ops
+ << dendl;
+}
+
+int BlueStore::_open_bdev(bool create)
+{
+ ceph_assert(bdev == NULL);
+ string p = path + "/block";
+ bdev = BlockDevice::create(cct, p, aio_cb, static_cast<void*>(this), discard_cb, static_cast<void*>(this));
+ int r = bdev->open(p);
+ if (r < 0)
+ goto fail;
+
+ if (create && cct->_conf->bdev_enable_discard) {
+ interval_set<uint64_t> whole_device;
+ whole_device.insert(0, bdev->get_size());
+ bdev->try_discard(whole_device, false);
+ }
+
+ if (bdev->supported_bdev_label()) {
+ r = _check_or_set_bdev_label(p, bdev->get_size(), "main", create);
+ if (r < 0)
+ goto fail_close;
+ }
+
+ // initialize global block parameters
+ block_size = bdev->get_block_size();
+ block_mask = ~(block_size - 1);
+ block_size_order = std::countr_zero(block_size);
+ ceph_assert(block_size == 1u << block_size_order);
+ _set_max_defer_interval();
+ // and set cache_size based on device type
+ r = _set_cache_sizes();
+ if (r < 0) {
+ goto fail_close;
+ }
+ // get block dev optimal io size
+ optimal_io_size = bdev->get_optimal_io_size();
+
+ return 0;
+
+ fail_close:
+ bdev->close();
+ fail:
+ delete bdev;
+ bdev = NULL;
+ return r;
+}
+
+void BlueStore::_validate_bdev()
+{
+ ceph_assert(bdev);
+ uint64_t dev_size = bdev->get_size();
+ ceph_assert(dev_size > _get_ondisk_reserved());
+}
+
+void BlueStore::_close_bdev()
+{
+ ceph_assert(bdev);
+ bdev->close();
+ delete bdev;
+ bdev = NULL;
+}
+
+int BlueStore::_open_fm(KeyValueDB::Transaction t,
+ bool read_only,
+ bool db_avail,
+ bool fm_restore)
+{
+ int r;
+
+ dout(5) << __func__ << "::NCB::freelist_type=" << freelist_type << dendl;
+ ceph_assert(fm == NULL);
+ // fm_restore means we are transitioning from null-fm to bitmap-fm
+ ceph_assert(!fm_restore || (freelist_type != "null"));
+ // fm restore must pass in a valid transaction
+ ceph_assert(!fm_restore || (t != nullptr));
+
+ // when function is called in repair mode (to_repair=true) we skip db->open()/create()
+ bool can_have_null_fm = !is_db_rotational() &&
+ !read_only &&
+ db_avail &&
+ cct->_conf->bluestore_allocation_from_file &&
+ !bdev->is_smr();
+
+ // When allocation-info is stored in a single file we set freelist_type to "null"
+ if (can_have_null_fm) {
+ freelist_type = "null";
+ need_to_destage_allocation_file = true;
+ }
+ fm = FreelistManager::create(cct, freelist_type, PREFIX_ALLOC);
+ ceph_assert(fm);
+ if (t) {
+ // create mode. initialize freespace
+ dout(20) << __func__ << " initializing freespace" << dendl;
+ {
+ bufferlist bl;
+ bl.append(freelist_type);
+ t->set(PREFIX_SUPER, "freelist_type", bl);
+ }
+ // being able to allocate in units less than bdev block size
+ // seems to be a bad idea.
+ ceph_assert(cct->_conf->bdev_block_size <= min_alloc_size);
+
+ uint64_t alloc_size = min_alloc_size;
+ if (bdev->is_smr() && freelist_type != "zoned") {
+ derr << "SMR device but freelist_type = " << freelist_type << " (not zoned)"
+ << dendl;
+ return -EINVAL;
+ }
+ if (!bdev->is_smr() && freelist_type == "zoned") {
+ derr << "non-SMR device (or SMR support not built-in) but freelist_type = zoned"
+ << dendl;
+ return -EINVAL;
+ }
+
+ fm->create(bdev->get_size(), alloc_size,
+ zone_size, first_sequential_zone,
+ t);
+
+ // allocate superblock reserved space. note that we do not mark
+ // bluefs space as allocated in the freelist; we instead rely on
+ // bluefs doing that itself.
+ auto reserved = _get_ondisk_reserved();
+ if (fm_restore) {
+ // we need to allocate the full space in restore case
+ // as later we will add free-space marked in the allocator file
+ fm->allocate(0, bdev->get_size(), t);
+ } else {
+ // allocate superblock reserved space. note that we do not mark
+ // bluefs space as allocated in the freelist; we instead rely on
+ // bluefs doing that itself.
+ fm->allocate(0, reserved, t);
+ }
+ // debug code - not needed for NULL FM
+ if (cct->_conf->bluestore_debug_prefill > 0) {
+ uint64_t end = bdev->get_size() - reserved;
+ dout(1) << __func__ << " pre-fragmenting freespace, using "
+ << cct->_conf->bluestore_debug_prefill << " with max free extent "
+ << cct->_conf->bluestore_debug_prefragment_max << dendl;
+ uint64_t start = p2roundup(reserved, min_alloc_size);
+ uint64_t max_b = cct->_conf->bluestore_debug_prefragment_max / min_alloc_size;
+ float r = cct->_conf->bluestore_debug_prefill;
+ r /= 1.0 - r;
+ bool stop = false;
+
+ while (!stop && start < end) {
+ uint64_t l = (rand() % max_b + 1) * min_alloc_size;
+ if (start + l > end) {
+ l = end - start;
+ l = p2align(l, min_alloc_size);
+ }
+ ceph_assert(start + l <= end);
+
+ uint64_t u = 1 + (uint64_t)(r * (double)l);
+ u = p2roundup(u, min_alloc_size);
+ if (start + l + u > end) {
+ u = end - (start + l);
+ // trim to align so we don't overflow again
+ u = p2align(u, min_alloc_size);
+ stop = true;
+ }
+ ceph_assert(start + l + u <= end);
+
+ dout(20) << __func__ << " free 0x" << std::hex << start << "~" << l
+ << " use 0x" << u << std::dec << dendl;
+
+ if (u == 0) {
+ // break if u has been trimmed to nothing
+ break;
+ }
+
+ fm->allocate(start + l, u, t);
+ start += l + u;
+ }
+ }
+ r = _write_out_fm_meta(0);
+ ceph_assert(r == 0);
+ } else {
+ if (can_have_null_fm) {
+ commit_to_null_manager();
+ }
+ r = fm->init(db, read_only,
+ [&](const std::string& key, std::string* result) {
+ return read_meta(key, result);
+ });
+ if (r < 0) {
+ derr << __func__ << " failed: " << cpp_strerror(r) << dendl;
+ delete fm;
+ fm = NULL;
+ return r;
+ }
+ }
+ // if space size tracked by free list manager is that higher than actual
+ // dev size one can hit out-of-space allocation which will result
+ // in data loss and/or assertions
+ // Probably user altered the device size somehow.
+ // The only fix for now is to redeploy OSD.
+ if (fm->get_size() >= bdev->get_size() + min_alloc_size) {
+ ostringstream ss;
+ ss << "slow device size mismatch detected, "
+ << " fm size(" << fm->get_size()
+ << ") > slow device size(" << bdev->get_size()
+ << "), Please stop using this OSD as it might cause data loss.";
+ _set_disk_size_mismatch_alert(ss.str());
+ }
+ return 0;
+}
+
+void BlueStore::_close_fm()
+{
+ dout(10) << __func__ << dendl;
+ ceph_assert(fm);
+ fm->shutdown();
+ delete fm;
+ fm = NULL;
+}
+
+int BlueStore::_write_out_fm_meta(uint64_t target_size)
+{
+ int r = 0;
+ string p = path + "/block";
+
+ std::vector<std::pair<string, string>> fm_meta;
+ fm->get_meta(target_size, &fm_meta);
+
+ for (auto& m : fm_meta) {
+ r = write_meta(m.first, m.second);
+ ceph_assert(r == 0);
+ }
+ return r;
+}
+
+int BlueStore::_create_alloc()
+{
+ ceph_assert(alloc == NULL);
+ ceph_assert(shared_alloc.a == NULL);
+ ceph_assert(bdev->get_size());
+
+ uint64_t alloc_size = min_alloc_size;
+
+ std::string allocator_type = cct->_conf->bluestore_allocator;
+
+#ifdef HAVE_LIBZBD
+ if (freelist_type == "zoned") {
+ allocator_type = "zoned";
+ }
+#endif
+
+ alloc = Allocator::create(
+ cct, allocator_type,
+ bdev->get_size(),
+ alloc_size,
+ zone_size,
+ first_sequential_zone,
+ "block");
+ if (!alloc) {
+ lderr(cct) << __func__ << " failed to create " << allocator_type << " allocator"
+ << dendl;
+ return -EINVAL;
+ }
+
+#ifdef HAVE_LIBZBD
+ if (freelist_type == "zoned") {
+ Allocator *a = Allocator::create(
+ cct, cct->_conf->bluestore_allocator,
+ bdev->get_conventional_region_size(),
+ alloc_size,
+ 0, 0,
+ "zoned_block");
+ if (!a) {
+ lderr(cct) << __func__ << " failed to create " << cct->_conf->bluestore_allocator
+ << " allocator" << dendl;
+ delete alloc;
+ return -EINVAL;
+ }
+ shared_alloc.set(a, alloc_size);
+ } else
+#endif
+ {
+ // BlueFS will share the same allocator
+ shared_alloc.set(alloc, alloc_size);
+ }
+
+ return 0;
+}
+
+int BlueStore::_init_alloc(std::map<uint64_t, uint64_t> *zone_adjustments)
+{
+ int r = _create_alloc();
+ if (r < 0) {
+ return r;
+ }
+ ceph_assert(alloc != NULL);
+
+#ifdef HAVE_LIBZBD
+ if (bdev->is_smr()) {
+ auto a = dynamic_cast<ZonedAllocator*>(alloc);
+ ceph_assert(a);
+ auto f = dynamic_cast<ZonedFreelistManager*>(fm);
+ ceph_assert(f);
+ vector<uint64_t> wp = bdev->get_zones();
+ vector<zone_state_t> zones = f->get_zone_states(db);
+ ceph_assert(wp.size() == zones.size());
+
+ // reconcile zone state
+ auto num_zones = bdev->get_size() / zone_size;
+ for (unsigned i = first_sequential_zone; i < num_zones; ++i) {
+ ceph_assert(wp[i] >= i * zone_size);
+ ceph_assert(wp[i] <= (i + 1) * zone_size); // pos might be at start of next zone
+ uint64_t p = wp[i] - i * zone_size;
+ if (zones[i].write_pointer > p) {
+ derr << __func__ << " zone 0x" << std::hex << i
+ << " bluestore write pointer 0x" << zones[i].write_pointer
+ << " > device write pointer 0x" << p
+ << std::dec << " -- VERY SUSPICIOUS!" << dendl;
+ } else if (zones[i].write_pointer < p) {
+ // this is "normal" in that it can happen after any crash (if we have a
+ // write in flight but did not manage to commit the transaction)
+ auto delta = p - zones[i].write_pointer;
+ dout(1) << __func__ << " zone 0x" << std::hex << i
+ << " device write pointer 0x" << p
+ << " > bluestore pointer 0x" << zones[i].write_pointer
+ << ", advancing 0x" << delta << std::dec << dendl;
+ (*zone_adjustments)[zones[i].write_pointer] = delta;
+ zones[i].num_dead_bytes += delta;
+ zones[i].write_pointer = p;
+ }
+ }
+
+ // start with conventional zone "free" (bluefs may adjust this when it starts up)
+ auto reserved = _get_ondisk_reserved();
+ // for now we require a conventional zone
+ ceph_assert(bdev->get_conventional_region_size());
+ ceph_assert(shared_alloc.a != alloc); // zoned allocator doesn't use conventional region
+ shared_alloc.a->init_add_free(
+ reserved,
+ p2align(bdev->get_conventional_region_size(), min_alloc_size) - reserved);
+
+ // init sequential zone based on the device's write pointers
+ a->init_from_zone_pointers(std::move(zones));
+ dout(1) << __func__
+ << " loaded zone pointers: "
+ << std::hex
+ << ", allocator type " << alloc->get_type()
+ << ", capacity 0x" << alloc->get_capacity()
+ << ", block size 0x" << alloc->get_block_size()
+ << ", free 0x" << alloc->get_free()
+ << ", fragmentation " << alloc->get_fragmentation()
+ << std::dec << dendl;
+
+ return 0;
+ }
+#endif
+
+ uint64_t num = 0, bytes = 0;
+ utime_t start_time = ceph_clock_now();
+ if (!fm->is_null_manager()) {
+ // This is the original path - loading allocation map from RocksDB and feeding into the allocator
+ dout(5) << __func__ << "::NCB::loading allocation from FM -> alloc" << dendl;
+ // initialize from freelist
+ fm->enumerate_reset();
+ uint64_t offset, length;
+ while (fm->enumerate_next(db, &offset, &length)) {
+ alloc->init_add_free(offset, length);
+ ++num;
+ bytes += length;
+ }
+ fm->enumerate_reset();
+
+ utime_t duration = ceph_clock_now() - start_time;
+ dout(5) << __func__ << "::num_entries=" << num << " free_size=" << bytes << " alloc_size=" <<
+ alloc->get_capacity() - bytes << " time=" << duration << " seconds" << dendl;
+ } else {
+ // This is the new path reading the allocation map from a flat bluefs file and feeding them into the allocator
+
+ if (!cct->_conf->bluestore_allocation_from_file) {
+ derr << __func__ << "::NCB::cct->_conf->bluestore_allocation_from_file is set to FALSE with an active NULL-FM" << dendl;
+ derr << __func__ << "::NCB::Please change the value of bluestore_allocation_from_file to TRUE in your ceph.conf file" << dendl;
+ return -ENOTSUP; // Operation not supported
+ }
+ if (restore_allocator(alloc, &num, &bytes) == 0) {
+ dout(5) << __func__ << "::NCB::restore_allocator() completed successfully alloc=" << alloc << dendl;
+ } else {
+ // This must mean that we had an unplanned shutdown and didn't manage to destage the allocator
+ dout(0) << __func__ << "::NCB::restore_allocator() failed! Run Full Recovery from ONodes (might take a while) ..." << dendl;
+ // if failed must recover from on-disk ONode internal state
+ if (read_allocation_from_drive_on_startup() != 0) {
+ derr << __func__ << "::NCB::Failed Recovery" << dendl;
+ derr << __func__ << "::NCB::Ceph-OSD won't start, make sure your drives are connected and readable" << dendl;
+ derr << __func__ << "::NCB::If no HW fault is found, please report failure and consider redeploying OSD" << dendl;
+ return -ENOTRECOVERABLE;
+ }
+ }
+ }
+ dout(1) << __func__
+ << " loaded " << byte_u_t(bytes) << " in " << num << " extents"
+ << std::hex
+ << ", allocator type " << alloc->get_type()
+ << ", capacity 0x" << alloc->get_capacity()
+ << ", block size 0x" << alloc->get_block_size()
+ << ", free 0x" << alloc->get_free()
+ << ", fragmentation " << alloc->get_fragmentation()
+ << std::dec << dendl;
+
+ return 0;
+}
+
+void BlueStore::_post_init_alloc(const std::map<uint64_t, uint64_t>& zone_adjustments)
+{
+ int r = 0;
+#ifdef HAVE_LIBZBD
+ if (bdev->is_smr()) {
+ if (zone_adjustments.empty()) {
+ return;
+ }
+ dout(1) << __func__ << " adjusting freelist based on device write pointers" << dendl;
+ auto f = dynamic_cast<ZonedFreelistManager*>(fm);
+ ceph_assert(f);
+ KeyValueDB::Transaction t = db->get_transaction();
+ for (auto& i : zone_adjustments) {
+ // allocate AND release since this gap is now dead space
+ // note that the offset is imprecise, but only need to select the zone
+ f->allocate(i.first, i.second, t);
+ f->release(i.first, i.second, t);
+ }
+ r = db->submit_transaction_sync(t);
+ } else
+#endif
+ if (fm->is_null_manager()) {
+ // Now that we load the allocation map we need to invalidate the file as new allocation won't be reflected
+ // Changes to the allocation map (alloc/release) are not updated inline and will only be stored on umount()
+ // This means that we should not use the existing file on failure case (unplanned shutdown) and must resort
+ // to recovery from RocksDB::ONodes
+ r = invalidate_allocation_file_on_bluefs();
+ }
+ ceph_assert(r >= 0);
+}
+
+void BlueStore::_close_alloc()
+{
+ ceph_assert(bdev);
+ bdev->discard_drain();
+
+ ceph_assert(alloc);
+ alloc->shutdown();
+ delete alloc;
+
+ ceph_assert(shared_alloc.a);
+ if (alloc != shared_alloc.a) {
+ shared_alloc.a->shutdown();
+ delete shared_alloc.a;
+ }
+
+ shared_alloc.reset();
+ alloc = nullptr;
+}
+
+int BlueStore::_open_fsid(bool create)
+{
+ ceph_assert(fsid_fd < 0);
+ int flags = O_RDWR|O_CLOEXEC;
+ if (create)
+ flags |= O_CREAT;
+ fsid_fd = ::openat(path_fd, "fsid", flags, 0644);
+ if (fsid_fd < 0) {
+ int err = -errno;
+ derr << __func__ << " " << cpp_strerror(err) << dendl;
+ return err;
+ }
+ return 0;
+}
+
+int BlueStore::_read_fsid(uuid_d *uuid)
+{
+ char fsid_str[40];
+ memset(fsid_str, 0, sizeof(fsid_str));
+ int ret = safe_read(fsid_fd, fsid_str, sizeof(fsid_str));
+ if (ret < 0) {
+ derr << __func__ << " failed: " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+ if (ret > 36)
+ fsid_str[36] = 0;
+ else
+ fsid_str[ret] = 0;
+ if (!uuid->parse(fsid_str)) {
+ derr << __func__ << " unparsable uuid " << fsid_str << dendl;
+ return -EINVAL;
+ }
+ return 0;
+}
+
+int BlueStore::_write_fsid()
+{
+ int r = ::ftruncate(fsid_fd, 0);
+ if (r < 0) {
+ r = -errno;
+ derr << __func__ << " fsid truncate failed: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ string str = stringify(fsid) + "\n";
+ r = safe_write(fsid_fd, str.c_str(), str.length());
+ if (r < 0) {
+ derr << __func__ << " fsid write failed: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ r = ::fsync(fsid_fd);
+ if (r < 0) {
+ r = -errno;
+ derr << __func__ << " fsid fsync failed: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ return 0;
+}
+
+void BlueStore::_close_fsid()
+{
+ VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
+ fsid_fd = -1;
+}
+
+int BlueStore::_lock_fsid()
+{
+ struct flock l;
+ memset(&l, 0, sizeof(l));
+ l.l_type = F_WRLCK;
+ l.l_whence = SEEK_SET;
+ int r = ::fcntl(fsid_fd, F_SETLK, &l);
+ if (r < 0) {
+ int err = errno;
+ derr << __func__ << " failed to lock " << path << "/fsid"
+ << " (is another ceph-osd still running?)"
+ << cpp_strerror(err) << dendl;
+ return -err;
+ }
+ return 0;
+}
+
+bool BlueStore::is_rotational()
+{
+ if (bdev) {
+ return bdev->is_rotational();
+ }
+
+ bool rotational = true;
+ int r = _open_path();
+ if (r < 0)
+ goto out;
+ r = _open_fsid(false);
+ if (r < 0)
+ goto out_path;
+ r = _read_fsid(&fsid);
+ if (r < 0)
+ goto out_fsid;
+ r = _lock_fsid();
+ if (r < 0)
+ goto out_fsid;
+ r = _open_bdev(false);
+ if (r < 0)
+ goto out_fsid;
+ rotational = bdev->is_rotational();
+ _close_bdev();
+ out_fsid:
+ _close_fsid();
+ out_path:
+ _close_path();
+ out:
+ return rotational;
+}
+
+bool BlueStore::is_journal_rotational()
+{
+ if (!bluefs) {
+ dout(5) << __func__ << " bluefs disabled, default to store media type"
+ << dendl;
+ return is_rotational();
+ }
+ dout(10) << __func__ << " " << (int)bluefs->wal_is_rotational() << dendl;
+ return bluefs->wal_is_rotational();
+}
+
+bool BlueStore::is_db_rotational()
+{
+ if (!bluefs) {
+ dout(5) << __func__ << " bluefs disabled, default to store media type"
+ << dendl;
+ return is_rotational();
+ }
+ dout(10) << __func__ << " " << (int)bluefs->db_is_rotational() << dendl;
+ return bluefs->db_is_rotational();
+}
+
+bool BlueStore::_use_rotational_settings()
+{
+ if (cct->_conf->bluestore_debug_enforce_settings == "hdd") {
+ return true;
+ }
+ if (cct->_conf->bluestore_debug_enforce_settings == "ssd") {
+ return false;
+ }
+ return bdev->is_rotational();
+}
+
+bool BlueStore::is_statfs_recoverable() const
+{
+ // abuse fm for now
+ return has_null_manager();
+}
+
+bool BlueStore::test_mount_in_use()
+{
+ // most error conditions mean the mount is not in use (e.g., because
+ // it doesn't exist). only if we fail to lock do we conclude it is
+ // in use.
+ bool ret = false;
+ int r = _open_path();
+ if (r < 0)
+ return false;
+ r = _open_fsid(false);
+ if (r < 0)
+ goto out_path;
+ r = _lock_fsid();
+ if (r < 0)
+ ret = true; // if we can't lock, it is in use
+ _close_fsid();
+ out_path:
+ _close_path();
+ return ret;
+}
+
+int BlueStore::_minimal_open_bluefs(bool create)
+{
+ int r;
+ bluefs = new BlueFS(cct);
+
+ string bfn;
+ struct stat st;
+
+ bfn = path + "/block.db";
+ if (::stat(bfn.c_str(), &st) == 0) {
+ r = bluefs->add_block_device(
+ BlueFS::BDEV_DB, bfn,
+ create && cct->_conf->bdev_enable_discard,
+ SUPER_RESERVED);
+ if (r < 0) {
+ derr << __func__ << " add block device(" << bfn << ") returned: "
+ << cpp_strerror(r) << dendl;
+ goto free_bluefs;
+ }
+
+ if (bluefs->bdev_support_label(BlueFS::BDEV_DB)) {
+ r = _check_or_set_bdev_label(
+ bfn,
+ bluefs->get_block_device_size(BlueFS::BDEV_DB),
+ "bluefs db", create);
+ if (r < 0) {
+ derr << __func__
+ << " check block device(" << bfn << ") label returned: "
+ << cpp_strerror(r) << dendl;
+ goto free_bluefs;
+ }
+ }
+ bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
+ bluefs_layout.dedicated_db = true;
+ } else {
+ r = -errno;
+ if (::lstat(bfn.c_str(), &st) == -1) {
+ r = 0;
+ bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
+ } else {
+ derr << __func__ << " " << bfn << " symlink exists but target unusable: "
+ << cpp_strerror(r) << dendl;
+ goto free_bluefs;
+ }
+ }
+
+ // shared device
+ bfn = path + "/block";
+ // never trim here
+ r = bluefs->add_block_device(bluefs_layout.shared_bdev, bfn, false,
+ 0, // no need to provide valid 'reserved' for shared dev
+ &shared_alloc);
+ if (r < 0) {
+ derr << __func__ << " add block device(" << bfn << ") returned: "
+ << cpp_strerror(r) << dendl;
+ goto free_bluefs;
+ }
+
+ bfn = path + "/block.wal";
+ if (::stat(bfn.c_str(), &st) == 0) {
+ r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn,
+ create && cct->_conf->bdev_enable_discard,
+ BDEV_LABEL_BLOCK_SIZE);
+ if (r < 0) {
+ derr << __func__ << " add block device(" << bfn << ") returned: "
+ << cpp_strerror(r) << dendl;
+ goto free_bluefs;
+ }
+
+ if (bluefs->bdev_support_label(BlueFS::BDEV_WAL)) {
+ r = _check_or_set_bdev_label(
+ bfn,
+ bluefs->get_block_device_size(BlueFS::BDEV_WAL),
+ "bluefs wal", create);
+ if (r < 0) {
+ derr << __func__ << " check block device(" << bfn
+ << ") label returned: " << cpp_strerror(r) << dendl;
+ goto free_bluefs;
+ }
+ }
+
+ bluefs_layout.dedicated_wal = true;
+ } else {
+ r = 0;
+ if (::lstat(bfn.c_str(), &st) != -1) {
+ r = -errno;
+ derr << __func__ << " " << bfn << " symlink exists but target unusable: "
+ << cpp_strerror(r) << dendl;
+ goto free_bluefs;
+ }
+ }
+ return 0;
+
+free_bluefs:
+ ceph_assert(bluefs);
+ delete bluefs;
+ bluefs = NULL;
+ return r;
+}
+
+int BlueStore::_open_bluefs(bool create, bool read_only)
+{
+ int r = _minimal_open_bluefs(create);
+ if (r < 0) {
+ return r;
+ }
+ BlueFSVolumeSelector* vselector = nullptr;
+ if (bluefs_layout.shared_bdev == BlueFS::BDEV_SLOW ||
+ cct->_conf->bluestore_volume_selection_policy == "use_some_extra_enforced" ||
+ cct->_conf->bluestore_volume_selection_policy == "fit_to_fast") {
+
+ string options = cct->_conf->bluestore_rocksdb_options;
+ string options_annex = cct->_conf->bluestore_rocksdb_options_annex;
+ if (!options_annex.empty()) {
+ if (!options.empty() &&
+ *options.rbegin() != ',') {
+ options += ',';
+ }
+ options += options_annex;
+ }
+
+ rocksdb::Options rocks_opts;
+ r = RocksDBStore::ParseOptionsFromStringStatic(
+ cct,
+ options,
+ rocks_opts,
+ nullptr);
+ if (r < 0) {
+ return r;
+ }
+ if (cct->_conf->bluestore_volume_selection_policy == "fit_to_fast") {
+ vselector = new FitToFastVolumeSelector(
+ bluefs->get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
+ bluefs->get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
+ bluefs->get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100);
+ } else {
+ double reserved_factor = cct->_conf->bluestore_volume_selection_reserved_factor;
+ vselector =
+ new RocksDBBlueFSVolumeSelector(
+ bluefs->get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
+ bluefs->get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
+ bluefs->get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100,
+ 1024 * 1024 * 1024, //FIXME: set expected l0 size here
+ rocks_opts.max_bytes_for_level_base,
+ rocks_opts.max_bytes_for_level_multiplier,
+ reserved_factor,
+ cct->_conf->bluestore_volume_selection_reserved,
+ cct->_conf->bluestore_volume_selection_policy.find("use_some_extra")
+ == 0);
+ }
+ }
+ if (create) {
+ bluefs->mkfs(fsid, bluefs_layout);
+ }
+ bluefs->set_volume_selector(vselector);
+ r = bluefs->mount();
+ if (r < 0) {
+ derr << __func__ << " failed bluefs mount: " << cpp_strerror(r) << dendl;
+ }
+ ceph_assert_always(bluefs->maybe_verify_layout(bluefs_layout) == 0);
+ return r;
+}
+
+void BlueStore::_close_bluefs()
+{
+ bluefs->umount(db_was_opened_read_only);
+ _minimal_close_bluefs();
+}
+
+void BlueStore::_minimal_close_bluefs()
+{
+ delete bluefs;
+ bluefs = NULL;
+}
+
+int BlueStore::_is_bluefs(bool create, bool* ret)
+{
+ if (create) {
+ *ret = cct->_conf->bluestore_bluefs;
+ } else {
+ string s;
+ int r = read_meta("bluefs", &s);
+ if (r < 0) {
+ derr << __func__ << " unable to read 'bluefs' meta" << dendl;
+ return -EIO;
+ }
+ if (s == "1") {
+ *ret = true;
+ } else if (s == "0") {
+ *ret = false;
+ } else {
+ derr << __func__ << " bluefs = " << s << " : not 0 or 1, aborting"
+ << dendl;
+ return -EIO;
+ }
+ }
+ return 0;
+}
+
+/*
+* opens both DB and dependant super_meta, FreelistManager and allocator
+* in the proper order
+*/
+int BlueStore::_open_db_and_around(bool read_only, bool to_repair)
+{
+ dout(5) << __func__ << "::NCB::read_only=" << read_only << ", to_repair=" << to_repair << dendl;
+ {
+ string type;
+ int r = read_meta("type", &type);
+ if (r < 0) {
+ derr << __func__ << " failed to load os-type: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ if (type != "bluestore") {
+ derr << __func__ << " expected bluestore, but type is " << type << dendl;
+ return -EIO;
+ }
+ }
+
+ // SMR devices may require a freelist adjustment, but that can only happen after
+ // the db is read-write. we'll stash pending changes here.
+ std::map<uint64_t, uint64_t> zone_adjustments;
+
+ int r = _open_path();
+ if (r < 0)
+ return r;
+ r = _open_fsid(false);
+ if (r < 0)
+ goto out_path;
+
+ r = _read_fsid(&fsid);
+ if (r < 0)
+ goto out_fsid;
+
+ r = _lock_fsid();
+ if (r < 0)
+ goto out_fsid;
+
+ r = _open_bdev(false);
+ if (r < 0)
+ goto out_fsid;
+
+ // GBH: can probably skip open_db step in REad-Only mode when operating in NULL-FM mode
+ // (might need to open if failed to restore from file)
+
+ // open in read-only first to read FM list and init allocator
+ // as they might be needed for some BlueFS procedures
+ r = _open_db(false, false, true);
+ if (r < 0)
+ goto out_bdev;
+
+ r = _open_super_meta();
+ if (r < 0) {
+ goto out_db;
+ }
+
+ r = _open_fm(nullptr, true, false);
+ if (r < 0)
+ goto out_db;
+
+ r = _init_alloc(&zone_adjustments);
+ if (r < 0)
+ goto out_fm;
+
+ // Re-open in the proper mode(s).
+
+ // Can't simply bypass second open for read-only mode as we need to
+ // load allocated extents from bluefs into allocator.
+ // And now it's time to do that
+ //
+ _close_db();
+ r = _open_db(false, to_repair, read_only);
+ if (r < 0) {
+ goto out_alloc;
+ }
+
+ if (!read_only) {
+ _post_init_alloc(zone_adjustments);
+ }
+
+ // when function is called in repair mode (to_repair=true) we skip db->open()/create()
+ // we can't change bluestore allocation so no need to invlidate allocation-file
+ if (fm->is_null_manager() && !read_only && !to_repair) {
+ // Now that we load the allocation map we need to invalidate the file as new allocation won't be reflected
+ // Changes to the allocation map (alloc/release) are not updated inline and will only be stored on umount()
+ // This means that we should not use the existing file on failure case (unplanned shutdown) and must resort
+ // to recovery from RocksDB::ONodes
+ r = invalidate_allocation_file_on_bluefs();
+ if (r != 0) {
+ derr << __func__ << "::NCB::invalidate_allocation_file_on_bluefs() failed!" << dendl;
+ goto out_alloc;
+ }
+ }
+
+ // when function is called in repair mode (to_repair=true) we skip db->open()/create()
+ if (!is_db_rotational() && !read_only && !to_repair && cct->_conf->bluestore_allocation_from_file
+#ifdef HAVE_LIBZBD
+ && !bdev->is_smr()
+#endif
+ ) {
+ dout(5) << __func__ << "::NCB::Commit to Null-Manager" << dendl;
+ commit_to_null_manager();
+ need_to_destage_allocation_file = true;
+ dout(10) << __func__ << "::NCB::need_to_destage_allocation_file was set" << dendl;
+ }
+
+ return 0;
+
+out_alloc:
+ _close_alloc();
+out_fm:
+ _close_fm();
+ out_db:
+ _close_db();
+ out_bdev:
+ _close_bdev();
+ out_fsid:
+ _close_fsid();
+ out_path:
+ _close_path();
+ return r;
+}
+
+void BlueStore::_close_db_and_around()
+{
+ if (db) {
+ _close_db();
+ }
+ _close_around_db();
+}
+
+void BlueStore::_close_around_db()
+{
+ if (bluefs) {
+ _close_bluefs();
+ }
+ _close_fm();
+ _close_alloc();
+ _close_bdev();
+ _close_fsid();
+ _close_path();
+}
+
+int BlueStore::open_db_environment(KeyValueDB **pdb, bool to_repair)
+{
+ _kv_only = true;
+ int r = _open_db_and_around(false, to_repair);
+ if (r == 0) {
+ *pdb = db;
+ } else {
+ *pdb = nullptr;
+ }
+ return r;
+}
+
+int BlueStore::close_db_environment()
+{
+ if (db) {
+ delete db;
+ db = nullptr;
+ }
+ _close_around_db();
+ return 0;
+}
+
+/* gets access to bluefs supporting RocksDB */
+BlueFS* BlueStore::get_bluefs() {
+ return bluefs;
+}
+
+int BlueStore::_prepare_db_environment(bool create, bool read_only,
+ std::string* _fn, std::string* _kv_backend)
+{
+ int r;
+ ceph_assert(!db);
+ std::string& fn=*_fn;
+ std::string& kv_backend=*_kv_backend;
+ fn = path + "/db";
+ std::shared_ptr<Int64ArrayMergeOperator> merge_op(new Int64ArrayMergeOperator);
+
+ if (create) {
+ kv_backend = cct->_conf->bluestore_kvbackend;
+ } else {
+ r = read_meta("kv_backend", &kv_backend);
+ if (r < 0) {
+ derr << __func__ << " unable to read 'kv_backend' meta" << dendl;
+ return -EIO;
+ }
+ }
+ dout(10) << __func__ << " kv_backend = " << kv_backend << dendl;
+
+ bool do_bluefs;
+ r = _is_bluefs(create, &do_bluefs);
+ if (r < 0) {
+ return r;
+ }
+ dout(10) << __func__ << " do_bluefs = " << do_bluefs << dendl;
+
+ map<string,string> kv_options;
+ // force separate wal dir for all new deployments.
+ kv_options["separate_wal_dir"] = 1;
+ rocksdb::Env *env = NULL;
+ if (do_bluefs) {
+ dout(10) << __func__ << " initializing bluefs" << dendl;
+ if (kv_backend != "rocksdb") {
+ derr << " backend must be rocksdb to use bluefs" << dendl;
+ return -EINVAL;
+ }
+
+ r = _open_bluefs(create, read_only);
+ if (r < 0) {
+ return r;
+ }
+
+ if (cct->_conf->bluestore_bluefs_env_mirror) {
+ rocksdb::Env* a = new BlueRocksEnv(bluefs);
+ rocksdb::Env* b = rocksdb::Env::Default();
+ if (create) {
+ string cmd = "rm -rf " + path + "/db " +
+ path + "/db.slow " +
+ path + "/db.wal";
+ int r = system(cmd.c_str());
+ (void)r;
+ }
+ env = new rocksdb::EnvMirror(b, a, false, true);
+ } else {
+ env = new BlueRocksEnv(bluefs);
+
+ // simplify the dir names, too, as "seen" by rocksdb
+ fn = "db";
+ }
+ BlueFSVolumeSelector::paths paths;
+ bluefs->get_vselector_paths(fn, paths);
+
+ {
+ ostringstream db_paths;
+ bool first = true;
+ for (auto& p : paths) {
+ if (!first) {
+ db_paths << " ";
+ }
+ first = false;
+ db_paths << p.first << "," << p.second;
+
+ }
+ kv_options["db_paths"] = db_paths.str();
+ dout(1) << __func__ << " set db_paths to " << db_paths.str() << dendl;
+ }
+
+ if (create) {
+ for (auto& p : paths) {
+ env->CreateDir(p.first);
+ }
+ // Selectors don't provide wal path so far hence create explicitly
+ env->CreateDir(fn + ".wal");
+ } else {
+ std::vector<std::string> res;
+ // check for dir presence
+ auto r = env->GetChildren(fn+".wal", &res);
+ if (r.IsNotFound()) {
+ kv_options.erase("separate_wal_dir");
+ }
+ }
+ } else {
+ string walfn = path + "/db.wal";
+
+ if (create) {
+ int r = ::mkdir(fn.c_str(), 0755);
+ if (r < 0)
+ r = -errno;
+ if (r < 0 && r != -EEXIST) {
+ derr << __func__ << " failed to create " << fn << ": " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ // wal_dir, too!
+ r = ::mkdir(walfn.c_str(), 0755);
+ if (r < 0)
+ r = -errno;
+ if (r < 0 && r != -EEXIST) {
+ derr << __func__ << " failed to create " << walfn
+ << ": " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+ } else {
+ struct stat st;
+ r = ::stat(walfn.c_str(), &st);
+ if (r < 0 && errno == ENOENT) {
+ kv_options.erase("separate_wal_dir");
+ }
+ }
+ }
+
+
+ db = KeyValueDB::create(cct,
+ kv_backend,
+ fn,
+ kv_options,
+ static_cast<void*>(env));
+ if (!db) {
+ derr << __func__ << " error creating db" << dendl;
+ if (bluefs) {
+ _close_bluefs();
+ }
+ // delete env manually here since we can't depend on db to do this
+ // under this case
+ delete env;
+ env = NULL;
+ return -EIO;
+ }
+
+ FreelistManager::setup_merge_operators(db, freelist_type);
+ db->set_merge_operator(PREFIX_STAT, merge_op);
+ db->set_cache_size(cache_kv_ratio * cache_size);
+ return 0;
+}
+
+int BlueStore::_open_db(bool create, bool to_repair_db, bool read_only)
+{
+ int r;
+ ceph_assert(!(create && read_only));
+ string options;
+ string options_annex;
+ stringstream err;
+ string kv_dir_fn;
+ string kv_backend;
+ std::string sharding_def;
+ // prevent write attempts to BlueFS in case we failed before BlueFS was opened
+ db_was_opened_read_only = true;
+ r = _prepare_db_environment(create, read_only, &kv_dir_fn, &kv_backend);
+ if (r < 0) {
+ derr << __func__ << " failed to prepare db environment: " << err.str() << dendl;
+ return -EIO;
+ }
+ // if reached here then BlueFS is already opened
+ db_was_opened_read_only = read_only;
+ dout(10) << __func__ << "::db_was_opened_read_only was set to " << read_only << dendl;
+ if (kv_backend == "rocksdb") {
+ options = cct->_conf->bluestore_rocksdb_options;
+ options_annex = cct->_conf->bluestore_rocksdb_options_annex;
+ if (!options_annex.empty()) {
+ if (!options.empty() &&
+ *options.rbegin() != ',') {
+ options += ',';
+ }
+ options += options_annex;
+ }
+
+ if (cct->_conf.get_val<bool>("bluestore_rocksdb_cf")) {
+ sharding_def = cct->_conf.get_val<std::string>("bluestore_rocksdb_cfs");
+ }
+ }
+
+ db->init(options);
+ if (to_repair_db)
+ return 0;
+ if (create) {
+ r = db->create_and_open(err, sharding_def);
+ } else {
+ // we pass in cf list here, but it is only used if the db already has
+ // column families created.
+ r = read_only ?
+ db->open_read_only(err, sharding_def) :
+ db->open(err, sharding_def);
+ }
+ if (r) {
+ derr << __func__ << " erroring opening db: " << err.str() << dendl;
+ _close_db();
+ return -EIO;
+ }
+ dout(1) << __func__ << " opened " << kv_backend
+ << " path " << kv_dir_fn << " options " << options << dendl;
+ return 0;
+}
+
+void BlueStore::_close_db()
+{
+ dout(10) << __func__ << ":read_only=" << db_was_opened_read_only
+ << " fm=" << fm
+ << " destage_alloc_file=" << need_to_destage_allocation_file
+ << " per_pool=" << per_pool_stat_collection
+ << " pool stats=" << osd_pools.size()
+ << dendl;
+ bool do_destage = !db_was_opened_read_only && need_to_destage_allocation_file;
+ if (do_destage && is_statfs_recoverable()) {
+ auto t = db->get_transaction();
+ store_statfs_t s;
+ if (per_pool_stat_collection) {
+ KeyValueDB::Iterator it = db->get_iterator(PREFIX_STAT, KeyValueDB::ITERATOR_NOCACHE);
+ uint64_t pool_id;
+ for (it->upper_bound(string()); it->valid(); it->next()) {
+ int r = get_key_pool_stat(it->key(), &pool_id);
+ if (r >= 0) {
+ dout(10) << __func__ << " wiping statfs for: " << pool_id << dendl;
+ } else {
+ derr << __func__ << " wiping invalid statfs key: " << it->key() << dendl;
+ }
+ t->rmkey(PREFIX_STAT, it->key());
+ }
+
+ std::lock_guard l(vstatfs_lock);
+ for(auto &p : osd_pools) {
+ string key;
+ get_pool_stat_key(p.first, &key);
+ bufferlist bl;
+ if (!p.second.is_empty()) {
+ p.second.encode(bl);
+ p.second.publish(&s);
+ t->set(PREFIX_STAT, key, bl);
+ dout(10) << __func__ << " persisting: "
+ << p.first << "->" << s
+ << dendl;
+ }
+ }
+ } else {
+ bufferlist bl;
+ {
+ std::lock_guard l(vstatfs_lock);
+ vstatfs.encode(bl);
+ vstatfs.publish(&s);
+ }
+ t->set(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl);
+ dout(10) << __func__ << "persisting: " << s << dendl;
+ }
+ int r = db->submit_transaction_sync(t);
+ dout(10) << __func__ << " statfs persisted." << dendl;
+ ceph_assert(r >= 0);
+ }
+ ceph_assert(db);
+ delete db;
+ db = nullptr;
+
+ if (do_destage && fm && fm->is_null_manager()) {
+ int ret = store_allocator(alloc);
+ if (ret != 0) {
+ derr << __func__ << "::NCB::store_allocator() failed (continue with bitmapFreelistManager)" << dendl;
+ }
+ }
+
+ if (bluefs) {
+ _close_bluefs();
+ }
+}
+
+void BlueStore::_dump_alloc_on_failure()
+{
+ auto dump_interval =
+ cct->_conf->bluestore_bluefs_alloc_failure_dump_interval;
+ if (dump_interval > 0 &&
+ next_dump_on_bluefs_alloc_failure <= ceph_clock_now()) {
+ shared_alloc.a->dump();
+ next_dump_on_bluefs_alloc_failure = ceph_clock_now();
+ next_dump_on_bluefs_alloc_failure += dump_interval;
+ }
+}
+
+int BlueStore::_open_collections()
+{
+ if (!coll_map.empty()) {
+ // could be opened from another path
+ dout(20) << __func__ << "::NCB::collections are already opened, nothing to do" << dendl;
+ return 0;
+ }
+
+ dout(10) << __func__ << dendl;
+ collections_had_errors = false;
+ KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
+ size_t load_cnt = 0;
+ for (it->upper_bound(string());
+ it->valid();
+ it->next()) {
+ coll_t cid;
+ if (cid.parse(it->key())) {
+ auto c = ceph::make_ref<Collection>(
+ this,
+ onode_cache_shards[cid.hash_to_shard(onode_cache_shards.size())],
+ buffer_cache_shards[cid.hash_to_shard(buffer_cache_shards.size())],
+ cid);
+ bufferlist bl = it->value();
+ auto p = bl.cbegin();
+ try {
+ decode(c->cnode, p);
+ } catch (ceph::buffer::error& e) {
+ derr << __func__ << " failed to decode cnode, key:"
+ << pretty_binary_string(it->key()) << dendl;
+ return -EIO;
+ }
+ dout(20) << __func__ << " opened " << cid << " " << c
+ << " " << c->cnode << dendl;
+ _osr_attach(c.get());
+ coll_map[cid] = c;
+ load_cnt++;
+ } else {
+ derr << __func__ << " unrecognized collection " << it->key() << dendl;
+ collections_had_errors = true;
+ }
+ }
+ dout(10) << __func__ << " collections loaded: " << load_cnt
+ << dendl;
+ return 0;
+}
+
+void BlueStore::_fsck_collections(int64_t* errors)
+{
+ if (collections_had_errors) {
+ dout(10) << __func__ << dendl;
+ KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL, KeyValueDB::ITERATOR_NOCACHE);
+ for (it->upper_bound(string());
+ it->valid();
+ it->next()) {
+ coll_t cid;
+ if (!cid.parse(it->key())) {
+ derr << __func__ << " unrecognized collection " << it->key() << dendl;
+ if (errors) {
+ (*errors)++;
+ }
+ }
+ }
+ }
+}
+
+void BlueStore::_set_per_pool_omap()
+{
+ per_pool_omap = OMAP_BULK;
+ bufferlist bl;
+ db->get(PREFIX_SUPER, "per_pool_omap", &bl);
+ if (bl.length()) {
+ auto s = bl.to_str();
+ if (s == stringify(OMAP_PER_POOL)) {
+ per_pool_omap = OMAP_PER_POOL;
+ } else if (s == stringify(OMAP_PER_PG)) {
+ per_pool_omap = OMAP_PER_PG;
+ } else {
+ ceph_assert(s == stringify(OMAP_BULK));
+ }
+ dout(10) << __func__ << " per_pool_omap = " << per_pool_omap << dendl;
+ } else {
+ dout(10) << __func__ << " per_pool_omap not present" << dendl;
+ }
+ _check_no_per_pg_or_pool_omap_alert();
+}
+
+void BlueStore::_open_statfs()
+{
+ osd_pools.clear();
+ vstatfs.reset();
+
+ bufferlist bl;
+ int r = db->get(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, &bl);
+ if (r >= 0) {
+ per_pool_stat_collection = false;
+ if (size_t(bl.length()) >= sizeof(vstatfs.values)) {
+ auto it = bl.cbegin();
+ vstatfs.decode(it);
+ dout(10) << __func__ << " store_statfs is found" << dendl;
+ } else {
+ dout(10) << __func__ << " store_statfs is corrupt, using empty" << dendl;
+ }
+ _check_legacy_statfs_alert();
+ } else {
+ per_pool_stat_collection = true;
+ dout(10) << __func__ << " per-pool statfs is enabled" << dendl;
+ KeyValueDB::Iterator it = db->get_iterator(PREFIX_STAT, KeyValueDB::ITERATOR_NOCACHE);
+ for (it->upper_bound(string());
+ it->valid();
+ it->next()) {
+
+ uint64_t pool_id;
+ int r = get_key_pool_stat(it->key(), &pool_id);
+ ceph_assert(r == 0);
+
+ bufferlist bl;
+ bl = it->value();
+ auto p = bl.cbegin();
+ auto& st = osd_pools[pool_id];
+ try {
+ st.decode(p);
+ vstatfs += st;
+
+ dout(10) << __func__ << " pool " << std::hex << pool_id
+ << " statfs(hex) " << st
+ << std::dec << dendl;
+ } catch (ceph::buffer::error& e) {
+ derr << __func__ << " failed to decode pool stats, key:"
+ << pretty_binary_string(it->key()) << dendl;
+ }
+ }
+ }
+ dout(10) << __func__ << " statfs " << std::hex
+ << vstatfs << std::dec << dendl;
+
+}
+
+int BlueStore::_setup_block_symlink_or_file(
+ string name,
+ string epath,
+ uint64_t size,
+ bool create)
+{
+ dout(20) << __func__ << " name " << name << " path " << epath
+ << " size " << size << " create=" << (int)create << dendl;
+ int r = 0;
+ int flags = O_RDWR|O_CLOEXEC;
+ if (create)
+ flags |= O_CREAT;
+ if (epath.length()) {
+ r = ::symlinkat(epath.c_str(), path_fd, name.c_str());
+ if (r < 0) {
+ r = -errno;
+ derr << __func__ << " failed to create " << name << " symlink to "
+ << epath << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ if (!epath.compare(0, strlen(SPDK_PREFIX), SPDK_PREFIX)) {
+ int fd = ::openat(path_fd, epath.c_str(), flags, 0644);
+ if (fd < 0) {
+ r = -errno;
+ derr << __func__ << " failed to open " << epath << " file: "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+ // write the Transport ID of the NVMe device
+ // a transport id for PCIe looks like: "trtype:PCIe traddr:0000:02:00.0"
+ // where "0000:02:00.0" is the selector of a PCI device, see
+ // the first column of "lspci -mm -n -D"
+ // a transport id for tcp looks like: "trype:TCP adrfam:IPv4 traddr:172.31.89.152 trsvcid:4420"
+ string trid = epath.substr(strlen(SPDK_PREFIX));
+ r = ::write(fd, trid.c_str(), trid.size());
+ ceph_assert(r == static_cast<int>(trid.size()));
+ dout(1) << __func__ << " created " << name << " symlink to "
+ << epath << dendl;
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ }
+ }
+ if (size) {
+ int fd = ::openat(path_fd, name.c_str(), flags, 0644);
+ if (fd >= 0) {
+ // block file is present
+ struct stat st;
+ int r = ::fstat(fd, &st);
+ if (r == 0 &&
+ S_ISREG(st.st_mode) && // if it is a regular file
+ st.st_size == 0) { // and is 0 bytes
+ r = ::ftruncate(fd, size);
+ if (r < 0) {
+ r = -errno;
+ derr << __func__ << " failed to resize " << name << " file to "
+ << size << ": " << cpp_strerror(r) << dendl;
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ return r;
+ }
+
+ if (cct->_conf->bluestore_block_preallocate_file) {
+ r = ::ceph_posix_fallocate(fd, 0, size);
+ if (r > 0) {
+ derr << __func__ << " failed to prefallocate " << name << " file to "
+ << size << ": " << cpp_strerror(r) << dendl;
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ return -r;
+ }
+ }
+ dout(1) << __func__ << " resized " << name << " file to "
+ << byte_u_t(size) << dendl;
+ }
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ } else {
+ int r = -errno;
+ if (r != -ENOENT) {
+ derr << __func__ << " failed to open " << name << " file: "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+ }
+ return 0;
+}
+
+int BlueStore::mkfs()
+{
+ dout(1) << __func__ << " path " << path << dendl;
+ int r;
+ uuid_d old_fsid;
+ uint64_t reserved;
+ if (cct->_conf->osd_max_object_size > OBJECT_MAX_SIZE) {
+ derr << __func__ << " osd_max_object_size "
+ << cct->_conf->osd_max_object_size << " > bluestore max "
+ << OBJECT_MAX_SIZE << dendl;
+ return -EINVAL;
+ }
+
+ {
+ string done;
+ r = read_meta("mkfs_done", &done);
+ if (r == 0) {
+ dout(1) << __func__ << " already created" << dendl;
+ if (cct->_conf->bluestore_fsck_on_mkfs) {
+ r = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
+ if (r < 0) {
+ derr << __func__ << " fsck found fatal error: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+ if (r > 0) {
+ derr << __func__ << " fsck found " << r << " errors" << dendl;
+ r = -EIO;
+ }
+ }
+ return r; // idempotent
+ }
+ }
+
+ {
+ string type;
+ r = read_meta("type", &type);
+ if (r == 0) {
+ if (type != "bluestore") {
+ derr << __func__ << " expected bluestore, but type is " << type << dendl;
+ return -EIO;
+ }
+ } else {
+ r = write_meta("type", "bluestore");
+ if (r < 0)
+ return r;
+ }
+ }
+
+ r = _open_path();
+ if (r < 0)
+ return r;
+
+ r = _open_fsid(true);
+ if (r < 0)
+ goto out_path_fd;
+
+ r = _lock_fsid();
+ if (r < 0)
+ goto out_close_fsid;
+
+ r = _read_fsid(&old_fsid);
+ if (r < 0 || old_fsid.is_zero()) {
+ if (fsid.is_zero()) {
+ fsid.generate_random();
+ dout(1) << __func__ << " generated fsid " << fsid << dendl;
+ } else {
+ dout(1) << __func__ << " using provided fsid " << fsid << dendl;
+ }
+ // we'll write it later.
+ } else {
+ if (!fsid.is_zero() && fsid != old_fsid) {
+ derr << __func__ << " on-disk fsid " << old_fsid
+ << " != provided " << fsid << dendl;
+ r = -EINVAL;
+ goto out_close_fsid;
+ }
+ fsid = old_fsid;
+ }
+
+ r = _setup_block_symlink_or_file("block", cct->_conf->bluestore_block_path,
+ cct->_conf->bluestore_block_size,
+ cct->_conf->bluestore_block_create);
+ if (r < 0)
+ goto out_close_fsid;
+ if (cct->_conf->bluestore_bluefs) {
+ r = _setup_block_symlink_or_file("block.wal", cct->_conf->bluestore_block_wal_path,
+ cct->_conf->bluestore_block_wal_size,
+ cct->_conf->bluestore_block_wal_create);
+ if (r < 0)
+ goto out_close_fsid;
+ r = _setup_block_symlink_or_file("block.db", cct->_conf->bluestore_block_db_path,
+ cct->_conf->bluestore_block_db_size,
+ cct->_conf->bluestore_block_db_create);
+ if (r < 0)
+ goto out_close_fsid;
+ }
+
+ r = _open_bdev(true);
+ if (r < 0)
+ goto out_close_fsid;
+
+ // choose freelist manager
+#ifdef HAVE_LIBZBD
+ if (bdev->is_smr()) {
+ freelist_type = "zoned";
+ zone_size = bdev->get_zone_size();
+ first_sequential_zone = bdev->get_conventional_region_size() / zone_size;
+ bdev->reset_all_zones();
+ } else
+#endif
+ {
+ freelist_type = "bitmap";
+ }
+ dout(10) << " freelist_type " << freelist_type << dendl;
+
+ // choose min_alloc_size
+ dout(5) << __func__ << " optimal_io_size 0x" << std::hex << optimal_io_size
+ << " block_size: 0x" << block_size << std::dec << dendl;
+ if ((cct->_conf->bluestore_use_optimal_io_size_for_min_alloc_size) && (optimal_io_size != 0)) {
+ dout(5) << __func__ << " optimal_io_size 0x" << std::hex << optimal_io_size
+ << " for min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
+ min_alloc_size = optimal_io_size;
+ }
+ else if (cct->_conf->bluestore_min_alloc_size) {
+ min_alloc_size = cct->_conf->bluestore_min_alloc_size;
+ } else {
+ ceph_assert(bdev);
+ if (_use_rotational_settings()) {
+ min_alloc_size = cct->_conf->bluestore_min_alloc_size_hdd;
+ } else {
+ min_alloc_size = cct->_conf->bluestore_min_alloc_size_ssd;
+ }
+ }
+ _validate_bdev();
+
+ // make sure min_alloc_size is power of 2 aligned.
+ if (!std::has_single_bit(min_alloc_size)) {
+ derr << __func__ << " min_alloc_size 0x"
+ << std::hex << min_alloc_size << std::dec
+ << " is not power of 2 aligned!"
+ << dendl;
+ r = -EINVAL;
+ goto out_close_bdev;
+ }
+
+ // make sure min_alloc_size is >= and aligned with block size
+ if (min_alloc_size % block_size != 0) {
+ derr << __func__ << " min_alloc_size 0x"
+ << std::hex << min_alloc_size
+ << " is less or not aligned with block_size: 0x"
+ << block_size << std::dec << dendl;
+ r = -EINVAL;
+ goto out_close_bdev;
+ }
+
+ r = _create_alloc();
+ if (r < 0) {
+ goto out_close_bdev;
+ }
+
+ reserved = _get_ondisk_reserved();
+ alloc->init_add_free(reserved,
+ p2align(bdev->get_size(), min_alloc_size) - reserved);
+#ifdef HAVE_LIBZBD
+ if (bdev->is_smr() && alloc != shared_alloc.a) {
+ shared_alloc.a->init_add_free(reserved,
+ p2align(bdev->get_conventional_region_size(),
+ min_alloc_size) - reserved);
+ }
+#endif
+
+ r = _open_db(true);
+ if (r < 0)
+ goto out_close_alloc;
+
+ {
+ KeyValueDB::Transaction t = db->get_transaction();
+ r = _open_fm(t, false, true);
+ if (r < 0)
+ goto out_close_db;
+ {
+ bufferlist bl;
+ encode((uint64_t)0, bl);
+ t->set(PREFIX_SUPER, "nid_max", bl);
+ t->set(PREFIX_SUPER, "blobid_max", bl);
+ }
+
+ {
+ bufferlist bl;
+ encode((uint64_t)min_alloc_size, bl);
+ t->set(PREFIX_SUPER, "min_alloc_size", bl);
+ }
+ {
+ bufferlist bl;
+ if (cct->_conf.get_val<bool>("bluestore_debug_legacy_omap")) {
+ bl.append(stringify(OMAP_BULK));
+ } else {
+ bl.append(stringify(OMAP_PER_PG));
+ }
+ t->set(PREFIX_SUPER, "per_pool_omap", bl);
+ }
+
+#ifdef HAVE_LIBZBD
+ if (bdev->is_smr()) {
+ {
+ bufferlist bl;
+ encode((uint64_t)zone_size, bl);
+ t->set(PREFIX_SUPER, "zone_size", bl);
+ }
+ {
+ bufferlist bl;
+ encode((uint64_t)first_sequential_zone, bl);
+ t->set(PREFIX_SUPER, "first_sequential_zone", bl);
+ }
+ }
+#endif
+
+ ondisk_format = latest_ondisk_format;
+ _prepare_ondisk_format_super(t);
+ db->submit_transaction_sync(t);
+ }
+
+ r = write_meta("kv_backend", cct->_conf->bluestore_kvbackend);
+ if (r < 0)
+ goto out_close_fm;
+
+ r = write_meta("bluefs", stringify(bluefs ? 1 : 0));
+ if (r < 0)
+ goto out_close_fm;
+
+ if (fsid != old_fsid) {
+ r = _write_fsid();
+ if (r < 0) {
+ derr << __func__ << " error writing fsid: " << cpp_strerror(r) << dendl;
+ goto out_close_fm;
+ }
+ }
+
+ out_close_fm:
+ _close_fm();
+ out_close_db:
+ _close_db();
+ out_close_alloc:
+ _close_alloc();
+ out_close_bdev:
+ _close_bdev();
+ out_close_fsid:
+ _close_fsid();
+ out_path_fd:
+ _close_path();
+
+ if (r == 0 &&
+ cct->_conf->bluestore_fsck_on_mkfs) {
+ int rc = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
+ if (rc < 0)
+ return rc;
+ if (rc > 0) {
+ derr << __func__ << " fsck found " << rc << " errors" << dendl;
+ r = -EIO;
+ }
+ }
+
+ if (r == 0) {
+ // indicate success by writing the 'mkfs_done' file
+ r = write_meta("mkfs_done", "yes");
+ }
+
+ if (r < 0) {
+ derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
+ } else {
+ dout(0) << __func__ << " success" << dendl;
+ }
+ return r;
+}
+
+int BlueStore::add_new_bluefs_device(int id, const string& dev_path)
+{
+ dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl;
+ int r;
+ ceph_assert(path_fd < 0);
+
+ ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB);
+
+ if (!cct->_conf->bluestore_bluefs) {
+ derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
+ return -EIO;
+ }
+ dout(5) << __func__ << "::NCB::calling open_db_and_around(read-only)" << dendl;
+ r = _open_db_and_around(true);
+ if (r < 0) {
+ return r;
+ }
+
+ if (id == BlueFS::BDEV_NEWWAL) {
+ string p = path + "/block.wal";
+ r = _setup_block_symlink_or_file("block.wal", dev_path,
+ cct->_conf->bluestore_block_wal_size,
+ true);
+ ceph_assert(r == 0);
+
+ r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, p,
+ cct->_conf->bdev_enable_discard,
+ BDEV_LABEL_BLOCK_SIZE);
+ ceph_assert(r == 0);
+
+ if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
+ r = _check_or_set_bdev_label(
+ p,
+ bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL),
+ "bluefs wal",
+ true);
+ ceph_assert(r == 0);
+ }
+
+ bluefs_layout.dedicated_wal = true;
+ } else if (id == BlueFS::BDEV_NEWDB) {
+ string p = path + "/block.db";
+ r = _setup_block_symlink_or_file("block.db", dev_path,
+ cct->_conf->bluestore_block_db_size,
+ true);
+ ceph_assert(r == 0);
+
+ r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, p,
+ cct->_conf->bdev_enable_discard,
+ SUPER_RESERVED);
+ ceph_assert(r == 0);
+
+ if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
+ r = _check_or_set_bdev_label(
+ p,
+ bluefs->get_block_device_size(BlueFS::BDEV_NEWDB),
+ "bluefs db",
+ true);
+ ceph_assert(r == 0);
+ }
+ bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
+ bluefs_layout.dedicated_db = true;
+ }
+ bluefs->umount();
+ bluefs->mount();
+
+ r = bluefs->prepare_new_device(id, bluefs_layout);
+ ceph_assert(r == 0);
+
+ if (r < 0) {
+ derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
+ } else {
+ dout(0) << __func__ << " success" << dendl;
+ }
+
+ _close_db_and_around();
+ return r;
+}
+
+int BlueStore::migrate_to_existing_bluefs_device(const set<int>& devs_source,
+ int id)
+{
+ dout(10) << __func__ << " id:" << id << dendl;
+ ceph_assert(path_fd < 0);
+
+ ceph_assert(id == BlueFS::BDEV_SLOW || id == BlueFS::BDEV_DB);
+
+ if (!cct->_conf->bluestore_bluefs) {
+ derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
+ return -EIO;
+ }
+
+ int r = _open_db_and_around(true);
+ if (r < 0) {
+ return r;
+ }
+ auto close_db = make_scope_guard([&] {
+ _close_db_and_around();
+ });
+ uint64_t used_space = 0;
+ for(auto src_id : devs_source) {
+ used_space += bluefs->get_used(src_id);
+ }
+ uint64_t target_free = bluefs->get_free(id);
+ if (target_free < used_space) {
+ derr << __func__
+ << " can't migrate, free space at target: " << target_free
+ << " is less than required space: " << used_space
+ << dendl;
+ return -ENOSPC;
+ }
+ if (devs_source.count(BlueFS::BDEV_DB)) {
+ bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
+ bluefs_layout.dedicated_db = false;
+ }
+ if (devs_source.count(BlueFS::BDEV_WAL)) {
+ bluefs_layout.dedicated_wal = false;
+ }
+ r = bluefs->device_migrate_to_existing(cct, devs_source, id, bluefs_layout);
+ if (r < 0) {
+ derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ if (devs_source.count(BlueFS::BDEV_DB)) {
+ r = unlink(string(path + "/block.db").c_str());
+ ceph_assert(r == 0);
+ }
+ if (devs_source.count(BlueFS::BDEV_WAL)) {
+ r = unlink(string(path + "/block.wal").c_str());
+ ceph_assert(r == 0);
+ }
+ return r;
+}
+
+int BlueStore::migrate_to_new_bluefs_device(const set<int>& devs_source,
+ int id,
+ const string& dev_path)
+{
+ dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl;
+ ceph_assert(path_fd < 0);
+
+ ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB);
+
+ if (!cct->_conf->bluestore_bluefs) {
+ derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
+ return -EIO;
+ }
+
+ int r = _open_db_and_around(true);
+ if (r < 0) {
+ return r;
+ }
+ auto close_db = make_scope_guard([&] {
+ _close_db_and_around();
+ });
+
+ string link_db;
+ string link_wal;
+ if (devs_source.count(BlueFS::BDEV_DB) &&
+ bluefs_layout.shared_bdev != BlueFS::BDEV_DB) {
+ link_db = path + "/block.db";
+ bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
+ bluefs_layout.dedicated_db = false;
+ }
+ if (devs_source.count(BlueFS::BDEV_WAL)) {
+ link_wal = path + "/block.wal";
+ bluefs_layout.dedicated_wal = false;
+ }
+
+ size_t target_size = 0;
+ string target_name;
+ if (id == BlueFS::BDEV_NEWWAL) {
+ target_name = "block.wal";
+ target_size = cct->_conf->bluestore_block_wal_size;
+ bluefs_layout.dedicated_wal = true;
+
+ r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, dev_path,
+ cct->_conf->bdev_enable_discard,
+ BDEV_LABEL_BLOCK_SIZE);
+ ceph_assert(r == 0);
+
+ if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
+ r = _check_or_set_bdev_label(
+ dev_path,
+ bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL),
+ "bluefs wal",
+ true);
+ ceph_assert(r == 0);
+ }
+ } else if (id == BlueFS::BDEV_NEWDB) {
+ target_name = "block.db";
+ target_size = cct->_conf->bluestore_block_db_size;
+ bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
+ bluefs_layout.dedicated_db = true;
+
+ r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, dev_path,
+ cct->_conf->bdev_enable_discard,
+ SUPER_RESERVED);
+ ceph_assert(r == 0);
+
+ if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
+ r = _check_or_set_bdev_label(
+ dev_path,
+ bluefs->get_block_device_size(BlueFS::BDEV_NEWDB),
+ "bluefs db",
+ true);
+ ceph_assert(r == 0);
+ }
+ }
+
+ bluefs->umount();
+ bluefs->mount();
+
+ r = bluefs->device_migrate_to_new(cct, devs_source, id, bluefs_layout);
+
+ if (r < 0) {
+ derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ if (!link_db.empty()) {
+ r = unlink(link_db.c_str());
+ ceph_assert(r == 0);
+ }
+ if (!link_wal.empty()) {
+ r = unlink(link_wal.c_str());
+ ceph_assert(r == 0);
+ }
+ r = _setup_block_symlink_or_file(
+ target_name,
+ dev_path,
+ target_size,
+ true);
+ ceph_assert(r == 0);
+ dout(0) << __func__ << " success" << dendl;
+
+ return r;
+}
+
+string BlueStore::get_device_path(unsigned id)
+{
+ string res;
+ if (id < BlueFS::MAX_BDEV) {
+ switch (id) {
+ case BlueFS::BDEV_WAL:
+ res = path + "/block.wal";
+ break;
+ case BlueFS::BDEV_DB:
+ if (id == bluefs_layout.shared_bdev) {
+ res = path + "/block";
+ } else {
+ res = path + "/block.db";
+ }
+ break;
+ case BlueFS::BDEV_SLOW:
+ res = path + "/block";
+ break;
+ }
+ }
+ return res;
+}
+
+int BlueStore::_set_bdev_label_size(const string& path, uint64_t size)
+{
+ bluestore_bdev_label_t label;
+ int r = _read_bdev_label(cct, path, &label);
+ if (r < 0) {
+ derr << "unable to read label for " << path << ": "
+ << cpp_strerror(r) << dendl;
+ } else {
+ label.size = size;
+ r = _write_bdev_label(cct, path, label);
+ if (r < 0) {
+ derr << "unable to write label for " << path << ": "
+ << cpp_strerror(r) << dendl;
+ }
+ }
+ return r;
+}
+
+int BlueStore::expand_devices(ostream& out)
+{
+ int r = _open_db_and_around(true);
+ ceph_assert(r == 0);
+ bluefs->dump_block_extents(out);
+ out << "Expanding DB/WAL..." << std::endl;
+ for (auto devid : { BlueFS::BDEV_WAL, BlueFS::BDEV_DB}) {
+ if (devid == bluefs_layout.shared_bdev ) {
+ continue;
+ }
+ uint64_t size = bluefs->get_block_device_size(devid);
+ if (size == 0) {
+ // no bdev
+ continue;
+ }
+
+ out << devid
+ <<" : expanding " << " to 0x" << size << std::dec << std::endl;
+ string p = get_device_path(devid);
+ const char* path = p.c_str();
+ if (path == nullptr) {
+ derr << devid
+ <<": can't find device path " << dendl;
+ continue;
+ }
+ if (bluefs->bdev_support_label(devid)) {
+ if (_set_bdev_label_size(p, size) >= 0) {
+ out << devid
+ << " : size label updated to " << size
+ << std::endl;
+ }
+ }
+ }
+ uint64_t size0 = fm->get_size();
+ uint64_t size = bdev->get_size();
+ if (size0 < size) {
+ out << bluefs_layout.shared_bdev
+ << " : expanding " << " from 0x" << std::hex
+ << size0 << " to 0x" << size << std::dec << std::endl;
+ _write_out_fm_meta(size);
+ if (bdev->supported_bdev_label()) {
+ if (_set_bdev_label_size(path, size) >= 0) {
+ out << bluefs_layout.shared_bdev
+ << " : size label updated to " << size
+ << std::endl;
+ }
+ }
+
+ if (fm && fm->is_null_manager()) {
+ // we grow the allocation range, must reflect it in the allocation file
+ alloc->init_add_free(size0, size - size0);
+ need_to_destage_allocation_file = true;
+ }
+ _close_db_and_around();
+
+ // mount in read/write to sync expansion changes
+ r = _mount();
+ ceph_assert(r == 0);
+ umount();
+ } else {
+ _close_db_and_around();
+ }
+ return r;
+}
+
+int BlueStore::dump_bluefs_sizes(ostream& out)
+{
+ int r = _open_db_and_around(true);
+ ceph_assert(r == 0);
+ bluefs->dump_block_extents(out);
+ _close_db_and_around();
+ return r;
+}
+
+void BlueStore::set_cache_shards(unsigned num)
+{
+ dout(10) << __func__ << " " << num << dendl;
+ size_t oold = onode_cache_shards.size();
+ size_t bold = buffer_cache_shards.size();
+ ceph_assert(num >= oold && num >= bold);
+ onode_cache_shards.resize(num);
+ buffer_cache_shards.resize(num);
+ for (unsigned i = oold; i < num; ++i) {
+ onode_cache_shards[i] =
+ OnodeCacheShard::create(cct, cct->_conf->bluestore_cache_type,
+ logger);
+ }
+ for (unsigned i = bold; i < num; ++i) {
+ buffer_cache_shards[i] =
+ BufferCacheShard::create(cct, cct->_conf->bluestore_cache_type,
+ logger);
+ }
+}
+
+//---------------------------------------------
+bool BlueStore::has_null_manager() const
+{
+ return (fm && fm->is_null_manager());
+}
+
+int BlueStore::_mount()
+{
+ dout(5) << __func__ << "NCB:: path " << path << dendl;
+
+ _kv_only = false;
+ if (cct->_conf->bluestore_fsck_on_mount) {
+ dout(5) << __func__ << "::NCB::calling fsck()" << dendl;
+ int rc = fsck(cct->_conf->bluestore_fsck_on_mount_deep);
+ if (rc < 0)
+ return rc;
+ if (rc > 0) {
+ derr << __func__ << " fsck found " << rc << " errors" << dendl;
+ return -EIO;
+ }
+ }
+
+ if (cct->_conf->osd_max_object_size > OBJECT_MAX_SIZE) {
+ derr << __func__ << " osd_max_object_size "
+ << cct->_conf->osd_max_object_size << " > bluestore max "
+ << OBJECT_MAX_SIZE << dendl;
+ return -EINVAL;
+ }
+
+ dout(5) << __func__ << "::NCB::calling open_db_and_around(read/write)" << dendl;
+ int r = _open_db_and_around(false);
+ if (r < 0) {
+ return r;
+ }
+ auto close_db = make_scope_guard([&] {
+ if (!mounted) {
+ _close_db_and_around();
+ }
+ });
+
+ r = _upgrade_super();
+ if (r < 0) {
+ return r;
+ }
+
+ // The recovery process for allocation-map needs to open collection early
+ r = _open_collections();
+ if (r < 0) {
+ return r;
+ }
+ auto shutdown_cache = make_scope_guard([&] {
+ if (!mounted) {
+ _shutdown_cache();
+ }
+ });
+
+ r = _reload_logger();
+ if (r < 0) {
+ return r;
+ }
+
+ _kv_start();
+ auto stop_kv = make_scope_guard([&] {
+ if (!mounted) {
+ _kv_stop();
+ }
+ });
+
+ r = _deferred_replay();
+ if (r < 0) {
+ return r;
+ }
+
+#ifdef HAVE_LIBZBD
+ if (bdev->is_smr()) {
+ _zoned_cleaner_start();
+ }
+#endif
+
+ mempool_thread.init();
+
+ if ((!per_pool_stat_collection || per_pool_omap != OMAP_PER_PG) &&
+ cct->_conf->bluestore_fsck_quick_fix_on_mount == true) {
+
+ auto was_per_pool_omap = per_pool_omap;
+
+ dout(1) << __func__ << " quick-fix on mount" << dendl;
+ _fsck_on_open(FSCK_SHALLOW, true);
+
+ //set again as hopefully it has been fixed
+ if (was_per_pool_omap != OMAP_PER_PG) {
+ _set_per_pool_omap();
+ }
+ }
+
+ mounted = true;
+ return 0;
+}
+
+int BlueStore::umount()
+{
+ ceph_assert(_kv_only || mounted);
+ _osr_drain_all();
+
+ mounted = false;
+
+ ceph_assert(alloc);
+
+ if (!_kv_only) {
+ mempool_thread.shutdown();
+#ifdef HAVE_LIBZBD
+ if (bdev->is_smr()) {
+ dout(20) << __func__ << " stopping zone cleaner thread" << dendl;
+ _zoned_cleaner_stop();
+ }
+#endif
+ dout(20) << __func__ << " stopping kv thread" << dendl;
+ _kv_stop();
+ // skip cache cleanup step on fast shutdown
+ if (likely(!m_fast_shutdown)) {
+ _shutdown_cache();
+ }
+ dout(20) << __func__ << " closing" << dendl;
+ }
+ _close_db_and_around();
+ // disable fsck on fast-shutdown
+ if (cct->_conf->bluestore_fsck_on_umount && !m_fast_shutdown) {
+ int rc = fsck(cct->_conf->bluestore_fsck_on_umount_deep);
+ if (rc < 0)
+ return rc;
+ if (rc > 0) {
+ derr << __func__ << " fsck found " << rc << " errors" << dendl;
+ return -EIO;
+ }
+ }
+ return 0;
+}
+
+int BlueStore::cold_open()
+{
+ return _open_db_and_around(true);
+}
+
+int BlueStore::cold_close()
+{
+ _close_db_and_around();
+ return 0;
+}
+
+// derr wrapper to limit enormous output and avoid log flooding.
+// Of limited use where such output is expected for now
+#define fsck_derr(err_cnt, threshold) \
+ if (err_cnt <= threshold) { \
+ bool need_skip_print = err_cnt == threshold; \
+ derr
+
+#define fsck_dendl \
+ dendl; \
+ if (need_skip_print) \
+ derr << "more error lines skipped..." << dendl; \
+ }
+
+int _fsck_sum_extents(
+ const PExtentVector& extents,
+ bool compressed,
+ store_statfs_t& expected_statfs)
+{
+ for (auto e : extents) {
+ if (!e.is_valid())
+ continue;
+ expected_statfs.allocated += e.length;
+ if (compressed) {
+ expected_statfs.data_compressed_allocated += e.length;
+ }
+ }
+ return 0;
+}
+
+int BlueStore::_fsck_check_extents(
+ std::string_view ctx_descr,
+ const PExtentVector& extents,
+ bool compressed,
+ mempool_dynamic_bitset &used_blocks,
+ uint64_t granularity,
+ BlueStoreRepairer* repairer,
+ store_statfs_t& expected_statfs,
+ FSCKDepth depth)
+{
+ dout(30) << __func__ << " " << ctx_descr << ", extents " << extents << dendl;
+ int errors = 0;
+ for (auto e : extents) {
+ if (!e.is_valid())
+ continue;
+ expected_statfs.allocated += e.length;
+ if (compressed) {
+ expected_statfs.data_compressed_allocated += e.length;
+ }
+ if (depth != FSCK_SHALLOW) {
+ bool already = false;
+ apply_for_bitset_range(
+ e.offset, e.length, granularity, used_blocks,
+ [&](uint64_t pos, mempool_dynamic_bitset &bs) {
+ if (bs.test(pos)) {
+ if (repairer) {
+ repairer->note_misreference(
+ pos * min_alloc_size, min_alloc_size, !already);
+ }
+ if (!already) {
+ derr << __func__ << "::fsck error: " << ctx_descr << ", extent " << e
+ << " or a subset is already allocated (misreferenced)" << dendl;
+ ++errors;
+ already = true;
+ }
+ }
+ else
+ bs.set(pos);
+ });
+
+ if (e.end() > bdev->get_size()) {
+ derr << "fsck error: " << ctx_descr << ", extent " << e
+ << " past end of block device" << dendl;
+ ++errors;
+ }
+ }
+ }
+ return errors;
+}
+
+void BlueStore::_fsck_check_statfs(
+ const store_statfs_t& expected_statfs,
+ const per_pool_statfs& expected_pool_statfs,
+ int64_t& errors,
+ int64_t& warnings,
+ BlueStoreRepairer* repairer)
+{
+ string key;
+ store_statfs_t actual_statfs;
+ store_statfs_t s;
+ {
+ // make a copy
+ per_pool_statfs my_expected_pool_statfs(expected_pool_statfs);
+ auto op = osd_pools.begin();
+ while (op != osd_pools.end()) {
+ get_pool_stat_key(op->first, &key);
+ op->second.publish(&s);
+ auto it_expected = my_expected_pool_statfs.find(op->first);
+ if (it_expected == my_expected_pool_statfs.end()) {
+ auto op0 = op++;
+ if (op0->second.is_empty()) {
+ // It's OK to lack relevant empty statfs record
+ continue;
+ }
+ derr << __func__ << "::fsck error: " << std::hex
+ << "pool " << op0->first << " has got no statfs to match against: "
+ << s
+ << std::dec << dendl;
+ ++errors;
+ if (repairer) {
+ osd_pools.erase(op0);
+ repairer->remove_key(db, PREFIX_STAT, key);
+ }
+ } else {
+ if (!(s == it_expected->second)) {
+ derr << "fsck error: actual " << s
+ << " != expected " << it_expected->second
+ << " for pool "
+ << std::hex << op->first << std::dec << dendl;
+ ++errors;
+ if (repairer) {
+ // repair in-memory in a hope this would be flushed properly on shutdown
+ s = it_expected->second;
+ op->second = it_expected->second;
+ repairer->fix_statfs(db, key, it_expected->second);
+ }
+ }
+ actual_statfs.add(s);
+ my_expected_pool_statfs.erase(it_expected);
+ ++op;
+ }
+ }
+ // check stats that lack matching entities in osd_pools
+ for (auto &p : my_expected_pool_statfs) {
+ if (p.second.is_zero()) {
+ // It's OK to lack relevant empty statfs record
+ continue;
+ }
+ get_pool_stat_key(p.first, &key);
+ derr << __func__ << "::fsck error: " << std::hex
+ << "pool " << p.first << " has got no actual statfs: "
+ << std::dec << p.second
+ << dendl;
+ ++errors;
+ if (repairer) {
+ osd_pools[p.first] = p.second;
+ repairer->fix_statfs(db, key, p.second);
+ actual_statfs.add(p.second);
+ }
+ }
+ }
+ // process global statfs
+ if (repairer) {
+ if (!per_pool_stat_collection) {
+ // by virtue of running this method, we correct the top-level
+ // error of having global stats
+ repairer->remove_key(db, PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY);
+ per_pool_stat_collection = true;
+ }
+ vstatfs = actual_statfs;
+ dout(20) << __func__ << " setting vstatfs to " << actual_statfs << dendl;
+ } else if (!per_pool_stat_collection) {
+ // check global stats only if fscking (not repairing) w/o per-pool stats
+ vstatfs.publish(&s);
+ if (!(s == expected_statfs)) {
+ derr << "fsck error: actual " << s
+ << " != expected " << expected_statfs << dendl;
+ ++errors;
+ }
+ }
+}
+
+void BlueStore::_fsck_repair_shared_blobs(
+ BlueStoreRepairer& repairer,
+ shared_blob_2hash_tracker_t& sb_ref_counts,
+ sb_info_space_efficient_map_t& sb_info)
+{
+ auto sb_ref_mismatches = sb_ref_counts.count_non_zero();
+ dout(1) << __func__ << " repairing shared_blobs, ref mismatch estimate: "
+ << sb_ref_mismatches << dendl;
+ if (!sb_ref_mismatches) // not expected to succeed, just in case
+ return;
+
+
+ auto foreach_shared_blob = [&](std::function<
+ void (coll_t,
+ ghobject_t,
+ uint64_t,
+ const bluestore_blob_t&)> cb) {
+ auto it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE);
+ if (it) {
+ CollectionRef c;
+ spg_t pgid;
+ for (it->lower_bound(string()); it->valid(); it->next()) {
+ dout(30) << __func__ << " key "
+ << pretty_binary_string(it->key())
+ << dendl;
+ if (is_extent_shard_key(it->key())) {
+ continue;
+ }
+
+ ghobject_t oid;
+ int r = get_key_object(it->key(), &oid);
+ if (r < 0) {
+ continue;
+ }
+
+ if (!c ||
+ oid.shard_id != pgid.shard ||
+ oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
+ !c->contains(oid)) {
+ c = nullptr;
+ for (auto& p : coll_map) {
+ if (p.second->contains(oid)) {
+ c = p.second;
+ break;
+ }
+ }
+ if (!c) {
+ continue;
+ }
+ }
+ dout(20) << __func__
+ << " inspecting shared blob refs for col:" << c->cid
+ << " obj:" << oid
+ << dendl;
+
+ OnodeRef o;
+ o.reset(Onode::create_decode(c, oid, it->key(), it->value()));
+ o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
+
+ _dump_onode<30>(cct, *o);
+
+ mempool::bluestore_fsck::set<BlobRef> passed_sbs;
+ for (auto& e : o->extent_map.extent_map) {
+ auto& b = e.blob->get_blob();
+ if (b.is_shared() && passed_sbs.count(e.blob) == 0) {
+ auto sbid = e.blob->shared_blob->get_sbid();
+ cb(c->cid, oid, sbid, b);
+ passed_sbs.emplace(e.blob);
+ }
+ } // for ... extent_map
+ } // for ... it->valid
+ } //if (it(PREFIX_OBJ))
+ }; //foreach_shared_blob fn declaration
+
+ mempool::bluestore_fsck::map<uint64_t, bluestore_extent_ref_map_t> refs_map;
+
+ // first iteration over objects to identify all the broken sbids
+ foreach_shared_blob( [&](coll_t cid,
+ ghobject_t oid,
+ uint64_t sbid,
+ const bluestore_blob_t& b) {
+ auto it = refs_map.lower_bound(sbid);
+ if(it != refs_map.end() && it->first == sbid) {
+ return;
+ }
+ for (auto& p : b.get_extents()) {
+ if (p.is_valid() &&
+ !sb_ref_counts.test_all_zero_range(sbid,
+ p.offset,
+ p.length)) {
+ refs_map.emplace_hint(it, sbid, bluestore_extent_ref_map_t());
+ dout(20) << __func__
+ << " broken shared blob found for col:" << cid
+ << " obj:" << oid
+ << " sbid 0x " << std::hex << sbid << std::dec
+ << dendl;
+ break;
+ }
+ }
+ });
+
+ // second iteration over objects to build new ref map for the broken sbids
+ foreach_shared_blob( [&](coll_t cid,
+ ghobject_t oid,
+ uint64_t sbid,
+ const bluestore_blob_t& b) {
+ auto it = refs_map.find(sbid);
+ if(it == refs_map.end()) {
+ return;
+ }
+ for (auto& p : b.get_extents()) {
+ if (p.is_valid()) {
+ it->second.get(p.offset, p.length);
+ break;
+ }
+ }
+ });
+
+ // update shared blob records
+ auto ref_it = refs_map.begin();
+ while (ref_it != refs_map.end()) {
+ size_t cnt = 0;
+ const size_t max_transactions = 4096;
+ KeyValueDB::Transaction txn = db->get_transaction();
+ for (cnt = 0;
+ cnt < max_transactions && ref_it != refs_map.end();
+ ref_it++) {
+ auto sbid = ref_it->first;
+ dout(20) << __func__ << " repaired shared_blob 0x"
+ << std::hex << sbid << std::dec
+ << ref_it->second << dendl;
+ repairer.fix_shared_blob(txn, sbid, &ref_it->second, 0);
+ cnt++;
+ }
+ if (cnt) {
+ db->submit_transaction_sync(txn);
+ cnt = 0;
+ }
+ }
+ // remove stray shared blob records
+ size_t cnt = 0;
+ const size_t max_transactions = 4096;
+ KeyValueDB::Transaction txn = db->get_transaction();
+ sb_info.foreach_stray([&](const sb_info_t& sbi) {
+ auto sbid = sbi.get_sbid();
+ dout(20) << __func__ << " removing stray shared_blob 0x"
+ << std::hex << sbid << std::dec
+ << dendl;
+ repairer.fix_shared_blob(txn, sbid, nullptr, 0);
+ cnt++;
+ if (cnt >= max_transactions) {}
+ db->submit_transaction_sync(txn);
+ txn = db->get_transaction();
+ cnt = 0;
+ });
+ if (cnt > 0) {
+ db->submit_transaction_sync(txn);
+ }
+
+ // amount of repairs to report to be equal to previously
+ // determined error estimation, not the actual number of updated shared blobs
+ repairer.inc_repaired(sb_ref_mismatches);
+}
+
+BlueStore::OnodeRef BlueStore::fsck_check_objects_shallow(
+ BlueStore::FSCKDepth depth,
+ int64_t pool_id,
+ BlueStore::CollectionRef c,
+ const ghobject_t& oid,
+ const string& key,
+ const bufferlist& value,
+ mempool::bluestore_fsck::list<string>* expecting_shards,
+ map<BlobRef, bluestore_blob_t::unused_t>* referenced,
+ const BlueStore::FSCK_ObjectCtx& ctx)
+{
+ auto& errors = ctx.errors;
+ auto& num_objects = ctx.num_objects;
+ auto& num_extents = ctx.num_extents;
+ auto& num_blobs = ctx.num_blobs;
+ auto& num_sharded_objects = ctx.num_sharded_objects;
+ auto& num_spanning_blobs = ctx.num_spanning_blobs;
+ auto used_blocks = ctx.used_blocks;
+ auto sb_info_lock = ctx.sb_info_lock;
+ auto& sb_info = ctx.sb_info;
+ auto& sb_ref_counts = ctx.sb_ref_counts;
+ auto repairer = ctx.repairer;
+
+ store_statfs_t* res_statfs = (per_pool_stat_collection || repairer) ?
+ &ctx.expected_pool_statfs[pool_id] :
+ &ctx.expected_store_statfs;
+
+ map<uint32_t, uint64_t> zone_first_offsets; // for zoned/smr devices
+
+ dout(10) << __func__ << " " << oid << dendl;
+ OnodeRef o;
+ o.reset(Onode::create_decode(c, oid, key, value));
+ ++num_objects;
+
+ num_spanning_blobs += o->extent_map.spanning_blob_map.size();
+
+ o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
+ _dump_onode<30>(cct, *o);
+ // shards
+ if (!o->extent_map.shards.empty()) {
+ ++num_sharded_objects;
+ if (depth != FSCK_SHALLOW) {
+ ceph_assert(expecting_shards);
+ for (auto& s : o->extent_map.shards) {
+ dout(20) << __func__ << " shard " << *s.shard_info << dendl;
+ expecting_shards->push_back(string());
+ get_extent_shard_key(o->key, s.shard_info->offset,
+ &expecting_shards->back());
+ if (s.shard_info->offset >= o->onode.size) {
+ derr << "fsck error: " << oid << " shard 0x" << std::hex
+ << s.shard_info->offset << " past EOF at 0x" << o->onode.size
+ << std::dec << dendl;
+ ++errors;
+ }
+ }
+ }
+ }
+
+ // lextents
+ uint64_t pos = 0;
+ mempool::bluestore_fsck::map<BlobRef,
+ bluestore_blob_use_tracker_t> ref_map;
+ for (auto& l : o->extent_map.extent_map) {
+ dout(20) << __func__ << " " << l << dendl;
+ if (l.logical_offset < pos) {
+ derr << "fsck error: " << oid << " lextent at 0x"
+ << std::hex << l.logical_offset
+ << " overlaps with the previous, which ends at 0x" << pos
+ << std::dec << dendl;
+ ++errors;
+ }
+ if (depth != FSCK_SHALLOW &&
+ o->extent_map.spans_shard(l.logical_offset, l.length)) {
+ derr << "fsck error: " << oid << " lextent at 0x"
+ << std::hex << l.logical_offset << "~" << l.length
+ << " spans a shard boundary"
+ << std::dec << dendl;
+ ++errors;
+ }
+ pos = l.logical_offset + l.length;
+ res_statfs->data_stored += l.length;
+ ceph_assert(l.blob);
+ const bluestore_blob_t& blob = l.blob->get_blob();
+
+#ifdef HAVE_LIBZBD
+ if (bdev->is_smr() && depth != FSCK_SHALLOW) {
+ for (auto& e : blob.get_extents()) {
+ if (e.is_valid()) {
+ uint32_t zone = e.offset / zone_size;
+ uint64_t offset = e.offset % zone_size;
+ auto p = zone_first_offsets.find(zone);
+ if (p == zone_first_offsets.end() || p->second > offset) {
+ // FIXME: use interator for guided insert?
+ zone_first_offsets[zone] = offset;
+ }
+ }
+ }
+ }
+#endif
+
+ auto& ref = ref_map[l.blob];
+ if (ref.is_empty()) {
+ uint32_t min_release_size = blob.get_release_size(min_alloc_size);
+ uint32_t l = blob.get_logical_length();
+ ref.init(l, min_release_size);
+ }
+ ref.get(
+ l.blob_offset,
+ l.length);
+ ++num_extents;
+ if (depth != FSCK_SHALLOW &&
+ blob.has_unused()) {
+ ceph_assert(referenced);
+ auto p = referenced->find(l.blob);
+ bluestore_blob_t::unused_t* pu;
+ if (p == referenced->end()) {
+ pu = &(*referenced)[l.blob];
+ }
+ else {
+ pu = &p->second;
+ }
+ uint64_t blob_len = blob.get_logical_length();
+ ceph_assert((blob_len % (sizeof(*pu) * 8)) == 0);
+ ceph_assert(l.blob_offset + l.length <= blob_len);
+ uint64_t chunk_size = blob_len / (sizeof(*pu) * 8);
+ uint64_t start = l.blob_offset / chunk_size;
+ uint64_t end =
+ round_up_to(l.blob_offset + l.length, chunk_size) / chunk_size;
+ for (auto i = start; i < end; ++i) {
+ (*pu) |= (1u << i);
+ }
+ }
+ } //for (auto& l : o->extent_map.extent_map)
+
+ for (auto& i : ref_map) {
+ ++num_blobs;
+ const bluestore_blob_t& blob = i.first->get_blob();
+ bool equal =
+ depth == FSCK_SHALLOW ? true :
+ i.first->get_blob_use_tracker().equal(i.second);
+ if (!equal) {
+ derr << "fsck error: " << oid << " blob " << *i.first
+ << " doesn't match expected ref_map " << i.second << dendl;
+ ++errors;
+ }
+ if (blob.is_compressed()) {
+ res_statfs->data_compressed += blob.get_compressed_payload_length();
+ res_statfs->data_compressed_original +=
+ i.first->get_referenced_bytes();
+ }
+ if (depth != FSCK_SHALLOW && repairer) {
+ for (auto e : blob.get_extents()) {
+ if (!e.is_valid())
+ continue;
+ repairer->set_space_used(e.offset, e.length, c->cid, oid);
+ }
+ }
+ if (blob.is_shared()) {
+ if (i.first->shared_blob->get_sbid() > blobid_max) {
+ derr << "fsck error: " << oid << " blob " << blob
+ << " sbid " << i.first->shared_blob->get_sbid() << " > blobid_max "
+ << blobid_max << dendl;
+ ++errors;
+ } else if (i.first->shared_blob->get_sbid() == 0) {
+ derr << "fsck error: " << oid << " blob " << blob
+ << " marked as shared but has uninitialized sbid"
+ << dendl;
+ ++errors;
+ }
+ // the below lock is optional and provided in multithreading mode only
+ if (sb_info_lock) {
+ sb_info_lock->lock();
+ }
+ auto sbid = i.first->shared_blob->get_sbid();
+ sb_info_t& sbi = sb_info.add_or_adopt(i.first->shared_blob->get_sbid());
+ ceph_assert(sbi.pool_id == sb_info_t::INVALID_POOL_ID ||
+ sbi.pool_id == oid.hobj.get_logical_pool());
+ sbi.pool_id = oid.hobj.get_logical_pool();
+ bool compressed = blob.is_compressed();
+ for (auto e : blob.get_extents()) {
+ if (e.is_valid()) {
+ if (compressed) {
+ ceph_assert(sbi.allocated_chunks <= 0);
+ sbi.allocated_chunks -= (e.length >> min_alloc_size_order);
+ } else {
+ ceph_assert(sbi.allocated_chunks >= 0);
+ sbi.allocated_chunks += (e.length >> min_alloc_size_order);
+ }
+ sb_ref_counts.inc_range(sbid, e.offset, e.length, 1);
+ }
+ }
+ if (sb_info_lock) {
+ sb_info_lock->unlock();
+ }
+ } else if (depth != FSCK_SHALLOW) {
+ ceph_assert(used_blocks);
+ string ctx_descr = " oid " + stringify(oid);
+ errors += _fsck_check_extents(ctx_descr,
+ blob.get_extents(),
+ blob.is_compressed(),
+ *used_blocks,
+ fm->get_alloc_size(),
+ repairer,
+ *res_statfs,
+ depth);
+ } else {
+ errors += _fsck_sum_extents(
+ blob.get_extents(),
+ blob.is_compressed(),
+ *res_statfs);
+ }
+ } // for (auto& i : ref_map)
+
+ {
+ auto &sbm = o->extent_map.spanning_blob_map;
+ size_t broken = 0;
+ BlobRef first_broken;
+ for (auto it = sbm.begin(); it != sbm.end();) {
+ auto it1 = it++;
+ if (ref_map.count(it1->second) == 0) {
+ if (!broken) {
+ first_broken = it1->second;
+ ++errors;
+ derr << "fsck error:" << " stray spanning blob found:" << it1->first
+ << dendl;
+ }
+ broken++;
+ if (repairer) {
+ sbm.erase(it1);
+ }
+ }
+ }
+
+#ifdef HAVE_LIBZBD
+ if (bdev->is_smr() && depth != FSCK_SHALLOW) {
+ for (auto& [zone, first_offset] : zone_first_offsets) {
+ auto p = (*ctx.zone_refs)[zone].find(oid);
+ if (p != (*ctx.zone_refs)[zone].end()) {
+ if (first_offset < p->second) {
+ dout(20) << " slightly wonky zone ref 0x" << std::hex << zone
+ << " offset 0x" << p->second
+ << " but first offset is 0x" << first_offset
+ << "; this can happen due to clone_range"
+ << dendl;
+ } else {
+ dout(20) << " good zone ref 0x" << std::hex << zone << " offset 0x" << p->second
+ << " <= first offset 0x" << first_offset
+ << std::dec << dendl;
+ }
+ (*ctx.zone_refs)[zone].erase(p);
+ } else {
+ derr << "fsck error: " << oid << " references zone 0x" << std::hex << zone
+ << " but there is no zone ref" << std::dec << dendl;
+ // FIXME: add repair
+ ++errors;
+ }
+ }
+ }
+#endif
+
+ if (broken) {
+ derr << "fsck error: " << oid << " - " << broken
+ << " zombie spanning blob(s) found, the first one: "
+ << *first_broken << dendl;
+ if(repairer) {
+ repairer->fix_spanning_blobs(
+ db,
+ [&](KeyValueDB::Transaction txn) {
+ _record_onode(o, txn);
+ });
+ }
+ }
+ }
+
+ if (o->onode.has_omap()) {
+ _fsck_check_object_omap(depth, o, ctx);
+ }
+
+ return o;
+}
+
+#include "common/WorkQueue.h"
+
+class ShallowFSCKThreadPool : public ThreadPool
+{
+public:
+ ShallowFSCKThreadPool(CephContext* cct_, std::string nm, std::string tn, int n) :
+ ThreadPool(cct_, nm, tn, n) {
+ }
+ void worker(ThreadPool::WorkThread* wt) override {
+ int next_wq = 0;
+ while (!_stop) {
+ next_wq %= work_queues.size();
+ WorkQueue_ *wq = work_queues[next_wq++];
+
+ void* item = wq->_void_dequeue();
+ if (item) {
+ processing++;
+ TPHandle tp_handle(cct, nullptr, wq->timeout_interval, wq->suicide_interval);
+ wq->_void_process(item, tp_handle);
+ processing--;
+ }
+ }
+ }
+ template <size_t BatchLen>
+ struct FSCKWorkQueue : public ThreadPool::WorkQueue_
+ {
+ struct Entry {
+ int64_t pool_id;
+ BlueStore::CollectionRef c;
+ ghobject_t oid;
+ string key;
+ bufferlist value;
+ };
+ struct Batch {
+ std::atomic<size_t> running = { 0 };
+ size_t entry_count = 0;
+ std::array<Entry, BatchLen> entries;
+
+ int64_t errors = 0;
+ int64_t warnings = 0;
+ uint64_t num_objects = 0;
+ uint64_t num_extents = 0;
+ uint64_t num_blobs = 0;
+ uint64_t num_sharded_objects = 0;
+ uint64_t num_spanning_blobs = 0;
+ store_statfs_t expected_store_statfs;
+ BlueStore::per_pool_statfs expected_pool_statfs;
+ };
+
+ size_t batchCount;
+ BlueStore* store = nullptr;
+
+ ceph::mutex* sb_info_lock = nullptr;
+ sb_info_space_efficient_map_t* sb_info = nullptr;
+ shared_blob_2hash_tracker_t* sb_ref_counts = nullptr;
+ BlueStoreRepairer* repairer = nullptr;
+
+ Batch* batches = nullptr;
+ size_t last_batch_pos = 0;
+ bool batch_acquired = false;
+
+ FSCKWorkQueue(std::string n,
+ size_t _batchCount,
+ BlueStore* _store,
+ ceph::mutex* _sb_info_lock,
+ sb_info_space_efficient_map_t& _sb_info,
+ shared_blob_2hash_tracker_t& _sb_ref_counts,
+ BlueStoreRepairer* _repairer) :
+ WorkQueue_(n, ceph::timespan::zero(), ceph::timespan::zero()),
+ batchCount(_batchCount),
+ store(_store),
+ sb_info_lock(_sb_info_lock),
+ sb_info(&_sb_info),
+ sb_ref_counts(&_sb_ref_counts),
+ repairer(_repairer)
+ {
+ batches = new Batch[batchCount];
+ }
+ ~FSCKWorkQueue() {
+ delete[] batches;
+ }
+
+ /// Remove all work items from the queue.
+ void _clear() override {
+ //do nothing
+ }
+ /// Check whether there is anything to do.
+ bool _empty() override {
+ ceph_assert(false);
+ }
+
+ /// Get the next work item to process.
+ void* _void_dequeue() override {
+ size_t pos = rand() % batchCount;
+ size_t pos0 = pos;
+ do {
+ auto& batch = batches[pos];
+ if (batch.running.fetch_add(1) == 0) {
+ if (batch.entry_count) {
+ return &batch;
+ }
+ }
+ batch.running--;
+ pos++;
+ pos %= batchCount;
+ } while (pos != pos0);
+ return nullptr;
+ }
+ /** @brief Process the work item.
+ * This function will be called several times in parallel
+ * and must therefore be thread-safe. */
+ void _void_process(void* item, TPHandle& handle) override {
+ Batch* batch = (Batch*)item;
+
+ BlueStore::FSCK_ObjectCtx ctx(
+ batch->errors,
+ batch->warnings,
+ batch->num_objects,
+ batch->num_extents,
+ batch->num_blobs,
+ batch->num_sharded_objects,
+ batch->num_spanning_blobs,
+ nullptr, // used_blocks
+ nullptr, //used_omap_head
+ nullptr,
+ sb_info_lock,
+ *sb_info,
+ *sb_ref_counts,
+ batch->expected_store_statfs,
+ batch->expected_pool_statfs,
+ repairer);
+
+ for (size_t i = 0; i < batch->entry_count; i++) {
+ auto& entry = batch->entries[i];
+
+ store->fsck_check_objects_shallow(
+ BlueStore::FSCK_SHALLOW,
+ entry.pool_id,
+ entry.c,
+ entry.oid,
+ entry.key,
+ entry.value,
+ nullptr, // expecting_shards - this will need a protection if passed
+ nullptr, // referenced
+ ctx);
+ }
+ batch->entry_count = 0;
+ batch->running--;
+ }
+ /** @brief Synchronously finish processing a work item.
+ * This function is called after _void_process with the global thread pool lock held,
+ * so at most one copy will execute simultaneously for a given thread pool.
+ * It can be used for non-thread-safe finalization. */
+ void _void_process_finish(void*) override {
+ ceph_assert(false);
+ }
+
+ bool queue(
+ int64_t pool_id,
+ BlueStore::CollectionRef c,
+ const ghobject_t& oid,
+ const string& key,
+ const bufferlist& value) {
+ bool res = false;
+ size_t pos0 = last_batch_pos;
+ if (!batch_acquired) {
+ do {
+ auto& batch = batches[last_batch_pos];
+ if (batch.running.fetch_add(1) == 0) {
+ if (batch.entry_count < BatchLen) {
+ batch_acquired = true;
+ break;
+ }
+ }
+ batch.running.fetch_sub(1);
+ last_batch_pos++;
+ last_batch_pos %= batchCount;
+ } while (last_batch_pos != pos0);
+ }
+ if (batch_acquired) {
+ auto& batch = batches[last_batch_pos];
+ ceph_assert(batch.running);
+ ceph_assert(batch.entry_count < BatchLen);
+
+ auto& entry = batch.entries[batch.entry_count];
+ entry.pool_id = pool_id;
+ entry.c = c;
+ entry.oid = oid;
+ entry.key = key;
+ entry.value = value;
+
+ ++batch.entry_count;
+ if (batch.entry_count == BatchLen) {
+ batch_acquired = false;
+ batch.running.fetch_sub(1);
+ last_batch_pos++;
+ last_batch_pos %= batchCount;
+ }
+ res = true;
+ }
+ return res;
+ }
+
+ void finalize(ThreadPool& tp,
+ BlueStore::FSCK_ObjectCtx& ctx) {
+ if (batch_acquired) {
+ auto& batch = batches[last_batch_pos];
+ ceph_assert(batch.running);
+ batch.running.fetch_sub(1);
+ }
+ tp.stop();
+
+ for (size_t i = 0; i < batchCount; i++) {
+ auto& batch = batches[i];
+
+ //process leftovers if any
+ if (batch.entry_count) {
+ TPHandle tp_handle(store->cct,
+ nullptr,
+ timeout_interval,
+ suicide_interval);
+ ceph_assert(batch.running == 0);
+
+ batch.running++; // just to be on-par with the regular call
+ _void_process(&batch, tp_handle);
+ }
+ ceph_assert(batch.entry_count == 0);
+
+ ctx.errors += batch.errors;
+ ctx.warnings += batch.warnings;
+ ctx.num_objects += batch.num_objects;
+ ctx.num_extents += batch.num_extents;
+ ctx.num_blobs += batch.num_blobs;
+ ctx.num_sharded_objects += batch.num_sharded_objects;
+ ctx.num_spanning_blobs += batch.num_spanning_blobs;
+
+ ctx.expected_store_statfs.add(batch.expected_store_statfs);
+
+ for (auto it = batch.expected_pool_statfs.begin();
+ it != batch.expected_pool_statfs.end();
+ it++) {
+ ctx.expected_pool_statfs[it->first].add(it->second);
+ }
+ }
+ }
+ };
+};
+
+void BlueStore::_fsck_check_object_omap(FSCKDepth depth,
+ OnodeRef& o,
+ const BlueStore::FSCK_ObjectCtx& ctx)
+{
+ auto& errors = ctx.errors;
+ auto& warnings = ctx.warnings;
+ auto repairer = ctx.repairer;
+
+ ceph_assert(o->onode.has_omap());
+ if (!o->onode.is_perpool_omap() && !o->onode.is_pgmeta_omap()) {
+ if (per_pool_omap == OMAP_PER_POOL) {
+ fsck_derr(errors, MAX_FSCK_ERROR_LINES)
+ << "fsck error: " << o->oid
+ << " has omap that is not per-pool or pgmeta"
+ << fsck_dendl;
+ ++errors;
+ } else {
+ const char* w;
+ int64_t num;
+ if (cct->_conf->bluestore_fsck_error_on_no_per_pool_omap) {
+ ++errors;
+ num = errors;
+ w = "error";
+ } else {
+ ++warnings;
+ num = warnings;
+ w = "warning";
+ }
+ fsck_derr(num, MAX_FSCK_ERROR_LINES)
+ << "fsck " << w << ": " << o->oid
+ << " has omap that is not per-pool or pgmeta"
+ << fsck_dendl;
+ }
+ } else if (!o->onode.is_perpg_omap() && !o->onode.is_pgmeta_omap()) {
+ if (per_pool_omap == OMAP_PER_PG) {
+ fsck_derr(errors, MAX_FSCK_ERROR_LINES)
+ << "fsck error: " << o->oid
+ << " has omap that is not per-pg or pgmeta"
+ << fsck_dendl;
+ ++errors;
+ } else {
+ const char* w;
+ int64_t num;
+ if (cct->_conf->bluestore_fsck_error_on_no_per_pg_omap) {
+ ++errors;
+ num = errors;
+ w = "error";
+ } else {
+ ++warnings;
+ num = warnings;
+ w = "warning";
+ }
+ fsck_derr(num, MAX_FSCK_ERROR_LINES)
+ << "fsck " << w << ": " << o->oid
+ << " has omap that is not per-pg or pgmeta"
+ << fsck_dendl;
+ }
+ }
+ if (repairer &&
+ !o->onode.is_perpg_omap() &&
+ !o->onode.is_pgmeta_omap()) {
+ dout(10) << "fsck converting " << o->oid << " omap to per-pg" << dendl;
+ bufferlist header;
+ map<string, bufferlist> kv;
+ {
+ KeyValueDB::Transaction txn = db->get_transaction();
+ uint64_t txn_cost = 0;
+ const string& prefix = Onode::calc_omap_prefix(o->onode.flags);
+ uint8_t new_flags = o->onode.flags |
+ bluestore_onode_t::FLAG_PERPOOL_OMAP |
+ bluestore_onode_t::FLAG_PERPG_OMAP;
+ const string& new_omap_prefix = Onode::calc_omap_prefix(new_flags);
+
+ KeyValueDB::Iterator it = db->get_iterator(prefix);
+ string head, tail;
+ o->get_omap_header(&head);
+ o->get_omap_tail(&tail);
+ it->lower_bound(head);
+ // head
+ if (it->valid() && it->key() == head) {
+ dout(30) << __func__ << " got header" << dendl;
+ header = it->value();
+ if (header.length()) {
+ string new_head;
+ Onode::calc_omap_header(new_flags, o.get(), &new_head);
+ txn->set(new_omap_prefix, new_head, header);
+ txn_cost += new_head.length() + header.length();
+ }
+ it->next();
+ }
+ // tail
+ {
+ string new_tail;
+ Onode::calc_omap_tail(new_flags, o.get(), &new_tail);
+ bufferlist empty;
+ txn->set(new_omap_prefix, new_tail, empty);
+ txn_cost += new_tail.length() + new_tail.length();
+ }
+ // values
+ string final_key;
+ Onode::calc_omap_key(new_flags, o.get(), string(), &final_key);
+ size_t base_key_len = final_key.size();
+ while (it->valid() && it->key() < tail) {
+ string user_key;
+ o->decode_omap_key(it->key(), &user_key);
+ dout(20) << __func__ << " got " << pretty_binary_string(it->key())
+ << " -> " << user_key << dendl;
+
+ final_key.resize(base_key_len);
+ final_key += user_key;
+ auto v = it->value();
+ txn->set(new_omap_prefix, final_key, v);
+ txn_cost += final_key.length() + v.length();
+
+ // submit a portion if cost exceeds 16MB
+ if (txn_cost >= 16 * (1 << 20) ) {
+ db->submit_transaction_sync(txn);
+ txn = db->get_transaction();
+ txn_cost = 0;
+ }
+ it->next();
+ }
+ if (txn_cost > 0) {
+ db->submit_transaction_sync(txn);
+ }
+ }
+ // finalize: remove legacy data
+ {
+ KeyValueDB::Transaction txn = db->get_transaction();
+ // remove old keys
+ const string& old_omap_prefix = o->get_omap_prefix();
+ string old_head, old_tail;
+ o->get_omap_header(&old_head);
+ o->get_omap_tail(&old_tail);
+ txn->rm_range_keys(old_omap_prefix, old_head, old_tail);
+ txn->rmkey(old_omap_prefix, old_tail);
+ // set flag
+ o->onode.set_flag(bluestore_onode_t::FLAG_PERPOOL_OMAP | bluestore_onode_t::FLAG_PERPG_OMAP);
+ _record_onode(o, txn);
+ db->submit_transaction_sync(txn);
+ repairer->inc_repaired();
+ repairer->request_compaction();
+ }
+ }
+}
+
+void BlueStore::_fsck_check_objects(
+ FSCKDepth depth,
+ BlueStore::FSCK_ObjectCtx& ctx)
+{
+ auto& errors = ctx.errors;
+ auto sb_info_lock = ctx.sb_info_lock;
+ auto& sb_info = ctx.sb_info;
+ auto& sb_ref_counts = ctx.sb_ref_counts;
+ auto repairer = ctx.repairer;
+
+ uint64_t_btree_t used_nids;
+
+ size_t processed_myself = 0;
+
+ auto it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE);
+ mempool::bluestore_fsck::list<string> expecting_shards;
+ if (it) {
+ const size_t thread_count = cct->_conf->bluestore_fsck_quick_fix_threads;
+ typedef ShallowFSCKThreadPool::FSCKWorkQueue<256> WQ;
+ std::unique_ptr<WQ> wq(
+ new WQ(
+ "FSCKWorkQueue",
+ (thread_count ? : 1) * 32,
+ this,
+ sb_info_lock,
+ sb_info,
+ sb_ref_counts,
+ repairer));
+
+ ShallowFSCKThreadPool thread_pool(cct, "ShallowFSCKThreadPool", "ShallowFSCK", thread_count);
+
+ thread_pool.add_work_queue(wq.get());
+ if (depth == FSCK_SHALLOW && thread_count > 0) {
+ //not the best place but let's check anyway
+ ceph_assert(sb_info_lock);
+ thread_pool.start();
+ }
+
+ // fill global if not overriden below
+ CollectionRef c;
+ int64_t pool_id = -1;
+ spg_t pgid;
+ for (it->lower_bound(string()); it->valid(); it->next()) {
+ dout(30) << __func__ << " key "
+ << pretty_binary_string(it->key()) << dendl;
+ if (is_extent_shard_key(it->key())) {
+ if (depth == FSCK_SHALLOW) {
+ continue;
+ }
+ while (!expecting_shards.empty() &&
+ expecting_shards.front() < it->key()) {
+ derr << "fsck error: missing shard key "
+ << pretty_binary_string(expecting_shards.front())
+ << dendl;
+ ++errors;
+ expecting_shards.pop_front();
+ }
+ if (!expecting_shards.empty() &&
+ expecting_shards.front() == it->key()) {
+ // all good
+ expecting_shards.pop_front();
+ continue;
+ }
+
+ uint32_t offset;
+ string okey;
+ get_key_extent_shard(it->key(), &okey, &offset);
+ derr << "fsck error: stray shard 0x" << std::hex << offset
+ << std::dec << dendl;
+ if (expecting_shards.empty()) {
+ derr << "fsck error: " << pretty_binary_string(it->key())
+ << " is unexpected" << dendl;
+ ++errors;
+ continue;
+ }
+ while (expecting_shards.front() > it->key()) {
+ derr << "fsck error: saw " << pretty_binary_string(it->key())
+ << dendl;
+ derr << "fsck error: exp "
+ << pretty_binary_string(expecting_shards.front()) << dendl;
+ ++errors;
+ expecting_shards.pop_front();
+ if (expecting_shards.empty()) {
+ break;
+ }
+ }
+ continue;
+ }
+
+ ghobject_t oid;
+ int r = get_key_object(it->key(), &oid);
+ if (r < 0) {
+ derr << "fsck error: bad object key "
+ << pretty_binary_string(it->key()) << dendl;
+ ++errors;
+ continue;
+ }
+ if (!c ||
+ oid.shard_id != pgid.shard ||
+ oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
+ !c->contains(oid)) {
+ c = nullptr;
+ for (auto& p : coll_map) {
+ if (p.second->contains(oid)) {
+ c = p.second;
+ break;
+ }
+ }
+ if (!c) {
+ derr << "fsck error: stray object " << oid
+ << " not owned by any collection" << dendl;
+ ++errors;
+ continue;
+ }
+ pool_id = c->cid.is_pg(&pgid) ? pgid.pool() : META_POOL_ID;
+ dout(20) << __func__ << " collection " << c->cid << " " << c->cnode
+ << dendl;
+ }
+
+ if (depth != FSCK_SHALLOW &&
+ !expecting_shards.empty()) {
+ for (auto& k : expecting_shards) {
+ derr << "fsck error: missing shard key "
+ << pretty_binary_string(k) << dendl;
+ }
+ ++errors;
+ expecting_shards.clear();
+ }
+
+ bool queued = false;
+ if (depth == FSCK_SHALLOW && thread_count > 0) {
+ queued = wq->queue(
+ pool_id,
+ c,
+ oid,
+ it->key(),
+ it->value());
+ }
+ OnodeRef o;
+ map<BlobRef, bluestore_blob_t::unused_t> referenced;
+
+ if (!queued) {
+ ++processed_myself;
+ o = fsck_check_objects_shallow(
+ depth,
+ pool_id,
+ c,
+ oid,
+ it->key(),
+ it->value(),
+ &expecting_shards,
+ &referenced,
+ ctx);
+ }
+
+ if (depth != FSCK_SHALLOW) {
+ ceph_assert(o != nullptr);
+ if (o->onode.nid) {
+ if (o->onode.nid > nid_max) {
+ derr << "fsck error: " << oid << " nid " << o->onode.nid
+ << " > nid_max " << nid_max << dendl;
+ ++errors;
+ }
+ if (used_nids.count(o->onode.nid)) {
+ derr << "fsck error: " << oid << " nid " << o->onode.nid
+ << " already in use" << dendl;
+ ++errors;
+ continue; // go for next object
+ }
+ used_nids.insert(o->onode.nid);
+ }
+ for (auto& i : referenced) {
+ dout(20) << __func__ << " referenced 0x" << std::hex << i.second
+ << std::dec << " for " << *i.first << dendl;
+ const bluestore_blob_t& blob = i.first->get_blob();
+ if (i.second & blob.unused) {
+ derr << "fsck error: " << oid << " blob claims unused 0x"
+ << std::hex << blob.unused
+ << " but extents reference 0x" << i.second << std::dec
+ << " on blob " << *i.first << dendl;
+ ++errors;
+ }
+ if (blob.has_csum()) {
+ uint64_t blob_len = blob.get_logical_length();
+ uint64_t unused_chunk_size = blob_len / (sizeof(blob.unused) * 8);
+ unsigned csum_count = blob.get_csum_count();
+ unsigned csum_chunk_size = blob.get_csum_chunk_size();
+ for (unsigned p = 0; p < csum_count; ++p) {
+ unsigned pos = p * csum_chunk_size;
+ unsigned firstbit = pos / unused_chunk_size; // [firstbit,lastbit]
+ unsigned lastbit = (pos + csum_chunk_size - 1) / unused_chunk_size;
+ unsigned mask = 1u << firstbit;
+ for (unsigned b = firstbit + 1; b <= lastbit; ++b) {
+ mask |= 1u << b;
+ }
+ if ((blob.unused & mask) == mask) {
+ // this csum chunk region is marked unused
+ if (blob.get_csum_item(p) != 0) {
+ derr << "fsck error: " << oid
+ << " blob claims csum chunk 0x" << std::hex << pos
+ << "~" << csum_chunk_size
+ << " is unused (mask 0x" << mask << " of unused 0x"
+ << blob.unused << ") but csum is non-zero 0x"
+ << blob.get_csum_item(p) << std::dec << " on blob "
+ << *i.first << dendl;
+ ++errors;
+ }
+ }
+ }
+ }
+ }
+ // omap
+ if (o->onode.has_omap()) {
+ ceph_assert(ctx.used_omap_head);
+ if (ctx.used_omap_head->count(o->onode.nid)) {
+ derr << "fsck error: " << o->oid << " omap_head " << o->onode.nid
+ << " already in use" << dendl;
+ ++errors;
+ } else {
+ ctx.used_omap_head->insert(o->onode.nid);
+ }
+ } // if (o->onode.has_omap())
+ if (depth == FSCK_DEEP) {
+ bufferlist bl;
+ uint64_t max_read_block = cct->_conf->bluestore_fsck_read_bytes_cap;
+ uint64_t offset = 0;
+ do {
+ uint64_t l = std::min(uint64_t(o->onode.size - offset), max_read_block);
+ int r = _do_read(c.get(), o, offset, l, bl,
+ CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
+ if (r < 0) {
+ ++errors;
+ derr << "fsck error: " << oid << std::hex
+ << " error during read: "
+ << " " << offset << "~" << l
+ << " " << cpp_strerror(r) << std::dec
+ << dendl;
+ break;
+ }
+ offset += l;
+ } while (offset < o->onode.size);
+ } // deep
+ } //if (depth != FSCK_SHALLOW)
+ } // for (it->lower_bound(string()); it->valid(); it->next())
+ if (depth == FSCK_SHALLOW && thread_count > 0) {
+ wq->finalize(thread_pool, ctx);
+ if (processed_myself) {
+ // may be needs more threads?
+ dout(0) << __func__ << " partial offload"
+ << ", done myself " << processed_myself
+ << " of " << ctx.num_objects
+ << "objects, threads " << thread_count
+ << dendl;
+ }
+ }
+ } // if (it)
+}
+/**
+An overview for currently implemented repair logics
+performed in fsck in two stages: detection(+preparation) and commit.
+Detection stage (in processing order):
+ (Issue -> Repair action to schedule)
+ - Detect undecodable keys for Shared Blobs -> Remove
+ - Detect undecodable records for Shared Blobs -> Remove
+ (might trigger missed Shared Blob detection below)
+ - Detect stray records for Shared Blobs -> Remove
+ - Detect misreferenced pextents -> Fix
+ Prepare Bloom-like filter to track cid/oid -> pextent
+ Prepare list of extents that are improperly referenced
+ Enumerate Onode records that might use 'misreferenced' pextents
+ (Bloom-like filter applied to reduce computation)
+ Per each questinable Onode enumerate all blobs and identify broken ones
+ (i.e. blobs having 'misreferences')
+ Rewrite each broken blob data by allocating another extents and
+ copying data there
+ If blob is shared - unshare it and mark corresponding Shared Blob
+ for removal
+ Release previously allocated space
+ Update Extent Map
+ - Detect missed Shared Blobs -> Recreate
+ - Detect undecodable deferred transaction -> Remove
+ - Detect Freelist Manager's 'false free' entries -> Mark as used
+ - Detect Freelist Manager's leaked entries -> Mark as free
+ - Detect statfs inconsistency - Update
+ Commit stage (separate DB commit per each step):
+ - Apply leaked FM entries fix
+ - Apply 'false free' FM entries fix
+ - Apply 'Remove' actions
+ - Apply fix for misreference pextents
+ - Apply Shared Blob recreate
+ (can be merged with the step above if misreferences were dectected)
+ - Apply StatFS update
+*/
+int BlueStore::_fsck(BlueStore::FSCKDepth depth, bool repair)
+{
+ dout(5) << __func__
+ << (repair ? " repair" : " check")
+ << (depth == FSCK_DEEP ? " (deep)" :
+ depth == FSCK_SHALLOW ? " (shallow)" : " (regular)")
+ << dendl;
+
+ // in deep mode we need R/W write access to be able to replay deferred ops
+ const bool read_only = !(repair || depth == FSCK_DEEP);
+ int r = _open_db_and_around(read_only);
+ if (r < 0) {
+ return r;
+ }
+ auto close_db = make_scope_guard([&] {
+ _close_db_and_around();
+ });
+
+ if (!read_only) {
+ r = _upgrade_super();
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ // NullFreelistManager needs to open collection early
+ r = _open_collections();
+ if (r < 0) {
+ return r;
+ }
+
+ mempool_thread.init();
+ auto stop_mempool = make_scope_guard([&] {
+ mempool_thread.shutdown();
+ _shutdown_cache();
+ });
+ // we need finisher and kv_{sync,finalize}_thread *just* for replay
+ // enable in repair or deep mode modes only
+ if (!read_only) {
+ _kv_start();
+ r = _deferred_replay();
+ _kv_stop();
+ }
+
+ if (r < 0) {
+ return r;
+ }
+ return _fsck_on_open(depth, repair);
+}
+
+int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair)
+{
+ uint64_t sb_hash_size = uint64_t(
+ cct->_conf.get_val<Option::size_t>("osd_memory_target") *
+ cct->_conf.get_val<double>(
+ "bluestore_fsck_shared_blob_tracker_size"));
+
+ dout(1) << __func__
+ << " <<<START>>>"
+ << (repair ? " repair" : " check")
+ << (depth == FSCK_DEEP ? " (deep)" :
+ depth == FSCK_SHALLOW ? " (shallow)" : " (regular)")
+ << " start sb_tracker_hash_size:" << sb_hash_size
+ << dendl;
+ int64_t errors = 0;
+ int64_t warnings = 0;
+ unsigned repaired = 0;
+
+ uint64_t_btree_t used_omap_head;
+ uint64_t_btree_t used_sbids;
+
+ mempool_dynamic_bitset used_blocks, bluefs_used_blocks;
+ KeyValueDB::Iterator it;
+ store_statfs_t expected_store_statfs;
+ per_pool_statfs expected_pool_statfs;
+
+ sb_info_space_efficient_map_t sb_info;
+ shared_blob_2hash_tracker_t sb_ref_counts(
+ sb_hash_size,
+ min_alloc_size);
+ size_t sb_ref_mismatches = 0;
+
+ /// map of oid -> (first_)offset for each zone
+ std::vector<std::unordered_map<ghobject_t, uint64_t>> zone_refs; // FIXME: this may be a lot of RAM!
+
+ uint64_t num_objects = 0;
+ uint64_t num_extents = 0;
+ uint64_t num_blobs = 0;
+ uint64_t num_spanning_blobs = 0;
+ uint64_t num_shared_blobs = 0;
+ uint64_t num_sharded_objects = 0;
+ BlueStoreRepairer repairer;
+
+ auto alloc_size = fm->get_alloc_size();
+
+ utime_t start = ceph_clock_now();
+
+ _fsck_collections(&errors);
+ used_blocks.resize(fm->get_alloc_units());
+
+ if (bluefs) {
+ interval_set<uint64_t> bluefs_extents;
+
+ bluefs->foreach_block_extents(
+ bluefs_layout.shared_bdev,
+ [&](uint64_t start, uint32_t len) {
+ apply_for_bitset_range(start, len, alloc_size, used_blocks,
+ [&](uint64_t pos, mempool_dynamic_bitset& bs) {
+ ceph_assert(pos < bs.size());
+ bs.set(pos);
+ }
+ );
+ }
+ );
+ }
+
+ bluefs_used_blocks = used_blocks;
+
+ apply_for_bitset_range(
+ 0, std::max<uint64_t>(min_alloc_size, SUPER_RESERVED), alloc_size, used_blocks,
+ [&](uint64_t pos, mempool_dynamic_bitset &bs) {
+ bs.set(pos);
+ }
+ );
+
+
+ if (repair) {
+ repairer.init_space_usage_tracker(
+ bdev->get_size(),
+ min_alloc_size);
+ }
+
+ if (bluefs) {
+ int r = bluefs->fsck();
+ if (r < 0) {
+ return r;
+ }
+ if (r > 0)
+ errors += r;
+ }
+
+ if (!per_pool_stat_collection) {
+ const char *w;
+ if (cct->_conf->bluestore_fsck_error_on_no_per_pool_stats) {
+ w = "error";
+ ++errors;
+ } else {
+ w = "warning";
+ ++warnings;
+ }
+ derr << "fsck " << w << ": store not yet converted to per-pool stats"
+ << dendl;
+ }
+ if (per_pool_omap != OMAP_PER_PG) {
+ const char *w;
+ if (cct->_conf->bluestore_fsck_error_on_no_per_pool_omap) {
+ w = "error";
+ ++errors;
+ } else {
+ w = "warning";
+ ++warnings;
+ }
+ derr << "fsck " << w << ": store not yet converted to per-pg omap"
+ << dendl;
+ }
+
+ if (g_conf()->bluestore_debug_fsck_abort) {
+ dout(1) << __func__ << " debug abort" << dendl;
+ goto out_scan;
+ }
+
+#ifdef HAVE_LIBZBD
+ if (bdev->is_smr()) {
+ auto a = dynamic_cast<ZonedAllocator*>(alloc);
+ ceph_assert(a);
+ auto f = dynamic_cast<ZonedFreelistManager*>(fm);
+ ceph_assert(f);
+ vector<uint64_t> wp = bdev->get_zones();
+ vector<zone_state_t> zones = f->get_zone_states(db);
+ ceph_assert(wp.size() == zones.size());
+ auto num_zones = bdev->get_size() / zone_size;
+ for (unsigned i = first_sequential_zone; i < num_zones; ++i) {
+ uint64_t p = wp[i] == (i + 1) * zone_size ? zone_size : wp[i] % zone_size;
+ if (zones[i].write_pointer > p &&
+ zones[i].num_dead_bytes < zones[i].write_pointer) {
+ derr << "fsck error: zone 0x" << std::hex << i
+ << " bluestore write pointer 0x" << zones[i].write_pointer
+ << " > device write pointer 0x" << p
+ << " (with only 0x" << zones[i].num_dead_bytes << " dead bytes)"
+ << std::dec << dendl;
+ ++errors;
+ }
+ }
+
+ if (depth != FSCK_SHALLOW) {
+ // load zone refs
+ zone_refs.resize(bdev->get_size() / zone_size);
+ it = db->get_iterator(PREFIX_ZONED_CL_INFO, KeyValueDB::ITERATOR_NOCACHE);
+ if (it) {
+ for (it->lower_bound(string());
+ it->valid();
+ it->next()) {
+ uint32_t zone = 0;
+ uint64_t offset = 0;
+ ghobject_t oid;
+ string key = it->key();
+ int r = get_key_zone_offset_object(key, &zone, &offset, &oid);
+ if (r < 0) {
+ derr << "fsck error: invalid zone ref key " << pretty_binary_string(key)
+ << dendl;
+ if (repair) {
+ repairer.remove_key(db, PREFIX_ZONED_CL_INFO, key);
+ }
+ ++errors;
+ continue;
+ }
+ dout(30) << " zone ref 0x" << std::hex << zone << " offset 0x" << offset
+ << " -> " << std::dec << oid << dendl;
+ if (zone_refs[zone].count(oid)) {
+ derr << "fsck error: second zone ref in zone 0x" << std::hex << zone
+ << " offset 0x" << offset << std::dec << " for " << oid << dendl;
+ if (repair) {
+ repairer.remove_key(db, PREFIX_ZONED_CL_INFO, key);
+ }
+ ++errors;
+ continue;
+ }
+ zone_refs[zone][oid] = offset;
+ }
+ }
+ }
+ }
+#endif
+
+ dout(1) << __func__ << " checking shared_blobs (phase 1)" << dendl;
+ it = db->get_iterator(PREFIX_SHARED_BLOB, KeyValueDB::ITERATOR_NOCACHE);
+ if (it) {
+ for (it->lower_bound(string()); it->valid(); it->next()) {
+ string key = it->key();
+ uint64_t sbid;
+ if (get_key_shared_blob(key, &sbid) < 0) {
+ // Failed to parse the key.
+ // This gonna to be handled at the second stage
+ continue;
+ }
+ bluestore_shared_blob_t shared_blob(sbid);
+ bufferlist bl = it->value();
+ auto blp = bl.cbegin();
+ try {
+ decode(shared_blob, blp);
+ }
+ catch (ceph::buffer::error& e) {
+ // this gonna to be handled at the second stage
+ continue;
+ }
+ dout(20) << __func__ << " " << shared_blob << dendl;
+ auto& sbi = sb_info.add_maybe_stray(sbid);
+
+ // primarily to silent the 'unused' warning
+ ceph_assert(sbi.pool_id == sb_info_t::INVALID_POOL_ID);
+
+ for (auto& r : shared_blob.ref_map.ref_map) {
+ sb_ref_counts.inc_range(
+ sbid,
+ r.first,
+ r.second.length,
+ -r.second.refs);
+ }
+ }
+ } // if (it) //checking shared_blobs (phase1)
+
+ // walk PREFIX_OBJ
+ {
+ dout(1) << __func__ << " walking object keyspace" << dendl;
+ ceph::mutex sb_info_lock = ceph::make_mutex("BlueStore::fsck::sbinfo_lock");
+ BlueStore::FSCK_ObjectCtx ctx(
+ errors,
+ warnings,
+ num_objects,
+ num_extents,
+ num_blobs,
+ num_sharded_objects,
+ num_spanning_blobs,
+ &used_blocks,
+ &used_omap_head,
+ &zone_refs,
+ //no need for the below lock when in non-shallow mode as
+ // there is no multithreading in this case
+ depth == FSCK_SHALLOW ? &sb_info_lock : nullptr,
+ sb_info,
+ sb_ref_counts,
+ expected_store_statfs,
+ expected_pool_statfs,
+ repair ? &repairer : nullptr);
+
+ _fsck_check_objects(depth, ctx);
+ }
+
+#ifdef HAVE_LIBZBD
+ if (bdev->is_smr() && depth != FSCK_SHALLOW) {
+ dout(1) << __func__ << " checking for leaked zone refs" << dendl;
+ for (uint32_t zone = 0; zone < zone_refs.size(); ++zone) {
+ for (auto& [oid, offset] : zone_refs[zone]) {
+ derr << "fsck error: stray zone ref 0x" << std::hex << zone
+ << " offset 0x" << offset << " -> " << std::dec << oid << dendl;
+ // FIXME: add repair
+ ++errors;
+ }
+ }
+ }
+#endif
+
+ sb_ref_mismatches = sb_ref_counts.count_non_zero();
+ if (sb_ref_mismatches != 0) {
+ derr << "fsck error:" << "*" << sb_ref_mismatches
+ << " shared blob references aren't matching, at least "
+ << sb_ref_mismatches << " found" << dendl;
+ errors += sb_ref_mismatches;
+ }
+
+ if (depth != FSCK_SHALLOW && repair) {
+ _fsck_repair_shared_blobs(repairer, sb_ref_counts, sb_info);
+ }
+ dout(1) << __func__ << " checking shared_blobs (phase 2)" << dendl;
+ it = db->get_iterator(PREFIX_SHARED_BLOB, KeyValueDB::ITERATOR_NOCACHE);
+ if (it) {
+ // FIXME minor: perhaps simplify for shallow mode?
+ // fill global if not overriden below
+ auto expected_statfs = &expected_store_statfs;
+ for (it->lower_bound(string()); it->valid(); it->next()) {
+ string key = it->key();
+ uint64_t sbid;
+ if (get_key_shared_blob(key, &sbid)) {
+ derr << "fsck error: bad key '" << key
+ << "' in shared blob namespace" << dendl;
+ if (repair) {
+ repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
+ }
+ ++errors;
+ continue;
+ }
+ auto p = sb_info.find(sbid);
+ if (p == sb_info.end()) {
+ if (sb_ref_mismatches > 0) {
+ // highly likely this has been already reported before, ignoring...
+ dout(5) << __func__ << " found duplicate(?) stray shared blob data for sbid 0x"
+ << std::hex << sbid << std::dec << dendl;
+ } else {
+ derr<< "fsck error: found stray shared blob data for sbid 0x"
+ << std::hex << sbid << std::dec << dendl;
+ ++errors;
+ if (repair) {
+ repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
+ }
+ }
+ } else {
+ ++num_shared_blobs;
+ sb_info_t& sbi = *p;
+ bluestore_shared_blob_t shared_blob(sbid);
+ bufferlist bl = it->value();
+ auto blp = bl.cbegin();
+ try {
+ decode(shared_blob, blp);
+ }
+ catch (ceph::buffer::error& e) {
+ ++errors;
+
+ derr << "fsck error: failed to decode Shared Blob"
+ << pretty_binary_string(key) << dendl;
+ if (repair) {
+ dout(20) << __func__ << " undecodable Shared Blob, key:'"
+ << pretty_binary_string(key)
+ << "', removing" << dendl;
+ repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
+ }
+ continue;
+ }
+ dout(20) << __func__ << " " << shared_blob << dendl;
+ PExtentVector extents;
+ for (auto& r : shared_blob.ref_map.ref_map) {
+ extents.emplace_back(bluestore_pextent_t(r.first, r.second.length));
+ }
+ if (sbi.pool_id != sb_info_t::INVALID_POOL_ID &&
+ (per_pool_stat_collection || repair)) {
+ expected_statfs = &expected_pool_statfs[sbi.pool_id];
+ }
+ std::stringstream ss;
+ ss << "sbid 0x" << std::hex << sbid << std::dec;
+ errors += _fsck_check_extents(ss.str(),
+ extents,
+ sbi.allocated_chunks < 0,
+ used_blocks,
+ fm->get_alloc_size(),
+ repair ? &repairer : nullptr,
+ *expected_statfs,
+ depth);
+ }
+ }
+ } // if (it) /* checking shared_blobs (phase 2)*/
+
+ if (repair && repairer.preprocess_misreference(db)) {
+
+ dout(1) << __func__ << " sorting out misreferenced extents" << dendl;
+ auto& misref_extents = repairer.get_misreferences();
+ interval_set<uint64_t> to_release;
+ it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE);
+ if (it) {
+ // fill global if not overriden below
+ auto expected_statfs = &expected_store_statfs;
+
+ CollectionRef c;
+ spg_t pgid;
+ KeyValueDB::Transaction txn = repairer.get_fix_misreferences_txn();
+ bool bypass_rest = false;
+ for (it->lower_bound(string()); it->valid() && !bypass_rest;
+ it->next()) {
+ dout(30) << __func__ << " key "
+ << pretty_binary_string(it->key()) << dendl;
+ if (is_extent_shard_key(it->key())) {
+ continue;
+ }
+
+ ghobject_t oid;
+ int r = get_key_object(it->key(), &oid);
+ if (r < 0 || !repairer.is_used(oid)) {
+ continue;
+ }
+
+ if (!c ||
+ oid.shard_id != pgid.shard ||
+ oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
+ !c->contains(oid)) {
+ c = nullptr;
+ for (auto& p : coll_map) {
+ if (p.second->contains(oid)) {
+ c = p.second;
+ break;
+ }
+ }
+ if (!c) {
+ continue;
+ }
+ if (per_pool_stat_collection || repair) {
+ auto pool_id = c->cid.is_pg(&pgid) ? pgid.pool() : META_POOL_ID;
+ expected_statfs = &expected_pool_statfs[pool_id];
+ }
+ }
+ if (!repairer.is_used(c->cid)) {
+ continue;
+ }
+
+ dout(20) << __func__ << " check misreference for col:" << c->cid
+ << " obj:" << oid << dendl;
+
+ OnodeRef o;
+ o.reset(Onode::create_decode(c, oid, it->key(), it->value()));
+ o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
+ mempool::bluestore_fsck::set<BlobRef> blobs;
+
+ for (auto& e : o->extent_map.extent_map) {
+ blobs.insert(e.blob);
+ }
+ bool need_onode_update = false;
+ bool first_dump = true;
+ for(auto b : blobs) {
+ bool broken_blob = false;
+ auto& pextents = b->dirty_blob().dirty_extents();
+ for (auto& e : pextents) {
+ if (!e.is_valid()) {
+ continue;
+ }
+ // for the sake of simplicity and proper shared blob handling
+ // always rewrite the whole blob even when it's partially
+ // misreferenced.
+ if (misref_extents.intersects(e.offset, e.length)) {
+ if (first_dump) {
+ first_dump = false;
+ _dump_onode<10>(cct, *o);
+ }
+ broken_blob = true;
+ break;
+ }
+ }
+ if (!broken_blob)
+ continue;
+ bool compressed = b->get_blob().is_compressed();
+ need_onode_update = true;
+ dout(10) << __func__
+ << " fix misreferences in oid:" << oid
+ << " " << *b << dendl;
+ uint64_t b_off = 0;
+ PExtentVector pext_to_release;
+ pext_to_release.reserve(pextents.size());
+ // rewriting all valid pextents
+ for (auto e = pextents.begin(); e != pextents.end();
+ e++) {
+ auto b_off_cur = b_off;
+ b_off += e->length;
+ if (!e->is_valid()) {
+ continue;
+ }
+ PExtentVector exts;
+ dout(5) << __func__ << "::NCB::(F)alloc=" << alloc << ", length=" << e->length << dendl;
+ int64_t alloc_len =
+ alloc->allocate(e->length, min_alloc_size,
+ 0, 0, &exts);
+ if (alloc_len < 0 || alloc_len < (int64_t)e->length) {
+ derr << __func__
+ << " failed to allocate 0x" << std::hex << e->length
+ << " allocated 0x " << (alloc_len < 0 ? 0 : alloc_len)
+ << " min_alloc_size 0x" << min_alloc_size
+ << " available 0x " << alloc->get_free()
+ << std::dec << dendl;
+ if (alloc_len > 0) {
+ alloc->release(exts);
+ }
+ bypass_rest = true;
+ break;
+ }
+ expected_statfs->allocated += e->length;
+ if (compressed) {
+ expected_statfs->data_compressed_allocated += e->length;
+ }
+
+ bufferlist bl;
+ IOContext ioc(cct, NULL, !cct->_conf->bluestore_fail_eio);
+ r = bdev->read(e->offset, e->length, &bl, &ioc, false);
+ if (r < 0) {
+ derr << __func__ << " failed to read from 0x" << std::hex << e->offset
+ <<"~" << e->length << std::dec << dendl;
+ ceph_abort_msg("read failed, wtf");
+ }
+ pext_to_release.push_back(*e);
+ e = pextents.erase(e);
+ e = pextents.insert(e, exts.begin(), exts.end());
+ b->get_blob().map_bl(
+ b_off_cur, bl,
+ [&](uint64_t offset, bufferlist& t) {
+ int r = bdev->write(offset, t, false);
+ ceph_assert(r == 0);
+ });
+ e += exts.size() - 1;
+ for (auto& p : exts) {
+ fm->allocate(p.offset, p.length, txn);
+ }
+ } // for (auto e = pextents.begin(); e != pextents.end(); e++) {
+
+ if (b->get_blob().is_shared()) {
+ b->dirty_blob().clear_flag(bluestore_blob_t::FLAG_SHARED);
+
+ auto sbid = b->shared_blob->get_sbid();
+ auto sb_it = sb_info.find(sbid);
+ ceph_assert(sb_it != sb_info.end());
+ sb_info_t& sbi = *sb_it;
+
+ if (sbi.allocated_chunks < 0) {
+ // NB: it's crucial to use compressed_allocated_chunks from sb_info_t
+ // as we originally used that value while accumulating
+ // expected_statfs
+ expected_statfs->allocated -= uint64_t(-sbi.allocated_chunks) << min_alloc_size_order;
+ expected_statfs->data_compressed_allocated -=
+ uint64_t(-sbi.allocated_chunks) << min_alloc_size_order;
+ } else {
+ expected_statfs->allocated -= uint64_t(sbi.allocated_chunks) << min_alloc_size_order;
+ }
+ sbi.allocated_chunks = 0;
+ repairer.fix_shared_blob(txn, sbid, nullptr, 0);
+
+ // relying on blob's pextents to decide what to release.
+ for (auto& p : pext_to_release) {
+ to_release.union_insert(p.offset, p.length);
+ }
+ } else {
+ for (auto& p : pext_to_release) {
+ expected_statfs->allocated -= p.length;
+ if (compressed) {
+ expected_statfs->data_compressed_allocated -= p.length;
+ }
+ to_release.union_insert(p.offset, p.length);
+ }
+ }
+ if (bypass_rest) {
+ break;
+ }
+ } // for(auto b : blobs)
+ if (need_onode_update) {
+ o->extent_map.dirty_range(0, OBJECT_MAX_SIZE);
+ _record_onode(o, txn);
+ }
+ } // for (it->lower_bound(string()); it->valid(); it->next())
+
+ for (auto it = to_release.begin(); it != to_release.end(); ++it) {
+ dout(10) << __func__ << " release 0x" << std::hex << it.get_start()
+ << "~" << it.get_len() << std::dec << dendl;
+ fm->release(it.get_start(), it.get_len(), txn);
+ }
+ alloc->release(to_release);
+ to_release.clear();
+ } // if (it) {
+ } //if (repair && repairer.preprocess_misreference()) {
+ sb_info.clear();
+ sb_ref_counts.reset();
+
+ dout(1) << __func__ << " checking pool_statfs" << dendl;
+ _fsck_check_statfs(expected_store_statfs, expected_pool_statfs,
+ errors, warnings, repair ? &repairer : nullptr);
+ if (depth != FSCK_SHALLOW) {
+ dout(1) << __func__ << " checking for stray omap data " << dendl;
+ it = db->get_iterator(PREFIX_OMAP, KeyValueDB::ITERATOR_NOCACHE);
+ if (it) {
+ uint64_t last_omap_head = 0;
+ for (it->lower_bound(string()); it->valid(); it->next()) {
+ uint64_t omap_head;
+
+ _key_decode_u64(it->key().c_str(), &omap_head);
+
+ if (used_omap_head.count(omap_head) == 0 &&
+ omap_head != last_omap_head) {
+ pair<string,string> rk = it->raw_key();
+ fsck_derr(errors, MAX_FSCK_ERROR_LINES)
+ << "fsck error: found stray omap data on omap_head "
+ << omap_head << " " << last_omap_head
+ << " prefix/key: " << url_escape(rk.first)
+ << " " << url_escape(rk.second)
+ << fsck_dendl;
+ ++errors;
+ last_omap_head = omap_head;
+ }
+ }
+ }
+ it = db->get_iterator(PREFIX_PGMETA_OMAP, KeyValueDB::ITERATOR_NOCACHE);
+ if (it) {
+ uint64_t last_omap_head = 0;
+ for (it->lower_bound(string()); it->valid(); it->next()) {
+ uint64_t omap_head;
+ _key_decode_u64(it->key().c_str(), &omap_head);
+ if (used_omap_head.count(omap_head) == 0 &&
+ omap_head != last_omap_head) {
+ pair<string,string> rk = it->raw_key();
+ fsck_derr(errors, MAX_FSCK_ERROR_LINES)
+ << "fsck error: found stray (pgmeta) omap data on omap_head "
+ << omap_head << " " << last_omap_head
+ << " prefix/key: " << url_escape(rk.first)
+ << " " << url_escape(rk.second)
+ << fsck_dendl;
+ last_omap_head = omap_head;
+ ++errors;
+ }
+ }
+ }
+ it = db->get_iterator(PREFIX_PERPOOL_OMAP, KeyValueDB::ITERATOR_NOCACHE);
+ if (it) {
+ uint64_t last_omap_head = 0;
+ for (it->lower_bound(string()); it->valid(); it->next()) {
+ uint64_t pool;
+ uint64_t omap_head;
+ string k = it->key();
+ const char *c = k.c_str();
+ c = _key_decode_u64(c, &pool);
+ c = _key_decode_u64(c, &omap_head);
+ if (used_omap_head.count(omap_head) == 0 &&
+ omap_head != last_omap_head) {
+ pair<string,string> rk = it->raw_key();
+ fsck_derr(errors, MAX_FSCK_ERROR_LINES)
+ << "fsck error: found stray (per-pool) omap data on omap_head "
+ << omap_head << " " << last_omap_head
+ << " prefix/key: " << url_escape(rk.first)
+ << " " << url_escape(rk.second)
+ << fsck_dendl;
+ ++errors;
+ last_omap_head = omap_head;
+ }
+ }
+ }
+ it = db->get_iterator(PREFIX_PERPG_OMAP, KeyValueDB::ITERATOR_NOCACHE);
+ if (it) {
+ uint64_t last_omap_head = 0;
+ for (it->lower_bound(string()); it->valid(); it->next()) {
+ uint64_t pool;
+ uint32_t hash;
+ uint64_t omap_head;
+ string k = it->key();
+ const char* c = k.c_str();
+ c = _key_decode_u64(c, &pool);
+ c = _key_decode_u32(c, &hash);
+ c = _key_decode_u64(c, &omap_head);
+ if (used_omap_head.count(omap_head) == 0 &&
+ omap_head != last_omap_head) {
+ fsck_derr(errors, MAX_FSCK_ERROR_LINES)
+ << "fsck error: found stray (per-pg) omap data on omap_head "
+ << " key " << pretty_binary_string(it->key())
+ << omap_head << " " << last_omap_head << " " << used_omap_head.count(omap_head) << fsck_dendl;
+ ++errors;
+ last_omap_head = omap_head;
+ }
+ }
+ }
+ dout(1) << __func__ << " checking deferred events" << dendl;
+ it = db->get_iterator(PREFIX_DEFERRED, KeyValueDB::ITERATOR_NOCACHE);
+ if (it) {
+ for (it->lower_bound(string()); it->valid(); it->next()) {
+ bufferlist bl = it->value();
+ auto p = bl.cbegin();
+ bluestore_deferred_transaction_t wt;
+ try {
+ decode(wt, p);
+ } catch (ceph::buffer::error& e) {
+ derr << "fsck error: failed to decode deferred txn "
+ << pretty_binary_string(it->key()) << dendl;
+ if (repair) {
+ dout(20) << __func__ << " undecodable deferred TXN record, key: '"
+ << pretty_binary_string(it->key())
+ << "', removing" << dendl;
+ repairer.remove_key(db, PREFIX_DEFERRED, it->key());
+ }
+ continue;
+ }
+ dout(20) << __func__ << " deferred " << wt.seq
+ << " ops " << wt.ops.size()
+ << " released 0x" << std::hex << wt.released << std::dec << dendl;
+ for (auto e = wt.released.begin(); e != wt.released.end(); ++e) {
+ apply_for_bitset_range(
+ e.get_start(), e.get_len(), alloc_size, used_blocks,
+ [&](uint64_t pos, mempool_dynamic_bitset &bs) {
+ bs.set(pos);
+ }
+ );
+ }
+ }
+ }
+
+ // skip freelist vs allocated compare when we have Null fm
+ if (!fm->is_null_manager()) {
+ dout(1) << __func__ << " checking freelist vs allocated" << dendl;
+#ifdef HAVE_LIBZBD
+ if (freelist_type == "zoned") {
+ // verify per-zone state
+ // - verify no allocations beyond write pointer
+ // - verify num_dead_bytes count (neither allocated nor
+ // free space past the write pointer)
+ auto a = dynamic_cast<ZonedAllocator*>(alloc);
+ auto num_zones = bdev->get_size() / zone_size;
+
+ // mark the free space past the write pointer
+ for (uint32_t zone = first_sequential_zone; zone < num_zones; ++zone) {
+ auto wp = a->get_write_pointer(zone);
+ uint64_t offset = zone_size * zone + wp;
+ uint64_t length = zone_size - wp;
+ if (!length) {
+ continue;
+ }
+ bool intersects = false;
+ dout(10) << " marking zone 0x" << std::hex << zone
+ << " region after wp 0x" << offset << "~" << length
+ << std::dec << dendl;
+ apply_for_bitset_range(
+ offset, length, alloc_size, used_blocks,
+ [&](uint64_t pos, mempool_dynamic_bitset &bs) {
+ if (bs.test(pos)) {
+ derr << "fsck error: zone 0x" << std::hex << zone
+ << " has used space at 0x" << pos * alloc_size
+ << " beyond write pointer 0x" << wp
+ << std::dec << dendl;
+ intersects = true;
+ } else {
+ bs.set(pos);
+ }
+ }
+ );
+ if (intersects) {
+ ++errors;
+ }
+ }
+
+ used_blocks.flip();
+
+ // skip conventional zones
+ uint64_t pos = (first_sequential_zone * zone_size) / min_alloc_size - 1;
+ pos = used_blocks.find_next(pos);
+
+ uint64_t zone_dead = 0;
+ for (uint32_t zone = first_sequential_zone;
+ zone < num_zones;
+ ++zone, zone_dead = 0) {
+ while (pos != decltype(used_blocks)::npos &&
+ (pos * min_alloc_size) / zone_size == zone) {
+ dout(40) << " zone 0x" << std::hex << zone
+ << " dead 0x" << (pos * min_alloc_size) << "~" << min_alloc_size
+ << std::dec << dendl;
+ zone_dead += min_alloc_size;
+ pos = used_blocks.find_next(pos);
+ }
+ dout(20) << " zone 0x" << std::hex << zone << " dead is 0x" << zone_dead
+ << std::dec << dendl;
+ // cross-check dead bytes against zone state
+ if (a->get_dead_bytes(zone) != zone_dead) {
+ derr << "fsck error: zone 0x" << std::hex << zone << " has 0x" << zone_dead
+ << " dead bytes but freelist says 0x" << a->get_dead_bytes(zone)
+ << dendl;
+ ++errors;
+ // TODO: repair
+ }
+ }
+ used_blocks.flip();
+ } else
+#endif
+ {
+ fm->enumerate_reset();
+ uint64_t offset, length;
+ while (fm->enumerate_next(db, &offset, &length)) {
+ bool intersects = false;
+ apply_for_bitset_range(
+ offset, length, alloc_size, used_blocks,
+ [&](uint64_t pos, mempool_dynamic_bitset &bs) {
+ ceph_assert(pos < bs.size());
+ if (bs.test(pos) && !bluefs_used_blocks.test(pos)) {
+ if (offset == SUPER_RESERVED &&
+ length == min_alloc_size - SUPER_RESERVED) {
+ // this is due to the change just after luminous to min_alloc_size
+ // granularity allocations, and our baked in assumption at the top
+ // of _fsck that 0~round_up_to(SUPER_RESERVED,min_alloc_size) is used
+ // (vs luminous's round_up_to(SUPER_RESERVED,block_size)). harmless,
+ // since we will never allocate this region below min_alloc_size.
+ dout(10) << __func__ << " ignoring free extent between SUPER_RESERVED"
+ << " and min_alloc_size, 0x" << std::hex << offset << "~"
+ << length << std::dec << dendl;
+ } else {
+ intersects = true;
+ if (repair) {
+ repairer.fix_false_free(db, fm,
+ pos * min_alloc_size,
+ min_alloc_size);
+ }
+ }
+ } else {
+ bs.set(pos);
+ }
+ }
+ );
+ if (intersects) {
+ derr << "fsck error: free extent 0x" << std::hex << offset
+ << "~" << length << std::dec
+ << " intersects allocated blocks" << dendl;
+ ++errors;
+ }
+ }
+ fm->enumerate_reset();
+
+ // check for leaked extents
+ size_t count = used_blocks.count();
+ if (used_blocks.size() != count) {
+ ceph_assert(used_blocks.size() > count);
+ used_blocks.flip();
+ size_t start = used_blocks.find_first();
+ while (start != decltype(used_blocks)::npos) {
+ size_t cur = start;
+ while (true) {
+ size_t next = used_blocks.find_next(cur);
+ if (next != cur + 1) {
+ ++errors;
+ derr << "fsck error: leaked extent 0x" << std::hex
+ << ((uint64_t)start * fm->get_alloc_size()) << "~"
+ << ((cur + 1 - start) * fm->get_alloc_size()) << std::dec
+ << dendl;
+ if (repair) {
+ repairer.fix_leaked(db,
+ fm,
+ start * min_alloc_size,
+ (cur + 1 - start) * min_alloc_size);
+ }
+ start = next;
+ break;
+ }
+ cur = next;
+ }
+ }
+ used_blocks.flip();
+ }
+ }
+ }
+ }
+ if (repair) {
+ if (per_pool_omap != OMAP_PER_PG) {
+ dout(5) << __func__ << " fixing per_pg_omap" << dendl;
+ repairer.fix_per_pool_omap(db, OMAP_PER_PG);
+ }
+
+ dout(5) << __func__ << " applying repair results" << dendl;
+ repaired = repairer.apply(db);
+ dout(5) << __func__ << " repair applied" << dendl;
+ }
+
+out_scan:
+ dout(2) << __func__ << " " << num_objects << " objects, "
+ << num_sharded_objects << " of them sharded. "
+ << dendl;
+ dout(2) << __func__ << " " << num_extents << " extents to "
+ << num_blobs << " blobs, "
+ << num_spanning_blobs << " spanning, "
+ << num_shared_blobs << " shared."
+ << dendl;
+
+ utime_t duration = ceph_clock_now() - start;
+ dout(1) << __func__ << " <<<FINISH>>> with " << errors << " errors, "
+ << warnings << " warnings, "
+ << repaired << " repaired, "
+ << (errors + warnings - (int)repaired) << " remaining in "
+ << duration << " seconds" << dendl;
+
+ // In non-repair mode we should return error count only as
+ // it indicates if store status is OK.
+ // In repair mode both errors and warnings are taken into account
+ // since repaired counter relates to them both.
+ return repair ? errors + warnings - (int)repaired : errors;
+}
+
+/// methods to inject various errors fsck can repair
+void BlueStore::inject_broken_shared_blob_key(const string& key,
+ const bufferlist& bl)
+{
+ KeyValueDB::Transaction txn;
+ txn = db->get_transaction();
+ txn->set(PREFIX_SHARED_BLOB, key, bl);
+ db->submit_transaction_sync(txn);
+};
+
+void BlueStore::inject_no_shared_blob_key()
+{
+ KeyValueDB::Transaction txn;
+ txn = db->get_transaction();
+ ceph_assert(blobid_last > 0);
+ // kill the last used sbid, this can be broken due to blobid preallocation
+ // in rare cases, leaving as-is for the sake of simplicity
+ uint64_t sbid = blobid_last;
+
+ string key;
+ dout(5) << __func__<< " " << sbid << dendl;
+ get_shared_blob_key(sbid, &key);
+ txn->rmkey(PREFIX_SHARED_BLOB, key);
+ db->submit_transaction_sync(txn);
+};
+
+void BlueStore::inject_stray_shared_blob_key(uint64_t sbid)
+{
+ KeyValueDB::Transaction txn;
+ txn = db->get_transaction();
+
+ dout(5) << __func__ << " " << sbid << dendl;
+
+ string key;
+ get_shared_blob_key(sbid, &key);
+ bluestore_shared_blob_t persistent(sbid);
+ persistent.ref_map.get(0xdead0000, min_alloc_size);
+ bufferlist bl;
+ encode(persistent, bl);
+ dout(20) << __func__ << " sbid " << sbid
+ << " takes " << bl.length() << " bytes, updating"
+ << dendl;
+
+ txn->set(PREFIX_SHARED_BLOB, key, bl);
+ db->submit_transaction_sync(txn);
+};
+
+
+void BlueStore::inject_leaked(uint64_t len)
+{
+ PExtentVector exts;
+ int64_t alloc_len = alloc->allocate(len, min_alloc_size,
+ min_alloc_size * 256, 0, &exts);
+
+ if (fm->is_null_manager()) {
+ return;
+ }
+
+ KeyValueDB::Transaction txn;
+ txn = db->get_transaction();
+
+ ceph_assert(alloc_len >= (int64_t)len);
+ for (auto& p : exts) {
+ fm->allocate(p.offset, p.length, txn);
+ }
+ db->submit_transaction_sync(txn);
+}
+
+void BlueStore::inject_false_free(coll_t cid, ghobject_t oid)
+{
+ ceph_assert(!fm->is_null_manager());
+
+ KeyValueDB::Transaction txn;
+ OnodeRef o;
+ CollectionRef c = _get_collection(cid);
+ ceph_assert(c);
+ {
+ std::unique_lock l{c->lock}; // just to avoid internal asserts
+ o = c->get_onode(oid, false);
+ ceph_assert(o);
+ o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
+ }
+
+ bool injected = false;
+ txn = db->get_transaction();
+ auto& em = o->extent_map.extent_map;
+ std::vector<const PExtentVector*> v;
+ if (em.size()) {
+ v.push_back(&em.begin()->blob->get_blob().get_extents());
+ }
+ if (em.size() > 1) {
+ auto it = em.end();
+ --it;
+ v.push_back(&(it->blob->get_blob().get_extents()));
+ }
+ for (auto pext : v) {
+ if (pext->size()) {
+ auto p = pext->begin();
+ while (p != pext->end()) {
+ if (p->is_valid()) {
+ dout(20) << __func__ << " release 0x" << std::hex << p->offset
+ << "~" << p->length << std::dec << dendl;
+ fm->release(p->offset, p->length, txn);
+ injected = true;
+ break;
+ }
+ ++p;
+ }
+ }
+ }
+ ceph_assert(injected);
+ db->submit_transaction_sync(txn);
+}
+
+void BlueStore::inject_legacy_omap()
+{
+ dout(1) << __func__ << dendl;
+ per_pool_omap = OMAP_BULK;
+ KeyValueDB::Transaction txn;
+ txn = db->get_transaction();
+ txn->rmkey(PREFIX_SUPER, "per_pool_omap");
+ db->submit_transaction_sync(txn);
+}
+
+void BlueStore::inject_legacy_omap(coll_t cid, ghobject_t oid)
+{
+ dout(1) << __func__ << " "
+ << cid << " " << oid
+ <<dendl;
+ KeyValueDB::Transaction txn;
+ OnodeRef o;
+ CollectionRef c = _get_collection(cid);
+ ceph_assert(c);
+ {
+ std::unique_lock l{ c->lock }; // just to avoid internal asserts
+ o = c->get_onode(oid, false);
+ ceph_assert(o);
+ }
+ o->onode.clear_flag(
+ bluestore_onode_t::FLAG_PERPG_OMAP |
+ bluestore_onode_t::FLAG_PERPOOL_OMAP |
+ bluestore_onode_t::FLAG_PGMETA_OMAP);
+ txn = db->get_transaction();
+ _record_onode(o, txn);
+ db->submit_transaction_sync(txn);
+}
+
+void BlueStore::inject_stray_omap(uint64_t head, const string& name)
+{
+ dout(1) << __func__ << dendl;
+ KeyValueDB::Transaction txn = db->get_transaction();
+
+ string key;
+ bufferlist bl;
+ _key_encode_u64(head, &key);
+ key.append(name);
+ txn->set(PREFIX_OMAP, key, bl);
+
+ db->submit_transaction_sync(txn);
+}
+
+void BlueStore::inject_statfs(const string& key, const store_statfs_t& new_statfs)
+{
+ BlueStoreRepairer repairer;
+ repairer.fix_statfs(db, key, new_statfs);
+ repairer.apply(db);
+}
+
+void BlueStore::inject_global_statfs(const store_statfs_t& new_statfs)
+{
+ KeyValueDB::Transaction t = db->get_transaction();
+ volatile_statfs v;
+ v = new_statfs;
+ bufferlist bl;
+ v.encode(bl);
+ t->set(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl);
+ db->submit_transaction_sync(t);
+}
+
+void BlueStore::inject_misreference(coll_t cid1, ghobject_t oid1,
+ coll_t cid2, ghobject_t oid2,
+ uint64_t offset)
+{
+ OnodeRef o1;
+ CollectionRef c1 = _get_collection(cid1);
+ ceph_assert(c1);
+ {
+ std::unique_lock l{c1->lock}; // just to avoid internal asserts
+ o1 = c1->get_onode(oid1, false);
+ ceph_assert(o1);
+ o1->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE);
+ }
+ OnodeRef o2;
+ CollectionRef c2 = _get_collection(cid2);
+ ceph_assert(c2);
+ {
+ std::unique_lock l{c2->lock}; // just to avoid internal asserts
+ o2 = c2->get_onode(oid2, false);
+ ceph_assert(o2);
+ o2->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE);
+ }
+ Extent& e1 = *(o1->extent_map.seek_lextent(offset));
+ Extent& e2 = *(o2->extent_map.seek_lextent(offset));
+
+ // require onode/extent layout to be the same (and simple)
+ // to make things easier
+ ceph_assert(o1->onode.extent_map_shards.empty());
+ ceph_assert(o2->onode.extent_map_shards.empty());
+ ceph_assert(o1->extent_map.spanning_blob_map.size() == 0);
+ ceph_assert(o2->extent_map.spanning_blob_map.size() == 0);
+ ceph_assert(e1.logical_offset == e2.logical_offset);
+ ceph_assert(e1.length == e2.length);
+ ceph_assert(e1.blob_offset == e2.blob_offset);
+
+ KeyValueDB::Transaction txn;
+ txn = db->get_transaction();
+
+ // along with misreference error this will create space leaks errors
+ e2.blob->dirty_blob() = e1.blob->get_blob();
+ o2->extent_map.dirty_range(offset, e2.length);
+ o2->extent_map.update(txn, false);
+
+ _record_onode(o2, txn);
+ db->submit_transaction_sync(txn);
+}
+
+void BlueStore::inject_zombie_spanning_blob(coll_t cid, ghobject_t oid,
+ int16_t blob_id)
+{
+ OnodeRef o;
+ CollectionRef c = _get_collection(cid);
+ ceph_assert(c);
+ {
+ std::unique_lock l{ c->lock }; // just to avoid internal asserts
+ o = c->get_onode(oid, false);
+ ceph_assert(o);
+ o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
+ }
+
+ BlobRef b = c->new_blob();
+ b->id = blob_id;
+ o->extent_map.spanning_blob_map[blob_id] = b;
+
+ KeyValueDB::Transaction txn;
+ txn = db->get_transaction();
+
+ _record_onode(o, txn);
+ db->submit_transaction_sync(txn);
+}
+
+void BlueStore::inject_bluefs_file(std::string_view dir, std::string_view name, size_t new_size)
+{
+ ceph_assert(bluefs);
+
+ BlueFS::FileWriter* p_handle = nullptr;
+ auto ret = bluefs->open_for_write(dir, name, &p_handle, false);
+ ceph_assert(ret == 0);
+
+ std::string s('0', new_size);
+ bufferlist bl;
+ bl.append(s);
+ p_handle->append(bl);
+
+ bluefs->fsync(p_handle);
+ bluefs->close_writer(p_handle);
+}
+
+void BlueStore::collect_metadata(map<string,string> *pm)
+{
+ dout(10) << __func__ << dendl;
+ bdev->collect_metadata("bluestore_bdev_", pm);
+ if (bluefs) {
+ (*pm)["bluefs"] = "1";
+ // this value is for backward compatibility only
+ (*pm)["bluefs_single_shared_device"] = \
+ stringify((int)bluefs_layout.single_shared_device());
+ (*pm)["bluefs_dedicated_db"] = \
+ stringify((int)bluefs_layout.dedicated_db);
+ (*pm)["bluefs_dedicated_wal"] = \
+ stringify((int)bluefs_layout.dedicated_wal);
+ bluefs->collect_metadata(pm, bluefs_layout.shared_bdev);
+ } else {
+ (*pm)["bluefs"] = "0";
+ }
+
+ // report numa mapping for underlying devices
+ int node = -1;
+ set<int> nodes;
+ set<string> failed;
+ int r = get_numa_node(&node, &nodes, &failed);
+ if (r >= 0) {
+ if (!failed.empty()) {
+ (*pm)["objectstore_numa_unknown_devices"] = stringify(failed);
+ }
+ if (!nodes.empty()) {
+ dout(1) << __func__ << " devices span numa nodes " << nodes << dendl;
+ (*pm)["objectstore_numa_nodes"] = stringify(nodes);
+ }
+ if (node >= 0) {
+ (*pm)["objectstore_numa_node"] = stringify(node);
+ }
+ }
+ (*pm)["bluestore_min_alloc_size"] = stringify(min_alloc_size);
+}
+
+int BlueStore::get_numa_node(
+ int *final_node,
+ set<int> *out_nodes,
+ set<string> *out_failed)
+{
+ int node = -1;
+ set<string> devices;
+ get_devices(&devices);
+ set<int> nodes;
+ set<string> failed;
+ for (auto& devname : devices) {
+ int n;
+ BlkDev bdev(devname);
+ int r = bdev.get_numa_node(&n);
+ if (r < 0) {
+ dout(10) << __func__ << " bdev " << devname << " can't detect numa_node"
+ << dendl;
+ failed.insert(devname);
+ continue;
+ }
+ dout(10) << __func__ << " bdev " << devname << " on numa_node " << n
+ << dendl;
+ nodes.insert(n);
+ if (node < 0) {
+ node = n;
+ }
+ }
+ if (node >= 0 && nodes.size() == 1 && failed.empty()) {
+ *final_node = node;
+ }
+ if (out_nodes) {
+ *out_nodes = nodes;
+ }
+ if (out_failed) {
+ *out_failed = failed;
+ }
+ return 0;
+}
+
+void BlueStore::prepare_for_fast_shutdown()
+{
+ m_fast_shutdown = true;
+}
+
+int BlueStore::get_devices(set<string> *ls)
+{
+ if (bdev) {
+ bdev->get_devices(ls);
+ if (bluefs) {
+ bluefs->get_devices(ls);
+ }
+ return 0;
+ }
+
+ // grumble, we haven't started up yet.
+ if (int r = _open_path(); r < 0) {
+ return r;
+ }
+ auto close_path = make_scope_guard([&] {
+ _close_path();
+ });
+ if (int r = _open_fsid(false); r < 0) {
+ return r;
+ }
+ auto close_fsid = make_scope_guard([&] {
+ _close_fsid();
+ });
+ if (int r = _read_fsid(&fsid); r < 0) {
+ return r;
+ }
+ if (int r = _lock_fsid(); r < 0) {
+ return r;
+ }
+ if (int r = _open_bdev(false); r < 0) {
+ return r;
+ }
+ auto close_bdev = make_scope_guard([&] {
+ _close_bdev();
+ });
+ if (int r = _minimal_open_bluefs(false); r < 0) {
+ return r;
+ }
+ bdev->get_devices(ls);
+ if (bluefs) {
+ bluefs->get_devices(ls);
+ }
+ _minimal_close_bluefs();
+ return 0;
+}
+
+void BlueStore::_get_statfs_overall(struct store_statfs_t *buf)
+{
+ buf->reset();
+
+ auto prefix = per_pool_omap == OMAP_BULK ?
+ PREFIX_OMAP :
+ per_pool_omap == OMAP_PER_POOL ?
+ PREFIX_PERPOOL_OMAP :
+ PREFIX_PERPG_OMAP;
+ buf->omap_allocated =
+ db->estimate_prefix_size(prefix, string());
+
+ uint64_t bfree = alloc->get_free();
+
+ if (bluefs) {
+ buf->internally_reserved = 0;
+ // include dedicated db, too, if that isn't the shared device.
+ if (bluefs_layout.shared_bdev != BlueFS::BDEV_DB) {
+ buf->total += bluefs->get_total(BlueFS::BDEV_DB);
+ }
+ // call any non-omap bluefs space "internal metadata"
+ buf->internal_metadata =
+ bluefs->get_used()
+ - buf->omap_allocated;
+ }
+
+ ExtBlkDevState ebd_state;
+ int rc = bdev->get_ebd_state(ebd_state);
+ if (rc == 0) {
+ buf->total += ebd_state.get_physical_total();
+
+ // we are limited by both the size of the virtual device and the
+ // underlying physical device.
+ bfree = std::min(bfree, ebd_state.get_physical_avail());
+
+ buf->allocated = ebd_state.get_physical_total() - ebd_state.get_physical_avail();;
+ } else {
+ buf->total += bdev->get_size();
+ }
+ buf->available = bfree;
+}
+
+int BlueStore::statfs(struct store_statfs_t *buf,
+ osd_alert_list_t* alerts)
+{
+ if (alerts) {
+ alerts->clear();
+ _log_alerts(*alerts);
+ }
+ _get_statfs_overall(buf);
+ {
+ std::lock_guard l(vstatfs_lock);
+ buf->allocated = vstatfs.allocated();
+ buf->data_stored = vstatfs.stored();
+ buf->data_compressed = vstatfs.compressed();
+ buf->data_compressed_original = vstatfs.compressed_original();
+ buf->data_compressed_allocated = vstatfs.compressed_allocated();
+ }
+
+ dout(20) << __func__ << " " << *buf << dendl;
+ return 0;
+}
+
+int BlueStore::pool_statfs(uint64_t pool_id, struct store_statfs_t *buf,
+ bool *out_per_pool_omap)
+{
+ dout(20) << __func__ << " pool " << pool_id<< dendl;
+
+ if (!per_pool_stat_collection) {
+ dout(20) << __func__ << " not supported in legacy mode " << dendl;
+ return -ENOTSUP;
+ }
+ buf->reset();
+
+ {
+ std::lock_guard l(vstatfs_lock);
+ osd_pools[pool_id].publish(buf);
+ }
+
+ string key_prefix;
+ _key_encode_u64(pool_id, &key_prefix);
+ *out_per_pool_omap = per_pool_omap != OMAP_BULK;
+ // stop calls after db was closed
+ if (*out_per_pool_omap && db) {
+ auto prefix = per_pool_omap == OMAP_PER_POOL ?
+ PREFIX_PERPOOL_OMAP :
+ PREFIX_PERPG_OMAP;
+ buf->omap_allocated = db->estimate_prefix_size(prefix, key_prefix);
+ }
+
+ dout(10) << __func__ << *buf << dendl;
+ return 0;
+}
+
+void BlueStore::_check_legacy_statfs_alert()
+{
+ string s;
+ if (!per_pool_stat_collection &&
+ cct->_conf->bluestore_warn_on_legacy_statfs) {
+ s = "legacy statfs reporting detected, "
+ "suggest to run store repair to get consistent statistic reports";
+ }
+ std::lock_guard l(qlock);
+ legacy_statfs_alert = s;
+}
+
+void BlueStore::_check_no_per_pg_or_pool_omap_alert()
+{
+ string per_pg, per_pool;
+ if (per_pool_omap != OMAP_PER_PG) {
+ if (cct->_conf->bluestore_warn_on_no_per_pg_omap) {
+ per_pg = "legacy (not per-pg) omap detected, "
+ "suggest to run store repair to benefit from faster PG removal";
+ }
+ if (per_pool_omap != OMAP_PER_POOL) {
+ if (cct->_conf->bluestore_warn_on_no_per_pool_omap) {
+ per_pool = "legacy (not per-pool) omap detected, "
+ "suggest to run store repair to benefit from per-pool omap usage statistics";
+ }
+ }
+ }
+ std::lock_guard l(qlock);
+ no_per_pg_omap_alert = per_pg;
+ no_per_pool_omap_alert = per_pool;
+}
+
+// ---------------
+// cache
+
+BlueStore::CollectionRef BlueStore::_get_collection(const coll_t& cid)
+{
+ std::shared_lock l(coll_lock);
+ ceph::unordered_map<coll_t,CollectionRef>::iterator cp = coll_map.find(cid);
+ if (cp == coll_map.end())
+ return CollectionRef();
+ return cp->second;
+}
+
+BlueStore::CollectionRef BlueStore::_get_collection_by_oid(const ghobject_t& oid)
+{
+ std::shared_lock l(coll_lock);
+
+ // FIXME: we must replace this with something more efficient
+
+ for (auto& i : coll_map) {
+ spg_t spgid;
+ if (i.first.is_pg(&spgid) &&
+ i.second->contains(oid)) {
+ return i.second;
+ }
+ }
+ return CollectionRef();
+}
+
+void BlueStore::_queue_reap_collection(CollectionRef& c)
+{
+ dout(10) << __func__ << " " << c << " " << c->cid << dendl;
+ // _reap_collections and this in the same thread,
+ // so no need a lock.
+ removed_collections.push_back(c);
+}
+
+void BlueStore::_reap_collections()
+{
+
+ list<CollectionRef> removed_colls;
+ {
+ // _queue_reap_collection and this in the same thread.
+ // So no need a lock.
+ if (!removed_collections.empty())
+ removed_colls.swap(removed_collections);
+ else
+ return;
+ }
+
+ list<CollectionRef>::iterator p = removed_colls.begin();
+ while (p != removed_colls.end()) {
+ CollectionRef c = *p;
+ dout(10) << __func__ << " " << c << " " << c->cid << dendl;
+ if (c->onode_space.map_any([&](Onode* o) {
+ ceph_assert(!o->exists);
+ if (o->flushing_count.load()) {
+ dout(10) << __func__ << " " << c << " " << c->cid << " " << o->oid
+ << " flush_txns " << o->flushing_count << dendl;
+ return true;
+ }
+ return false;
+ })) {
+ ++p;
+ continue;
+ }
+ c->onode_space.clear();
+ p = removed_colls.erase(p);
+ dout(10) << __func__ << " " << c << " " << c->cid << " done" << dendl;
+ }
+ if (removed_colls.empty()) {
+ dout(10) << __func__ << " all reaped" << dendl;
+ } else {
+ removed_collections.splice(removed_collections.begin(), removed_colls);
+ }
+}
+
+void BlueStore::_update_logger()
+{
+ uint64_t num_onodes = 0;
+ uint64_t num_pinned_onodes = 0;
+ uint64_t num_extents = 0;
+ uint64_t num_blobs = 0;
+ uint64_t num_buffers = 0;
+ uint64_t num_buffer_bytes = 0;
+ for (auto c : onode_cache_shards) {
+ c->add_stats(&num_onodes, &num_pinned_onodes);
+ }
+ for (auto c : buffer_cache_shards) {
+ c->add_stats(&num_extents, &num_blobs,
+ &num_buffers, &num_buffer_bytes);
+ }
+ logger->set(l_bluestore_onodes, num_onodes);
+ logger->set(l_bluestore_pinned_onodes, num_pinned_onodes);
+ logger->set(l_bluestore_extents, num_extents);
+ logger->set(l_bluestore_blobs, num_blobs);
+ logger->set(l_bluestore_buffers, num_buffers);
+ logger->set(l_bluestore_buffer_bytes, num_buffer_bytes);
+}
+
+// ---------------
+// read operations
+
+ObjectStore::CollectionHandle BlueStore::open_collection(const coll_t& cid)
+{
+ return _get_collection(cid);
+}
+
+ObjectStore::CollectionHandle BlueStore::create_new_collection(
+ const coll_t& cid)
+{
+ std::unique_lock l{coll_lock};
+ auto c = ceph::make_ref<Collection>(
+ this,
+ onode_cache_shards[cid.hash_to_shard(onode_cache_shards.size())],
+ buffer_cache_shards[cid.hash_to_shard(buffer_cache_shards.size())],
+ cid);
+ new_coll_map[cid] = c;
+ _osr_attach(c.get());
+ return c;
+}
+
+void BlueStore::set_collection_commit_queue(
+ const coll_t& cid,
+ ContextQueue *commit_queue)
+{
+ if (commit_queue) {
+ std::shared_lock l(coll_lock);
+ if (coll_map.count(cid)) {
+ coll_map[cid]->commit_queue = commit_queue;
+ } else if (new_coll_map.count(cid)) {
+ new_coll_map[cid]->commit_queue = commit_queue;
+ }
+ }
+}
+
+
+bool BlueStore::exists(CollectionHandle &c_, const ghobject_t& oid)
+{
+ Collection *c = static_cast<Collection *>(c_.get());
+ dout(10) << __func__ << " " << c->cid << " " << oid << dendl;
+ if (!c->exists)
+ return false;
+
+ bool r = true;
+
+ {
+ std::shared_lock l(c->lock);
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists)
+ r = false;
+ }
+
+ return r;
+}
+
+int BlueStore::stat(
+ CollectionHandle &c_,
+ const ghobject_t& oid,
+ struct stat *st,
+ bool allow_eio)
+{
+ Collection *c = static_cast<Collection *>(c_.get());
+ if (!c->exists)
+ return -ENOENT;
+ dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
+
+ {
+ std::shared_lock l(c->lock);
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists)
+ return -ENOENT;
+ st->st_size = o->onode.size;
+ st->st_blksize = 4096;
+ st->st_blocks = (st->st_size + st->st_blksize - 1) / st->st_blksize;
+ st->st_nlink = 1;
+ }
+
+ int r = 0;
+ if (_debug_mdata_eio(oid)) {
+ r = -EIO;
+ derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
+ }
+ return r;
+}
+int BlueStore::set_collection_opts(
+ CollectionHandle& ch,
+ const pool_opts_t& opts)
+{
+ Collection *c = static_cast<Collection *>(ch.get());
+ dout(15) << __func__ << " " << ch->cid << " options " << opts << dendl;
+ if (!c->exists)
+ return -ENOENT;
+ std::unique_lock l{c->lock};
+ c->pool_opts = opts;
+ return 0;
+}
+
+int BlueStore::read(
+ CollectionHandle &c_,
+ const ghobject_t& oid,
+ uint64_t offset,
+ size_t length,
+ bufferlist& bl,
+ uint32_t op_flags)
+{
+ auto start = mono_clock::now();
+ Collection *c = static_cast<Collection *>(c_.get());
+ const coll_t &cid = c->get_cid();
+ dout(15) << __func__ << " " << cid << " " << oid
+ << " 0x" << std::hex << offset << "~" << length << std::dec
+ << dendl;
+ if (!c->exists)
+ return -ENOENT;
+
+ bl.clear();
+ int r;
+ {
+ std::shared_lock l(c->lock);
+ auto start1 = mono_clock::now();
+ OnodeRef o = c->get_onode(oid, false);
+ log_latency("get_onode@read",
+ l_bluestore_read_onode_meta_lat,
+ mono_clock::now() - start1,
+ cct->_conf->bluestore_log_op_age);
+ if (!o || !o->exists) {
+ r = -ENOENT;
+ goto out;
+ }
+
+ if (offset == length && offset == 0)
+ length = o->onode.size;
+
+ r = _do_read(c, o, offset, length, bl, op_flags);
+ if (r == -EIO) {
+ logger->inc(l_bluestore_read_eio);
+ }
+ }
+
+ out:
+ if (r >= 0 && _debug_data_eio(oid)) {
+ r = -EIO;
+ derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
+ } else if (oid.hobj.pool > 0 && /* FIXME, see #23029 */
+ cct->_conf->bluestore_debug_random_read_err &&
+ (rand() % (int)(cct->_conf->bluestore_debug_random_read_err *
+ 100.0)) == 0) {
+ dout(0) << __func__ << ": inject random EIO" << dendl;
+ r = -EIO;
+ }
+ dout(10) << __func__ << " " << cid << " " << oid
+ << " 0x" << std::hex << offset << "~" << length << std::dec
+ << " = " << r << dendl;
+ log_latency(__func__,
+ l_bluestore_read_lat,
+ mono_clock::now() - start,
+ cct->_conf->bluestore_log_op_age);
+ return r;
+}
+
+void BlueStore::_read_cache(
+ OnodeRef& o,
+ uint64_t offset,
+ size_t length,
+ int read_cache_policy,
+ ready_regions_t& ready_regions,
+ blobs2read_t& blobs2read)
+{
+ // build blob-wise list to of stuff read (that isn't cached)
+ unsigned left = length;
+ uint64_t pos = offset;
+ auto lp = o->extent_map.seek_lextent(offset);
+ while (left > 0 && lp != o->extent_map.extent_map.end()) {
+ if (pos < lp->logical_offset) {
+ unsigned hole = lp->logical_offset - pos;
+ if (hole >= left) {
+ break;
+ }
+ dout(30) << __func__ << " hole 0x" << std::hex << pos << "~" << hole
+ << std::dec << dendl;
+ pos += hole;
+ left -= hole;
+ }
+ BlobRef& bptr = lp->blob;
+ unsigned l_off = pos - lp->logical_offset;
+ unsigned b_off = l_off + lp->blob_offset;
+ unsigned b_len = std::min(left, lp->length - l_off);
+
+ ready_regions_t cache_res;
+ interval_set<uint32_t> cache_interval;
+ bptr->shared_blob->bc.read(
+ bptr->shared_blob->get_cache(), b_off, b_len, cache_res, cache_interval,
+ read_cache_policy);
+ dout(20) << __func__ << " blob " << *bptr << std::hex
+ << " need 0x" << b_off << "~" << b_len
+ << " cache has 0x" << cache_interval
+ << std::dec << dendl;
+
+ auto pc = cache_res.begin();
+ uint64_t chunk_size = bptr->get_blob().get_chunk_size(block_size);
+ while (b_len > 0) {
+ unsigned l;
+ if (pc != cache_res.end() &&
+ pc->first == b_off) {
+ l = pc->second.length();
+ ready_regions[pos] = std::move(pc->second);
+ dout(30) << __func__ << " use cache 0x" << std::hex << pos << ": 0x"
+ << b_off << "~" << l << std::dec << dendl;
+ ++pc;
+ } else {
+ l = b_len;
+ if (pc != cache_res.end()) {
+ ceph_assert(pc->first > b_off);
+ l = pc->first - b_off;
+ }
+ dout(30) << __func__ << " will read 0x" << std::hex << pos << ": 0x"
+ << b_off << "~" << l << std::dec << dendl;
+ // merge regions
+ {
+ uint64_t r_off = b_off;
+ uint64_t r_len = l;
+ uint64_t front = r_off % chunk_size;
+ if (front) {
+ r_off -= front;
+ r_len += front;
+ }
+ unsigned tail = r_len % chunk_size;
+ if (tail) {
+ r_len += chunk_size - tail;
+ }
+ bool merged = false;
+ regions2read_t& r2r = blobs2read[bptr];
+ if (r2r.size()) {
+ read_req_t& pre = r2r.back();
+ if (r_off <= (pre.r_off + pre.r_len)) {
+ front += (r_off - pre.r_off);
+ pre.r_len += (r_off + r_len - pre.r_off - pre.r_len);
+ pre.regs.emplace_back(region_t(pos, b_off, l, front));
+ merged = true;
+ }
+ }
+ if (!merged) {
+ read_req_t req(r_off, r_len);
+ req.regs.emplace_back(region_t(pos, b_off, l, front));
+ r2r.emplace_back(std::move(req));
+ }
+ }
+ }
+ pos += l;
+ b_off += l;
+ left -= l;
+ b_len -= l;
+ }
+ ++lp;
+ }
+}
+
+int BlueStore::_prepare_read_ioc(
+ blobs2read_t& blobs2read,
+ vector<bufferlist>* compressed_blob_bls,
+ IOContext* ioc)
+{
+ for (auto& p : blobs2read) {
+ const BlobRef& bptr = p.first;
+ regions2read_t& r2r = p.second;
+ dout(20) << __func__ << " blob " << *bptr << " need "
+ << r2r << dendl;
+ if (bptr->get_blob().is_compressed()) {
+ // read the whole thing
+ if (compressed_blob_bls->empty()) {
+ // ensure we avoid any reallocation on subsequent blobs
+ compressed_blob_bls->reserve(blobs2read.size());
+ }
+ compressed_blob_bls->push_back(bufferlist());
+ bufferlist& bl = compressed_blob_bls->back();
+ auto r = bptr->get_blob().map(
+ 0, bptr->get_blob().get_ondisk_length(),
+ [&](uint64_t offset, uint64_t length) {
+ int r = bdev->aio_read(offset, length, &bl, ioc);
+ if (r < 0)
+ return r;
+ return 0;
+ });
+ if (r < 0) {
+ derr << __func__ << " bdev-read failed: " << cpp_strerror(r) << dendl;
+ if (r == -EIO) {
+ // propagate EIO to caller
+ return r;
+ }
+ ceph_assert(r == 0);
+ }
+ } else {
+ // read the pieces
+ for (auto& req : r2r) {
+ dout(20) << __func__ << " region 0x" << std::hex
+ << req.regs.front().logical_offset
+ << ": 0x" << req.regs.front().blob_xoffset
+ << " reading 0x" << req.r_off
+ << "~" << req.r_len << std::dec
+ << dendl;
+
+ // read it
+ auto r = bptr->get_blob().map(
+ req.r_off, req.r_len,
+ [&](uint64_t offset, uint64_t length) {
+ int r = bdev->aio_read(offset, length, &req.bl, ioc);
+ if (r < 0)
+ return r;
+ return 0;
+ });
+ if (r < 0) {
+ derr << __func__ << " bdev-read failed: " << cpp_strerror(r)
+ << dendl;
+ if (r == -EIO) {
+ // propagate EIO to caller
+ return r;
+ }
+ ceph_assert(r == 0);
+ }
+ ceph_assert(req.bl.length() == req.r_len);
+ }
+ }
+ }
+ return 0;
+}
+
+int BlueStore::_generate_read_result_bl(
+ OnodeRef& o,
+ uint64_t offset,
+ size_t length,
+ ready_regions_t& ready_regions,
+ vector<bufferlist>& compressed_blob_bls,
+ blobs2read_t& blobs2read,
+ bool buffered,
+ bool* csum_error,
+ bufferlist& bl)
+{
+ // enumerate and decompress desired blobs
+ auto p = compressed_blob_bls.begin();
+ blobs2read_t::iterator b2r_it = blobs2read.begin();
+ while (b2r_it != blobs2read.end()) {
+ const BlobRef& bptr = b2r_it->first;
+ regions2read_t& r2r = b2r_it->second;
+ dout(20) << __func__ << " blob " << *bptr << " need "
+ << r2r << dendl;
+ if (bptr->get_blob().is_compressed()) {
+ ceph_assert(p != compressed_blob_bls.end());
+ bufferlist& compressed_bl = *p++;
+ if (_verify_csum(o, &bptr->get_blob(), 0, compressed_bl,
+ r2r.front().regs.front().logical_offset) < 0) {
+ *csum_error = true;
+ return -EIO;
+ }
+ bufferlist raw_bl;
+ auto r = _decompress(compressed_bl, &raw_bl);
+ if (r < 0)
+ return r;
+ if (buffered) {
+ bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(), 0,
+ raw_bl);
+ }
+ for (auto& req : r2r) {
+ for (auto& r : req.regs) {
+ ready_regions[r.logical_offset].substr_of(
+ raw_bl, r.blob_xoffset, r.length);
+ }
+ }
+ } else {
+ for (auto& req : r2r) {
+ if (_verify_csum(o, &bptr->get_blob(), req.r_off, req.bl,
+ req.regs.front().logical_offset) < 0) {
+ *csum_error = true;
+ return -EIO;
+ }
+ if (buffered) {
+ bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(),
+ req.r_off, req.bl);
+ }
+
+ // prune and keep result
+ for (const auto& r : req.regs) {
+ ready_regions[r.logical_offset].substr_of(req.bl, r.front, r.length);
+ }
+ }
+ }
+ ++b2r_it;
+ }
+
+ // generate a resulting buffer
+ auto pr = ready_regions.begin();
+ auto pr_end = ready_regions.end();
+ uint64_t pos = 0;
+ while (pos < length) {
+ if (pr != pr_end && pr->first == pos + offset) {
+ dout(30) << __func__ << " assemble 0x" << std::hex << pos
+ << ": data from 0x" << pr->first << "~" << pr->second.length()
+ << std::dec << dendl;
+ pos += pr->second.length();
+ bl.claim_append(pr->second);
+ ++pr;
+ } else {
+ uint64_t l = length - pos;
+ if (pr != pr_end) {
+ ceph_assert(pr->first > pos + offset);
+ l = pr->first - (pos + offset);
+ }
+ dout(30) << __func__ << " assemble 0x" << std::hex << pos
+ << ": zeros for 0x" << (pos + offset) << "~" << l
+ << std::dec << dendl;
+ bl.append_zero(l);
+ pos += l;
+ }
+ }
+ ceph_assert(bl.length() == length);
+ ceph_assert(pos == length);
+ ceph_assert(pr == pr_end);
+ return 0;
+}
+
+int BlueStore::_do_read(
+ Collection *c,
+ OnodeRef& o,
+ uint64_t offset,
+ size_t length,
+ bufferlist& bl,
+ uint32_t op_flags,
+ uint64_t retry_count)
+{
+ FUNCTRACE(cct);
+ int r = 0;
+ int read_cache_policy = 0; // do not bypass clean or dirty cache
+
+ dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
+ << " size 0x" << o->onode.size << " (" << std::dec
+ << o->onode.size << ")" << dendl;
+ bl.clear();
+
+ if (offset >= o->onode.size) {
+ return r;
+ }
+
+ // generally, don't buffer anything, unless the client explicitly requests
+ // it.
+ bool buffered = false;
+ if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
+ dout(20) << __func__ << " will do buffered read" << dendl;
+ buffered = true;
+ } else if (cct->_conf->bluestore_default_buffered_read &&
+ (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
+ CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
+ dout(20) << __func__ << " defaulting to buffered read" << dendl;
+ buffered = true;
+ }
+
+ if (offset + length > o->onode.size) {
+ length = o->onode.size - offset;
+ }
+
+ auto start = mono_clock::now();
+ o->extent_map.fault_range(db, offset, length);
+ log_latency(__func__,
+ l_bluestore_read_onode_meta_lat,
+ mono_clock::now() - start,
+ cct->_conf->bluestore_log_op_age);
+ _dump_onode<30>(cct, *o);
+
+ // for deep-scrub, we only read dirty cache and bypass clean cache in
+ // order to read underlying block device in case there are silent disk errors.
+ if (op_flags & CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE) {
+ dout(20) << __func__ << " will bypass cache and do direct read" << dendl;
+ read_cache_policy = BufferSpace::BYPASS_CLEAN_CACHE;
+ }
+
+ // build blob-wise list to of stuff read (that isn't cached)
+ ready_regions_t ready_regions;
+ blobs2read_t blobs2read;
+ _read_cache(o, offset, length, read_cache_policy, ready_regions, blobs2read);
+
+
+ // read raw blob data.
+ start = mono_clock::now(); // for the sake of simplicity
+ // measure the whole block below.
+ // The error isn't that much...
+ vector<bufferlist> compressed_blob_bls;
+ IOContext ioc(cct, NULL, !cct->_conf->bluestore_fail_eio);
+ r = _prepare_read_ioc(blobs2read, &compressed_blob_bls, &ioc);
+ // we always issue aio for reading, so errors other than EIO are not allowed
+ if (r < 0)
+ return r;
+
+ int64_t num_ios = blobs2read.size();
+ if (ioc.has_pending_aios()) {
+ num_ios = ioc.get_num_ios();
+ bdev->aio_submit(&ioc);
+ dout(20) << __func__ << " waiting for aio" << dendl;
+ ioc.aio_wait();
+ r = ioc.get_return_value();
+ if (r < 0) {
+ ceph_assert(r == -EIO); // no other errors allowed
+ return -EIO;
+ }
+ }
+ log_latency_fn(__func__,
+ l_bluestore_read_wait_aio_lat,
+ mono_clock::now() - start,
+ cct->_conf->bluestore_log_op_age,
+ [&](auto lat) { return ", num_ios = " + stringify(num_ios); }
+ );
+
+ bool csum_error = false;
+ r = _generate_read_result_bl(o, offset, length, ready_regions,
+ compressed_blob_bls, blobs2read,
+ buffered && !ioc.skip_cache(),
+ &csum_error, bl);
+ if (csum_error) {
+ // Handles spurious read errors caused by a kernel bug.
+ // We sometimes get all-zero pages as a result of the read under
+ // high memory pressure. Retrying the failing read succeeds in most
+ // cases.
+ // See also: http://tracker.ceph.com/issues/22464
+ if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
+ return -EIO;
+ }
+ return _do_read(c, o, offset, length, bl, op_flags, retry_count + 1);
+ }
+ r = bl.length();
+ if (retry_count) {
+ logger->inc(l_bluestore_reads_with_retries);
+ dout(5) << __func__ << " read at 0x" << std::hex << offset << "~" << length
+ << " failed " << std::dec << retry_count << " times before succeeding" << dendl;
+ stringstream s;
+ s << " reads with retries: " << logger->get(l_bluestore_reads_with_retries);
+ _set_spurious_read_errors_alert(s.str());
+ }
+ return r;
+}
+
+int BlueStore::_verify_csum(OnodeRef& o,
+ const bluestore_blob_t* blob, uint64_t blob_xoffset,
+ const bufferlist& bl,
+ uint64_t logical_offset) const
+{
+ int bad;
+ uint64_t bad_csum;
+ auto start = mono_clock::now();
+ int r = blob->verify_csum(blob_xoffset, bl, &bad, &bad_csum);
+ if (cct->_conf->bluestore_debug_inject_csum_err_probability > 0 &&
+ (rand() % 10000) < cct->_conf->bluestore_debug_inject_csum_err_probability * 10000.0) {
+ derr << __func__ << " injecting bluestore checksum verifcation error" << dendl;
+ bad = blob_xoffset;
+ r = -1;
+ bad_csum = 0xDEADBEEF;
+ }
+ if (r < 0) {
+ if (r == -1) {
+ PExtentVector pex;
+ blob->map(
+ bad,
+ blob->get_csum_chunk_size(),
+ [&](uint64_t offset, uint64_t length) {
+ pex.emplace_back(bluestore_pextent_t(offset, length));
+ return 0;
+ });
+ derr << __func__ << " bad "
+ << Checksummer::get_csum_type_string(blob->csum_type)
+ << "/0x" << std::hex << blob->get_csum_chunk_size()
+ << " checksum at blob offset 0x" << bad
+ << ", got 0x" << bad_csum << ", expected 0x"
+ << blob->get_csum_item(bad / blob->get_csum_chunk_size()) << std::dec
+ << ", device location " << pex
+ << ", logical extent 0x" << std::hex
+ << (logical_offset + bad - blob_xoffset) << "~"
+ << blob->get_csum_chunk_size() << std::dec
+ << ", object " << o->oid
+ << dendl;
+ } else {
+ derr << __func__ << " failed with exit code: " << cpp_strerror(r) << dendl;
+ }
+ }
+ log_latency(__func__,
+ l_bluestore_csum_lat,
+ mono_clock::now() - start,
+ cct->_conf->bluestore_log_op_age);
+ if (cct->_conf->bluestore_ignore_data_csum) {
+ return 0;
+ }
+ return r;
+}
+
+int BlueStore::_decompress(bufferlist& source, bufferlist* result)
+{
+ int r = 0;
+ auto start = mono_clock::now();
+ auto i = source.cbegin();
+ bluestore_compression_header_t chdr;
+ decode(chdr, i);
+ int alg = int(chdr.type);
+ CompressorRef cp = compressor;
+ if (!cp || (int)cp->get_type() != alg) {
+ cp = Compressor::create(cct, alg);
+ }
+
+ if (!cp.get()) {
+ // if compressor isn't available - error, because cannot return
+ // decompressed data?
+
+ const char* alg_name = Compressor::get_comp_alg_name(alg);
+ derr << __func__ << " can't load decompressor " << alg_name << dendl;
+ _set_compression_alert(false, alg_name);
+ r = -EIO;
+ } else {
+ r = cp->decompress(i, chdr.length, *result, chdr.compressor_message);
+ if (r < 0) {
+ derr << __func__ << " decompression failed with exit code " << r << dendl;
+ r = -EIO;
+ }
+ }
+ log_latency(__func__,
+ l_bluestore_decompress_lat,
+ mono_clock::now() - start,
+ cct->_conf->bluestore_log_op_age);
+ return r;
+}
+
+// this stores fiemap into interval_set, other variations
+// use it internally
+int BlueStore::_fiemap(
+ CollectionHandle &c_,
+ const ghobject_t& oid,
+ uint64_t offset,
+ size_t length,
+ interval_set<uint64_t>& destset)
+{
+ Collection *c = static_cast<Collection *>(c_.get());
+ if (!c->exists)
+ return -ENOENT;
+ {
+ std::shared_lock l(c->lock);
+
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists) {
+ return -ENOENT;
+ }
+ _dump_onode<30>(cct, *o);
+
+ dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
+ << " size 0x" << o->onode.size << std::dec << dendl;
+
+ boost::intrusive::set<Extent>::iterator ep, eend;
+ if (offset >= o->onode.size)
+ goto out;
+
+ if (offset + length > o->onode.size) {
+ length = o->onode.size - offset;
+ }
+
+ o->extent_map.fault_range(db, offset, length);
+ eend = o->extent_map.extent_map.end();
+ ep = o->extent_map.seek_lextent(offset);
+ while (length > 0) {
+ dout(20) << __func__ << " offset " << offset << dendl;
+ if (ep != eend && ep->logical_offset + ep->length <= offset) {
+ ++ep;
+ continue;
+ }
+
+ uint64_t x_len = length;
+ if (ep != eend && ep->logical_offset <= offset) {
+ uint64_t x_off = offset - ep->logical_offset;
+ x_len = std::min(x_len, ep->length - x_off);
+ dout(30) << __func__ << " lextent 0x" << std::hex << offset << "~"
+ << x_len << std::dec << " blob " << ep->blob << dendl;
+ destset.insert(offset, x_len);
+ length -= x_len;
+ offset += x_len;
+ if (x_off + x_len == ep->length)
+ ++ep;
+ continue;
+ }
+ if (ep != eend &&
+ ep->logical_offset > offset &&
+ ep->logical_offset - offset < x_len) {
+ x_len = ep->logical_offset - offset;
+ }
+ offset += x_len;
+ length -= x_len;
+ }
+ }
+
+ out:
+ dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
+ << " size = 0x(" << destset << ")" << std::dec << dendl;
+ return 0;
+}
+
+int BlueStore::fiemap(
+ CollectionHandle &c_,
+ const ghobject_t& oid,
+ uint64_t offset,
+ size_t length,
+ bufferlist& bl)
+{
+ interval_set<uint64_t> m;
+ int r = _fiemap(c_, oid, offset, length, m);
+ if (r >= 0) {
+ encode(m, bl);
+ }
+ return r;
+}
+
+int BlueStore::fiemap(
+ CollectionHandle &c_,
+ const ghobject_t& oid,
+ uint64_t offset,
+ size_t length,
+ map<uint64_t, uint64_t>& destmap)
+{
+ interval_set<uint64_t> m;
+ int r = _fiemap(c_, oid, offset, length, m);
+ if (r >= 0) {
+ destmap = std::move(m).detach();
+ }
+ return r;
+}
+
+int BlueStore::readv(
+ CollectionHandle &c_,
+ const ghobject_t& oid,
+ interval_set<uint64_t>& m,
+ bufferlist& bl,
+ uint32_t op_flags)
+{
+ auto start = mono_clock::now();
+ Collection *c = static_cast<Collection *>(c_.get());
+ const coll_t &cid = c->get_cid();
+ dout(15) << __func__ << " " << cid << " " << oid
+ << " fiemap " << m
+ << dendl;
+ if (!c->exists)
+ return -ENOENT;
+
+ bl.clear();
+ int r;
+ {
+ std::shared_lock l(c->lock);
+ auto start1 = mono_clock::now();
+ OnodeRef o = c->get_onode(oid, false);
+ log_latency("get_onode@read",
+ l_bluestore_read_onode_meta_lat,
+ mono_clock::now() - start1,
+ cct->_conf->bluestore_log_op_age);
+ if (!o || !o->exists) {
+ r = -ENOENT;
+ goto out;
+ }
+
+ if (m.empty()) {
+ r = 0;
+ goto out;
+ }
+
+ r = _do_readv(c, o, m, bl, op_flags);
+ if (r == -EIO) {
+ logger->inc(l_bluestore_read_eio);
+ }
+ }
+
+ out:
+ if (r >= 0 && _debug_data_eio(oid)) {
+ r = -EIO;
+ derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
+ } else if (oid.hobj.pool > 0 && /* FIXME, see #23029 */
+ cct->_conf->bluestore_debug_random_read_err &&
+ (rand() % (int)(cct->_conf->bluestore_debug_random_read_err *
+ 100.0)) == 0) {
+ dout(0) << __func__ << ": inject random EIO" << dendl;
+ r = -EIO;
+ }
+ dout(10) << __func__ << " " << cid << " " << oid
+ << " fiemap " << m << std::dec
+ << " = " << r << dendl;
+ log_latency(__func__,
+ l_bluestore_read_lat,
+ mono_clock::now() - start,
+ cct->_conf->bluestore_log_op_age);
+ return r;
+}
+
+int BlueStore::_do_readv(
+ Collection *c,
+ OnodeRef& o,
+ const interval_set<uint64_t>& m,
+ bufferlist& bl,
+ uint32_t op_flags,
+ uint64_t retry_count)
+{
+ FUNCTRACE(cct);
+ int r = 0;
+ int read_cache_policy = 0; // do not bypass clean or dirty cache
+
+ dout(20) << __func__ << " fiemap " << m << std::hex
+ << " size 0x" << o->onode.size << " (" << std::dec
+ << o->onode.size << ")" << dendl;
+
+ // generally, don't buffer anything, unless the client explicitly requests
+ // it.
+ bool buffered = false;
+ if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
+ dout(20) << __func__ << " will do buffered read" << dendl;
+ buffered = true;
+ } else if (cct->_conf->bluestore_default_buffered_read &&
+ (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
+ CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
+ dout(20) << __func__ << " defaulting to buffered read" << dendl;
+ buffered = true;
+ }
+ // this method must be idempotent since we may call it several times
+ // before we finally read the expected result.
+ bl.clear();
+
+ // call fiemap first!
+ ceph_assert(m.range_start() <= o->onode.size);
+ ceph_assert(m.range_end() <= o->onode.size);
+ auto start = mono_clock::now();
+ o->extent_map.fault_range(db, m.range_start(), m.range_end() - m.range_start());
+ log_latency(__func__,
+ l_bluestore_read_onode_meta_lat,
+ mono_clock::now() - start,
+ cct->_conf->bluestore_log_op_age);
+ _dump_onode<30>(cct, *o);
+
+ IOContext ioc(cct, NULL, !cct->_conf->bluestore_fail_eio);
+ vector<std::tuple<ready_regions_t, vector<bufferlist>, blobs2read_t>> raw_results;
+ raw_results.reserve(m.num_intervals());
+ int i = 0;
+ for (auto p = m.begin(); p != m.end(); p++, i++) {
+ raw_results.push_back({});
+ _read_cache(o, p.get_start(), p.get_len(), read_cache_policy,
+ std::get<0>(raw_results[i]), std::get<2>(raw_results[i]));
+ r = _prepare_read_ioc(std::get<2>(raw_results[i]), &std::get<1>(raw_results[i]), &ioc);
+ // we always issue aio for reading, so errors other than EIO are not allowed
+ if (r < 0)
+ return r;
+ }
+
+ auto num_ios = m.size();
+ if (ioc.has_pending_aios()) {
+ num_ios = ioc.get_num_ios();
+ bdev->aio_submit(&ioc);
+ dout(20) << __func__ << " waiting for aio" << dendl;
+ ioc.aio_wait();
+ r = ioc.get_return_value();
+ if (r < 0) {
+ ceph_assert(r == -EIO); // no other errors allowed
+ return -EIO;
+ }
+ }
+ log_latency_fn(__func__,
+ l_bluestore_read_wait_aio_lat,
+ mono_clock::now() - start,
+ cct->_conf->bluestore_log_op_age,
+ [&](auto lat) { return ", num_ios = " + stringify(num_ios); }
+ );
+
+ ceph_assert(raw_results.size() == (size_t)m.num_intervals());
+ i = 0;
+ for (auto p = m.begin(); p != m.end(); p++, i++) {
+ bool csum_error = false;
+ bufferlist t;
+ r = _generate_read_result_bl(o, p.get_start(), p.get_len(),
+ std::get<0>(raw_results[i]),
+ std::get<1>(raw_results[i]),
+ std::get<2>(raw_results[i]),
+ buffered, &csum_error, t);
+ if (csum_error) {
+ // Handles spurious read errors caused by a kernel bug.
+ // We sometimes get all-zero pages as a result of the read under
+ // high memory pressure. Retrying the failing read succeeds in most
+ // cases.
+ // See also: http://tracker.ceph.com/issues/22464
+ if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
+ return -EIO;
+ }
+ return _do_readv(c, o, m, bl, op_flags, retry_count + 1);
+ }
+ bl.claim_append(t);
+ }
+ if (retry_count) {
+ logger->inc(l_bluestore_reads_with_retries);
+ dout(5) << __func__ << " read fiemap " << m
+ << " failed " << retry_count << " times before succeeding"
+ << dendl;
+ }
+ return bl.length();
+}
+
+int BlueStore::dump_onode(CollectionHandle &c_,
+ const ghobject_t& oid,
+ const string& section_name,
+ Formatter *f)
+{
+ Collection *c = static_cast<Collection *>(c_.get());
+ dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
+ if (!c->exists)
+ return -ENOENT;
+
+ int r;
+ {
+ std::shared_lock l(c->lock);
+
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists) {
+ r = -ENOENT;
+ goto out;
+ }
+ // FIXME minor: actually the next line isn't enough to
+ // load shared blobs. Leaving as is for now..
+ //
+ o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
+
+ _dump_onode<0>(cct, *o);
+ f->open_object_section(section_name.c_str());
+ o->dump(f);
+ f->close_section();
+ r = 0;
+ }
+ out:
+ dout(10) << __func__ << " " << c->cid << " " << oid
+ << " = " << r << dendl;
+ return r;
+}
+
+int BlueStore::getattr(
+ CollectionHandle &c_,
+ const ghobject_t& oid,
+ const char *name,
+ bufferptr& value)
+{
+ Collection *c = static_cast<Collection *>(c_.get());
+ dout(15) << __func__ << " " << c->cid << " " << oid << " " << name << dendl;
+ if (!c->exists)
+ return -ENOENT;
+
+ int r;
+ {
+ std::shared_lock l(c->lock);
+ mempool::bluestore_cache_meta::string k(name);
+
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists) {
+ r = -ENOENT;
+ goto out;
+ }
+
+ if (!o->onode.attrs.count(k)) {
+ r = -ENODATA;
+ goto out;
+ }
+ value = o->onode.attrs[k];
+ r = 0;
+ }
+ out:
+ if (r == 0 && _debug_mdata_eio(oid)) {
+ r = -EIO;
+ derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
+ }
+ dout(10) << __func__ << " " << c->cid << " " << oid << " " << name
+ << " = " << r << dendl;
+ return r;
+}
+
+int BlueStore::getattrs(
+ CollectionHandle &c_,
+ const ghobject_t& oid,
+ map<string,bufferptr,less<>>& aset)
+{
+ Collection *c = static_cast<Collection *>(c_.get());
+ dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
+ if (!c->exists)
+ return -ENOENT;
+
+ int r;
+ {
+ std::shared_lock l(c->lock);
+
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists) {
+ r = -ENOENT;
+ goto out;
+ }
+ for (auto& i : o->onode.attrs) {
+ aset.emplace(i.first.c_str(), i.second);
+ }
+ r = 0;
+ }
+
+ out:
+ if (r == 0 && _debug_mdata_eio(oid)) {
+ r = -EIO;
+ derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
+ }
+ dout(10) << __func__ << " " << c->cid << " " << oid
+ << " = " << r << dendl;
+ return r;
+}
+
+int BlueStore::list_collections(vector<coll_t>& ls)
+{
+ std::shared_lock l(coll_lock);
+ ls.reserve(coll_map.size());
+ for (ceph::unordered_map<coll_t, CollectionRef>::iterator p = coll_map.begin();
+ p != coll_map.end();
+ ++p)
+ ls.push_back(p->first);
+ return 0;
+}
+
+bool BlueStore::collection_exists(const coll_t& c)
+{
+ std::shared_lock l(coll_lock);
+ return coll_map.count(c);
+}
+
+int BlueStore::collection_empty(CollectionHandle& ch, bool *empty)
+{
+ dout(15) << __func__ << " " << ch->cid << dendl;
+ vector<ghobject_t> ls;
+ ghobject_t next;
+ int r = collection_list(ch, ghobject_t(), ghobject_t::get_max(), 1,
+ &ls, &next);
+ if (r < 0) {
+ derr << __func__ << " collection_list returned: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+ *empty = ls.empty();
+ dout(10) << __func__ << " " << ch->cid << " = " << (int)(*empty) << dendl;
+ return 0;
+}
+
+int BlueStore::collection_bits(CollectionHandle& ch)
+{
+ dout(15) << __func__ << " " << ch->cid << dendl;
+ Collection *c = static_cast<Collection*>(ch.get());
+ std::shared_lock l(c->lock);
+ dout(10) << __func__ << " " << ch->cid << " = " << c->cnode.bits << dendl;
+ return c->cnode.bits;
+}
+
+int BlueStore::collection_list(
+ CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
+ vector<ghobject_t> *ls, ghobject_t *pnext)
+{
+ Collection *c = static_cast<Collection *>(c_.get());
+ c->flush();
+ dout(15) << __func__ << " " << c->cid
+ << " start " << start << " end " << end << " max " << max << dendl;
+ int r;
+ {
+ std::shared_lock l(c->lock);
+ r = _collection_list(c, start, end, max, false, ls, pnext);
+ }
+
+ dout(10) << __func__ << " " << c->cid
+ << " start " << start << " end " << end << " max " << max
+ << " = " << r << ", ls.size() = " << ls->size()
+ << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
+ return r;
+}
+
+int BlueStore::collection_list_legacy(
+ CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
+ vector<ghobject_t> *ls, ghobject_t *pnext)
+{
+ Collection *c = static_cast<Collection *>(c_.get());
+ c->flush();
+ dout(15) << __func__ << " " << c->cid
+ << " start " << start << " end " << end << " max " << max << dendl;
+ int r;
+ {
+ std::shared_lock l(c->lock);
+ r = _collection_list(c, start, end, max, true, ls, pnext);
+ }
+
+ dout(10) << __func__ << " " << c->cid
+ << " start " << start << " end " << end << " max " << max
+ << " = " << r << ", ls.size() = " << ls->size()
+ << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
+ return r;
+}
+
+int BlueStore::_collection_list(
+ Collection *c, const ghobject_t& start, const ghobject_t& end, int max,
+ bool legacy, vector<ghobject_t> *ls, ghobject_t *pnext)
+{
+
+ if (!c->exists)
+ return -ENOENT;
+
+ ghobject_t static_next;
+ std::unique_ptr<CollectionListIterator> it;
+ ghobject_t coll_range_temp_start, coll_range_temp_end;
+ ghobject_t coll_range_start, coll_range_end;
+ ghobject_t pend;
+ bool temp;
+
+ if (!pnext)
+ pnext = &static_next;
+
+ auto log_latency = make_scope_guard(
+ [&, start_time = mono_clock::now(), func_name = __func__] {
+ log_latency_fn(
+ func_name,
+ l_bluestore_clist_lat,
+ mono_clock::now() - start_time,
+ cct->_conf->bluestore_log_collection_list_age,
+ [&](const ceph::timespan& lat) {
+ ostringstream ostr;
+ ostr << ", lat = " << timespan_str(lat)
+ << " cid =" << c->cid
+ << " start " << start << " end " << end
+ << " max " << max;
+ return ostr.str();
+ });
+ });
+
+ if (start.is_max() || start.hobj.is_max()) {
+ *pnext = ghobject_t::get_max();
+ return 0;
+ }
+ get_coll_range(c->cid, c->cnode.bits, &coll_range_temp_start,
+ &coll_range_temp_end, &coll_range_start, &coll_range_end, legacy);
+ dout(20) << __func__
+ << " range " << coll_range_temp_start
+ << " to " << coll_range_temp_end
+ << " and " << coll_range_start
+ << " to " << coll_range_end
+ << " start " << start << dendl;
+ if (legacy) {
+ it = std::make_unique<SimpleCollectionListIterator>(
+ cct, db->get_iterator(PREFIX_OBJ));
+ } else {
+ it = std::make_unique<SortedCollectionListIterator>(
+ db->get_iterator(PREFIX_OBJ));
+ }
+ if (start == ghobject_t() ||
+ start.hobj == hobject_t() ||
+ start == c->cid.get_min_hobj()) {
+ it->upper_bound(coll_range_temp_start);
+ temp = true;
+ } else {
+ if (start.hobj.is_temp()) {
+ temp = true;
+ ceph_assert(start >= coll_range_temp_start && start < coll_range_temp_end);
+ } else {
+ temp = false;
+ ceph_assert(start >= coll_range_start && start < coll_range_end);
+ }
+ dout(20) << __func__ << " temp=" << (int)temp << dendl;
+ it->lower_bound(start);
+ }
+ if (end.hobj.is_max()) {
+ pend = temp ? coll_range_temp_end : coll_range_end;
+ } else {
+ if (end.hobj.is_temp()) {
+ if (temp) {
+ pend = end;
+ } else {
+ *pnext = ghobject_t::get_max();
+ return 0;
+ }
+ } else {
+ pend = temp ? coll_range_temp_end : end;
+ }
+ }
+ dout(20) << __func__ << " pend " << pend << dendl;
+ while (true) {
+ if (!it->valid() || it->is_ge(pend)) {
+ if (!it->valid())
+ dout(20) << __func__ << " iterator not valid (end of db?)" << dendl;
+ else
+ dout(20) << __func__ << " oid " << it->oid() << " >= " << pend << dendl;
+ if (temp) {
+ if (end.hobj.is_temp()) {
+ if (it->valid() && it->is_lt(coll_range_temp_end)) {
+ *pnext = it->oid();
+ return 0;
+ }
+ break;
+ }
+ dout(30) << __func__ << " switch to non-temp namespace" << dendl;
+ temp = false;
+ it->upper_bound(coll_range_start);
+ if (end.hobj.is_max())
+ pend = coll_range_end;
+ else
+ pend = end;
+ dout(30) << __func__ << " pend " << pend << dendl;
+ continue;
+ }
+ if (it->valid() && it->is_lt(coll_range_end)) {
+ *pnext = it->oid();
+ return 0;
+ }
+ break;
+ }
+ dout(20) << __func__ << " oid " << it->oid() << " end " << end << dendl;
+ if (ls->size() >= (unsigned)max) {
+ dout(20) << __func__ << " reached max " << max << dendl;
+ *pnext = it->oid();
+ return 0;
+ }
+ ls->push_back(it->oid());
+ it->next();
+ }
+ *pnext = ghobject_t::get_max();
+ return 0;
+}
+
+int BlueStore::omap_get(
+ CollectionHandle &c_, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ bufferlist *header, ///< [out] omap header
+ map<string, bufferlist> *out /// < [out] Key to value map
+ )
+{
+ Collection *c = static_cast<Collection *>(c_.get());
+ return _omap_get(c, oid, header, out);
+}
+
+int BlueStore::_omap_get(
+ Collection *c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ bufferlist *header, ///< [out] omap header
+ map<string, bufferlist> *out /// < [out] Key to value map
+ )
+{
+ dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
+ if (!c->exists)
+ return -ENOENT;
+ std::shared_lock l(c->lock);
+ int r = 0;
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists) {
+ r = -ENOENT;
+ goto out;
+ }
+ r = _onode_omap_get(o, header, out);
+ out:
+ dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
+ << dendl;
+ return r;
+}
+
+int BlueStore::_onode_omap_get(
+ const OnodeRef &o, ///< [in] Object containing omap
+ bufferlist *header, ///< [out] omap header
+ map<string, bufferlist> *out /// < [out] Key to value map
+)
+{
+ int r = 0;
+ if (!o || !o->exists) {
+ r = -ENOENT;
+ goto out;
+ }
+ if (!o->onode.has_omap())
+ goto out;
+ o->flush();
+ {
+ const string& prefix = o->get_omap_prefix();
+ string head, tail;
+ o->get_omap_header(&head);
+ o->get_omap_tail(&tail);
+ KeyValueDB::Iterator it = db->get_iterator(prefix, 0, KeyValueDB::IteratorBounds{head, tail});
+ it->lower_bound(head);
+ while (it->valid()) {
+ if (it->key() == head) {
+ dout(30) << __func__ << " got header" << dendl;
+ *header = it->value();
+ } else if (it->key() >= tail) {
+ dout(30) << __func__ << " reached tail" << dendl;
+ break;
+ } else {
+ string user_key;
+ o->decode_omap_key(it->key(), &user_key);
+ dout(20) << __func__ << " got " << pretty_binary_string(it->key())
+ << " -> " << user_key << dendl;
+ (*out)[user_key] = it->value();
+ }
+ it->next();
+ }
+ }
+out:
+ return r;
+}
+
+int BlueStore::omap_get_header(
+ CollectionHandle &c_, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ bufferlist *header, ///< [out] omap header
+ bool allow_eio ///< [in] don't assert on eio
+ )
+{
+ Collection *c = static_cast<Collection *>(c_.get());
+ dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
+ if (!c->exists)
+ return -ENOENT;
+ std::shared_lock l(c->lock);
+ int r = 0;
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists) {
+ r = -ENOENT;
+ goto out;
+ }
+ if (!o->onode.has_omap())
+ goto out;
+ o->flush();
+ {
+ string head;
+ o->get_omap_header(&head);
+ if (db->get(o->get_omap_prefix(), head, header) >= 0) {
+ dout(30) << __func__ << " got header" << dendl;
+ } else {
+ dout(30) << __func__ << " no header" << dendl;
+ }
+ }
+ out:
+ dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
+ << dendl;
+ return r;
+}
+
+int BlueStore::omap_get_keys(
+ CollectionHandle &c_, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ set<string> *keys ///< [out] Keys defined on oid
+ )
+{
+ Collection *c = static_cast<Collection *>(c_.get());
+ dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
+ if (!c->exists)
+ return -ENOENT;
+ auto start1 = mono_clock::now();
+ std::shared_lock l(c->lock);
+ int r = 0;
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists) {
+ r = -ENOENT;
+ goto out;
+ }
+ if (!o->onode.has_omap())
+ goto out;
+ o->flush();
+ {
+ const string& prefix = o->get_omap_prefix();
+ string head, tail;
+ o->get_omap_key(string(), &head);
+ o->get_omap_tail(&tail);
+ KeyValueDB::Iterator it = db->get_iterator(prefix, 0, KeyValueDB::IteratorBounds{head, tail});
+ it->lower_bound(head);
+ while (it->valid()) {
+ if (it->key() >= tail) {
+ dout(30) << __func__ << " reached tail" << dendl;
+ break;
+ }
+ string user_key;
+ o->decode_omap_key(it->key(), &user_key);
+ dout(20) << __func__ << " got " << pretty_binary_string(it->key())
+ << " -> " << user_key << dendl;
+ keys->insert(user_key);
+ it->next();
+ }
+ }
+ out:
+ c->store->log_latency(
+ __func__,
+ l_bluestore_omap_get_keys_lat,
+ mono_clock::now() - start1,
+ c->store->cct->_conf->bluestore_log_omap_iterator_age);
+
+ dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
+ << dendl;
+ return r;
+}
+
+int BlueStore::omap_get_values(
+ CollectionHandle &c_, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ const set<string> &keys, ///< [in] Keys to get
+ map<string, bufferlist> *out ///< [out] Returned keys and values
+ )
+{
+ Collection *c = static_cast<Collection *>(c_.get());
+ dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
+ if (!c->exists)
+ return -ENOENT;
+ std::shared_lock l(c->lock);
+ auto start1 = mono_clock::now();
+ int r = 0;
+ string final_key;
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists) {
+ r = -ENOENT;
+ goto out;
+ }
+ if (!o->onode.has_omap()) {
+ goto out;
+ }
+ o->flush();
+ {
+ const string& prefix = o->get_omap_prefix();
+ o->get_omap_key(string(), &final_key);
+ size_t base_key_len = final_key.size();
+ for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
+ final_key.resize(base_key_len); // keep prefix
+ final_key += *p;
+ bufferlist val;
+ if (db->get(prefix, final_key, &val) >= 0) {
+ dout(30) << __func__ << " got " << pretty_binary_string(final_key)
+ << " -> " << *p << dendl;
+ out->insert(make_pair(*p, val));
+ }
+ }
+ }
+ out:
+ c->store->log_latency(
+ __func__,
+ l_bluestore_omap_get_values_lat,
+ mono_clock::now() - start1,
+ c->store->cct->_conf->bluestore_log_omap_iterator_age);
+
+ dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
+ << dendl;
+ return r;
+}
+
+#ifdef WITH_SEASTAR
+int BlueStore::omap_get_values(
+ CollectionHandle &c_, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ const std::optional<string> &start_after, ///< [in] Keys to get
+ map<string, bufferlist> *output ///< [out] Returned keys and values
+ )
+{
+ Collection *c = static_cast<Collection *>(c_.get());
+ dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
+ if (!c->exists)
+ return -ENOENT;
+ std::shared_lock l(c->lock);
+ int r = 0;
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists) {
+ r = -ENOENT;
+ goto out;
+ }
+ if (!o->onode.has_omap()) {
+ goto out;
+ }
+ o->flush();
+ {
+ ObjectMap::ObjectMapIterator iter = get_omap_iterator(c_, oid);
+ if (!iter) {
+ r = -ENOENT;
+ goto out;
+ }
+ iter->upper_bound(*start_after);
+ for (; iter->valid(); iter->next()) {
+ output->insert(make_pair(iter->key(), iter->value()));
+ }
+ }
+
+out:
+ dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
+ << dendl;
+ return r;
+}
+#endif
+
+int BlueStore::omap_check_keys(
+ CollectionHandle &c_, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ const set<string> &keys, ///< [in] Keys to check
+ set<string> *out ///< [out] Subset of keys defined on oid
+ )
+{
+ Collection *c = static_cast<Collection *>(c_.get());
+ dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
+ if (!c->exists)
+ return -ENOENT;
+ std::shared_lock l(c->lock);
+ int r = 0;
+ string final_key;
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists) {
+ r = -ENOENT;
+ goto out;
+ }
+ if (!o->onode.has_omap()) {
+ goto out;
+ }
+ o->flush();
+ {
+ const string& prefix = o->get_omap_prefix();
+ o->get_omap_key(string(), &final_key);
+ size_t base_key_len = final_key.size();
+ for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
+ final_key.resize(base_key_len); // keep prefix
+ final_key += *p;
+ bufferlist val;
+ if (db->get(prefix, final_key, &val) >= 0) {
+ dout(30) << __func__ << " have " << pretty_binary_string(final_key)
+ << " -> " << *p << dendl;
+ out->insert(*p);
+ } else {
+ dout(30) << __func__ << " miss " << pretty_binary_string(final_key)
+ << " -> " << *p << dendl;
+ }
+ }
+ }
+ out:
+ dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
+ << dendl;
+ return r;
+}
+
+ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator(
+ CollectionHandle &c_, ///< [in] collection
+ const ghobject_t &oid ///< [in] object
+ )
+{
+ Collection *c = static_cast<Collection *>(c_.get());
+ dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
+ if (!c->exists) {
+ return ObjectMap::ObjectMapIterator();
+ }
+ std::shared_lock l(c->lock);
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists) {
+ dout(10) << __func__ << " " << oid << "doesn't exist" <<dendl;
+ return ObjectMap::ObjectMapIterator();
+ }
+ o->flush();
+ dout(10) << __func__ << " has_omap = " << (int)o->onode.has_omap() <<dendl;
+ auto bounds = KeyValueDB::IteratorBounds();
+ if (o->onode.has_omap()) {
+ std::string lower_bound, upper_bound;
+ o->get_omap_key(string(), &lower_bound);
+ o->get_omap_tail(&upper_bound);
+ bounds.lower_bound = std::move(lower_bound);
+ bounds.upper_bound = std::move(upper_bound);
+ }
+ KeyValueDB::Iterator it = db->get_iterator(o->get_omap_prefix(), 0, std::move(bounds));
+ return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(logger,c, o, it));
+}
+
+// -----------------
+// write helpers
+
+uint64_t BlueStore::_get_ondisk_reserved() const {
+ ceph_assert(min_alloc_size);
+ return round_up_to(
+ std::max<uint64_t>(SUPER_RESERVED, min_alloc_size), min_alloc_size);
+}
+
+void BlueStore::_prepare_ondisk_format_super(KeyValueDB::Transaction& t)
+{
+ dout(10) << __func__ << " ondisk_format " << ondisk_format
+ << " min_compat_ondisk_format " << min_compat_ondisk_format
+ << dendl;
+ ceph_assert(ondisk_format == latest_ondisk_format);
+ {
+ bufferlist bl;
+ encode(ondisk_format, bl);
+ t->set(PREFIX_SUPER, "ondisk_format", bl);
+ }
+ {
+ bufferlist bl;
+ encode(min_compat_ondisk_format, bl);
+ t->set(PREFIX_SUPER, "min_compat_ondisk_format", bl);
+ }
+}
+
+int BlueStore::_open_super_meta()
+{
+ // nid
+ {
+ nid_max = 0;
+ bufferlist bl;
+ db->get(PREFIX_SUPER, "nid_max", &bl);
+ auto p = bl.cbegin();
+ try {
+ uint64_t v;
+ decode(v, p);
+ nid_max = v;
+ } catch (ceph::buffer::error& e) {
+ derr << __func__ << " unable to read nid_max" << dendl;
+ return -EIO;
+ }
+ dout(1) << __func__ << " old nid_max " << nid_max << dendl;
+ nid_last = nid_max.load();
+ }
+
+ // blobid
+ {
+ blobid_max = 0;
+ bufferlist bl;
+ db->get(PREFIX_SUPER, "blobid_max", &bl);
+ auto p = bl.cbegin();
+ try {
+ uint64_t v;
+ decode(v, p);
+ blobid_max = v;
+ } catch (ceph::buffer::error& e) {
+ derr << __func__ << " unable to read blobid_max" << dendl;
+ return -EIO;
+ }
+ dout(1) << __func__ << " old blobid_max " << blobid_max << dendl;
+ blobid_last = blobid_max.load();
+ }
+
+ // freelist
+ {
+ bufferlist bl;
+ db->get(PREFIX_SUPER, "freelist_type", &bl);
+ if (bl.length()) {
+ freelist_type = std::string(bl.c_str(), bl.length());
+ } else {
+ ceph_abort_msg("Not Support extent freelist manager");
+ }
+ dout(5) << __func__ << "::NCB::freelist_type=" << freelist_type << dendl;
+ }
+ // ondisk format
+ int32_t compat_ondisk_format = 0;
+ {
+ bufferlist bl;
+ int r = db->get(PREFIX_SUPER, "ondisk_format", &bl);
+ if (r < 0) {
+ // base case: kraken bluestore is v1 and readable by v1
+ dout(20) << __func__ << " missing ondisk_format; assuming kraken"
+ << dendl;
+ ondisk_format = 1;
+ compat_ondisk_format = 1;
+ } else {
+ auto p = bl.cbegin();
+ try {
+ decode(ondisk_format, p);
+ } catch (ceph::buffer::error& e) {
+ derr << __func__ << " unable to read ondisk_format" << dendl;
+ return -EIO;
+ }
+ bl.clear();
+ {
+ r = db->get(PREFIX_SUPER, "min_compat_ondisk_format", &bl);
+ ceph_assert(!r);
+ auto p = bl.cbegin();
+ try {
+ decode(compat_ondisk_format, p);
+ } catch (ceph::buffer::error& e) {
+ derr << __func__ << " unable to read compat_ondisk_format" << dendl;
+ return -EIO;
+ }
+ }
+ }
+ dout(1) << __func__ << " ondisk_format " << ondisk_format
+ << " compat_ondisk_format " << compat_ondisk_format
+ << dendl;
+ }
+
+ if (latest_ondisk_format < compat_ondisk_format) {
+ derr << __func__ << " compat_ondisk_format is "
+ << compat_ondisk_format << " but we only understand version "
+ << latest_ondisk_format << dendl;
+ return -EPERM;
+ }
+
+ {
+ bufferlist bl;
+ db->get(PREFIX_SUPER, "min_alloc_size", &bl);
+ auto p = bl.cbegin();
+ try {
+ uint64_t val;
+ decode(val, p);
+ min_alloc_size = val;
+ min_alloc_size_order = std::countr_zero(val);
+ min_alloc_size_mask = min_alloc_size - 1;
+
+ ceph_assert(min_alloc_size == 1u << min_alloc_size_order);
+ } catch (ceph::buffer::error& e) {
+ derr << __func__ << " unable to read min_alloc_size" << dendl;
+ return -EIO;
+ }
+ dout(1) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
+ << std::dec << dendl;
+ logger->set(l_bluestore_alloc_unit, min_alloc_size);
+ }
+
+ // smr fields
+ {
+ bufferlist bl;
+ int r = db->get(PREFIX_SUPER, "zone_size", &bl);
+ if (r >= 0) {
+ auto p = bl.cbegin();
+ decode(zone_size, p);
+ dout(1) << __func__ << " zone_size 0x" << std::hex << zone_size << std::dec << dendl;
+ ceph_assert(bdev->is_smr());
+ } else {
+ ceph_assert(!bdev->is_smr());
+ }
+ }
+ {
+ bufferlist bl;
+ int r = db->get(PREFIX_SUPER, "first_sequential_zone", &bl);
+ if (r >= 0) {
+ auto p = bl.cbegin();
+ decode(first_sequential_zone, p);
+ dout(1) << __func__ << " first_sequential_zone 0x" << std::hex
+ << first_sequential_zone << std::dec << dendl;
+ ceph_assert(bdev->is_smr());
+ } else {
+ ceph_assert(!bdev->is_smr());
+ }
+ }
+
+ _set_per_pool_omap();
+
+ _open_statfs();
+ _set_alloc_sizes();
+ _set_throttle_params();
+
+ _set_csum();
+ _set_compression();
+ _set_blob_size();
+
+ _validate_bdev();
+ return 0;
+}
+
+int BlueStore::_upgrade_super()
+{
+ dout(1) << __func__ << " from " << ondisk_format << ", latest "
+ << latest_ondisk_format << dendl;
+ if (ondisk_format < latest_ondisk_format) {
+ ceph_assert(ondisk_format > 0);
+ ceph_assert(ondisk_format < latest_ondisk_format);
+
+ KeyValueDB::Transaction t = db->get_transaction();
+ if (ondisk_format == 1) {
+ // changes:
+ // - super: added ondisk_format
+ // - super: added min_readable_ondisk_format
+ // - super: added min_compat_ondisk_format
+ // - super: added min_alloc_size
+ // - super: removed min_min_alloc_size
+ {
+ bufferlist bl;
+ db->get(PREFIX_SUPER, "min_min_alloc_size", &bl);
+ auto p = bl.cbegin();
+ try {
+ uint64_t val;
+ decode(val, p);
+ min_alloc_size = val;
+ } catch (ceph::buffer::error& e) {
+ derr << __func__ << " failed to read min_min_alloc_size" << dendl;
+ return -EIO;
+ }
+ t->set(PREFIX_SUPER, "min_alloc_size", bl);
+ t->rmkey(PREFIX_SUPER, "min_min_alloc_size");
+ }
+ ondisk_format = 2;
+ }
+ if (ondisk_format == 2) {
+ // changes:
+ // - onode has FLAG_PERPOOL_OMAP. Note that we do not know that *all*
+ // oondes are using the per-pool prefix until a repair is run; at that
+ // point the per_pool_omap=1 key will be set.
+ // - super: added per_pool_omap key, which indicates that *all* objects
+ // are using the new prefix and key format
+ ondisk_format = 3;
+ }
+ if (ondisk_format == 3) {
+ // changes:
+ // - FreelistManager keeps meta within bdev label
+ int r = _write_out_fm_meta(0);
+ ceph_assert(r == 0);
+ ondisk_format = 4;
+ }
+ // This to be the last operation
+ _prepare_ondisk_format_super(t);
+ int r = db->submit_transaction_sync(t);
+ ceph_assert(r == 0);
+ }
+ // done
+ dout(1) << __func__ << " done" << dendl;
+ return 0;
+}
+
+void BlueStore::_assign_nid(TransContext *txc, OnodeRef& o)
+{
+ if (o->onode.nid) {
+ ceph_assert(o->exists);
+ return;
+ }
+ uint64_t nid = ++nid_last;
+ dout(20) << __func__ << " " << nid << dendl;
+ o->onode.nid = nid;
+ txc->last_nid = nid;
+ o->exists = true;
+}
+
+uint64_t BlueStore::_assign_blobid(TransContext *txc)
+{
+ uint64_t bid = ++blobid_last;
+ dout(20) << __func__ << " " << bid << dendl;
+ txc->last_blobid = bid;
+ return bid;
+}
+
+void BlueStore::get_db_statistics(Formatter *f)
+{
+ db->get_statistics(f);
+}
+
+BlueStore::TransContext *BlueStore::_txc_create(
+ Collection *c, OpSequencer *osr,
+ list<Context*> *on_commits,
+ TrackedOpRef osd_op)
+{
+ TransContext *txc = new TransContext(cct, c, osr, on_commits);
+ txc->t = db->get_transaction();
+
+#ifdef WITH_BLKIN
+ if (osd_op && osd_op->pg_trace) {
+ txc->trace.init("TransContext", &trace_endpoint,
+ &osd_op->pg_trace);
+ txc->trace.event("txc create");
+ txc->trace.keyval("txc seq", txc->seq);
+ }
+#endif
+
+ osr->queue_new(txc);
+ dout(20) << __func__ << " osr " << osr << " = " << txc
+ << " seq " << txc->seq << dendl;
+ return txc;
+}
+
+void BlueStore::_txc_calc_cost(TransContext *txc)
+{
+ // one "io" for the kv commit
+ auto ios = 1 + txc->ioc.get_num_ios();
+ auto cost = throttle_cost_per_io.load();
+ txc->cost = ios * cost + txc->bytes;
+ txc->ios = ios;
+ dout(10) << __func__ << " " << txc << " cost " << txc->cost << " ("
+ << ios << " ios * " << cost << " + " << txc->bytes
+ << " bytes)" << dendl;
+}
+
+void BlueStore::_txc_update_store_statfs(TransContext *txc)
+{
+ if (txc->statfs_delta.is_empty())
+ return;
+
+ logger->inc(l_bluestore_allocated, txc->statfs_delta.allocated());
+ logger->inc(l_bluestore_stored, txc->statfs_delta.stored());
+ logger->inc(l_bluestore_compressed, txc->statfs_delta.compressed());
+ logger->inc(l_bluestore_compressed_allocated, txc->statfs_delta.compressed_allocated());
+ logger->inc(l_bluestore_compressed_original, txc->statfs_delta.compressed_original());
+
+ if (per_pool_stat_collection) {
+ if (!is_statfs_recoverable()) {
+ bufferlist bl;
+ txc->statfs_delta.encode(bl);
+ string key;
+ get_pool_stat_key(txc->osd_pool_id, &key);
+ txc->t->merge(PREFIX_STAT, key, bl);
+ }
+
+ std::lock_guard l(vstatfs_lock);
+ auto& stats = osd_pools[txc->osd_pool_id];
+ stats += txc->statfs_delta;
+
+ vstatfs += txc->statfs_delta; //non-persistent in this mode
+
+ } else {
+ if (!is_statfs_recoverable()) {
+ bufferlist bl;
+ txc->statfs_delta.encode(bl);
+ txc->t->merge(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl);
+ }
+
+ std::lock_guard l(vstatfs_lock);
+ vstatfs += txc->statfs_delta;
+ }
+ txc->statfs_delta.reset();
+}
+
+void BlueStore::_txc_state_proc(TransContext *txc)
+{
+ while (true) {
+ dout(10) << __func__ << " txc " << txc
+ << " " << txc->get_state_name() << dendl;
+ switch (txc->get_state()) {
+ case TransContext::STATE_PREPARE:
+ throttle.log_state_latency(*txc, logger, l_bluestore_state_prepare_lat);
+ if (txc->ioc.has_pending_aios()) {
+ txc->set_state(TransContext::STATE_AIO_WAIT);
+#ifdef WITH_BLKIN
+ if (txc->trace) {
+ txc->trace.keyval("pending aios", txc->ioc.num_pending.load());
+ }
+#endif
+ txc->had_ios = true;
+ _txc_aio_submit(txc);
+ return;
+ }
+ // ** fall-thru **
+
+ case TransContext::STATE_AIO_WAIT:
+ {
+ mono_clock::duration lat = throttle.log_state_latency(
+ *txc, logger, l_bluestore_state_aio_wait_lat);
+ if (ceph::to_seconds<double>(lat) >= cct->_conf->bluestore_log_op_age) {
+ dout(0) << __func__ << " slow aio_wait, txc = " << txc
+ << ", latency = " << lat
+ << dendl;
+ }
+ }
+
+ _txc_finish_io(txc); // may trigger blocked txc's too
+ return;
+
+ case TransContext::STATE_IO_DONE:
+ ceph_assert(ceph_mutex_is_locked(txc->osr->qlock)); // see _txc_finish_io
+ if (txc->had_ios) {
+ ++txc->osr->txc_with_unstable_io;
+ }
+ throttle.log_state_latency(*txc, logger, l_bluestore_state_io_done_lat);
+ txc->set_state(TransContext::STATE_KV_QUEUED);
+ if (cct->_conf->bluestore_sync_submit_transaction) {
+ if (txc->last_nid >= nid_max ||
+ txc->last_blobid >= blobid_max) {
+ dout(20) << __func__
+ << " last_{nid,blobid} exceeds max, submit via kv thread"
+ << dendl;
+ } else if (txc->osr->kv_committing_serially) {
+ dout(20) << __func__ << " prior txc submitted via kv thread, us too"
+ << dendl;
+ // note: this is starvation-prone. once we have a txc in a busy
+ // sequencer that is committing serially it is possible to keep
+ // submitting new transactions fast enough that we get stuck doing
+ // so. the alternative is to block here... fixme?
+ } else if (txc->osr->txc_with_unstable_io) {
+ dout(20) << __func__ << " prior txc(s) with unstable ios "
+ << txc->osr->txc_with_unstable_io.load() << dendl;
+ } else if (cct->_conf->bluestore_debug_randomize_serial_transaction &&
+ rand() % cct->_conf->bluestore_debug_randomize_serial_transaction
+ == 0) {
+ dout(20) << __func__ << " DEBUG randomly forcing submit via kv thread"
+ << dendl;
+ } else {
+ _txc_apply_kv(txc, true);
+ }
+ }
+ {
+ std::lock_guard l(kv_lock);
+ kv_queue.push_back(txc);
+ if (!kv_sync_in_progress) {
+ kv_sync_in_progress = true;
+ kv_cond.notify_one();
+ }
+ if (txc->get_state() != TransContext::STATE_KV_SUBMITTED) {
+ kv_queue_unsubmitted.push_back(txc);
+ ++txc->osr->kv_committing_serially;
+ }
+ if (txc->had_ios)
+ kv_ios++;
+ kv_throttle_costs += txc->cost;
+ }
+ return;
+ case TransContext::STATE_KV_SUBMITTED:
+ _txc_committed_kv(txc);
+ // ** fall-thru **
+
+ case TransContext::STATE_KV_DONE:
+ throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_done_lat);
+ if (txc->deferred_txn) {
+ txc->set_state(TransContext::STATE_DEFERRED_QUEUED);
+ _deferred_queue(txc);
+ return;
+ }
+ txc->set_state(TransContext::STATE_FINISHING);
+ break;
+
+ case TransContext::STATE_DEFERRED_CLEANUP:
+ throttle.log_state_latency(*txc, logger, l_bluestore_state_deferred_cleanup_lat);
+ txc->set_state(TransContext::STATE_FINISHING);
+ // ** fall-thru **
+
+ case TransContext::STATE_FINISHING:
+ throttle.log_state_latency(*txc, logger, l_bluestore_state_finishing_lat);
+ _txc_finish(txc);
+ return;
+
+ default:
+ derr << __func__ << " unexpected txc " << txc
+ << " state " << txc->get_state_name() << dendl;
+ ceph_abort_msg("unexpected txc state");
+ return;
+ }
+ }
+}
+
+void BlueStore::_txc_finish_io(TransContext *txc)
+{
+ dout(20) << __func__ << " " << txc << dendl;
+
+ /*
+ * we need to preserve the order of kv transactions,
+ * even though aio will complete in any order.
+ */
+
+ OpSequencer *osr = txc->osr.get();
+ std::lock_guard l(osr->qlock);
+ txc->set_state(TransContext::STATE_IO_DONE);
+ txc->ioc.release_running_aios();
+ OpSequencer::q_list_t::iterator p = osr->q.iterator_to(*txc);
+ while (p != osr->q.begin()) {
+ --p;
+ if (p->get_state() < TransContext::STATE_IO_DONE) {
+ dout(20) << __func__ << " " << txc << " blocked by " << &*p << " "
+ << p->get_state_name() << dendl;
+ return;
+ }
+ if (p->get_state() > TransContext::STATE_IO_DONE) {
+ ++p;
+ break;
+ }
+ }
+ do {
+ _txc_state_proc(&*p++);
+ } while (p != osr->q.end() &&
+ p->get_state() == TransContext::STATE_IO_DONE);
+
+ if (osr->kv_submitted_waiters) {
+ osr->qcond.notify_all();
+ }
+}
+
+void BlueStore::_txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t)
+{
+ dout(20) << __func__ << " txc " << txc
+ << " onodes " << txc->onodes
+ << " shared_blobs " << txc->shared_blobs
+ << dendl;
+
+ // finalize onodes
+ for (auto o : txc->onodes) {
+ _record_onode(o, t);
+ o->flushing_count++;
+ }
+
+ // objects we modified but didn't affect the onode
+ auto p = txc->modified_objects.begin();
+ while (p != txc->modified_objects.end()) {
+ if (txc->onodes.count(*p) == 0) {
+ (*p)->flushing_count++;
+ ++p;
+ } else {
+ // remove dups with onodes list to avoid problems in _txc_finish
+ p = txc->modified_objects.erase(p);
+ }
+ }
+
+ // finalize shared_blobs
+ for (auto sb : txc->shared_blobs) {
+ string key;
+ auto sbid = sb->get_sbid();
+ get_shared_blob_key(sbid, &key);
+ if (sb->persistent->empty()) {
+ dout(20) << __func__ << " shared_blob 0x"
+ << std::hex << sbid << std::dec
+ << " is empty" << dendl;
+ t->rmkey(PREFIX_SHARED_BLOB, key);
+ } else {
+ bufferlist bl;
+ encode(*(sb->persistent), bl);
+ dout(20) << __func__ << " shared_blob 0x"
+ << std::hex << sbid << std::dec
+ << " is " << bl.length() << " " << *sb << dendl;
+ t->set(PREFIX_SHARED_BLOB, key, bl);
+ }
+ }
+}
+
+void BlueStore::BSPerfTracker::update_from_perfcounters(
+ PerfCounters &logger)
+{
+ os_commit_latency_ns.consume_next(
+ logger.get_tavg_ns(
+ l_bluestore_commit_lat));
+ os_apply_latency_ns.consume_next(
+ logger.get_tavg_ns(
+ l_bluestore_commit_lat));
+}
+
+void BlueStore::_txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t)
+{
+ dout(20) << __func__ << " txc " << txc << std::hex
+ << " allocated 0x" << txc->allocated
+ << " released 0x" << txc->released
+ << std::dec << dendl;
+
+ if (!fm->is_null_manager())
+ {
+ // We have to handle the case where we allocate *and* deallocate the
+ // same region in this transaction. The freelist doesn't like that.
+ // (Actually, the only thing that cares is the BitmapFreelistManager
+ // debug check. But that's important.)
+ interval_set<uint64_t> tmp_allocated, tmp_released;
+ interval_set<uint64_t> *pallocated = &txc->allocated;
+ interval_set<uint64_t> *preleased = &txc->released;
+ if (!txc->allocated.empty() && !txc->released.empty()) {
+ interval_set<uint64_t> overlap;
+ overlap.intersection_of(txc->allocated, txc->released);
+ if (!overlap.empty()) {
+ tmp_allocated = txc->allocated;
+ tmp_allocated.subtract(overlap);
+ tmp_released = txc->released;
+ tmp_released.subtract(overlap);
+ dout(20) << __func__ << " overlap 0x" << std::hex << overlap
+ << ", new allocated 0x" << tmp_allocated
+ << " released 0x" << tmp_released << std::dec
+ << dendl;
+ pallocated = &tmp_allocated;
+ preleased = &tmp_released;
+ }
+ }
+
+ // update freelist with non-overlap sets
+ for (interval_set<uint64_t>::iterator p = pallocated->begin();
+ p != pallocated->end();
+ ++p) {
+ fm->allocate(p.get_start(), p.get_len(), t);
+ }
+ for (interval_set<uint64_t>::iterator p = preleased->begin();
+ p != preleased->end();
+ ++p) {
+ dout(20) << __func__ << " release 0x" << std::hex << p.get_start()
+ << "~" << p.get_len() << std::dec << dendl;
+ fm->release(p.get_start(), p.get_len(), t);
+ }
+ }
+
+#ifdef HAVE_LIBZBD
+ if (bdev->is_smr()) {
+ for (auto& i : txc->old_zone_offset_refs) {
+ dout(20) << __func__ << " rm ref zone 0x" << std::hex << i.first.second
+ << " offset 0x" << i.second << std::dec
+ << " -> " << i.first.first->oid << dendl;
+ string key;
+ get_zone_offset_object_key(i.first.second, i.second, i.first.first->oid, &key);
+ txc->t->rmkey(PREFIX_ZONED_CL_INFO, key);
+ }
+ for (auto& i : txc->new_zone_offset_refs) {
+ // (zone, offset) -> oid
+ dout(20) << __func__ << " add ref zone 0x" << std::hex << i.first.second
+ << " offset 0x" << i.second << std::dec
+ << " -> " << i.first.first->oid << dendl;
+ string key;
+ get_zone_offset_object_key(i.first.second, i.second, i.first.first->oid, &key);
+ bufferlist v;
+ txc->t->set(PREFIX_ZONED_CL_INFO, key, v);
+ }
+ }
+#endif
+
+ _txc_update_store_statfs(txc);
+}
+
+void BlueStore::_txc_apply_kv(TransContext *txc, bool sync_submit_transaction)
+{
+ ceph_assert(txc->get_state() == TransContext::STATE_KV_QUEUED);
+ {
+#if defined(WITH_LTTNG)
+ auto start = mono_clock::now();
+#endif
+
+#ifdef WITH_BLKIN
+ if (txc->trace) {
+ txc->trace.event("db async submit");
+ }
+#endif
+
+ int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction(txc->t);
+ ceph_assert(r == 0);
+ txc->set_state(TransContext::STATE_KV_SUBMITTED);
+ if (txc->osr->kv_submitted_waiters) {
+ std::lock_guard l(txc->osr->qlock);
+ txc->osr->qcond.notify_all();
+ }
+
+#if defined(WITH_LTTNG)
+ if (txc->tracing) {
+ tracepoint(
+ bluestore,
+ transaction_kv_submit_latency,
+ txc->osr->get_sequencer_id(),
+ txc->seq,
+ sync_submit_transaction,
+ ceph::to_seconds<double>(mono_clock::now() - start));
+ }
+#endif
+ }
+
+ for (auto ls : { &txc->onodes, &txc->modified_objects }) {
+ for (auto& o : *ls) {
+ dout(20) << __func__ << " onode " << o << " had " << o->flushing_count
+ << dendl;
+ if (--o->flushing_count == 0 && o->waiting_count.load()) {
+ std::lock_guard l(o->flush_lock);
+ o->flush_cond.notify_all();
+ }
+ }
+ }
+}
+
+void BlueStore::_txc_committed_kv(TransContext *txc)
+{
+ dout(20) << __func__ << " txc " << txc << dendl;
+ throttle.complete_kv(*txc);
+ {
+ std::lock_guard l(txc->osr->qlock);
+ txc->set_state(TransContext::STATE_KV_DONE);
+ if (txc->ch->commit_queue) {
+ txc->ch->commit_queue->queue(txc->oncommits);
+ } else {
+ finisher.queue(txc->oncommits);
+ }
+ }
+ throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_committing_lat);
+ log_latency_fn(
+ __func__,
+ l_bluestore_commit_lat,
+ mono_clock::now() - txc->start,
+ cct->_conf->bluestore_log_op_age,
+ [&](auto lat) {
+ return ", txc = " + stringify(txc);
+ }
+ );
+}
+
+void BlueStore::_txc_finish(TransContext *txc)
+{
+ dout(20) << __func__ << " " << txc << " onodes " << txc->onodes << dendl;
+ ceph_assert(txc->get_state() == TransContext::STATE_FINISHING);
+
+ for (auto& sb : txc->shared_blobs_written) {
+ sb->finish_write(txc->seq);
+ }
+ txc->shared_blobs_written.clear();
+
+ while (!txc->removed_collections.empty()) {
+ _queue_reap_collection(txc->removed_collections.front());
+ txc->removed_collections.pop_front();
+ }
+
+ OpSequencerRef osr = txc->osr;
+ bool empty = false;
+ bool submit_deferred = false;
+ OpSequencer::q_list_t releasing_txc;
+ {
+ std::lock_guard l(osr->qlock);
+ txc->set_state(TransContext::STATE_DONE);
+ bool notify = false;
+ while (!osr->q.empty()) {
+ TransContext *txc = &osr->q.front();
+ dout(20) << __func__ << " txc " << txc << " " << txc->get_state_name()
+ << dendl;
+ if (txc->get_state() != TransContext::STATE_DONE) {
+ if (txc->get_state() == TransContext::STATE_PREPARE &&
+ deferred_aggressive) {
+ // for _osr_drain_preceding()
+ notify = true;
+ }
+ if (txc->get_state() == TransContext::STATE_DEFERRED_QUEUED &&
+ osr->q.size() > g_conf()->bluestore_max_deferred_txc) {
+ submit_deferred = true;
+ }
+ break;
+ }
+
+ osr->q.pop_front();
+ releasing_txc.push_back(*txc);
+ }
+
+ if (osr->q.empty()) {
+ dout(20) << __func__ << " osr " << osr << " q now empty" << dendl;
+ empty = true;
+ }
+
+ // only drain()/drain_preceding() need wakeup,
+ // other cases use kv_submitted_waiters
+ if (notify || empty) {
+ osr->qcond.notify_all();
+ }
+ }
+
+ while (!releasing_txc.empty()) {
+ // release to allocator only after all preceding txc's have also
+ // finished any deferred writes that potentially land in these
+ // blocks
+ auto txc = &releasing_txc.front();
+ _txc_release_alloc(txc);
+ releasing_txc.pop_front();
+ throttle.log_state_latency(*txc, logger, l_bluestore_state_done_lat);
+ throttle.complete(*txc);
+ delete txc;
+ }
+
+ if (submit_deferred) {
+ // we're pinning memory; flush! we could be more fine-grained here but
+ // i'm not sure it's worth the bother.
+ deferred_try_submit();
+ }
+
+ if (empty && osr->zombie) {
+ std::lock_guard l(zombie_osr_lock);
+ if (zombie_osr_set.erase(osr->cid)) {
+ dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
+ } else {
+ dout(10) << __func__ << " empty zombie osr " << osr << " already reaped"
+ << dendl;
+ }
+ }
+}
+
+void BlueStore::_txc_release_alloc(TransContext *txc)
+{
+ bool discard_queued = false;
+ // it's expected we're called with lazy_release_lock already taken!
+ if (unlikely(cct->_conf->bluestore_debug_no_reuse_blocks)) {
+ goto out;
+ }
+ discard_queued = bdev->try_discard(txc->released);
+ // if async discard succeeded, will do alloc->release when discard callback
+ // else we should release here
+ if (!discard_queued) {
+ dout(10) << __func__ << "(sync) " << txc << " " << std::hex
+ << txc->released << std::dec << dendl;
+ alloc->release(txc->released);
+ }
+
+out:
+ txc->allocated.clear();
+ txc->released.clear();
+}
+
+void BlueStore::_osr_attach(Collection *c)
+{
+ // note: caller has coll_lock
+ auto q = coll_map.find(c->cid);
+ if (q != coll_map.end()) {
+ c->osr = q->second->osr;
+ ldout(cct, 10) << __func__ << " " << c->cid
+ << " reusing osr " << c->osr << " from existing coll "
+ << q->second << dendl;
+ } else {
+ std::lock_guard l(zombie_osr_lock);
+ auto p = zombie_osr_set.find(c->cid);
+ if (p == zombie_osr_set.end()) {
+ c->osr = ceph::make_ref<OpSequencer>(this, next_sequencer_id++, c->cid);
+ ldout(cct, 10) << __func__ << " " << c->cid
+ << " fresh osr " << c->osr << dendl;
+ } else {
+ c->osr = p->second;
+ zombie_osr_set.erase(p);
+ ldout(cct, 10) << __func__ << " " << c->cid
+ << " resurrecting zombie osr " << c->osr << dendl;
+ c->osr->zombie = false;
+ }
+ }
+}
+
+void BlueStore::_osr_register_zombie(OpSequencer *osr)
+{
+ std::lock_guard l(zombie_osr_lock);
+ dout(10) << __func__ << " " << osr << " " << osr->cid << dendl;
+ osr->zombie = true;
+ auto i = zombie_osr_set.emplace(osr->cid, osr);
+ // this is either a new insertion or the same osr is already there
+ ceph_assert(i.second || i.first->second == osr);
+}
+
+void BlueStore::_osr_drain_preceding(TransContext *txc)
+{
+ OpSequencer *osr = txc->osr.get();
+ dout(10) << __func__ << " " << txc << " osr " << osr << dendl;
+ ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
+ {
+ // submit anything pending
+ osr->deferred_lock.lock();
+ if (osr->deferred_pending && !osr->deferred_running) {
+ _deferred_submit_unlock(osr);
+ } else {
+ osr->deferred_lock.unlock();
+ }
+ }
+ {
+ // wake up any previously finished deferred events
+ std::lock_guard l(kv_lock);
+ if (!kv_sync_in_progress) {
+ kv_sync_in_progress = true;
+ kv_cond.notify_one();
+ }
+ }
+ osr->drain_preceding(txc);
+ --deferred_aggressive;
+ dout(10) << __func__ << " " << osr << " done" << dendl;
+}
+
+void BlueStore::_osr_drain(OpSequencer *osr)
+{
+ dout(10) << __func__ << " " << osr << dendl;
+ ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
+ {
+ // submit anything pending
+ osr->deferred_lock.lock();
+ if (osr->deferred_pending && !osr->deferred_running) {
+ _deferred_submit_unlock(osr);
+ } else {
+ osr->deferred_lock.unlock();
+ }
+ }
+ {
+ // wake up any previously finished deferred events
+ std::lock_guard l(kv_lock);
+ if (!kv_sync_in_progress) {
+ kv_sync_in_progress = true;
+ kv_cond.notify_one();
+ }
+ }
+ osr->drain();
+ --deferred_aggressive;
+ dout(10) << __func__ << " " << osr << " done" << dendl;
+}
+
+void BlueStore::_osr_drain_all()
+{
+ dout(10) << __func__ << dendl;
+
+ set<OpSequencerRef> s;
+ vector<OpSequencerRef> zombies;
+ {
+ std::shared_lock l(coll_lock);
+ for (auto& i : coll_map) {
+ s.insert(i.second->osr);
+ }
+ }
+ {
+ std::lock_guard l(zombie_osr_lock);
+ for (auto& i : zombie_osr_set) {
+ s.insert(i.second);
+ zombies.push_back(i.second);
+ }
+ }
+ dout(20) << __func__ << " osr_set " << s << dendl;
+
+ ++deferred_aggressive;
+ {
+ // submit anything pending
+ deferred_try_submit();
+ }
+ {
+ // wake up any previously finished deferred events
+ std::lock_guard l(kv_lock);
+ kv_cond.notify_one();
+ }
+ {
+ std::lock_guard l(kv_finalize_lock);
+ kv_finalize_cond.notify_one();
+ }
+ for (auto osr : s) {
+ dout(20) << __func__ << " drain " << osr << dendl;
+ osr->drain();
+ }
+ --deferred_aggressive;
+
+ {
+ std::lock_guard l(zombie_osr_lock);
+ for (auto& osr : zombies) {
+ if (zombie_osr_set.erase(osr->cid)) {
+ dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
+ ceph_assert(osr->q.empty());
+ } else if (osr->zombie) {
+ dout(10) << __func__ << " empty zombie osr " << osr
+ << " already reaped" << dendl;
+ ceph_assert(osr->q.empty());
+ } else {
+ dout(10) << __func__ << " empty zombie osr " << osr
+ << " resurrected" << dendl;
+ }
+ }
+ }
+
+ dout(10) << __func__ << " done" << dendl;
+}
+
+
+void BlueStore::_kv_start()
+{
+ dout(10) << __func__ << dendl;
+
+ finisher.start();
+ kv_sync_thread.create("bstore_kv_sync");
+ kv_finalize_thread.create("bstore_kv_final");
+}
+
+void BlueStore::_kv_stop()
+{
+ dout(10) << __func__ << dendl;
+ {
+ std::unique_lock l{kv_lock};
+ while (!kv_sync_started) {
+ kv_cond.wait(l);
+ }
+ kv_stop = true;
+ kv_cond.notify_all();
+ }
+ {
+ std::unique_lock l{kv_finalize_lock};
+ while (!kv_finalize_started) {
+ kv_finalize_cond.wait(l);
+ }
+ kv_finalize_stop = true;
+ kv_finalize_cond.notify_all();
+ }
+ kv_sync_thread.join();
+ kv_finalize_thread.join();
+ ceph_assert(removed_collections.empty());
+ {
+ std::lock_guard l(kv_lock);
+ kv_stop = false;
+ }
+ {
+ std::lock_guard l(kv_finalize_lock);
+ kv_finalize_stop = false;
+ }
+ dout(10) << __func__ << " stopping finishers" << dendl;
+ finisher.wait_for_empty();
+ finisher.stop();
+ dout(10) << __func__ << " stopped" << dendl;
+}
+
+void BlueStore::_kv_sync_thread()
+{
+ dout(10) << __func__ << " start" << dendl;
+ deque<DeferredBatch*> deferred_stable_queue; ///< deferred ios done + stable
+ std::unique_lock l{kv_lock};
+ ceph_assert(!kv_sync_started);
+ kv_sync_started = true;
+ kv_cond.notify_all();
+
+ auto t0 = mono_clock::now();
+ timespan twait = ceph::make_timespan(0);
+ size_t kv_submitted = 0;
+
+ while (true) {
+ auto period = cct->_conf->bluestore_kv_sync_util_logging_s;
+ auto observation_period =
+ ceph::make_timespan(period);
+ auto elapsed = mono_clock::now() - t0;
+ if (period && elapsed >= observation_period) {
+ dout(5) << __func__ << " utilization: idle "
+ << twait << " of " << elapsed
+ << ", submitted: " << kv_submitted
+ <<dendl;
+ t0 = mono_clock::now();
+ twait = ceph::make_timespan(0);
+ kv_submitted = 0;
+ }
+ ceph_assert(kv_committing.empty());
+ if (kv_queue.empty() &&
+ ((deferred_done_queue.empty() && deferred_stable_queue.empty()) ||
+ !deferred_aggressive)) {
+ if (kv_stop)
+ break;
+ dout(20) << __func__ << " sleep" << dendl;
+ auto t = mono_clock::now();
+ kv_sync_in_progress = false;
+ kv_cond.wait(l);
+ twait += mono_clock::now() - t;
+
+ dout(20) << __func__ << " wake" << dendl;
+ } else {
+ deque<TransContext*> kv_submitting;
+ deque<DeferredBatch*> deferred_done, deferred_stable;
+ uint64_t aios = 0, costs = 0;
+
+ dout(20) << __func__ << " committing " << kv_queue.size()
+ << " submitting " << kv_queue_unsubmitted.size()
+ << " deferred done " << deferred_done_queue.size()
+ << " stable " << deferred_stable_queue.size()
+ << dendl;
+ kv_committing.swap(kv_queue);
+ kv_submitting.swap(kv_queue_unsubmitted);
+ deferred_done.swap(deferred_done_queue);
+ deferred_stable.swap(deferred_stable_queue);
+ aios = kv_ios;
+ costs = kv_throttle_costs;
+ kv_ios = 0;
+ kv_throttle_costs = 0;
+ l.unlock();
+
+ dout(30) << __func__ << " committing " << kv_committing << dendl;
+ dout(30) << __func__ << " submitting " << kv_submitting << dendl;
+ dout(30) << __func__ << " deferred_done " << deferred_done << dendl;
+ dout(30) << __func__ << " deferred_stable " << deferred_stable << dendl;
+
+ auto start = mono_clock::now();
+
+ bool force_flush = false;
+ // if bluefs is sharing the same device as data (only), then we
+ // can rely on the bluefs commit to flush the device and make
+ // deferred aios stable. that means that if we do have done deferred
+ // txcs AND we are not on a single device, we need to force a flush.
+ if (bluefs && bluefs_layout.single_shared_device()) {
+ if (aios) {
+ force_flush = true;
+ } else if (kv_committing.empty() && deferred_stable.empty()) {
+ force_flush = true; // there's nothing else to commit!
+ } else if (deferred_aggressive) {
+ force_flush = true;
+ }
+ } else {
+ if (aios || !deferred_done.empty()) {
+ force_flush = true;
+ } else {
+ dout(20) << __func__ << " skipping flush (no aios, no deferred_done)" << dendl;
+ }
+ }
+
+ if (force_flush) {
+ dout(20) << __func__ << " num_aios=" << aios
+ << " force_flush=" << (int)force_flush
+ << ", flushing, deferred done->stable" << dendl;
+ // flush/barrier on block device
+ bdev->flush();
+
+ // if we flush then deferred done are now deferred stable
+ if (deferred_stable.empty()) {
+ deferred_stable.swap(deferred_done);
+ } else {
+ deferred_stable.insert(deferred_stable.end(), deferred_done.begin(),
+ deferred_done.end());
+ deferred_done.clear();
+ }
+ }
+ auto after_flush = mono_clock::now();
+
+ // we will use one final transaction to force a sync
+ KeyValueDB::Transaction synct = db->get_transaction();
+
+ // increase {nid,blobid}_max? note that this covers both the
+ // case where we are approaching the max and the case we passed
+ // it. in either case, we increase the max in the earlier txn
+ // we submit.
+ uint64_t new_nid_max = 0, new_blobid_max = 0;
+ if (nid_last + cct->_conf->bluestore_nid_prealloc/2 > nid_max) {
+ KeyValueDB::Transaction t =
+ kv_submitting.empty() ? synct : kv_submitting.front()->t;
+ new_nid_max = nid_last + cct->_conf->bluestore_nid_prealloc;
+ bufferlist bl;
+ encode(new_nid_max, bl);
+ t->set(PREFIX_SUPER, "nid_max", bl);
+ dout(10) << __func__ << " new_nid_max " << new_nid_max << dendl;
+ }
+ if (blobid_last + cct->_conf->bluestore_blobid_prealloc/2 > blobid_max) {
+ KeyValueDB::Transaction t =
+ kv_submitting.empty() ? synct : kv_submitting.front()->t;
+ new_blobid_max = blobid_last + cct->_conf->bluestore_blobid_prealloc;
+ bufferlist bl;
+ encode(new_blobid_max, bl);
+ t->set(PREFIX_SUPER, "blobid_max", bl);
+ dout(10) << __func__ << " new_blobid_max " << new_blobid_max << dendl;
+ }
+
+ for (auto txc : kv_committing) {
+ throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_queued_lat);
+ if (txc->get_state() == TransContext::STATE_KV_QUEUED) {
+ ++kv_submitted;
+ _txc_apply_kv(txc, false);
+ --txc->osr->kv_committing_serially;
+ } else {
+ ceph_assert(txc->get_state() == TransContext::STATE_KV_SUBMITTED);
+ }
+ if (txc->had_ios) {
+ --txc->osr->txc_with_unstable_io;
+ }
+ }
+
+ // release throttle *before* we commit. this allows new ops
+ // to be prepared and enter pipeline while we are waiting on
+ // the kv commit sync/flush. then hopefully on the next
+ // iteration there will already be ops awake. otherwise, we
+ // end up going to sleep, and then wake up when the very first
+ // transaction is ready for commit.
+ throttle.release_kv_throttle(costs);
+
+ // cleanup sync deferred keys
+ for (auto b : deferred_stable) {
+ for (auto& txc : b->txcs) {
+ bluestore_deferred_transaction_t& wt = *txc.deferred_txn;
+ ceph_assert(wt.released.empty()); // only kraken did this
+ string key;
+ get_deferred_key(wt.seq, &key);
+ synct->rm_single_key(PREFIX_DEFERRED, key);
+ }
+ }
+
+#if defined(WITH_LTTNG)
+ auto sync_start = mono_clock::now();
+#endif
+ // submit synct synchronously (block and wait for it to commit)
+ int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction_sync(synct);
+ ceph_assert(r == 0);
+
+#ifdef WITH_BLKIN
+ for (auto txc : kv_committing) {
+ if (txc->trace) {
+ txc->trace.event("db sync submit");
+ txc->trace.keyval("kv_committing size", kv_committing.size());
+ }
+ }
+#endif
+
+ int committing_size = kv_committing.size();
+ int deferred_size = deferred_stable.size();
+
+#if defined(WITH_LTTNG)
+ double sync_latency = ceph::to_seconds<double>(mono_clock::now() - sync_start);
+ for (auto txc: kv_committing) {
+ if (txc->tracing) {
+ tracepoint(
+ bluestore,
+ transaction_kv_sync_latency,
+ txc->osr->get_sequencer_id(),
+ txc->seq,
+ kv_committing.size(),
+ deferred_done.size(),
+ deferred_stable.size(),
+ sync_latency);
+ }
+ }
+#endif
+
+ {
+ std::unique_lock m{kv_finalize_lock};
+ if (kv_committing_to_finalize.empty()) {
+ kv_committing_to_finalize.swap(kv_committing);
+ } else {
+ kv_committing_to_finalize.insert(
+ kv_committing_to_finalize.end(),
+ kv_committing.begin(),
+ kv_committing.end());
+ kv_committing.clear();
+ }
+ if (deferred_stable_to_finalize.empty()) {
+ deferred_stable_to_finalize.swap(deferred_stable);
+ } else {
+ deferred_stable_to_finalize.insert(
+ deferred_stable_to_finalize.end(),
+ deferred_stable.begin(),
+ deferred_stable.end());
+ deferred_stable.clear();
+ }
+ if (!kv_finalize_in_progress) {
+ kv_finalize_in_progress = true;
+ kv_finalize_cond.notify_one();
+ }
+ }
+
+ if (new_nid_max) {
+ nid_max = new_nid_max;
+ dout(10) << __func__ << " nid_max now " << nid_max << dendl;
+ }
+ if (new_blobid_max) {
+ blobid_max = new_blobid_max;
+ dout(10) << __func__ << " blobid_max now " << blobid_max << dendl;
+ }
+
+ {
+ auto finish = mono_clock::now();
+ ceph::timespan dur_flush = after_flush - start;
+ ceph::timespan dur_kv = finish - after_flush;
+ ceph::timespan dur = finish - start;
+ dout(20) << __func__ << " committed " << committing_size
+ << " cleaned " << deferred_size
+ << " in " << dur
+ << " (" << dur_flush << " flush + " << dur_kv << " kv commit)"
+ << dendl;
+ log_latency("kv_flush",
+ l_bluestore_kv_flush_lat,
+ dur_flush,
+ cct->_conf->bluestore_log_op_age);
+ log_latency("kv_commit",
+ l_bluestore_kv_commit_lat,
+ dur_kv,
+ cct->_conf->bluestore_log_op_age);
+ log_latency("kv_sync",
+ l_bluestore_kv_sync_lat,
+ dur,
+ cct->_conf->bluestore_log_op_age);
+ }
+
+ l.lock();
+ // previously deferred "done" are now "stable" by virtue of this
+ // commit cycle.
+ deferred_stable_queue.swap(deferred_done);
+ }
+ }
+ dout(10) << __func__ << " finish" << dendl;
+ kv_sync_started = false;
+}
+
+void BlueStore::_kv_finalize_thread()
+{
+ deque<TransContext*> kv_committed;
+ deque<DeferredBatch*> deferred_stable;
+ dout(10) << __func__ << " start" << dendl;
+ std::unique_lock l(kv_finalize_lock);
+ ceph_assert(!kv_finalize_started);
+ kv_finalize_started = true;
+ kv_finalize_cond.notify_all();
+ while (true) {
+ ceph_assert(kv_committed.empty());
+ ceph_assert(deferred_stable.empty());
+ if (kv_committing_to_finalize.empty() &&
+ deferred_stable_to_finalize.empty()) {
+ if (kv_finalize_stop)
+ break;
+ dout(20) << __func__ << " sleep" << dendl;
+ kv_finalize_in_progress = false;
+ kv_finalize_cond.wait(l);
+ dout(20) << __func__ << " wake" << dendl;
+ } else {
+ kv_committed.swap(kv_committing_to_finalize);
+ deferred_stable.swap(deferred_stable_to_finalize);
+ l.unlock();
+ dout(20) << __func__ << " kv_committed " << kv_committed << dendl;
+ dout(20) << __func__ << " deferred_stable " << deferred_stable << dendl;
+
+ auto start = mono_clock::now();
+
+ while (!kv_committed.empty()) {
+ TransContext *txc = kv_committed.front();
+ ceph_assert(txc->get_state() == TransContext::STATE_KV_SUBMITTED);
+ _txc_state_proc(txc);
+ kv_committed.pop_front();
+ }
+
+ for (auto b : deferred_stable) {
+ auto p = b->txcs.begin();
+ while (p != b->txcs.end()) {
+ TransContext *txc = &*p;
+ p = b->txcs.erase(p); // unlink here because
+ _txc_state_proc(txc); // this may destroy txc
+ }
+ delete b;
+ }
+ deferred_stable.clear();
+
+ if (!deferred_aggressive) {
+ if (deferred_queue_size >= deferred_batch_ops.load() ||
+ throttle.should_submit_deferred()) {
+ deferred_try_submit();
+ }
+ }
+
+ // this is as good a place as any ...
+ _reap_collections();
+
+ logger->set(l_bluestore_fragmentation,
+ (uint64_t)(alloc->get_fragmentation() * 1000));
+
+ log_latency("kv_final",
+ l_bluestore_kv_final_lat,
+ mono_clock::now() - start,
+ cct->_conf->bluestore_log_op_age);
+
+ l.lock();
+ }
+ }
+ dout(10) << __func__ << " finish" << dendl;
+ kv_finalize_started = false;
+}
+
+#ifdef HAVE_LIBZBD
+void BlueStore::_zoned_cleaner_start()
+{
+ dout(10) << __func__ << dendl;
+ zoned_cleaner_thread.create("bstore_zcleaner");
+}
+
+void BlueStore::_zoned_cleaner_stop()
+{
+ dout(10) << __func__ << dendl;
+ {
+ std::unique_lock l{zoned_cleaner_lock};
+ while (!zoned_cleaner_started) {
+ zoned_cleaner_cond.wait(l);
+ }
+ zoned_cleaner_stop = true;
+ zoned_cleaner_cond.notify_all();
+ }
+ zoned_cleaner_thread.join();
+ {
+ std::lock_guard l{zoned_cleaner_lock};
+ zoned_cleaner_stop = false;
+ }
+ dout(10) << __func__ << " done" << dendl;
+}
+
+void BlueStore::_zoned_cleaner_thread()
+{
+ dout(10) << __func__ << " start" << dendl;
+ std::unique_lock l{zoned_cleaner_lock};
+ ceph_assert(!zoned_cleaner_started);
+ zoned_cleaner_started = true;
+ zoned_cleaner_cond.notify_all();
+ auto a = dynamic_cast<ZonedAllocator*>(alloc);
+ ceph_assert(a);
+ auto f = dynamic_cast<ZonedFreelistManager*>(fm);
+ ceph_assert(f);
+ while (true) {
+ // thresholds to trigger cleaning
+ // FIXME
+ float min_score = .05; // score: bytes saved / bytes moved
+ uint64_t min_saved = zone_size / 32; // min bytes saved to consider cleaning
+ auto zone_to_clean = a->pick_zone_to_clean(min_score, min_saved);
+ if (zone_to_clean < 0) {
+ if (zoned_cleaner_stop) {
+ break;
+ }
+ auto period = ceph::make_timespan(cct->_conf->bluestore_cleaner_sleep_interval);
+ dout(20) << __func__ << " sleep for " << period << dendl;
+ zoned_cleaner_cond.wait_for(l, period);
+ dout(20) << __func__ << " wake" << dendl;
+ } else {
+ l.unlock();
+ a->set_cleaning_zone(zone_to_clean);
+ _zoned_clean_zone(zone_to_clean, a, f);
+ a->clear_cleaning_zone(zone_to_clean);
+ l.lock();
+ }
+ }
+ dout(10) << __func__ << " finish" << dendl;
+ zoned_cleaner_started = false;
+}
+
+void BlueStore::_zoned_clean_zone(
+ uint64_t zone,
+ ZonedAllocator *a,
+ ZonedFreelistManager *f
+ )
+{
+ dout(10) << __func__ << " cleaning zone 0x" << std::hex << zone << std::dec << dendl;
+
+ KeyValueDB::Iterator it = db->get_iterator(PREFIX_ZONED_CL_INFO);
+ std::string zone_start;
+ get_zone_offset_object_key(zone, 0, ghobject_t(), &zone_start);
+ for (it->lower_bound(zone_start); it->valid(); it->next()) {
+ uint32_t z;
+ uint64_t offset;
+ ghobject_t oid;
+ string k = it->key();
+ int r = get_key_zone_offset_object(k, &z, &offset, &oid);
+ if (r < 0) {
+ derr << __func__ << " failed to decode zone ref " << pretty_binary_string(k)
+ << dendl;
+ continue;
+ }
+ if (zone != z) {
+ dout(10) << __func__ << " reached end of zone refs" << dendl;
+ break;
+ }
+ dout(10) << __func__ << " zone 0x" << std::hex << zone << " offset 0x" << offset
+ << std::dec << " " << oid << dendl;
+ _clean_some(oid, zone);
+ }
+
+ if (a->get_live_bytes(zone) > 0) {
+ derr << "zone 0x" << std::hex << zone << " still has 0x" << a->get_live_bytes(zone)
+ << " live bytes" << std::dec << dendl;
+ // should we do something else here to avoid a live-lock in the event of a problem?
+ return;
+ }
+
+ // make sure transactions flush/drain/commit (and data is all rewritten
+ // safely elsewhere) before we blow away the cleaned zone
+ _osr_drain_all();
+
+ // reset the device zone
+ dout(10) << __func__ << " resetting zone 0x" << std::hex << zone << std::dec << dendl;
+ bdev->reset_zone(zone);
+
+ // record that we can now write there
+ f->mark_zone_to_clean_free(zone, db);
+ bdev->flush();
+
+ // then allow ourselves to start allocating there
+ dout(10) << __func__ << " done cleaning zone 0x" << std::hex << zone << std::dec
+ << dendl;
+ a->reset_zone(zone);
+}
+
+void BlueStore::_clean_some(ghobject_t oid, uint32_t zone)
+{
+ dout(10) << __func__ << " " << oid << " from zone 0x" << std::hex << zone << std::dec
+ << dendl;
+
+ CollectionRef cref = _get_collection_by_oid(oid);
+ if (!cref) {
+ dout(10) << __func__ << " can't find collection for " << oid << dendl;
+ return;
+ }
+ Collection *c = cref.get();
+
+ // serialize io dispatch vs other transactions
+ std::lock_guard l(atomic_alloc_and_submit_lock);
+ std::unique_lock l2(c->lock);
+
+ auto o = c->get_onode(oid, false);
+ if (!o) {
+ dout(10) << __func__ << " can't find " << oid << dendl;
+ return;
+ }
+
+ o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
+ _dump_onode<30>(cct, *o);
+
+ // NOTE: This is a naive rewrite strategy. If any blobs are
+ // shared, they will be duplicated for each object that references
+ // them. That means any cloned/snapshotted objects will explode
+ // their utilization. This won't matter for RGW workloads, but
+ // for RBD and CephFS it is completely unacceptable, and it's
+ // entirely reasonable to have "archival" data workloads on SMR
+ // for CephFS and (possibly/probably) RBD.
+ //
+ // At some point we need to replace this with something more
+ // sophisticated that ensures that a shared blob gets moved once
+ // and all referencing objects get updated to point to the new
+ // location.
+
+ map<uint32_t, uint32_t> to_move;
+ for (auto& e : o->extent_map.extent_map) {
+ bool touches_zone = false;
+ for (auto& be : e.blob->get_blob().get_extents()) {
+ if (be.is_valid()) {
+ uint32_t z = be.offset / zone_size;
+ if (z == zone) {
+ touches_zone = true;
+ break;
+ }
+ }
+ }
+ if (touches_zone) {
+ to_move[e.logical_offset] = e.length;
+ }
+ }
+ if (to_move.empty()) {
+ dout(10) << __func__ << " no references to zone 0x" << std::hex << zone
+ << std::dec << " from " << oid << dendl;
+ return;
+ }
+
+ dout(10) << __func__ << " rewriting object extents 0x" << std::hex << to_move
+ << std::dec << dendl;
+ OpSequencer *osr = c->osr.get();
+ TransContext *txc = _txc_create(c, osr, nullptr);
+
+ spg_t pgid;
+ if (c->cid.is_pg(&pgid)) {
+ txc->osd_pool_id = pgid.pool();
+ }
+
+ for (auto& [offset, length] : to_move) {
+ bufferlist bl;
+ int r = _do_read(c, o, offset, length, bl, 0);
+ ceph_assert(r == (int)length);
+
+ r = _do_write(txc, cref, o, offset, length, bl, 0);
+ ceph_assert(r >= 0);
+ }
+ txc->write_onode(o);
+
+ _txc_write_nodes(txc, txc->t);
+ _txc_finalize_kv(txc, txc->t);
+ _txc_state_proc(txc);
+}
+#endif
+
+bluestore_deferred_op_t *BlueStore::_get_deferred_op(
+ TransContext *txc, uint64_t len)
+{
+ if (!txc->deferred_txn) {
+ txc->deferred_txn = new bluestore_deferred_transaction_t;
+ }
+ txc->deferred_txn->ops.push_back(bluestore_deferred_op_t());
+ logger->inc(l_bluestore_issued_deferred_writes);
+ logger->inc(l_bluestore_issued_deferred_write_bytes, len);
+ return &txc->deferred_txn->ops.back();
+}
+
+void BlueStore::_deferred_queue(TransContext *txc)
+{
+ dout(20) << __func__ << " txc " << txc << " osr " << txc->osr << dendl;
+
+ DeferredBatch *tmp;
+ txc->osr->deferred_lock.lock();
+ {
+ if (!txc->osr->deferred_pending) {
+ tmp = new DeferredBatch(cct, txc->osr.get());
+ } else {
+ tmp = txc->osr->deferred_pending;
+ }
+ }
+
+ tmp->txcs.push_back(*txc);
+ bluestore_deferred_transaction_t& wt = *txc->deferred_txn;
+ for (auto opi = wt.ops.begin(); opi != wt.ops.end(); ++opi) {
+ const auto& op = *opi;
+ ceph_assert(op.op == bluestore_deferred_op_t::OP_WRITE);
+ bufferlist::const_iterator p = op.data.begin();
+ for (auto e : op.extents) {
+ tmp->prepare_write(cct, wt.seq, e.offset, e.length, p);
+ }
+ }
+
+ {
+ ++deferred_queue_size;
+ txc->osr->deferred_pending = tmp;
+ // condition "tmp->txcs.size() == 1" mean deferred_pending was originally empty.
+ // So we should add osr into deferred_queue.
+ if (!txc->osr->deferred_running && (tmp->txcs.size() == 1)) {
+ deferred_lock.lock();
+ deferred_queue.push_back(*txc->osr);
+ deferred_lock.unlock();
+ }
+
+ if (deferred_aggressive &&
+ !txc->osr->deferred_running) {
+ _deferred_submit_unlock(txc->osr.get());
+ } else {
+ txc->osr->deferred_lock.unlock();
+ }
+ }
+ }
+
+void BlueStore::deferred_try_submit()
+{
+ dout(20) << __func__ << " " << deferred_queue.size() << " osrs, "
+ << deferred_queue_size << " txcs" << dendl;
+ vector<OpSequencerRef> osrs;
+
+ {
+ std::lock_guard l(deferred_lock);
+ osrs.reserve(deferred_queue.size());
+ for (auto& osr : deferred_queue) {
+ osrs.push_back(&osr);
+ }
+ }
+
+ for (auto& osr : osrs) {
+ osr->deferred_lock.lock();
+ if (osr->deferred_pending) {
+ if (!osr->deferred_running) {
+ _deferred_submit_unlock(osr.get());
+ } else {
+ osr->deferred_lock.unlock();
+ dout(20) << __func__ << " osr " << osr << " already has running"
+ << dendl;
+ }
+ } else {
+ osr->deferred_lock.unlock();
+ dout(20) << __func__ << " osr " << osr << " has no pending" << dendl;
+ }
+ }
+
+ {
+ std::lock_guard l(deferred_lock);
+ deferred_last_submitted = ceph_clock_now();
+ }
+}
+
+void BlueStore::_deferred_submit_unlock(OpSequencer *osr)
+{
+ dout(10) << __func__ << " osr " << osr
+ << " " << osr->deferred_pending->iomap.size() << " ios pending "
+ << dendl;
+ ceph_assert(osr->deferred_pending);
+ ceph_assert(!osr->deferred_running);
+
+ auto b = osr->deferred_pending;
+ deferred_queue_size -= b->seq_bytes.size();
+ ceph_assert(deferred_queue_size >= 0);
+
+ osr->deferred_running = osr->deferred_pending;
+ osr->deferred_pending = nullptr;
+
+ osr->deferred_lock.unlock();
+
+ for (auto& txc : b->txcs) {
+ throttle.log_state_latency(txc, logger, l_bluestore_state_deferred_queued_lat);
+ }
+ uint64_t start = 0, pos = 0;
+ bufferlist bl;
+ auto i = b->iomap.begin();
+ while (true) {
+ if (i == b->iomap.end() || i->first != pos) {
+ if (bl.length()) {
+ dout(20) << __func__ << " write 0x" << std::hex
+ << start << "~" << bl.length()
+ << " crc " << bl.crc32c(-1) << std::dec << dendl;
+ if (!g_conf()->bluestore_debug_omit_block_device_write) {
+ logger->inc(l_bluestore_submitted_deferred_writes);
+ logger->inc(l_bluestore_submitted_deferred_write_bytes, bl.length());
+ int r = bdev->aio_write(start, bl, &b->ioc, false);
+ ceph_assert(r == 0);
+ }
+ }
+ if (i == b->iomap.end()) {
+ break;
+ }
+ start = 0;
+ pos = i->first;
+ bl.clear();
+ }
+ dout(20) << __func__ << " seq " << i->second.seq << " 0x"
+ << std::hex << pos << "~" << i->second.bl.length() << std::dec
+ << dendl;
+ if (!bl.length()) {
+ start = pos;
+ }
+ pos += i->second.bl.length();
+ bl.claim_append(i->second.bl);
+ ++i;
+ }
+
+ bdev->aio_submit(&b->ioc);
+}
+
+struct C_DeferredTrySubmit : public Context {
+ BlueStore *store;
+ C_DeferredTrySubmit(BlueStore *s) : store(s) {}
+ void finish(int r) {
+ store->deferred_try_submit();
+ }
+};
+
+void BlueStore::_deferred_aio_finish(OpSequencer *osr)
+{
+ dout(10) << __func__ << " osr " << osr << dendl;
+ ceph_assert(osr->deferred_running);
+ DeferredBatch *b = osr->deferred_running;
+
+ {
+ osr->deferred_lock.lock();
+ ceph_assert(osr->deferred_running == b);
+ osr->deferred_running = nullptr;
+ if (!osr->deferred_pending) {
+ dout(20) << __func__ << " dequeueing" << dendl;
+ {
+ deferred_lock.lock();
+ auto q = deferred_queue.iterator_to(*osr);
+ deferred_queue.erase(q);
+ deferred_lock.unlock();
+ }
+ osr->deferred_lock.unlock();
+ } else {
+ osr->deferred_lock.unlock();
+ if (deferred_aggressive) {
+ dout(20) << __func__ << " queuing async deferred_try_submit" << dendl;
+ finisher.queue(new C_DeferredTrySubmit(this));
+ } else {
+ dout(20) << __func__ << " leaving queued, more pending" << dendl;
+ }
+ }
+ }
+
+ {
+ uint64_t costs = 0;
+ {
+ for (auto& i : b->txcs) {
+ TransContext *txc = &i;
+ throttle.log_state_latency(*txc, logger, l_bluestore_state_deferred_aio_wait_lat);
+ txc->set_state(TransContext::STATE_DEFERRED_CLEANUP);
+ costs += txc->cost;
+ }
+ }
+ throttle.release_deferred_throttle(costs);
+ }
+
+ {
+ std::lock_guard l(kv_lock);
+ deferred_done_queue.emplace_back(b);
+
+ // in the normal case, do not bother waking up the kv thread; it will
+ // catch us on the next commit anyway.
+ if (deferred_aggressive && !kv_sync_in_progress) {
+ kv_sync_in_progress = true;
+ kv_cond.notify_one();
+ }
+ }
+}
+
+int BlueStore::_deferred_replay()
+{
+ dout(10) << __func__ << " start" << dendl;
+ int count = 0;
+ int r = 0;
+ interval_set<uint64_t> bluefs_extents;
+ if (bluefs) {
+ bluefs->foreach_block_extents(
+ bluefs_layout.shared_bdev,
+ [&] (uint64_t start, uint32_t len) {
+ bluefs_extents.insert(start, len);
+ }
+ );
+ }
+ CollectionRef ch = _get_collection(coll_t::meta());
+ bool fake_ch = false;
+ if (!ch) {
+ // hmm, replaying initial mkfs?
+ ch = static_cast<Collection*>(create_new_collection(coll_t::meta()).get());
+ fake_ch = true;
+ }
+ OpSequencer *osr = static_cast<OpSequencer*>(ch->osr.get());
+ KeyValueDB::Iterator it = db->get_iterator(PREFIX_DEFERRED);
+ for (it->lower_bound(string()); it->valid(); it->next(), ++count) {
+ dout(20) << __func__ << " replay " << pretty_binary_string(it->key())
+ << dendl;
+ bluestore_deferred_transaction_t *deferred_txn =
+ new bluestore_deferred_transaction_t;
+ bufferlist bl = it->value();
+ auto p = bl.cbegin();
+ try {
+ decode(*deferred_txn, p);
+ } catch (ceph::buffer::error& e) {
+ derr << __func__ << " failed to decode deferred txn "
+ << pretty_binary_string(it->key()) << dendl;
+ delete deferred_txn;
+ r = -EIO;
+ goto out;
+ }
+ bool has_some = _eliminate_outdated_deferred(deferred_txn, bluefs_extents);
+ if (has_some) {
+ TransContext *txc = _txc_create(ch.get(), osr, nullptr);
+ txc->deferred_txn = deferred_txn;
+ txc->set_state(TransContext::STATE_KV_DONE);
+ _txc_state_proc(txc);
+ } else {
+ delete deferred_txn;
+ }
+ }
+ out:
+ dout(20) << __func__ << " draining osr" << dendl;
+ _osr_register_zombie(osr);
+ _osr_drain_all();
+ if (fake_ch) {
+ new_coll_map.clear();
+ }
+ dout(10) << __func__ << " completed " << count << " events" << dendl;
+ return r;
+}
+
+bool BlueStore::_eliminate_outdated_deferred(bluestore_deferred_transaction_t* deferred_txn,
+ interval_set<uint64_t>& bluefs_extents)
+{
+ bool has_some = false;
+ dout(30) << __func__ << " bluefs_extents: " << std::hex << bluefs_extents << std::dec << dendl;
+ auto it = deferred_txn->ops.begin();
+ while (it != deferred_txn->ops.end()) {
+ // We process a pair of _data_/_extents_ (here: it->data/it->extents)
+ // by eliminating _extents_ that belong to bluefs, removing relevant parts of _data_
+ // example:
+ // +------------+---------------+---------------+---------------+
+ // | data | aaaaaaaabbbbb | bbbbcccccdddd | ddddeeeeeefff |
+ // | extent | 40000 - 44000 | 50000 - 58000 | 58000 - 60000 |
+ // | in bluefs? | no | yes | no |
+ // +------------+---------------+---------------+---------------+
+ // result:
+ // +------------+---------------+---------------+
+ // | data | aaaaaaaabbbbb | ddddeeeeeefff |
+ // | extent | 40000 - 44000 | 58000 - 60000 |
+ // +------------+---------------+---------------+
+ PExtentVector new_extents;
+ ceph::buffer::list new_data;
+ uint32_t data_offset = 0; // this tracks location of extent 'e' inside it->data
+ dout(30) << __func__ << " input extents: " << it->extents << dendl;
+ for (auto& e: it->extents) {
+ interval_set<uint64_t> region;
+ region.insert(e.offset, e.length);
+
+ auto mi = bluefs_extents.lower_bound(e.offset);
+ if (mi != bluefs_extents.begin()) {
+ --mi;
+ if (mi.get_end() <= e.offset) {
+ ++mi;
+ }
+ }
+ while (mi != bluefs_extents.end() && mi.get_start() < e.offset + e.length) {
+ // The interval_set does not like (asserts) when we erase interval that does not exist.
+ // Hence we do we implement (region-mi) by ((region+mi)-mi).
+ region.union_insert(mi.get_start(), mi.get_len());
+ region.erase(mi.get_start(), mi.get_len());
+ ++mi;
+ }
+ // 'region' is now a subset of e, without parts used by bluefs
+ // we trim coresponding parts from it->data (actally constructing new_data / new_extents)
+ for (auto ki = region.begin(); ki != region.end(); ki++) {
+ ceph::buffer::list chunk;
+ // A chunk from it->data; data_offset is a an offset where 'e' was located;
+ // 'ki.get_start() - e.offset' is an offset of ki inside 'e'.
+ chunk.substr_of(it->data, data_offset + (ki.get_start() - e.offset), ki.get_len());
+ new_data.claim_append(chunk);
+ new_extents.emplace_back(bluestore_pextent_t(ki.get_start(), ki.get_len()));
+ }
+ data_offset += e.length;
+ }
+ dout(30) << __func__ << " output extents: " << new_extents << dendl;
+ if (it->data.length() != new_data.length()) {
+ dout(10) << __func__ << " trimmed deferred extents: " << it->extents << "->" << new_extents << dendl;
+ }
+ if (new_extents.size() == 0) {
+ it = deferred_txn->ops.erase(it);
+ } else {
+ has_some = true;
+ std::swap(it->extents, new_extents);
+ std::swap(it->data, new_data);
+ ++it;
+ }
+ }
+ return has_some;
+}
+
+// ---------------------------
+// transactions
+
+int BlueStore::queue_transactions(
+ CollectionHandle& ch,
+ vector<Transaction>& tls,
+ TrackedOpRef op,
+ ThreadPool::TPHandle *handle)
+{
+ FUNCTRACE(cct);
+ list<Context *> on_applied, on_commit, on_applied_sync;
+ ObjectStore::Transaction::collect_contexts(
+ tls, &on_applied, &on_commit, &on_applied_sync);
+
+ auto start = mono_clock::now();
+
+ Collection *c = static_cast<Collection*>(ch.get());
+ OpSequencer *osr = c->osr.get();
+ dout(10) << __func__ << " ch " << c << " " << c->cid << dendl;
+
+ // With HM-SMR drives (and ZNS SSDs) we want the I/O allocation and I/O
+ // submission to happen atomically because if I/O submission happens in a
+ // different order than I/O allocation, we end up issuing non-sequential
+ // writes to the drive. This is a temporary solution until ZONE APPEND
+ // support matures in the kernel. For more information please see:
+ // https://www.usenix.org/conference/vault20/presentation/bjorling
+ if (bdev->is_smr()) {
+ atomic_alloc_and_submit_lock.lock();
+ }
+
+ // prepare
+ TransContext *txc = _txc_create(static_cast<Collection*>(ch.get()), osr,
+ &on_commit, op);
+
+ for (vector<Transaction>::iterator p = tls.begin(); p != tls.end(); ++p) {
+ txc->bytes += (*p).get_num_bytes();
+ _txc_add_transaction(txc, &(*p));
+ }
+ _txc_calc_cost(txc);
+
+ _txc_write_nodes(txc, txc->t);
+
+ // journal deferred items
+ if (txc->deferred_txn) {
+ txc->deferred_txn->seq = ++deferred_seq;
+ bufferlist bl;
+ encode(*txc->deferred_txn, bl);
+ string key;
+ get_deferred_key(txc->deferred_txn->seq, &key);
+ txc->t->set(PREFIX_DEFERRED, key, bl);
+ }
+
+ _txc_finalize_kv(txc, txc->t);
+
+#ifdef WITH_BLKIN
+ if (txc->trace) {
+ txc->trace.event("txc encode finished");
+ }
+#endif
+
+ if (handle)
+ handle->suspend_tp_timeout();
+
+ auto tstart = mono_clock::now();
+
+ if (!throttle.try_start_transaction(
+ *db,
+ *txc,
+ tstart)) {
+ // ensure we do not block here because of deferred writes
+ dout(10) << __func__ << " failed get throttle_deferred_bytes, aggressive"
+ << dendl;
+ ++deferred_aggressive;
+ deferred_try_submit();
+ {
+ // wake up any previously finished deferred events
+ std::lock_guard l(kv_lock);
+ if (!kv_sync_in_progress) {
+ kv_sync_in_progress = true;
+ kv_cond.notify_one();
+ }
+ }
+ throttle.finish_start_transaction(*db, *txc, tstart);
+ --deferred_aggressive;
+ }
+ auto tend = mono_clock::now();
+
+ if (handle)
+ handle->reset_tp_timeout();
+
+ logger->inc(l_bluestore_txc);
+
+ // execute (start)
+ _txc_state_proc(txc);
+
+ if (bdev->is_smr()) {
+ atomic_alloc_and_submit_lock.unlock();
+ }
+
+ // we're immediately readable (unlike FileStore)
+ for (auto c : on_applied_sync) {
+ c->complete(0);
+ }
+ if (!on_applied.empty()) {
+ if (c->commit_queue) {
+ c->commit_queue->queue(on_applied);
+ } else {
+ finisher.queue(on_applied);
+ }
+ }
+
+#ifdef WITH_BLKIN
+ if (txc->trace) {
+ txc->trace.event("txc applied");
+ }
+#endif
+
+ log_latency("submit_transact",
+ l_bluestore_submit_lat,
+ mono_clock::now() - start,
+ cct->_conf->bluestore_log_op_age);
+ log_latency("throttle_transact",
+ l_bluestore_throttle_lat,
+ tend - tstart,
+ cct->_conf->bluestore_log_op_age);
+ return 0;
+}
+
+void BlueStore::_txc_aio_submit(TransContext *txc)
+{
+ dout(10) << __func__ << " txc " << txc << dendl;
+ bdev->aio_submit(&txc->ioc);
+}
+
+void BlueStore::_txc_add_transaction(TransContext *txc, Transaction *t)
+{
+ Transaction::iterator i = t->begin();
+
+ _dump_transaction<30>(cct, t);
+
+ vector<CollectionRef> cvec(i.colls.size());
+ unsigned j = 0;
+ for (vector<coll_t>::iterator p = i.colls.begin(); p != i.colls.end();
+ ++p, ++j) {
+ cvec[j] = _get_collection(*p);
+ }
+
+ vector<OnodeRef> ovec(i.objects.size());
+
+ for (int pos = 0; i.have_op(); ++pos) {
+ Transaction::Op *op = i.decode_op();
+ int r = 0;
+
+ // no coll or obj
+ if (op->op == Transaction::OP_NOP)
+ continue;
+
+
+ // collection operations
+ CollectionRef &c = cvec[op->cid];
+
+ // initialize osd_pool_id and do a smoke test that all collections belong
+ // to the same pool
+ spg_t pgid;
+ if (!!c ? c->cid.is_pg(&pgid) : false) {
+ ceph_assert(txc->osd_pool_id == META_POOL_ID ||
+ txc->osd_pool_id == pgid.pool());
+ txc->osd_pool_id = pgid.pool();
+ }
+
+ switch (op->op) {
+ case Transaction::OP_RMCOLL:
+ {
+ const coll_t &cid = i.get_cid(op->cid);
+ r = _remove_collection(txc, cid, &c);
+ if (!r)
+ continue;
+ }
+ break;
+
+ case Transaction::OP_MKCOLL:
+ {
+ ceph_assert(!c);
+ const coll_t &cid = i.get_cid(op->cid);
+ r = _create_collection(txc, cid, op->split_bits, &c);
+ if (!r)
+ continue;
+ }
+ break;
+
+ case Transaction::OP_SPLIT_COLLECTION:
+ ceph_abort_msg("deprecated");
+ break;
+
+ case Transaction::OP_SPLIT_COLLECTION2:
+ {
+ uint32_t bits = op->split_bits;
+ uint32_t rem = op->split_rem;
+ r = _split_collection(txc, c, cvec[op->dest_cid], bits, rem);
+ if (!r)
+ continue;
+ }
+ break;
+
+ case Transaction::OP_MERGE_COLLECTION:
+ {
+ uint32_t bits = op->split_bits;
+ r = _merge_collection(txc, &c, cvec[op->dest_cid], bits);
+ if (!r)
+ continue;
+ }
+ break;
+
+ case Transaction::OP_COLL_HINT:
+ {
+ uint32_t type = op->hint;
+ bufferlist hint;
+ i.decode_bl(hint);
+ auto hiter = hint.cbegin();
+ if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
+ uint32_t pg_num;
+ uint64_t num_objs;
+ decode(pg_num, hiter);
+ decode(num_objs, hiter);
+ dout(10) << __func__ << " collection hint objects is a no-op, "
+ << " pg_num " << pg_num << " num_objects " << num_objs
+ << dendl;
+ } else {
+ // Ignore the hint
+ dout(10) << __func__ << " unknown collection hint " << type << dendl;
+ }
+ continue;
+ }
+ break;
+
+ case Transaction::OP_COLL_SETATTR:
+ r = -EOPNOTSUPP;
+ break;
+
+ case Transaction::OP_COLL_RMATTR:
+ r = -EOPNOTSUPP;
+ break;
+
+ case Transaction::OP_COLL_RENAME:
+ ceph_abort_msg("not implemented");
+ break;
+ }
+ if (r < 0) {
+ derr << __func__ << " error " << cpp_strerror(r)
+ << " not handled on operation " << op->op
+ << " (op " << pos << ", counting from 0)" << dendl;
+ _dump_transaction<0>(cct, t);
+ ceph_abort_msg("unexpected error");
+ }
+
+ // these operations implicity create the object
+ bool create = false;
+ if (op->op == Transaction::OP_TOUCH ||
+ op->op == Transaction::OP_CREATE ||
+ op->op == Transaction::OP_WRITE ||
+ op->op == Transaction::OP_ZERO) {
+ create = true;
+ }
+
+ // object operations
+ std::unique_lock l(c->lock);
+ OnodeRef &o = ovec[op->oid];
+ if (!o) {
+ ghobject_t oid = i.get_oid(op->oid);
+ o = c->get_onode(oid, create, op->op == Transaction::OP_CREATE);
+ }
+ if (!create && (!o || !o->exists)) {
+ dout(10) << __func__ << " op " << op->op << " got ENOENT on "
+ << i.get_oid(op->oid) << dendl;
+ r = -ENOENT;
+ goto endop;
+ }
+
+ switch (op->op) {
+ case Transaction::OP_CREATE:
+ case Transaction::OP_TOUCH:
+ r = _touch(txc, c, o);
+ break;
+
+ case Transaction::OP_WRITE:
+ {
+ uint64_t off = op->off;
+ uint64_t len = op->len;
+ uint32_t fadvise_flags = i.get_fadvise_flags();
+ bufferlist bl;
+ i.decode_bl(bl);
+ r = _write(txc, c, o, off, len, bl, fadvise_flags);
+ }
+ break;
+
+ case Transaction::OP_ZERO:
+ {
+ uint64_t off = op->off;
+ uint64_t len = op->len;
+ r = _zero(txc, c, o, off, len);
+ }
+ break;
+
+ case Transaction::OP_TRIMCACHE:
+ {
+ // deprecated, no-op
+ }
+ break;
+
+ case Transaction::OP_TRUNCATE:
+ {
+ uint64_t off = op->off;
+ r = _truncate(txc, c, o, off);
+ }
+ break;
+
+ case Transaction::OP_REMOVE:
+ {
+ r = _remove(txc, c, o);
+ }
+ break;
+
+ case Transaction::OP_SETATTR:
+ {
+ string name = i.decode_string();
+ bufferptr bp;
+ i.decode_bp(bp);
+ r = _setattr(txc, c, o, name, bp);
+ }
+ break;
+
+ case Transaction::OP_SETATTRS:
+ {
+ map<string, bufferptr> aset;
+ i.decode_attrset(aset);
+ r = _setattrs(txc, c, o, aset);
+ }
+ break;
+
+ case Transaction::OP_RMATTR:
+ {
+ string name = i.decode_string();
+ r = _rmattr(txc, c, o, name);
+ }
+ break;
+
+ case Transaction::OP_RMATTRS:
+ {
+ r = _rmattrs(txc, c, o);
+ }
+ break;
+
+ case Transaction::OP_CLONE:
+ {
+ OnodeRef& no = ovec[op->dest_oid];
+ if (!no) {
+ const ghobject_t& noid = i.get_oid(op->dest_oid);
+ no = c->get_onode(noid, true);
+ }
+ r = _clone(txc, c, o, no);
+ }
+ break;
+
+ case Transaction::OP_CLONERANGE:
+ ceph_abort_msg("deprecated");
+ break;
+
+ case Transaction::OP_CLONERANGE2:
+ {
+ OnodeRef& no = ovec[op->dest_oid];
+ if (!no) {
+ const ghobject_t& noid = i.get_oid(op->dest_oid);
+ no = c->get_onode(noid, true);
+ }
+ uint64_t srcoff = op->off;
+ uint64_t len = op->len;
+ uint64_t dstoff = op->dest_off;
+ r = _clone_range(txc, c, o, no, srcoff, len, dstoff);
+ }
+ break;
+
+ case Transaction::OP_COLL_ADD:
+ ceph_abort_msg("not implemented");
+ break;
+
+ case Transaction::OP_COLL_REMOVE:
+ ceph_abort_msg("not implemented");
+ break;
+
+ case Transaction::OP_COLL_MOVE:
+ ceph_abort_msg("deprecated");
+ break;
+
+ case Transaction::OP_COLL_MOVE_RENAME:
+ case Transaction::OP_TRY_RENAME:
+ {
+ ceph_assert(op->cid == op->dest_cid);
+ const ghobject_t& noid = i.get_oid(op->dest_oid);
+ OnodeRef& no = ovec[op->dest_oid];
+ if (!no) {
+ no = c->get_onode(noid, false);
+ }
+ r = _rename(txc, c, o, no, noid);
+ }
+ break;
+
+ case Transaction::OP_OMAP_CLEAR:
+ {
+ r = _omap_clear(txc, c, o);
+ }
+ break;
+ case Transaction::OP_OMAP_SETKEYS:
+ {
+ bufferlist aset_bl;
+ i.decode_attrset_bl(&aset_bl);
+ r = _omap_setkeys(txc, c, o, aset_bl);
+ }
+ break;
+ case Transaction::OP_OMAP_RMKEYS:
+ {
+ bufferlist keys_bl;
+ i.decode_keyset_bl(&keys_bl);
+ r = _omap_rmkeys(txc, c, o, keys_bl);
+ }
+ break;
+ case Transaction::OP_OMAP_RMKEYRANGE:
+ {
+ string first, last;
+ first = i.decode_string();
+ last = i.decode_string();
+ r = _omap_rmkey_range(txc, c, o, first, last);
+ }
+ break;
+ case Transaction::OP_OMAP_SETHEADER:
+ {
+ bufferlist bl;
+ i.decode_bl(bl);
+ r = _omap_setheader(txc, c, o, bl);
+ }
+ break;
+
+ case Transaction::OP_SETALLOCHINT:
+ {
+ r = _set_alloc_hint(txc, c, o,
+ op->expected_object_size,
+ op->expected_write_size,
+ op->hint);
+ }
+ break;
+
+ default:
+ derr << __func__ << " bad op " << op->op << dendl;
+ ceph_abort();
+ }
+
+ endop:
+ if (r < 0) {
+ bool ok = false;
+
+ if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE ||
+ op->op == Transaction::OP_CLONE ||
+ op->op == Transaction::OP_CLONERANGE2 ||
+ op->op == Transaction::OP_COLL_ADD ||
+ op->op == Transaction::OP_SETATTR ||
+ op->op == Transaction::OP_SETATTRS ||
+ op->op == Transaction::OP_RMATTR ||
+ op->op == Transaction::OP_OMAP_SETKEYS ||
+ op->op == Transaction::OP_OMAP_RMKEYS ||
+ op->op == Transaction::OP_OMAP_RMKEYRANGE ||
+ op->op == Transaction::OP_OMAP_SETHEADER))
+ // -ENOENT is usually okay
+ ok = true;
+ if (r == -ENODATA)
+ ok = true;
+
+ if (!ok) {
+ const char *msg = "unexpected error code";
+
+ if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE ||
+ op->op == Transaction::OP_CLONE ||
+ op->op == Transaction::OP_CLONERANGE2))
+ msg = "ENOENT on clone suggests osd bug";
+
+ if (r == -ENOSPC)
+ // For now, if we hit _any_ ENOSPC, crash, before we do any damage
+ // by partially applying transactions.
+ msg = "ENOSPC from bluestore, misconfigured cluster";
+
+ if (r == -ENOTEMPTY) {
+ msg = "ENOTEMPTY suggests garbage data in osd data dir";
+ }
+
+ derr << __func__ << " error " << cpp_strerror(r)
+ << " not handled on operation " << op->op
+ << " (op " << pos << ", counting from 0)"
+ << dendl;
+ derr << msg << dendl;
+ _dump_transaction<0>(cct, t);
+ ceph_abort_msg("unexpected error");
+ }
+ }
+ }
+}
+
+
+
+// -----------------
+// write operations
+
+int BlueStore::_touch(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
+ int r = 0;
+ _assign_nid(txc, o);
+ txc->write_onode(o);
+ dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
+ return r;
+}
+
+void BlueStore::_pad_zeros(
+ bufferlist *bl, uint64_t *offset,
+ uint64_t chunk_size)
+{
+ auto length = bl->length();
+ dout(30) << __func__ << " 0x" << std::hex << *offset << "~" << length
+ << " chunk_size 0x" << chunk_size << std::dec << dendl;
+ dout(40) << "before:\n";
+ bl->hexdump(*_dout);
+ *_dout << dendl;
+ // front
+ size_t front_pad = *offset % chunk_size;
+ size_t back_pad = 0;
+ size_t pad_count = 0;
+ if (front_pad) {
+ size_t front_copy = std::min<uint64_t>(chunk_size - front_pad, length);
+ bufferptr z = ceph::buffer::create_small_page_aligned(chunk_size);
+ z.zero(0, front_pad, false);
+ pad_count += front_pad;
+ bl->begin().copy(front_copy, z.c_str() + front_pad);
+ if (front_copy + front_pad < chunk_size) {
+ back_pad = chunk_size - (length + front_pad);
+ z.zero(front_pad + length, back_pad, false);
+ pad_count += back_pad;
+ }
+ bufferlist old, t;
+ old.swap(*bl);
+ t.substr_of(old, front_copy, length - front_copy);
+ bl->append(z);
+ bl->claim_append(t);
+ *offset -= front_pad;
+ length += pad_count;
+ }
+
+ // back
+ uint64_t end = *offset + length;
+ unsigned back_copy = end % chunk_size;
+ if (back_copy) {
+ ceph_assert(back_pad == 0);
+ back_pad = chunk_size - back_copy;
+ ceph_assert(back_copy <= length);
+ bufferptr tail(chunk_size);
+ bl->begin(length - back_copy).copy(back_copy, tail.c_str());
+ tail.zero(back_copy, back_pad, false);
+ bufferlist old;
+ old.swap(*bl);
+ bl->substr_of(old, 0, length - back_copy);
+ bl->append(tail);
+ length += back_pad;
+ pad_count += back_pad;
+ }
+ dout(20) << __func__ << " pad 0x" << std::hex << front_pad << " + 0x"
+ << back_pad << " on front/back, now 0x" << *offset << "~"
+ << length << std::dec << dendl;
+ dout(40) << "after:\n";
+ bl->hexdump(*_dout);
+ *_dout << dendl;
+ if (pad_count)
+ logger->inc(l_bluestore_write_pad_bytes, pad_count);
+ ceph_assert(bl->length() == length);
+}
+
+void BlueStore::_do_write_small(
+ TransContext *txc,
+ CollectionRef &c,
+ OnodeRef& o,
+ uint64_t offset, uint64_t length,
+ bufferlist::iterator& blp,
+ WriteContext *wctx)
+{
+ dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
+ << std::dec << dendl;
+ ceph_assert(length < min_alloc_size);
+
+ uint64_t end_offs = offset + length;
+
+ logger->inc(l_bluestore_write_small);
+ logger->inc(l_bluestore_write_small_bytes, length);
+
+ bufferlist bl;
+ blp.copy(length, bl);
+
+ auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
+ auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
+ uint32_t alloc_len = min_alloc_size;
+ auto offset0 = p2align<uint64_t>(offset, alloc_len);
+
+ bool any_change;
+
+ // search suitable extent in both forward and reverse direction in
+ // [offset - target_max_blob_size, offset + target_max_blob_size] range
+ // then check if blob can be reused via can_reuse_blob func or apply
+ // direct/deferred write (the latter for extents including or higher
+ // than 'offset' only).
+ o->extent_map.fault_range(db, min_off, offset + max_bsize - min_off);
+
+#ifdef HAVE_LIBZBD
+ // On zoned devices, the first goal is to support non-overwrite workloads,
+ // such as RGW, with large, aligned objects. Therefore, for user writes
+ // _do_write_small should not trigger. OSDs, however, write and update a tiny
+ // amount of metadata, such as OSD maps, to disk. For those cases, we
+ // temporarily just pad them to min_alloc_size and write them to a new place
+ // on every update.
+ if (bdev->is_smr()) {
+ uint64_t b_off = p2phase<uint64_t>(offset, alloc_len);
+ uint64_t b_off0 = b_off;
+ o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
+
+ // Zero detection -- small block
+ if (!cct->_conf->bluestore_zero_block_detection || !bl.is_zero()) {
+ BlobRef b = c->new_blob();
+ _pad_zeros(&bl, &b_off0, min_alloc_size);
+ wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length, false, true);
+ } else { // if (bl.is_zero())
+ dout(20) << __func__ << " skip small zero block " << std::hex
+ << " (0x" << b_off0 << "~" << bl.length() << ")"
+ << " (0x" << b_off << "~" << length << ")"
+ << std::dec << dendl;
+ logger->inc(l_bluestore_write_small_skipped);
+ logger->inc(l_bluestore_write_small_skipped_bytes, length);
+ }
+
+ return;
+ }
+#endif
+
+ // Look for an existing mutable blob we can use.
+ auto begin = o->extent_map.extent_map.begin();
+ auto end = o->extent_map.extent_map.end();
+ auto ep = o->extent_map.seek_lextent(offset);
+ if (ep != begin) {
+ --ep;
+ if (ep->blob_end() <= offset) {
+ ++ep;
+ }
+ }
+ auto prev_ep = end;
+ if (ep != begin) {
+ prev_ep = ep;
+ --prev_ep;
+ }
+
+ boost::container::flat_set<const bluestore_blob_t*> inspected_blobs;
+ // We don't want to have more blobs than min alloc units fit
+ // into 2 max blobs
+ size_t blob_threshold = max_blob_size / min_alloc_size * 2 + 1;
+ bool above_blob_threshold = false;
+
+ inspected_blobs.reserve(blob_threshold);
+
+ uint64_t max_off = 0;
+ auto start_ep = ep;
+ auto end_ep = ep; // exclusively
+ do {
+ any_change = false;
+
+ if (ep != end && ep->logical_offset < offset + max_bsize) {
+ BlobRef b = ep->blob;
+ if (!above_blob_threshold) {
+ inspected_blobs.insert(&b->get_blob());
+ above_blob_threshold = inspected_blobs.size() >= blob_threshold;
+ }
+ max_off = ep->logical_end();
+ auto bstart = ep->blob_start();
+
+ dout(20) << __func__ << " considering " << *b
+ << " bstart 0x" << std::hex << bstart << std::dec << dendl;
+ if (bstart >= end_offs) {
+ dout(20) << __func__ << " ignoring distant " << *b << dendl;
+ } else if (!b->get_blob().is_mutable()) {
+ dout(20) << __func__ << " ignoring immutable " << *b << dendl;
+ } else if (ep->logical_offset % min_alloc_size !=
+ ep->blob_offset % min_alloc_size) {
+ dout(20) << __func__ << " ignoring offset-skewed " << *b << dendl;
+ } else {
+ uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
+ // can we pad our head/tail out with zeros?
+ uint64_t head_pad, tail_pad;
+ head_pad = p2phase(offset, chunk_size);
+ tail_pad = p2nphase(end_offs, chunk_size);
+ if (head_pad || tail_pad) {
+ o->extent_map.fault_range(db, offset - head_pad,
+ end_offs - offset + head_pad + tail_pad);
+ }
+ if (head_pad &&
+ o->extent_map.has_any_lextents(offset - head_pad, head_pad)) {
+ head_pad = 0;
+ }
+ if (tail_pad && o->extent_map.has_any_lextents(end_offs, tail_pad)) {
+ tail_pad = 0;
+ }
+
+ uint64_t b_off = offset - head_pad - bstart;
+ uint64_t b_len = length + head_pad + tail_pad;
+
+ // direct write into unused blocks of an existing mutable blob?
+ if ((b_off % chunk_size == 0 && b_len % chunk_size == 0) &&
+ b->get_blob().get_ondisk_length() >= b_off + b_len &&
+ b->get_blob().is_unused(b_off, b_len) &&
+ b->get_blob().is_allocated(b_off, b_len)) {
+ _apply_padding(head_pad, tail_pad, bl);
+
+ dout(20) << __func__ << " write to unused 0x" << std::hex
+ << b_off << "~" << b_len
+ << " pad 0x" << head_pad << " + 0x" << tail_pad
+ << std::dec << " of mutable " << *b << dendl;
+ _buffer_cache_write(txc, b, b_off, bl,
+ wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
+
+ if (!g_conf()->bluestore_debug_omit_block_device_write) {
+ if (b_len < prefer_deferred_size) {
+ dout(20) << __func__ << " deferring small 0x" << std::hex
+ << b_len << std::dec << " unused write via deferred" << dendl;
+ bluestore_deferred_op_t *op = _get_deferred_op(txc, bl.length());
+ op->op = bluestore_deferred_op_t::OP_WRITE;
+ b->get_blob().map(
+ b_off, b_len,
+ [&](uint64_t offset, uint64_t length) {
+ op->extents.emplace_back(bluestore_pextent_t(offset, length));
+ return 0;
+ });
+ op->data = bl;
+ } else {
+ b->get_blob().map_bl(
+ b_off, bl,
+ [&](uint64_t offset, bufferlist& t) {
+ bdev->aio_write(offset, t,
+ &txc->ioc, wctx->buffered);
+ });
+ }
+ }
+ b->dirty_blob().calc_csum(b_off, bl);
+ dout(20) << __func__ << " lex old " << *ep << dendl;
+ Extent *le = o->extent_map.set_lextent(c, offset, b_off + head_pad, length,
+ b,
+ &wctx->old_extents);
+ b->dirty_blob().mark_used(le->blob_offset, le->length);
+
+ txc->statfs_delta.stored() += le->length;
+ dout(20) << __func__ << " lex " << *le << dendl;
+ logger->inc(l_bluestore_write_small_unused);
+ return;
+ }
+ // read some data to fill out the chunk?
+ uint64_t head_read = p2phase(b_off, chunk_size);
+ uint64_t tail_read = p2nphase(b_off + b_len, chunk_size);
+ if ((head_read || tail_read) &&
+ (b->get_blob().get_ondisk_length() >= b_off + b_len + tail_read) &&
+ head_read + tail_read < min_alloc_size) {
+ b_off -= head_read;
+ b_len += head_read + tail_read;
+
+ } else {
+ head_read = tail_read = 0;
+ }
+
+ // chunk-aligned deferred overwrite?
+ if (b->get_blob().get_ondisk_length() >= b_off + b_len &&
+ b_off % chunk_size == 0 &&
+ b_len % chunk_size == 0 &&
+ b->get_blob().is_allocated(b_off, b_len)) {
+
+ _apply_padding(head_pad, tail_pad, bl);
+
+ dout(20) << __func__ << " reading head 0x" << std::hex << head_read
+ << " and tail 0x" << tail_read << std::dec << dendl;
+ if (head_read) {
+ bufferlist head_bl;
+ int r = _do_read(c.get(), o, offset - head_pad - head_read, head_read,
+ head_bl, 0);
+ ceph_assert(r >= 0 && r <= (int)head_read);
+ size_t zlen = head_read - r;
+ if (zlen) {
+ head_bl.append_zero(zlen);
+ logger->inc(l_bluestore_write_pad_bytes, zlen);
+ }
+ head_bl.claim_append(bl);
+ bl.swap(head_bl);
+ logger->inc(l_bluestore_write_penalty_read_ops);
+ }
+ if (tail_read) {
+ bufferlist tail_bl;
+ int r = _do_read(c.get(), o, offset + length + tail_pad, tail_read,
+ tail_bl, 0);
+ ceph_assert(r >= 0 && r <= (int)tail_read);
+ size_t zlen = tail_read - r;
+ if (zlen) {
+ tail_bl.append_zero(zlen);
+ logger->inc(l_bluestore_write_pad_bytes, zlen);
+ }
+ bl.claim_append(tail_bl);
+ logger->inc(l_bluestore_write_penalty_read_ops);
+ }
+ logger->inc(l_bluestore_write_small_pre_read);
+
+ _buffer_cache_write(txc, b, b_off, bl,
+ wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
+
+ b->dirty_blob().calc_csum(b_off, bl);
+
+ if (!g_conf()->bluestore_debug_omit_block_device_write) {
+ bluestore_deferred_op_t *op = _get_deferred_op(txc, bl.length());
+ op->op = bluestore_deferred_op_t::OP_WRITE;
+ int r = b->get_blob().map(
+ b_off, b_len,
+ [&](uint64_t offset, uint64_t length) {
+ op->extents.emplace_back(bluestore_pextent_t(offset, length));
+ return 0;
+ });
+ ceph_assert(r == 0);
+ op->data = std::move(bl);
+ dout(20) << __func__ << " deferred write 0x" << std::hex << b_off << "~"
+ << b_len << std::dec << " of mutable " << *b
+ << " at " << op->extents << dendl;
+ }
+
+ Extent *le = o->extent_map.set_lextent(c, offset, offset - bstart, length,
+ b, &wctx->old_extents);
+ b->dirty_blob().mark_used(le->blob_offset, le->length);
+ txc->statfs_delta.stored() += le->length;
+ dout(20) << __func__ << " lex " << *le << dendl;
+ return;
+ }
+ // try to reuse blob if we can
+ if (b->can_reuse_blob(min_alloc_size,
+ max_bsize,
+ offset0 - bstart,
+ &alloc_len)) {
+ ceph_assert(alloc_len == min_alloc_size); // expecting data always
+ // fit into reused blob
+ // Need to check for pending writes desiring to
+ // reuse the same pextent. The rationale is that during GC two chunks
+ // from garbage blobs(compressed?) can share logical space within the same
+ // AU. That's in turn might be caused by unaligned len in clone_range2.
+ // Hence the second write will fail in an attempt to reuse blob at
+ // do_alloc_write().
+ if (!wctx->has_conflict(b,
+ offset0,
+ offset0 + alloc_len,
+ min_alloc_size)) {
+
+ // we can't reuse pad_head/pad_tail since they might be truncated
+ // due to existent extents
+ uint64_t b_off = offset - bstart;
+ uint64_t b_off0 = b_off;
+ o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
+
+ // Zero detection -- small block
+ if (!cct->_conf->bluestore_zero_block_detection || !bl.is_zero()) {
+ _pad_zeros(&bl, &b_off0, chunk_size);
+
+ dout(20) << __func__ << " reuse blob " << *b << std::hex
+ << " (0x" << b_off0 << "~" << bl.length() << ")"
+ << " (0x" << b_off << "~" << length << ")"
+ << std::dec << dendl;
+
+ wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
+ false, false);
+ logger->inc(l_bluestore_write_small_unused);
+ } else { // if (bl.is_zero())
+ dout(20) << __func__ << " skip small zero block " << std::hex
+ << " (0x" << b_off0 << "~" << bl.length() << ")"
+ << " (0x" << b_off << "~" << length << ")"
+ << std::dec << dendl;
+ logger->inc(l_bluestore_write_small_skipped);
+ logger->inc(l_bluestore_write_small_skipped_bytes, length);
+ }
+
+ return;
+ }
+ }
+ }
+ ++ep;
+ end_ep = ep;
+ any_change = true;
+ } // if (ep != end && ep->logical_offset < offset + max_bsize)
+
+ // check extent for reuse in reverse order
+ if (prev_ep != end && prev_ep->logical_offset >= min_off) {
+ BlobRef b = prev_ep->blob;
+ if (!above_blob_threshold) {
+ inspected_blobs.insert(&b->get_blob());
+ above_blob_threshold = inspected_blobs.size() >= blob_threshold;
+ }
+ start_ep = prev_ep;
+ auto bstart = prev_ep->blob_start();
+ dout(20) << __func__ << " considering " << *b
+ << " bstart 0x" << std::hex << bstart << std::dec << dendl;
+ if (b->can_reuse_blob(min_alloc_size,
+ max_bsize,
+ offset0 - bstart,
+ &alloc_len)) {
+ ceph_assert(alloc_len == min_alloc_size); // expecting data always
+ // fit into reused blob
+ // Need to check for pending writes desiring to
+ // reuse the same pextent. The rationale is that during GC two chunks
+ // from garbage blobs(compressed?) can share logical space within the same
+ // AU. That's in turn might be caused by unaligned len in clone_range2.
+ // Hence the second write will fail in an attempt to reuse blob at
+ // do_alloc_write().
+ if (!wctx->has_conflict(b,
+ offset0,
+ offset0 + alloc_len,
+ min_alloc_size)) {
+
+ uint64_t b_off = offset - bstart;
+ uint64_t b_off0 = b_off;
+ o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
+
+ // Zero detection -- small block
+ if (!cct->_conf->bluestore_zero_block_detection || !bl.is_zero()) {
+ uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
+ _pad_zeros(&bl, &b_off0, chunk_size);
+
+ dout(20) << __func__ << " reuse blob " << *b << std::hex
+ << " (0x" << b_off0 << "~" << bl.length() << ")"
+ << " (0x" << b_off << "~" << length << ")"
+ << std::dec << dendl;
+
+ wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
+ false, false);
+ logger->inc(l_bluestore_write_small_unused);
+ } else { // if (bl.is_zero())
+ dout(20) << __func__ << " skip small zero block " << std::hex
+ << " (0x" << b_off0 << "~" << bl.length() << ")"
+ << " (0x" << b_off << "~" << length << ")"
+ << std::dec << dendl;
+ logger->inc(l_bluestore_write_small_skipped);
+ logger->inc(l_bluestore_write_small_skipped_bytes, length);
+ }
+
+ return;
+ }
+ }
+ if (prev_ep != begin) {
+ --prev_ep;
+ any_change = true;
+ } else {
+ prev_ep = end; // to avoid useless first extent re-check
+ }
+ } // if (prev_ep != end && prev_ep->logical_offset >= min_off)
+ } while (any_change);
+
+ if (above_blob_threshold) {
+ dout(10) << __func__ << " request GC, blobs >= " << inspected_blobs.size()
+ << " " << std::hex << min_off << "~" << max_off << std::dec
+ << dendl;
+ ceph_assert(start_ep != end_ep);
+ for (auto ep = start_ep; ep != end_ep; ++ep) {
+ dout(20) << __func__ << " inserting for GC "
+ << std::hex << ep->logical_offset << "~" << ep->length
+ << std::dec << dendl;
+
+ wctx->extents_to_gc.union_insert(ep->logical_offset, ep->length);
+ }
+ // insert newly written extent to GC
+ wctx->extents_to_gc.union_insert(offset, length);
+ dout(20) << __func__ << " inserting (last) for GC "
+ << std::hex << offset << "~" << length
+ << std::dec << dendl;
+ }
+ uint64_t b_off = p2phase<uint64_t>(offset, alloc_len);
+ uint64_t b_off0 = b_off;
+ o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
+
+ // Zero detection -- small block
+ if (!cct->_conf->bluestore_zero_block_detection || !bl.is_zero()) {
+ // new blob.
+ BlobRef b = c->new_blob();
+ _pad_zeros(&bl, &b_off0, block_size);
+ wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
+ min_alloc_size != block_size, // use 'unused' bitmap when alloc granularity
+ // doesn't match disk one only
+ true);
+ } else { // if (bl.is_zero())
+ dout(20) << __func__ << " skip small zero block " << std::hex
+ << " (0x" << b_off0 << "~" << bl.length() << ")"
+ << " (0x" << b_off << "~" << length << ")"
+ << std::dec << dendl;
+ logger->inc(l_bluestore_write_small_skipped);
+ logger->inc(l_bluestore_write_small_skipped_bytes, length);
+ }
+
+ return;
+}
+
+bool BlueStore::BigDeferredWriteContext::can_defer(
+ BlueStore::extent_map_t::iterator ep,
+ uint64_t prefer_deferred_size,
+ uint64_t block_size,
+ uint64_t offset,
+ uint64_t l)
+{
+ bool res = false;
+ auto& blob = ep->blob->get_blob();
+ if (offset >= ep->blob_start() &&
+ blob.is_mutable()) {
+ off = offset;
+ b_off = offset - ep->blob_start();
+ uint64_t chunk_size = blob.get_chunk_size(block_size);
+ uint64_t ondisk = blob.get_ondisk_length();
+ used = std::min(l, ondisk - b_off);
+
+ // will read some data to fill out the chunk?
+ head_read = p2phase<uint64_t>(b_off, chunk_size);
+ tail_read = p2nphase<uint64_t>(b_off + used, chunk_size);
+ b_off -= head_read;
+
+ ceph_assert(b_off % chunk_size == 0);
+ ceph_assert(blob_aligned_len() % chunk_size == 0);
+
+ res = blob_aligned_len() < prefer_deferred_size &&
+ blob_aligned_len() <= ondisk &&
+ blob.is_allocated(b_off, blob_aligned_len());
+ if (res) {
+ blob_ref = ep->blob;
+ blob_start = ep->blob_start();
+ }
+ }
+ return res;
+}
+
+bool BlueStore::BigDeferredWriteContext::apply_defer()
+{
+ int r = blob_ref->get_blob().map(
+ b_off, blob_aligned_len(),
+ [&](const bluestore_pextent_t& pext,
+ uint64_t offset,
+ uint64_t length) {
+ // apply deferred if overwrite breaks blob continuity only.
+ // if it totally overlaps some pextent - fallback to regular write
+ if (pext.offset < offset ||
+ pext.end() > offset + length) {
+ res_extents.emplace_back(bluestore_pextent_t(offset, length));
+ return 0;
+ }
+ return -1;
+ });
+ return r >= 0;
+}
+
+void BlueStore::_do_write_big_apply_deferred(
+ TransContext* txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ BlueStore::BigDeferredWriteContext& dctx,
+ bufferlist::iterator& blp,
+ WriteContext* wctx)
+{
+ bufferlist bl;
+ dout(20) << __func__ << " reading head 0x" << std::hex << dctx.head_read
+ << " and tail 0x" << dctx.tail_read << std::dec << dendl;
+ if (dctx.head_read) {
+ int r = _do_read(c.get(), o,
+ dctx.off - dctx.head_read,
+ dctx.head_read,
+ bl,
+ 0);
+ ceph_assert(r >= 0 && r <= (int)dctx.head_read);
+ size_t zlen = dctx.head_read - r;
+ if (zlen) {
+ bl.append_zero(zlen);
+ logger->inc(l_bluestore_write_pad_bytes, zlen);
+ }
+ logger->inc(l_bluestore_write_penalty_read_ops);
+ }
+ blp.copy(dctx.used, bl);
+
+ if (dctx.tail_read) {
+ bufferlist tail_bl;
+ int r = _do_read(c.get(), o,
+ dctx.off + dctx.used, dctx.tail_read,
+ tail_bl, 0);
+ ceph_assert(r >= 0 && r <= (int)dctx.tail_read);
+ size_t zlen = dctx.tail_read - r;
+ if (zlen) {
+ tail_bl.append_zero(zlen);
+ logger->inc(l_bluestore_write_pad_bytes, zlen);
+ }
+ bl.claim_append(tail_bl);
+ logger->inc(l_bluestore_write_penalty_read_ops);
+ }
+ auto& b0 = dctx.blob_ref;
+ _buffer_cache_write(txc, b0, dctx.b_off, bl,
+ wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
+
+ b0->dirty_blob().calc_csum(dctx.b_off, bl);
+
+ Extent* le = o->extent_map.set_lextent(c, dctx.off,
+ dctx.off - dctx.blob_start, dctx.used, b0, &wctx->old_extents);
+
+ // in fact this is a no-op for big writes but left here to maintain
+ // uniformity and avoid missing after some refactor.
+ b0->dirty_blob().mark_used(le->blob_offset, le->length);
+ txc->statfs_delta.stored() += le->length;
+
+ if (!g_conf()->bluestore_debug_omit_block_device_write) {
+ bluestore_deferred_op_t* op = _get_deferred_op(txc, bl.length());
+ op->op = bluestore_deferred_op_t::OP_WRITE;
+ op->extents.swap(dctx.res_extents);
+ op->data = std::move(bl);
+ }
+}
+
+void BlueStore::_do_write_big(
+ TransContext *txc,
+ CollectionRef &c,
+ OnodeRef& o,
+ uint64_t offset, uint64_t length,
+ bufferlist::iterator& blp,
+ WriteContext *wctx)
+{
+ dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
+ << " target_blob_size 0x" << wctx->target_blob_size << std::dec
+ << " compress " << (int)wctx->compress
+ << dendl;
+ logger->inc(l_bluestore_write_big);
+ logger->inc(l_bluestore_write_big_bytes, length);
+ auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
+ uint64_t prefer_deferred_size_snapshot = prefer_deferred_size.load();
+ while (length > 0) {
+ bool new_blob = false;
+ BlobRef b;
+ uint32_t b_off = 0;
+ uint32_t l = 0;
+
+ //attempting to reuse existing blob
+ if (!wctx->compress) {
+ // enforce target blob alignment with max_bsize
+ l = max_bsize - p2phase(offset, max_bsize);
+ l = std::min(uint64_t(l), length);
+
+ auto end = o->extent_map.extent_map.end();
+
+ dout(20) << __func__ << " may be defer: 0x" << std::hex
+ << offset << "~" << l
+ << std::dec << dendl;
+
+ if (prefer_deferred_size_snapshot &&
+ l <= prefer_deferred_size_snapshot * 2) {
+ // Single write that spans two adjusted existing blobs can result
+ // in up to two deferred blocks of 'prefer_deferred_size'
+ // So we're trying to minimize the amount of resulting blobs
+ // and preserve 2 blobs rather than inserting one more in between
+ // E.g. write 0x10000~20000 over existing blobs
+ // (0x0~20000 and 0x20000~20000) is better (from subsequent reading
+ // performance point of view) to result in two deferred writes to
+ // existing blobs than having 3 blobs: 0x0~10000, 0x10000~20000, 0x30000~10000
+
+ // look for an existing mutable blob we can write into
+ auto ep = o->extent_map.seek_lextent(offset);
+ auto ep_next = end;
+ BigDeferredWriteContext head_info, tail_info;
+
+ bool will_defer = ep != end ?
+ head_info.can_defer(ep,
+ prefer_deferred_size_snapshot,
+ block_size,
+ offset,
+ l) :
+ false;
+ auto offset_next = offset + head_info.used;
+ auto remaining = l - head_info.used;
+ if (will_defer && remaining) {
+ will_defer = false;
+ if (remaining <= prefer_deferred_size_snapshot) {
+ ep_next = o->extent_map.seek_lextent(offset_next);
+ // check if we can defer remaining totally
+ will_defer = ep_next == end ?
+ false :
+ tail_info.can_defer(ep_next,
+ prefer_deferred_size_snapshot,
+ block_size,
+ offset_next,
+ remaining);
+ will_defer = will_defer && remaining == tail_info.used;
+ }
+ }
+ if (will_defer) {
+ dout(20) << __func__ << " " << *(head_info.blob_ref)
+ << " deferring big " << std::hex
+ << " (0x" << head_info.b_off << "~" << head_info.blob_aligned_len() << ")"
+ << std::dec << " write via deferred"
+ << dendl;
+ if (remaining) {
+ dout(20) << __func__ << " " << *(tail_info.blob_ref)
+ << " deferring big " << std::hex
+ << " (0x" << tail_info.b_off << "~" << tail_info.blob_aligned_len() << ")"
+ << std::dec << " write via deferred"
+ << dendl;
+ }
+
+ will_defer = head_info.apply_defer();
+ if (!will_defer) {
+ dout(20) << __func__
+ << " deferring big fell back, head isn't continuous"
+ << dendl;
+ } else if (remaining) {
+ will_defer = tail_info.apply_defer();
+ if (!will_defer) {
+ dout(20) << __func__
+ << " deferring big fell back, tail isn't continuous"
+ << dendl;
+ }
+ }
+ }
+ if (will_defer) {
+ _do_write_big_apply_deferred(txc, c, o, head_info, blp, wctx);
+ if (remaining) {
+ _do_write_big_apply_deferred(txc, c, o, tail_info,
+ blp, wctx);
+ }
+ dout(20) << __func__ << " defer big: 0x" << std::hex
+ << offset << "~" << l
+ << std::dec << dendl;
+ offset += l;
+ length -= l;
+ logger->inc(l_bluestore_write_big_blobs, remaining ? 2 : 1);
+ logger->inc(l_bluestore_write_big_deferred, remaining ? 2 : 1);
+ continue;
+ }
+ }
+ dout(20) << __func__ << " lookup for blocks to reuse..." << dendl;
+
+ o->extent_map.punch_hole(c, offset, l, &wctx->old_extents);
+
+ // seek again as punch_hole could invalidate ep
+ auto ep = o->extent_map.seek_lextent(offset);
+ auto begin = o->extent_map.extent_map.begin();
+ auto prev_ep = end;
+ if (ep != begin) {
+ prev_ep = ep;
+ --prev_ep;
+ }
+
+ auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
+ // search suitable extent in both forward and reverse direction in
+ // [offset - target_max_blob_size, offset + target_max_blob_size] range
+ // then check if blob can be reused via can_reuse_blob func.
+ bool any_change;
+ do {
+ any_change = false;
+ if (ep != end && ep->logical_offset < offset + max_bsize) {
+ dout(20) << __func__ << " considering " << *ep
+ << " bstart 0x" << std::hex << ep->blob_start() << std::dec << dendl;
+
+ if (offset >= ep->blob_start() &&
+ ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
+ offset - ep->blob_start(),
+ &l)) {
+ b = ep->blob;
+ b_off = offset - ep->blob_start();
+ prev_ep = end; // to avoid check below
+ dout(20) << __func__ << " reuse blob " << *b << std::hex
+ << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
+ } else {
+ ++ep;
+ any_change = true;
+ }
+ }
+
+ if (prev_ep != end && prev_ep->logical_offset >= min_off) {
+ dout(20) << __func__ << " considering rev " << *prev_ep
+ << " bstart 0x" << std::hex << prev_ep->blob_start() << std::dec << dendl;
+ if (prev_ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
+ offset - prev_ep->blob_start(),
+ &l)) {
+ b = prev_ep->blob;
+ b_off = offset - prev_ep->blob_start();
+ dout(20) << __func__ << " reuse blob " << *b << std::hex
+ << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
+ } else if (prev_ep != begin) {
+ --prev_ep;
+ any_change = true;
+ } else {
+ prev_ep = end; // to avoid useless first extent re-check
+ }
+ }
+ } while (b == nullptr && any_change);
+ } else {
+ // trying to utilize as longer chunk as permitted in case of compression.
+ l = std::min(max_bsize, length);
+ o->extent_map.punch_hole(c, offset, l, &wctx->old_extents);
+ } // if (!wctx->compress)
+
+ if (b == nullptr) {
+ b = c->new_blob();
+ b_off = 0;
+ new_blob = true;
+ }
+ bufferlist t;
+ blp.copy(l, t);
+
+ // Zero detection -- big block
+ if (!cct->_conf->bluestore_zero_block_detection || !t.is_zero()) {
+ wctx->write(offset, b, l, b_off, t, b_off, l, false, new_blob);
+
+ dout(20) << __func__ << " schedule write big: 0x"
+ << std::hex << offset << "~" << l << std::dec
+ << (new_blob ? " new " : " reuse ")
+ << *b << dendl;
+
+ logger->inc(l_bluestore_write_big_blobs);
+ } else { // if (!t.is_zero())
+ dout(20) << __func__ << " skip big zero block " << std::hex
+ << " (0x" << b_off << "~" << t.length() << ")"
+ << " (0x" << b_off << "~" << l << ")"
+ << std::dec << dendl;
+ logger->inc(l_bluestore_write_big_skipped_blobs);
+ logger->inc(l_bluestore_write_big_skipped_bytes, l);
+ }
+
+ offset += l;
+ length -= l;
+ }
+}
+
+int BlueStore::_do_alloc_write(
+ TransContext *txc,
+ CollectionRef coll,
+ OnodeRef& o,
+ WriteContext *wctx)
+{
+ dout(20) << __func__ << " txc " << txc
+ << " " << wctx->writes.size() << " blobs"
+ << dendl;
+ if (wctx->writes.empty()) {
+ return 0;
+ }
+
+ CompressorRef c;
+ double crr = 0;
+ if (wctx->compress) {
+ c = select_option(
+ "compression_algorithm",
+ compressor,
+ [&]() {
+ string val;
+ if (coll->pool_opts.get(pool_opts_t::COMPRESSION_ALGORITHM, &val)) {
+ CompressorRef cp = compressor;
+ if (!cp || cp->get_type_name() != val) {
+ cp = Compressor::create(cct, val);
+ if (!cp) {
+ if (_set_compression_alert(false, val.c_str())) {
+ derr << __func__ << " unable to initialize " << val.c_str()
+ << " compressor" << dendl;
+ }
+ }
+ }
+ return std::optional<CompressorRef>(cp);
+ }
+ return std::optional<CompressorRef>();
+ }
+ );
+
+ crr = select_option(
+ "compression_required_ratio",
+ cct->_conf->bluestore_compression_required_ratio,
+ [&]() {
+ double val;
+ if (coll->pool_opts.get(pool_opts_t::COMPRESSION_REQUIRED_RATIO, &val)) {
+ return std::optional<double>(val);
+ }
+ return std::optional<double>();
+ }
+ );
+ }
+
+ // checksum
+ int64_t csum = csum_type.load();
+ csum = select_option(
+ "csum_type",
+ csum,
+ [&]() {
+ int64_t val;
+ if (coll->pool_opts.get(pool_opts_t::CSUM_TYPE, &val)) {
+ return std::optional<int64_t>(val);
+ }
+ return std::optional<int64_t>();
+ }
+ );
+
+ // compress (as needed) and calc needed space
+ uint64_t need = 0;
+ uint64_t data_size = 0;
+ // 'need' is amount of space that must be provided by allocator.
+ // 'data_size' is a size of data that will be transferred to disk.
+ // Note that data_size is always <= need. This comes from:
+ // - write to blob was unaligned, and there is free space
+ // - data has been compressed
+ //
+ // We make one decision and apply it to all blobs.
+ // All blobs will be deferred or none will.
+ // We assume that allocator does its best to provide contiguous space,
+ // and the condition is : (data_size < deferred).
+
+ auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
+ for (auto& wi : wctx->writes) {
+ if (c && wi.blob_length > min_alloc_size) {
+ auto start = mono_clock::now();
+
+ // compress
+ ceph_assert(wi.b_off == 0);
+ ceph_assert(wi.blob_length == wi.bl.length());
+
+ // FIXME: memory alignment here is bad
+ bufferlist t;
+ std::optional<int32_t> compressor_message;
+ int r = c->compress(wi.bl, t, compressor_message);
+ uint64_t want_len_raw = wi.blob_length * crr;
+ uint64_t want_len = p2roundup(want_len_raw, min_alloc_size);
+ bool rejected = false;
+ uint64_t compressed_len = t.length();
+ // do an approximate (fast) estimation for resulting blob size
+ // that doesn't take header overhead into account
+ uint64_t result_len = p2roundup(compressed_len, min_alloc_size);
+ if (r == 0 && result_len <= want_len && result_len < wi.blob_length) {
+ bluestore_compression_header_t chdr;
+ chdr.type = c->get_type();
+ chdr.length = t.length();
+ chdr.compressor_message = compressor_message;
+ encode(chdr, wi.compressed_bl);
+ wi.compressed_bl.claim_append(t);
+
+ compressed_len = wi.compressed_bl.length();
+ result_len = p2roundup(compressed_len, min_alloc_size);
+ if (result_len <= want_len && result_len < wi.blob_length) {
+ // Cool. We compressed at least as much as we were hoping to.
+ // pad out to min_alloc_size
+ wi.compressed_bl.append_zero(result_len - compressed_len);
+ wi.compressed_len = compressed_len;
+ wi.compressed = true;
+ logger->inc(l_bluestore_write_pad_bytes, result_len - compressed_len);
+ dout(20) << __func__ << std::hex << " compressed 0x" << wi.blob_length
+ << " -> 0x" << compressed_len << " => 0x" << result_len
+ << " with " << c->get_type()
+ << std::dec << dendl;
+ txc->statfs_delta.compressed() += compressed_len;
+ txc->statfs_delta.compressed_original() += wi.blob_length;
+ txc->statfs_delta.compressed_allocated() += result_len;
+ logger->inc(l_bluestore_compress_success_count);
+ need += result_len;
+ data_size += result_len;
+ } else {
+ rejected = true;
+ }
+ } else if (r != 0) {
+ dout(5) << __func__ << std::hex << " 0x" << wi.blob_length
+ << " bytes compressed using " << c->get_type_name()
+ << std::dec
+ << " failed with errcode = " << r
+ << ", leaving uncompressed"
+ << dendl;
+ logger->inc(l_bluestore_compress_rejected_count);
+ need += wi.blob_length;
+ data_size += wi.bl.length();
+ } else {
+ rejected = true;
+ }
+
+ if (rejected) {
+ dout(20) << __func__ << std::hex << " 0x" << wi.blob_length
+ << " compressed to 0x" << compressed_len << " -> 0x" << result_len
+ << " with " << c->get_type()
+ << ", which is more than required 0x" << want_len_raw
+ << " -> 0x" << want_len
+ << ", leaving uncompressed"
+ << std::dec << dendl;
+ logger->inc(l_bluestore_compress_rejected_count);
+ need += wi.blob_length;
+ data_size += wi.bl.length();
+ }
+ log_latency("compress@_do_alloc_write",
+ l_bluestore_compress_lat,
+ mono_clock::now() - start,
+ cct->_conf->bluestore_log_op_age );
+ } else {
+ need += wi.blob_length;
+ data_size += wi.bl.length();
+ }
+ }
+ PExtentVector prealloc;
+ prealloc.reserve(2 * wctx->writes.size());
+ int64_t prealloc_left = 0;
+ prealloc_left = alloc->allocate(
+ need, min_alloc_size, need,
+ 0, &prealloc);
+ if (prealloc_left < 0 || prealloc_left < (int64_t)need) {
+ derr << __func__ << " failed to allocate 0x" << std::hex << need
+ << " allocated 0x " << (prealloc_left < 0 ? 0 : prealloc_left)
+ << " min_alloc_size 0x" << min_alloc_size
+ << " available 0x " << alloc->get_free()
+ << std::dec << dendl;
+ if (prealloc.size()) {
+ alloc->release(prealloc);
+ }
+ return -ENOSPC;
+ }
+ _collect_allocation_stats(need, min_alloc_size, prealloc);
+
+ dout(20) << __func__ << std::hex << " need=0x" << need << " data=0x" << data_size
+ << " prealloc " << prealloc << dendl;
+ auto prealloc_pos = prealloc.begin();
+ ceph_assert(prealloc_pos != prealloc.end());
+
+ for (auto& wi : wctx->writes) {
+ bluestore_blob_t& dblob = wi.b->dirty_blob();
+ uint64_t b_off = wi.b_off;
+ bufferlist *l = &wi.bl;
+ uint64_t final_length = wi.blob_length;
+ uint64_t csum_length = wi.blob_length;
+ if (wi.compressed) {
+ final_length = wi.compressed_bl.length();
+ csum_length = final_length;
+ unsigned csum_order = std::countr_zero(csum_length);
+ l = &wi.compressed_bl;
+ dblob.set_compressed(wi.blob_length, wi.compressed_len);
+ if (csum != Checksummer::CSUM_NONE) {
+ dout(20) << __func__
+ << " initialize csum setting for compressed blob " << *wi.b
+ << " csum_type " << Checksummer::get_csum_type_string(csum)
+ << " csum_order " << csum_order
+ << " csum_length 0x" << std::hex << csum_length
+ << " blob_length 0x" << wi.blob_length
+ << " compressed_length 0x" << wi.compressed_len << std::dec
+ << dendl;
+ dblob.init_csum(csum, csum_order, csum_length);
+ }
+ } else if (wi.new_blob) {
+ unsigned csum_order;
+ // initialize newly created blob only
+ ceph_assert(dblob.is_mutable());
+ if (l->length() != wi.blob_length) {
+ // hrm, maybe we could do better here, but let's not bother.
+ dout(20) << __func__ << " forcing csum_order to block_size_order "
+ << block_size_order << dendl;
+ csum_order = block_size_order;
+ } else {
+ csum_order = std::min<unsigned>(wctx->csum_order, std::countr_zero(l->length()));
+ }
+ // try to align blob with max_blob_size to improve
+ // its reuse ratio, e.g. in case of reverse write
+ uint32_t suggested_boff =
+ (wi.logical_offset - (wi.b_off0 - wi.b_off)) % max_bsize;
+ if ((suggested_boff % (1 << csum_order)) == 0 &&
+ suggested_boff + final_length <= max_bsize &&
+ suggested_boff > b_off) {
+ dout(20) << __func__ << " forcing blob_offset to 0x"
+ << std::hex << suggested_boff << std::dec << dendl;
+ ceph_assert(suggested_boff >= b_off);
+ csum_length += suggested_boff - b_off;
+ b_off = suggested_boff;
+ }
+ if (csum != Checksummer::CSUM_NONE) {
+ dout(20) << __func__
+ << " initialize csum setting for new blob " << *wi.b
+ << " csum_type " << Checksummer::get_csum_type_string(csum)
+ << " csum_order " << csum_order
+ << " csum_length 0x" << std::hex << csum_length << std::dec
+ << dendl;
+ dblob.init_csum(csum, csum_order, csum_length);
+ }
+ }
+
+ PExtentVector extents;
+ int64_t left = final_length;
+ auto prefer_deferred_size_snapshot = prefer_deferred_size.load();
+ while (left > 0) {
+ ceph_assert(prealloc_left > 0);
+ if (prealloc_pos->length <= left) {
+ prealloc_left -= prealloc_pos->length;
+ left -= prealloc_pos->length;
+ txc->statfs_delta.allocated() += prealloc_pos->length;
+ extents.push_back(*prealloc_pos);
+ ++prealloc_pos;
+ } else {
+ extents.emplace_back(prealloc_pos->offset, left);
+ prealloc_pos->offset += left;
+ prealloc_pos->length -= left;
+ prealloc_left -= left;
+ txc->statfs_delta.allocated() += left;
+ left = 0;
+ break;
+ }
+ }
+ for (auto& p : extents) {
+ txc->allocated.insert(p.offset, p.length);
+ }
+ dblob.allocated(p2align(b_off, min_alloc_size), final_length, extents);
+
+ dout(20) << __func__ << " blob " << *wi.b << dendl;
+ if (dblob.has_csum()) {
+ dblob.calc_csum(b_off, *l);
+ }
+
+ if (wi.mark_unused) {
+ ceph_assert(!dblob.is_compressed());
+ auto b_end = b_off + wi.bl.length();
+ if (b_off) {
+ dblob.add_unused(0, b_off);
+ }
+ uint64_t llen = dblob.get_logical_length();
+ if (b_end < llen) {
+ dblob.add_unused(b_end, llen - b_end);
+ }
+ }
+
+ Extent *le = o->extent_map.set_lextent(coll, wi.logical_offset,
+ b_off + (wi.b_off0 - wi.b_off),
+ wi.length0,
+ wi.b,
+ nullptr);
+ wi.b->dirty_blob().mark_used(le->blob_offset, le->length);
+ txc->statfs_delta.stored() += le->length;
+ dout(20) << __func__ << " lex " << *le << dendl;
+ _buffer_cache_write(txc, wi.b, b_off, wi.bl,
+ wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
+
+ // queue io
+ if (!g_conf()->bluestore_debug_omit_block_device_write) {
+ if (data_size < prefer_deferred_size_snapshot) {
+ dout(20) << __func__ << " deferring 0x" << std::hex
+ << l->length() << std::dec << " write via deferred" << dendl;
+ bluestore_deferred_op_t *op = _get_deferred_op(txc, l->length());
+ op->op = bluestore_deferred_op_t::OP_WRITE;
+ int r = wi.b->get_blob().map(
+ b_off, l->length(),
+ [&](uint64_t offset, uint64_t length) {
+ op->extents.emplace_back(bluestore_pextent_t(offset, length));
+ return 0;
+ });
+ ceph_assert(r == 0);
+ op->data = *l;
+ } else {
+ wi.b->get_blob().map_bl(
+ b_off, *l,
+ [&](uint64_t offset, bufferlist& t) {
+ bdev->aio_write(offset, t, &txc->ioc, false);
+ });
+ logger->inc(l_bluestore_write_new);
+ }
+ }
+ }
+ ceph_assert(prealloc_pos == prealloc.end());
+ ceph_assert(prealloc_left == 0);
+ return 0;
+}
+
+void BlueStore::_wctx_finish(
+ TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ WriteContext *wctx,
+ set<SharedBlob*> *maybe_unshared_blobs)
+{
+#ifdef HAVE_LIBZBD
+ if (bdev->is_smr()) {
+ for (auto& w : wctx->writes) {
+ for (auto& e : w.b->get_blob().get_extents()) {
+ if (!e.is_valid()) {
+ continue;
+ }
+ uint32_t zone = e.offset / zone_size;
+ if (!o->onode.zone_offset_refs.count(zone)) {
+ uint64_t zoff = e.offset % zone_size;
+ dout(20) << __func__ << " add ref zone 0x" << std::hex << zone
+ << " offset 0x" << zoff << std::dec << dendl;
+ txc->note_write_zone_offset(o, zone, zoff);
+ }
+ }
+ }
+ }
+ set<uint32_t> zones_with_releases;
+#endif
+
+ auto oep = wctx->old_extents.begin();
+ while (oep != wctx->old_extents.end()) {
+ auto &lo = *oep;
+ oep = wctx->old_extents.erase(oep);
+ dout(20) << __func__ << " lex_old " << lo.e << dendl;
+ BlobRef b = lo.e.blob;
+ const bluestore_blob_t& blob = b->get_blob();
+ if (blob.is_compressed()) {
+ if (lo.blob_empty) {
+ txc->statfs_delta.compressed() -= blob.get_compressed_payload_length();
+ }
+ txc->statfs_delta.compressed_original() -= lo.e.length;
+ }
+ auto& r = lo.r;
+ txc->statfs_delta.stored() -= lo.e.length;
+ if (!r.empty()) {
+ dout(20) << __func__ << " blob " << *b << " release " << r << dendl;
+ if (blob.is_shared()) {
+ PExtentVector final;
+ c->load_shared_blob(b->shared_blob);
+ bool unshare = false;
+ bool* unshare_ptr =
+ !maybe_unshared_blobs || b->is_referenced() ? nullptr : &unshare;
+ for (auto e : r) {
+ b->shared_blob->put_ref(
+ e.offset, e.length, &final,
+ unshare_ptr);
+#ifdef HAVE_LIBZBD
+ // we also drop zone ref for shared blob extents
+ if (bdev->is_smr() && e.is_valid()) {
+ zones_with_releases.insert(e.offset / zone_size);
+ }
+#endif
+ }
+ if (unshare) {
+ ceph_assert(maybe_unshared_blobs);
+ maybe_unshared_blobs->insert(b->shared_blob.get());
+ }
+ dout(20) << __func__ << " shared_blob release " << final
+ << " from " << *b->shared_blob << dendl;
+ txc->write_shared_blob(b->shared_blob);
+ r.clear();
+ r.swap(final);
+ }
+ }
+ // we can't invalidate our logical extents as we drop them because
+ // other lextents (either in our onode or others) may still
+ // reference them. but we can throw out anything that is no
+ // longer allocated. Note that this will leave behind edge bits
+ // that are no longer referenced but not deallocated (until they
+ // age out of the cache naturally).
+ b->discard_unallocated(c.get());
+ for (auto e : r) {
+ dout(20) << __func__ << " release " << e << dendl;
+ txc->released.insert(e.offset, e.length);
+ txc->statfs_delta.allocated() -= e.length;
+ if (blob.is_compressed()) {
+ txc->statfs_delta.compressed_allocated() -= e.length;
+ }
+#ifdef HAVE_LIBZBD
+ if (bdev->is_smr() && e.is_valid()) {
+ zones_with_releases.insert(e.offset / zone_size);
+ }
+#endif
+ }
+
+ if (b->is_spanning() && !b->is_referenced() && lo.blob_empty) {
+ dout(20) << __func__ << " spanning_blob_map removing empty " << *b
+ << dendl;
+ o->extent_map.spanning_blob_map.erase(b->id);
+ }
+ delete &lo;
+ }
+
+#ifdef HAVE_LIBZBD
+ if (!zones_with_releases.empty()) {
+ // we need to fault the entire extent range in here to determinte if we've dropped
+ // all refs to a zone.
+ o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
+ for (auto& b : o->extent_map.extent_map) {
+ for (auto& e : b.blob->get_blob().get_extents()) {
+ if (e.is_valid()) {
+ zones_with_releases.erase(e.offset / zone_size);
+ }
+ }
+ }
+ for (auto zone : zones_with_releases) {
+ auto p = o->onode.zone_offset_refs.find(zone);
+ if (p != o->onode.zone_offset_refs.end()) {
+ dout(20) << __func__ << " rm ref zone 0x" << std::hex << zone
+ << " offset 0x" << p->second << std::dec << dendl;
+ txc->note_release_zone_offset(o, zone, p->second);
+ }
+ }
+ }
+#endif
+}
+
+void BlueStore::_do_write_data(
+ TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ uint64_t offset,
+ uint64_t length,
+ bufferlist& bl,
+ WriteContext *wctx)
+{
+ uint64_t end = offset + length;
+ bufferlist::iterator p = bl.begin();
+
+ if (offset / min_alloc_size == (end - 1) / min_alloc_size &&
+ (length != min_alloc_size)) {
+ // we fall within the same block
+ _do_write_small(txc, c, o, offset, length, p, wctx);
+ } else {
+ uint64_t head_offset, head_length;
+ uint64_t middle_offset, middle_length;
+ uint64_t tail_offset, tail_length;
+
+ head_offset = offset;
+ head_length = p2nphase(offset, min_alloc_size);
+
+ tail_offset = p2align(end, min_alloc_size);
+ tail_length = p2phase(end, min_alloc_size);
+
+ middle_offset = head_offset + head_length;
+ middle_length = length - head_length - tail_length;
+
+ if (head_length) {
+ _do_write_small(txc, c, o, head_offset, head_length, p, wctx);
+ }
+
+ _do_write_big(txc, c, o, middle_offset, middle_length, p, wctx);
+
+ if (tail_length) {
+ _do_write_small(txc, c, o, tail_offset, tail_length, p, wctx);
+ }
+ }
+}
+
+void BlueStore::_choose_write_options(
+ CollectionRef& c,
+ OnodeRef& o,
+ uint32_t fadvise_flags,
+ WriteContext *wctx)
+{
+ if (fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
+ dout(20) << __func__ << " will do buffered write" << dendl;
+ wctx->buffered = true;
+ } else if (cct->_conf->bluestore_default_buffered_write &&
+ (fadvise_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
+ CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
+ dout(20) << __func__ << " defaulting to buffered write" << dendl;
+ wctx->buffered = true;
+ }
+
+ // apply basic csum block size
+ wctx->csum_order = block_size_order;
+
+ // compression parameters
+ unsigned alloc_hints = o->onode.alloc_hint_flags;
+ auto cm = select_option(
+ "compression_mode",
+ comp_mode.load(),
+ [&]() {
+ string val;
+ if (c->pool_opts.get(pool_opts_t::COMPRESSION_MODE, &val)) {
+ return std::optional<Compressor::CompressionMode>(
+ Compressor::get_comp_mode_type(val));
+ }
+ return std::optional<Compressor::CompressionMode>();
+ }
+ );
+
+ wctx->compress = (cm != Compressor::COMP_NONE) &&
+ ((cm == Compressor::COMP_FORCE) ||
+ (cm == Compressor::COMP_AGGRESSIVE &&
+ (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE) == 0) ||
+ (cm == Compressor::COMP_PASSIVE &&
+ (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE)));
+
+ if ((alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ) &&
+ (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ) == 0 &&
+ (alloc_hints & (CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE |
+ CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY)) &&
+ (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE) == 0) {
+
+ dout(20) << __func__ << " will prefer large blob and csum sizes" << dendl;
+
+ if (o->onode.expected_write_size) {
+ wctx->csum_order = std::max(min_alloc_size_order,
+ (uint8_t)std::countr_zero(o->onode.expected_write_size));
+ } else {
+ wctx->csum_order = min_alloc_size_order;
+ }
+
+ if (wctx->compress) {
+ wctx->target_blob_size = select_option(
+ "compression_max_blob_size",
+ comp_max_blob_size.load(),
+ [&]() {
+ int64_t val;
+ if (c->pool_opts.get(pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, &val)) {
+ return std::optional<uint64_t>((uint64_t)val);
+ }
+ return std::optional<uint64_t>();
+ }
+ );
+ }
+ } else {
+ if (wctx->compress) {
+ wctx->target_blob_size = select_option(
+ "compression_min_blob_size",
+ comp_min_blob_size.load(),
+ [&]() {
+ int64_t val;
+ if (c->pool_opts.get(pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, &val)) {
+ return std::optional<uint64_t>((uint64_t)val);
+ }
+ return std::optional<uint64_t>();
+ }
+ );
+ }
+ }
+
+ uint64_t max_bsize = max_blob_size.load();
+ if (wctx->target_blob_size == 0 || wctx->target_blob_size > max_bsize) {
+ wctx->target_blob_size = max_bsize;
+ }
+
+ // set the min blob size floor at 2x the min_alloc_size, or else we
+ // won't be able to allocate a smaller extent for the compressed
+ // data.
+ if (wctx->compress &&
+ wctx->target_blob_size < min_alloc_size * 2) {
+ wctx->target_blob_size = min_alloc_size * 2;
+ }
+
+ dout(20) << __func__ << " prefer csum_order " << wctx->csum_order
+ << " target_blob_size 0x" << std::hex << wctx->target_blob_size
+ << " compress=" << (int)wctx->compress
+ << " buffered=" << (int)wctx->buffered
+ << std::dec << dendl;
+}
+
+int BlueStore::_do_gc(
+ TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ const WriteContext& wctx,
+ uint64_t *dirty_start,
+ uint64_t *dirty_end)
+{
+
+ bool dirty_range_updated = false;
+ WriteContext wctx_gc;
+ wctx_gc.fork(wctx); // make a clone for garbage collection
+
+ auto & extents_to_collect = wctx.extents_to_gc;
+ for (auto it = extents_to_collect.begin();
+ it != extents_to_collect.end();
+ ++it) {
+ bufferlist bl;
+ auto offset = (*it).first;
+ auto length = (*it).second;
+ dout(20) << __func__ << " processing " << std::hex
+ << offset << "~" << length << std::dec
+ << dendl;
+ int r = _do_read(c.get(), o, offset, length, bl, 0);
+ ceph_assert(r == (int)length);
+
+ _do_write_data(txc, c, o, offset, length, bl, &wctx_gc);
+ logger->inc(l_bluestore_gc_merged, length);
+
+ if (*dirty_start > offset) {
+ *dirty_start = offset;
+ dirty_range_updated = true;
+ }
+
+ if (*dirty_end < offset + length) {
+ *dirty_end = offset + length;
+ dirty_range_updated = true;
+ }
+ }
+ if (dirty_range_updated) {
+ o->extent_map.fault_range(db, *dirty_start, *dirty_end);
+ }
+
+ dout(30) << __func__ << " alloc write" << dendl;
+ int r = _do_alloc_write(txc, c, o, &wctx_gc);
+ if (r < 0) {
+ derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ _wctx_finish(txc, c, o, &wctx_gc);
+ return 0;
+}
+
+int BlueStore::_do_write(
+ TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ uint64_t offset,
+ uint64_t length,
+ bufferlist& bl,
+ uint32_t fadvise_flags)
+{
+ int r = 0;
+
+ dout(20) << __func__
+ << " " << o->oid
+ << " 0x" << std::hex << offset << "~" << length
+ << " - have 0x" << o->onode.size
+ << " (" << std::dec << o->onode.size << ")"
+ << " bytes" << std::hex
+ << " fadvise_flags 0x" << fadvise_flags
+ << " alloc_hint 0x" << o->onode.alloc_hint_flags
+ << " expected_object_size " << o->onode.expected_object_size
+ << " expected_write_size " << o->onode.expected_write_size
+ << std::dec
+ << dendl;
+ _dump_onode<30>(cct, *o);
+
+ if (length == 0) {
+ return 0;
+ }
+
+ uint64_t end = offset + length;
+
+ GarbageCollector gc(c->store->cct);
+ int64_t benefit = 0;
+ auto dirty_start = offset;
+ auto dirty_end = end;
+
+ WriteContext wctx;
+ _choose_write_options(c, o, fadvise_flags, &wctx);
+ o->extent_map.fault_range(db, offset, length);
+ _do_write_data(txc, c, o, offset, length, bl, &wctx);
+ r = _do_alloc_write(txc, c, o, &wctx);
+ if (r < 0) {
+ derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
+ << dendl;
+ goto out;
+ }
+
+ if (wctx.extents_to_gc.empty() ||
+ wctx.extents_to_gc.range_start() > offset ||
+ wctx.extents_to_gc.range_end() < offset + length) {
+ benefit = gc.estimate(offset,
+ length,
+ o->extent_map,
+ wctx.old_extents,
+ min_alloc_size);
+ }
+
+ // NB: _wctx_finish() will empty old_extents
+ // so we must do gc estimation before that
+ _wctx_finish(txc, c, o, &wctx);
+ if (end > o->onode.size) {
+ dout(20) << __func__ << " extending size to 0x" << std::hex << end
+ << std::dec << dendl;
+ o->onode.size = end;
+ }
+
+ if (benefit >= g_conf()->bluestore_gc_enable_total_threshold) {
+ wctx.extents_to_gc.union_of(gc.get_extents_to_collect());
+ dout(20) << __func__
+ << " perform garbage collection for compressed extents, "
+ << "expected benefit = " << benefit << " AUs" << dendl;
+ }
+ if (!wctx.extents_to_gc.empty()) {
+ dout(20) << __func__ << " perform garbage collection" << dendl;
+
+ r = _do_gc(txc, c, o,
+ wctx,
+ &dirty_start, &dirty_end);
+ if (r < 0) {
+ derr << __func__ << " _do_gc failed with " << cpp_strerror(r)
+ << dendl;
+ goto out;
+ }
+ dout(20)<<__func__<<" gc range is " << std::hex << dirty_start
+ << "~" << dirty_end - dirty_start << std::dec << dendl;
+ }
+ o->extent_map.compress_extent_map(dirty_start, dirty_end - dirty_start);
+ o->extent_map.dirty_range(dirty_start, dirty_end - dirty_start);
+
+ r = 0;
+
+ out:
+ return r;
+}
+
+int BlueStore::_write(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ uint64_t offset, size_t length,
+ bufferlist& bl,
+ uint32_t fadvise_flags)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid
+ << " 0x" << std::hex << offset << "~" << length << std::dec
+ << dendl;
+ int r = 0;
+ if (offset + length >= OBJECT_MAX_SIZE) {
+ r = -E2BIG;
+ } else {
+ _assign_nid(txc, o);
+ r = _do_write(txc, c, o, offset, length, bl, fadvise_flags);
+ txc->write_onode(o);
+ }
+ dout(10) << __func__ << " " << c->cid << " " << o->oid
+ << " 0x" << std::hex << offset << "~" << length << std::dec
+ << " = " << r << dendl;
+ return r;
+}
+
+int BlueStore::_zero(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ uint64_t offset, size_t length)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid
+ << " 0x" << std::hex << offset << "~" << length << std::dec
+ << dendl;
+ int r = 0;
+ if (offset + length >= OBJECT_MAX_SIZE) {
+ r = -E2BIG;
+ } else {
+ _assign_nid(txc, o);
+ r = _do_zero(txc, c, o, offset, length);
+ }
+ dout(10) << __func__ << " " << c->cid << " " << o->oid
+ << " 0x" << std::hex << offset << "~" << length << std::dec
+ << " = " << r << dendl;
+ return r;
+}
+
+int BlueStore::_do_zero(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ uint64_t offset, size_t length)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid
+ << " 0x" << std::hex << offset << "~" << length << std::dec
+ << dendl;
+ int r = 0;
+
+ _dump_onode<30>(cct, *o);
+
+ WriteContext wctx;
+ o->extent_map.fault_range(db, offset, length);
+ o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
+ o->extent_map.dirty_range(offset, length);
+ _wctx_finish(txc, c, o, &wctx);
+
+ if (length > 0 && offset + length > o->onode.size) {
+ o->onode.size = offset + length;
+ dout(20) << __func__ << " extending size to " << offset + length
+ << dendl;
+ }
+ txc->write_onode(o);
+
+ dout(10) << __func__ << " " << c->cid << " " << o->oid
+ << " 0x" << std::hex << offset << "~" << length << std::dec
+ << " = " << r << dendl;
+ return r;
+}
+
+void BlueStore::_do_truncate(
+ TransContext *txc, CollectionRef& c, OnodeRef& o, uint64_t offset,
+ set<SharedBlob*> *maybe_unshared_blobs)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid
+ << " 0x" << std::hex << offset << std::dec << dendl;
+
+ _dump_onode<30>(cct, *o);
+
+ if (offset == o->onode.size)
+ return;
+
+ WriteContext wctx;
+ if (offset < o->onode.size) {
+ uint64_t length = o->onode.size - offset;
+ o->extent_map.fault_range(db, offset, length);
+ o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
+ o->extent_map.dirty_range(offset, length);
+
+ _wctx_finish(txc, c, o, &wctx, maybe_unshared_blobs);
+
+ // if we have shards past EOF, ask for a reshard
+ if (!o->onode.extent_map_shards.empty() &&
+ o->onode.extent_map_shards.back().offset >= offset) {
+ dout(10) << __func__ << " request reshard past EOF" << dendl;
+ if (offset) {
+ o->extent_map.request_reshard(offset - 1, offset + length);
+ } else {
+ o->extent_map.request_reshard(0, length);
+ }
+ }
+ }
+
+ o->onode.size = offset;
+
+ txc->write_onode(o);
+}
+
+int BlueStore::_truncate(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ uint64_t offset)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid
+ << " 0x" << std::hex << offset << std::dec
+ << dendl;
+
+ auto start_time = mono_clock::now();
+ int r = 0;
+ if (offset >= OBJECT_MAX_SIZE) {
+ r = -E2BIG;
+ } else {
+ _do_truncate(txc, c, o, offset);
+ }
+ log_latency_fn(
+ __func__,
+ l_bluestore_truncate_lat,
+ mono_clock::now() - start_time,
+ cct->_conf->bluestore_log_op_age,
+ [&](const ceph::timespan& lat) {
+ ostringstream ostr;
+ ostr << ", lat = " << timespan_str(lat)
+ << " cid =" << c->cid
+ << " oid =" << o->oid;
+ return ostr.str();
+ }
+ );
+ dout(10) << __func__ << " " << c->cid << " " << o->oid
+ << " 0x" << std::hex << offset << std::dec
+ << " = " << r << dendl;
+ return r;
+}
+
+int BlueStore::_do_remove(
+ TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o)
+{
+ set<SharedBlob*> maybe_unshared_blobs;
+ bool is_gen = !o->oid.is_no_gen();
+ _do_truncate(txc, c, o, 0, is_gen ? &maybe_unshared_blobs : nullptr);
+ if (o->onode.has_omap()) {
+ o->flush();
+ _do_omap_clear(txc, o);
+ }
+ o->exists = false;
+ string key;
+ for (auto &s : o->extent_map.shards) {
+ dout(20) << __func__ << " removing shard 0x" << std::hex
+ << s.shard_info->offset << std::dec << dendl;
+ generate_extent_shard_key_and_apply(o->key, s.shard_info->offset, &key,
+ [&](const string& final_key) {
+ txc->t->rmkey(PREFIX_OBJ, final_key);
+ }
+ );
+ }
+ txc->t->rmkey(PREFIX_OBJ, o->key.c_str(), o->key.size());
+ txc->note_removed_object(o);
+ o->extent_map.clear();
+ o->onode = bluestore_onode_t();
+ _debug_obj_on_delete(o->oid);
+
+ if (!is_gen || maybe_unshared_blobs.empty()) {
+ return 0;
+ }
+
+ // see if we can unshare blobs still referenced by the head
+ dout(10) << __func__ << " gen and maybe_unshared_blobs "
+ << maybe_unshared_blobs << dendl;
+ ghobject_t nogen = o->oid;
+ nogen.generation = ghobject_t::NO_GEN;
+ OnodeRef h = c->get_onode(nogen, false);
+
+ if (!h || !h->exists) {
+ return 0;
+ }
+
+ dout(20) << __func__ << " checking for unshareable blobs on " << h
+ << " " << h->oid << dendl;
+ map<SharedBlob*,bluestore_extent_ref_map_t> expect;
+ for (auto& e : h->extent_map.extent_map) {
+ const bluestore_blob_t& b = e.blob->get_blob();
+ SharedBlob *sb = e.blob->shared_blob.get();
+ if (b.is_shared() &&
+ sb->loaded &&
+ maybe_unshared_blobs.count(sb)) {
+ if (b.is_compressed()) {
+ expect[sb].get(0, b.get_ondisk_length());
+ } else {
+ b.map(e.blob_offset, e.length, [&](uint64_t off, uint64_t len) {
+ expect[sb].get(off, len);
+ return 0;
+ });
+ }
+ }
+ }
+
+ vector<SharedBlob*> unshared_blobs;
+ unshared_blobs.reserve(maybe_unshared_blobs.size());
+ for (auto& p : expect) {
+ dout(20) << " ? " << *p.first << " vs " << p.second << dendl;
+ if (p.first->persistent->ref_map == p.second) {
+ SharedBlob *sb = p.first;
+ dout(20) << __func__ << " unsharing " << *sb << dendl;
+ unshared_blobs.push_back(sb);
+ txc->unshare_blob(sb);
+ uint64_t sbid = c->make_blob_unshared(sb);
+ string key;
+ get_shared_blob_key(sbid, &key);
+ txc->t->rmkey(PREFIX_SHARED_BLOB, key);
+ }
+ }
+
+ if (unshared_blobs.empty()) {
+ return 0;
+ }
+
+ for (auto& e : h->extent_map.extent_map) {
+ const bluestore_blob_t& b = e.blob->get_blob();
+ SharedBlob *sb = e.blob->shared_blob.get();
+ if (b.is_shared() &&
+ std::find(unshared_blobs.begin(), unshared_blobs.end(),
+ sb) != unshared_blobs.end()) {
+ dout(20) << __func__ << " unsharing " << e << dendl;
+ bluestore_blob_t& blob = e.blob->dirty_blob();
+ blob.clear_flag(bluestore_blob_t::FLAG_SHARED);
+ h->extent_map.dirty_range(e.logical_offset, 1);
+ }
+ }
+ txc->write_onode(h);
+
+ return 0;
+}
+
+int BlueStore::_remove(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid
+ << " onode " << o.get()
+ << " txc "<< txc << dendl;
+ auto start_time = mono_clock::now();
+ int r = _do_remove(txc, c, o);
+
+ log_latency_fn(
+ __func__,
+ l_bluestore_remove_lat,
+ mono_clock::now() - start_time,
+ cct->_conf->bluestore_log_op_age,
+ [&](const ceph::timespan& lat) {
+ ostringstream ostr;
+ ostr << ", lat = " << timespan_str(lat)
+ << " cid =" << c->cid
+ << " oid =" << o->oid;
+ return ostr.str();
+ }
+ );
+
+ dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
+ return r;
+}
+
+int BlueStore::_setattr(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ const string& name,
+ bufferptr& val)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid
+ << " " << name << " (" << val.length() << " bytes)"
+ << dendl;
+ int r = 0;
+ if (val.is_partial()) {
+ auto& b = o->onode.attrs[name.c_str()] = bufferptr(val.c_str(),
+ val.length());
+ b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
+ } else {
+ auto& b = o->onode.attrs[name.c_str()] = val;
+ b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
+ }
+ txc->write_onode(o);
+ dout(10) << __func__ << " " << c->cid << " " << o->oid
+ << " " << name << " (" << val.length() << " bytes)"
+ << " = " << r << dendl;
+ return r;
+}
+
+int BlueStore::_setattrs(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ const map<string,bufferptr>& aset)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid
+ << " " << aset.size() << " keys"
+ << dendl;
+ int r = 0;
+ for (map<string,bufferptr>::const_iterator p = aset.begin();
+ p != aset.end(); ++p) {
+ if (p->second.is_partial()) {
+ auto& b = o->onode.attrs[p->first.c_str()] =
+ bufferptr(p->second.c_str(), p->second.length());
+ b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
+ } else {
+ auto& b = o->onode.attrs[p->first.c_str()] = p->second;
+ b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
+ }
+ }
+ txc->write_onode(o);
+ dout(10) << __func__ << " " << c->cid << " " << o->oid
+ << " " << aset.size() << " keys"
+ << " = " << r << dendl;
+ return r;
+}
+
+
+int BlueStore::_rmattr(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ const string& name)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid
+ << " " << name << dendl;
+ int r = 0;
+ auto it = o->onode.attrs.find(name.c_str());
+ if (it == o->onode.attrs.end())
+ goto out;
+
+ o->onode.attrs.erase(it);
+ txc->write_onode(o);
+
+ out:
+ dout(10) << __func__ << " " << c->cid << " " << o->oid
+ << " " << name << " = " << r << dendl;
+ return r;
+}
+
+int BlueStore::_rmattrs(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
+ int r = 0;
+
+ if (o->onode.attrs.empty())
+ goto out;
+
+ o->onode.attrs.clear();
+ txc->write_onode(o);
+
+ out:
+ dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
+ return r;
+}
+
+void BlueStore::_do_omap_clear(TransContext *txc, OnodeRef& o)
+{
+ const string& omap_prefix = o->get_omap_prefix();
+ string prefix, tail;
+ o->get_omap_header(&prefix);
+ o->get_omap_tail(&tail);
+ txc->t->rm_range_keys(omap_prefix, prefix, tail);
+ txc->t->rmkey(omap_prefix, tail);
+ o->onode.clear_omap_flag();
+ dout(20) << __func__ << " remove range start: "
+ << pretty_binary_string(prefix) << " end: "
+ << pretty_binary_string(tail) << dendl;
+}
+
+int BlueStore::_omap_clear(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
+ auto t0 = mono_clock::now();
+
+ int r = 0;
+ if (o->onode.has_omap()) {
+ o->flush();
+ _do_omap_clear(txc, o);
+ txc->write_onode(o);
+ }
+ logger->tinc(l_bluestore_omap_clear_lat, mono_clock::now() - t0);
+
+ dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
+ return r;
+}
+
+int BlueStore::_omap_setkeys(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ bufferlist &bl)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
+ int r;
+ auto p = bl.cbegin();
+ __u32 num;
+ if (!o->onode.has_omap()) {
+ if (o->oid.is_pgmeta()) {
+ o->onode.set_omap_flags_pgmeta();
+ } else {
+ o->onode.set_omap_flags(per_pool_omap == OMAP_BULK);
+ }
+ txc->write_onode(o);
+
+ const string& prefix = o->get_omap_prefix();
+ string key_tail;
+ bufferlist tail;
+ o->get_omap_tail(&key_tail);
+ txc->t->set(prefix, key_tail, tail);
+ } else {
+ txc->note_modified_object(o);
+ }
+ const string& prefix = o->get_omap_prefix();
+ string final_key;
+ o->get_omap_key(string(), &final_key);
+ size_t base_key_len = final_key.size();
+ decode(num, p);
+ while (num--) {
+ string key;
+ bufferlist value;
+ decode(key, p);
+ decode(value, p);
+ final_key.resize(base_key_len); // keep prefix
+ final_key += key;
+ dout(20) << __func__ << " " << pretty_binary_string(final_key)
+ << " <- " << key << dendl;
+ txc->t->set(prefix, final_key, value);
+ }
+ r = 0;
+ dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
+ return r;
+}
+
+int BlueStore::_omap_setheader(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ bufferlist& bl)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
+ int r;
+ string key;
+ if (!o->onode.has_omap()) {
+ if (o->oid.is_pgmeta()) {
+ o->onode.set_omap_flags_pgmeta();
+ } else {
+ o->onode.set_omap_flags(per_pool_omap == OMAP_BULK);
+ }
+ txc->write_onode(o);
+
+ const string& prefix = o->get_omap_prefix();
+ string key_tail;
+ bufferlist tail;
+ o->get_omap_tail(&key_tail);
+ txc->t->set(prefix, key_tail, tail);
+ } else {
+ txc->note_modified_object(o);
+ }
+ const string& prefix = o->get_omap_prefix();
+ o->get_omap_header(&key);
+ txc->t->set(prefix, key, bl);
+ r = 0;
+ dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
+ return r;
+}
+
+int BlueStore::_omap_rmkeys(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ bufferlist& bl)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
+ int r = 0;
+ auto p = bl.cbegin();
+ __u32 num;
+ string final_key;
+ if (!o->onode.has_omap()) {
+ goto out;
+ }
+ {
+ const string& prefix = o->get_omap_prefix();
+ o->get_omap_key(string(), &final_key);
+ size_t base_key_len = final_key.size();
+ decode(num, p);
+ logger->inc(l_bluestore_omap_rmkeys_count, num);
+ while (num--) {
+ string key;
+ decode(key, p);
+ final_key.resize(base_key_len); // keep prefix
+ final_key += key;
+ dout(20) << __func__ << " rm " << pretty_binary_string(final_key)
+ << " <- " << key << dendl;
+ txc->t->rmkey(prefix, final_key);
+ }
+ }
+ txc->note_modified_object(o);
+
+ out:
+ dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
+ return r;
+}
+
+int BlueStore::_omap_rmkey_range(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ const string& first, const string& last)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
+ string key_first, key_last;
+ int r = 0;
+ if (!o->onode.has_omap()) {
+ goto out;
+ }
+ {
+ const string& prefix = o->get_omap_prefix();
+ o->flush();
+ o->get_omap_key(first, &key_first);
+ o->get_omap_key(last, &key_last);
+ logger->inc(l_bluestore_omap_rmkey_ranges_count);
+ txc->t->rm_range_keys(prefix, key_first, key_last);
+ dout(20) << __func__ << " remove range start: "
+ << pretty_binary_string(key_first) << " end: "
+ << pretty_binary_string(key_last) << dendl;
+ }
+ txc->note_modified_object(o);
+
+ out:
+ return r;
+}
+
+int BlueStore::_set_alloc_hint(
+ TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ uint64_t expected_object_size,
+ uint64_t expected_write_size,
+ uint32_t flags)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid
+ << " object_size " << expected_object_size
+ << " write_size " << expected_write_size
+ << " flags " << ceph_osd_alloc_hint_flag_string(flags)
+ << dendl;
+ int r = 0;
+ o->onode.expected_object_size = expected_object_size;
+ o->onode.expected_write_size = expected_write_size;
+ o->onode.alloc_hint_flags = flags;
+ txc->write_onode(o);
+ dout(10) << __func__ << " " << c->cid << " " << o->oid
+ << " object_size " << expected_object_size
+ << " write_size " << expected_write_size
+ << " flags " << ceph_osd_alloc_hint_flag_string(flags)
+ << " = " << r << dendl;
+ return r;
+}
+
+int BlueStore::_clone(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& oldo,
+ OnodeRef& newo)
+{
+ dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
+ << newo->oid << dendl;
+ int r = 0;
+ if (oldo->oid.hobj.get_hash() != newo->oid.hobj.get_hash()) {
+ derr << __func__ << " mismatched hash on " << oldo->oid
+ << " and " << newo->oid << dendl;
+ return -EINVAL;
+ }
+
+ _assign_nid(txc, newo);
+
+ // clone data
+ oldo->flush();
+ _do_truncate(txc, c, newo, 0);
+ if (cct->_conf->bluestore_clone_cow) {
+ _do_clone_range(txc, c, oldo, newo, 0, oldo->onode.size, 0);
+ } else {
+ bufferlist bl;
+ r = _do_read(c.get(), oldo, 0, oldo->onode.size, bl, 0);
+ if (r < 0)
+ goto out;
+ r = _do_write(txc, c, newo, 0, oldo->onode.size, bl, 0);
+ if (r < 0)
+ goto out;
+ }
+
+ // clone attrs
+ newo->onode.attrs = oldo->onode.attrs;
+
+ // clone omap
+ if (newo->onode.has_omap()) {
+ dout(20) << __func__ << " clearing old omap data" << dendl;
+ newo->flush();
+ _do_omap_clear(txc, newo);
+ }
+ if (oldo->onode.has_omap()) {
+ dout(20) << __func__ << " copying omap data" << dendl;
+ if (newo->oid.is_pgmeta()) {
+ newo->onode.set_omap_flags_pgmeta();
+ } else {
+ newo->onode.set_omap_flags(per_pool_omap == OMAP_BULK);
+ }
+ // check if prefix for omap key is exactly the same size for both objects
+ // otherwise rewrite_omap_key will corrupt data
+ ceph_assert(oldo->onode.flags == newo->onode.flags);
+ const string& prefix = newo->get_omap_prefix();
+ string head, tail;
+ oldo->get_omap_header(&head);
+ oldo->get_omap_tail(&tail);
+ KeyValueDB::Iterator it = db->get_iterator(prefix, 0, KeyValueDB::IteratorBounds{head, tail});
+ it->lower_bound(head);
+ while (it->valid()) {
+ if (it->key() >= tail) {
+ dout(30) << __func__ << " reached tail" << dendl;
+ break;
+ } else {
+ dout(30) << __func__ << " got header/data "
+ << pretty_binary_string(it->key()) << dendl;
+ string key;
+ newo->rewrite_omap_key(it->key(), &key);
+ txc->t->set(prefix, key, it->value());
+ }
+ it->next();
+ }
+ string new_tail;
+ bufferlist new_tail_value;
+ newo->get_omap_tail(&new_tail);
+ txc->t->set(prefix, new_tail, new_tail_value);
+ }
+
+ txc->write_onode(newo);
+ r = 0;
+
+ out:
+ dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
+ << newo->oid << " = " << r << dendl;
+ return r;
+}
+
+int BlueStore::_do_clone_range(
+ TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& oldo,
+ OnodeRef& newo,
+ uint64_t srcoff,
+ uint64_t length,
+ uint64_t dstoff)
+{
+ dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
+ << newo->oid
+ << " 0x" << std::hex << srcoff << "~" << length << " -> "
+ << " 0x" << dstoff << "~" << length << std::dec << dendl;
+ oldo->extent_map.fault_range(db, srcoff, length);
+ newo->extent_map.fault_range(db, dstoff, length);
+ _dump_onode<30>(cct, *oldo);
+ _dump_onode<30>(cct, *newo);
+
+ oldo->extent_map.dup(this, txc, c, oldo, newo, srcoff, length, dstoff);
+
+#ifdef HAVE_LIBZBD
+ if (bdev->is_smr()) {
+ // duplicate the refs for the shared region.
+ Extent dummy(dstoff);
+ for (auto e = newo->extent_map.extent_map.lower_bound(dummy);
+ e != newo->extent_map.extent_map.end();
+ ++e) {
+ if (e->logical_offset >= dstoff + length) {
+ break;
+ }
+ for (auto& ex : e->blob->get_blob().get_extents()) {
+ // note that we may introduce a new extent reference that is
+ // earlier than the first zone ref. we allow this since it is
+ // a lot of work to avoid and has marginal impact on cleaning
+ // performance.
+ if (!ex.is_valid()) {
+ continue;
+ }
+ uint32_t zone = ex.offset / zone_size;
+ if (!newo->onode.zone_offset_refs.count(zone)) {
+ uint64_t zoff = ex.offset % zone_size;
+ dout(20) << __func__ << " add ref zone 0x" << std::hex << zone
+ << " offset 0x" << zoff << std::dec
+ << " -> " << newo->oid << dendl;
+ txc->note_write_zone_offset(newo, zone, zoff);
+ }
+ }
+ }
+ }
+#endif
+
+ _dump_onode<30>(cct, *oldo);
+ _dump_onode<30>(cct, *newo);
+ return 0;
+}
+
+int BlueStore::_clone_range(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& oldo,
+ OnodeRef& newo,
+ uint64_t srcoff, uint64_t length, uint64_t dstoff)
+{
+ dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
+ << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
+ << " to offset 0x" << dstoff << std::dec << dendl;
+ int r = 0;
+
+ if (srcoff + length >= OBJECT_MAX_SIZE ||
+ dstoff + length >= OBJECT_MAX_SIZE) {
+ r = -E2BIG;
+ goto out;
+ }
+ if (srcoff + length > oldo->onode.size) {
+ r = -EINVAL;
+ goto out;
+ }
+
+ _assign_nid(txc, newo);
+
+ if (length > 0) {
+ if (cct->_conf->bluestore_clone_cow) {
+ _do_zero(txc, c, newo, dstoff, length);
+ _do_clone_range(txc, c, oldo, newo, srcoff, length, dstoff);
+ } else {
+ bufferlist bl;
+ r = _do_read(c.get(), oldo, srcoff, length, bl, 0);
+ if (r < 0)
+ goto out;
+ r = _do_write(txc, c, newo, dstoff, bl.length(), bl, 0);
+ if (r < 0)
+ goto out;
+ }
+ }
+
+ txc->write_onode(newo);
+ r = 0;
+
+ out:
+ dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
+ << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
+ << " to offset 0x" << dstoff << std::dec
+ << " = " << r << dendl;
+ return r;
+}
+
+int BlueStore::_rename(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& oldo,
+ OnodeRef& newo,
+ const ghobject_t& new_oid)
+{
+ dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
+ << new_oid << dendl;
+ int r;
+ ghobject_t old_oid = oldo->oid;
+ mempool::bluestore_cache_meta::string new_okey;
+
+ if (newo) {
+ if (newo->exists) {
+ r = -EEXIST;
+ goto out;
+ }
+ ceph_assert(txc->onodes.count(newo) == 0);
+ }
+
+ txc->t->rmkey(PREFIX_OBJ, oldo->key.c_str(), oldo->key.size());
+
+ // rewrite shards
+ {
+ oldo->extent_map.fault_range(db, 0, oldo->onode.size);
+ get_object_key(cct, new_oid, &new_okey);
+ string key;
+ for (auto &s : oldo->extent_map.shards) {
+ generate_extent_shard_key_and_apply(oldo->key, s.shard_info->offset, &key,
+ [&](const string& final_key) {
+ txc->t->rmkey(PREFIX_OBJ, final_key);
+ }
+ );
+ s.dirty = true;
+ }
+ }
+
+ newo = oldo;
+ txc->write_onode(newo);
+
+ // this adjusts oldo->{oid,key}, and reset oldo to a fresh empty
+ // Onode in the old slot
+ c->onode_space.rename(oldo, old_oid, new_oid, new_okey);
+ r = 0;
+
+ // hold a ref to new Onode in old name position, to ensure we don't drop
+ // it from the cache before this txc commits (or else someone may come along
+ // and read newo's metadata via the old name).
+ txc->note_modified_object(oldo);
+
+#ifdef HAVE_LIBZBD
+ if (bdev->is_smr()) {
+ // adjust zone refs
+ for (auto& [zone, offset] : newo->onode.zone_offset_refs) {
+ dout(20) << __func__ << " rm ref zone 0x" << std::hex << zone
+ << " offset 0x" << offset << std::dec
+ << " -> " << oldo->oid << dendl;
+ string key;
+ get_zone_offset_object_key(zone, offset, oldo->oid, &key);
+ txc->t->rmkey(PREFIX_ZONED_CL_INFO, key);
+
+ dout(20) << __func__ << " add ref zone 0x" << std::hex << zone
+ << " offset 0x" << offset << std::dec
+ << " -> " << newo->oid << dendl;
+ get_zone_offset_object_key(zone, offset, newo->oid, &key);
+ bufferlist v;
+ txc->t->set(PREFIX_ZONED_CL_INFO, key, v);
+ }
+ }
+#endif
+
+ out:
+ dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> "
+ << new_oid << " = " << r << dendl;
+ return r;
+}
+
+// collections
+
+int BlueStore::_create_collection(
+ TransContext *txc,
+ const coll_t &cid,
+ unsigned bits,
+ CollectionRef *c)
+{
+ dout(15) << __func__ << " " << cid << " bits " << bits << dendl;
+ int r;
+ bufferlist bl;
+
+ {
+ std::unique_lock l(coll_lock);
+ if (*c) {
+ r = -EEXIST;
+ goto out;
+ }
+ auto p = new_coll_map.find(cid);
+ ceph_assert(p != new_coll_map.end());
+ *c = p->second;
+ (*c)->cnode.bits = bits;
+ coll_map[cid] = *c;
+ new_coll_map.erase(p);
+ }
+ encode((*c)->cnode, bl);
+ txc->t->set(PREFIX_COLL, stringify(cid), bl);
+ r = 0;
+
+ out:
+ dout(10) << __func__ << " " << cid << " bits " << bits << " = " << r << dendl;
+ return r;
+}
+
+int BlueStore::_remove_collection(TransContext *txc, const coll_t &cid,
+ CollectionRef *c)
+{
+ dout(15) << __func__ << " " << cid << dendl;
+ int r;
+
+ (*c)->flush_all_but_last();
+ {
+ std::unique_lock l(coll_lock);
+ if (!*c) {
+ r = -ENOENT;
+ goto out;
+ }
+ size_t nonexistent_count = 0;
+ ceph_assert((*c)->exists);
+ if ((*c)->onode_space.map_any([&](Onode* o) {
+ if (o->exists) {
+ dout(1) << __func__ << " " << o->oid << " " << o
+ << " exists in onode_map" << dendl;
+ return true;
+ }
+ ++nonexistent_count;
+ return false;
+ })) {
+ r = -ENOTEMPTY;
+ goto out;
+ }
+ vector<ghobject_t> ls;
+ ghobject_t next;
+ // Enumerate onodes in db, up to nonexistent_count + 1
+ // then check if all of them are marked as non-existent.
+ // Bypass the check if (next != ghobject_t::get_max())
+ r = _collection_list(c->get(), ghobject_t(), ghobject_t::get_max(),
+ nonexistent_count + 1, false, &ls, &next);
+ if (r >= 0) {
+ // If true mean collecton has more objects than nonexistent_count,
+ // so bypass check.
+ bool exists = (!next.is_max());
+ for (auto it = ls.begin(); !exists && it < ls.end(); ++it) {
+ dout(10) << __func__ << " oid " << *it << dendl;
+ auto onode = (*c)->onode_space.lookup(*it);
+ exists = !onode || onode->exists;
+ if (exists) {
+ dout(1) << __func__ << " " << *it
+ << " exists in db, "
+ << (!onode ? "not present in ram" : "present in ram")
+ << dendl;
+ }
+ }
+ if (!exists) {
+ _do_remove_collection(txc, c);
+ r = 0;
+ } else {
+ dout(10) << __func__ << " " << cid
+ << " is non-empty" << dendl;
+ r = -ENOTEMPTY;
+ }
+ }
+ }
+out:
+ dout(10) << __func__ << " " << cid << " = " << r << dendl;
+ return r;
+}
+
+void BlueStore::_do_remove_collection(TransContext *txc,
+ CollectionRef *c)
+{
+ coll_map.erase((*c)->cid);
+ txc->removed_collections.push_back(*c);
+ (*c)->exists = false;
+ _osr_register_zombie((*c)->osr.get());
+ txc->t->rmkey(PREFIX_COLL, stringify((*c)->cid));
+ c->reset();
+}
+
+int BlueStore::_split_collection(TransContext *txc,
+ CollectionRef& c,
+ CollectionRef& d,
+ unsigned bits, int rem)
+{
+ dout(15) << __func__ << " " << c->cid << " to " << d->cid << " "
+ << " bits " << bits << dendl;
+ std::unique_lock l(c->lock);
+ std::unique_lock l2(d->lock);
+ int r;
+
+ // flush all previous deferred writes on this sequencer. this is a bit
+ // heavyweight, but we need to make sure all deferred writes complete
+ // before we split as the new collection's sequencer may need to order
+ // this after those writes, and we don't bother with the complexity of
+ // moving those TransContexts over to the new osr.
+ _osr_drain_preceding(txc);
+
+ // move any cached items (onodes and referenced shared blobs) that will
+ // belong to the child collection post-split. leave everything else behind.
+ // this may include things that don't strictly belong to the now-smaller
+ // parent split, but the OSD will always send us a split for every new
+ // child.
+
+ spg_t pgid, dest_pgid;
+ bool is_pg = c->cid.is_pg(&pgid);
+ ceph_assert(is_pg);
+ is_pg = d->cid.is_pg(&dest_pgid);
+ ceph_assert(is_pg);
+
+ // the destination should initially be empty.
+ ceph_assert(d->onode_space.empty());
+ ceph_assert(d->shared_blob_set.empty());
+ ceph_assert(d->cnode.bits == bits);
+
+ c->split_cache(d.get());
+
+ // adjust bits. note that this will be redundant for all but the first
+ // split call for this parent (first child).
+ c->cnode.bits = bits;
+ ceph_assert(d->cnode.bits == bits);
+ r = 0;
+
+ bufferlist bl;
+ encode(c->cnode, bl);
+ txc->t->set(PREFIX_COLL, stringify(c->cid), bl);
+
+ dout(10) << __func__ << " " << c->cid << " to " << d->cid << " "
+ << " bits " << bits << " = " << r << dendl;
+ return r;
+}
+
+int BlueStore::_merge_collection(
+ TransContext *txc,
+ CollectionRef *c,
+ CollectionRef& d,
+ unsigned bits)
+{
+ dout(15) << __func__ << " " << (*c)->cid << " to " << d->cid
+ << " bits " << bits << dendl;
+ std::unique_lock l((*c)->lock);
+ std::unique_lock l2(d->lock);
+ int r;
+
+ coll_t cid = (*c)->cid;
+
+ // flush all previous deferred writes on the source collection to ensure
+ // that all deferred writes complete before we merge as the target collection's
+ // sequencer may need to order new ops after those writes.
+
+ _osr_drain((*c)->osr.get());
+
+ // move any cached items (onodes and referenced shared blobs) that will
+ // belong to the child collection post-split. leave everything else behind.
+ // this may include things that don't strictly belong to the now-smaller
+ // parent split, but the OSD will always send us a split for every new
+ // child.
+
+ spg_t pgid, dest_pgid;
+ bool is_pg = cid.is_pg(&pgid);
+ ceph_assert(is_pg);
+ is_pg = d->cid.is_pg(&dest_pgid);
+ ceph_assert(is_pg);
+
+ // adjust bits. note that this will be redundant for all but the first
+ // merge call for the parent/target.
+ d->cnode.bits = bits;
+
+ // behavior depends on target (d) bits, so this after that is updated.
+ (*c)->split_cache(d.get());
+
+ // remove source collection
+ {
+ std::unique_lock l3(coll_lock);
+ _do_remove_collection(txc, c);
+ }
+
+ r = 0;
+
+ bufferlist bl;
+ encode(d->cnode, bl);
+ txc->t->set(PREFIX_COLL, stringify(d->cid), bl);
+
+ dout(10) << __func__ << " " << cid << " to " << d->cid << " "
+ << " bits " << bits << " = " << r << dendl;
+ return r;
+}
+
+void BlueStore::log_latency(
+ const char* name,
+ int idx,
+ const ceph::timespan& l,
+ double lat_threshold,
+ const char* info) const
+{
+ logger->tinc(idx, l);
+ if (lat_threshold > 0.0 &&
+ l >= make_timespan(lat_threshold)) {
+ dout(0) << __func__ << " slow operation observed for " << name
+ << ", latency = " << l
+ << info
+ << dendl;
+ }
+}
+
+void BlueStore::log_latency_fn(
+ const char* name,
+ int idx,
+ const ceph::timespan& l,
+ double lat_threshold,
+ std::function<string (const ceph::timespan& lat)> fn) const
+{
+ logger->tinc(idx, l);
+ if (lat_threshold > 0.0 &&
+ l >= make_timespan(lat_threshold)) {
+ dout(0) << __func__ << " slow operation observed for " << name
+ << ", latency = " << l
+ << fn(l)
+ << dendl;
+ }
+}
+
+#if defined(WITH_LTTNG)
+void BlueStore::BlueStoreThrottle::emit_initial_tracepoint(
+ KeyValueDB &db,
+ TransContext &txc,
+ mono_clock::time_point start_throttle_acquire)
+{
+ pending_kv_ios += txc.ios;
+ if (txc.deferred_txn) {
+ pending_deferred_ios += txc.ios;
+ }
+
+ uint64_t started = 0;
+ uint64_t completed = 0;
+ if (should_trace(&started, &completed)) {
+ txc.tracing = true;
+ uint64_t rocksdb_base_level,
+ rocksdb_estimate_pending_compaction_bytes,
+ rocksdb_cur_size_all_mem_tables,
+ rocksdb_compaction_pending,
+ rocksdb_mem_table_flush_pending,
+ rocksdb_num_running_compactions,
+ rocksdb_num_running_flushes,
+ rocksdb_actual_delayed_write_rate;
+ db.get_property(
+ "rocksdb.base-level",
+ &rocksdb_base_level);
+ db.get_property(
+ "rocksdb.estimate-pending-compaction-bytes",
+ &rocksdb_estimate_pending_compaction_bytes);
+ db.get_property(
+ "rocksdb.cur-size-all-mem-tables",
+ &rocksdb_cur_size_all_mem_tables);
+ db.get_property(
+ "rocksdb.compaction-pending",
+ &rocksdb_compaction_pending);
+ db.get_property(
+ "rocksdb.mem-table-flush-pending",
+ &rocksdb_mem_table_flush_pending);
+ db.get_property(
+ "rocksdb.num-running-compactions",
+ &rocksdb_num_running_compactions);
+ db.get_property(
+ "rocksdb.num-running-flushes",
+ &rocksdb_num_running_flushes);
+ db.get_property(
+ "rocksdb.actual-delayed-write-rate",
+ &rocksdb_actual_delayed_write_rate);
+
+
+ tracepoint(
+ bluestore,
+ transaction_initial_state,
+ txc.osr->get_sequencer_id(),
+ txc.seq,
+ throttle_bytes.get_current(),
+ throttle_deferred_bytes.get_current(),
+ pending_kv_ios,
+ pending_deferred_ios,
+ started,
+ completed,
+ ceph::to_seconds<double>(mono_clock::now() - start_throttle_acquire));
+
+ tracepoint(
+ bluestore,
+ transaction_initial_state_rocksdb,
+ txc.osr->get_sequencer_id(),
+ txc.seq,
+ rocksdb_base_level,
+ rocksdb_estimate_pending_compaction_bytes,
+ rocksdb_cur_size_all_mem_tables,
+ rocksdb_compaction_pending,
+ rocksdb_mem_table_flush_pending,
+ rocksdb_num_running_compactions,
+ rocksdb_num_running_flushes,
+ rocksdb_actual_delayed_write_rate);
+ }
+}
+#endif
+
+mono_clock::duration BlueStore::BlueStoreThrottle::log_state_latency(
+ TransContext &txc, PerfCounters *logger, int state)
+{
+ mono_clock::time_point now = mono_clock::now();
+ mono_clock::duration lat = now - txc.last_stamp;
+ logger->tinc(state, lat);
+#if defined(WITH_LTTNG)
+ if (txc.tracing &&
+ state >= l_bluestore_state_prepare_lat &&
+ state <= l_bluestore_state_done_lat) {
+ OID_ELAPSED("", lat.to_nsec() / 1000.0, txc.get_state_latency_name(state));
+ tracepoint(
+ bluestore,
+ transaction_state_duration,
+ txc.osr->get_sequencer_id(),
+ txc.seq,
+ state,
+ ceph::to_seconds<double>(lat));
+ }
+#endif
+ txc.last_stamp = now;
+ return lat;
+}
+
+bool BlueStore::BlueStoreThrottle::try_start_transaction(
+ KeyValueDB &db,
+ TransContext &txc,
+ mono_clock::time_point start_throttle_acquire)
+{
+ throttle_bytes.get(txc.cost);
+
+ if (!txc.deferred_txn || throttle_deferred_bytes.get_or_fail(txc.cost)) {
+ emit_initial_tracepoint(db, txc, start_throttle_acquire);
+ return true;
+ } else {
+ return false;
+ }
+}
+
+void BlueStore::BlueStoreThrottle::finish_start_transaction(
+ KeyValueDB &db,
+ TransContext &txc,
+ mono_clock::time_point start_throttle_acquire)
+{
+ ceph_assert(txc.deferred_txn);
+ throttle_deferred_bytes.get(txc.cost);
+ emit_initial_tracepoint(db, txc, start_throttle_acquire);
+}
+
+#if defined(WITH_LTTNG)
+void BlueStore::BlueStoreThrottle::complete_kv(TransContext &txc)
+{
+ pending_kv_ios -= 1;
+ ios_completed_since_last_traced++;
+ if (txc.tracing) {
+ tracepoint(
+ bluestore,
+ transaction_commit_latency,
+ txc.osr->get_sequencer_id(),
+ txc.seq,
+ ceph::to_seconds<double>(mono_clock::now() - txc.start));
+ }
+}
+#endif
+
+#if defined(WITH_LTTNG)
+void BlueStore::BlueStoreThrottle::complete(TransContext &txc)
+{
+ if (txc.deferred_txn) {
+ pending_deferred_ios -= 1;
+ }
+ if (txc.tracing) {
+ mono_clock::time_point now = mono_clock::now();
+ mono_clock::duration lat = now - txc.start;
+ tracepoint(
+ bluestore,
+ transaction_total_duration,
+ txc.osr->get_sequencer_id(),
+ txc.seq,
+ ceph::to_seconds<double>(lat));
+ }
+}
+#endif
+
+const string prefix_onode = "o";
+const string prefix_onode_shard = "x";
+const string prefix_other = "Z";
+//Itrerates through the db and collects the stats
+void BlueStore::generate_db_histogram(Formatter *f)
+{
+ //globals
+ uint64_t num_onodes = 0;
+ uint64_t num_shards = 0;
+ uint64_t num_super = 0;
+ uint64_t num_coll = 0;
+ uint64_t num_omap = 0;
+ uint64_t num_pgmeta_omap = 0;
+ uint64_t num_deferred = 0;
+ uint64_t num_alloc = 0;
+ uint64_t num_stat = 0;
+ uint64_t num_others = 0;
+ uint64_t num_shared_shards = 0;
+ size_t max_key_size =0, max_value_size = 0;
+ uint64_t total_key_size = 0, total_value_size = 0;
+ size_t key_size = 0, value_size = 0;
+ KeyValueHistogram hist;
+
+ auto start = coarse_mono_clock::now();
+
+ KeyValueDB::WholeSpaceIterator iter = db->get_wholespace_iterator();
+ iter->seek_to_first();
+ while (iter->valid()) {
+ dout(30) << __func__ << " Key: " << iter->key() << dendl;
+ key_size = iter->key_size();
+ value_size = iter->value_size();
+ hist.value_hist[hist.get_value_slab(value_size)]++;
+ max_key_size = std::max(max_key_size, key_size);
+ max_value_size = std::max(max_value_size, value_size);
+ total_key_size += key_size;
+ total_value_size += value_size;
+
+ pair<string,string> key(iter->raw_key());
+
+ if (key.first == PREFIX_SUPER) {
+ hist.update_hist_entry(hist.key_hist, PREFIX_SUPER, key_size, value_size);
+ num_super++;
+ } else if (key.first == PREFIX_STAT) {
+ hist.update_hist_entry(hist.key_hist, PREFIX_STAT, key_size, value_size);
+ num_stat++;
+ } else if (key.first == PREFIX_COLL) {
+ hist.update_hist_entry(hist.key_hist, PREFIX_COLL, key_size, value_size);
+ num_coll++;
+ } else if (key.first == PREFIX_OBJ) {
+ if (key.second.back() == ONODE_KEY_SUFFIX) {
+ hist.update_hist_entry(hist.key_hist, prefix_onode, key_size, value_size);
+ num_onodes++;
+ } else {
+ hist.update_hist_entry(hist.key_hist, prefix_onode_shard, key_size, value_size);
+ num_shards++;
+ }
+ } else if (key.first == PREFIX_OMAP) {
+ hist.update_hist_entry(hist.key_hist, PREFIX_OMAP, key_size, value_size);
+ num_omap++;
+ } else if (key.first == PREFIX_PERPOOL_OMAP) {
+ hist.update_hist_entry(hist.key_hist, PREFIX_PERPOOL_OMAP, key_size, value_size);
+ num_omap++;
+ } else if (key.first == PREFIX_PERPG_OMAP) {
+ hist.update_hist_entry(hist.key_hist, PREFIX_PERPG_OMAP, key_size, value_size);
+ num_omap++;
+ } else if (key.first == PREFIX_PGMETA_OMAP) {
+ hist.update_hist_entry(hist.key_hist, PREFIX_PGMETA_OMAP, key_size, value_size);
+ num_pgmeta_omap++;
+ } else if (key.first == PREFIX_DEFERRED) {
+ hist.update_hist_entry(hist.key_hist, PREFIX_DEFERRED, key_size, value_size);
+ num_deferred++;
+ } else if (key.first == PREFIX_ALLOC || key.first == PREFIX_ALLOC_BITMAP) {
+ hist.update_hist_entry(hist.key_hist, PREFIX_ALLOC, key_size, value_size);
+ num_alloc++;
+ } else if (key.first == PREFIX_SHARED_BLOB) {
+ hist.update_hist_entry(hist.key_hist, PREFIX_SHARED_BLOB, key_size, value_size);
+ num_shared_shards++;
+ } else {
+ hist.update_hist_entry(hist.key_hist, prefix_other, key_size, value_size);
+ num_others++;
+ }
+ iter->next();
+ }
+
+ ceph::timespan duration = coarse_mono_clock::now() - start;
+ f->open_object_section("rocksdb_key_value_stats");
+ f->dump_unsigned("num_onodes", num_onodes);
+ f->dump_unsigned("num_shards", num_shards);
+ f->dump_unsigned("num_super", num_super);
+ f->dump_unsigned("num_coll", num_coll);
+ f->dump_unsigned("num_omap", num_omap);
+ f->dump_unsigned("num_pgmeta_omap", num_pgmeta_omap);
+ f->dump_unsigned("num_deferred", num_deferred);
+ f->dump_unsigned("num_alloc", num_alloc);
+ f->dump_unsigned("num_stat", num_stat);
+ f->dump_unsigned("num_shared_shards", num_shared_shards);
+ f->dump_unsigned("num_others", num_others);
+ f->dump_unsigned("max_key_size", max_key_size);
+ f->dump_unsigned("max_value_size", max_value_size);
+ f->dump_unsigned("total_key_size", total_key_size);
+ f->dump_unsigned("total_value_size", total_value_size);
+ f->close_section();
+
+ hist.dump(f);
+
+ dout(20) << __func__ << " finished in " << duration << " seconds" << dendl;
+
+}
+
+void BlueStore::_shutdown_cache()
+{
+ dout(10) << __func__ << dendl;
+ for (auto i : buffer_cache_shards) {
+ i->flush();
+ ceph_assert(i->empty());
+ }
+ for (auto& p : coll_map) {
+ p.second->onode_space.clear();
+ if (!p.second->shared_blob_set.empty()) {
+ derr << __func__ << " stray shared blobs on " << p.first << dendl;
+ p.second->shared_blob_set.dump<0>(cct);
+ }
+ ceph_assert(p.second->onode_space.empty());
+ ceph_assert(p.second->shared_blob_set.empty());
+ }
+ coll_map.clear();
+ for (auto i : onode_cache_shards) {
+ ceph_assert(i->empty());
+ }
+}
+
+// For external caller.
+// We use a best-effort policy instead, e.g.,
+// we don't care if there are still some pinned onodes/data in the cache
+// after this command is completed.
+int BlueStore::flush_cache(ostream *os)
+{
+ dout(10) << __func__ << dendl;
+ for (auto i : onode_cache_shards) {
+ i->flush();
+ }
+ for (auto i : buffer_cache_shards) {
+ i->flush();
+ }
+
+ return 0;
+}
+
+void BlueStore::_apply_padding(uint64_t head_pad,
+ uint64_t tail_pad,
+ bufferlist& padded)
+{
+ if (head_pad) {
+ padded.prepend_zero(head_pad);
+ }
+ if (tail_pad) {
+ padded.append_zero(tail_pad);
+ }
+ if (head_pad || tail_pad) {
+ dout(20) << __func__ << " can pad head 0x" << std::hex << head_pad
+ << " tail 0x" << tail_pad << std::dec << dendl;
+ logger->inc(l_bluestore_write_pad_bytes, head_pad + tail_pad);
+ }
+}
+
+void BlueStore::_record_onode(OnodeRef& o, KeyValueDB::Transaction &txn)
+{
+ // finalize extent_map shards
+ o->extent_map.update(txn, false);
+ if (o->extent_map.needs_reshard()) {
+ o->extent_map.reshard(db, txn);
+ o->extent_map.update(txn, true);
+ if (o->extent_map.needs_reshard()) {
+ dout(20) << __func__ << " warning: still wants reshard, check options?"
+ << dendl;
+ o->extent_map.clear_needs_reshard();
+ }
+ logger->inc(l_bluestore_onode_reshard);
+ }
+
+ // bound encode
+ size_t bound = 0;
+ denc(o->onode, bound);
+ o->extent_map.bound_encode_spanning_blobs(bound);
+ if (o->onode.extent_map_shards.empty()) {
+ denc(o->extent_map.inline_bl, bound);
+ }
+
+ // encode
+ bufferlist bl;
+ unsigned onode_part, blob_part, extent_part;
+ {
+ auto p = bl.get_contiguous_appender(bound, true);
+ denc(o->onode, p);
+ onode_part = p.get_logical_offset();
+ o->extent_map.encode_spanning_blobs(p);
+ blob_part = p.get_logical_offset() - onode_part;
+ if (o->onode.extent_map_shards.empty()) {
+ denc(o->extent_map.inline_bl, p);
+ }
+ extent_part = p.get_logical_offset() - onode_part - blob_part;
+ }
+
+ dout(20) << __func__ << " onode " << o->oid << " is " << bl.length()
+ << " (" << onode_part << " bytes onode + "
+ << blob_part << " bytes spanning blobs + "
+ << extent_part << " bytes inline extents)"
+ << dendl;
+
+
+ txn->set(PREFIX_OBJ, o->key.c_str(), o->key.size(), bl);
+}
+
+void BlueStore::_log_alerts(osd_alert_list_t& alerts)
+{
+ std::lock_guard l(qlock);
+ size_t used = bluefs && bluefs_layout.shared_bdev == BlueFS::BDEV_SLOW ?
+ bluefs->get_used(BlueFS::BDEV_SLOW) : 0;
+ if (used > 0) {
+ auto db_used = bluefs->get_used(BlueFS::BDEV_DB);
+ auto db_total = bluefs->get_total(BlueFS::BDEV_DB);
+ ostringstream ss;
+ ss << "spilled over " << byte_u_t(used)
+ << " metadata from 'db' device (" << byte_u_t(db_used)
+ << " used of " << byte_u_t(db_total) << ") to slow device";
+ spillover_alert = ss.str();
+ } else if (!spillover_alert.empty()){
+ spillover_alert.clear();
+ }
+
+ if (!spurious_read_errors_alert.empty() &&
+ cct->_conf->bluestore_warn_on_spurious_read_errors) {
+ alerts.emplace(
+ "BLUESTORE_SPURIOUS_READ_ERRORS",
+ spurious_read_errors_alert);
+ }
+ if (!disk_size_mismatch_alert.empty()) {
+ alerts.emplace(
+ "BLUESTORE_DISK_SIZE_MISMATCH",
+ disk_size_mismatch_alert);
+ }
+ if (!legacy_statfs_alert.empty()) {
+ alerts.emplace(
+ "BLUESTORE_LEGACY_STATFS",
+ legacy_statfs_alert);
+ }
+ if (!spillover_alert.empty() &&
+ cct->_conf->bluestore_warn_on_bluefs_spillover) {
+ alerts.emplace(
+ "BLUEFS_SPILLOVER",
+ spillover_alert);
+ }
+ if (!no_per_pg_omap_alert.empty()) {
+ alerts.emplace(
+ "BLUESTORE_NO_PER_PG_OMAP",
+ no_per_pg_omap_alert);
+ }
+ if (!no_per_pool_omap_alert.empty()) {
+ alerts.emplace(
+ "BLUESTORE_NO_PER_POOL_OMAP",
+ no_per_pool_omap_alert);
+ }
+ string s0(failed_cmode);
+
+ if (!failed_compressors.empty()) {
+ if (!s0.empty()) {
+ s0 += ", ";
+ }
+ s0 += "unable to load:";
+ bool first = true;
+ for (auto& s : failed_compressors) {
+ if (first) {
+ first = false;
+ } else {
+ s0 += ", ";
+ }
+ s0 += s;
+ }
+ alerts.emplace(
+ "BLUESTORE_NO_COMPRESSION",
+ s0);
+ }
+}
+
+void BlueStore::_collect_allocation_stats(uint64_t need, uint32_t alloc_size,
+ const PExtentVector& extents)
+{
+ alloc_stats_count++;
+ alloc_stats_fragments += extents.size();
+ alloc_stats_size += need;
+
+ for (auto& e : extents) {
+ logger->hinc(l_bluestore_allocate_hist, e.length, need);
+ }
+}
+
+void BlueStore::_record_allocation_stats()
+{
+ // don't care about data consistency,
+ // fields can be partially modified while making the tuple
+ auto t0 = std::make_tuple(
+ alloc_stats_count.exchange(0),
+ alloc_stats_fragments.exchange(0),
+ alloc_stats_size.exchange(0));
+
+ dout(0) << " allocation stats probe "
+ << probe_count << ":"
+ << " cnt: " << std::get<0>(t0)
+ << " frags: " << std::get<1>(t0)
+ << " size: " << std::get<2>(t0)
+ << dendl;
+
+
+ //
+ // Keep the history for probes from the power-of-two sequence:
+ // -1, -2, -4, -8, -16
+ //
+ size_t base = 1;
+ for (auto& t : alloc_stats_history) {
+ dout(0) << " probe -"
+ << base + (probe_count % base) << ": "
+ << std::get<0>(t)
+ << ", " << std::get<1>(t)
+ << ", " << std::get<2>(t)
+ << dendl;
+ base <<= 1;
+ }
+ dout(0) << "------------" << dendl;
+
+ ++ probe_count;
+
+ for (ssize_t i = alloc_stats_history.size() - 1 ; i > 0 ; --i) {
+ if ((probe_count % (1 << i)) == 0) {
+ alloc_stats_history[i] = alloc_stats_history[i - 1];
+ }
+ }
+ alloc_stats_history[0].swap(t0);
+}
+
+// ===========================================
+// BlueStoreRepairer
+
+size_t BlueStoreRepairer::StoreSpaceTracker::filter_out(
+ const interval_set<uint64_t>& extents)
+{
+ ceph_assert(granularity); // initialized
+ // can't call for the second time
+ ceph_assert(!was_filtered_out);
+ ceph_assert(collections_bfs.size() == objects_bfs.size());
+
+ uint64_t prev_pos = 0;
+ uint64_t npos = collections_bfs.size();
+
+ bloom_vector collections_reduced;
+ bloom_vector objects_reduced;
+
+ for (auto e : extents) {
+ if (e.second == 0) {
+ continue;
+ }
+ uint64_t pos = max(e.first / granularity, prev_pos);
+ uint64_t end_pos = 1 + (e.first + e.second - 1) / granularity;
+ while (pos != npos && pos < end_pos) {
+ ceph_assert( collections_bfs[pos].element_count() ==
+ objects_bfs[pos].element_count());
+ if (collections_bfs[pos].element_count()) {
+ collections_reduced.push_back(std::move(collections_bfs[pos]));
+ objects_reduced.push_back(std::move(objects_bfs[pos]));
+ }
+ ++pos;
+ }
+ prev_pos = end_pos;
+ }
+ collections_reduced.swap(collections_bfs);
+ objects_reduced.swap(objects_bfs);
+ was_filtered_out = true;
+ return collections_bfs.size();
+}
+
+bool BlueStoreRepairer::remove_key(KeyValueDB *db,
+ const string& prefix,
+ const string& key)
+{
+ std::lock_guard l(lock);
+ if (!remove_key_txn) {
+ remove_key_txn = db->get_transaction();
+ }
+ ++to_repair_cnt;
+ remove_key_txn->rmkey(prefix, key);
+
+ return true;
+}
+
+void BlueStoreRepairer::fix_per_pool_omap(KeyValueDB *db, int val)
+{
+ std::lock_guard l(lock); // possibly redundant
+ ceph_assert(fix_per_pool_omap_txn == nullptr);
+ fix_per_pool_omap_txn = db->get_transaction();
+ ++to_repair_cnt;
+ bufferlist bl;
+ bl.append(stringify(val));
+ fix_per_pool_omap_txn->set(PREFIX_SUPER, "per_pool_omap", bl);
+}
+
+bool BlueStoreRepairer::fix_shared_blob(
+ KeyValueDB::Transaction txn,
+ uint64_t sbid,
+ bluestore_extent_ref_map_t* ref_map,
+ size_t repaired)
+{
+ string key;
+ get_shared_blob_key(sbid, &key);
+ if (ref_map) {
+ bluestore_shared_blob_t persistent(sbid, std::move(*ref_map));
+ bufferlist bl;
+ encode(persistent, bl);
+ txn->set(PREFIX_SHARED_BLOB, key, bl);
+ } else {
+ txn->rmkey(PREFIX_SHARED_BLOB, key);
+ }
+ to_repair_cnt += repaired;
+ return true;
+}
+
+bool BlueStoreRepairer::fix_statfs(KeyValueDB *db,
+ const string& key,
+ const store_statfs_t& new_statfs)
+{
+ std::lock_guard l(lock);
+ if (!fix_statfs_txn) {
+ fix_statfs_txn = db->get_transaction();
+ }
+ BlueStore::volatile_statfs vstatfs;
+ vstatfs = new_statfs;
+ bufferlist bl;
+ vstatfs.encode(bl);
+ ++to_repair_cnt;
+ fix_statfs_txn->set(PREFIX_STAT, key, bl);
+ return true;
+}
+
+bool BlueStoreRepairer::fix_leaked(KeyValueDB *db,
+ FreelistManager* fm,
+ uint64_t offset, uint64_t len)
+{
+ std::lock_guard l(lock);
+ ceph_assert(!fm->is_null_manager());
+
+ if (!fix_fm_leaked_txn) {
+ fix_fm_leaked_txn = db->get_transaction();
+ }
+ ++to_repair_cnt;
+ fm->release(offset, len, fix_fm_leaked_txn);
+ return true;
+}
+bool BlueStoreRepairer::fix_false_free(KeyValueDB *db,
+ FreelistManager* fm,
+ uint64_t offset, uint64_t len)
+{
+ std::lock_guard l(lock);
+ ceph_assert(!fm->is_null_manager());
+
+ if (!fix_fm_false_free_txn) {
+ fix_fm_false_free_txn = db->get_transaction();
+ }
+ ++to_repair_cnt;
+ fm->allocate(offset, len, fix_fm_false_free_txn);
+ return true;
+}
+
+bool BlueStoreRepairer::fix_spanning_blobs(
+ KeyValueDB* db,
+ std::function<void(KeyValueDB::Transaction)> f)
+{
+ std::lock_guard l(lock);
+ if (!fix_onode_txn) {
+ fix_onode_txn = db->get_transaction();
+ }
+ f(fix_onode_txn);
+ ++to_repair_cnt;
+ return true;
+}
+
+bool BlueStoreRepairer::preprocess_misreference(KeyValueDB *db)
+{
+ //NB: not for use in multithreading mode!!!
+ if (misreferenced_extents.size()) {
+ size_t n = space_usage_tracker.filter_out(misreferenced_extents);
+ ceph_assert(n > 0);
+ if (!fix_misreferences_txn) {
+ fix_misreferences_txn = db->get_transaction();
+ }
+ return true;
+ }
+ return false;
+}
+
+unsigned BlueStoreRepairer::apply(KeyValueDB* db)
+{
+ //NB: not for use in multithreading mode!!!
+ if (fix_per_pool_omap_txn) {
+ auto ok = db->submit_transaction_sync(fix_per_pool_omap_txn) == 0;
+ ceph_assert(ok);
+ fix_per_pool_omap_txn = nullptr;
+ }
+ if (fix_fm_leaked_txn) {
+ auto ok = db->submit_transaction_sync(fix_fm_leaked_txn) == 0;
+ ceph_assert(ok);
+ fix_fm_leaked_txn = nullptr;
+ }
+ if (fix_fm_false_free_txn) {
+ auto ok = db->submit_transaction_sync(fix_fm_false_free_txn) == 0;
+ ceph_assert(ok);
+ fix_fm_false_free_txn = nullptr;
+ }
+ if (remove_key_txn) {
+ auto ok = db->submit_transaction_sync(remove_key_txn) == 0;
+ ceph_assert(ok);
+ remove_key_txn = nullptr;
+ }
+ if (fix_misreferences_txn) {
+ auto ok = db->submit_transaction_sync(fix_misreferences_txn) == 0;
+ ceph_assert(ok);
+ fix_misreferences_txn = nullptr;
+ }
+ if (fix_onode_txn) {
+ auto ok = db->submit_transaction_sync(fix_onode_txn) == 0;
+ ceph_assert(ok);
+ fix_onode_txn = nullptr;
+ }
+ if (fix_shared_blob_txn) {
+ auto ok = db->submit_transaction_sync(fix_shared_blob_txn) == 0;
+ ceph_assert(ok);
+ fix_shared_blob_txn = nullptr;
+ }
+ if (fix_statfs_txn) {
+ auto ok = db->submit_transaction_sync(fix_statfs_txn) == 0;
+ ceph_assert(ok);
+ fix_statfs_txn = nullptr;
+ }
+ if (need_compact) {
+ db->compact();
+ need_compact = false;
+ }
+ unsigned repaired = to_repair_cnt;
+ to_repair_cnt = 0;
+ return repaired;
+}
+
+// =======================================================
+// RocksDBBlueFSVolumeSelector
+
+uint8_t RocksDBBlueFSVolumeSelector::select_prefer_bdev(void* h) {
+ ceph_assert(h != nullptr);
+ uint64_t hint = reinterpret_cast<uint64_t>(h);
+ uint8_t res;
+ switch (hint) {
+ case LEVEL_SLOW:
+ res = BlueFS::BDEV_SLOW;
+ if (db_avail4slow > 0) {
+ // considering statically available db space vs.
+ // - observed maximums on DB dev for DB/WAL/UNSORTED data
+ // - observed maximum spillovers
+ uint64_t max_db_use = 0; // max db usage we potentially observed
+ max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_LOG - LEVEL_FIRST);
+ max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_WAL - LEVEL_FIRST);
+ max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_DB - LEVEL_FIRST);
+ // this could go to db hence using it in the estimation
+ max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_SLOW, LEVEL_DB - LEVEL_FIRST);
+
+ auto db_total = l_totals[LEVEL_DB - LEVEL_FIRST];
+ uint64_t avail = min(
+ db_avail4slow,
+ max_db_use < db_total ? db_total - max_db_use : 0);
+
+ // considering current DB dev usage for SLOW data
+ if (avail > per_level_per_dev_usage.at(BlueFS::BDEV_DB, LEVEL_SLOW - LEVEL_FIRST)) {
+ res = BlueFS::BDEV_DB;
+ }
+ }
+ break;
+ case LEVEL_LOG:
+ case LEVEL_WAL:
+ res = BlueFS::BDEV_WAL;
+ break;
+ case LEVEL_DB:
+ default:
+ res = BlueFS::BDEV_DB;
+ break;
+ }
+ return res;
+}
+
+void RocksDBBlueFSVolumeSelector::get_paths(const std::string& base, paths& res) const
+{
+ auto db_size = l_totals[LEVEL_DB - LEVEL_FIRST];
+ res.emplace_back(base, db_size);
+ auto slow_size = l_totals[LEVEL_SLOW - LEVEL_FIRST];
+ if (slow_size == 0) {
+ slow_size = db_size;
+ }
+ res.emplace_back(base + ".slow", slow_size);
+}
+
+void* RocksDBBlueFSVolumeSelector::get_hint_by_dir(std::string_view dirname) const {
+ uint8_t res = LEVEL_DB;
+ if (dirname.length() > 5) {
+ // the "db.slow" and "db.wal" directory names are hard-coded at
+ // match up with bluestore. the slow device is always the second
+ // one (when a dedicated block.db device is present and used at
+ // bdev 0). the wal device is always last.
+ if (boost::algorithm::ends_with(dirname, ".slow")) {
+ res = LEVEL_SLOW;
+ }
+ else if (boost::algorithm::ends_with(dirname, ".wal")) {
+ res = LEVEL_WAL;
+ }
+ }
+ return reinterpret_cast<void*>(res);
+}
+
+void RocksDBBlueFSVolumeSelector::dump(ostream& sout) {
+ auto max_x = per_level_per_dev_usage.get_max_x();
+ auto max_y = per_level_per_dev_usage.get_max_y();
+
+ sout << "RocksDBBlueFSVolumeSelector Usage Matrix:" << std::endl;
+ constexpr std::array<const char*, 8> names{ {
+ "DEV/LEV",
+ "WAL",
+ "DB",
+ "SLOW",
+ "*",
+ "*",
+ "REAL",
+ "FILES",
+ } };
+ const size_t width = 12;
+ for (size_t i = 0; i < names.size(); ++i) {
+ sout.setf(std::ios::left, std::ios::adjustfield);
+ sout.width(width);
+ sout << names[i];
+ }
+ sout << std::endl;
+ for (size_t l = 0; l < max_y; l++) {
+ sout.setf(std::ios::left, std::ios::adjustfield);
+ sout.width(width);
+ switch (l + LEVEL_FIRST) {
+ case LEVEL_LOG:
+ sout << "LOG"; break;
+ case LEVEL_WAL:
+ sout << "WAL"; break;
+ case LEVEL_DB:
+ sout << "DB"; break;
+ case LEVEL_SLOW:
+ sout << "SLOW"; break;
+ case LEVEL_MAX:
+ sout << "TOTAL"; break;
+ }
+ for (size_t d = 0; d < max_x; d++) {
+ sout.setf(std::ios::left, std::ios::adjustfield);
+ sout.width(width);
+ sout << stringify(byte_u_t(per_level_per_dev_usage.at(d, l)));
+ }
+ sout.setf(std::ios::left, std::ios::adjustfield);
+ sout.width(width);
+ sout << stringify(per_level_files[l]) << std::endl;
+ }
+ ceph_assert(max_x == per_level_per_dev_max.get_max_x());
+ ceph_assert(max_y == per_level_per_dev_max.get_max_y());
+ sout << "MAXIMUMS:" << std::endl;
+ for (size_t l = 0; l < max_y; l++) {
+ sout.setf(std::ios::left, std::ios::adjustfield);
+ sout.width(width);
+ switch (l + LEVEL_FIRST) {
+ case LEVEL_LOG:
+ sout << "LOG"; break;
+ case LEVEL_WAL:
+ sout << "WAL"; break;
+ case LEVEL_DB:
+ sout << "DB"; break;
+ case LEVEL_SLOW:
+ sout << "SLOW"; break;
+ case LEVEL_MAX:
+ sout << "TOTAL"; break;
+ }
+ for (size_t d = 0; d < max_x - 1; d++) {
+ sout.setf(std::ios::left, std::ios::adjustfield);
+ sout.width(width);
+ sout << stringify(byte_u_t(per_level_per_dev_max.at(d, l)));
+ }
+ sout.setf(std::ios::left, std::ios::adjustfield);
+ sout.width(width);
+ sout << stringify(byte_u_t(per_level_per_dev_max.at(max_x - 1, l)));
+ sout << std::endl;
+ }
+ string sizes[] = {
+ ">> SIZE <<",
+ stringify(byte_u_t(l_totals[LEVEL_WAL - LEVEL_FIRST])),
+ stringify(byte_u_t(l_totals[LEVEL_DB - LEVEL_FIRST])),
+ stringify(byte_u_t(l_totals[LEVEL_SLOW - LEVEL_FIRST])),
+ };
+ for (size_t i = 0; i < (sizeof(sizes) / sizeof(sizes[0])); i++) {
+ sout.setf(std::ios::left, std::ios::adjustfield);
+ sout.width(width);
+ sout << sizes[i];
+ }
+ sout << std::endl;
+}
+
+BlueFSVolumeSelector* RocksDBBlueFSVolumeSelector::clone_empty() const {
+ RocksDBBlueFSVolumeSelector* ns =
+ new RocksDBBlueFSVolumeSelector(0, 0, 0,
+ 0, 0, 0,
+ 0, 0, false);
+ return ns;
+}
+
+bool RocksDBBlueFSVolumeSelector::compare(BlueFSVolumeSelector* other) {
+ RocksDBBlueFSVolumeSelector* o = dynamic_cast<RocksDBBlueFSVolumeSelector*>(other);
+ ceph_assert(o);
+ bool equal = true;
+ for (size_t x = 0; x < BlueFS::MAX_BDEV + 1; x++) {
+ for (size_t y = 0; y <LEVEL_MAX - LEVEL_FIRST + 1; y++) {
+ equal &= (per_level_per_dev_usage.at(x, y) == o->per_level_per_dev_usage.at(x, y));
+ }
+ }
+ for (size_t t = 0; t < LEVEL_MAX - LEVEL_FIRST + 1; t++) {
+ equal &= (per_level_files[t] == o->per_level_files[t]);
+ }
+ return equal;
+}
+
+// =======================================================
+
+//================================================================================================================
+// BlueStore is committing all allocation information (alloc/release) into RocksDB before the client Write is performed.
+// This cause a delay in write path and add significant load to the CPU/Memory/Disk.
+// The reason for the RocksDB updates is that it allows Ceph to survive any failure without losing the allocation state.
+//
+// We changed the code skiping RocksDB updates on allocation time and instead performing a full desatge of the allocator object
+// with all the OSD allocation state in a single step during umount().
+// This change leads to a 25% increase in IOPS and reduced latency in small random-write workload, but exposes the system
+// to losing allocation info in failure cases where we don't call umount.
+// We add code to perform a full allocation-map rebuild from information stored inside the ONode which is used in failure cases.
+// When we perform a graceful shutdown there is no need for recovery and we simply read the allocation-map from a flat file
+// where we store the allocation-map during umount().
+//================================================================================================================
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore::NCB::" << __func__ << "::"
+
+static const std::string allocator_dir = "ALLOCATOR_NCB_DIR";
+static const std::string allocator_file = "ALLOCATOR_NCB_FILE";
+static uint32_t s_format_version = 0x01; // support future changes to allocator-map file
+static uint32_t s_serial = 0x01;
+
+#if 1
+#define CEPHTOH_32 le32toh
+#define CEPHTOH_64 le64toh
+#define HTOCEPH_32 htole32
+#define HTOCEPH_64 htole64
+#else
+// help debug the encode/decode by forcing alien format
+#define CEPHTOH_32 be32toh
+#define CEPHTOH_64 be64toh
+#define HTOCEPH_32 htobe32
+#define HTOCEPH_64 htobe64
+#endif
+
+// 48 Bytes header for on-disk alloator image
+const uint64_t ALLOCATOR_IMAGE_VALID_SIGNATURE = 0x1FACE0FF;
+struct allocator_image_header {
+ uint32_t format_version; // 0x00
+ uint32_t valid_signature; // 0x04
+ utime_t timestamp; // 0x08
+ uint32_t serial; // 0x10
+ uint32_t pad[0x7]; // 0x14
+
+ allocator_image_header() {
+ memset((char*)this, 0, sizeof(allocator_image_header));
+ }
+
+ // create header in CEPH format
+ allocator_image_header(utime_t timestamp, uint32_t format_version, uint32_t serial) {
+ this->format_version = format_version;
+ this->timestamp = timestamp;
+ this->valid_signature = ALLOCATOR_IMAGE_VALID_SIGNATURE;
+ this->serial = serial;
+ memset(this->pad, 0, sizeof(this->pad));
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const allocator_image_header& header) {
+ out << "format_version = " << header.format_version << std::endl;
+ out << "valid_signature = " << header.valid_signature << "/" << ALLOCATOR_IMAGE_VALID_SIGNATURE << std::endl;
+ out << "timestamp = " << header.timestamp << std::endl;
+ out << "serial = " << header.serial << std::endl;
+ for (unsigned i = 0; i < sizeof(header.pad)/sizeof(uint32_t); i++) {
+ if (header.pad[i]) {
+ out << "header.pad[" << i << "] = " << header.pad[i] << std::endl;
+ }
+ }
+ return out;
+ }
+
+ DENC(allocator_image_header, v, p) {
+ denc(v.format_version, p);
+ denc(v.valid_signature, p);
+ denc(v.timestamp.tv.tv_sec, p);
+ denc(v.timestamp.tv.tv_nsec, p);
+ denc(v.serial, p);
+ for (auto& pad: v.pad) {
+ denc(pad, p);
+ }
+ }
+
+
+ int verify(CephContext* cct, const std::string &path) {
+ if (valid_signature == ALLOCATOR_IMAGE_VALID_SIGNATURE) {
+ for (unsigned i = 0; i < (sizeof(pad) / sizeof(uint32_t)); i++) {
+ if (this->pad[i]) {
+ derr << "Illegal Header - pad[" << i << "]="<< pad[i] << dendl;
+ return -1;
+ }
+ }
+ return 0;
+ }
+ else {
+ derr << "Illegal Header - signature="<< valid_signature << "(" << ALLOCATOR_IMAGE_VALID_SIGNATURE << ")" << dendl;
+ return -1;
+ }
+ }
+};
+WRITE_CLASS_DENC(allocator_image_header)
+
+// 56 Bytes trailer for on-disk alloator image
+struct allocator_image_trailer {
+ extent_t null_extent; // 0x00
+
+ uint32_t format_version; // 0x10
+ uint32_t valid_signature; // 0x14
+
+ utime_t timestamp; // 0x18
+
+ uint32_t serial; // 0x20
+ uint32_t pad; // 0x24
+ uint64_t entries_count; // 0x28
+ uint64_t allocation_size; // 0x30
+
+ // trailer is created in CEPH format
+ allocator_image_trailer(utime_t timestamp, uint32_t format_version, uint32_t serial, uint64_t entries_count, uint64_t allocation_size) {
+ memset((char*)&(this->null_extent), 0, sizeof(this->null_extent));
+ this->format_version = format_version;
+ this->valid_signature = ALLOCATOR_IMAGE_VALID_SIGNATURE;
+ this->timestamp = timestamp;
+ this->serial = serial;
+ this->pad = 0;
+ this->entries_count = entries_count;
+ this->allocation_size = allocation_size;
+ }
+
+ allocator_image_trailer() {
+ memset((char*)this, 0, sizeof(allocator_image_trailer));
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const allocator_image_trailer& trailer) {
+ if (trailer.null_extent.offset || trailer.null_extent.length) {
+ out << "trailer.null_extent.offset = " << trailer.null_extent.offset << std::endl;
+ out << "trailer.null_extent.length = " << trailer.null_extent.length << std::endl;
+ }
+ out << "format_version = " << trailer.format_version << std::endl;
+ out << "valid_signature = " << trailer.valid_signature << "/" << ALLOCATOR_IMAGE_VALID_SIGNATURE << std::endl;
+ out << "timestamp = " << trailer.timestamp << std::endl;
+ out << "serial = " << trailer.serial << std::endl;
+ if (trailer.pad) {
+ out << "trailer.pad= " << trailer.pad << std::endl;
+ }
+ out << "entries_count = " << trailer.entries_count << std::endl;
+ out << "allocation_size = " << trailer.allocation_size << std::endl;
+ return out;
+ }
+
+ int verify(CephContext* cct, const std::string &path, const allocator_image_header *p_header, uint64_t entries_count, uint64_t allocation_size) {
+ if (valid_signature == ALLOCATOR_IMAGE_VALID_SIGNATURE) {
+
+ // trailer must starts with null extents (both fields set to zero) [no need to convert formats for zero)
+ if (null_extent.offset || null_extent.length) {
+ derr << "illegal trailer - null_extent = [" << null_extent.offset << "," << null_extent.length << "]"<< dendl;
+ return -1;
+ }
+
+ if (serial != p_header->serial) {
+ derr << "Illegal trailer: header->serial(" << p_header->serial << ") != trailer->serial(" << serial << ")" << dendl;
+ return -1;
+ }
+
+ if (format_version != p_header->format_version) {
+ derr << "Illegal trailer: header->format_version(" << p_header->format_version
+ << ") != trailer->format_version(" << format_version << ")" << dendl;
+ return -1;
+ }
+
+ if (timestamp != p_header->timestamp) {
+ derr << "Illegal trailer: header->timestamp(" << p_header->timestamp
+ << ") != trailer->timestamp(" << timestamp << ")" << dendl;
+ return -1;
+ }
+
+ if (this->entries_count != entries_count) {
+ derr << "Illegal trailer: entries_count(" << entries_count << ") != trailer->entries_count("
+ << this->entries_count << ")" << dendl;
+ return -1;
+ }
+
+ if (this->allocation_size != allocation_size) {
+ derr << "Illegal trailer: allocation_size(" << allocation_size << ") != trailer->allocation_size("
+ << this->allocation_size << ")" << dendl;
+ return -1;
+ }
+
+ if (pad) {
+ derr << "Illegal Trailer - pad="<< pad << dendl;
+ return -1;
+ }
+
+ // if arrived here -> trailer is valid !!
+ return 0;
+ } else {
+ derr << "Illegal Trailer - signature="<< valid_signature << "(" << ALLOCATOR_IMAGE_VALID_SIGNATURE << ")" << dendl;
+ return -1;
+ }
+ }
+
+ DENC(allocator_image_trailer, v, p) {
+ denc(v.null_extent.offset, p);
+ denc(v.null_extent.length, p);
+ denc(v.format_version, p);
+ denc(v.valid_signature, p);
+ denc(v.timestamp.tv.tv_sec, p);
+ denc(v.timestamp.tv.tv_nsec, p);
+ denc(v.serial, p);
+ denc(v.pad, p);
+ denc(v.entries_count, p);
+ denc(v.allocation_size, p);
+ }
+};
+WRITE_CLASS_DENC(allocator_image_trailer)
+
+
+//-------------------------------------------------------------------------------------
+// invalidate old allocation file if exists so will go directly to recovery after failure
+// we can safely ignore non-existing file
+int BlueStore::invalidate_allocation_file_on_bluefs()
+{
+ // mark that allocation-file was invalidated and we should destage a new copy whne closing db
+ need_to_destage_allocation_file = true;
+ dout(10) << __func__ << " need_to_destage_allocation_file was set" << dendl;
+
+ BlueFS::FileWriter *p_handle = nullptr;
+ if (!bluefs->dir_exists(allocator_dir)) {
+ dout(5) << "allocator_dir(" << allocator_dir << ") doesn't exist" << dendl;
+ // nothing to do -> return
+ return 0;
+ }
+
+ int ret = bluefs->stat(allocator_dir, allocator_file, nullptr, nullptr);
+ if (ret != 0) {
+ dout(5) << __func__ << " allocator_file(" << allocator_file << ") doesn't exist" << dendl;
+ // nothing to do -> return
+ return 0;
+ }
+
+
+ ret = bluefs->open_for_write(allocator_dir, allocator_file, &p_handle, true);
+ if (ret != 0) {
+ derr << __func__ << "::NCB:: Failed open_for_write with error-code "
+ << ret << dendl;
+ return -1;
+ }
+
+ dout(5) << "invalidate using bluefs->truncate(p_handle, 0)" << dendl;
+ ret = bluefs->truncate(p_handle, 0);
+ if (ret != 0) {
+ derr << __func__ << "::NCB:: Failed truncaste with error-code "
+ << ret << dendl;
+ bluefs->close_writer(p_handle);
+ return -1;
+ }
+
+ bluefs->fsync(p_handle);
+ bluefs->close_writer(p_handle);
+
+ return 0;
+}
+
+//-----------------------------------------------------------------------------------
+int BlueStore::copy_allocator(Allocator* src_alloc, Allocator* dest_alloc, uint64_t* p_num_entries)
+{
+ *p_num_entries = 0;
+ auto count_entries = [&](uint64_t extent_offset, uint64_t extent_length) {
+ (*p_num_entries)++;
+ };
+ src_alloc->foreach(count_entries);
+
+ dout(5) << "count num_entries=" << *p_num_entries << dendl;
+
+ // add 16K extra entries in case new allocation happened
+ (*p_num_entries) += 16*1024;
+ unique_ptr<extent_t[]> arr;
+ try {
+ arr = make_unique<extent_t[]>(*p_num_entries);
+ } catch (std::bad_alloc&) {
+ derr << "****Failed dynamic allocation, num_entries=" << *p_num_entries << dendl;
+ return -1;
+ }
+
+ uint64_t idx = 0;
+ auto copy_entries = [&](uint64_t extent_offset, uint64_t extent_length) {
+ if (extent_length > 0) {
+ if (idx < *p_num_entries) {
+ arr[idx] = {extent_offset, extent_length};
+ }
+ idx++;
+ }
+ else {
+ derr << "zero length extent!!! offset=" << extent_offset << ", index=" << idx << dendl;
+ }
+ };
+ src_alloc->foreach(copy_entries);
+
+ dout(5) << "copy num_entries=" << idx << dendl;
+ if (idx > *p_num_entries) {
+ derr << "****spillover, num_entries=" << *p_num_entries << ", spillover=" << (idx - *p_num_entries) << dendl;
+ ceph_assert(idx <= *p_num_entries);
+ }
+
+ *p_num_entries = idx;
+
+ for (idx = 0; idx < *p_num_entries; idx++) {
+ const extent_t *p_extent = &arr[idx];
+ dest_alloc->init_add_free(p_extent->offset, p_extent->length);
+ }
+
+ return 0;
+}
+
+//-----------------------------------------------------------------------------------
+static uint32_t flush_extent_buffer_with_crc(BlueFS::FileWriter *p_handle, const char* buffer, const char *p_curr, uint32_t crc)
+{
+ std::ptrdiff_t length = p_curr - buffer;
+ p_handle->append(buffer, length);
+
+ crc = ceph_crc32c(crc, (const uint8_t*)buffer, length);
+ uint32_t encoded_crc = HTOCEPH_32(crc);
+ p_handle->append((byte*)&encoded_crc, sizeof(encoded_crc));
+
+ return crc;
+}
+
+const unsigned MAX_EXTENTS_IN_BUFFER = 4 * 1024; // 4K extents = 64KB of data
+// write the allocator to a flat bluefs file - 4K extents at a time
+//-----------------------------------------------------------------------------------
+int BlueStore::store_allocator(Allocator* src_allocator)
+{
+ // when storing allocations to file we must be sure there is no background compactions
+ // the easiest way to achieve it is to make sure db is closed
+ ceph_assert(db == nullptr);
+ utime_t start_time = ceph_clock_now();
+ int ret = 0;
+
+ // create dir if doesn't exist already
+ if (!bluefs->dir_exists(allocator_dir) ) {
+ ret = bluefs->mkdir(allocator_dir);
+ if (ret != 0) {
+ derr << "Failed mkdir with error-code " << ret << dendl;
+ return -1;
+ }
+ }
+ bluefs->compact_log();
+ // reuse previous file-allocation if exists
+ ret = bluefs->stat(allocator_dir, allocator_file, nullptr, nullptr);
+ bool overwrite_file = (ret == 0);
+ BlueFS::FileWriter *p_handle = nullptr;
+ ret = bluefs->open_for_write(allocator_dir, allocator_file, &p_handle, overwrite_file);
+ if (ret != 0) {
+ derr << __func__ << "Failed open_for_write with error-code " << ret << dendl;
+ return -1;
+ }
+
+ uint64_t file_size = p_handle->file->fnode.size;
+ uint64_t allocated = p_handle->file->fnode.get_allocated();
+ dout(10) << "file_size=" << file_size << ", allocated=" << allocated << dendl;
+
+ bluefs->sync_metadata(false);
+ unique_ptr<Allocator> allocator(clone_allocator_without_bluefs(src_allocator));
+ if (!allocator) {
+ bluefs->close_writer(p_handle);
+ return -1;
+ }
+
+ // store all extents (except for the bluefs extents we removed) in a single flat file
+ utime_t timestamp = ceph_clock_now();
+ uint32_t crc = -1;
+ {
+ allocator_image_header header(timestamp, s_format_version, s_serial);
+ bufferlist header_bl;
+ encode(header, header_bl);
+ crc = header_bl.crc32c(crc);
+ encode(crc, header_bl);
+ p_handle->append(header_bl);
+ }
+
+ crc = -1; // reset crc
+ extent_t buffer[MAX_EXTENTS_IN_BUFFER]; // 64KB
+ extent_t *p_curr = buffer;
+ const extent_t *p_end = buffer + MAX_EXTENTS_IN_BUFFER;
+ uint64_t extent_count = 0;
+ uint64_t allocation_size = 0;
+ auto iterated_allocation = [&](uint64_t extent_offset, uint64_t extent_length) {
+ if (extent_length == 0) {
+ derr << __func__ << "" << extent_count << "::[" << extent_offset << "," << extent_length << "]" << dendl;
+ ret = -1;
+ return;
+ }
+ p_curr->offset = HTOCEPH_64(extent_offset);
+ p_curr->length = HTOCEPH_64(extent_length);
+ extent_count++;
+ allocation_size += extent_length;
+ p_curr++;
+
+ if (p_curr == p_end) {
+ crc = flush_extent_buffer_with_crc(p_handle, (const char*)buffer, (const char*)p_curr, crc);
+ p_curr = buffer; // recycle the buffer
+ }
+ };
+ allocator->foreach(iterated_allocation);
+ // if got null extent -> fail the operation
+ if (ret != 0) {
+ derr << "Illegal extent, fail store operation" << dendl;
+ derr << "invalidate using bluefs->truncate(p_handle, 0)" << dendl;
+ bluefs->truncate(p_handle, 0);
+ bluefs->close_writer(p_handle);
+ return -1;
+ }
+
+ // if we got any leftovers -> add crc and append to file
+ if (p_curr > buffer) {
+ crc = flush_extent_buffer_with_crc(p_handle, (const char*)buffer, (const char*)p_curr, crc);
+ }
+
+ {
+ allocator_image_trailer trailer(timestamp, s_format_version, s_serial, extent_count, allocation_size);
+ bufferlist trailer_bl;
+ encode(trailer, trailer_bl);
+ uint32_t crc = -1;
+ crc = trailer_bl.crc32c(crc);
+ encode(crc, trailer_bl);
+ p_handle->append(trailer_bl);
+ }
+
+ bluefs->fsync(p_handle);
+ bluefs->truncate(p_handle, p_handle->pos);
+ bluefs->fsync(p_handle);
+
+ utime_t duration = ceph_clock_now() - start_time;
+ dout(5) <<"WRITE-extent_count=" << extent_count << ", allocation_size=" << allocation_size << ", serial=" << s_serial << dendl;
+ dout(5) <<"p_handle->pos=" << p_handle->pos << " WRITE-duration=" << duration << " seconds" << dendl;
+
+ bluefs->close_writer(p_handle);
+ need_to_destage_allocation_file = false;
+ return 0;
+}
+
+//-----------------------------------------------------------------------------------
+Allocator* BlueStore::create_bitmap_allocator(uint64_t bdev_size) {
+ // create allocator
+ uint64_t alloc_size = min_alloc_size;
+ Allocator* alloc = Allocator::create(cct, "bitmap", bdev_size, alloc_size,
+ zone_size, first_sequential_zone,
+ "recovery");
+ if (alloc) {
+ return alloc;
+ } else {
+ derr << "Failed Allocator Creation" << dendl;
+ return nullptr;
+ }
+}
+
+//-----------------------------------------------------------------------------------
+size_t calc_allocator_image_header_size()
+{
+ utime_t timestamp = ceph_clock_now();
+ allocator_image_header header(timestamp, s_format_version, s_serial);
+ bufferlist header_bl;
+ encode(header, header_bl);
+ uint32_t crc = -1;
+ crc = header_bl.crc32c(crc);
+ encode(crc, header_bl);
+
+ return header_bl.length();
+}
+
+//-----------------------------------------------------------------------------------
+int calc_allocator_image_trailer_size()
+{
+ utime_t timestamp = ceph_clock_now();
+ uint64_t extent_count = -1;
+ uint64_t allocation_size = -1;
+ uint32_t crc = -1;
+ bufferlist trailer_bl;
+ allocator_image_trailer trailer(timestamp, s_format_version, s_serial, extent_count, allocation_size);
+
+ encode(trailer, trailer_bl);
+ crc = trailer_bl.crc32c(crc);
+ encode(crc, trailer_bl);
+ return trailer_bl.length();
+}
+
+//-----------------------------------------------------------------------------------
+int BlueStore::__restore_allocator(Allocator* allocator, uint64_t *num, uint64_t *bytes)
+{
+ if (cct->_conf->bluestore_debug_inject_allocation_from_file_failure > 0) {
+ boost::mt11213b rng(time(NULL));
+ boost::uniform_real<> ur(0, 1);
+ if (ur(rng) < cct->_conf->bluestore_debug_inject_allocation_from_file_failure) {
+ derr << __func__ << " failure injected." << dendl;
+ return -1;
+ }
+ }
+ utime_t start_time = ceph_clock_now();
+ BlueFS::FileReader *p_temp_handle = nullptr;
+ int ret = bluefs->open_for_read(allocator_dir, allocator_file, &p_temp_handle, false);
+ if (ret != 0) {
+ dout(1) << "Failed open_for_read with error-code " << ret << dendl;
+ return -1;
+ }
+ unique_ptr<BlueFS::FileReader> p_handle(p_temp_handle);
+ uint64_t read_alloc_size = 0;
+ uint64_t file_size = p_handle->file->fnode.size;
+ dout(5) << "file_size=" << file_size << ",sizeof(extent_t)=" << sizeof(extent_t) << dendl;
+
+ // make sure we were able to store a valid copy
+ if (file_size == 0) {
+ dout(1) << "No Valid allocation info on disk (empty file)" << dendl;
+ return -1;
+ }
+
+ // first read the header
+ size_t offset = 0;
+ allocator_image_header header;
+ int header_size = calc_allocator_image_header_size();
+ {
+ bufferlist header_bl,temp_bl;
+ int read_bytes = bluefs->read(p_handle.get(), offset, header_size, &temp_bl, nullptr);
+ if (read_bytes != header_size) {
+ derr << "Failed bluefs->read() for header::read_bytes=" << read_bytes << ", req_bytes=" << header_size << dendl;
+ return -1;
+ }
+
+ offset += read_bytes;
+
+ header_bl.claim_append(temp_bl);
+ auto p = header_bl.cbegin();
+ decode(header, p);
+ if (header.verify(cct, path) != 0 ) {
+ derr << "header = \n" << header << dendl;
+ return -1;
+ }
+
+ uint32_t crc_calc = -1, crc;
+ crc_calc = header_bl.cbegin().crc32c(p.get_off(), crc_calc); //crc from begin to current pos
+ decode(crc, p);
+ if (crc != crc_calc) {
+ derr << "crc mismatch!!! crc=" << crc << ", crc_calc=" << crc_calc << dendl;
+ derr << "header = \n" << header << dendl;
+ return -1;
+ }
+
+ // increment version for next store
+ s_serial = header.serial + 1;
+ }
+
+ // then read the payload (extents list) using a recycled buffer
+ extent_t buffer[MAX_EXTENTS_IN_BUFFER]; // 64KB
+ uint32_t crc = -1;
+ int trailer_size = calc_allocator_image_trailer_size();
+ uint64_t extent_count = 0;
+ uint64_t extents_bytes_left = file_size - (header_size + trailer_size + sizeof(crc));
+ while (extents_bytes_left) {
+ int req_bytes = std::min(extents_bytes_left, static_cast<uint64_t>(sizeof(buffer)));
+ int read_bytes = bluefs->read(p_handle.get(), offset, req_bytes, nullptr, (char*)buffer);
+ if (read_bytes != req_bytes) {
+ derr << "Failed bluefs->read()::read_bytes=" << read_bytes << ", req_bytes=" << req_bytes << dendl;
+ return -1;
+ }
+
+ offset += read_bytes;
+ extents_bytes_left -= read_bytes;
+
+ const unsigned num_extent_in_buffer = read_bytes/sizeof(extent_t);
+ const extent_t *p_end = buffer + num_extent_in_buffer;
+ for (const extent_t *p_ext = buffer; p_ext < p_end; p_ext++) {
+ uint64_t offset = CEPHTOH_64(p_ext->offset);
+ uint64_t length = CEPHTOH_64(p_ext->length);
+ read_alloc_size += length;
+
+ if (length > 0) {
+ allocator->init_add_free(offset, length);
+ extent_count ++;
+ } else {
+ derr << "extent with zero length at idx=" << extent_count << dendl;
+ return -1;
+ }
+ }
+
+ uint32_t calc_crc = ceph_crc32c(crc, (const uint8_t*)buffer, read_bytes);
+ read_bytes = bluefs->read(p_handle.get(), offset, sizeof(crc), nullptr, (char*)&crc);
+ if (read_bytes == sizeof(crc) ) {
+ crc = CEPHTOH_32(crc);
+ if (crc != calc_crc) {
+ derr << "data crc mismatch!!! crc=" << crc << ", calc_crc=" << calc_crc << dendl;
+ derr << "extents_bytes_left=" << extents_bytes_left << ", offset=" << offset << ", extent_count=" << extent_count << dendl;
+ return -1;
+ }
+
+ offset += read_bytes;
+ if (extents_bytes_left) {
+ extents_bytes_left -= read_bytes;
+ }
+ } else {
+ derr << "Failed bluefs->read() for crc::read_bytes=" << read_bytes << ", req_bytes=" << sizeof(crc) << dendl;
+ return -1;
+ }
+
+ }
+
+ // finally, read the trailer and verify it is in good shape and that we got all the extents
+ {
+ bufferlist trailer_bl,temp_bl;
+ int read_bytes = bluefs->read(p_handle.get(), offset, trailer_size, &temp_bl, nullptr);
+ if (read_bytes != trailer_size) {
+ derr << "Failed bluefs->read() for trailer::read_bytes=" << read_bytes << ", req_bytes=" << trailer_size << dendl;
+ return -1;
+ }
+ offset += read_bytes;
+
+ trailer_bl.claim_append(temp_bl);
+ uint32_t crc_calc = -1;
+ uint32_t crc;
+ allocator_image_trailer trailer;
+ auto p = trailer_bl.cbegin();
+ decode(trailer, p);
+ if (trailer.verify(cct, path, &header, extent_count, read_alloc_size) != 0 ) {
+ derr << "trailer=\n" << trailer << dendl;
+ return -1;
+ }
+
+ crc_calc = trailer_bl.cbegin().crc32c(p.get_off(), crc_calc); //crc from begin to current pos
+ decode(crc, p);
+ if (crc != crc_calc) {
+ derr << "trailer crc mismatch!::crc=" << crc << ", crc_calc=" << crc_calc << dendl;
+ derr << "trailer=\n" << trailer << dendl;
+ return -1;
+ }
+ }
+
+ utime_t duration = ceph_clock_now() - start_time;
+ dout(5) << "READ--extent_count=" << extent_count << ", read_alloc_size= "
+ << read_alloc_size << ", file_size=" << file_size << dendl;
+ dout(5) << "READ duration=" << duration << " seconds, s_serial=" << header.serial << dendl;
+ *num = extent_count;
+ *bytes = read_alloc_size;
+ return 0;
+}
+
+//-----------------------------------------------------------------------------------
+int BlueStore::restore_allocator(Allocator* dest_allocator, uint64_t *num, uint64_t *bytes)
+{
+ utime_t start = ceph_clock_now();
+ auto temp_allocator = unique_ptr<Allocator>(create_bitmap_allocator(bdev->get_size()));
+ int ret = __restore_allocator(temp_allocator.get(), num, bytes);
+ if (ret != 0) {
+ return ret;
+ }
+
+ uint64_t num_entries = 0;
+ dout(5) << " calling copy_allocator(bitmap_allocator -> shared_alloc.a)" << dendl;
+ copy_allocator(temp_allocator.get(), dest_allocator, &num_entries);
+ utime_t duration = ceph_clock_now() - start;
+ dout(5) << "restored in " << duration << " seconds, num_entries=" << num_entries << dendl;
+ return ret;
+}
+
+//-----------------------------------------------------------------------------------
+void BlueStore::set_allocation_in_simple_bmap(SimpleBitmap* sbmap, uint64_t offset, uint64_t length)
+{
+ dout(30) << __func__ << " 0x" << std::hex
+ << offset << "~" << length
+ << " " << min_alloc_size_mask
+ << dendl;
+ ceph_assert((offset & min_alloc_size_mask) == 0);
+ ceph_assert((length & min_alloc_size_mask) == 0);
+ sbmap->set(offset >> min_alloc_size_order, length >> min_alloc_size_order);
+}
+
+void BlueStore::ExtentDecoderPartial::_consume_new_blob(bool spanning,
+ uint64_t extent_no,
+ uint64_t sbid,
+ BlobRef b)
+{
+ [[maybe_unused]] auto cct = store.cct;
+ ceph_assert(per_pool_statfs);
+ ceph_assert(oid != ghobject_t());
+
+ auto &blob = b->get_blob();
+ if(spanning) {
+ dout(20) << __func__ << " " << spanning << " " << b->id << dendl;
+ ceph_assert(b->id >= 0);
+ spanning_blobs[b->id] = b;
+ ++stats.spanning_blob_count;
+ } else {
+ dout(20) << __func__ << " " << spanning << " " << extent_no << dendl;
+ blobs[extent_no] = b;
+ }
+ bool compressed = blob.is_compressed();
+ if (!blob.is_shared()) {
+ for (auto& pe : blob.get_extents()) {
+ if (pe.offset == bluestore_pextent_t::INVALID_OFFSET) {
+ ++stats.skipped_illegal_extent;
+ continue;
+ }
+ store.set_allocation_in_simple_bmap(&sbmap, pe.offset, pe.length);
+
+ per_pool_statfs->allocated() += pe.length;
+ if (compressed) {
+ per_pool_statfs->compressed_allocated() += pe.length;
+ }
+ }
+ if (compressed) {
+ per_pool_statfs->compressed() +=
+ blob.get_compressed_payload_length();
+ ++stats.compressed_blob_count;
+ }
+ } else {
+ auto it = sb_info.find(sbid);
+ if (it == sb_info.end()) {
+ derr << __func__ << " shared blob not found:" << sbid
+ << dendl;
+ }
+ auto &sbi = *it;
+ auto pool_id = oid.hobj.get_logical_pool();
+ if (sbi.pool_id == sb_info_t::INVALID_POOL_ID) {
+ sbi.pool_id = pool_id;
+ size_t alloc_delta = sbi.allocated_chunks << min_alloc_size_order;
+ per_pool_statfs->allocated() += alloc_delta;
+ if (compressed) {
+ per_pool_statfs->compressed_allocated() += alloc_delta;
+ ++stats.compressed_blob_count;
+ }
+ }
+ if (compressed) {
+ per_pool_statfs->compressed() +=
+ blob.get_compressed_payload_length();
+ }
+ }
+}
+
+void BlueStore::ExtentDecoderPartial::consume_blobid(Extent* le,
+ bool spanning,
+ uint64_t blobid)
+{
+ [[maybe_unused]] auto cct = store.cct;
+ dout(20) << __func__ << " " << spanning << " " << blobid << dendl;
+ auto &map = spanning ? spanning_blobs : blobs;
+ auto it = map.find(blobid);
+ ceph_assert(it != map.end());
+ per_pool_statfs->stored() += le->length;
+ if (it->second->get_blob().is_compressed()) {
+ per_pool_statfs->compressed_original() += le->length;
+ }
+}
+
+void BlueStore::ExtentDecoderPartial::consume_blob(Extent* le,
+ uint64_t extent_no,
+ uint64_t sbid,
+ BlobRef b)
+{
+ _consume_new_blob(false, extent_no, sbid, b);
+ per_pool_statfs->stored() += le->length;
+ if (b->get_blob().is_compressed()) {
+ per_pool_statfs->compressed_original() += le->length;
+ }
+}
+
+void BlueStore::ExtentDecoderPartial::consume_spanning_blob(uint64_t sbid,
+ BlobRef b)
+{
+ _consume_new_blob(true, 0/*doesn't matter*/, sbid, b);
+}
+
+void BlueStore::ExtentDecoderPartial::reset(const ghobject_t _oid,
+ volatile_statfs* _per_pool_statfs)
+{
+ oid = _oid;
+ per_pool_statfs = _per_pool_statfs;
+ blob_map_t empty;
+ blob_map_t empty2;
+ std::swap(blobs, empty);
+ std::swap(spanning_blobs, empty2);
+}
+
+int BlueStore::read_allocation_from_onodes(SimpleBitmap *sbmap, read_alloc_stats_t& stats)
+{
+ sb_info_space_efficient_map_t sb_info;
+ // iterate over all shared blobs
+ auto it = db->get_iterator(PREFIX_SHARED_BLOB, KeyValueDB::ITERATOR_NOCACHE);
+ if (!it) {
+ derr << "failed getting shared blob's iterator" << dendl;
+ return -ENOENT;
+ }
+ if (it) {
+ for (it->lower_bound(string()); it->valid(); it->next()) {
+ const auto& key = it->key();
+ dout(20) << __func__ << " decode sb " << pretty_binary_string(key) << dendl;
+ uint64_t sbid = 0;
+ if (get_key_shared_blob(key, &sbid) != 0) {
+ derr << __func__ << " bad shared blob key '" << pretty_binary_string(key)
+ << "'" << dendl;
+ }
+ bluestore_shared_blob_t shared_blob(sbid);
+ bufferlist bl = it->value();
+ auto blp = bl.cbegin();
+ try {
+ decode(shared_blob, blp);
+ }
+ catch (ceph::buffer::error& e) {
+ derr << __func__ << " failed to decode Shared Blob"
+ << pretty_binary_string(key) << dendl;
+ continue;
+ }
+ dout(20) << __func__ << " " << shared_blob << dendl;
+ uint64_t allocated = 0;
+ for (auto& r : shared_blob.ref_map.ref_map) {
+ ceph_assert(r.first != bluestore_pextent_t::INVALID_OFFSET);
+ set_allocation_in_simple_bmap(sbmap, r.first, r.second.length);
+ allocated += r.second.length;
+ }
+ auto &sbi = sb_info.add_or_adopt(sbid);
+ ceph_assert(p2phase(allocated, min_alloc_size) == 0);
+ sbi.allocated_chunks += (allocated >> min_alloc_size_order);
+ ++stats.shared_blob_count;
+ }
+ }
+
+ it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE);
+ if (!it) {
+ derr << "failed getting onode's iterator" << dendl;
+ return -ENOENT;
+ }
+
+ uint64_t kv_count = 0;
+ uint64_t count_interval = 1'000'000;
+ ExtentDecoderPartial edecoder(*this,
+ stats,
+ *sbmap,
+ sb_info,
+ min_alloc_size_order);
+
+ // iterate over all ONodes stored in RocksDB
+ for (it->lower_bound(string()); it->valid(); it->next(), kv_count++) {
+ // trace an even after every million processed objects (typically every 5-10 seconds)
+ if (kv_count && (kv_count % count_interval == 0) ) {
+ dout(5) << __func__ << " processed objects count = " << kv_count << dendl;
+ }
+
+ auto key = it->key();
+ auto okey = key;
+ dout(20) << __func__ << " decode onode " << pretty_binary_string(key) << dendl;
+ ghobject_t oid;
+ if (!is_extent_shard_key(it->key())) {
+ int r = get_key_object(okey, &oid);
+ if (r != 0) {
+ derr << __func__ << " failed to decode onode key = "
+ << pretty_binary_string(okey) << dendl;
+ return -EIO;
+ }
+ edecoder.reset(oid,
+ &stats.actual_pool_vstatfs[oid.hobj.get_logical_pool()]);
+ Onode dummy_on(cct);
+ Onode::decode_raw(&dummy_on,
+ it->value(),
+ edecoder);
+ ++stats.onode_count;
+ } else {
+ uint32_t offset;
+ int r = get_key_extent_shard(key, &okey, &offset);
+ if (r != 0) {
+ derr << __func__ << " failed to decode onode extent key = "
+ << pretty_binary_string(key) << dendl;
+ return -EIO;
+ }
+ r = get_key_object(okey, &oid);
+ if (r != 0) {
+ derr << __func__
+ << " failed to decode onode key= " << pretty_binary_string(okey)
+ << " from extent key= " << pretty_binary_string(key)
+ << dendl;
+ return -EIO;
+ }
+ ceph_assert(oid == edecoder.get_oid());
+ edecoder.decode_some(it->value(), nullptr);
+ ++stats.shard_count;
+ }
+ }
+
+ std::lock_guard l(vstatfs_lock);
+ store_statfs_t s;
+ osd_pools.clear();
+ for (auto& p : stats.actual_pool_vstatfs) {
+ if (per_pool_stat_collection) {
+ osd_pools[p.first] = p.second;
+ }
+ stats.actual_store_vstatfs += p.second;
+ p.second.publish(&s);
+ dout(5) << __func__ << " recovered pool "
+ << std::hex
+ << p.first << "->" << s
+ << std::dec
+ << " per-pool:" << per_pool_stat_collection
+ << dendl;
+ }
+ vstatfs = stats.actual_store_vstatfs;
+ vstatfs.publish(&s);
+ dout(5) << __func__ << " recovered " << s
+ << dendl;
+ return 0;
+}
+
+//---------------------------------------------------------
+int BlueStore::reconstruct_allocations(SimpleBitmap *sbmap, read_alloc_stats_t &stats)
+{
+ // first set space used by superblock
+ auto super_length = std::max<uint64_t>(min_alloc_size, SUPER_RESERVED);
+ set_allocation_in_simple_bmap(sbmap, 0, super_length);
+ stats.extent_count++;
+
+ // then set all space taken by Objects
+ int ret = read_allocation_from_onodes(sbmap, stats);
+ if (ret < 0) {
+ derr << "failed read_allocation_from_onodes()" << dendl;
+ return ret;
+ }
+
+ return 0;
+}
+
+//-----------------------------------------------------------------------------------
+static void copy_simple_bitmap_to_allocator(SimpleBitmap* sbmap, Allocator* dest_alloc, uint64_t alloc_size)
+{
+ int alloc_size_shift = std::countr_zero(alloc_size);
+ uint64_t offset = 0;
+ extent_t ext = sbmap->get_next_clr_extent(offset);
+ while (ext.length != 0) {
+ dest_alloc->init_add_free(ext.offset << alloc_size_shift, ext.length << alloc_size_shift);
+ offset = ext.offset + ext.length;
+ ext = sbmap->get_next_clr_extent(offset);
+ }
+}
+
+//---------------------------------------------------------
+int BlueStore::read_allocation_from_drive_on_startup()
+{
+ int ret = 0;
+
+ ret = _open_collections();
+ if (ret < 0) {
+ return ret;
+ }
+ auto shutdown_cache = make_scope_guard([&] {
+ _shutdown_cache();
+ });
+
+ utime_t start = ceph_clock_now();
+ read_alloc_stats_t stats = {};
+ SimpleBitmap sbmap(cct, (bdev->get_size()/ min_alloc_size));
+ ret = reconstruct_allocations(&sbmap, stats);
+ if (ret != 0) {
+ return ret;
+ }
+
+ copy_simple_bitmap_to_allocator(&sbmap, alloc, min_alloc_size);
+
+ utime_t duration = ceph_clock_now() - start;
+ dout(1) << "::Allocation Recovery was completed in " << duration << " seconds, extent_count=" << stats.extent_count << dendl;
+ return ret;
+}
+
+
+
+
+// Only used for debugging purposes - we build a secondary allocator from the Onodes and compare it to the existing one
+// Not meant to be run by customers
+#ifdef CEPH_BLUESTORE_TOOL_RESTORE_ALLOCATION
+
+#include <stdlib.h>
+#include <algorithm>
+//---------------------------------------------------------
+int cmpfunc (const void * a, const void * b)
+{
+ if ( ((extent_t*)a)->offset > ((extent_t*)b)->offset ) {
+ return 1;
+ }
+ else if( ((extent_t*)a)->offset < ((extent_t*)b)->offset ) {
+ return -1;
+ }
+ else {
+ return 0;
+ }
+}
+
+// compare the allocator built from Onodes with the system allocator (CF-B)
+//---------------------------------------------------------
+int BlueStore::compare_allocators(Allocator* alloc1, Allocator* alloc2, uint64_t req_extent_count, uint64_t memory_target)
+{
+ uint64_t allocation_size = std::min((req_extent_count) * sizeof(extent_t), memory_target / 3);
+ uint64_t extent_count = allocation_size/sizeof(extent_t);
+ dout(5) << "req_extent_count=" << req_extent_count << ", granted extent_count="<< extent_count << dendl;
+
+ unique_ptr<extent_t[]> arr1;
+ unique_ptr<extent_t[]> arr2;
+ try {
+ arr1 = make_unique<extent_t[]>(extent_count);
+ arr2 = make_unique<extent_t[]>(extent_count);
+ } catch (std::bad_alloc&) {
+ derr << "****Failed dynamic allocation, extent_count=" << extent_count << dendl;
+ return -1;
+ }
+
+ // copy the extents from the allocators into simple array and then compare them
+ uint64_t size1 = 0, size2 = 0;
+ uint64_t idx1 = 0, idx2 = 0;
+ auto iterated_mapper1 = [&](uint64_t offset, uint64_t length) {
+ size1 += length;
+ if (idx1 < extent_count) {
+ arr1[idx1++] = {offset, length};
+ }
+ else if (idx1 == extent_count) {
+ derr << "(2)compare_allocators:: spillover" << dendl;
+ idx1 ++;
+ }
+
+ };
+
+ auto iterated_mapper2 = [&](uint64_t offset, uint64_t length) {
+ size2 += length;
+ if (idx2 < extent_count) {
+ arr2[idx2++] = {offset, length};
+ }
+ else if (idx2 == extent_count) {
+ derr << "(2)compare_allocators:: spillover" << dendl;
+ idx2 ++;
+ }
+ };
+
+ alloc1->foreach(iterated_mapper1);
+ alloc2->foreach(iterated_mapper2);
+
+ qsort(arr1.get(), std::min(idx1, extent_count), sizeof(extent_t), cmpfunc);
+ qsort(arr2.get(), std::min(idx2, extent_count), sizeof(extent_t), cmpfunc);
+
+ if (idx1 == idx2) {
+ idx1 = idx2 = std::min(idx1, extent_count);
+ if (memcmp(arr1.get(), arr2.get(), sizeof(extent_t) * idx2) == 0) {
+ return 0;
+ }
+ derr << "Failed memcmp(arr1, arr2, sizeof(extent_t)*idx2)" << dendl;
+ for (uint64_t i = 0; i < idx1; i++) {
+ if (memcmp(arr1.get()+i, arr2.get()+i, sizeof(extent_t)) != 0) {
+ derr << "!!!![" << i << "] arr1::<" << arr1[i].offset << "," << arr1[i].length << ">" << dendl;
+ derr << "!!!![" << i << "] arr2::<" << arr2[i].offset << "," << arr2[i].length << ">" << dendl;
+ return -1;
+ }
+ }
+ return 0;
+ } else {
+ derr << "mismatch:: idx1=" << idx1 << " idx2=" << idx2 << dendl;
+ return -1;
+ }
+}
+
+//---------------------------------------------------------
+int BlueStore::add_existing_bluefs_allocation(Allocator* allocator, read_alloc_stats_t &stats)
+{
+ // then add space used by bluefs to store rocksdb
+ unsigned extent_count = 0;
+ if (bluefs) {
+ bluefs->foreach_block_extents(
+ bluefs_layout.shared_bdev,
+ [&](uint64_t start, uint32_t len) {
+ allocator->init_rm_free(start, len);
+ stats.extent_count++;
+ }
+ );
+ }
+
+ dout(5) << "bluefs extent_count=" << extent_count << dendl;
+ return 0;
+}
+
+//---------------------------------------------------------
+int BlueStore::read_allocation_from_drive_for_bluestore_tool()
+{
+ dout(5) << __func__ << dendl;
+ int ret = 0;
+ uint64_t memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
+ ret = _open_db_and_around(true, false);
+ if (ret < 0) {
+ return ret;
+ }
+
+ ret = _open_collections();
+ if (ret < 0) {
+ _close_db_and_around();
+ return ret;
+ }
+
+ utime_t duration;
+ read_alloc_stats_t stats = {};
+ utime_t start = ceph_clock_now();
+
+ auto shutdown_cache = make_scope_guard([&] {
+ dout(1) << "Allocation Recovery was completed in " << duration
+ << " seconds; insert_count=" << stats.insert_count
+ << "; extent_count=" << stats.extent_count << dendl;
+ _shutdown_cache();
+ _close_db_and_around();
+ });
+
+ {
+ auto allocator = unique_ptr<Allocator>(create_bitmap_allocator(bdev->get_size()));
+ //reconstruct allocations into a temp simple-bitmap and copy into allocator
+ {
+ SimpleBitmap sbmap(cct, (bdev->get_size()/ min_alloc_size));
+ ret = reconstruct_allocations(&sbmap, stats);
+ if (ret != 0) {
+ return ret;
+ }
+ copy_simple_bitmap_to_allocator(&sbmap, allocator.get(), min_alloc_size);
+ }
+
+ // add allocation space used by the bluefs itself
+ ret = add_existing_bluefs_allocation(allocator.get(), stats);
+ if (ret < 0) {
+ return ret;
+ }
+
+ duration = ceph_clock_now() - start;
+ stats.insert_count = 0;
+ auto count_entries = [&](uint64_t extent_offset, uint64_t extent_length) {
+ stats.insert_count++;
+ };
+ allocator->foreach(count_entries);
+ ret = compare_allocators(allocator.get(), alloc, stats.insert_count, memory_target);
+ if (ret == 0) {
+ dout(5) << "Allocator drive - file integrity check OK" << dendl;
+ } else {
+ derr << "FAILURE. Allocator from file and allocator from metadata differ::ret=" << ret << dendl;
+ }
+ }
+
+ dout(1) << stats << dendl;
+ return ret;
+}
+
+//---------------------------------------------------------
+Allocator* BlueStore::clone_allocator_without_bluefs(Allocator *src_allocator)
+{
+ uint64_t bdev_size = bdev->get_size();
+ Allocator* allocator = create_bitmap_allocator(bdev_size);
+ if (allocator) {
+ dout(5) << "bitmap-allocator=" << allocator << dendl;
+ } else {
+ derr << "****failed create_bitmap_allocator()" << dendl;
+ return nullptr;
+ }
+
+ uint64_t num_entries = 0;
+ copy_allocator(src_allocator, allocator, &num_entries);
+
+ // BlueFS stores its internal allocation outside RocksDB (FM) so we should not destage them to the allcoator-file
+ // we are going to hide bluefs allocation during allocator-destage as they are stored elsewhere
+ {
+ bluefs->foreach_block_extents(
+ bluefs_layout.shared_bdev,
+ [&] (uint64_t start, uint32_t len) {
+ allocator->init_add_free(start, len);
+ }
+ );
+ }
+
+ return allocator;
+}
+
+//---------------------------------------------------------
+static void clear_allocation_objects_from_rocksdb(KeyValueDB *db, CephContext *cct, const std::string &path)
+{
+ dout(5) << "t->rmkeys_by_prefix(PREFIX_ALLOC_BITMAP)" << dendl;
+ KeyValueDB::Transaction t = db->get_transaction();
+ t->rmkeys_by_prefix(PREFIX_ALLOC_BITMAP);
+ db->submit_transaction_sync(t);
+}
+
+//---------------------------------------------------------
+void BlueStore::copy_allocator_content_to_fm(Allocator *allocator, FreelistManager *real_fm)
+{
+ unsigned max_txn = 1024;
+ dout(5) << "max_transaction_submit=" << max_txn << dendl;
+ uint64_t size = 0, idx = 0;
+ KeyValueDB::Transaction txn = db->get_transaction();
+ auto iterated_insert = [&](uint64_t offset, uint64_t length) {
+ size += length;
+ real_fm->release(offset, length, txn);
+ if ((++idx % max_txn) == 0) {
+ db->submit_transaction_sync(txn);
+ txn = db->get_transaction();
+ }
+ };
+ allocator->foreach(iterated_insert);
+ if (idx % max_txn != 0) {
+ db->submit_transaction_sync(txn);
+ }
+ dout(5) << "size=" << size << ", num extents=" << idx << dendl;
+}
+
+//---------------------------------------------------------
+Allocator* BlueStore::initialize_allocator_from_freelist(FreelistManager *real_fm)
+{
+ dout(5) << "real_fm->enumerate_next" << dendl;
+ Allocator* allocator2 = create_bitmap_allocator(bdev->get_size());
+ if (allocator2) {
+ dout(5) << "bitmap-allocator=" << allocator2 << dendl;
+ } else {
+ return nullptr;
+ }
+
+ uint64_t size2 = 0, idx2 = 0;
+ real_fm->enumerate_reset();
+ uint64_t offset, length;
+ while (real_fm->enumerate_next(db, &offset, &length)) {
+ allocator2->init_add_free(offset, length);
+ ++idx2;
+ size2 += length;
+ }
+ real_fm->enumerate_reset();
+
+ dout(5) << "size2=" << size2 << ", num2=" << idx2 << dendl;
+ return allocator2;
+}
+
+//---------------------------------------------------------
+// close the active fm and open it in a new mode like makefs()
+// but make sure to mark the full device space as allocated
+// later we will mark all exetents from the allocator as free
+int BlueStore::reset_fm_for_restore()
+{
+ dout(5) << "<<==>> fm->clear_null_manager()" << dendl;
+ fm->shutdown();
+ delete fm;
+ fm = nullptr;
+ freelist_type = "bitmap";
+ KeyValueDB::Transaction t = db->get_transaction();
+ // call _open_fm() with fm_restore set to TRUE
+ // this will mark the full device space as allocated (and not just the reserved space)
+ _open_fm(t, true, true, true);
+ if (fm == nullptr) {
+ derr << "Failed _open_fm()" << dendl;
+ return -1;
+ }
+ db->submit_transaction_sync(t);
+ ceph_assert(!fm->is_null_manager());
+ dout(5) << "fm was reactivated in full mode" << dendl;
+ return 0;
+}
+
+
+//---------------------------------------------------------
+// create a temp allocator filled with allocation state from the fm
+// and compare it to the base allocator passed in
+int BlueStore::verify_rocksdb_allocations(Allocator *allocator)
+{
+ dout(5) << "verify that alloc content is identical to FM" << dendl;
+ // initialize from freelist
+ Allocator* temp_allocator = initialize_allocator_from_freelist(fm);
+ if (temp_allocator == nullptr) {
+ return -1;
+ }
+
+ uint64_t insert_count = 0;
+ auto count_entries = [&](uint64_t extent_offset, uint64_t extent_length) {
+ insert_count++;
+ };
+ temp_allocator->foreach(count_entries);
+ uint64_t memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
+ int ret = compare_allocators(allocator, temp_allocator, insert_count, memory_target);
+
+ delete temp_allocator;
+
+ if (ret == 0) {
+ dout(5) << "SUCCESS!!! compare(allocator, temp_allocator)" << dendl;
+ return 0;
+ } else {
+ derr << "**** FAILURE compare(allocator, temp_allocator)::ret=" << ret << dendl;
+ return -1;
+ }
+}
+
+//---------------------------------------------------------
+int BlueStore::db_cleanup(int ret)
+{
+ _shutdown_cache();
+ _close_db_and_around();
+ return ret;
+}
+
+//---------------------------------------------------------
+// convert back the system from null-allocator to using rocksdb to store allocation
+int BlueStore::push_allocation_to_rocksdb()
+{
+ if (cct->_conf->bluestore_allocation_from_file) {
+ derr << "cct->_conf->bluestore_allocation_from_file must be cleared first" << dendl;
+ derr << "please change default to false in ceph.conf file>" << dendl;
+ return -1;
+ }
+
+ dout(5) << "calling open_db_and_around() in read/write mode" << dendl;
+ int ret = _open_db_and_around(false);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (!fm->is_null_manager()) {
+ derr << "This is not a NULL-MANAGER -> nothing to do..." << dendl;
+ return db_cleanup(0);
+ }
+
+ // start by creating a clone copy of the shared-allocator
+ unique_ptr<Allocator> allocator(clone_allocator_without_bluefs(alloc));
+ if (!allocator) {
+ return db_cleanup(-1);
+ }
+
+ // remove all objects of PREFIX_ALLOC_BITMAP from RocksDB to guarantee a clean start
+ clear_allocation_objects_from_rocksdb(db, cct, path);
+
+ // then open fm in new mode with the full devie marked as alloctaed
+ if (reset_fm_for_restore() != 0) {
+ return db_cleanup(-1);
+ }
+
+ // push the free-space from the allocator (shared-alloc without bfs) to rocksdb
+ copy_allocator_content_to_fm(allocator.get(), fm);
+
+ // compare the allocator info with the info stored in the fm/rocksdb
+ if (verify_rocksdb_allocations(allocator.get()) == 0) {
+ // all is good -> we can commit to rocksdb allocator
+ commit_to_real_manager();
+ } else {
+ return db_cleanup(-1);
+ }
+
+ // can't be too paranoid :-)
+ dout(5) << "Running full scale verification..." << dendl;
+ // close db/fm/allocator and start fresh
+ db_cleanup(0);
+ dout(5) << "calling open_db_and_around() in read-only mode" << dendl;
+ ret = _open_db_and_around(true);
+ if (ret < 0) {
+ return db_cleanup(ret);
+ }
+ ceph_assert(!fm->is_null_manager());
+ ceph_assert(verify_rocksdb_allocations(allocator.get()) == 0);
+
+ return db_cleanup(ret);
+}
+
+#endif // CEPH_BLUESTORE_TOOL_RESTORE_ALLOCATION
+
+//-------------------------------------------------------------------------------------
+int BlueStore::commit_freelist_type()
+{
+ // When freelist_type to "bitmap" we will store allocation in RocksDB
+ // When allocation-info is stored in a single file we set freelist_type to "null"
+ // This will direct the startup code to read allocation from file and not RocksDB
+ KeyValueDB::Transaction t = db->get_transaction();
+ if (t == nullptr) {
+ derr << "db->get_transaction() failed!!!" << dendl;
+ return -1;
+ }
+
+ bufferlist bl;
+ bl.append(freelist_type);
+ t->set(PREFIX_SUPER, "freelist_type", bl);
+
+ int ret = db->submit_transaction_sync(t);
+ if (ret != 0) {
+ derr << "Failed db->submit_transaction_sync(t)" << dendl;
+ }
+ return ret;
+}
+
+//-------------------------------------------------------------------------------------
+int BlueStore::commit_to_null_manager()
+{
+ dout(5) << __func__ << " Set FreelistManager to NULL FM..." << dendl;
+ fm->set_null_manager();
+ freelist_type = "null";
+#if 1
+ return commit_freelist_type();
+#else
+ // should check how long this step take on a big configuration as deletes are expensive
+ if (commit_freelist_type() == 0) {
+ // remove all objects of PREFIX_ALLOC_BITMAP from RocksDB to guarantee a clean start
+ clear_allocation_objects_from_rocksdb(db, cct, path);
+ }
+#endif
+}
+
+
+//-------------------------------------------------------------------------------------
+int BlueStore::commit_to_real_manager()
+{
+ dout(5) << "Set FreelistManager to Real FM..." << dendl;
+ ceph_assert(!fm->is_null_manager());
+ freelist_type = "bitmap";
+ int ret = commit_freelist_type();
+ if (ret == 0) {
+ //remove the allocation_file
+ invalidate_allocation_file_on_bluefs();
+ ret = bluefs->unlink(allocator_dir, allocator_file);
+ bluefs->sync_metadata(false);
+ if (ret == 0) {
+ dout(5) << "Remove Allocation File successfully" << dendl;
+ }
+ else {
+ derr << "Remove Allocation File ret_code=" << ret << dendl;
+ }
+ }
+
+ return ret;
+}
+
+//================================================================================================================
+//================================================================================================================