// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2014 Red Hat
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#include
#include
#include
#include
#include
#include
#include
#include

#include "include/cpp-btree/btree_set.h"

#include "BlueStore.h"
#include "bluestore_common.h"
#include "os/kv.h"
#include "include/compat.h"
#include "include/intarith.h"
#include "include/stringify.h"
#include "include/str_map.h"
#include "include/util.h"
#include "common/errno.h"
#include "common/safe_io.h"
#include "common/PriorityCache.h"
#include "common/RWLock.h"
#include "Allocator.h"
#include "FreelistManager.h"
#include "BlueFS.h"
#include "BlueRocksEnv.h"
#include "auth/Crypto.h"
#include "common/EventTrace.h"
#include "perfglue/heap_profiler.h"
#include "common/blkdev.h"
#include "common/numa.h"
#include "common/pretty_binary.h"

#if defined(WITH_LTTNG)
#define TRACEPOINT_DEFINE
#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
#include "tracing/bluestore.h"
#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
#undef TRACEPOINT_DEFINE
#else
#define tracepoint(...)
#endif

#define dout_context cct
#define dout_subsys ceph_subsys_bluestore

using bid_t = decltype(BlueStore::Blob::id);

// bluestore_cache_onode
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Onode, bluestore_onode,
			      bluestore_cache_onode);

// bluestore_cache_other
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Buffer, bluestore_buffer,
			      bluestore_Buffer);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Extent, bluestore_extent,
			      bluestore_Extent);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Blob, bluestore_blob,
			      bluestore_Blob);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::SharedBlob, bluestore_shared_blob,
			      bluestore_SharedBlob);

// bluestore_txc
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::TransContext, bluestore_transcontext,
			      bluestore_txc);

using std::deque;
using std::min;
using std::make_pair;
using std::numeric_limits;
using std::pair;
using std::list;
using std::map;
using std::max;
using std::ostream;
using std::ostringstream;
using std::set;
using std::string;
using std::stringstream;
using std::vector;

using ceph::bufferlist;
using ceph::bufferptr;
using ceph::coarse_mono_clock;
using ceph::decode;
using ceph::encode;
using ceph::Formatter;
using ceph::JSONFormatter;
using ceph::make_timespan;
using ceph::mono_clock;
using ceph::mono_time;
using ceph::timespan_str;

// kv store prefixes
const string PREFIX_SUPER = "S";       // field -> value
const string PREFIX_STAT = "T";        // field -> value(int64 array)
const string PREFIX_COLL = "C";        // collection name -> cnode_t
const string PREFIX_OBJ = "O";         // object name -> onode_t
const string PREFIX_OMAP = "M";        // u64 + keyname -> value
const string PREFIX_PGMETA_OMAP = "P"; // u64 + keyname -> value(for meta coll)
const string PREFIX_PERPOOL_OMAP = "m"; // s64 + u64 + keyname -> value
const string PREFIX_PERPG_OMAP = "p";   // u64(pool) + u32(hash) + u64(id) + keyname -> value
const string PREFIX_DEFERRED = "L";    // id -> deferred_transaction_t
const string PREFIX_ALLOC = "B";       // u64 offset -> u64 length (freelist)
const string PREFIX_ALLOC_BITMAP = "b";// (see BitmapFreelistManager)
const string PREFIX_SHARED_BLOB = "X"; // u64 SB id -> shared_blob_t
const string PREFIX_ZONED_FM_META = "Z";  // (see ZonedFreelistManager)
const string PREFIX_ZONED_FM_INFO = "z";  // (see ZonedFreelistManager)
const string PREFIX_ZONED_CL_INFO = "G";  // (per-zone cleaner metadata)

const string BLUESTORE_GLOBAL_STATFS_KEY = "bluestore_statfs";

// write a label in the first block.  always use this size.  note that
// bluefs makes a matching assumption about the location of its
// superblock (always the second block of the device).
#define BDEV_LABEL_BLOCK_SIZE  4096

// reserve: label (4k) + bluefs super (4k), which means we start at 8k.
#define SUPER_RESERVED  8192

#define OBJECT_MAX_SIZE 0xffffffff // 32 bits

/*
 * extent map blob encoding
 *
 * we use the low bits of the blobid field to indicate some common scenarios
 * and spanning vs local ids.  See ExtentMap::{encode,decode}_some().
 */
#define BLOBID_FLAG_CONTIGUOUS 0x1  // this extent starts at end of previous
#define BLOBID_FLAG_ZEROOFFSET 0x2  // blob_offset is 0
#define BLOBID_FLAG_SAMELENGTH 0x4  // length matches previous extent
#define BLOBID_FLAG_SPANNING   0x8  // has spanning blob id
#define BLOBID_SHIFT_BITS        4

/*
 * object name key structure
 *
 * encoded u8: shard + 2^7 (so that it sorts properly)
 * encoded u64: poolid + 2^63 (so that it sorts properly)
 * encoded u32: hash (bit reversed)
 *
 * escaped string: namespace
 *
 * escaped string: key or object name
 * 1 char: '<', '=', or '>'.  if =, then object key == object name, and
 *         we are done.  otherwise, we are followed by the object name.
 * escaped string: object name (unless '=' above)
 *
 * encoded u64: snap
 * encoded u64: generation
 * 'o'
 */
#define ONODE_KEY_SUFFIX 'o'

/*
 * extent shard key
 *
 * object prefix key
 * u32
 * 'x'
 */
#define EXTENT_SHARD_KEY_SUFFIX 'x'

/*
 * string encoding in the key
 *
 * The key string needs to lexicographically sort the same way that
 * ghobject_t does.  We do this by escaping anything <= to '#' with #
 * plus a 2 digit hex string, and anything >= '~' with ~ plus the two
 * hex digits.
 *
 * We use ! as a terminator for strings; this works because it is < #
 * and will get escaped if it is present in the string.
 *
 * NOTE: There is a bug in this implementation: due to implicit
 * character type conversion in comparison it may produce unexpected
 * ordering.  Unfortunately fixing the bug would mean invalidating the
 * keys in existing deployments.  Instead we do additional sorting
 * where it is needed.
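 *
 * As a small illustrative example of the escaping itself (not of the
 * sorting bug): append_escaped() turns the input "a#b~c" into
 * "a#23b~7ec!" -- '#' (0x23) and '~' (0x7e) are replaced by the escape
 * character plus two hex digits, and '!' terminates the encoded string.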
 */
template<typename S>
static void append_escaped(const string &in, S *out)
{
  char hexbyte[in.length() * 3 + 1];
  char* ptr = &hexbyte[0];
  for (string::const_iterator i = in.begin(); i != in.end(); ++i) {
    if (*i <= '#') { // bug: unexpected result for *i > 0x7f
      *ptr++ = '#';
      *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
      *ptr++ = "0123456789abcdef"[*i & 0x0f];
    } else if (*i >= '~') { // bug: unexpected result for *i > 0x7f
      *ptr++ = '~';
      *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
      *ptr++ = "0123456789abcdef"[*i & 0x0f];
    } else {
      *ptr++ = *i;
    }
  }
  *ptr++ = '!';
  out->append(hexbyte, ptr - &hexbyte[0]);
}

inline unsigned h2i(char c)
{
  if ((c >= '0') && (c <= '9')) {
    return c - 0x30;
  } else if ((c >= 'a') && (c <= 'f')) {
    return c - 'a' + 10;
  } else if ((c >= 'A') && (c <= 'F')) {
    return c - 'A' + 10;
  } else {
    return 256; // make it always larger than 255
  }
}

static int decode_escaped(const char *p, string *out)
{
  char buff[256];
  char* ptr = &buff[0];
  char* max = &buff[252];
  const char *orig_p = p;
  while (*p && *p != '!') {
    if (*p == '#' || *p == '~') {
      unsigned hex = 0;
      p++;
      hex = h2i(*p++) << 4;
      if (hex > 255) {
        return -EINVAL;
      }
      hex |= h2i(*p++);
      if (hex > 255) {
        return -EINVAL;
      }
      *ptr++ = hex;
    } else {
      *ptr++ = *p++;
    }
    if (ptr > max) {
      out->append(buff, ptr-buff);
      ptr = &buff[0];
    }
  }
  if (ptr != buff) {
    out->append(buff, ptr-buff);
  }
  return p - orig_p;
}

template<typename T>
static void _key_encode_shard(shard_id_t shard, T *key)
{
  key->push_back((char)((uint8_t)shard.id + (uint8_t)0x80));
}

static const char *_key_decode_shard(const char *key, shard_id_t *pshard)
{
  pshard->id = (uint8_t)*key - (uint8_t)0x80;
  return key + 1;
}

static void get_coll_range(const coll_t& cid, int bits,
  ghobject_t *temp_start, ghobject_t *temp_end,
  ghobject_t *start, ghobject_t *end, bool legacy)
{
  spg_t pgid;
  constexpr uint32_t MAX_HASH = std::numeric_limits<uint32_t>::max();
  // use different nspaces because we use different schemes when encoding
  // keys for listing objects
  const std::string_view MAX_NSPACE = legacy ?
"\x7f" : "\xff"; if (cid.is_pg(&pgid)) { start->shard_id = pgid.shard; *temp_start = *start; start->hobj.pool = pgid.pool(); temp_start->hobj.pool = -2ll - pgid.pool(); *end = *start; *temp_end = *temp_start; uint32_t reverse_hash = hobject_t::_reverse_bits(pgid.ps()); start->hobj.set_bitwise_key_u32(reverse_hash); temp_start->hobj.set_bitwise_key_u32(reverse_hash); uint64_t end_hash = reverse_hash + (1ull << (32 - bits)); if (end_hash > MAX_HASH) { // make sure end hobj is even greater than the maximum possible hobj end->hobj.set_bitwise_key_u32(MAX_HASH); temp_end->hobj.set_bitwise_key_u32(MAX_HASH); end->hobj.nspace = MAX_NSPACE; } else { end->hobj.set_bitwise_key_u32(end_hash); temp_end->hobj.set_bitwise_key_u32(end_hash); } } else { start->shard_id = shard_id_t::NO_SHARD; start->hobj.pool = -1ull; *end = *start; start->hobj.set_bitwise_key_u32(0); end->hobj.set_bitwise_key_u32(MAX_HASH); end->hobj.nspace = MAX_NSPACE; // no separate temp section *temp_start = *end; *temp_end = *end; } start->generation = 0; end->generation = 0; temp_start->generation = 0; temp_end->generation = 0; } static void get_shared_blob_key(uint64_t sbid, string *key) { key->clear(); _key_encode_u64(sbid, key); } static int get_key_shared_blob(const string& key, uint64_t *sbid) { const char *p = key.c_str(); if (key.length() < sizeof(uint64_t)) return -1; _key_decode_u64(p, sbid); return 0; } template static void _key_encode_prefix(const ghobject_t& oid, S *key) { _key_encode_shard(oid.shard_id, key); _key_encode_u64(oid.hobj.pool + 0x8000000000000000ull, key); _key_encode_u32(oid.hobj.get_bitwise_key_u32(), key); } static const char *_key_decode_prefix(const char *p, ghobject_t *oid) { p = _key_decode_shard(p, &oid->shard_id); uint64_t pool; p = _key_decode_u64(p, &pool); oid->hobj.pool = pool - 0x8000000000000000ull; unsigned hash; p = _key_decode_u32(p, &hash); oid->hobj.set_bitwise_key_u32(hash); return p; } #define ENCODED_KEY_PREFIX_LEN (1 + 8 + 4) template static int get_key_object(const S& key, ghobject_t *oid) { int r; const char *p = key.c_str(); if (key.length() < ENCODED_KEY_PREFIX_LEN) return -1; p = _key_decode_prefix(p, oid); if (key.length() == ENCODED_KEY_PREFIX_LEN) return -2; r = decode_escaped(p, &oid->hobj.nspace); if (r < 0) return -2; p += r + 1; string k; r = decode_escaped(p, &k); if (r < 0) return -3; p += r + 1; if (*p == '=') { // no key ++p; oid->hobj.oid.name = k; } else if (*p == '<' || *p == '>') { // key + name ++p; r = decode_escaped(p, &oid->hobj.oid.name); if (r < 0) return -5; p += r + 1; oid->hobj.set_key(k); } else { // malformed return -6; } p = _key_decode_u64(p, &oid->hobj.snap.val); p = _key_decode_u64(p, &oid->generation); if (*p != ONODE_KEY_SUFFIX) { return -7; } p++; if (*p) { // if we get something other than a null terminator here, // something goes wrong. return -8; } return 0; } template static void get_object_key(CephContext *cct, const ghobject_t& oid, S *key) { key->clear(); size_t max_len = ENCODED_KEY_PREFIX_LEN + (oid.hobj.nspace.length() * 3 + 1) + (oid.hobj.get_key().length() * 3 + 1) + 1 + // for '<', '=', or '>' (oid.hobj.oid.name.length() * 3 + 1) + 8 + 8 + 1; key->reserve(max_len); _key_encode_prefix(oid, key); append_escaped(oid.hobj.nspace, key); if (oid.hobj.get_key().length()) { // is a key... could be < = or >. append_escaped(oid.hobj.get_key(), key); // (ASCII chars < = and > sort in that order, yay) int r = oid.hobj.get_key().compare(oid.hobj.oid.name); if (r) { key->append(r > 0 ? 
">" : "<"); append_escaped(oid.hobj.oid.name, key); } else { // same as no key key->append("="); } } else { // no key append_escaped(oid.hobj.oid.name, key); key->append("="); } _key_encode_u64(oid.hobj.snap, key); _key_encode_u64(oid.generation, key); key->push_back(ONODE_KEY_SUFFIX); // sanity check if (true) { ghobject_t t; int r = get_key_object(*key, &t); if (r || t != oid) { derr << " r " << r << dendl; derr << "key " << pretty_binary_string(*key) << dendl; derr << "oid " << oid << dendl; derr << " t " << t << dendl; ceph_assert(r == 0 && t == oid); } } } // extent shard keys are the onode key, plus a u32, plus 'x'. the trailing // char lets us quickly test whether it is a shard key without decoding any // of the prefix bytes. template static void get_extent_shard_key(const S& onode_key, uint32_t offset, string *key) { key->clear(); key->reserve(onode_key.length() + 4 + 1); key->append(onode_key.c_str(), onode_key.size()); _key_encode_u32(offset, key); key->push_back(EXTENT_SHARD_KEY_SUFFIX); } static void rewrite_extent_shard_key(uint32_t offset, string *key) { ceph_assert(key->size() > sizeof(uint32_t) + 1); ceph_assert(*key->rbegin() == EXTENT_SHARD_KEY_SUFFIX); _key_encode_u32(offset, key->size() - sizeof(uint32_t) - 1, key); } template static void generate_extent_shard_key_and_apply( const S& onode_key, uint32_t offset, string *key, std::function apply) { if (key->empty()) { // make full key ceph_assert(!onode_key.empty()); get_extent_shard_key(onode_key, offset, key); } else { rewrite_extent_shard_key(offset, key); } apply(*key); } int get_key_extent_shard(const string& key, string *onode_key, uint32_t *offset) { ceph_assert(key.size() > sizeof(uint32_t) + 1); ceph_assert(*key.rbegin() == EXTENT_SHARD_KEY_SUFFIX); int okey_len = key.size() - sizeof(uint32_t) - 1; *onode_key = key.substr(0, okey_len); const char *p = key.data() + okey_len; _key_decode_u32(p, offset); return 0; } static bool is_extent_shard_key(const string& key) { return *key.rbegin() == EXTENT_SHARD_KEY_SUFFIX; } static void get_deferred_key(uint64_t seq, string *out) { _key_encode_u64(seq, out); } static void get_pool_stat_key(int64_t pool_id, string *key) { key->clear(); _key_encode_u64(pool_id, key); } static int get_key_pool_stat(const string& key, uint64_t* pool_id) { const char *p = key.c_str(); if (key.length() < sizeof(uint64_t)) return -1; _key_decode_u64(p, pool_id); return 0; } template void _dump_extent_map(CephContext *cct, const BlueStore::ExtentMap &em) { uint64_t pos = 0; for (auto& s : em.shards) { dout(LogLevelV) << __func__ << " shard " << *s.shard_info << (s.loaded ? " (loaded)" : "") << (s.dirty ? 
" (dirty)" : "") << dendl; } for (auto& e : em.extent_map) { dout(LogLevelV) << __func__ << " " << e << dendl; ceph_assert(e.logical_offset >= pos); pos = e.logical_offset + e.length; const bluestore_blob_t& blob = e.blob->get_blob(); if (blob.has_csum()) { vector v; unsigned n = blob.get_csum_count(); for (unsigned i = 0; i < n; ++i) v.push_back(blob.get_csum_item(i)); dout(LogLevelV) << __func__ << " csum: " << std::hex << v << std::dec << dendl; } std::lock_guard l(e.blob->shared_blob->get_cache()->lock); for (auto& i : e.blob->shared_blob->bc.buffer_map) { dout(LogLevelV) << __func__ << " 0x" << std::hex << i.first << "~" << i.second->length << std::dec << " " << *i.second << dendl; } } } template void _dump_onode(CephContext *cct, const BlueStore::Onode& o) { if (!cct->_conf->subsys.should_gather()) return; dout(LogLevelV) << __func__ << " " << &o << " " << o.oid << " nid " << o.onode.nid << " size 0x" << std::hex << o.onode.size << " (" << std::dec << o.onode.size << ")" << " expected_object_size " << o.onode.expected_object_size << " expected_write_size " << o.onode.expected_write_size << " in " << o.onode.extent_map_shards.size() << " shards" << ", " << o.extent_map.spanning_blob_map.size() << " spanning blobs" << dendl; for (auto p = o.onode.attrs.begin(); p != o.onode.attrs.end(); ++p) { dout(LogLevelV) << __func__ << " attr " << p->first << " len " << p->second.length() << dendl; } _dump_extent_map(cct, o.extent_map); } template void _dump_transaction(CephContext *cct, ObjectStore::Transaction *t) { dout(LogLevelV) << __func__ << " transaction dump:\n"; JSONFormatter f(true); f.open_object_section("transaction"); t->dump(&f); f.close_section(); f.flush(*_dout); *_dout << dendl; } // Buffer ostream& operator<<(ostream& out, const BlueStore::Buffer& b) { out << "buffer(" << &b << " space " << b.space << " 0x" << std::hex << b.offset << "~" << b.length << std::dec << " " << BlueStore::Buffer::get_state_name(b.state); if (b.flags) out << " " << BlueStore::Buffer::get_flag_name(b.flags); return out << ")"; } namespace { /* * Due to a bug in key string encoding (see a comment for append_escaped) * the KeyValueDB iterator does not lexicographically sort the same * way that ghobject_t does: objects with the same hash may have wrong order. * * This is the iterator wrapper that fixes the keys order. 
*/ class CollectionListIterator { public: CollectionListIterator(const KeyValueDB::Iterator &it) : m_it(it) { } virtual ~CollectionListIterator() { } virtual bool valid() const = 0; virtual const ghobject_t &oid() const = 0; virtual void lower_bound(const ghobject_t &oid) = 0; virtual void upper_bound(const ghobject_t &oid) = 0; virtual void next() = 0; virtual int cmp(const ghobject_t &oid) const = 0; bool is_ge(const ghobject_t &oid) const { return cmp(oid) >= 0; } bool is_lt(const ghobject_t &oid) const { return cmp(oid) < 0; } protected: KeyValueDB::Iterator m_it; }; class SimpleCollectionListIterator : public CollectionListIterator { public: SimpleCollectionListIterator(CephContext *cct, const KeyValueDB::Iterator &it) : CollectionListIterator(it), m_cct(cct) { } bool valid() const override { return m_it->valid(); } const ghobject_t &oid() const override { ceph_assert(valid()); return m_oid; } void lower_bound(const ghobject_t &oid) override { string key; get_object_key(m_cct, oid, &key); m_it->lower_bound(key); get_oid(); } void upper_bound(const ghobject_t &oid) override { string key; get_object_key(m_cct, oid, &key); m_it->upper_bound(key); get_oid(); } void next() override { ceph_assert(valid()); m_it->next(); get_oid(); } int cmp(const ghobject_t &oid) const override { ceph_assert(valid()); string key; get_object_key(m_cct, oid, &key); return m_it->key().compare(key); } private: CephContext *m_cct; ghobject_t m_oid; void get_oid() { m_oid = ghobject_t(); while (m_it->valid() && is_extent_shard_key(m_it->key())) { m_it->next(); } if (!valid()) { return; } int r = get_key_object(m_it->key(), &m_oid); ceph_assert(r == 0); } }; class SortedCollectionListIterator : public CollectionListIterator { public: SortedCollectionListIterator(const KeyValueDB::Iterator &it) : CollectionListIterator(it), m_chunk_iter(m_chunk.end()) { } bool valid() const override { return m_chunk_iter != m_chunk.end(); } const ghobject_t &oid() const override { ceph_assert(valid()); return m_chunk_iter->first; } void lower_bound(const ghobject_t &oid) override { std::string key; _key_encode_prefix(oid, &key); m_it->lower_bound(key); m_chunk_iter = m_chunk.end(); if (!get_next_chunk()) { return; } if (this->oid().shard_id != oid.shard_id || this->oid().hobj.pool != oid.hobj.pool || this->oid().hobj.get_bitwise_key_u32() != oid.hobj.get_bitwise_key_u32()) { return; } m_chunk_iter = m_chunk.lower_bound(oid); if (m_chunk_iter == m_chunk.end()) { get_next_chunk(); } } void upper_bound(const ghobject_t &oid) override { lower_bound(oid); if (valid() && this->oid() == oid) { next(); } } void next() override { ceph_assert(valid()); m_chunk_iter++; if (m_chunk_iter == m_chunk.end()) { get_next_chunk(); } } int cmp(const ghobject_t &oid) const override { ceph_assert(valid()); if (this->oid() < oid) { return -1; } if (this->oid() > oid) { return 1; } return 0; } private: std::map m_chunk; std::map::iterator m_chunk_iter; bool get_next_chunk() { while (m_it->valid() && is_extent_shard_key(m_it->key())) { m_it->next(); } if (!m_it->valid()) { return false; } ghobject_t oid; int r = get_key_object(m_it->key(), &oid); ceph_assert(r == 0); m_chunk.clear(); while (true) { m_chunk.insert({oid, m_it->key()}); do { m_it->next(); } while (m_it->valid() && is_extent_shard_key(m_it->key())); if (!m_it->valid()) { break; } ghobject_t next; r = get_key_object(m_it->key(), &next); ceph_assert(r == 0); if (next.shard_id != oid.shard_id || next.hobj.pool != oid.hobj.pool || next.hobj.get_bitwise_key_u32() != oid.hobj.get_bitwise_key_u32()) 
{ break; } oid = next; } m_chunk_iter = m_chunk.begin(); return true; } }; } // anonymous namespace // Garbage Collector void BlueStore::GarbageCollector::process_protrusive_extents( const BlueStore::ExtentMap& extent_map, uint64_t start_offset, uint64_t end_offset, uint64_t start_touch_offset, uint64_t end_touch_offset, uint64_t min_alloc_size) { ceph_assert(start_offset <= start_touch_offset && end_offset>= end_touch_offset); uint64_t lookup_start_offset = p2align(start_offset, min_alloc_size); uint64_t lookup_end_offset = round_up_to(end_offset, min_alloc_size); dout(30) << __func__ << " (hex): [" << std::hex << lookup_start_offset << ", " << lookup_end_offset << ")" << std::dec << dendl; for (auto it = extent_map.seek_lextent(lookup_start_offset); it != extent_map.extent_map.end() && it->logical_offset < lookup_end_offset; ++it) { uint64_t alloc_unit_start = it->logical_offset / min_alloc_size; uint64_t alloc_unit_end = (it->logical_end() - 1) / min_alloc_size; dout(30) << __func__ << " " << *it << "alloc_units: " << alloc_unit_start << ".." << alloc_unit_end << dendl; Blob* b = it->blob.get(); if (it->logical_offset >=start_touch_offset && it->logical_end() <= end_touch_offset) { // Process extents within the range affected by // the current write request. // Need to take into account if existing extents // can be merged with them (uncompressed case) if (!b->get_blob().is_compressed()) { if (blob_info_counted && used_alloc_unit == alloc_unit_start) { --blob_info_counted->expected_allocations; // don't need to allocate // new AU for compressed // data since another // collocated uncompressed // blob already exists dout(30) << __func__ << " --expected:" << alloc_unit_start << dendl; } used_alloc_unit = alloc_unit_end; blob_info_counted = nullptr; } } else if (b->get_blob().is_compressed()) { // additionally we take compressed blobs that were not impacted // by the write into account too BlobInfo& bi = affected_blobs.emplace( b, BlobInfo(b->get_referenced_bytes())).first->second; int adjust = (used_alloc_unit && used_alloc_unit == alloc_unit_start) ? 0 : 1; bi.expected_allocations += alloc_unit_end - alloc_unit_start + adjust; dout(30) << __func__ << " expected_allocations=" << bi.expected_allocations << " end_au:" << alloc_unit_end << dendl; blob_info_counted = &bi; used_alloc_unit = alloc_unit_end; ceph_assert(it->length <= bi.referenced_bytes); bi.referenced_bytes -= it->length; dout(30) << __func__ << " affected_blob:" << *b << " unref 0x" << std::hex << it->length << " referenced = 0x" << bi.referenced_bytes << std::dec << dendl; // NOTE: we can't move specific blob to resulting GC list here // when reference counter == 0 since subsequent extents might // decrement its expected_allocation. // Hence need to enumerate all the extents first. 
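          // (Instead we only remember the first and last logical extents
          // referencing this blob as collection candidates here; the final
          // decision is made in the loop over affected_blobs at the end of
          // this function.)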
if (!bi.collect_candidate) { bi.first_lextent = it; bi.collect_candidate = true; } bi.last_lextent = it; } else { if (blob_info_counted && used_alloc_unit == alloc_unit_start) { // don't need to allocate new AU for compressed data since another // collocated uncompressed blob already exists --blob_info_counted->expected_allocations; dout(30) << __func__ << " --expected_allocations:" << alloc_unit_start << dendl; } used_alloc_unit = alloc_unit_end; blob_info_counted = nullptr; } } for (auto b_it = affected_blobs.begin(); b_it != affected_blobs.end(); ++b_it) { Blob* b = b_it->first; BlobInfo& bi = b_it->second; if (bi.referenced_bytes == 0) { uint64_t len_on_disk = b_it->first->get_blob().get_ondisk_length(); int64_t blob_expected_for_release = round_up_to(len_on_disk, min_alloc_size) / min_alloc_size; dout(30) << __func__ << " " << *(b_it->first) << " expected4release=" << blob_expected_for_release << " expected_allocations=" << bi.expected_allocations << dendl; int64_t benefit = blob_expected_for_release - bi.expected_allocations; if (benefit >= g_conf()->bluestore_gc_enable_blob_threshold) { if (bi.collect_candidate) { auto it = bi.first_lextent; bool bExit = false; do { if (it->blob.get() == b) { extents_to_collect.insert(it->logical_offset, it->length); } bExit = it == bi.last_lextent; ++it; } while (!bExit); } expected_for_release += blob_expected_for_release; expected_allocations += bi.expected_allocations; } } } } int64_t BlueStore::GarbageCollector::estimate( uint64_t start_offset, uint64_t length, const BlueStore::ExtentMap& extent_map, const BlueStore::old_extent_map_t& old_extents, uint64_t min_alloc_size) { affected_blobs.clear(); extents_to_collect.clear(); used_alloc_unit = boost::optional(); blob_info_counted = nullptr; uint64_t gc_start_offset = start_offset; uint64_t gc_end_offset = start_offset + length; uint64_t end_offset = start_offset + length; for (auto it = old_extents.begin(); it != old_extents.end(); ++it) { Blob* b = it->e.blob.get(); if (b->get_blob().is_compressed()) { // update gc_start_offset/gc_end_offset if needed gc_start_offset = min(gc_start_offset, (uint64_t)it->e.blob_start()); gc_end_offset = std::max(gc_end_offset, (uint64_t)it->e.blob_end()); auto o = it->e.logical_offset; auto l = it->e.length; uint64_t ref_bytes = b->get_referenced_bytes(); // micro optimization to bypass blobs that have no more references if (ref_bytes != 0) { dout(30) << __func__ << " affected_blob:" << *b << " unref 0x" << std::hex << o << "~" << l << std::dec << dendl; affected_blobs.emplace(b, BlobInfo(ref_bytes)); } } } dout(30) << __func__ << " gc range(hex): [" << std::hex << gc_start_offset << ", " << gc_end_offset << ")" << std::dec << dendl; // enumerate preceeding extents to check if they reference affected blobs if (gc_start_offset < start_offset || gc_end_offset > end_offset) { process_protrusive_extents(extent_map, gc_start_offset, gc_end_offset, start_offset, end_offset, min_alloc_size); } return expected_for_release - expected_allocations; } // LruOnodeCacheShard struct LruOnodeCacheShard : public BlueStore::OnodeCacheShard { typedef boost::intrusive::list< BlueStore::Onode, boost::intrusive::member_hook< BlueStore::Onode, boost::intrusive::list_member_hook<>, &BlueStore::Onode::lru_item> > list_t; list_t lru; explicit LruOnodeCacheShard(CephContext *cct) : BlueStore::OnodeCacheShard(cct) {} void _add(BlueStore::Onode* o, int level) override { if (o->put_cache()) { (level > 0) ? 
lru.push_front(*o) : lru.push_back(*o); } else { ++num_pinned; } ++num; // we count both pinned and unpinned entries dout(20) << __func__ << " " << this << " " << o->oid << " added, num=" << num << dendl; } void _rm(BlueStore::Onode* o) override { if (o->pop_cache()) { lru.erase(lru.iterator_to(*o)); } else { ceph_assert(num_pinned); --num_pinned; } ceph_assert(num); --num; dout(20) << __func__ << " " << this << " " << " " << o->oid << " removed, num=" << num << dendl; } void _pin(BlueStore::Onode* o) override { lru.erase(lru.iterator_to(*o)); ++num_pinned; dout(20) << __func__ << this << " " << " " << " " << o->oid << " pinned" << dendl; } void _unpin(BlueStore::Onode* o) override { lru.push_front(*o); ceph_assert(num_pinned); --num_pinned; dout(20) << __func__ << this << " " << " " << " " << o->oid << " unpinned" << dendl; } void _unpin_and_rm(BlueStore::Onode* o) override { o->pop_cache(); ceph_assert(num_pinned); --num_pinned; ceph_assert(num); --num; } void _trim_to(uint64_t new_size) override { if (new_size >= lru.size()) { return; // don't even try } uint64_t n = lru.size() - new_size; auto p = lru.end(); ceph_assert(p != lru.begin()); --p; ceph_assert(num >= n); num -= n; while (n-- > 0) { BlueStore::Onode *o = &*p; dout(20) << __func__ << " rm " << o->oid << " " << o->nref << " " << o->cached << " " << o->pinned << dendl; if (p != lru.begin()) { lru.erase(p--); } else { ceph_assert(n == 0); lru.erase(p); } auto pinned = !o->pop_cache(); ceph_assert(!pinned); o->c->onode_map._remove(o->oid); } } void move_pinned(OnodeCacheShard *to, BlueStore::Onode *o) override { if (to == this) { return; } ceph_assert(o->cached); ceph_assert(o->pinned); ceph_assert(num); ceph_assert(num_pinned); --num_pinned; --num; ++to->num_pinned; ++to->num; } void add_stats(uint64_t *onodes, uint64_t *pinned_onodes) override { *onodes += num; *pinned_onodes += num_pinned; } }; // OnodeCacheShard BlueStore::OnodeCacheShard *BlueStore::OnodeCacheShard::create( CephContext* cct, string type, PerfCounters *logger) { BlueStore::OnodeCacheShard *c = nullptr; // Currently we only implement an LRU cache for onodes c = new LruOnodeCacheShard(cct); c->logger = logger; return c; } // LruBufferCacheShard struct LruBufferCacheShard : public BlueStore::BufferCacheShard { typedef boost::intrusive::list< BlueStore::Buffer, boost::intrusive::member_hook< BlueStore::Buffer, boost::intrusive::list_member_hook<>, &BlueStore::Buffer::lru_item> > list_t; list_t lru; explicit LruBufferCacheShard(CephContext *cct) : BlueStore::BufferCacheShard(cct) {} void _add(BlueStore::Buffer *b, int level, BlueStore::Buffer *near) override { if (near) { auto q = lru.iterator_to(*near); lru.insert(q, *b); } else if (level > 0) { lru.push_front(*b); } else { lru.push_back(*b); } buffer_bytes += b->length; num = lru.size(); } void _rm(BlueStore::Buffer *b) override { ceph_assert(buffer_bytes >= b->length); buffer_bytes -= b->length; auto q = lru.iterator_to(*b); lru.erase(q); num = lru.size(); } void _move(BlueStore::BufferCacheShard *src, BlueStore::Buffer *b) override { src->_rm(b); _add(b, 0, nullptr); } void _adjust_size(BlueStore::Buffer *b, int64_t delta) override { ceph_assert((int64_t)buffer_bytes + delta >= 0); buffer_bytes += delta; } void _touch(BlueStore::Buffer *b) override { auto p = lru.iterator_to(*b); lru.erase(p); lru.push_front(*b); num = lru.size(); _audit("_touch_buffer end"); } void _trim_to(uint64_t max) override { while (buffer_bytes > max) { auto i = lru.rbegin(); if (i == lru.rend()) { // stop if lru is now empty break; } 
BlueStore::Buffer *b = &*i; ceph_assert(b->is_clean()); dout(20) << __func__ << " rm " << *b << dendl; b->space->_rm_buffer(this, b); } num = lru.size(); } void add_stats(uint64_t *extents, uint64_t *blobs, uint64_t *buffers, uint64_t *bytes) override { *extents += num_extents; *blobs += num_blobs; *buffers += num; *bytes += buffer_bytes; } #ifdef DEBUG_CACHE void _audit(const char *s) override { dout(10) << __func__ << " " << when << " start" << dendl; uint64_t s = 0; for (auto i = lru.begin(); i != lru.end(); ++i) { s += i->length; } if (s != buffer_bytes) { derr << __func__ << " buffer_size " << buffer_bytes << " actual " << s << dendl; for (auto i = lru.begin(); i != lru.end(); ++i) { derr << __func__ << " " << *i << dendl; } ceph_assert(s == buffer_bytes); } dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes << " ok" << dendl; } #endif }; // TwoQBufferCacheShard struct TwoQBufferCacheShard : public BlueStore::BufferCacheShard { typedef boost::intrusive::list< BlueStore::Buffer, boost::intrusive::member_hook< BlueStore::Buffer, boost::intrusive::list_member_hook<>, &BlueStore::Buffer::lru_item> > list_t; list_t hot; ///< "Am" hot buffers list_t warm_in; ///< "A1in" newly warm buffers list_t warm_out; ///< "A1out" empty buffers we've evicted enum { BUFFER_NEW = 0, BUFFER_WARM_IN, ///< in warm_in BUFFER_WARM_OUT, ///< in warm_out BUFFER_HOT, ///< in hot BUFFER_TYPE_MAX }; uint64_t list_bytes[BUFFER_TYPE_MAX] = {0}; ///< bytes per type public: explicit TwoQBufferCacheShard(CephContext *cct) : BufferCacheShard(cct) {} void _add(BlueStore::Buffer *b, int level, BlueStore::Buffer *near) override { dout(20) << __func__ << " level " << level << " near " << near << " on " << *b << " which has cache_private " << b->cache_private << dendl; if (near) { b->cache_private = near->cache_private; switch (b->cache_private) { case BUFFER_WARM_IN: warm_in.insert(warm_in.iterator_to(*near), *b); break; case BUFFER_WARM_OUT: ceph_assert(b->is_empty()); warm_out.insert(warm_out.iterator_to(*near), *b); break; case BUFFER_HOT: hot.insert(hot.iterator_to(*near), *b); break; default: ceph_abort_msg("bad cache_private"); } } else if (b->cache_private == BUFFER_NEW) { b->cache_private = BUFFER_WARM_IN; if (level > 0) { warm_in.push_front(*b); } else { // take caller hint to start at the back of the warm queue warm_in.push_back(*b); } } else { // we got a hint from discard switch (b->cache_private) { case BUFFER_WARM_IN: // stay in warm_in. move to front, even though 2Q doesn't actually // do this. dout(20) << __func__ << " move to front of warm " << *b << dendl; warm_in.push_front(*b); break; case BUFFER_WARM_OUT: b->cache_private = BUFFER_HOT; // move to hot. 
fall-thru case BUFFER_HOT: dout(20) << __func__ << " move to front of hot " << *b << dendl; hot.push_front(*b); break; default: ceph_abort_msg("bad cache_private"); } } if (!b->is_empty()) { buffer_bytes += b->length; list_bytes[b->cache_private] += b->length; } num = hot.size() + warm_in.size(); } void _rm(BlueStore::Buffer *b) override { dout(20) << __func__ << " " << *b << dendl; if (!b->is_empty()) { ceph_assert(buffer_bytes >= b->length); buffer_bytes -= b->length; ceph_assert(list_bytes[b->cache_private] >= b->length); list_bytes[b->cache_private] -= b->length; } switch (b->cache_private) { case BUFFER_WARM_IN: warm_in.erase(warm_in.iterator_to(*b)); break; case BUFFER_WARM_OUT: warm_out.erase(warm_out.iterator_to(*b)); break; case BUFFER_HOT: hot.erase(hot.iterator_to(*b)); break; default: ceph_abort_msg("bad cache_private"); } num = hot.size() + warm_in.size(); } void _move(BlueStore::BufferCacheShard *srcc, BlueStore::Buffer *b) override { TwoQBufferCacheShard *src = static_cast(srcc); src->_rm(b); // preserve which list we're on (even if we can't preserve the order!) switch (b->cache_private) { case BUFFER_WARM_IN: ceph_assert(!b->is_empty()); warm_in.push_back(*b); break; case BUFFER_WARM_OUT: ceph_assert(b->is_empty()); warm_out.push_back(*b); break; case BUFFER_HOT: ceph_assert(!b->is_empty()); hot.push_back(*b); break; default: ceph_abort_msg("bad cache_private"); } if (!b->is_empty()) { buffer_bytes += b->length; list_bytes[b->cache_private] += b->length; } num = hot.size() + warm_in.size(); } void _adjust_size(BlueStore::Buffer *b, int64_t delta) override { dout(20) << __func__ << " delta " << delta << " on " << *b << dendl; if (!b->is_empty()) { ceph_assert((int64_t)buffer_bytes + delta >= 0); buffer_bytes += delta; ceph_assert((int64_t)list_bytes[b->cache_private] + delta >= 0); list_bytes[b->cache_private] += delta; } } void _touch(BlueStore::Buffer *b) override { switch (b->cache_private) { case BUFFER_WARM_IN: // do nothing (somewhat counter-intuitively!) 
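      // (This matches the 2Q design: a repeat hit while a buffer is still in
      // warm_in ("A1in") is deliberately not a promotion.  The warm_out
      // ("A1out") -> hot promotion happens in the BUFFER_WARM_OUT branch of
      // _add() above, when a buffer is re-added after having been evicted to
      // warm_out.)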
break; case BUFFER_WARM_OUT: // move from warm_out to hot LRU ceph_abort_msg("this happens via discard hint"); break; case BUFFER_HOT: // move to front of hot LRU hot.erase(hot.iterator_to(*b)); hot.push_front(*b); break; } num = hot.size() + warm_in.size(); _audit("_touch_buffer end"); } void _trim_to(uint64_t max) override { if (buffer_bytes > max) { uint64_t kin = max * cct->_conf->bluestore_2q_cache_kin_ratio; uint64_t khot = max - kin; // pre-calculate kout based on average buffer size too, // which is typical(the warm_in and hot lists may change later) uint64_t kout = 0; uint64_t buffer_num = hot.size() + warm_in.size(); if (buffer_num) { uint64_t avg_size = buffer_bytes / buffer_num; ceph_assert(avg_size); uint64_t calculated_num = max / avg_size; kout = calculated_num * cct->_conf->bluestore_2q_cache_kout_ratio; } if (list_bytes[BUFFER_HOT] < khot) { // hot is small, give slack to warm_in kin += khot - list_bytes[BUFFER_HOT]; } else if (list_bytes[BUFFER_WARM_IN] < kin) { // warm_in is small, give slack to hot khot += kin - list_bytes[BUFFER_WARM_IN]; } // adjust warm_in list int64_t to_evict_bytes = list_bytes[BUFFER_WARM_IN] - kin; uint64_t evicted = 0; while (to_evict_bytes > 0) { auto p = warm_in.rbegin(); if (p == warm_in.rend()) { // stop if warm_in list is now empty break; } BlueStore::Buffer *b = &*p; ceph_assert(b->is_clean()); dout(20) << __func__ << " buffer_warm_in -> out " << *b << dendl; ceph_assert(buffer_bytes >= b->length); buffer_bytes -= b->length; ceph_assert(list_bytes[BUFFER_WARM_IN] >= b->length); list_bytes[BUFFER_WARM_IN] -= b->length; to_evict_bytes -= b->length; evicted += b->length; b->state = BlueStore::Buffer::STATE_EMPTY; b->data.clear(); warm_in.erase(warm_in.iterator_to(*b)); warm_out.push_front(*b); b->cache_private = BUFFER_WARM_OUT; } if (evicted > 0) { dout(20) << __func__ << " evicted " << byte_u_t(evicted) << " from warm_in list, done evicting warm_in buffers" << dendl; } // adjust hot list to_evict_bytes = list_bytes[BUFFER_HOT] - khot; evicted = 0; while (to_evict_bytes > 0) { auto p = hot.rbegin(); if (p == hot.rend()) { // stop if hot list is now empty break; } BlueStore::Buffer *b = &*p; dout(20) << __func__ << " buffer_hot rm " << *b << dendl; ceph_assert(b->is_clean()); // adjust evict size before buffer goes invalid to_evict_bytes -= b->length; evicted += b->length; b->space->_rm_buffer(this, b); } if (evicted > 0) { dout(20) << __func__ << " evicted " << byte_u_t(evicted) << " from hot list, done evicting hot buffers" << dendl; } // adjust warm out list too, if necessary int64_t n = warm_out.size() - kout; while (n-- > 0) { BlueStore::Buffer *b = &*warm_out.rbegin(); ceph_assert(b->is_empty()); dout(20) << __func__ << " buffer_warm_out rm " << *b << dendl; b->space->_rm_buffer(this, b); } } num = hot.size() + warm_in.size(); } void add_stats(uint64_t *extents, uint64_t *blobs, uint64_t *buffers, uint64_t *bytes) override { *extents += num_extents; *blobs += num_blobs; *buffers += num; *bytes += buffer_bytes; } #ifdef DEBUG_CACHE void _audit(const char *s) override { dout(10) << __func__ << " " << when << " start" << dendl; uint64_t s = 0; for (auto i = hot.begin(); i != hot.end(); ++i) { s += i->length; } uint64_t hot_bytes = s; if (hot_bytes != list_bytes[BUFFER_HOT]) { derr << __func__ << " hot_list_bytes " << list_bytes[BUFFER_HOT] << " != actual " << hot_bytes << dendl; ceph_assert(hot_bytes == list_bytes[BUFFER_HOT]); } for (auto i = warm_in.begin(); i != warm_in.end(); ++i) { s += i->length; } uint64_t warm_in_bytes = s - 
hot_bytes; if (warm_in_bytes != list_bytes[BUFFER_WARM_IN]) { derr << __func__ << " warm_in_list_bytes " << list_bytes[BUFFER_WARM_IN] << " != actual " << warm_in_bytes << dendl; ceph_assert(warm_in_bytes == list_bytes[BUFFER_WARM_IN]); } if (s != buffer_bytes) { derr << __func__ << " buffer_bytes " << buffer_bytes << " actual " << s << dendl; ceph_assert(s == buffer_bytes); } dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes << " ok" << dendl; } #endif }; // BuferCacheShard BlueStore::BufferCacheShard *BlueStore::BufferCacheShard::create( CephContext* cct, string type, PerfCounters *logger) { BufferCacheShard *c = nullptr; if (type == "lru") c = new LruBufferCacheShard(cct); else if (type == "2q") c = new TwoQBufferCacheShard(cct); else ceph_abort_msg("unrecognized cache type"); c->logger = logger; return c; } // BufferSpace #undef dout_prefix #define dout_prefix *_dout << "bluestore.BufferSpace(" << this << " in " << cache << ") " void BlueStore::BufferSpace::_clear(BufferCacheShard* cache) { // note: we already hold cache->lock ldout(cache->cct, 20) << __func__ << dendl; while (!buffer_map.empty()) { _rm_buffer(cache, buffer_map.begin()); } } int BlueStore::BufferSpace::_discard(BufferCacheShard* cache, uint32_t offset, uint32_t length) { // note: we already hold cache->lock ldout(cache->cct, 20) << __func__ << std::hex << " 0x" << offset << "~" << length << std::dec << dendl; int cache_private = 0; cache->_audit("discard start"); auto i = _data_lower_bound(offset); uint32_t end = offset + length; while (i != buffer_map.end()) { Buffer *b = i->second.get(); if (b->offset >= end) { break; } if (b->cache_private > cache_private) { cache_private = b->cache_private; } if (b->offset < offset) { int64_t front = offset - b->offset; if (b->end() > end) { // drop middle (split) uint32_t tail = b->end() - end; if (b->data.length()) { bufferlist bl; bl.substr_of(b->data, b->length - tail, tail); Buffer *nb = new Buffer(this, b->state, b->seq, end, bl, b->flags); nb->maybe_rebuild(); _add_buffer(cache, nb, 0, b); } else { _add_buffer(cache, new Buffer(this, b->state, b->seq, end, tail, b->flags), 0, b); } if (!b->is_writing()) { cache->_adjust_size(b, front - (int64_t)b->length); } b->truncate(front); b->maybe_rebuild(); cache->_audit("discard end 1"); break; } else { // drop tail if (!b->is_writing()) { cache->_adjust_size(b, front - (int64_t)b->length); } b->truncate(front); b->maybe_rebuild(); ++i; continue; } } if (b->end() <= end) { // drop entire buffer _rm_buffer(cache, i++); continue; } // drop front uint32_t keep = b->end() - end; if (b->data.length()) { bufferlist bl; bl.substr_of(b->data, b->length - keep, keep); Buffer *nb = new Buffer(this, b->state, b->seq, end, bl, b->flags); nb->maybe_rebuild(); _add_buffer(cache, nb, 0, b); } else { _add_buffer(cache, new Buffer(this, b->state, b->seq, end, keep, b->flags), 0, b); } _rm_buffer(cache, i); cache->_audit("discard end 2"); break; } return cache_private; } void BlueStore::BufferSpace::read( BufferCacheShard* cache, uint32_t offset, uint32_t length, BlueStore::ready_regions_t& res, interval_set& res_intervals, int flags) { res.clear(); res_intervals.clear(); uint32_t want_bytes = length; uint32_t end = offset + length; { std::lock_guard l(cache->lock); for (auto i = _data_lower_bound(offset); i != buffer_map.end() && offset < end && i->first < end; ++i) { Buffer *b = i->second.get(); ceph_assert(b->end() > offset); bool val = false; if (flags & BYPASS_CLEAN_CACHE) val = b->is_writing(); else val = b->is_writing() || 
b->is_clean(); if (val) { if (b->offset < offset) { uint32_t skip = offset - b->offset; uint32_t l = min(length, b->length - skip); res[offset].substr_of(b->data, skip, l); res_intervals.insert(offset, l); offset += l; length -= l; if (!b->is_writing()) { cache->_touch(b); } continue; } if (b->offset > offset) { uint32_t gap = b->offset - offset; if (length <= gap) { break; } offset += gap; length -= gap; } if (!b->is_writing()) { cache->_touch(b); } if (b->length > length) { res[offset].substr_of(b->data, 0, length); res_intervals.insert(offset, length); break; } else { res[offset].append(b->data); res_intervals.insert(offset, b->length); if (b->length == length) break; offset += b->length; length -= b->length; } } } } uint64_t hit_bytes = res_intervals.size(); ceph_assert(hit_bytes <= want_bytes); uint64_t miss_bytes = want_bytes - hit_bytes; cache->logger->inc(l_bluestore_buffer_hit_bytes, hit_bytes); cache->logger->inc(l_bluestore_buffer_miss_bytes, miss_bytes); } void BlueStore::BufferSpace::_finish_write(BufferCacheShard* cache, uint64_t seq) { auto i = writing.begin(); while (i != writing.end()) { if (i->seq > seq) { break; } if (i->seq < seq) { ++i; continue; } Buffer *b = &*i; ceph_assert(b->is_writing()); if (b->flags & Buffer::FLAG_NOCACHE) { writing.erase(i++); ldout(cache->cct, 20) << __func__ << " discard " << *b << dendl; buffer_map.erase(b->offset); } else { b->state = Buffer::STATE_CLEAN; writing.erase(i++); b->maybe_rebuild(); b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data); cache->_add(b, 1, nullptr); ldout(cache->cct, 20) << __func__ << " added " << *b << dendl; } } cache->_trim(); cache->_audit("finish_write end"); } void BlueStore::BufferSpace::split(BufferCacheShard* cache, size_t pos, BlueStore::BufferSpace &r) { std::lock_guard lk(cache->lock); if (buffer_map.empty()) return; auto p = --buffer_map.end(); while (true) { if (p->second->end() <= pos) break; if (p->second->offset < pos) { ldout(cache->cct, 30) << __func__ << " cut " << *p->second << dendl; size_t left = pos - p->second->offset; size_t right = p->second->length - left; if (p->second->data.length()) { bufferlist bl; bl.substr_of(p->second->data, left, right); r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, 0, bl, p->second->flags), 0, p->second.get()); } else { r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, 0, right, p->second->flags), 0, p->second.get()); } cache->_adjust_size(p->second.get(), -right); p->second->truncate(left); break; } ceph_assert(p->second->end() > pos); ldout(cache->cct, 30) << __func__ << " move " << *p->second << dendl; if (p->second->data.length()) { r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, p->second->offset - pos, p->second->data, p->second->flags), 0, p->second.get()); } else { r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, p->second->offset - pos, p->second->length, p->second->flags), 0, p->second.get()); } if (p == buffer_map.begin()) { _rm_buffer(cache, p); break; } else { _rm_buffer(cache, p--); } } ceph_assert(writing.empty()); cache->_trim(); } // OnodeSpace #undef dout_prefix #define dout_prefix *_dout << "bluestore.OnodeSpace(" << this << " in " << cache << ") " BlueStore::OnodeRef BlueStore::OnodeSpace::add(const ghobject_t& oid, OnodeRef& o) { std::lock_guard l(cache->lock); auto p = onode_map.find(oid); if (p != onode_map.end()) { ldout(cache->cct, 30) << __func__ << " " << oid << " " << o << " raced, returning existing " << p->second << dendl; return 
p->second; } ldout(cache->cct, 20) << __func__ << " " << oid << " " << o << dendl; onode_map[oid] = o; cache->_add(o.get(), 1); cache->_trim(); return o; } void BlueStore::OnodeSpace::_remove(const ghobject_t& oid) { ldout(cache->cct, 20) << __func__ << " " << oid << " " << dendl; onode_map.erase(oid); } BlueStore::OnodeRef BlueStore::OnodeSpace::lookup(const ghobject_t& oid) { ldout(cache->cct, 30) << __func__ << dendl; OnodeRef o; bool hit = false; { std::lock_guard l(cache->lock); ceph::unordered_map::iterator p = onode_map.find(oid); if (p == onode_map.end()) { ldout(cache->cct, 30) << __func__ << " " << oid << " miss" << dendl; } else { ldout(cache->cct, 30) << __func__ << " " << oid << " hit " << p->second << " " << p->second->nref << " " << p->second->cached << " " << p->second->pinned << dendl; // This will pin onode and implicitly touch the cache when Onode // eventually will become unpinned o = p->second; ceph_assert(!o->cached || o->pinned); hit = true; } } if (hit) { cache->logger->inc(l_bluestore_onode_hits); } else { cache->logger->inc(l_bluestore_onode_misses); } return o; } void BlueStore::OnodeSpace::clear() { std::lock_guard l(cache->lock); ldout(cache->cct, 10) << __func__ << " " << onode_map.size()<< dendl; for (auto &p : onode_map) { cache->_rm(p.second.get()); } onode_map.clear(); } bool BlueStore::OnodeSpace::empty() { std::lock_guard l(cache->lock); return onode_map.empty(); } void BlueStore::OnodeSpace::rename( OnodeRef& oldo, const ghobject_t& old_oid, const ghobject_t& new_oid, const mempool::bluestore_cache_meta::string& new_okey) { std::lock_guard l(cache->lock); ldout(cache->cct, 30) << __func__ << " " << old_oid << " -> " << new_oid << dendl; ceph::unordered_map::iterator po, pn; po = onode_map.find(old_oid); pn = onode_map.find(new_oid); ceph_assert(po != pn); ceph_assert(po != onode_map.end()); if (pn != onode_map.end()) { ldout(cache->cct, 30) << __func__ << " removing target " << pn->second << dendl; cache->_rm(pn->second.get()); onode_map.erase(pn); } OnodeRef o = po->second; // install a non-existent onode at old location oldo.reset(new Onode(o->c, old_oid, o->key)); po->second = oldo; cache->_add(oldo.get(), 1); // add at new position and fix oid, key. 
// This will pin 'o' and implicitly touch cache // when it will eventually become unpinned onode_map.insert(make_pair(new_oid, o)); ceph_assert(o->pinned); o->oid = new_oid; o->key = new_okey; cache->_trim(); } bool BlueStore::OnodeSpace::map_any(std::function f) { std::lock_guard l(cache->lock); ldout(cache->cct, 20) << __func__ << dendl; for (auto& i : onode_map) { if (f(i.second.get())) { return true; } } return false; } template void BlueStore::OnodeSpace::dump(CephContext *cct) { for (auto& i : onode_map) { ldout(cct, LogLevelV) << i.first << " : " << i.second << " " << i.second->nref << " " << i.second->cached << " " << i.second->pinned << dendl; } } // SharedBlob #undef dout_prefix #define dout_prefix *_dout << "bluestore.sharedblob(" << this << ") " #undef dout_context #define dout_context coll->store->cct void BlueStore::SharedBlob::dump(Formatter* f) const { f->dump_bool("loaded", loaded); if (loaded) { persistent->dump(f); } else { f->dump_unsigned("sbid_unloaded", sbid_unloaded); } } ostream& operator<<(ostream& out, const BlueStore::SharedBlob& sb) { out << "SharedBlob(" << &sb; if (sb.loaded) { out << " loaded " << *sb.persistent; } else { out << " sbid 0x" << std::hex << sb.sbid_unloaded << std::dec; } return out << ")"; } BlueStore::SharedBlob::SharedBlob(uint64_t i, Collection *_coll) : coll(_coll), sbid_unloaded(i) { ceph_assert(sbid_unloaded > 0); if (get_cache()) { get_cache()->add_blob(); } } BlueStore::SharedBlob::~SharedBlob() { if (loaded && persistent) { delete persistent; } } void BlueStore::SharedBlob::put() { if (--nref == 0) { dout(20) << __func__ << " " << this << " removing self from set " << get_parent() << dendl; again: auto coll_snap = coll; if (coll_snap) { std::lock_guard l(coll_snap->cache->lock); if (coll_snap != coll) { goto again; } if (!coll_snap->shared_blob_set.remove(this, true)) { // race with lookup return; } bc._clear(coll_snap->cache); coll_snap->cache->rm_blob(); } delete this; } } void BlueStore::SharedBlob::get_ref(uint64_t offset, uint32_t length) { ceph_assert(persistent); persistent->ref_map.get(offset, length); } void BlueStore::SharedBlob::put_ref(uint64_t offset, uint32_t length, PExtentVector *r, bool *unshare) { ceph_assert(persistent); persistent->ref_map.put(offset, length, r, unshare && !*unshare ? 
unshare : nullptr); } void BlueStore::SharedBlob::finish_write(uint64_t seq) { while (true) { BufferCacheShard *cache = coll->cache; std::lock_guard l(cache->lock); if (coll->cache != cache) { dout(20) << __func__ << " raced with sb cache update, was " << cache << ", now " << coll->cache << ", retrying" << dendl; continue; } bc._finish_write(cache, seq); break; } } // SharedBlobSet #undef dout_prefix #define dout_prefix *_dout << "bluestore.sharedblobset(" << this << ") " template void BlueStore::SharedBlobSet::dump(CephContext *cct) { std::lock_guard l(lock); for (auto& i : sb_map) { ldout(cct, LogLevelV) << i.first << " : " << *i.second << dendl; } } // Blob #undef dout_prefix #define dout_prefix *_dout << "bluestore.blob(" << this << ") " void BlueStore::Blob::dump(Formatter* f) const { if (is_spanning()) { f->dump_unsigned("spanning_id ", id); } blob.dump(f); if (shared_blob) { f->dump_object("shared", *shared_blob); } } ostream& operator<<(ostream& out, const BlueStore::Blob& b) { out << "Blob(" << &b; if (b.is_spanning()) { out << " spanning " << b.id; } out << " " << b.get_blob() << " " << b.get_blob_use_tracker(); if (b.shared_blob) { out << " " << *b.shared_blob; } else { out << " (shared_blob=NULL)"; } out << ")"; return out; } void BlueStore::Blob::discard_unallocated(Collection *coll) { if (get_blob().is_shared()) { return; } if (get_blob().is_compressed()) { bool discard = false; bool all_invalid = true; for (auto e : get_blob().get_extents()) { if (!e.is_valid()) { discard = true; } else { all_invalid = false; } } ceph_assert(discard == all_invalid); // in case of compressed blob all // or none pextents are invalid. if (discard) { shared_blob->bc.discard(shared_blob->get_cache(), 0, get_blob().get_logical_length()); } } else { size_t pos = 0; for (auto e : get_blob().get_extents()) { if (!e.is_valid()) { dout(20) << __func__ << " 0x" << std::hex << pos << "~" << e.length << std::dec << dendl; shared_blob->bc.discard(shared_blob->get_cache(), pos, e.length); } pos += e.length; } if (get_blob().can_prune_tail()) { dirty_blob().prune_tail(); used_in_blob.prune_tail(get_blob().get_ondisk_length()); dout(20) << __func__ << " pruned tail, now " << get_blob() << dendl; } } } void BlueStore::Blob::get_ref( Collection *coll, uint32_t offset, uint32_t length) { // Caller has to initialize Blob's logical length prior to increment // references. Otherwise one is neither unable to determine required // amount of counters in case of per-au tracking nor obtain min_release_size // for single counter mode. 
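  // (Both values feed used_in_blob.init(l, min_release_size) below: the
  // logical length determines how many per-AU counters are needed, and
  // min_release_size is derived from the store's min_alloc_size.)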
ceph_assert(get_blob().get_logical_length() != 0); dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length << std::dec << " " << *this << dendl; if (used_in_blob.is_empty()) { uint32_t min_release_size = get_blob().get_release_size(coll->store->min_alloc_size); uint64_t l = get_blob().get_logical_length(); dout(20) << __func__ << " init 0x" << std::hex << l << ", " << min_release_size << std::dec << dendl; used_in_blob.init(l, min_release_size); } used_in_blob.get( offset, length); } bool BlueStore::Blob::put_ref( Collection *coll, uint32_t offset, uint32_t length, PExtentVector *r) { PExtentVector logical; dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length << std::dec << " " << *this << dendl; bool empty = used_in_blob.put( offset, length, &logical); r->clear(); // nothing to release if (!empty && logical.empty()) { return false; } bluestore_blob_t& b = dirty_blob(); return b.release_extents(empty, logical, r); } bool BlueStore::Blob::can_reuse_blob(uint32_t min_alloc_size, uint32_t target_blob_size, uint32_t b_offset, uint32_t *length0) { ceph_assert(min_alloc_size); ceph_assert(target_blob_size); if (!get_blob().is_mutable()) { return false; } uint32_t length = *length0; uint32_t end = b_offset + length; // Currently for the sake of simplicity we omit blob reuse if data is // unaligned with csum chunk. Later we can perform padding if needed. if (get_blob().has_csum() && ((b_offset % get_blob().get_csum_chunk_size()) != 0 || (end % get_blob().get_csum_chunk_size()) != 0)) { return false; } auto blen = get_blob().get_logical_length(); uint32_t new_blen = blen; // make sure target_blob_size isn't less than current blob len target_blob_size = std::max(blen, target_blob_size); if (b_offset >= blen) { // new data totally stands out of the existing blob new_blen = end; } else { // new data overlaps with the existing blob new_blen = std::max(blen, end); uint32_t overlap = 0; if (new_blen > blen) { overlap = blen - b_offset; } else { overlap = length; } if (!get_blob().is_unallocated(b_offset, overlap)) { // abort if any piece of the overlap has already been allocated return false; } } if (new_blen > blen) { int64_t overflow = int64_t(new_blen) - target_blob_size; // Unable to decrease the provided length to fit into max_blob_size if (overflow >= length) { return false; } // FIXME: in some cases we could reduce unused resolution if (get_blob().has_unused()) { return false; } if (overflow > 0) { new_blen -= overflow; length -= overflow; *length0 = length; } if (new_blen > blen) { dirty_blob().add_tail(new_blen); used_in_blob.add_tail(new_blen, get_blob().get_release_size(min_alloc_size)); } } return true; } void BlueStore::Blob::split(Collection *coll, uint32_t blob_offset, Blob *r) { dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec << " start " << *this << dendl; ceph_assert(blob.can_split()); ceph_assert(used_in_blob.can_split()); bluestore_blob_t &lb = dirty_blob(); bluestore_blob_t &rb = r->dirty_blob(); used_in_blob.split( blob_offset, &(r->used_in_blob)); lb.split(blob_offset, rb); shared_blob->bc.split(shared_blob->get_cache(), blob_offset, r->shared_blob->bc); dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec << " finish " << *this << dendl; dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec << " and " << *r << dendl; } #ifndef CACHE_BLOB_BL void BlueStore::Blob::decode( Collection *coll, bufferptr::const_iterator& p, uint64_t struct_v, uint64_t* sbid, bool include_ref_map) { denc(blob, p, struct_v); 
if (blob.is_shared()) { denc(*sbid, p); } if (include_ref_map) { if (struct_v > 1) { used_in_blob.decode(p); } else { used_in_blob.clear(); bluestore_extent_ref_map_t legacy_ref_map; legacy_ref_map.decode(p); for (auto r : legacy_ref_map.ref_map) { get_ref( coll, r.first, r.second.refs * r.second.length); } } } } #endif // Extent void BlueStore::Extent::dump(Formatter* f) const { f->dump_unsigned("logical_offset", logical_offset); f->dump_unsigned("length", length); f->dump_unsigned("blob_offset", blob_offset); f->dump_object("blob", *blob); } ostream& operator<<(ostream& out, const BlueStore::Extent& e) { return out << std::hex << "0x" << e.logical_offset << "~" << e.length << ": 0x" << e.blob_offset << "~" << e.length << std::dec << " " << *e.blob; } // OldExtent BlueStore::OldExtent* BlueStore::OldExtent::create(CollectionRef c, uint32_t lo, uint32_t o, uint32_t l, BlobRef& b) { OldExtent* oe = new OldExtent(lo, o, l, b); b->put_ref(c.get(), o, l, &(oe->r)); oe->blob_empty = !b->is_referenced(); return oe; } // ExtentMap #undef dout_prefix #define dout_prefix *_dout << "bluestore.extentmap(" << this << ") " #undef dout_context #define dout_context onode->c->store->cct BlueStore::ExtentMap::ExtentMap(Onode *o) : onode(o), inline_bl( o->c->store->cct->_conf->bluestore_extent_map_inline_shard_prealloc_size) { } void BlueStore::ExtentMap::dump(Formatter* f) const { f->open_array_section("extents"); for (auto& e : extent_map) { f->dump_object("extent", e); } f->close_section(); } void BlueStore::ExtentMap::dup(BlueStore* b, TransContext* txc, CollectionRef& c, OnodeRef& oldo, OnodeRef& newo, uint64_t& srcoff, uint64_t& length, uint64_t& dstoff) { auto cct = onode->c->store->cct; bool inject_21040 = cct->_conf->bluestore_debug_inject_bug21040; vector id_to_blob(oldo->extent_map.extent_map.size()); for (auto& e : oldo->extent_map.extent_map) { e.blob->last_encoded_id = -1; } int n = 0; uint64_t end = srcoff + length; uint32_t dirty_range_begin = 0; uint32_t dirty_range_end = 0; bool src_dirty = false; for (auto ep = oldo->extent_map.seek_lextent(srcoff); ep != oldo->extent_map.extent_map.end(); ++ep) { auto& e = *ep; if (e.logical_offset >= end) { break; } dout(20) << __func__ << " src " << e << dendl; BlobRef cb; bool blob_duped = true; if (e.blob->last_encoded_id >= 0) { cb = id_to_blob[e.blob->last_encoded_id]; blob_duped = false; } else { // dup the blob const bluestore_blob_t& blob = e.blob->get_blob(); // make sure it is shared if (!blob.is_shared()) { c->make_blob_shared(b->_assign_blobid(txc), e.blob); if (!inject_21040 && !src_dirty) { src_dirty = true; dirty_range_begin = e.logical_offset; } else if (inject_21040 && dirty_range_begin == 0 && dirty_range_end == 0) { dirty_range_begin = e.logical_offset; } ceph_assert(e.logical_end() > 0); // -1 to exclude next potential shard dirty_range_end = e.logical_end() - 1; } else { c->load_shared_blob(e.blob->shared_blob); } cb = new Blob(); e.blob->last_encoded_id = n; id_to_blob[n] = cb; e.blob->dup(*cb); // bump the extent refs on the copied blob's extents for (auto p : blob.get_extents()) { if (p.is_valid()) { e.blob->shared_blob->get_ref(p.offset, p.length); } } txc->write_shared_blob(e.blob->shared_blob); dout(20) << __func__ << " new " << *cb << dendl; } int skip_front, skip_back; if (e.logical_offset < srcoff) { skip_front = srcoff - e.logical_offset; } else { skip_front = 0; } if (e.logical_end() > end) { skip_back = e.logical_end() - end; } else { skip_back = 0; } Extent* ne = new Extent(e.logical_offset + skip_front + dstoff - 
srcoff, e.blob_offset + skip_front, e.length - skip_front - skip_back, cb); newo->extent_map.extent_map.insert(*ne); ne->blob->get_ref(c.get(), ne->blob_offset, ne->length); // fixme: we may leave parts of new blob unreferenced that could // be freed (relative to the shared_blob). txc->statfs_delta.stored() += ne->length; if (e.blob->get_blob().is_compressed()) { txc->statfs_delta.compressed_original() += ne->length; if (blob_duped) { txc->statfs_delta.compressed() += cb->get_blob().get_compressed_payload_length(); } } dout(20) << __func__ << " dst " << *ne << dendl; ++n; } if ((!inject_21040 && src_dirty) || (inject_21040 && dirty_range_end > dirty_range_begin)) { oldo->extent_map.dirty_range(dirty_range_begin, dirty_range_end - dirty_range_begin); txc->write_onode(oldo); } txc->write_onode(newo); if (dstoff + length > newo->onode.size) { newo->onode.size = dstoff + length; } newo->extent_map.dirty_range(dstoff, length); } void BlueStore::ExtentMap::update(KeyValueDB::Transaction t, bool force) { auto cct = onode->c->store->cct; //used by dout dout(20) << __func__ << " " << onode->oid << (force ? " force" : "") << dendl; if (onode->onode.extent_map_shards.empty()) { if (inline_bl.length() == 0) { unsigned n; // we need to encode inline_bl to measure encoded length bool never_happen = encode_some(0, OBJECT_MAX_SIZE, inline_bl, &n); inline_bl.reassign_to_mempool(mempool::mempool_bluestore_inline_bl); ceph_assert(!never_happen); size_t len = inline_bl.length(); dout(20) << __func__ << " inline shard " << len << " bytes from " << n << " extents" << dendl; if (!force && len > cct->_conf->bluestore_extent_map_shard_max_size) { request_reshard(0, OBJECT_MAX_SIZE); return; } } // will persist in the onode key. } else { // pending shard update struct dirty_shard_t { Shard *shard; bufferlist bl; dirty_shard_t(Shard *s) : shard(s) {} }; vector encoded_shards; // allocate slots for all shards in a single call instead of // doing multiple allocations - one per each dirty shard encoded_shards.reserve(shards.size()); auto p = shards.begin(); auto prev_p = p; while (p != shards.end()) { ceph_assert(p->shard_info->offset >= prev_p->shard_info->offset); auto n = p; ++n; if (p->dirty) { uint32_t endoff; if (n == shards.end()) { endoff = OBJECT_MAX_SIZE; } else { endoff = n->shard_info->offset; } encoded_shards.emplace_back(dirty_shard_t(&(*p))); bufferlist& bl = encoded_shards.back().bl; if (encode_some(p->shard_info->offset, endoff - p->shard_info->offset, bl, &p->extents)) { if (force) { derr << __func__ << " encode_some needs reshard" << dendl; ceph_assert(!force); } } size_t len = bl.length(); dout(20) << __func__ << " shard 0x" << std::hex << p->shard_info->offset << std::dec << " is " << len << " bytes (was " << p->shard_info->bytes << ") from " << p->extents << " extents" << dendl; if (!force) { if (len > cct->_conf->bluestore_extent_map_shard_max_size) { // we are big; reshard ourselves request_reshard(p->shard_info->offset, endoff); } // avoid resharding the trailing shard, even if it is small else if (n != shards.end() && len < g_conf()->bluestore_extent_map_shard_min_size) { ceph_assert(endoff != OBJECT_MAX_SIZE); if (p == shards.begin()) { // we are the first shard, combine with next shard request_reshard(p->shard_info->offset, endoff + 1); } else { // combine either with the previous shard or the next, // whichever is smaller if (prev_p->shard_info->bytes > n->shard_info->bytes) { request_reshard(p->shard_info->offset, endoff + 1); } else { request_reshard(prev_p->shard_info->offset, endoff); } 
            }
          }
        }
      }
      prev_p = p;
      p = n;
    }
    if (needs_reshard()) {
      return;
    }

    // schedule DB update for dirty shards
    string key;
    for (auto& it : encoded_shards) {
      it.shard->dirty = false;
      it.shard->shard_info->bytes = it.bl.length();
      generate_extent_shard_key_and_apply(
        onode->key,
        it.shard->shard_info->offset,
        &key,
        [&](const string& final_key) {
          t->set(PREFIX_OBJ, final_key, it.bl);
        }
      );
    }
  }
}

bid_t BlueStore::ExtentMap::allocate_spanning_blob_id()
{
  if (spanning_blob_map.empty())
    return 0;
  bid_t bid = spanning_blob_map.rbegin()->first + 1;
  // bid is valid and available.
  if (bid >= 0)
    return bid;
  // Find next unused bid;
  bid = rand() % (numeric_limits<bid_t>::max() + 1);
  const auto begin_bid = bid;
  do {
    if (!spanning_blob_map.count(bid))
      return bid;
    else {
      bid++;
      if (bid < 0) bid = 0;
    }
  } while (bid != begin_bid);
  auto cct = onode->c->store->cct; // used by dout
  _dump_onode<0>(cct, *onode);
  ceph_abort_msg("no available blob id");
}

void BlueStore::ExtentMap::reshard(
  KeyValueDB *db,
  KeyValueDB::Transaction t)
{
  auto cct = onode->c->store->cct; // used by dout
  dout(10) << __func__ << " 0x[" << std::hex << needs_reshard_begin << ","
           << needs_reshard_end << ")" << std::dec
           << " of " << onode->onode.extent_map_shards.size()
           << " shards on " << onode->oid << dendl;
  for (auto& p : spanning_blob_map) {
    dout(20) << __func__ << " spanning blob " << p.first << " " << *p.second
             << dendl;
  }
  // determine shard index range
  unsigned si_begin = 0, si_end = 0;
  if (!shards.empty()) {
    while (si_begin + 1 < shards.size() &&
           shards[si_begin + 1].shard_info->offset <= needs_reshard_begin) {
      ++si_begin;
    }
    needs_reshard_begin = shards[si_begin].shard_info->offset;
    for (si_end = si_begin; si_end < shards.size(); ++si_end) {
      if (shards[si_end].shard_info->offset >= needs_reshard_end) {
        needs_reshard_end = shards[si_end].shard_info->offset;
        break;
      }
    }
    if (si_end == shards.size()) {
      needs_reshard_end = OBJECT_MAX_SIZE;
    }
    dout(20) << __func__ << " shards [" << si_begin << "," << si_end << ")"
             << " over 0x[" << std::hex << needs_reshard_begin << ","
             << needs_reshard_end << ")" << std::dec << dendl;
  }

  fault_range(db, needs_reshard_begin,
              (needs_reshard_end - needs_reshard_begin));

  // we may need to fault in a larger interval later; we must have all
  // referring extents for spanning blobs loaded in order to have
  // accurate use_tracker values.
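  // spanning_scan_begin/end start out equal to the reshard interval and are
  // widened below whenever an extent in the interval references a blob that
  // starts before needs_reshard_begin or ends past needs_reshard_end; the
  // widened range is faulted in before spanning blobs are re-evaluated.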
uint32_t spanning_scan_begin = needs_reshard_begin; uint32_t spanning_scan_end = needs_reshard_end; // remove old keys string key; for (unsigned i = si_begin; i < si_end; ++i) { generate_extent_shard_key_and_apply( onode->key, shards[i].shard_info->offset, &key, [&](const string& final_key) { t->rmkey(PREFIX_OBJ, final_key); } ); } // calculate average extent size unsigned bytes = 0; unsigned extents = 0; if (onode->onode.extent_map_shards.empty()) { bytes = inline_bl.length(); extents = extent_map.size(); } else { for (unsigned i = si_begin; i < si_end; ++i) { bytes += shards[i].shard_info->bytes; extents += shards[i].extents; } } unsigned target = cct->_conf->bluestore_extent_map_shard_target_size; unsigned slop = target * cct->_conf->bluestore_extent_map_shard_target_size_slop; unsigned extent_avg = bytes / std::max(1u, extents); dout(20) << __func__ << " extent_avg " << extent_avg << ", target " << target << ", slop " << slop << dendl; // reshard unsigned estimate = 0; unsigned offset = needs_reshard_begin; vector new_shard_info; unsigned max_blob_end = 0; Extent dummy(needs_reshard_begin); for (auto e = extent_map.lower_bound(dummy); e != extent_map.end(); ++e) { if (e->logical_offset >= needs_reshard_end) { break; } dout(30) << " extent " << *e << dendl; // disfavor shard boundaries that span a blob bool would_span = (e->logical_offset < max_blob_end) || e->blob_offset; if (estimate && estimate + extent_avg > target + (would_span ? slop : 0)) { // new shard if (offset == needs_reshard_begin) { new_shard_info.emplace_back(bluestore_onode_t::shard_info()); new_shard_info.back().offset = offset; dout(20) << __func__ << " new shard 0x" << std::hex << offset << std::dec << dendl; } offset = e->logical_offset; new_shard_info.emplace_back(bluestore_onode_t::shard_info()); new_shard_info.back().offset = offset; dout(20) << __func__ << " new shard 0x" << std::hex << offset << std::dec << dendl; estimate = 0; } estimate += extent_avg; unsigned bs = e->blob_start(); if (bs < spanning_scan_begin) { spanning_scan_begin = bs; } uint32_t be = e->blob_end(); if (be > max_blob_end) { max_blob_end = be; } if (be > spanning_scan_end) { spanning_scan_end = be; } } if (new_shard_info.empty() && (si_begin > 0 || si_end < shards.size())) { // we resharded a partial range; we must produce at least one output // shard new_shard_info.emplace_back(bluestore_onode_t::shard_info()); new_shard_info.back().offset = needs_reshard_begin; dout(20) << __func__ << " new shard 0x" << std::hex << needs_reshard_begin << std::dec << " (singleton degenerate case)" << dendl; } auto& sv = onode->onode.extent_map_shards; dout(20) << __func__ << " new " << new_shard_info << dendl; dout(20) << __func__ << " old " << sv << dendl; if (sv.empty()) { // no old shards to keep sv.swap(new_shard_info); init_shards(true, true); } else { // splice in new shards sv.erase(sv.begin() + si_begin, sv.begin() + si_end); shards.erase(shards.begin() + si_begin, shards.begin() + si_end); sv.insert( sv.begin() + si_begin, new_shard_info.begin(), new_shard_info.end()); shards.insert(shards.begin() + si_begin, new_shard_info.size(), Shard()); si_end = si_begin + new_shard_info.size(); ceph_assert(sv.size() == shards.size()); // note that we need to update every shard_info of shards here, // as sv might have been totally re-allocated above for (unsigned i = 0; i < shards.size(); i++) { shards[i].shard_info = &sv[i]; } // mark newly added shards as dirty for (unsigned i = si_begin; i < si_end; ++i) { shards[i].loaded = true; shards[i].dirty = true; } 
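      // Newly spliced shards are marked loaded (they were just built from the
      // in-memory extent map) and dirty so that the next update() persists
      // them.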
} dout(20) << __func__ << " fin " << sv << dendl; inline_bl.clear(); if (sv.empty()) { // no more shards; unspan all previously spanning blobs auto p = spanning_blob_map.begin(); while (p != spanning_blob_map.end()) { p->second->id = -1; dout(30) << __func__ << " un-spanning " << *p->second << dendl; p = spanning_blob_map.erase(p); } } else { // identify new spanning blobs dout(20) << __func__ << " checking spanning blobs 0x[" << std::hex << spanning_scan_begin << "," << spanning_scan_end << ")" << dendl; if (spanning_scan_begin < needs_reshard_begin) { fault_range(db, spanning_scan_begin, needs_reshard_begin - spanning_scan_begin); } if (spanning_scan_end > needs_reshard_end) { fault_range(db, needs_reshard_end, spanning_scan_end - needs_reshard_end); } auto sp = sv.begin() + si_begin; auto esp = sv.end(); unsigned shard_start = sp->offset; unsigned shard_end; ++sp; if (sp == esp) { shard_end = OBJECT_MAX_SIZE; } else { shard_end = sp->offset; } Extent dummy(needs_reshard_begin); bool was_too_many_blobs_check = false; auto too_many_blobs_threshold = g_conf()->bluestore_debug_too_many_blobs_threshold; auto& dumped_onodes = onode->c->onode_map.cache->dumped_onodes; decltype(onode->c->onode_map.cache->dumped_onodes)::value_type* oid_slot = nullptr; decltype(onode->c->onode_map.cache->dumped_onodes)::value_type* oldest_slot = nullptr; for (auto e = extent_map.lower_bound(dummy); e != extent_map.end(); ++e) { if (e->logical_offset >= needs_reshard_end) { break; } dout(30) << " extent " << *e << dendl; while (e->logical_offset >= shard_end) { shard_start = shard_end; ceph_assert(sp != esp); ++sp; if (sp == esp) { shard_end = OBJECT_MAX_SIZE; } else { shard_end = sp->offset; } dout(30) << __func__ << " shard 0x" << std::hex << shard_start << " to 0x" << shard_end << std::dec << dendl; } if (e->blob_escapes_range(shard_start, shard_end - shard_start)) { if (!e->blob->is_spanning()) { // We have two options: (1) split the blob into pieces at the // shard boundaries (and adjust extents accordingly), or (2) // mark it spanning. We prefer to cut the blob if we can. Note that // we may have to split it multiple times--potentially at every // shard boundary. bool must_span = false; BlobRef b = e->blob; if (b->can_split()) { uint32_t bstart = e->blob_start(); uint32_t bend = e->blob_end(); for (const auto& sh : shards) { if (bstart < sh.shard_info->offset && bend > sh.shard_info->offset) { uint32_t blob_offset = sh.shard_info->offset - bstart; if (b->can_split_at(blob_offset)) { dout(20) << __func__ << " splitting blob, bstart 0x" << std::hex << bstart << " blob_offset 0x" << blob_offset << std::dec << " " << *b << dendl; b = split_blob(b, blob_offset, sh.shard_info->offset); // switch b to the new right-hand side, in case it // *also* has to get split. 
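                // After split_blob() 'b' refers to the right-hand remainder;
                // advance bstart so later shard boundaries are tested against
                // the new blob's start rather than the original one.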
bstart += blob_offset; onode->c->store->logger->inc(l_bluestore_blob_split); } else { must_span = true; break; } } } } else { must_span = true; } if (must_span) { auto bid = allocate_spanning_blob_id(); b->id = bid; spanning_blob_map[b->id] = b; dout(20) << __func__ << " adding spanning " << *b << dendl; if (!was_too_many_blobs_check && too_many_blobs_threshold && spanning_blob_map.size() >= size_t(too_many_blobs_threshold)) { was_too_many_blobs_check = true; for (size_t i = 0; i < dumped_onodes.size(); ++i) { if (dumped_onodes[i].first == onode->oid) { oid_slot = &dumped_onodes[i]; break; } if (!oldest_slot || (oldest_slot && dumped_onodes[i].second < oldest_slot->second)) { oldest_slot = &dumped_onodes[i]; } } } } } } else { if (e->blob->is_spanning()) { spanning_blob_map.erase(e->blob->id); e->blob->id = -1; dout(30) << __func__ << " un-spanning " << *e->blob << dendl; } } } bool do_dump = (!oid_slot && was_too_many_blobs_check) || (oid_slot && (mono_clock::now() - oid_slot->second >= make_timespan(5 * 60))); if (do_dump) { dout(0) << __func__ << " spanning blob count exceeds threshold, " << spanning_blob_map.size() << " spanning blobs" << dendl; _dump_onode<0>(cct, *onode); if (oid_slot) { oid_slot->second = mono_clock::now(); } else { ceph_assert(oldest_slot); oldest_slot->first = onode->oid; oldest_slot->second = mono_clock::now(); } } } clear_needs_reshard(); } bool BlueStore::ExtentMap::encode_some( uint32_t offset, uint32_t length, bufferlist& bl, unsigned *pn) { Extent dummy(offset); auto start = extent_map.lower_bound(dummy); uint32_t end = offset + length; __u8 struct_v = 2; // Version 2 differs from v1 in blob's ref_map // serialization only. Hence there is no specific // handling at ExtentMap level. unsigned n = 0; size_t bound = 0; bool must_reshard = false; for (auto p = start; p != extent_map.end() && p->logical_offset < end; ++p, ++n) { ceph_assert(p->logical_offset >= offset); p->blob->last_encoded_id = -1; if (!p->blob->is_spanning() && p->blob_escapes_range(offset, length)) { dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length << std::dec << " hit new spanning blob " << *p << dendl; request_reshard(p->blob_start(), p->blob_end()); must_reshard = true; } if (!must_reshard) { denc_varint(0, bound); // blobid denc_varint(0, bound); // logical_offset denc_varint(0, bound); // len denc_varint(0, bound); // blob_offset p->blob->bound_encode( bound, struct_v, p->blob->shared_blob->get_sbid(), false); } } if (must_reshard) { return true; } denc(struct_v, bound); denc_varint(0, bound); // number of extents { auto app = bl.get_contiguous_appender(bound); denc(struct_v, app); denc_varint(n, app); if (pn) { *pn = n; } n = 0; uint64_t pos = 0; uint64_t prev_len = 0; for (auto p = start; p != extent_map.end() && p->logical_offset < end; ++p, ++n) { unsigned blobid; bool include_blob = false; if (p->blob->is_spanning()) { blobid = p->blob->id << BLOBID_SHIFT_BITS; blobid |= BLOBID_FLAG_SPANNING; } else if (p->blob->last_encoded_id < 0) { p->blob->last_encoded_id = n + 1; // so it is always non-zero include_blob = true; blobid = 0; // the decoder will infer the id from n } else { blobid = p->blob->last_encoded_id << BLOBID_SHIFT_BITS; } if (p->logical_offset == pos) { blobid |= BLOBID_FLAG_CONTIGUOUS; } if (p->blob_offset == 0) { blobid |= BLOBID_FLAG_ZEROOFFSET; } if (p->length == prev_len) { blobid |= BLOBID_FLAG_SAMELENGTH; } else { prev_len = p->length; } denc_varint(blobid, app); if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) { denc_varint_lowz(p->logical_offset - 
pos, app); } if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) { denc_varint_lowz(p->blob_offset, app); } if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) { denc_varint_lowz(p->length, app); } pos = p->logical_end(); if (include_blob) { p->blob->encode(app, struct_v, p->blob->shared_blob->get_sbid(), false); } } } /*derr << __func__ << bl << dendl; derr << __func__ << ":"; bl.hexdump(*_dout); *_dout << dendl; */ return false; } unsigned BlueStore::ExtentMap::decode_some(bufferlist& bl) { /* derr << __func__ << ":"; bl.hexdump(*_dout); *_dout << dendl; */ ceph_assert(bl.get_num_buffers() <= 1); auto p = bl.front().begin_deep(); __u8 struct_v; denc(struct_v, p); // Version 2 differs from v1 in blob's ref_map // serialization only. Hence there is no specific // handling at ExtentMap level below. ceph_assert(struct_v == 1 || struct_v == 2); uint32_t num; denc_varint(num, p); vector blobs(num); uint64_t pos = 0; uint64_t prev_len = 0; unsigned n = 0; while (!p.end()) { Extent *le = new Extent(); uint64_t blobid; denc_varint(blobid, p); if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) { uint64_t gap; denc_varint_lowz(gap, p); pos += gap; } le->logical_offset = pos; if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) { denc_varint_lowz(le->blob_offset, p); } else { le->blob_offset = 0; } if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) { denc_varint_lowz(prev_len, p); } le->length = prev_len; if (blobid & BLOBID_FLAG_SPANNING) { dout(30) << __func__ << " getting spanning blob " << (blobid >> BLOBID_SHIFT_BITS) << dendl; le->assign_blob(get_spanning_blob(blobid >> BLOBID_SHIFT_BITS)); } else { blobid >>= BLOBID_SHIFT_BITS; if (blobid) { le->assign_blob(blobs[blobid - 1]); ceph_assert(le->blob); } else { Blob *b = new Blob(); uint64_t sbid = 0; b->decode(onode->c, p, struct_v, &sbid, false); blobs[n] = b; onode->c->open_shared_blob(sbid, b); le->assign_blob(b); } // we build ref_map dynamically for non-spanning blobs le->blob->get_ref( onode->c, le->blob_offset, le->length); } pos += prev_len; ++n; extent_map.insert(*le); } ceph_assert(n == num); return num; } void BlueStore::ExtentMap::bound_encode_spanning_blobs(size_t& p) { // Version 2 differs from v1 in blob's ref_map // serialization only. Hence there is no specific // handling at ExtentMap level. __u8 struct_v = 2; denc(struct_v, p); denc_varint((uint32_t)0, p); size_t key_size = 0; denc_varint((uint32_t)0, key_size); p += spanning_blob_map.size() * key_size; for (const auto& i : spanning_blob_map) { i.second->bound_encode(p, struct_v, i.second->shared_blob->get_sbid(), true); } } void BlueStore::ExtentMap::encode_spanning_blobs( bufferlist::contiguous_appender& p) { // Version 2 differs from v1 in blob's ref_map // serialization only. Hence there is no specific // handling at ExtentMap level. __u8 struct_v = 2; denc(struct_v, p); denc_varint(spanning_blob_map.size(), p); for (auto& i : spanning_blob_map) { denc_varint(i.second->id, p); i.second->encode(p, struct_v, i.second->shared_blob->get_sbid(), true); } } void BlueStore::ExtentMap::decode_spanning_blobs( bufferptr::const_iterator& p) { __u8 struct_v; denc(struct_v, p); // Version 2 differs from v1 in blob's ref_map // serialization only. Hence there is no specific // handling at ExtentMap level. 
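  // Each entry is keyed by its spanning blob id and, unlike the non-spanning
  // blobs handled in decode_some(), is decoded with include_ref_map = true,
  // so used_in_blob is restored directly instead of being rebuilt from the
  // referring extents.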
ceph_assert(struct_v == 1 || struct_v == 2); unsigned n; denc_varint(n, p); while (n--) { BlobRef b(new Blob()); denc_varint(b->id, p); spanning_blob_map[b->id] = b; uint64_t sbid = 0; b->decode(onode->c, p, struct_v, &sbid, true); onode->c->open_shared_blob(sbid, b); } } void BlueStore::ExtentMap::init_shards(bool loaded, bool dirty) { shards.resize(onode->onode.extent_map_shards.size()); unsigned i = 0; for (auto &s : onode->onode.extent_map_shards) { shards[i].shard_info = &s; shards[i].loaded = loaded; shards[i].dirty = dirty; ++i; } } void BlueStore::ExtentMap::fault_range( KeyValueDB *db, uint32_t offset, uint32_t length) { dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length << std::dec << dendl; auto start = seek_shard(offset); auto last = seek_shard(offset + length); if (start < 0) return; ceph_assert(last >= start); string key; while (start <= last) { ceph_assert((size_t)start < shards.size()); auto p = &shards[start]; if (!p->loaded) { dout(30) << __func__ << " opening shard 0x" << std::hex << p->shard_info->offset << std::dec << dendl; bufferlist v; generate_extent_shard_key_and_apply( onode->key, p->shard_info->offset, &key, [&](const string& final_key) { int r = db->get(PREFIX_OBJ, final_key, &v); if (r < 0) { derr << __func__ << " missing shard 0x" << std::hex << p->shard_info->offset << std::dec << " for " << onode->oid << dendl; ceph_assert(r >= 0); } } ); p->extents = decode_some(v); p->loaded = true; dout(20) << __func__ << " open shard 0x" << std::hex << p->shard_info->offset << " for range 0x" << offset << "~" << length << std::dec << " (" << v.length() << " bytes)" << dendl; ceph_assert(p->dirty == false); ceph_assert(v.length() == p->shard_info->bytes); onode->c->store->logger->inc(l_bluestore_onode_shard_misses); } else { onode->c->store->logger->inc(l_bluestore_onode_shard_hits); } ++start; } } void BlueStore::ExtentMap::dirty_range( uint32_t offset, uint32_t length) { dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length << std::dec << dendl; if (shards.empty()) { dout(20) << __func__ << " mark inline shard dirty" << dendl; inline_bl.clear(); return; } auto start = seek_shard(offset); if (length == 0) { length = 1; } auto last = seek_shard(offset + length - 1); if (start < 0) return; ceph_assert(last >= start); while (start <= last) { ceph_assert((size_t)start < shards.size()); auto p = &shards[start]; if (!p->loaded) { derr << __func__ << "on write 0x" << std::hex << offset << "~" << length << " shard 0x" << p->shard_info->offset << std::dec << " is not loaded, can't mark dirty" << dendl; ceph_abort_msg("can't mark unloaded shard dirty"); } if (!p->dirty) { dout(20) << __func__ << " mark shard 0x" << std::hex << p->shard_info->offset << std::dec << " dirty" << dendl; p->dirty = true; } ++start; } } BlueStore::extent_map_t::iterator BlueStore::ExtentMap::find( uint64_t offset) { Extent dummy(offset); return extent_map.find(dummy); } BlueStore::extent_map_t::iterator BlueStore::ExtentMap::seek_lextent( uint64_t offset) { Extent dummy(offset); auto fp = extent_map.lower_bound(dummy); if (fp != extent_map.begin()) { --fp; if (fp->logical_end() <= offset) { ++fp; } } return fp; } BlueStore::extent_map_t::const_iterator BlueStore::ExtentMap::seek_lextent( uint64_t offset) const { Extent dummy(offset); auto fp = extent_map.lower_bound(dummy); if (fp != extent_map.begin()) { --fp; if (fp->logical_end() <= offset) { ++fp; } } return fp; } bool BlueStore::ExtentMap::has_any_lextents(uint64_t offset, uint64_t length) { auto fp = 
seek_lextent(offset); if (fp == extent_map.end() || fp->logical_offset >= offset + length) { return false; } return true; } int BlueStore::ExtentMap::compress_extent_map( uint64_t offset, uint64_t length) { if (extent_map.empty()) return 0; int removed = 0; auto p = seek_lextent(offset); if (p != extent_map.begin()) { --p; // start to the left of offset } // the caller should have just written to this region ceph_assert(p != extent_map.end()); // identify the *next* shard auto pshard = shards.begin(); while (pshard != shards.end() && p->logical_offset >= pshard->shard_info->offset) { ++pshard; } uint64_t shard_end; if (pshard != shards.end()) { shard_end = pshard->shard_info->offset; } else { shard_end = OBJECT_MAX_SIZE; } auto n = p; for (++n; n != extent_map.end(); p = n++) { if (n->logical_offset > offset + length) { break; // stop after end } while (n != extent_map.end() && p->logical_end() == n->logical_offset && p->blob == n->blob && p->blob_offset + p->length == n->blob_offset && n->logical_offset < shard_end) { dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length << " next shard 0x" << shard_end << std::dec << " merging " << *p << " and " << *n << dendl; p->length += n->length; rm(n++); ++removed; } if (n == extent_map.end()) { break; } if (n->logical_offset >= shard_end) { ceph_assert(pshard != shards.end()); ++pshard; if (pshard != shards.end()) { shard_end = pshard->shard_info->offset; } else { shard_end = OBJECT_MAX_SIZE; } } } if (removed) { onode->c->store->logger->inc(l_bluestore_extent_compress, removed); } return removed; } void BlueStore::ExtentMap::punch_hole( CollectionRef &c, uint64_t offset, uint64_t length, old_extent_map_t *old_extents) { auto p = seek_lextent(offset); uint64_t end = offset + length; while (p != extent_map.end()) { if (p->logical_offset >= end) { break; } if (p->logical_offset < offset) { if (p->logical_end() > end) { // split and deref middle uint64_t front = offset - p->logical_offset; OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + front, length, p->blob); old_extents->push_back(*oe); add(end, p->blob_offset + front + length, p->length - front - length, p->blob); p->length = front; break; } else { // deref tail ceph_assert(p->logical_end() > offset); // else seek_lextent bug uint64_t keep = offset - p->logical_offset; OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + keep, p->length - keep, p->blob); old_extents->push_back(*oe); p->length = keep; ++p; continue; } } if (p->logical_offset + p->length <= end) { // deref whole lextent OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset, p->length, p->blob); old_extents->push_back(*oe); rm(p++); continue; } // deref head uint64_t keep = p->logical_end() - end; OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset, p->length - keep, p->blob); old_extents->push_back(*oe); add(end, p->blob_offset + p->length - keep, keep, p->blob); rm(p); break; } } BlueStore::Extent *BlueStore::ExtentMap::set_lextent( CollectionRef &c, uint64_t logical_offset, uint64_t blob_offset, uint64_t length, BlobRef b, old_extent_map_t *old_extents) { // We need to have completely initialized Blob to increment its ref counters. ceph_assert(b->get_blob().get_logical_length() != 0); // Do get_ref prior to punch_hole to prevent from putting reused blob into // old_extents list if we overwre the blob totally // This might happen during WAL overwrite. 
b->get_ref(onode->c, blob_offset, length); if (old_extents) { punch_hole(c, logical_offset, length, old_extents); } Extent *le = new Extent(logical_offset, blob_offset, length, b); extent_map.insert(*le); if (spans_shard(logical_offset, length)) { request_reshard(logical_offset, logical_offset + length); } return le; } BlueStore::BlobRef BlueStore::ExtentMap::split_blob( BlobRef lb, uint32_t blob_offset, uint32_t pos) { uint32_t end_pos = pos + lb->get_blob().get_logical_length() - blob_offset; dout(20) << __func__ << " 0x" << std::hex << pos << " end 0x" << end_pos << " blob_offset 0x" << blob_offset << std::dec << " " << *lb << dendl; BlobRef rb = onode->c->new_blob(); lb->split(onode->c, blob_offset, rb.get()); for (auto ep = seek_lextent(pos); ep != extent_map.end() && ep->logical_offset < end_pos; ++ep) { if (ep->blob != lb) { continue; } if (ep->logical_offset < pos) { // split extent size_t left = pos - ep->logical_offset; Extent *ne = new Extent(pos, 0, ep->length - left, rb); extent_map.insert(*ne); ep->length = left; dout(30) << __func__ << " split " << *ep << dendl; dout(30) << __func__ << " to " << *ne << dendl; } else { // switch blob ceph_assert(ep->blob_offset >= blob_offset); ep->blob = rb; ep->blob_offset -= blob_offset; dout(30) << __func__ << " adjusted " << *ep << dendl; } } return rb; } // Onode #undef dout_prefix #define dout_prefix *_dout << "bluestore.onode(" << this << ")." << __func__ << " " void BlueStore::Onode::get() { if (++nref >= 2 && !pinned) { OnodeCacheShard* ocs = c->get_onode_cache(); ocs->lock.lock(); // It is possible that during waiting split_cache moved us to different OnodeCacheShard. while (ocs != c->get_onode_cache()) { ocs->lock.unlock(); ocs = c->get_onode_cache(); ocs->lock.lock(); } bool was_pinned = pinned; pinned = nref >= 2; bool r = !was_pinned && pinned; if (cached && r) { ocs->_pin(this); } ocs->lock.unlock(); } } void BlueStore::Onode::put() { ++put_nref; int n = --nref; if (n == 1) { OnodeCacheShard* ocs = c->get_onode_cache(); ocs->lock.lock(); // It is possible that during waiting split_cache moved us to different OnodeCacheShard. 
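    // Re-check after acquiring the lock: if the collection was re-pointed at
    // a different shard while we waited, drop this shard's lock and retry
    // against the shard the collection references now.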
while (ocs != c->get_onode_cache()) { ocs->lock.unlock(); ocs = c->get_onode_cache(); ocs->lock.lock(); } bool need_unpin = pinned; pinned = pinned && nref >= 2; need_unpin = need_unpin && !pinned; if (cached && need_unpin) { if (exists) { ocs->_unpin(this); } else { ocs->_unpin_and_rm(this); // remove will also decrement nref c->onode_map._remove(oid); } } ocs->lock.unlock(); } auto pn = --put_nref; if (nref == 0 && pn == 0) { delete this; } } BlueStore::Onode* BlueStore::Onode::decode( CollectionRef c, const ghobject_t& oid, const string& key, const bufferlist& v) { Onode* on = new Onode(c.get(), oid, key); on->exists = true; auto p = v.front().begin_deep(); on->onode.decode(p); for (auto& i : on->onode.attrs) { i.second.reassign_to_mempool(mempool::mempool_bluestore_cache_meta); } // initialize extent_map on->extent_map.decode_spanning_blobs(p); if (on->onode.extent_map_shards.empty()) { denc(on->extent_map.inline_bl, p); on->extent_map.decode_some(on->extent_map.inline_bl); on->extent_map.inline_bl.reassign_to_mempool( mempool::mempool_bluestore_cache_data); } else { on->extent_map.init_shards(false, false); } return on; } void BlueStore::Onode::flush() { if (flushing_count.load()) { ldout(c->store->cct, 20) << __func__ << " cnt:" << flushing_count << dendl; waiting_count++; std::unique_lock l(flush_lock); while (flushing_count.load()) { flush_cond.wait(l); } waiting_count--; } ldout(c->store->cct, 20) << __func__ << " done" << dendl; } void BlueStore::Onode::dump(Formatter* f) const { onode.dump(f); extent_map.dump(f); } const std::string& BlueStore::Onode::calc_omap_prefix(uint8_t flags) { if (bluestore_onode_t::is_pgmeta_omap(flags)) { return PREFIX_PGMETA_OMAP; } if (bluestore_onode_t::is_perpg_omap(flags)) { return PREFIX_PERPG_OMAP; } if (bluestore_onode_t::is_perpool_omap(flags)) { return PREFIX_PERPOOL_OMAP; } return PREFIX_OMAP; } // '-' < '.' 
< '~' void BlueStore::Onode::calc_omap_header( uint8_t flags, const Onode* o, std::string* out) { if (!bluestore_onode_t::is_pgmeta_omap(flags)) { if (bluestore_onode_t::is_perpg_omap(flags)) { _key_encode_u64(o->c->pool(), out); _key_encode_u32(o->oid.hobj.get_bitwise_key_u32(), out); } else if (bluestore_onode_t::is_perpool_omap(flags)) { _key_encode_u64(o->c->pool(), out); } } _key_encode_u64(o->onode.nid, out); out->push_back('-'); } void BlueStore::Onode::calc_omap_key(uint8_t flags, const Onode* o, const std::string& key, std::string* out) { if (!bluestore_onode_t::is_pgmeta_omap(flags)) { if (bluestore_onode_t::is_perpg_omap(flags)) { _key_encode_u64(o->c->pool(), out); _key_encode_u32(o->oid.hobj.get_bitwise_key_u32(), out); } else if (bluestore_onode_t::is_perpool_omap(flags)) { _key_encode_u64(o->c->pool(), out); } } _key_encode_u64(o->onode.nid, out); out->push_back('.'); out->append(key); } void BlueStore::Onode::rewrite_omap_key(const string& old, string *out) { if (!onode.is_pgmeta_omap()) { if (onode.is_perpg_omap()) { _key_encode_u64(c->pool(), out); _key_encode_u32(oid.hobj.get_bitwise_key_u32(), out); } else if (onode.is_perpool_omap()) { _key_encode_u64(c->pool(), out); } } _key_encode_u64(onode.nid, out); out->append(old.c_str() + out->length(), old.size() - out->length()); } void BlueStore::Onode::calc_omap_tail( uint8_t flags, const Onode* o, std::string* out) { if (!bluestore_onode_t::is_pgmeta_omap(flags)) { if (bluestore_onode_t::is_perpg_omap(flags)) { _key_encode_u64(o->c->pool(), out); _key_encode_u32(o->oid.hobj.get_bitwise_key_u32(), out); } else if (bluestore_onode_t::is_perpool_omap(flags)) { _key_encode_u64(o->c->pool(), out); } } _key_encode_u64(o->onode.nid, out); out->push_back('~'); } void BlueStore::Onode::decode_omap_key(const string& key, string *user_key) { size_t pos = sizeof(uint64_t) + 1; if (!onode.is_pgmeta_omap()) { if (onode.is_perpg_omap()) { pos += sizeof(uint64_t) + sizeof(uint32_t); } else if (onode.is_perpool_omap()) { pos += sizeof(uint64_t); } } *user_key = key.substr(pos); } // ======================================================= // WriteContext /// Checks for writes to the same pextent within a blob bool BlueStore::WriteContext::has_conflict( BlobRef b, uint64_t loffs, uint64_t loffs_end, uint64_t min_alloc_size) { ceph_assert((loffs % min_alloc_size) == 0); ceph_assert((loffs_end % min_alloc_size) == 0); for (auto w : writes) { if (b == w.b) { auto loffs2 = p2align(w.logical_offset, min_alloc_size); auto loffs2_end = p2roundup(w.logical_offset + w.length0, min_alloc_size); if ((loffs <= loffs2 && loffs_end > loffs2) || (loffs >= loffs2 && loffs < loffs2_end)) { return true; } } } return false; } // ======================================================= // DeferredBatch #undef dout_prefix #define dout_prefix *_dout << "bluestore.DeferredBatch(" << this << ") " #undef dout_context #define dout_context cct void BlueStore::DeferredBatch::prepare_write( CephContext *cct, uint64_t seq, uint64_t offset, uint64_t length, bufferlist::const_iterator& blp) { _discard(cct, offset, length); auto i = iomap.insert(make_pair(offset, deferred_io())); ceph_assert(i.second); // this should be a new insertion i.first->second.seq = seq; blp.copy(length, i.first->second.bl); i.first->second.bl.reassign_to_mempool( mempool::mempool_bluestore_writing_deferred); dout(20) << __func__ << " seq " << seq << " 0x" << std::hex << offset << "~" << length << " crc " << i.first->second.bl.crc32c(-1) << std::dec << dendl; seq_bytes[seq] += length; #ifdef 
DEBUG_DEFERRED _audit(cct); #endif } void BlueStore::DeferredBatch::_discard( CephContext *cct, uint64_t offset, uint64_t length) { generic_dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length << std::dec << dendl; auto p = iomap.lower_bound(offset); if (p != iomap.begin()) { --p; auto end = p->first + p->second.bl.length(); if (end > offset) { bufferlist head; head.substr_of(p->second.bl, 0, offset - p->first); dout(20) << __func__ << " keep head " << p->second.seq << " 0x" << std::hex << p->first << "~" << p->second.bl.length() << " -> 0x" << head.length() << std::dec << dendl; auto i = seq_bytes.find(p->second.seq); ceph_assert(i != seq_bytes.end()); if (end > offset + length) { bufferlist tail; tail.substr_of(p->second.bl, offset + length - p->first, end - (offset + length)); dout(20) << __func__ << " keep tail " << p->second.seq << " 0x" << std::hex << p->first << "~" << p->second.bl.length() << " -> 0x" << tail.length() << std::dec << dendl; auto &n = iomap[offset + length]; n.bl.swap(tail); n.seq = p->second.seq; i->second -= length; } else { i->second -= end - offset; } ceph_assert(i->second >= 0); p->second.bl.swap(head); } ++p; } while (p != iomap.end()) { if (p->first >= offset + length) { break; } auto i = seq_bytes.find(p->second.seq); ceph_assert(i != seq_bytes.end()); auto end = p->first + p->second.bl.length(); if (end > offset + length) { unsigned drop_front = offset + length - p->first; unsigned keep_tail = end - (offset + length); dout(20) << __func__ << " truncate front " << p->second.seq << " 0x" << std::hex << p->first << "~" << p->second.bl.length() << " drop_front 0x" << drop_front << " keep_tail 0x" << keep_tail << " to 0x" << (offset + length) << "~" << keep_tail << std::dec << dendl; auto &s = iomap[offset + length]; s.seq = p->second.seq; s.bl.substr_of(p->second.bl, drop_front, keep_tail); i->second -= drop_front; } else { dout(20) << __func__ << " drop " << p->second.seq << " 0x" << std::hex << p->first << "~" << p->second.bl.length() << std::dec << dendl; i->second -= p->second.bl.length(); } ceph_assert(i->second >= 0); p = iomap.erase(p); } } void BlueStore::DeferredBatch::_audit(CephContext *cct) { map sb; for (auto p : seq_bytes) { sb[p.first] = 0; // make sure we have the same set of keys } uint64_t pos = 0; for (auto& p : iomap) { ceph_assert(p.first >= pos); sb[p.second.seq] += p.second.bl.length(); pos = p.first + p.second.bl.length(); } ceph_assert(sb == seq_bytes); } // Collection #undef dout_prefix #define dout_prefix *_dout << "bluestore(" << store->path << ").collection(" << cid << " " << this << ") " BlueStore::Collection::Collection(BlueStore *store_, OnodeCacheShard *oc, BufferCacheShard *bc, coll_t cid) : CollectionImpl(store_->cct, cid), store(store_), cache(bc), exists(true), onode_map(oc), commit_queue(nullptr) { } bool BlueStore::Collection::flush_commit(Context *c) { return osr->flush_commit(c); } void BlueStore::Collection::flush() { osr->flush(); } void BlueStore::Collection::flush_all_but_last() { osr->flush_all_but_last(); } void BlueStore::Collection::open_shared_blob(uint64_t sbid, BlobRef b) { ceph_assert(!b->shared_blob); const bluestore_blob_t& blob = b->get_blob(); if (!blob.is_shared()) { b->shared_blob = new SharedBlob(this); return; } b->shared_blob = shared_blob_set.lookup(sbid); if (b->shared_blob) { ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid << std::dec << " had " << *b->shared_blob << dendl; } else { b->shared_blob = new SharedBlob(sbid, this); shared_blob_set.add(this, 
b->shared_blob.get()); ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid << std::dec << " opened " << *b->shared_blob << dendl; } } void BlueStore::Collection::load_shared_blob(SharedBlobRef sb) { if (!sb->is_loaded()) { bufferlist v; string key; auto sbid = sb->get_sbid(); get_shared_blob_key(sbid, &key); int r = store->db->get(PREFIX_SHARED_BLOB, key, &v); if (r < 0) { lderr(store->cct) << __func__ << " sbid 0x" << std::hex << sbid << std::dec << " not found at key " << pretty_binary_string(key) << dendl; ceph_abort_msg("uh oh, missing shared_blob"); } sb->loaded = true; sb->persistent = new bluestore_shared_blob_t(sbid); auto p = v.cbegin(); decode(*(sb->persistent), p); ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid << std::dec << " loaded shared_blob " << *sb << dendl; } } void BlueStore::Collection::make_blob_shared(uint64_t sbid, BlobRef b) { ldout(store->cct, 10) << __func__ << " " << *b << dendl; ceph_assert(!b->shared_blob->is_loaded()); // update blob bluestore_blob_t& blob = b->dirty_blob(); blob.set_flag(bluestore_blob_t::FLAG_SHARED); // update shared blob b->shared_blob->loaded = true; b->shared_blob->persistent = new bluestore_shared_blob_t(sbid); shared_blob_set.add(this, b->shared_blob.get()); for (auto p : blob.get_extents()) { if (p.is_valid()) { b->shared_blob->get_ref( p.offset, p.length); } } ldout(store->cct, 20) << __func__ << " now " << *b << dendl; } uint64_t BlueStore::Collection::make_blob_unshared(SharedBlob *sb) { ldout(store->cct, 10) << __func__ << " " << *sb << dendl; ceph_assert(sb->is_loaded()); uint64_t sbid = sb->get_sbid(); shared_blob_set.remove(sb); sb->loaded = false; delete sb->persistent; sb->sbid_unloaded = 0; ldout(store->cct, 20) << __func__ << " now " << *sb << dendl; return sbid; } BlueStore::OnodeRef BlueStore::Collection::get_onode( const ghobject_t& oid, bool create, bool is_createop) { ceph_assert(create ? 
ceph_mutex_is_wlocked(lock) : ceph_mutex_is_locked(lock)); spg_t pgid; if (cid.is_pg(&pgid)) { if (!oid.match(cnode.bits, pgid.ps())) { lderr(store->cct) << __func__ << " oid " << oid << " not part of " << pgid << " bits " << cnode.bits << dendl; ceph_abort(); } } OnodeRef o = onode_map.lookup(oid); if (o) return o; string key; get_object_key(store->cct, oid, &key); ldout(store->cct, 20) << __func__ << " oid " << oid << " key " << pretty_binary_string(key) << dendl; bufferlist v; int r = -ENOENT; Onode *on; if (!is_createop) { r = store->db->get(PREFIX_OBJ, key.c_str(), key.size(), &v); ldout(store->cct, 20) << " r " << r << " v.len " << v.length() << dendl; } if (v.length() == 0) { ceph_assert(r == -ENOENT); if (!create) return OnodeRef(); // new object, new onode on = new Onode(this, oid, key); } else { // loaded ceph_assert(r >= 0); on = Onode::decode(this, oid, key, v); } o.reset(on); return onode_map.add(oid, o); } void BlueStore::Collection::split_cache( Collection *dest) { ldout(store->cct, 10) << __func__ << " to " << dest << dendl; auto *ocache = get_onode_cache(); auto *ocache_dest = dest->get_onode_cache(); // lock cache shards std::lock(ocache->lock, ocache_dest->lock, cache->lock, dest->cache->lock); std::lock_guard l(ocache->lock, std::adopt_lock); std::lock_guard l2(ocache_dest->lock, std::adopt_lock); std::lock_guard l3(cache->lock, std::adopt_lock); std::lock_guard l4(dest->cache->lock, std::adopt_lock); int destbits = dest->cnode.bits; spg_t destpg; bool is_pg = dest->cid.is_pg(&destpg); ceph_assert(is_pg); auto p = onode_map.onode_map.begin(); while (p != onode_map.onode_map.end()) { OnodeRef o = p->second; if (!p->second->oid.match(destbits, destpg.pgid.ps())) { // onode does not belong to this child ldout(store->cct, 20) << __func__ << " not moving " << o << " " << o->oid << dendl; ++p; } else { ldout(store->cct, 20) << __func__ << " moving " << o << " " << o->oid << dendl; // ensuring that nref is always >= 2 and hence onode is pinned and // physically out of cache during the transition OnodeRef o_pin = o; ceph_assert(o->pinned); p = onode_map.onode_map.erase(p); dest->onode_map.onode_map[o->oid] = o; if (o->cached) { get_onode_cache()->move_pinned(dest->get_onode_cache(), o.get()); } o->c = dest; // move over shared blobs and buffers. 
cover shared blobs from // both extent map and spanning blob map (the full extent map // may not be faulted in) vector sbvec; for (auto& e : o->extent_map.extent_map) { sbvec.push_back(e.blob->shared_blob.get()); } for (auto& b : o->extent_map.spanning_blob_map) { sbvec.push_back(b.second->shared_blob.get()); } for (auto sb : sbvec) { if (sb->coll == dest) { ldout(store->cct, 20) << __func__ << " already moved " << *sb << dendl; continue; } ldout(store->cct, 20) << __func__ << " moving " << *sb << dendl; if (sb->get_sbid()) { ldout(store->cct, 20) << __func__ << " moving registration " << *sb << dendl; shared_blob_set.remove(sb); dest->shared_blob_set.add(dest, sb); } sb->coll = dest; if (dest->cache != cache) { for (auto& i : sb->bc.buffer_map) { if (!i.second->is_writing()) { ldout(store->cct, 20) << __func__ << " moving " << *i.second << dendl; dest->cache->_move(cache, i.second.get()); } } } } } } dest->cache->_trim(); } // ======================================================= // MempoolThread #undef dout_prefix #define dout_prefix *_dout << "bluestore.MempoolThread(" << this << ") " #undef dout_context #define dout_context store->cct void *BlueStore::MempoolThread::entry() { std::unique_lock l{lock}; uint32_t prev_config_change = store->config_changed.load(); uint64_t base = store->osd_memory_base; double fragmentation = store->osd_memory_expected_fragmentation; uint64_t target = store->osd_memory_target; uint64_t min = store->osd_memory_cache_min; uint64_t max = min; // When setting the maximum amount of memory to use for cache, first // assume some base amount of memory for the OSD and then fudge in // some overhead for fragmentation that scales with cache usage. uint64_t ltarget = (1.0 - fragmentation) * target; if (ltarget > base + min) { max = ltarget - base; } binned_kv_cache = store->db->get_priority_cache(); binned_kv_onode_cache = store->db->get_priority_cache(PREFIX_OBJ); if (store->cache_autotune && binned_kv_cache != nullptr) { pcm = std::make_shared( store->cct, min, max, target, true, "bluestore-pricache"); pcm->insert("kv", binned_kv_cache, true); pcm->insert("meta", meta_cache, true); pcm->insert("data", data_cache, true); if (binned_kv_onode_cache != nullptr) { pcm->insert("kv_onode", binned_kv_onode_cache, true); } } utime_t next_balance = ceph_clock_now(); utime_t next_resize = ceph_clock_now(); utime_t next_deferred_force_submit = ceph_clock_now(); utime_t alloc_stats_dump_clock = ceph_clock_now(); bool interval_stats_trim = false; while (!stop) { // Update pcm cache settings if related configuration was changed uint32_t cur_config_change = store->config_changed.load(); if (cur_config_change != prev_config_change) { _update_cache_settings(); prev_config_change = cur_config_change; } // Before we trim, check and see if it's time to rebalance/resize. double autotune_interval = store->cache_autotune_interval; double resize_interval = store->osd_memory_cache_resize_interval; double max_defer_interval = store->max_defer_interval; double alloc_stats_dump_interval = store->cct->_conf->bluestore_alloc_stats_dump_interval; if (alloc_stats_dump_interval > 0 && alloc_stats_dump_clock + alloc_stats_dump_interval < ceph_clock_now()) { store->_record_allocation_stats(); alloc_stats_dump_clock = ceph_clock_now(); } if (autotune_interval > 0 && next_balance < ceph_clock_now()) { _adjust_cache_settings(); // Log events at 5 instead of 20 when balance happens. 
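      // interval_stats_trim is consumed by _resize_shards() below, which logs
      // its cache usage summary at level 5 on balance intervals and at level
      // 20 otherwise.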
interval_stats_trim = true; if (pcm != nullptr) { pcm->balance(); } next_balance = ceph_clock_now(); next_balance += autotune_interval; } if (resize_interval > 0 && next_resize < ceph_clock_now()) { if (ceph_using_tcmalloc() && pcm != nullptr) { pcm->tune_memory(); } next_resize = ceph_clock_now(); next_resize += resize_interval; } if (max_defer_interval > 0 && next_deferred_force_submit < ceph_clock_now()) { if (store->get_deferred_last_submitted() + max_defer_interval < ceph_clock_now()) { store->deferred_try_submit(); } next_deferred_force_submit = ceph_clock_now(); next_deferred_force_submit += max_defer_interval/3; } // Now Resize the shards _resize_shards(interval_stats_trim); interval_stats_trim = false; store->_update_cache_logger(); auto wait = ceph::make_timespan( store->cct->_conf->bluestore_cache_trim_interval); cond.wait_for(l, wait); } // do final dump store->_record_allocation_stats(); stop = false; pcm = nullptr; return NULL; } void BlueStore::MempoolThread::_adjust_cache_settings() { if (binned_kv_cache != nullptr) { binned_kv_cache->set_cache_ratio(store->cache_kv_ratio); } if (binned_kv_onode_cache != nullptr) { binned_kv_onode_cache->set_cache_ratio(store->cache_kv_onode_ratio); } meta_cache->set_cache_ratio(store->cache_meta_ratio); data_cache->set_cache_ratio(store->cache_data_ratio); } void BlueStore::MempoolThread::_resize_shards(bool interval_stats) { size_t onode_shards = store->onode_cache_shards.size(); size_t buffer_shards = store->buffer_cache_shards.size(); int64_t kv_used = store->db->get_cache_usage(); int64_t kv_onode_used = store->db->get_cache_usage(PREFIX_OBJ); int64_t meta_used = meta_cache->_get_used_bytes(); int64_t data_used = data_cache->_get_used_bytes(); uint64_t cache_size = store->cache_size; int64_t kv_alloc = static_cast(store->cache_kv_ratio * cache_size); int64_t kv_onode_alloc = static_cast(store->cache_kv_onode_ratio * cache_size); int64_t meta_alloc = static_cast(store->cache_meta_ratio * cache_size); int64_t data_alloc = static_cast(store->cache_data_ratio * cache_size); if (pcm != nullptr && binned_kv_cache != nullptr) { cache_size = pcm->get_tuned_mem(); kv_alloc = binned_kv_cache->get_committed_size(); meta_alloc = meta_cache->get_committed_size(); data_alloc = data_cache->get_committed_size(); if (binned_kv_onode_cache != nullptr) { kv_onode_alloc = binned_kv_onode_cache->get_committed_size(); } } if (interval_stats) { dout(5) << __func__ << " cache_size: " << cache_size << " kv_alloc: " << kv_alloc << " kv_used: " << kv_used << " kv_onode_alloc: " << kv_onode_alloc << " kv_onode_used: " << kv_onode_used << " meta_alloc: " << meta_alloc << " meta_used: " << meta_used << " data_alloc: " << data_alloc << " data_used: " << data_used << dendl; } else { dout(20) << __func__ << " cache_size: " << cache_size << " kv_alloc: " << kv_alloc << " kv_used: " << kv_used << " kv_onode_alloc: " << kv_onode_alloc << " kv_onode_used: " << kv_onode_used << " meta_alloc: " << meta_alloc << " meta_used: " << meta_used << " data_alloc: " << data_alloc << " data_used: " << data_used << dendl; } uint64_t max_shard_onodes = static_cast( (meta_alloc / (double) onode_shards) / meta_cache->get_bytes_per_onode()); uint64_t max_shard_buffer = static_cast(data_alloc / buffer_shards); dout(30) << __func__ << " max_shard_onodes: " << max_shard_onodes << " max_shard_buffer: " << max_shard_buffer << dendl; for (auto i : store->onode_cache_shards) { i->set_max(max_shard_onodes); } for (auto i : store->buffer_cache_shards) { i->set_max(max_shard_buffer); } } void 
BlueStore::MempoolThread::_update_cache_settings() { // Nothing to do if pcm is not used. if (pcm == nullptr) { return; } uint64_t target = store->osd_memory_target; uint64_t base = store->osd_memory_base; uint64_t min = store->osd_memory_cache_min; uint64_t max = min; double fragmentation = store->osd_memory_expected_fragmentation; uint64_t ltarget = (1.0 - fragmentation) * target; if (ltarget > base + min) { max = ltarget - base; } // set pcm cache levels pcm->set_target_memory(target); pcm->set_min_memory(min); pcm->set_max_memory(max); dout(5) << __func__ << " updated pcm target: " << target << " pcm min: " << min << " pcm max: " << max << dendl; } // ======================================================= // OmapIteratorImpl #undef dout_prefix #define dout_prefix *_dout << "bluestore.OmapIteratorImpl(" << this << ") " BlueStore::OmapIteratorImpl::OmapIteratorImpl( CollectionRef c, OnodeRef o, KeyValueDB::Iterator it) : c(c), o(o), it(it) { std::shared_lock l(c->lock); if (o->onode.has_omap()) { o->get_omap_key(string(), &head); o->get_omap_tail(&tail); it->lower_bound(head); } } string BlueStore::OmapIteratorImpl::_stringify() const { stringstream s; s << " omap_iterator(cid = " << c->cid <<", oid = " << o->oid << ")"; return s.str(); } int BlueStore::OmapIteratorImpl::seek_to_first() { std::shared_lock l(c->lock); auto start1 = mono_clock::now(); if (o->onode.has_omap()) { it->lower_bound(head); } else { it = KeyValueDB::Iterator(); } c->store->log_latency( __func__, l_bluestore_omap_seek_to_first_lat, mono_clock::now() - start1, c->store->cct->_conf->bluestore_log_omap_iterator_age); return 0; } int BlueStore::OmapIteratorImpl::upper_bound(const string& after) { std::shared_lock l(c->lock); auto start1 = mono_clock::now(); if (o->onode.has_omap()) { string key; o->get_omap_key(after, &key); ldout(c->store->cct,20) << __func__ << " after " << after << " key " << pretty_binary_string(key) << dendl; it->upper_bound(key); } else { it = KeyValueDB::Iterator(); } c->store->log_latency_fn( __func__, l_bluestore_omap_upper_bound_lat, mono_clock::now() - start1, c->store->cct->_conf->bluestore_log_omap_iterator_age, [&] (const ceph::timespan& lat) { return ", after = " + after + _stringify(); } ); return 0; } int BlueStore::OmapIteratorImpl::lower_bound(const string& to) { std::shared_lock l(c->lock); auto start1 = mono_clock::now(); if (o->onode.has_omap()) { string key; o->get_omap_key(to, &key); ldout(c->store->cct,20) << __func__ << " to " << to << " key " << pretty_binary_string(key) << dendl; it->lower_bound(key); } else { it = KeyValueDB::Iterator(); } c->store->log_latency_fn( __func__, l_bluestore_omap_lower_bound_lat, mono_clock::now() - start1, c->store->cct->_conf->bluestore_log_omap_iterator_age, [&] (const ceph::timespan& lat) { return ", to = " + to + _stringify(); } ); return 0; } bool BlueStore::OmapIteratorImpl::valid() { std::shared_lock l(c->lock); bool r = o->onode.has_omap() && it && it->valid() && it->raw_key().second < tail; if (it && it->valid()) { ldout(c->store->cct,20) << __func__ << " is at " << pretty_binary_string(it->raw_key().second) << dendl; } return r; } int BlueStore::OmapIteratorImpl::next() { int r = -1; std::shared_lock l(c->lock); auto start1 = mono_clock::now(); if (o->onode.has_omap()) { it->next(); r = 0; } c->store->log_latency( __func__, l_bluestore_omap_next_lat, mono_clock::now() - start1, c->store->cct->_conf->bluestore_log_omap_iterator_age); return r; } string BlueStore::OmapIteratorImpl::key() { std::shared_lock l(c->lock); 
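  // raw_key().second is the full KV key for this omap entry;
  // decode_omap_key() skips the encoded pool/hash/nid prefix and returns
  // only the user-visible portion of the key.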
ceph_assert(it->valid()); string db_key = it->raw_key().second; string user_key; o->decode_omap_key(db_key, &user_key); return user_key; } bufferlist BlueStore::OmapIteratorImpl::value() { std::shared_lock l(c->lock); ceph_assert(it->valid()); return it->value(); } // ===================================== #undef dout_prefix #define dout_prefix *_dout << "bluestore(" << path << ") " #undef dout_context #define dout_context cct static void aio_cb(void *priv, void *priv2) { BlueStore *store = static_cast(priv); BlueStore::AioContext *c = static_cast(priv2); c->aio_finish(store); } static void discard_cb(void *priv, void *priv2) { BlueStore *store = static_cast(priv); interval_set *tmp = static_cast*>(priv2); store->handle_discard(*tmp); } void BlueStore::handle_discard(interval_set& to_release) { dout(10) << __func__ << dendl; ceph_assert(shared_alloc.a); shared_alloc.a->release(to_release); } BlueStore::BlueStore(CephContext *cct, const string& path) : BlueStore(cct, path, 0) {} BlueStore::BlueStore(CephContext *cct, const string& path, uint64_t _min_alloc_size) : ObjectStore(cct, path), throttle(cct), finisher(cct, "commit_finisher", "cfin"), kv_sync_thread(this), kv_finalize_thread(this), zoned_cleaner_thread(this), min_alloc_size(_min_alloc_size), min_alloc_size_order(ctz(_min_alloc_size)), mempool_thread(this) { _init_logger(); cct->_conf.add_observer(this); set_cache_shards(1); } BlueStore::~BlueStore() { cct->_conf.remove_observer(this); _shutdown_logger(); ceph_assert(!mounted); ceph_assert(db == NULL); ceph_assert(bluefs == NULL); ceph_assert(fsid_fd < 0); ceph_assert(path_fd < 0); for (auto i : onode_cache_shards) { delete i; } for (auto i : buffer_cache_shards) { delete i; } onode_cache_shards.clear(); buffer_cache_shards.clear(); } const char **BlueStore::get_tracked_conf_keys() const { static const char* KEYS[] = { "bluestore_csum_type", "bluestore_compression_mode", "bluestore_compression_algorithm", "bluestore_compression_min_blob_size", "bluestore_compression_min_blob_size_ssd", "bluestore_compression_min_blob_size_hdd", "bluestore_compression_max_blob_size", "bluestore_compression_max_blob_size_ssd", "bluestore_compression_max_blob_size_hdd", "bluestore_compression_required_ratio", "bluestore_max_alloc_size", "bluestore_prefer_deferred_size", "bluestore_prefer_deferred_size_hdd", "bluestore_prefer_deferred_size_ssd", "bluestore_deferred_batch_ops", "bluestore_deferred_batch_ops_hdd", "bluestore_deferred_batch_ops_ssd", "bluestore_throttle_bytes", "bluestore_throttle_deferred_bytes", "bluestore_throttle_cost_per_io_hdd", "bluestore_throttle_cost_per_io_ssd", "bluestore_throttle_cost_per_io", "bluestore_max_blob_size", "bluestore_max_blob_size_ssd", "bluestore_max_blob_size_hdd", "osd_memory_target", "osd_memory_target_cgroup_limit_ratio", "osd_memory_base", "osd_memory_cache_min", "osd_memory_expected_fragmentation", "bluestore_cache_autotune", "bluestore_cache_autotune_interval", "bluestore_warn_on_legacy_statfs", "bluestore_warn_on_no_per_pool_omap", "bluestore_max_defer_interval", NULL }; return KEYS; } void BlueStore::handle_conf_change(const ConfigProxy& conf, const std::set &changed) { if (changed.count("bluestore_warn_on_legacy_statfs")) { _check_legacy_statfs_alert(); } if (changed.count("bluestore_warn_on_no_per_pool_omap") || changed.count("bluestore_warn_on_no_per_pg_omap")) { _check_no_per_pg_or_pool_omap_alert(); } if (changed.count("bluestore_csum_type")) { _set_csum(); } if (changed.count("bluestore_compression_mode") || 
changed.count("bluestore_compression_algorithm") || changed.count("bluestore_compression_min_blob_size") || changed.count("bluestore_compression_max_blob_size")) { if (bdev) { _set_compression(); } } if (changed.count("bluestore_max_blob_size") || changed.count("bluestore_max_blob_size_ssd") || changed.count("bluestore_max_blob_size_hdd")) { if (bdev) { // only after startup _set_blob_size(); } } if (changed.count("bluestore_prefer_deferred_size") || changed.count("bluestore_prefer_deferred_size_hdd") || changed.count("bluestore_prefer_deferred_size_ssd") || changed.count("bluestore_max_alloc_size") || changed.count("bluestore_deferred_batch_ops") || changed.count("bluestore_deferred_batch_ops_hdd") || changed.count("bluestore_deferred_batch_ops_ssd")) { if (bdev) { // only after startup _set_alloc_sizes(); } } if (changed.count("bluestore_throttle_cost_per_io") || changed.count("bluestore_throttle_cost_per_io_hdd") || changed.count("bluestore_throttle_cost_per_io_ssd")) { if (bdev) { _set_throttle_params(); } } if (changed.count("bluestore_throttle_bytes") || changed.count("bluestore_throttle_deferred_bytes") || changed.count("bluestore_throttle_trace_rate")) { throttle.reset_throttle(conf); } if (changed.count("bluestore_max_defer_interval")) { if (bdev) { _set_max_defer_interval(); } } if (changed.count("osd_memory_target") || changed.count("osd_memory_base") || changed.count("osd_memory_cache_min") || changed.count("osd_memory_expected_fragmentation")) { _update_osd_memory_options(); } } void BlueStore::_set_compression() { auto m = Compressor::get_comp_mode_type(cct->_conf->bluestore_compression_mode); if (m) { _clear_compression_alert(); comp_mode = *m; } else { derr << __func__ << " unrecognized value '" << cct->_conf->bluestore_compression_mode << "' for bluestore_compression_mode, reverting to 'none'" << dendl; comp_mode = Compressor::COMP_NONE; string s("unknown mode: "); s += cct->_conf->bluestore_compression_mode; _set_compression_alert(true, s.c_str()); } compressor = nullptr; if (cct->_conf->bluestore_compression_min_blob_size) { comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size; } else { ceph_assert(bdev); if (_use_rotational_settings()) { comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_hdd; } else { comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_ssd; } } if (cct->_conf->bluestore_compression_max_blob_size) { comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size; } else { ceph_assert(bdev); if (_use_rotational_settings()) { comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_hdd; } else { comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_ssd; } } auto& alg_name = cct->_conf->bluestore_compression_algorithm; if (!alg_name.empty()) { compressor = Compressor::create(cct, alg_name); if (!compressor) { derr << __func__ << " unable to initialize " << alg_name.c_str() << " compressor" << dendl; _set_compression_alert(false, alg_name.c_str()); } } dout(10) << __func__ << " mode " << Compressor::get_comp_mode_name(comp_mode) << " alg " << (compressor ? 
	   compressor->get_type_name() : "(none)")
	   << " min_blob " << comp_min_blob_size
	   << " max_blob " << comp_max_blob_size
	   << dendl;
}

void BlueStore::_set_csum()
{
  csum_type = Checksummer::CSUM_NONE;
  int t = Checksummer::get_csum_string_type(cct->_conf->bluestore_csum_type);
  if (t > Checksummer::CSUM_NONE)
    csum_type = t;

  dout(10) << __func__ << " csum_type "
	   << Checksummer::get_csum_type_string(csum_type)
	   << dendl;
}

void BlueStore::_set_throttle_params()
{
  if (cct->_conf->bluestore_throttle_cost_per_io) {
    throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io;
  } else {
    ceph_assert(bdev);
    if (_use_rotational_settings()) {
      throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_hdd;
    } else {
      throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_ssd;
    }
  }

  dout(10) << __func__ << " throttle_cost_per_io " << throttle_cost_per_io
	   << dendl;
}

void BlueStore::_set_blob_size()
{
  if (cct->_conf->bluestore_max_blob_size) {
    max_blob_size = cct->_conf->bluestore_max_blob_size;
  } else {
    ceph_assert(bdev);
    if (_use_rotational_settings()) {
      max_blob_size = cct->_conf->bluestore_max_blob_size_hdd;
    } else {
      max_blob_size = cct->_conf->bluestore_max_blob_size_ssd;
    }
  }
  dout(10) << __func__ << " max_blob_size 0x" << std::hex << max_blob_size
	   << std::dec << dendl;
}

void BlueStore::_update_osd_memory_options()
{
  osd_memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
  osd_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
  osd_memory_expected_fragmentation =
    cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
  osd_memory_cache_min = cct->_conf.get_val<Option::size_t>("osd_memory_cache_min");
  config_changed++;
  dout(10) << __func__
	   << " osd_memory_target " << osd_memory_target
	   << " osd_memory_base " << osd_memory_base
	   << " osd_memory_expected_fragmentation " << osd_memory_expected_fragmentation
	   << " osd_memory_cache_min " << osd_memory_cache_min
	   << dendl;
}

int BlueStore::_set_cache_sizes()
{
  ceph_assert(bdev);
  cache_autotune = cct->_conf.get_val<bool>("bluestore_cache_autotune");
  cache_autotune_interval =
    cct->_conf.get_val<double>("bluestore_cache_autotune_interval");
  osd_memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
  osd_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
  osd_memory_expected_fragmentation =
    cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
  osd_memory_cache_min = cct->_conf.get_val<Option::size_t>("osd_memory_cache_min");
  osd_memory_cache_resize_interval =
    cct->_conf.get_val<double>("osd_memory_cache_resize_interval");

  if (cct->_conf->bluestore_cache_size) {
    cache_size = cct->_conf->bluestore_cache_size;
  } else {
    // choose global cache size based on backend type
    if (_use_rotational_settings()) {
      cache_size = cct->_conf->bluestore_cache_size_hdd;
    } else {
      cache_size = cct->_conf->bluestore_cache_size_ssd;
    }
  }

  cache_meta_ratio = cct->_conf.get_val<double>("bluestore_cache_meta_ratio");
  if (cache_meta_ratio < 0 || cache_meta_ratio > 1.0) {
    derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
	 << ") must be in range [0,1.0]" << dendl;
    return -EINVAL;
  }

  cache_kv_ratio = cct->_conf.get_val<double>("bluestore_cache_kv_ratio");
  if (cache_kv_ratio < 0 || cache_kv_ratio > 1.0) {
    derr << __func__ << " bluestore_cache_kv_ratio (" << cache_kv_ratio
	 << ") must be in range [0,1.0]" << dendl;
    return -EINVAL;
  }

  cache_kv_onode_ratio = cct->_conf.get_val<double>("bluestore_cache_kv_onode_ratio");
  if (cache_kv_onode_ratio < 0 || cache_kv_onode_ratio > 1.0) {
    derr << __func__ << " bluestore_cache_kv_onode_ratio (" << cache_kv_onode_ratio
	 << ") must be in range [0,1.0]" << dendl;
    return -EINVAL;
  }

  if (cache_meta_ratio +
cache_kv_ratio > 1.0) { derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio << ") + bluestore_cache_kv_ratio (" << cache_kv_ratio << ") = " << cache_meta_ratio + cache_kv_ratio << "; must be <= 1.0" << dendl; return -EINVAL; } cache_data_ratio = (double)1.0 - (double)cache_meta_ratio - (double)cache_kv_ratio - (double)cache_kv_onode_ratio; if (cache_data_ratio < 0) { // deal with floating point imprecision cache_data_ratio = 0; } dout(1) << __func__ << " cache_size " << cache_size << " meta " << cache_meta_ratio << " kv " << cache_kv_ratio << " data " << cache_data_ratio << dendl; return 0; } int BlueStore::write_meta(const std::string& key, const std::string& value) { bluestore_bdev_label_t label; string p = path + "/block"; int r = _read_bdev_label(cct, p, &label); if (r < 0) { return ObjectStore::write_meta(key, value); } label.meta[key] = value; r = _write_bdev_label(cct, p, label); ceph_assert(r == 0); return ObjectStore::write_meta(key, value); } int BlueStore::read_meta(const std::string& key, std::string *value) { bluestore_bdev_label_t label; string p = path + "/block"; int r = _read_bdev_label(cct, p, &label); if (r < 0) { return ObjectStore::read_meta(key, value); } auto i = label.meta.find(key); if (i == label.meta.end()) { return ObjectStore::read_meta(key, value); } *value = i->second; return 0; } void BlueStore::_init_logger() { PerfCountersBuilder b(cct, "bluestore", l_bluestore_first, l_bluestore_last); b.add_time_avg(l_bluestore_kv_flush_lat, "kv_flush_lat", "Average kv_thread flush latency", "fl_l", PerfCountersBuilder::PRIO_INTERESTING); b.add_time_avg(l_bluestore_kv_commit_lat, "kv_commit_lat", "Average kv_thread commit latency"); b.add_time_avg(l_bluestore_kv_sync_lat, "kv_sync_lat", "Average kv_sync thread latency", "ks_l", PerfCountersBuilder::PRIO_INTERESTING); b.add_time_avg(l_bluestore_kv_final_lat, "kv_final_lat", "Average kv_finalize thread latency", "kf_l", PerfCountersBuilder::PRIO_INTERESTING); b.add_time_avg(l_bluestore_state_prepare_lat, "state_prepare_lat", "Average prepare state latency"); b.add_time_avg(l_bluestore_state_aio_wait_lat, "state_aio_wait_lat", "Average aio_wait state latency", "io_l", PerfCountersBuilder::PRIO_INTERESTING); b.add_time_avg(l_bluestore_state_io_done_lat, "state_io_done_lat", "Average io_done state latency"); b.add_time_avg(l_bluestore_state_kv_queued_lat, "state_kv_queued_lat", "Average kv_queued state latency"); b.add_time_avg(l_bluestore_state_kv_committing_lat, "state_kv_commiting_lat", "Average kv_commiting state latency"); b.add_time_avg(l_bluestore_state_kv_done_lat, "state_kv_done_lat", "Average kv_done state latency"); b.add_time_avg(l_bluestore_state_deferred_queued_lat, "state_deferred_queued_lat", "Average deferred_queued state latency"); b.add_time_avg(l_bluestore_state_deferred_aio_wait_lat, "state_deferred_aio_wait_lat", "Average aio_wait state latency"); b.add_time_avg(l_bluestore_state_deferred_cleanup_lat, "state_deferred_cleanup_lat", "Average cleanup state latency"); b.add_time_avg(l_bluestore_state_finishing_lat, "state_finishing_lat", "Average finishing state latency"); b.add_time_avg(l_bluestore_state_done_lat, "state_done_lat", "Average done state latency"); b.add_time_avg(l_bluestore_throttle_lat, "throttle_lat", "Average submit throttle latency", "th_l", PerfCountersBuilder::PRIO_CRITICAL); b.add_time_avg(l_bluestore_submit_lat, "submit_lat", "Average submit latency", "s_l", PerfCountersBuilder::PRIO_CRITICAL); b.add_time_avg(l_bluestore_commit_lat, "commit_lat", "Average commit 
latency", "c_l", PerfCountersBuilder::PRIO_CRITICAL); b.add_time_avg(l_bluestore_read_lat, "read_lat", "Average read latency", "r_l", PerfCountersBuilder::PRIO_CRITICAL); b.add_time_avg(l_bluestore_read_onode_meta_lat, "read_onode_meta_lat", "Average read onode metadata latency"); b.add_time_avg(l_bluestore_read_wait_aio_lat, "read_wait_aio_lat", "Average read latency"); b.add_time_avg(l_bluestore_compress_lat, "compress_lat", "Average compress latency"); b.add_time_avg(l_bluestore_decompress_lat, "decompress_lat", "Average decompress latency"); b.add_time_avg(l_bluestore_csum_lat, "csum_lat", "Average checksum latency"); b.add_u64_counter(l_bluestore_compress_success_count, "compress_success_count", "Sum for beneficial compress ops"); b.add_u64_counter(l_bluestore_compress_rejected_count, "compress_rejected_count", "Sum for compress ops rejected due to low net gain of space"); b.add_u64_counter(l_bluestore_write_pad_bytes, "write_pad_bytes", "Sum for write-op padded bytes", NULL, 0, unit_t(UNIT_BYTES)); b.add_u64_counter(l_bluestore_deferred_write_ops, "deferred_write_ops", "Sum for deferred write op"); b.add_u64_counter(l_bluestore_deferred_write_bytes, "deferred_write_bytes", "Sum for deferred write bytes", "def", 0, unit_t(UNIT_BYTES)); b.add_u64_counter(l_bluestore_write_penalty_read_ops, "write_penalty_read_ops", "Sum for write penalty read ops"); b.add_u64(l_bluestore_allocated, "bluestore_allocated", "Sum for allocated bytes"); b.add_u64(l_bluestore_stored, "bluestore_stored", "Sum for stored bytes"); b.add_u64(l_bluestore_compressed, "bluestore_compressed", "Sum for stored compressed bytes", "c", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); b.add_u64(l_bluestore_compressed_allocated, "bluestore_compressed_allocated", "Sum for bytes allocated for compressed data", "c_a", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); b.add_u64(l_bluestore_compressed_original, "bluestore_compressed_original", "Sum for original bytes that were compressed", "c_o", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); b.add_u64(l_bluestore_onodes, "bluestore_onodes", "Number of onodes in cache"); b.add_u64(l_bluestore_pinned_onodes, "bluestore_pinned_onodes", "Number of pinned onodes in cache"); b.add_u64_counter(l_bluestore_onode_hits, "onode_hits", "Count of onode cache lookup hits", "o_ht", PerfCountersBuilder::PRIO_USEFUL); b.add_u64_counter(l_bluestore_onode_misses, "onode_misses", "Count of onode cache lookup misses", "o_ms", PerfCountersBuilder::PRIO_USEFUL); b.add_u64_counter(l_bluestore_onode_shard_hits, "onode_shard_hits", "Sum for onode-shard lookups hit in the cache"); b.add_u64_counter(l_bluestore_onode_shard_misses, "bluestore_onode_shard_misses", "Sum for onode-shard lookups missed in the cache"); b.add_u64(l_bluestore_extents, "bluestore_extents", "Number of extents in cache"); b.add_u64(l_bluestore_blobs, "bluestore_blobs", "Number of blobs in cache"); b.add_u64(l_bluestore_buffers, "bluestore_buffers", "Number of buffers in cache"); b.add_u64(l_bluestore_buffer_bytes, "bluestore_buffer_bytes", "Number of buffer bytes in cache", NULL, 0, unit_t(UNIT_BYTES)); b.add_u64_counter(l_bluestore_buffer_hit_bytes, "bluestore_buffer_hit_bytes", "Sum for bytes of read hit in the cache", NULL, 0, unit_t(UNIT_BYTES)); b.add_u64_counter(l_bluestore_buffer_miss_bytes, "bluestore_buffer_miss_bytes", "Sum for bytes of read missed in the cache", NULL, 0, unit_t(UNIT_BYTES)); b.add_u64_counter(l_bluestore_write_big, "bluestore_write_big", "Large aligned writes into fresh blobs"); 
b.add_u64_counter(l_bluestore_write_big_bytes, "bluestore_write_big_bytes", "Large aligned writes into fresh blobs (bytes)", NULL, 0, unit_t(UNIT_BYTES)); b.add_u64_counter(l_bluestore_write_big_blobs, "bluestore_write_big_blobs", "Large aligned writes into fresh blobs (blobs)"); b.add_u64_counter(l_bluestore_write_big_deferred, "bluestore_write_big_deferred", "Big overwrites using deferred"); b.add_u64_counter(l_bluestore_write_small, "bluestore_write_small", "Small writes into existing or sparse small blobs"); b.add_u64_counter(l_bluestore_write_small_bytes, "bluestore_write_small_bytes", "Small writes into existing or sparse small blobs (bytes)", NULL, 0, unit_t(UNIT_BYTES)); b.add_u64_counter(l_bluestore_write_small_unused, "bluestore_write_small_unused", "Small writes into unused portion of existing blob"); b.add_u64_counter(l_bluestore_write_deferred, "bluestore_write_deferred", "Total deferred writes submitted"); b.add_u64_counter(l_bluestore_write_deferred_bytes, "bluestore_write_deferred_bytes", "Total bytes submitted as deferred writes"); b.add_u64_counter(l_bluestore_write_small_pre_read, "bluestore_write_small_pre_read", "Small writes that required we read some data (possibly " "cached) to fill out the block"); b.add_u64_counter(l_bluestore_write_new, "bluestore_write_new", "Write into new blob"); b.add_u64_counter(l_bluestore_txc, "bluestore_txc", "Transactions committed"); b.add_u64_counter(l_bluestore_onode_reshard, "bluestore_onode_reshard", "Onode extent map reshard events"); b.add_u64_counter(l_bluestore_blob_split, "bluestore_blob_split", "Sum for blob splitting due to resharding"); b.add_u64_counter(l_bluestore_extent_compress, "bluestore_extent_compress", "Sum for extents that have been removed due to compression"); b.add_u64_counter(l_bluestore_gc_merged, "bluestore_gc_merged", "Sum for extents that have been merged due to garbage " "collection"); b.add_u64_counter(l_bluestore_read_eio, "bluestore_read_eio", "Read EIO errors propagated to high level callers"); b.add_u64_counter(l_bluestore_reads_with_retries, "bluestore_reads_with_retries", "Read operations that required at least one retry due to failed checksum validation"); b.add_u64(l_bluestore_fragmentation, "bluestore_fragmentation_micros", "How fragmented bluestore free space is (free extents / max possible number of free extents) * 1000"); b.add_time_avg(l_bluestore_omap_seek_to_first_lat, "omap_seek_to_first_lat", "Average omap iterator seek_to_first call latency"); b.add_time_avg(l_bluestore_omap_upper_bound_lat, "omap_upper_bound_lat", "Average omap iterator upper_bound call latency"); b.add_time_avg(l_bluestore_omap_lower_bound_lat, "omap_lower_bound_lat", "Average omap iterator lower_bound call latency"); b.add_time_avg(l_bluestore_omap_next_lat, "omap_next_lat", "Average omap iterator next call latency"); b.add_time_avg(l_bluestore_omap_get_keys_lat, "omap_get_keys_lat", "Average omap get_keys call latency"); b.add_time_avg(l_bluestore_omap_get_values_lat, "omap_get_values_lat", "Average omap get_values call latency"); b.add_time_avg(l_bluestore_clist_lat, "clist_lat", "Average collection listing latency"); b.add_time_avg(l_bluestore_remove_lat, "remove_lat", "Average removal latency"); logger = b.create_perf_counters(); cct->get_perfcounters_collection()->add(logger); } int BlueStore::_reload_logger() { struct store_statfs_t store_statfs; int r = statfs(&store_statfs); if (r >= 0) { logger->set(l_bluestore_allocated, store_statfs.allocated); logger->set(l_bluestore_stored, store_statfs.data_stored); 
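  // ======================================================================
  // Illustrative sketch, not part of BlueStore proper: one reading of the
  // "bluestore_fragmentation_micros" description registered above -- the
  // number of free extents divided by the maximum number of extents the
  // same free space could be split into (free bytes / allocation unit),
  // scaled by 1000.  The real score comes from the allocator's
  // get_fragmentation(); this standalone function and its name are
  // hypothetical and only illustrate the ratio.
#if 0
  static uint64_t example_fragmentation_score(uint64_t num_free_extents,
					      uint64_t free_bytes,
					      uint64_t alloc_unit)
  {
    if (free_bytes < alloc_unit || num_free_extents == 0) {
      return 0;  // nothing free, or less than one allocation unit free
    }
    uint64_t max_extents = free_bytes / alloc_unit;  // fully fragmented case
    return num_free_extents * 1000 / max_extents;    // 1000 == worst case
  }
  // e.g. 10 free extents covering 10 MiB with a 64 KiB unit:
  //   max_extents = 160, score = 10 * 1000 / 160 = 62
#endif
  // ======================================================================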
logger->set(l_bluestore_compressed, store_statfs.data_compressed); logger->set(l_bluestore_compressed_allocated, store_statfs.data_compressed_allocated); logger->set(l_bluestore_compressed_original, store_statfs.data_compressed_original); } return r; } void BlueStore::_shutdown_logger() { cct->get_perfcounters_collection()->remove(logger); delete logger; } int BlueStore::get_block_device_fsid(CephContext* cct, const string& path, uuid_d *fsid) { bluestore_bdev_label_t label; int r = _read_bdev_label(cct, path, &label); if (r < 0) return r; *fsid = label.osd_uuid; return 0; } int BlueStore::_open_path() { // sanity check(s) ceph_assert(path_fd < 0); path_fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_DIRECTORY|O_CLOEXEC)); if (path_fd < 0) { int r = -errno; derr << __func__ << " unable to open " << path << ": " << cpp_strerror(r) << dendl; return r; } return 0; } void BlueStore::_close_path() { VOID_TEMP_FAILURE_RETRY(::close(path_fd)); path_fd = -1; } int BlueStore::_write_bdev_label(CephContext *cct, string path, bluestore_bdev_label_t label) { dout(10) << __func__ << " path " << path << " label " << label << dendl; bufferlist bl; encode(label, bl); uint32_t crc = bl.crc32c(-1); encode(crc, bl); ceph_assert(bl.length() <= BDEV_LABEL_BLOCK_SIZE); bufferptr z(BDEV_LABEL_BLOCK_SIZE - bl.length()); z.zero(); bl.append(std::move(z)); int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_WRONLY|O_CLOEXEC|O_DIRECT)); if (fd < 0) { fd = -errno; derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd) << dendl; return fd; } bl.rebuild_aligned_size_and_memory(BDEV_LABEL_BLOCK_SIZE, BDEV_LABEL_BLOCK_SIZE, IOV_MAX); int r = bl.write_fd(fd); if (r < 0) { derr << __func__ << " failed to write to " << path << ": " << cpp_strerror(r) << dendl; goto out; } r = ::fsync(fd); if (r < 0) { derr << __func__ << " failed to fsync " << path << ": " << cpp_strerror(r) << dendl; } out: VOID_TEMP_FAILURE_RETRY(::close(fd)); return r; } int BlueStore::_read_bdev_label(CephContext* cct, string path, bluestore_bdev_label_t *label) { dout(10) << __func__ << dendl; int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_RDONLY|O_CLOEXEC)); if (fd < 0) { fd = -errno; derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd) << dendl; return fd; } bufferlist bl; int r = bl.read_fd(fd, BDEV_LABEL_BLOCK_SIZE); VOID_TEMP_FAILURE_RETRY(::close(fd)); if (r < 0) { derr << __func__ << " failed to read from " << path << ": " << cpp_strerror(r) << dendl; return r; } uint32_t crc, expected_crc; auto p = bl.cbegin(); try { decode(*label, p); bufferlist t; t.substr_of(bl, 0, p.get_off()); crc = t.crc32c(-1); decode(expected_crc, p); } catch (ceph::buffer::error& e) { dout(2) << __func__ << " unable to decode label at offset " << p.get_off() << ": " << e.what() << dendl; return -ENOENT; } if (crc != expected_crc) { derr << __func__ << " bad crc on label, expected " << expected_crc << " != actual " << crc << dendl; return -EIO; } dout(10) << __func__ << " got " << *label << dendl; return 0; } int BlueStore::_check_or_set_bdev_label( string path, uint64_t size, string desc, bool create) { bluestore_bdev_label_t label; if (create) { label.osd_uuid = fsid; label.size = size; label.btime = ceph_clock_now(); label.description = desc; int r = _write_bdev_label(cct, path, label); if (r < 0) return r; } else { int r = _read_bdev_label(cct, path, &label); if (r < 0) return r; if (cct->_conf->bluestore_debug_permit_any_bdev_label) { dout(20) << __func__ << " bdev " << path << " fsid " << label.osd_uuid << " and fsid " 
<< fsid << " check bypassed" << dendl; } else if (label.osd_uuid != fsid) { derr << __func__ << " bdev " << path << " fsid " << label.osd_uuid << " does not match our fsid " << fsid << dendl; return -EIO; } } return 0; } void BlueStore::_set_alloc_sizes(void) { max_alloc_size = cct->_conf->bluestore_max_alloc_size; if (cct->_conf->bluestore_prefer_deferred_size) { prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size; } else { ceph_assert(bdev); if (_use_rotational_settings()) { prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_hdd; } else { prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_ssd; } } if (cct->_conf->bluestore_deferred_batch_ops) { deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops; } else { ceph_assert(bdev); if (_use_rotational_settings()) { deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_hdd; } else { deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_ssd; } } dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size << std::dec << " order " << (int)min_alloc_size_order << " max_alloc_size 0x" << std::hex << max_alloc_size << " prefer_deferred_size 0x" << prefer_deferred_size << std::dec << " deferred_batch_ops " << deferred_batch_ops << dendl; } int BlueStore::_open_bdev(bool create) { ceph_assert(bdev == NULL); string p = path + "/block"; bdev = BlockDevice::create(cct, p, aio_cb, static_cast(this), discard_cb, static_cast(this)); int r = bdev->open(p); if (r < 0) goto fail; if (create && cct->_conf->bdev_enable_discard) { bdev->discard(0, bdev->get_size()); } if (bdev->supported_bdev_label()) { r = _check_or_set_bdev_label(p, bdev->get_size(), "main", create); if (r < 0) goto fail_close; } // initialize global block parameters block_size = bdev->get_block_size(); block_mask = ~(block_size - 1); block_size_order = ctz(block_size); ceph_assert(block_size == 1u << block_size_order); _set_max_defer_interval(); // and set cache_size based on device type r = _set_cache_sizes(); if (r < 0) { goto fail_close; } if (bdev->is_smr()) { freelist_type = "zoned"; } return 0; fail_close: bdev->close(); fail: delete bdev; bdev = NULL; return r; } void BlueStore::_validate_bdev() { ceph_assert(bdev); uint64_t dev_size = bdev->get_size(); ceph_assert(dev_size > _get_ondisk_reserved()); } void BlueStore::_close_bdev() { ceph_assert(bdev); bdev->close(); delete bdev; bdev = NULL; } int BlueStore::_open_fm(KeyValueDB::Transaction t, bool read_only) { int r; ceph_assert(fm == NULL); fm = FreelistManager::create(cct, freelist_type, PREFIX_ALLOC); ceph_assert(fm); if (t) { // create mode. initialize freespace dout(20) << __func__ << " initializing freespace" << dendl; { bufferlist bl; bl.append(freelist_type); t->set(PREFIX_SUPER, "freelist_type", bl); } // being able to allocate in units less than bdev block size // seems to be a bad idea. ceph_assert( cct->_conf->bdev_block_size <= (int64_t)min_alloc_size); uint64_t alloc_size = min_alloc_size; if (bdev->is_smr()) { alloc_size = _zoned_piggyback_device_parameters_onto(alloc_size); } fm->create(bdev->get_size(), alloc_size, t); // allocate superblock reserved space. note that we do not mark // bluefs space as allocated in the freelist; we instead rely on // bluefs doing that itself. 
auto reserved = _get_ondisk_reserved(); fm->allocate(0, reserved, t); if (cct->_conf->bluestore_debug_prefill > 0) { uint64_t end = bdev->get_size() - reserved; dout(1) << __func__ << " pre-fragmenting freespace, using " << cct->_conf->bluestore_debug_prefill << " with max free extent " << cct->_conf->bluestore_debug_prefragment_max << dendl; uint64_t start = p2roundup(reserved, min_alloc_size); uint64_t max_b = cct->_conf->bluestore_debug_prefragment_max / min_alloc_size; float r = cct->_conf->bluestore_debug_prefill; r /= 1.0 - r; bool stop = false; while (!stop && start < end) { uint64_t l = (rand() % max_b + 1) * min_alloc_size; if (start + l > end) { l = end - start; l = p2align(l, min_alloc_size); } ceph_assert(start + l <= end); uint64_t u = 1 + (uint64_t)(r * (double)l); u = p2roundup(u, min_alloc_size); if (start + l + u > end) { u = end - (start + l); // trim to align so we don't overflow again u = p2align(u, min_alloc_size); stop = true; } ceph_assert(start + l + u <= end); dout(20) << __func__ << " free 0x" << std::hex << start << "~" << l << " use 0x" << u << std::dec << dendl; if (u == 0) { // break if u has been trimmed to nothing break; } fm->allocate(start + l, u, t); start += l + u; } } r = _write_out_fm_meta(0); ceph_assert(r == 0); } else { r = fm->init(db, read_only, [&](const std::string& key, std::string* result) { return read_meta(key, result); }); if (r < 0) { derr << __func__ << " freelist init failed: " << cpp_strerror(r) << dendl; delete fm; fm = NULL; return r; } } // if space size tracked by free list manager is that higher than actual // dev size one can hit out-of-space allocation which will result // in data loss and/or assertions // Probably user altered the device size somehow. // The only fix for now is to redeploy OSD. 
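  // ======================================================================
  // Illustrative sketch, not part of BlueStore proper: the check that
  // follows only raises the disk-size-mismatch alert when the size tracked
  // by the freelist manager exceeds the device size by at least one full
  // allocation unit; anything smaller is tolerated.  This standalone
  // predicate just restates that condition; its name is hypothetical.
#if 0
  static bool example_fm_larger_than_device(uint64_t fm_size,
					    uint64_t dev_size,
					    uint64_t min_alloc_size)
  {
    // true only when the mismatch is at least one allocation unit,
    // i.e. the freelist could hand out space past the end of the device
    return fm_size >= dev_size + min_alloc_size;
  }
#endif
  // ======================================================================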
if (fm->get_size() >= bdev->get_size() + min_alloc_size) { ostringstream ss; ss << "slow device size mismatch detected, " << " fm size(" << fm->get_size() << ") > slow device size(" << bdev->get_size() << "), Please stop using this OSD as it might cause data loss."; _set_disk_size_mismatch_alert(ss.str()); } return 0; } void BlueStore::_close_fm() { dout(10) << __func__ << dendl; ceph_assert(fm); fm->shutdown(); delete fm; fm = NULL; } int BlueStore::_write_out_fm_meta(uint64_t target_size) { int r = 0; string p = path + "/block"; std::vector> fm_meta; fm->get_meta(target_size, &fm_meta); for (auto& m : fm_meta) { r = write_meta(m.first, m.second); ceph_assert(r == 0); } return r; } int BlueStore::_create_alloc() { ceph_assert(shared_alloc.a == NULL); ceph_assert(bdev->get_size()); uint64_t alloc_size = min_alloc_size; if (bdev->is_smr()) { int r = _zoned_check_config_settings(); if (r < 0) return r; alloc_size = _zoned_piggyback_device_parameters_onto(alloc_size); } shared_alloc.set(Allocator::create(cct, cct->_conf->bluestore_allocator, bdev->get_size(), alloc_size, "block")); if (!shared_alloc.a) { lderr(cct) << __func__ << "Failed to create allocator:: " << cct->_conf->bluestore_allocator << dendl; return -EINVAL; } return 0; } int BlueStore::_init_alloc() { int r = _create_alloc(); if (r < 0) { return r; } ceph_assert(shared_alloc.a != NULL); if (bdev->is_smr()) { shared_alloc.a->zoned_set_zone_states(fm->get_zone_states(db)); } uint64_t num = 0, bytes = 0; dout(1) << __func__ << " opening allocation metadata" << dendl; // initialize from freelist fm->enumerate_reset(); uint64_t offset, length; while (fm->enumerate_next(db, &offset, &length)) { shared_alloc.a->init_add_free(offset, length); ++num; bytes += length; } fm->enumerate_reset(); dout(1) << __func__ << " loaded " << byte_u_t(bytes) << " in " << num << " extents" << std::hex << ", allocator type " << shared_alloc.a->get_type() << ", capacity 0x" << shared_alloc.a->get_capacity() << ", block size 0x" << shared_alloc.a->get_block_size() << ", free 0x" << shared_alloc.a->get_free() << ", fragmentation " << shared_alloc.a->get_fragmentation() << std::dec << dendl; return 0; } void BlueStore::_close_alloc() { ceph_assert(bdev); bdev->discard_drain(); ceph_assert(shared_alloc.a); shared_alloc.a->shutdown(); delete shared_alloc.a; shared_alloc.reset(); } int BlueStore::_open_fsid(bool create) { ceph_assert(fsid_fd < 0); int flags = O_RDWR|O_CLOEXEC; if (create) flags |= O_CREAT; fsid_fd = ::openat(path_fd, "fsid", flags, 0644); if (fsid_fd < 0) { int err = -errno; derr << __func__ << " " << cpp_strerror(err) << dendl; return err; } return 0; } int BlueStore::_read_fsid(uuid_d *uuid) { char fsid_str[40]; memset(fsid_str, 0, sizeof(fsid_str)); int ret = safe_read(fsid_fd, fsid_str, sizeof(fsid_str)); if (ret < 0) { derr << __func__ << " failed: " << cpp_strerror(ret) << dendl; return ret; } if (ret > 36) fsid_str[36] = 0; else fsid_str[ret] = 0; if (!uuid->parse(fsid_str)) { derr << __func__ << " unparsable uuid " << fsid_str << dendl; return -EINVAL; } return 0; } int BlueStore::_write_fsid() { int r = ::ftruncate(fsid_fd, 0); if (r < 0) { r = -errno; derr << __func__ << " fsid truncate failed: " << cpp_strerror(r) << dendl; return r; } string str = stringify(fsid) + "\n"; r = safe_write(fsid_fd, str.c_str(), str.length()); if (r < 0) { derr << __func__ << " fsid write failed: " << cpp_strerror(r) << dendl; return r; } r = ::fsync(fsid_fd); if (r < 0) { r = -errno; derr << __func__ << " fsid fsync failed: " << cpp_strerror(r) << 
dendl; return r; } return 0; } void BlueStore::_close_fsid() { VOID_TEMP_FAILURE_RETRY(::close(fsid_fd)); fsid_fd = -1; } int BlueStore::_lock_fsid() { struct flock l; memset(&l, 0, sizeof(l)); l.l_type = F_WRLCK; l.l_whence = SEEK_SET; int r = ::fcntl(fsid_fd, F_SETLK, &l); if (r < 0) { int err = errno; derr << __func__ << " failed to lock " << path << "/fsid" << " (is another ceph-osd still running?)" << cpp_strerror(err) << dendl; return -err; } return 0; } bool BlueStore::is_rotational() { if (bdev) { return bdev->is_rotational(); } bool rotational = true; int r = _open_path(); if (r < 0) goto out; r = _open_fsid(false); if (r < 0) goto out_path; r = _read_fsid(&fsid); if (r < 0) goto out_fsid; r = _lock_fsid(); if (r < 0) goto out_fsid; r = _open_bdev(false); if (r < 0) goto out_fsid; rotational = bdev->is_rotational(); _close_bdev(); out_fsid: _close_fsid(); out_path: _close_path(); out: return rotational; } bool BlueStore::is_journal_rotational() { if (!bluefs) { dout(5) << __func__ << " bluefs disabled, default to store media type" << dendl; return is_rotational(); } dout(10) << __func__ << " " << (int)bluefs->wal_is_rotational() << dendl; return bluefs->wal_is_rotational(); } bool BlueStore::_use_rotational_settings() { if (cct->_conf->bluestore_debug_enforce_settings == "hdd") { return true; } if (cct->_conf->bluestore_debug_enforce_settings == "ssd") { return false; } return bdev->is_rotational(); } bool BlueStore::test_mount_in_use() { // most error conditions mean the mount is not in use (e.g., because // it doesn't exist). only if we fail to lock do we conclude it is // in use. bool ret = false; int r = _open_path(); if (r < 0) return false; r = _open_fsid(false); if (r < 0) goto out_path; r = _lock_fsid(); if (r < 0) ret = true; // if we can't lock, it is in use _close_fsid(); out_path: _close_path(); return ret; } int BlueStore::_minimal_open_bluefs(bool create) { int r; bluefs = new BlueFS(cct); string bfn; struct stat st; bfn = path + "/block.db"; if (::stat(bfn.c_str(), &st) == 0) { r = bluefs->add_block_device( BlueFS::BDEV_DB, bfn, create && cct->_conf->bdev_enable_discard, SUPER_RESERVED); if (r < 0) { derr << __func__ << " add block device(" << bfn << ") returned: " << cpp_strerror(r) << dendl; goto free_bluefs; } if (bluefs->bdev_support_label(BlueFS::BDEV_DB)) { r = _check_or_set_bdev_label( bfn, bluefs->get_block_device_size(BlueFS::BDEV_DB), "bluefs db", create); if (r < 0) { derr << __func__ << " check block device(" << bfn << ") label returned: " << cpp_strerror(r) << dendl; goto free_bluefs; } } bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW; bluefs_layout.dedicated_db = true; } else { r = -errno; if (::lstat(bfn.c_str(), &st) == -1) { r = 0; bluefs_layout.shared_bdev = BlueFS::BDEV_DB; } else { derr << __func__ << " " << bfn << " symlink exists but target unusable: " << cpp_strerror(r) << dendl; goto free_bluefs; } } // shared device bfn = path + "/block"; // never trim here r = bluefs->add_block_device(bluefs_layout.shared_bdev, bfn, false, 0, // no need to provide valid 'reserved' for shared dev &shared_alloc); if (r < 0) { derr << __func__ << " add block device(" << bfn << ") returned: " << cpp_strerror(r) << dendl; goto free_bluefs; } bfn = path + "/block.wal"; if (::stat(bfn.c_str(), &st) == 0) { r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn, create && cct->_conf->bdev_enable_discard, BDEV_LABEL_BLOCK_SIZE); if (r < 0) { derr << __func__ << " add block device(" << bfn << ") returned: " << cpp_strerror(r) << dendl; goto free_bluefs; } if 
(bluefs->bdev_support_label(BlueFS::BDEV_WAL)) { r = _check_or_set_bdev_label( bfn, bluefs->get_block_device_size(BlueFS::BDEV_WAL), "bluefs wal", create); if (r < 0) { derr << __func__ << " check block device(" << bfn << ") label returned: " << cpp_strerror(r) << dendl; goto free_bluefs; } } bluefs_layout.dedicated_wal = true; } else { r = 0; if (::lstat(bfn.c_str(), &st) != -1) { r = -errno; derr << __func__ << " " << bfn << " symlink exists but target unusable: " << cpp_strerror(r) << dendl; goto free_bluefs; } } return 0; free_bluefs: ceph_assert(bluefs); delete bluefs; bluefs = NULL; return r; } int BlueStore::_open_bluefs(bool create, bool read_only) { int r = _minimal_open_bluefs(create); if (r < 0) { return r; } BlueFSVolumeSelector* vselector = nullptr; if (bluefs_layout.shared_bdev == BlueFS::BDEV_SLOW) { string options = cct->_conf->bluestore_rocksdb_options; string options_annex = cct->_conf->bluestore_rocksdb_options_annex; if (!options_annex.empty()) { if (!options.empty() && *options.rbegin() != ',') { options += ','; } options += options_annex; } rocksdb::Options rocks_opts; r = RocksDBStore::ParseOptionsFromStringStatic( cct, options, rocks_opts, nullptr); if (r < 0) { return r; } if (cct->_conf->bluestore_volume_selection_policy == "fit_to_fast") { vselector = new FitToFastVolumeSelector( bluefs->get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100, bluefs->get_block_device_size(BlueFS::BDEV_DB) * 95 / 100, bluefs->get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100); } else { double reserved_factor = cct->_conf->bluestore_volume_selection_reserved_factor; vselector = new RocksDBBlueFSVolumeSelector( bluefs->get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100, bluefs->get_block_device_size(BlueFS::BDEV_DB) * 95 / 100, bluefs->get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100, 1024 * 1024 * 1024, //FIXME: set expected l0 size here rocks_opts.max_bytes_for_level_base, rocks_opts.max_bytes_for_level_multiplier, reserved_factor, cct->_conf->bluestore_volume_selection_reserved, cct->_conf->bluestore_volume_selection_policy == "use_some_extra"); } } if (create) { bluefs->mkfs(fsid, bluefs_layout); } bluefs->set_volume_selector(vselector); r = bluefs->mount(); if (r < 0) { derr << __func__ << " failed bluefs mount: " << cpp_strerror(r) << dendl; } ceph_assert_always(bluefs->maybe_verify_layout(bluefs_layout) == 0); return r; } void BlueStore::_close_bluefs(bool cold_close) { bluefs->umount(cold_close); _minimal_close_bluefs(); } void BlueStore::_minimal_close_bluefs() { delete bluefs; bluefs = NULL; } int BlueStore::_is_bluefs(bool create, bool* ret) { if (create) { *ret = cct->_conf->bluestore_bluefs; } else { string s; int r = read_meta("bluefs", &s); if (r < 0) { derr << __func__ << " unable to read 'bluefs' meta" << dendl; return -EIO; } if (s == "1") { *ret = true; } else if (s == "0") { *ret = false; } else { derr << __func__ << " bluefs = " << s << " : not 0 or 1, aborting" << dendl; return -EIO; } } return 0; } /* * opens both DB and dependant super_meta, FreelistManager and allocator * in the proper order */ int BlueStore::_open_db_and_around(bool read_only, bool to_repair) { dout(0) << __func__ << " read-only:" << read_only << " repair:" << to_repair << dendl; { string type; int r = read_meta("type", &type); if (r < 0) { derr << __func__ << " failed to load os-type: " << cpp_strerror(r) << dendl; return r; } if (type != "bluestore") { derr << __func__ << " expected bluestore, but type is " << type << dendl; return -EIO; } } int r = _open_path(); if (r < 0) return r; r = 
_open_fsid(false); if (r < 0) goto out_path; r = _read_fsid(&fsid); if (r < 0) goto out_fsid; r = _lock_fsid(); if (r < 0) goto out_fsid; r = _open_bdev(false); if (r < 0) goto out_fsid; // open in read-only first to read FM list and init allocator // as they might be needed for some BlueFS procedures r = _open_db(false, false, true); if (r < 0) goto out_bdev; r = _open_super_meta(); if (r < 0) { goto out_db; } r = _open_fm(nullptr, true); if (r < 0) goto out_db; r = _init_alloc(); if (r < 0) goto out_fm; // Re-open in the proper mode(s). // Can't simply bypass second open for read-only mode as we need to // load allocated extents from bluefs into allocator. // And now it's time to do that // _close_db(true); r = _open_db(false, to_repair, read_only); if (r < 0) { goto out_alloc; } return 0; out_alloc: _close_alloc(); out_fm: _close_fm(); out_db: _close_db(read_only); out_bdev: _close_bdev(); out_fsid: _close_fsid(); out_path: _close_path(); return r; } void BlueStore::_close_db_and_around(bool read_only) { _close_db(read_only); _close_fm(); _close_alloc(); _close_bdev(); _close_fsid(); _close_path(); } int BlueStore::open_db_environment(KeyValueDB **pdb, bool to_repair) { _kv_only = true; int r = _open_db_and_around(false, to_repair); if (r == 0) { *pdb = db; } else { *pdb = nullptr; } return r; } int BlueStore::close_db_environment() { _close_db_and_around(false); return 0; } /* gets access to bluefs supporting RocksDB */ BlueFS* BlueStore::get_bluefs() { return bluefs; } int BlueStore::_prepare_db_environment(bool create, bool read_only, std::string* _fn, std::string* _kv_backend) { int r; ceph_assert(!db); std::string& fn=*_fn; std::string& kv_backend=*_kv_backend; fn = path + "/db"; std::shared_ptr merge_op(new Int64ArrayMergeOperator); if (create) { kv_backend = cct->_conf->bluestore_kvbackend; } else { r = read_meta("kv_backend", &kv_backend); if (r < 0) { derr << __func__ << " unable to read 'kv_backend' meta" << dendl; return -EIO; } } dout(10) << __func__ << " kv_backend = " << kv_backend << dendl; bool do_bluefs; r = _is_bluefs(create, &do_bluefs); if (r < 0) { return r; } dout(10) << __func__ << " do_bluefs = " << do_bluefs << dendl; map kv_options; // force separate wal dir for all new deployments. 
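  // ======================================================================
  // Illustrative sketch, not part of BlueStore proper: kv_options is a
  // plain string->string option set handed to KeyValueDB::create().  Two
  // entries matter here: "separate_wal_dir" (only the key's presence is
  // checked -- it is erased again when no .wal dir exists) and "db_paths",
  // a space-separated list of "path,size" pairs built from the BlueFS
  // volume selector by the loop that follows.  The paths and sizes below
  // are made-up examples of that format.
#if 0
  #include <map>
  #include <string>

  static std::map<std::string, std::string> example_kv_options()
  {
    std::map<std::string, std::string> kv_options;
    kv_options["separate_wal_dir"] = "1";        // presence is what matters
    // "<path>,<bytes>" pairs separated by single spaces:
    kv_options["db_paths"] =
      "db,64424509440 db.slow,320063143936";     // hypothetical sizes
    return kv_options;
  }
#endif
  // ======================================================================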
kv_options["separate_wal_dir"] = 1; rocksdb::Env *env = NULL; if (do_bluefs) { dout(10) << __func__ << " initializing bluefs" << dendl; if (kv_backend != "rocksdb") { derr << " backend must be rocksdb to use bluefs" << dendl; return -EINVAL; } r = _open_bluefs(create, read_only); if (r < 0) { return r; } if (cct->_conf->bluestore_bluefs_env_mirror) { rocksdb::Env* a = new BlueRocksEnv(bluefs); rocksdb::Env* b = rocksdb::Env::Default(); if (create) { string cmd = "rm -rf " + path + "/db " + path + "/db.slow " + path + "/db.wal"; int r = system(cmd.c_str()); (void)r; } env = new rocksdb::EnvMirror(b, a, false, true); } else { env = new BlueRocksEnv(bluefs); // simplify the dir names, too, as "seen" by rocksdb fn = "db"; } BlueFSVolumeSelector::paths paths; bluefs->get_vselector_paths(fn, paths); { ostringstream db_paths; bool first = true; for (auto& p : paths) { if (!first) { db_paths << " "; } first = false; db_paths << p.first << "," << p.second; } kv_options["db_paths"] = db_paths.str(); dout(1) << __func__ << " set db_paths to " << db_paths.str() << dendl; } if (create) { for (auto& p : paths) { env->CreateDir(p.first); } // Selectors don't provide wal path so far hence create explicitly env->CreateDir(fn + ".wal"); } else { std::vector res; // check for dir presence auto r = env->GetChildren(fn+".wal", &res); if (r.IsNotFound()) { kv_options.erase("separate_wal_dir"); } } } else { string walfn = path + "/db.wal"; if (create) { int r = ::mkdir(fn.c_str(), 0755); if (r < 0) r = -errno; if (r < 0 && r != -EEXIST) { derr << __func__ << " failed to create " << fn << ": " << cpp_strerror(r) << dendl; return r; } // wal_dir, too! r = ::mkdir(walfn.c_str(), 0755); if (r < 0) r = -errno; if (r < 0 && r != -EEXIST) { derr << __func__ << " failed to create " << walfn << ": " << cpp_strerror(r) << dendl; return r; } } else { struct stat st; r = ::stat(walfn.c_str(), &st); if (r < 0 && errno == ENOENT) { kv_options.erase("separate_wal_dir"); } } } db = KeyValueDB::create(cct, kv_backend, fn, kv_options, static_cast(env)); if (!db) { derr << __func__ << " error creating db" << dendl; if (bluefs) { _close_bluefs(read_only); } // delete env manually here since we can't depend on db to do this // under this case delete env; env = NULL; return -EIO; } FreelistManager::setup_merge_operators(db, freelist_type); db->set_merge_operator(PREFIX_STAT, merge_op); db->set_cache_size(cache_kv_ratio * cache_size); return 0; } int BlueStore::_open_db(bool create, bool to_repair_db, bool read_only) { int r; ceph_assert(!(create && read_only)); string options; string options_annex; stringstream err; string kv_dir_fn; string kv_backend; std::string sharding_def; r = _prepare_db_environment(create, read_only, &kv_dir_fn, &kv_backend); if (r < 0) { derr << __func__ << " failed to prepare db environment: " << err.str() << dendl; return -EIO; } if (kv_backend == "rocksdb") { options = cct->_conf->bluestore_rocksdb_options; options_annex = cct->_conf->bluestore_rocksdb_options_annex; if (!options_annex.empty()) { if (!options.empty() && *options.rbegin() != ',') { options += ','; } options += options_annex; } if (cct->_conf.get_val("bluestore_rocksdb_cf")) { sharding_def = cct->_conf.get_val("bluestore_rocksdb_cfs"); } } db->init(options); if (to_repair_db) return 0; if (create) { r = db->create_and_open(err, sharding_def); } else { // we pass in cf list here, but it is only used if the db already has // column families created. r = read_only ? 
db->open_read_only(err, sharding_def) : db->open(err, sharding_def); } if (r) { derr << __func__ << " erroring opening db: " << err.str() << dendl; _close_db(read_only); return -EIO; } dout(1) << __func__ << " opened " << kv_backend << " path " << kv_dir_fn << " options " << options << dendl; return 0; } void BlueStore::_close_db(bool cold_close) { ceph_assert(db); delete db; db = NULL; if (bluefs) { _close_bluefs(cold_close); } } void BlueStore::_dump_alloc_on_failure() { auto dump_interval = cct->_conf->bluestore_bluefs_alloc_failure_dump_interval; if (dump_interval > 0 && next_dump_on_bluefs_alloc_failure <= ceph_clock_now()) { shared_alloc.a->dump(); next_dump_on_bluefs_alloc_failure = ceph_clock_now(); next_dump_on_bluefs_alloc_failure += dump_interval; } } int BlueStore::_open_collections() { dout(10) << __func__ << dendl; collections_had_errors = false; ceph_assert(coll_map.empty()); KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL); for (it->upper_bound(string()); it->valid(); it->next()) { coll_t cid; if (cid.parse(it->key())) { auto c = ceph::make_ref( this, onode_cache_shards[cid.hash_to_shard(onode_cache_shards.size())], buffer_cache_shards[cid.hash_to_shard(buffer_cache_shards.size())], cid); bufferlist bl = it->value(); auto p = bl.cbegin(); try { decode(c->cnode, p); } catch (ceph::buffer::error& e) { derr << __func__ << " failed to decode cnode, key:" << pretty_binary_string(it->key()) << dendl; return -EIO; } dout(20) << __func__ << " opened " << cid << " " << c << " " << c->cnode << dendl; _osr_attach(c.get()); coll_map[cid] = c; } else { derr << __func__ << " unrecognized collection " << it->key() << dendl; collections_had_errors = true; } } return 0; } void BlueStore::_fsck_collections(int64_t* errors) { if (collections_had_errors) { dout(10) << __func__ << dendl; KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL, KeyValueDB::ITERATOR_NOCACHE); for (it->upper_bound(string()); it->valid(); it->next()) { coll_t cid; if (!cid.parse(it->key())) { derr << __func__ << " unrecognized collection " << it->key() << dendl; if (errors) { (*errors)++; } } } } } void BlueStore::_set_per_pool_omap() { per_pool_omap = OMAP_BULK; bufferlist bl; db->get(PREFIX_SUPER, "per_pool_omap", &bl); if (bl.length()) { auto s = bl.to_str(); if (s == stringify(OMAP_PER_POOL)) { per_pool_omap = OMAP_PER_POOL; } else if (s == stringify(OMAP_PER_PG)) { per_pool_omap = OMAP_PER_PG; } else { ceph_assert(s == stringify(OMAP_BULK)); } dout(10) << __func__ << " per_pool_omap = " << per_pool_omap << dendl; } else { dout(10) << __func__ << " per_pool_omap not present" << dendl; } _check_no_per_pg_or_pool_omap_alert(); } void BlueStore::_open_statfs() { osd_pools.clear(); vstatfs.reset(); bufferlist bl; int r = db->get(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, &bl); if (r >= 0) { per_pool_stat_collection = false; if (size_t(bl.length()) >= sizeof(vstatfs.values)) { auto it = bl.cbegin(); vstatfs.decode(it); dout(10) << __func__ << " store_statfs is found" << dendl; } else { dout(10) << __func__ << " store_statfs is corrupt, using empty" << dendl; } _check_legacy_statfs_alert(); } else { per_pool_stat_collection = true; dout(10) << __func__ << " per-pool statfs is enabled" << dendl; KeyValueDB::Iterator it = db->get_iterator(PREFIX_STAT, KeyValueDB::ITERATOR_NOCACHE); for (it->upper_bound(string()); it->valid(); it->next()) { uint64_t pool_id; int r = get_key_pool_stat(it->key(), &pool_id); ceph_assert(r == 0); bufferlist bl; bl = it->value(); auto p = bl.cbegin(); auto& st = osd_pools[pool_id]; try 
{ st.decode(p); vstatfs += st; dout(30) << __func__ << " pool " << pool_id << " statfs " << st << dendl; } catch (ceph::buffer::error& e) { derr << __func__ << " failed to decode pool stats, key:" << pretty_binary_string(it->key()) << dendl; } } } dout(30) << __func__ << " statfs " << vstatfs << dendl; } int BlueStore::_setup_block_symlink_or_file( string name, string epath, uint64_t size, bool create) { dout(20) << __func__ << " name " << name << " path " << epath << " size " << size << " create=" << (int)create << dendl; int r = 0; int flags = O_RDWR|O_CLOEXEC; if (create) flags |= O_CREAT; if (epath.length()) { r = ::symlinkat(epath.c_str(), path_fd, name.c_str()); if (r < 0) { r = -errno; derr << __func__ << " failed to create " << name << " symlink to " << epath << ": " << cpp_strerror(r) << dendl; return r; } if (!epath.compare(0, strlen(SPDK_PREFIX), SPDK_PREFIX)) { int fd = ::openat(path_fd, epath.c_str(), flags, 0644); if (fd < 0) { r = -errno; derr << __func__ << " failed to open " << epath << " file: " << cpp_strerror(r) << dendl; return r; } // write the Transport ID of the NVMe device // a transport id looks like: "trtype:PCIe traddr:0000:02:00.0" // where "0000:02:00.0" is the selector of a PCI device, see // the first column of "lspci -mm -n -D" string trid{"trtype:PCIe "}; trid += "traddr:"; trid += epath.substr(strlen(SPDK_PREFIX)); r = ::write(fd, trid.c_str(), trid.size()); ceph_assert(r == static_cast(trid.size())); dout(1) << __func__ << " created " << name << " symlink to " << epath << dendl; VOID_TEMP_FAILURE_RETRY(::close(fd)); } } if (size) { int fd = ::openat(path_fd, name.c_str(), flags, 0644); if (fd >= 0) { // block file is present struct stat st; int r = ::fstat(fd, &st); if (r == 0 && S_ISREG(st.st_mode) && // if it is a regular file st.st_size == 0) { // and is 0 bytes r = ::ftruncate(fd, size); if (r < 0) { r = -errno; derr << __func__ << " failed to resize " << name << " file to " << size << ": " << cpp_strerror(r) << dendl; VOID_TEMP_FAILURE_RETRY(::close(fd)); return r; } if (cct->_conf->bluestore_block_preallocate_file) { r = ::ceph_posix_fallocate(fd, 0, size); if (r > 0) { derr << __func__ << " failed to prefallocate " << name << " file to " << size << ": " << cpp_strerror(r) << dendl; VOID_TEMP_FAILURE_RETRY(::close(fd)); return -r; } } dout(1) << __func__ << " resized " << name << " file to " << byte_u_t(size) << dendl; } VOID_TEMP_FAILURE_RETRY(::close(fd)); } else { int r = -errno; if (r != -ENOENT) { derr << __func__ << " failed to open " << name << " file: " << cpp_strerror(r) << dendl; return r; } } } return 0; } int BlueStore::mkfs() { dout(1) << __func__ << " path " << path << dendl; int r; uuid_d old_fsid; uint64_t reserved; if (cct->_conf->osd_max_object_size > OBJECT_MAX_SIZE) { derr << __func__ << " osd_max_object_size " << cct->_conf->osd_max_object_size << " > bluestore max " << OBJECT_MAX_SIZE << dendl; return -EINVAL; } { string done; r = read_meta("mkfs_done", &done); if (r == 0) { dout(1) << __func__ << " already created" << dendl; if (cct->_conf->bluestore_fsck_on_mkfs) { r = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep); if (r < 0) { derr << __func__ << " fsck found fatal error: " << cpp_strerror(r) << dendl; return r; } if (r > 0) { derr << __func__ << " fsck found " << r << " errors" << dendl; r = -EIO; } } return r; // idempotent } } { string type; r = read_meta("type", &type); if (r == 0) { if (type != "bluestore") { derr << __func__ << " expected bluestore, but type is " << type << dendl; return -EIO; } } else { r = 
write_meta("type", "bluestore"); if (r < 0) return r; } } freelist_type = "bitmap"; r = _open_path(); if (r < 0) return r; r = _open_fsid(true); if (r < 0) goto out_path_fd; r = _lock_fsid(); if (r < 0) goto out_close_fsid; r = _read_fsid(&old_fsid); if (r < 0 || old_fsid.is_zero()) { if (fsid.is_zero()) { fsid.generate_random(); dout(1) << __func__ << " generated fsid " << fsid << dendl; } else { dout(1) << __func__ << " using provided fsid " << fsid << dendl; } // we'll write it later. } else { if (!fsid.is_zero() && fsid != old_fsid) { derr << __func__ << " on-disk fsid " << old_fsid << " != provided " << fsid << dendl; r = -EINVAL; goto out_close_fsid; } fsid = old_fsid; } r = _setup_block_symlink_or_file("block", cct->_conf->bluestore_block_path, cct->_conf->bluestore_block_size, cct->_conf->bluestore_block_create); if (r < 0) goto out_close_fsid; if (cct->_conf->bluestore_bluefs) { r = _setup_block_symlink_or_file("block.wal", cct->_conf->bluestore_block_wal_path, cct->_conf->bluestore_block_wal_size, cct->_conf->bluestore_block_wal_create); if (r < 0) goto out_close_fsid; r = _setup_block_symlink_or_file("block.db", cct->_conf->bluestore_block_db_path, cct->_conf->bluestore_block_db_size, cct->_conf->bluestore_block_db_create); if (r < 0) goto out_close_fsid; } r = _open_bdev(true); if (r < 0) goto out_close_fsid; // choose min_alloc_size if (cct->_conf->bluestore_min_alloc_size) { min_alloc_size = cct->_conf->bluestore_min_alloc_size; } else { ceph_assert(bdev); if (_use_rotational_settings()) { min_alloc_size = cct->_conf->bluestore_min_alloc_size_hdd; } else { min_alloc_size = cct->_conf->bluestore_min_alloc_size_ssd; } } _validate_bdev(); // make sure min_alloc_size is power of 2 aligned. if (!isp2(min_alloc_size)) { derr << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size << std::dec << " is not power of 2 aligned!" << dendl; r = -EINVAL; goto out_close_bdev; } r = _create_alloc(); if (r < 0) { goto out_close_bdev; } reserved = _get_ondisk_reserved(); shared_alloc.a->init_add_free(reserved, p2align(bdev->get_size(), min_alloc_size) - reserved); r = _open_db(true); if (r < 0) goto out_close_alloc; { KeyValueDB::Transaction t = db->get_transaction(); r = _open_fm(t, true); if (r < 0) goto out_close_db; { bufferlist bl; encode((uint64_t)0, bl); t->set(PREFIX_SUPER, "nid_max", bl); t->set(PREFIX_SUPER, "blobid_max", bl); } { bufferlist bl; encode((uint64_t)min_alloc_size, bl); t->set(PREFIX_SUPER, "min_alloc_size", bl); } { bufferlist bl; if (cct->_conf.get_val("bluestore_debug_legacy_omap")) { bl.append(stringify(OMAP_BULK)); } else { bl.append(stringify(OMAP_PER_PG)); } t->set(PREFIX_SUPER, "per_pool_omap", bl); } ondisk_format = latest_ondisk_format; _prepare_ondisk_format_super(t); db->submit_transaction_sync(t); } r = write_meta("kv_backend", cct->_conf->bluestore_kvbackend); if (r < 0) goto out_close_fm; r = write_meta("bluefs", stringify(bluefs ? 
1 : 0)); if (r < 0) goto out_close_fm; if (fsid != old_fsid) { r = _write_fsid(); if (r < 0) { derr << __func__ << " error writing fsid: " << cpp_strerror(r) << dendl; goto out_close_fm; } } out_close_fm: _close_fm(); out_close_db: _close_db(false); out_close_alloc: _close_alloc(); out_close_bdev: _close_bdev(); out_close_fsid: _close_fsid(); out_path_fd: _close_path(); if (r == 0 && cct->_conf->bluestore_fsck_on_mkfs) { int rc = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep); if (rc < 0) return rc; if (rc > 0) { derr << __func__ << " fsck found " << rc << " errors" << dendl; r = -EIO; } } if (r == 0) { // indicate success by writing the 'mkfs_done' file r = write_meta("mkfs_done", "yes"); } if (r < 0) { derr << __func__ << " failed, " << cpp_strerror(r) << dendl; } else { dout(0) << __func__ << " success" << dendl; } return r; } int BlueStore::add_new_bluefs_device(int id, const string& dev_path) { dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl; int r; ceph_assert(path_fd < 0); ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB); if (!cct->_conf->bluestore_bluefs) { derr << __func__ << " bluefs isn't configured, can't add new device " << dendl; return -EIO; } r = _open_db_and_around(true); if (id == BlueFS::BDEV_NEWWAL) { string p = path + "/block.wal"; r = _setup_block_symlink_or_file("block.wal", dev_path, cct->_conf->bluestore_block_wal_size, true); ceph_assert(r == 0); r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, p, cct->_conf->bdev_enable_discard, BDEV_LABEL_BLOCK_SIZE); ceph_assert(r == 0); if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) { r = _check_or_set_bdev_label( p, bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL), "bluefs wal", true); ceph_assert(r == 0); } bluefs_layout.dedicated_wal = true; } else if (id == BlueFS::BDEV_NEWDB) { string p = path + "/block.db"; r = _setup_block_symlink_or_file("block.db", dev_path, cct->_conf->bluestore_block_db_size, true); ceph_assert(r == 0); r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, p, cct->_conf->bdev_enable_discard, SUPER_RESERVED); ceph_assert(r == 0); if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) { r = _check_or_set_bdev_label( p, bluefs->get_block_device_size(BlueFS::BDEV_NEWDB), "bluefs db", true); ceph_assert(r == 0); } bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW; bluefs_layout.dedicated_db = true; } bluefs->umount(); bluefs->mount(); r = bluefs->prepare_new_device(id, bluefs_layout); ceph_assert(r == 0); if (r < 0) { derr << __func__ << " failed, " << cpp_strerror(r) << dendl; } else { dout(0) << __func__ << " success" << dendl; } _close_db_and_around(true); return r; } int BlueStore::migrate_to_existing_bluefs_device(const set& devs_source, int id) { dout(10) << __func__ << " id:" << id << dendl; ceph_assert(path_fd < 0); ceph_assert(id == BlueFS::BDEV_SLOW || id == BlueFS::BDEV_DB); if (!cct->_conf->bluestore_bluefs) { derr << __func__ << " bluefs isn't configured, can't add new device " << dendl; return -EIO; } int r = _open_db_and_around(true); uint64_t used_space = 0; for(auto src_id : devs_source) { used_space += bluefs->get_used(src_id); } uint64_t target_free = bluefs->get_free(id); if (target_free < used_space) { derr << __func__ << " can't migrate, free space at target: " << target_free << " is less than required space: " << used_space << dendl; r = -ENOSPC; goto shutdown; } if (devs_source.count(BlueFS::BDEV_DB)) { bluefs_layout.shared_bdev = BlueFS::BDEV_DB; bluefs_layout.dedicated_db = false; } if (devs_source.count(BlueFS::BDEV_WAL)) { 
bluefs_layout.dedicated_wal = false; } r = bluefs->device_migrate_to_existing(cct, devs_source, id, bluefs_layout); if (r < 0) { derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl; goto shutdown; } if (devs_source.count(BlueFS::BDEV_DB)) { r = unlink(string(path + "/block.db").c_str()); ceph_assert(r == 0); } if (devs_source.count(BlueFS::BDEV_WAL)) { r = unlink(string(path + "/block.wal").c_str()); ceph_assert(r == 0); } shutdown: _close_db_and_around(true); return r; } int BlueStore::migrate_to_new_bluefs_device(const set& devs_source, int id, const string& dev_path) { dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl; int r; ceph_assert(path_fd < 0); ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB); if (!cct->_conf->bluestore_bluefs) { derr << __func__ << " bluefs isn't configured, can't add new device " << dendl; return -EIO; } r = _open_db_and_around(true); string link_db; string link_wal; if (devs_source.count(BlueFS::BDEV_DB) && bluefs_layout.shared_bdev != BlueFS::BDEV_DB) { link_db = path + "/block.db"; bluefs_layout.shared_bdev = BlueFS::BDEV_DB; bluefs_layout.dedicated_db = false; } if (devs_source.count(BlueFS::BDEV_WAL)) { link_wal = path + "/block.wal"; bluefs_layout.dedicated_wal = false; } size_t target_size; string target_name; if (id == BlueFS::BDEV_NEWWAL) { target_name = "block.wal"; target_size = cct->_conf->bluestore_block_wal_size; bluefs_layout.dedicated_wal = true; r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, dev_path, cct->_conf->bdev_enable_discard, BDEV_LABEL_BLOCK_SIZE); ceph_assert(r == 0); if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) { r = _check_or_set_bdev_label( dev_path, bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL), "bluefs wal", true); ceph_assert(r == 0); } } else if (id == BlueFS::BDEV_NEWDB) { target_name = "block.db"; target_size = cct->_conf->bluestore_block_db_size; bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW; bluefs_layout.dedicated_db = true; r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, dev_path, cct->_conf->bdev_enable_discard, SUPER_RESERVED); ceph_assert(r == 0); if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) { r = _check_or_set_bdev_label( dev_path, bluefs->get_block_device_size(BlueFS::BDEV_NEWDB), "bluefs db", true); ceph_assert(r == 0); } } bluefs->umount(); bluefs->mount(); r = bluefs->device_migrate_to_new(cct, devs_source, id, bluefs_layout); if (r < 0) { derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl; goto shutdown; } if (!link_db.empty()) { r = unlink(link_db.c_str()); ceph_assert(r == 0); } if (!link_wal.empty()) { r = unlink(link_wal.c_str()); ceph_assert(r == 0); } r = _setup_block_symlink_or_file( target_name, dev_path, target_size, true); ceph_assert(r == 0); dout(0) << __func__ << " success" << dendl; shutdown: _close_db_and_around(true); return r; } string BlueStore::get_device_path(unsigned id) { string res; if (id < BlueFS::MAX_BDEV) { switch (id) { case BlueFS::BDEV_WAL: res = path + "/block.wal"; break; case BlueFS::BDEV_DB: if (id == bluefs_layout.shared_bdev) { res = path + "/block"; } else { res = path + "/block.db"; } break; case BlueFS::BDEV_SLOW: res = path + "/block"; break; } } return res; } int BlueStore::_set_bdev_label_size(const string& path, uint64_t size) { bluestore_bdev_label_t label; int r = _read_bdev_label(cct, path, &label); if (r < 0) { derr << "unable to read label for " << path << ": " << cpp_strerror(r) << dendl; } else { label.size = size; r = 
_write_bdev_label(cct, path, label); if (r < 0) { derr << "unable to write label for " << path << ": " << cpp_strerror(r) << dendl; } } return r; } int BlueStore::expand_devices(ostream& out) { int r = _open_db_and_around(true); ceph_assert(r == 0); bluefs->dump_block_extents(out); out << "Expanding DB/WAL..." << std::endl; for (auto devid : { BlueFS::BDEV_WAL, BlueFS::BDEV_DB}) { if (devid == bluefs_layout.shared_bdev ) { continue; } uint64_t size = bluefs->get_block_device_size(devid); if (size == 0) { // no bdev continue; } out << devid <<" : expanding " << " to 0x" << size << std::dec << std::endl; string p = get_device_path(devid); const char* path = p.c_str(); if (path == nullptr) { derr << devid <<": can't find device path " << dendl; continue; } if (bluefs->bdev_support_label(devid)) { if (_set_bdev_label_size(p, size) >= 0) { out << devid << " : size label updated to " << size << std::endl; } } } uint64_t size0 = fm->get_size(); uint64_t size = bdev->get_size(); if (size0 < size) { out << bluefs_layout.shared_bdev << " : expanding " << " from 0x" << std::hex << size0 << " to 0x" << size << std::dec << std::endl; _write_out_fm_meta(size); if (bdev->supported_bdev_label()) { if (_set_bdev_label_size(path, size) >= 0) { out << bluefs_layout.shared_bdev << " : size label updated to " << size << std::endl; } } _close_db_and_around(true); // mount in read/write to sync expansion changes r = _mount(); ceph_assert(r == 0); umount(); } else { _close_db_and_around(true); } return r; } int BlueStore::dump_bluefs_sizes(ostream& out) { int r = _open_db_and_around(true); ceph_assert(r == 0); bluefs->dump_block_extents(out); _close_db_and_around(true); return r; } void BlueStore::set_cache_shards(unsigned num) { dout(10) << __func__ << " " << num << dendl; size_t oold = onode_cache_shards.size(); size_t bold = buffer_cache_shards.size(); ceph_assert(num >= oold && num >= bold); onode_cache_shards.resize(num); buffer_cache_shards.resize(num); for (unsigned i = oold; i < num; ++i) { onode_cache_shards[i] = OnodeCacheShard::create(cct, cct->_conf->bluestore_cache_type, logger); } for (unsigned i = bold; i < num; ++i) { buffer_cache_shards[i] = BufferCacheShard::create(cct, cct->_conf->bluestore_cache_type, logger); } } int BlueStore::_mount() { dout(1) << __func__ << " path " << path << dendl; _kv_only = false; if (cct->_conf->bluestore_fsck_on_mount) { int rc = fsck(cct->_conf->bluestore_fsck_on_mount_deep); if (rc < 0) return rc; if (rc > 0) { derr << __func__ << " fsck found " << rc << " errors" << dendl; return -EIO; } } if (cct->_conf->osd_max_object_size > OBJECT_MAX_SIZE) { derr << __func__ << " osd_max_object_size " << cct->_conf->osd_max_object_size << " > bluestore max " << OBJECT_MAX_SIZE << dendl; return -EINVAL; } int r = _open_db_and_around(false); if (r < 0) { return r; } r = _upgrade_super(); if (r < 0) { goto out_db; } r = _open_collections(); if (r < 0) goto out_db; r = _reload_logger(); if (r < 0) goto out_coll; _kv_start(); if (bdev->is_smr()) { _zoned_cleaner_start(); } r = _deferred_replay(); if (r < 0) goto out_stop; mempool_thread.init(); if ((!per_pool_stat_collection || per_pool_omap != OMAP_PER_PG) && cct->_conf->bluestore_fsck_quick_fix_on_mount == true) { auto was_per_pool_omap = per_pool_omap; dout(1) << __func__ << " quick-fix on mount" << dendl; _fsck_on_open(FSCK_SHALLOW, true); //reread statfs //FIXME minor: replace with actual open/close? 
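    // ====================================================================
    // Illustrative sketch, not part of BlueStore proper: the cache shards
    // sized by set_cache_shards() above spread collections across
    // separately managed onode/buffer caches to reduce contention; a
    // collection picks its shard by hashing its id modulo the shard count
    // (see the hash_to_shard() usage in _open_collections()).  The
    // standalone function below only illustrates that modulo mapping; its
    // name and hash choice are hypothetical.
#if 0
    #include <cstddef>
    #include <functional>
    #include <string>

    static size_t example_pick_shard(const std::string& collection_id,
				     size_t num_shards)
    {
      // any stable hash works; the goal is an even spread so that each
      // shard sees roughly 1/num_shards of the traffic
      return std::hash<std::string>{}(collection_id) % num_shards;
    }
#endif
    // ====================================================================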
_open_statfs(); _check_legacy_statfs_alert(); //set again as hopefully it has been fixed if (was_per_pool_omap != OMAP_PER_PG) { _set_per_pool_omap(); } } mounted = true; return 0; out_stop: if (bdev->is_smr()) { _zoned_cleaner_stop(); } _kv_stop(); out_coll: _shutdown_cache(); out_db: _close_db_and_around(false); return r; } int BlueStore::umount() { ceph_assert(_kv_only || mounted); dout(1) << __func__ << dendl; _osr_drain_all(); mounted = false; if (!_kv_only) { mempool_thread.shutdown(); if (bdev->is_smr()) { dout(20) << __func__ << " stopping zone cleaner thread" << dendl; _zoned_cleaner_stop(); } dout(20) << __func__ << " stopping kv thread" << dendl; _kv_stop(); _shutdown_cache(); dout(20) << __func__ << " closing" << dendl; } _close_db_and_around(false); if (cct->_conf->bluestore_fsck_on_umount) { int rc = fsck(cct->_conf->bluestore_fsck_on_umount_deep); if (rc < 0) return rc; if (rc > 0) { derr << __func__ << " fsck found " << rc << " errors" << dendl; return -EIO; } } return 0; } int BlueStore::cold_open() { return _open_db_and_around(true); } int BlueStore::cold_close() { _close_db_and_around(true); return 0; } // derr wrapper to limit enormous output and avoid log flooding. // Of limited use where such output is expected for now #define fsck_derr(err_cnt, threshold) \ if (err_cnt <= threshold) { \ bool need_skip_print = err_cnt == threshold; \ derr #define fsck_dendl \ dendl; \ if (need_skip_print) \ derr << "more error lines skipped..." << dendl; \ } int _fsck_sum_extents( const PExtentVector& extents, bool compressed, store_statfs_t& expected_statfs) { for (auto e : extents) { if (!e.is_valid()) continue; expected_statfs.allocated += e.length; if (compressed) { expected_statfs.data_compressed_allocated += e.length; } } return 0; } int BlueStore::_fsck_check_extents( std::string_view ctx_descr, const PExtentVector& extents, bool compressed, mempool_dynamic_bitset &used_blocks, uint64_t granularity, BlueStoreRepairer* repairer, store_statfs_t& expected_statfs, FSCKDepth depth) { dout(30) << __func__ << " " << ctx_descr << ", extents " << extents << dendl; int errors = 0; for (auto e : extents) { if (!e.is_valid()) continue; expected_statfs.allocated += e.length; if (compressed) { expected_statfs.data_compressed_allocated += e.length; } if (depth != FSCK_SHALLOW) { bool already = false; apply_for_bitset_range( e.offset, e.length, granularity, used_blocks, [&](uint64_t pos, mempool_dynamic_bitset &bs) { if (bs.test(pos)) { if (repairer) { repairer->note_misreference( pos * min_alloc_size, min_alloc_size, !already); } if (!already) { derr << __func__ << "::fsck error: " << ctx_descr << ", extent " << e << " or a subset is already allocated (misreferenced)" << dendl; ++errors; already = true; } } else bs.set(pos); }); if (e.end() > bdev->get_size()) { derr << "fsck error: " << ctx_descr << ", extent " << e << " past end of block device" << dendl; ++errors; } } } return errors; } void BlueStore::_fsck_check_pool_statfs( BlueStore::per_pool_statfs& expected_pool_statfs, int64_t& errors, int64_t& warnings, BlueStoreRepairer* repairer) { auto it = db->get_iterator(PREFIX_STAT, KeyValueDB::ITERATOR_NOCACHE); if (it) { for (it->lower_bound(string()); it->valid(); it->next()) { string key = it->key(); if (key == BLUESTORE_GLOBAL_STATFS_KEY) { if (repairer) { ++errors; repairer->remove_key(db, PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY); derr << "fsck error: " << "legacy statfs record found, removing" << dendl; } continue; } uint64_t pool_id; if (get_key_pool_stat(key, &pool_id) < 0) { derr 
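/*
 * Note on the fsck_derr/fsck_dendl pair defined above: it wraps a derr line
 * in an error-count check so that mass failures do not flood the log.  A call
 * such as
 *
 *   fsck_derr(errors, MAX_FSCK_ERROR_LINES)
 *     << "fsck error: ..." << fsck_dendl;
 *
 * expands, roughly, to
 *
 *   if (errors <= MAX_FSCK_ERROR_LINES) {
 *     bool need_skip_print = errors == MAX_FSCK_ERROR_LINES;
 *     derr << "fsck error: ..." << dendl;
 *     if (need_skip_print)
 *       derr << "more error lines skipped..." << dendl;
 *   }
 *
 * so printing stops once the threshold is reached, with a single
 * "more error lines skipped..." marker emitted at the cut-off.
 */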
<< "fsck error: bad key " << key << "in statfs namespece" << dendl; if (repairer) { repairer->remove_key(db, PREFIX_STAT, key); } ++errors; continue; } volatile_statfs vstatfs; bufferlist bl = it->value(); auto blp = bl.cbegin(); try { vstatfs.decode(blp); } catch (ceph::buffer::error& e) { derr << "fsck error: failed to decode Pool StatFS record" << pretty_binary_string(key) << dendl; if (repairer) { dout(20) << __func__ << " undecodable Pool StatFS record, key:'" << pretty_binary_string(key) << "', removing" << dendl; repairer->remove_key(db, PREFIX_STAT, key); } ++errors; vstatfs.reset(); } auto stat_it = expected_pool_statfs.find(pool_id); if (stat_it == expected_pool_statfs.end()) { if (vstatfs.is_empty()) { // we don't consider that as an error since empty pool statfs // are left in DB for now dout(20) << "fsck inf: found empty stray Pool StatFS record for pool id 0x" << std::hex << pool_id << std::dec << dendl; if (repairer) { // but we need to increment error count in case of repair // to have proper counters at the end // (as repairer increments recovery counter anyway). ++errors; } } else { derr << "fsck error: found stray Pool StatFS record for pool id 0x" << std::hex << pool_id << std::dec << dendl; ++errors; } if (repairer) { repairer->remove_key(db, PREFIX_STAT, key); } continue; } store_statfs_t statfs; vstatfs.publish(&statfs); if (!(stat_it->second == statfs)) { derr << "fsck error: actual " << statfs << " != expected " << stat_it->second << " for pool " << std::hex << pool_id << std::dec << dendl; if (repairer) { repairer->fix_statfs(db, key, stat_it->second); } ++errors; } expected_pool_statfs.erase(stat_it); } } // if (it) for (auto& s : expected_pool_statfs) { if (s.second.is_zero()) { // we might lack empty statfs recs in DB continue; } derr << "fsck error: missing Pool StatFS record for pool " << std::hex << s.first << std::dec << dendl; if (repairer) { string key; get_pool_stat_key(s.first, &key); repairer->fix_statfs(db, key, s.second); } ++errors; } if (!per_pool_stat_collection && repairer) { // by virtue of running this method, we correct the top-level // error of having global stats repairer->inc_repaired(); } } void BlueStore::_fsck_repair_shared_blobs( BlueStoreRepairer& repairer, shared_blob_2hash_tracker_t& sb_ref_counts, sb_info_space_efficient_map_t& sb_info) { auto sb_ref_mismatches = sb_ref_counts.count_non_zero(); dout(1) << __func__ << " repairing shared_blobs, ref mismatch estimate: " << sb_ref_mismatches << dendl; if (!sb_ref_mismatches) // not expected to succeed, just in case return; auto foreach_shared_blob = [&](std::function< void (coll_t, ghobject_t, uint64_t, const bluestore_blob_t&)> cb) { auto it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE); if (it) { CollectionRef c; spg_t pgid; for (it->lower_bound(string()); it->valid(); it->next()) { dout(30) << __func__ << " key " << pretty_binary_string(it->key()) << dendl; if (is_extent_shard_key(it->key())) { continue; } ghobject_t oid; int r = get_key_object(it->key(), &oid); if (r < 0) { continue; } if (!c || oid.shard_id != pgid.shard || oid.hobj.get_logical_pool() != (int64_t)pgid.pool() || !c->contains(oid)) { c = nullptr; for (auto& p : coll_map) { if (p.second->contains(oid)) { c = p.second; break; } } if (!c) { continue; } } dout(20) << __func__ << " inspecting shared blob refs for col:" << c->cid << " obj:" << oid << dendl; OnodeRef o; o.reset(Onode::decode(c, oid, it->key(), it->value())); o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE); _dump_onode<30>(cct, *o); 
          mempool::bluestore_fsck::set<BlobRef> passed_sbs;
          for (auto& e : o->extent_map.extent_map) {
            auto& b = e.blob->get_blob();
            if (b.is_shared() && passed_sbs.count(e.blob) == 0) {
              auto sbid = e.blob->shared_blob->get_sbid();
              cb(c->cid, oid, sbid, b);
              passed_sbs.emplace(e.blob);
            }
          } // for ... extent_map
        } // for ... it->valid
      } //if (it(PREFIX_OBJ))
    }; //foreach_shared_blob fn declaration

  mempool::bluestore_fsck::map<uint64_t, bluestore_extent_ref_map_t> refs_map;

  // first iteration over objects to identify all the broken sbids
  foreach_shared_blob( [&](coll_t cid,
                           ghobject_t oid,
                           uint64_t sbid,
                           const bluestore_blob_t& b) {
    auto it = refs_map.lower_bound(sbid);
    if (it != refs_map.end() && it->first == sbid) {
      return;
    }
    for (auto& p : b.get_extents()) {
      if (p.is_valid() &&
          !sb_ref_counts.test_all_zero_range(sbid, p.offset, p.length)) {
        refs_map.emplace_hint(it, sbid, bluestore_extent_ref_map_t());
        dout(20) << __func__
                 << " broken shared blob found for col:" << cid
                 << " obj:" << oid
                 << " sbid 0x " << std::hex << sbid << std::dec
                 << dendl;
        break;
      }
    }
  });

  // second iteration over objects to build new ref map for the broken sbids
  foreach_shared_blob( [&](coll_t cid,
                           ghobject_t oid,
                           uint64_t sbid,
                           const bluestore_blob_t& b) {
    auto it = refs_map.find(sbid);
    if (it == refs_map.end()) {
      return;
    }
    for (auto& p : b.get_extents()) {
      if (p.is_valid()) {
        it->second.get(p.offset, p.length);
        break;
      }
    }
  });

  // update shared blob records
  auto ref_it = refs_map.begin();
  while (ref_it != refs_map.end()) {
    size_t cnt = 0;
    const size_t max_transactions = 4096;
    KeyValueDB::Transaction txn = db->get_transaction();
    for (cnt = 0;
         cnt < max_transactions && ref_it != refs_map.end();
         ref_it++) {
      auto sbid = ref_it->first;
      dout(20) << __func__ << " repaired shared_blob 0x"
               << std::hex << sbid << std::dec
               << ref_it->second << dendl;
      repairer.fix_shared_blob(txn, sbid, &ref_it->second, 0);
      cnt++;
    }
    if (cnt) {
      db->submit_transaction_sync(txn);
      cnt = 0;
    }
  }

  // remove stray shared blob records
  size_t cnt = 0;
  const size_t max_transactions = 4096;
  KeyValueDB::Transaction txn = db->get_transaction();
  sb_info.foreach_stray([&](const sb_info_t& sbi) {
    auto sbid = sbi.get_sbid();
    dout(20) << __func__ << " removing stray shared_blob 0x"
             << std::hex << sbid << std::dec
             << dendl;
    repairer.fix_shared_blob(txn, sbid, nullptr, 0);
    cnt++;
    if (cnt >= max_transactions) {
      db->submit_transaction_sync(txn);
      txn = db->get_transaction();
      cnt = 0;
    }
  });
  if (cnt > 0) {
    db->submit_transaction_sync(txn);
  }

  // amount of repairs to report to be equal to previously
  // determined error estimation, not the actual number of updated shared blobs
  repairer.inc_repaired(sb_ref_mismatches);
}

BlueStore::OnodeRef BlueStore::fsck_check_objects_shallow(
  BlueStore::FSCKDepth depth,
  int64_t pool_id,
  BlueStore::CollectionRef c,
  const ghobject_t& oid,
  const string& key,
  const bufferlist& value,
  mempool::bluestore_fsck::list<string>* expecting_shards,
  map<BlobRef, bluestore_blob_t::unused_t>* referenced,
  const BlueStore::FSCK_ObjectCtx& ctx)
{
  auto& errors = ctx.errors;
  auto& num_objects = ctx.num_objects;
  auto& num_extents = ctx.num_extents;
  auto& num_blobs = ctx.num_blobs;
  auto& num_sharded_objects = ctx.num_sharded_objects;
  auto& num_spanning_blobs = ctx.num_spanning_blobs;
  auto used_blocks = ctx.used_blocks;
  auto sb_info_lock = ctx.sb_info_lock;
  auto& sb_info = ctx.sb_info;
  auto& sb_ref_counts = ctx.sb_ref_counts;
  auto repairer = ctx.repairer;

  store_statfs_t* res_statfs = (per_pool_stat_collection || repairer) ?
&ctx.expected_pool_statfs[pool_id] : &ctx.expected_store_statfs; dout(10) << __func__ << " " << oid << dendl; OnodeRef o; o.reset(Onode::decode(c, oid, key, value)); ++num_objects; num_spanning_blobs += o->extent_map.spanning_blob_map.size(); o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE); _dump_onode<30>(cct, *o); // shards if (!o->extent_map.shards.empty()) { ++num_sharded_objects; if (depth != FSCK_SHALLOW) { ceph_assert(expecting_shards); for (auto& s : o->extent_map.shards) { dout(20) << __func__ << " shard " << *s.shard_info << dendl; expecting_shards->push_back(string()); get_extent_shard_key(o->key, s.shard_info->offset, &expecting_shards->back()); if (s.shard_info->offset >= o->onode.size) { derr << "fsck error: " << oid << " shard 0x" << std::hex << s.shard_info->offset << " past EOF at 0x" << o->onode.size << std::dec << dendl; ++errors; } } } } // lextents uint64_t pos = 0; mempool::bluestore_fsck::map ref_map; for (auto& l : o->extent_map.extent_map) { dout(20) << __func__ << " " << l << dendl; if (l.logical_offset < pos) { derr << "fsck error: " << oid << " lextent at 0x" << std::hex << l.logical_offset << " overlaps with the previous, which ends at 0x" << pos << std::dec << dendl; ++errors; } if (depth != FSCK_SHALLOW && o->extent_map.spans_shard(l.logical_offset, l.length)) { derr << "fsck error: " << oid << " lextent at 0x" << std::hex << l.logical_offset << "~" << l.length << " spans a shard boundary" << std::dec << dendl; ++errors; } pos = l.logical_offset + l.length; res_statfs->data_stored += l.length; ceph_assert(l.blob); const bluestore_blob_t& blob = l.blob->get_blob(); auto& ref = ref_map[l.blob]; if (ref.is_empty()) { uint32_t min_release_size = blob.get_release_size(min_alloc_size); uint32_t l = blob.get_logical_length(); ref.init(l, min_release_size); } ref.get( l.blob_offset, l.length); ++num_extents; if (depth != FSCK_SHALLOW && blob.has_unused()) { ceph_assert(referenced); auto p = referenced->find(l.blob); bluestore_blob_t::unused_t* pu; if (p == referenced->end()) { pu = &(*referenced)[l.blob]; } else { pu = &p->second; } uint64_t blob_len = blob.get_logical_length(); ceph_assert((blob_len % (sizeof(*pu) * 8)) == 0); ceph_assert(l.blob_offset + l.length <= blob_len); uint64_t chunk_size = blob_len / (sizeof(*pu) * 8); uint64_t start = l.blob_offset / chunk_size; uint64_t end = round_up_to(l.blob_offset + l.length, chunk_size) / chunk_size; for (auto i = start; i < end; ++i) { (*pu) |= (1u << i); } } } //for (auto& l : o->extent_map.extent_map) for (auto& i : ref_map) { ++num_blobs; const bluestore_blob_t& blob = i.first->get_blob(); bool equal = depth == FSCK_SHALLOW ? 
true : i.first->get_blob_use_tracker().equal(i.second); if (!equal) { derr << "fsck error: " << oid << " blob " << *i.first << " doesn't match expected ref_map " << i.second << dendl; ++errors; } if (blob.is_compressed()) { res_statfs->data_compressed += blob.get_compressed_payload_length(); res_statfs->data_compressed_original += i.first->get_referenced_bytes(); } if (depth != FSCK_SHALLOW && repairer) { for (auto e : blob.get_extents()) { if (!e.is_valid()) continue; repairer->set_space_used(e.offset, e.length, c->cid, oid); } } if (blob.is_shared()) { if (i.first->shared_blob->get_sbid() > blobid_max) { derr << "fsck error: " << oid << " blob " << blob << " sbid " << i.first->shared_blob->get_sbid() << " > blobid_max " << blobid_max << dendl; ++errors; } else if (i.first->shared_blob->get_sbid() == 0) { derr << "fsck error: " << oid << " blob " << blob << " marked as shared but has uninitialized sbid" << dendl; ++errors; } // the below lock is optional and provided in multithreading mode only if (sb_info_lock) { sb_info_lock->lock(); } auto sbid = i.first->shared_blob->get_sbid(); sb_info_t& sbi = sb_info.add_or_adopt(i.first->shared_blob->get_sbid()); ceph_assert(sbi.pool_id == sb_info_t::INVALID_POOL_ID || sbi.pool_id == oid.hobj.get_logical_pool()); sbi.pool_id = oid.hobj.get_logical_pool(); bool compressed = blob.is_compressed(); for (auto e : blob.get_extents()) { if (e.is_valid()) { if (compressed) { ceph_assert(sbi.allocated_chunks <= 0); sbi.allocated_chunks -= (e.length >> min_alloc_size_order); } else { ceph_assert(sbi.allocated_chunks >= 0); sbi.allocated_chunks += (e.length >> min_alloc_size_order); } sb_ref_counts.inc_range(sbid, e.offset, e.length, 1); } } if (sb_info_lock) { sb_info_lock->unlock(); } } else if (depth != FSCK_SHALLOW) { ceph_assert(used_blocks); string ctx_descr = " oid " + stringify(oid); errors += _fsck_check_extents(ctx_descr, blob.get_extents(), blob.is_compressed(), *used_blocks, fm->get_alloc_size(), repairer, *res_statfs, depth); } else { errors += _fsck_sum_extents( blob.get_extents(), blob.is_compressed(), *res_statfs); } } // for (auto& i : ref_map) { auto &sbm = o->extent_map.spanning_blob_map; size_t broken = 0; BlobRef first_broken; for (auto it = sbm.begin(); it != sbm.end();) { auto it1 = it++; if (ref_map.count(it1->second) == 0) { if (!broken) { first_broken = it1->second; ++errors; } broken++; if (repairer) { sbm.erase(it1); } } } if (broken) { derr << "fsck error: " << oid << " - " << broken << " zombie spanning blob(s) found, the first one: " << *first_broken << dendl; if(repairer) { repairer->fix_spanning_blobs( db, [&](KeyValueDB::Transaction txn) { _record_onode(o, txn); }); } } } if (o->onode.has_omap()) { _fsck_check_object_omap(depth, o, ctx); } return o; } #include "common/WorkQueue.h" class ShallowFSCKThreadPool : public ThreadPool { public: ShallowFSCKThreadPool(CephContext* cct_, std::string nm, std::string tn, int n) : ThreadPool(cct_, nm, tn, n) { } void worker(ThreadPool::WorkThread* wt) override { int next_wq = 0; while (!_stop) { next_wq %= work_queues.size(); WorkQueue_ *wq = work_queues[next_wq++]; void* item = wq->_void_dequeue(); if (item) { processing++; TPHandle tp_handle(cct, nullptr, wq->timeout_interval, wq->suicide_interval); wq->_void_process(item, tp_handle); processing--; } } } template struct FSCKWorkQueue : public ThreadPool::WorkQueue_ { struct Entry { int64_t pool_id; BlueStore::CollectionRef c; ghobject_t oid; string key; bufferlist value; }; struct Batch { std::atomic running = { 0 }; size_t entry_count 
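/*
 * The Batch struct above uses its atomic 'running' counter as a lock-free
 * try-lock: whoever bumps it from zero owns the batch, everyone else backs
 * off.  A minimal sketch of the idiom (illustrative only):
 *
 *   std::atomic<size_t> running{0};
 *
 *   bool try_acquire() {
 *     if (running.fetch_add(1) == 0) {
 *       return true;             // sole user, the batch is ours
 *     }
 *     running.fetch_sub(1);      // contended, undo and try another batch
 *     return false;
 *   }
 *
 * _void_dequeue() and queue() below follow this pattern when picking a batch
 * to drain or to append entries to.
 */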
= 0; std::array entries; int64_t errors = 0; int64_t warnings = 0; uint64_t num_objects = 0; uint64_t num_extents = 0; uint64_t num_blobs = 0; uint64_t num_sharded_objects = 0; uint64_t num_spanning_blobs = 0; store_statfs_t expected_store_statfs; BlueStore::per_pool_statfs expected_pool_statfs; }; size_t batchCount; BlueStore* store = nullptr; ceph::mutex* sb_info_lock = nullptr; sb_info_space_efficient_map_t* sb_info = nullptr; shared_blob_2hash_tracker_t* sb_ref_counts = nullptr; BlueStoreRepairer* repairer = nullptr; Batch* batches = nullptr; size_t last_batch_pos = 0; bool batch_acquired = false; FSCKWorkQueue(std::string n, size_t _batchCount, BlueStore* _store, ceph::mutex* _sb_info_lock, sb_info_space_efficient_map_t& _sb_info, shared_blob_2hash_tracker_t& _sb_ref_counts, BlueStoreRepairer* _repairer) : WorkQueue_(n, ceph::timespan::zero(), ceph::timespan::zero()), batchCount(_batchCount), store(_store), sb_info_lock(_sb_info_lock), sb_info(&_sb_info), sb_ref_counts(&_sb_ref_counts), repairer(_repairer) { batches = new Batch[batchCount]; } ~FSCKWorkQueue() { delete[] batches; } /// Remove all work items from the queue. void _clear() override { //do nothing } /// Check whether there is anything to do. bool _empty() override { ceph_assert(false); } /// Get the next work item to process. void* _void_dequeue() override { size_t pos = rand() % batchCount; size_t pos0 = pos; do { auto& batch = batches[pos]; if (batch.running.fetch_add(1) == 0) { if (batch.entry_count) { return &batch; } } batch.running--; pos++; pos %= batchCount; } while (pos != pos0); return nullptr; } /** @brief Process the work item. * This function will be called several times in parallel * and must therefore be thread-safe. */ void _void_process(void* item, TPHandle& handle) override { Batch* batch = (Batch*)item; BlueStore::FSCK_ObjectCtx ctx( batch->errors, batch->warnings, batch->num_objects, batch->num_extents, batch->num_blobs, batch->num_sharded_objects, batch->num_spanning_blobs, nullptr, // used_blocks nullptr, //used_omap_head sb_info_lock, *sb_info, *sb_ref_counts, batch->expected_store_statfs, batch->expected_pool_statfs, repairer); for (size_t i = 0; i < batch->entry_count; i++) { auto& entry = batch->entries[i]; store->fsck_check_objects_shallow( BlueStore::FSCK_SHALLOW, entry.pool_id, entry.c, entry.oid, entry.key, entry.value, nullptr, // expecting_shards - this will need a protection if passed nullptr, // referenced ctx); } //std::cout << "processed " << batch << std::endl; batch->entry_count = 0; batch->running--; } /** @brief Synchronously finish processing a work item. * This function is called after _void_process with the global thread pool lock held, * so at most one copy will execute simultaneously for a given thread pool. * It can be used for non-thread-safe finalization. 
*/ void _void_process_finish(void*) override { ceph_assert(false); } bool queue( int64_t pool_id, BlueStore::CollectionRef c, const ghobject_t& oid, const string& key, const bufferlist& value) { bool res = false; size_t pos0 = last_batch_pos; if (!batch_acquired) { do { auto& batch = batches[last_batch_pos]; if (batch.running.fetch_add(1) == 0) { if (batch.entry_count < BatchLen) { batch_acquired = true; break; } } batch.running.fetch_sub(1); last_batch_pos++; last_batch_pos %= batchCount; } while (last_batch_pos != pos0); } if (batch_acquired) { auto& batch = batches[last_batch_pos]; ceph_assert(batch.running); ceph_assert(batch.entry_count < BatchLen); auto& entry = batch.entries[batch.entry_count]; entry.pool_id = pool_id; entry.c = c; entry.oid = oid; entry.key = key; entry.value = value; ++batch.entry_count; if (batch.entry_count == BatchLen) { batch_acquired = false; batch.running.fetch_sub(1); last_batch_pos++; last_batch_pos %= batchCount; } res = true; } return res; } void finalize(ThreadPool& tp, BlueStore::FSCK_ObjectCtx& ctx) { if (batch_acquired) { auto& batch = batches[last_batch_pos]; ceph_assert(batch.running); batch.running.fetch_sub(1); } tp.stop(); for (size_t i = 0; i < batchCount; i++) { auto& batch = batches[i]; //process leftovers if any if (batch.entry_count) { TPHandle tp_handle(store->cct, nullptr, timeout_interval, suicide_interval); ceph_assert(batch.running == 0); batch.running++; // just to be on-par with the regular call _void_process(&batch, tp_handle); } ceph_assert(batch.entry_count == 0); ctx.errors += batch.errors; ctx.warnings += batch.warnings; ctx.num_objects += batch.num_objects; ctx.num_extents += batch.num_extents; ctx.num_blobs += batch.num_blobs; ctx.num_sharded_objects += batch.num_sharded_objects; ctx.num_spanning_blobs += batch.num_spanning_blobs; ctx.expected_store_statfs.add(batch.expected_store_statfs); for (auto it = batch.expected_pool_statfs.begin(); it != batch.expected_pool_statfs.end(); it++) { ctx.expected_pool_statfs[it->first].add(it->second); } } } }; }; void BlueStore::_fsck_check_object_omap(FSCKDepth depth, OnodeRef& o, const BlueStore::FSCK_ObjectCtx& ctx) { auto& errors = ctx.errors; auto& warnings = ctx.warnings; auto repairer = ctx.repairer; ceph_assert(o->onode.has_omap()); if (!o->onode.is_perpool_omap() && !o->onode.is_pgmeta_omap()) { if (per_pool_omap == OMAP_PER_POOL) { fsck_derr(errors, MAX_FSCK_ERROR_LINES) << "fsck error: " << o->oid << " has omap that is not per-pool or pgmeta" << fsck_dendl; ++errors; } else { const char* w; int64_t num; if (cct->_conf->bluestore_fsck_error_on_no_per_pool_omap) { ++errors; num = errors; w = "error"; } else { ++warnings; num = warnings; w = "warning"; } fsck_derr(num, MAX_FSCK_ERROR_LINES) << "fsck " << w << ": " << o->oid << " has omap that is not per-pool or pgmeta" << fsck_dendl; } } else if (!o->onode.is_perpg_omap() && !o->onode.is_pgmeta_omap()) { if (per_pool_omap == OMAP_PER_PG) { fsck_derr(errors, MAX_FSCK_ERROR_LINES) << "fsck error: " << o->oid << " has omap that is not per-pg or pgmeta" << fsck_dendl; ++errors; } else { const char* w; int64_t num; if (cct->_conf->bluestore_fsck_error_on_no_per_pg_omap) { ++errors; num = errors; w = "error"; } else { ++warnings; num = warnings; w = "warning"; } fsck_derr(num, MAX_FSCK_ERROR_LINES) << "fsck " << w << ": " << o->oid << " has omap that is not per-pg or pgmeta" << fsck_dendl; } } if (repairer && !o->onode.is_perpg_omap() && !o->onode.is_pgmeta_omap()) { dout(10) << "fsck converting " << o->oid << " omap to per-pg" << dendl; 
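/*
 * The conversion below copies every omap row of this onode under the new
 * per-pool/per-pg key prefix.  To keep individual KV transactions bounded it
 * accumulates an approximate cost (key plus value bytes) and flushes every
 * ~16MB.  Roughly, assuming a plain iteration over the old rows:
 *
 *   KeyValueDB::Transaction txn = db->get_transaction();
 *   uint64_t txn_cost = 0;
 *   // for each old omap row (new_key, value):
 *   txn->set(new_omap_prefix, new_key, value);
 *   txn_cost += new_key.length() + value.length();
 *   if (txn_cost >= 16 * (1 << 20)) {
 *     db->submit_transaction_sync(txn);
 *     txn = db->get_transaction();
 *     txn_cost = 0;
 *   }
 *   // after the loop:
 *   if (txn_cost > 0)
 *     db->submit_transaction_sync(txn);
 *
 * Only after all rows are copied are the legacy keys removed and the onode
 * flags/record updated in a final transaction.
 */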
bufferlist header; map kv; { KeyValueDB::Transaction txn = db->get_transaction(); uint64_t txn_cost = 0; const string& prefix = Onode::calc_omap_prefix(o->onode.flags); uint8_t new_flags = o->onode.flags | bluestore_onode_t::FLAG_PERPOOL_OMAP | bluestore_onode_t::FLAG_PERPG_OMAP; const string& new_omap_prefix = Onode::calc_omap_prefix(new_flags); KeyValueDB::Iterator it = db->get_iterator(prefix); string head, tail; o->get_omap_header(&head); o->get_omap_tail(&tail); it->lower_bound(head); // head if (it->valid() && it->key() == head) { dout(30) << __func__ << " got header" << dendl; header = it->value(); if (header.length()) { string new_head; Onode::calc_omap_header(new_flags, o.get(), &new_head); txn->set(new_omap_prefix, new_head, header); txn_cost += new_head.length() + header.length(); } it->next(); } // tail { string new_tail; Onode::calc_omap_tail(new_flags, o.get(), &new_tail); bufferlist empty; txn->set(new_omap_prefix, new_tail, empty); txn_cost += new_tail.length() + new_tail.length(); } // values string final_key; Onode::calc_omap_key(new_flags, o.get(), string(), &final_key); size_t base_key_len = final_key.size(); while (it->valid() && it->key() < tail) { string user_key; o->decode_omap_key(it->key(), &user_key); dout(20) << __func__ << " got " << pretty_binary_string(it->key()) << " -> " << user_key << dendl; final_key.resize(base_key_len); final_key += user_key; auto v = it->value(); txn->set(new_omap_prefix, final_key, v); txn_cost += final_key.length() + v.length(); // submit a portion if cost exceeds 16MB if (txn_cost >= 16 * (1 << 20) ) { db->submit_transaction_sync(txn); txn = db->get_transaction(); txn_cost = 0; } it->next(); } if (txn_cost > 0) { db->submit_transaction_sync(txn); } } // finalize: remove legacy data { KeyValueDB::Transaction txn = db->get_transaction(); // remove old keys const string& old_omap_prefix = o->get_omap_prefix(); string old_head, old_tail; o->get_omap_header(&old_head); o->get_omap_tail(&old_tail); txn->rm_range_keys(old_omap_prefix, old_head, old_tail); txn->rmkey(old_omap_prefix, old_tail); // set flag o->onode.set_flag(bluestore_onode_t::FLAG_PERPOOL_OMAP | bluestore_onode_t::FLAG_PERPG_OMAP); _record_onode(o, txn); db->submit_transaction_sync(txn); repairer->inc_repaired(); repairer->request_compaction(); } } } void BlueStore::_fsck_check_objects(FSCKDepth depth, BlueStore::FSCK_ObjectCtx& ctx) { auto& errors = ctx.errors; auto sb_info_lock = ctx.sb_info_lock; auto& sb_info = ctx.sb_info; auto& sb_ref_counts = ctx.sb_ref_counts; auto repairer = ctx.repairer; uint64_t_btree_t used_nids; size_t processed_myself = 0; auto it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE); mempool::bluestore_fsck::list expecting_shards; if (it) { const size_t thread_count = cct->_conf->bluestore_fsck_quick_fix_threads; typedef ShallowFSCKThreadPool::FSCKWorkQueue<256> WQ; std::unique_ptr wq( new WQ( "FSCKWorkQueue", (thread_count ? 
: 1) * 32, this, sb_info_lock, sb_info, sb_ref_counts, repairer)); ShallowFSCKThreadPool thread_pool(cct, "ShallowFSCKThreadPool", "ShallowFSCK", thread_count); thread_pool.add_work_queue(wq.get()); if (depth == FSCK_SHALLOW && thread_count > 0) { //not the best place but let's check anyway ceph_assert(sb_info_lock); thread_pool.start(); } //fill global if not overriden below CollectionRef c; int64_t pool_id = -1; spg_t pgid; for (it->lower_bound(string()); it->valid(); it->next()) { dout(30) << __func__ << " key " << pretty_binary_string(it->key()) << dendl; if (is_extent_shard_key(it->key())) { if (depth == FSCK_SHALLOW) { continue; } while (!expecting_shards.empty() && expecting_shards.front() < it->key()) { derr << "fsck error: missing shard key " << pretty_binary_string(expecting_shards.front()) << dendl; ++errors; expecting_shards.pop_front(); } if (!expecting_shards.empty() && expecting_shards.front() == it->key()) { // all good expecting_shards.pop_front(); continue; } uint32_t offset; string okey; get_key_extent_shard(it->key(), &okey, &offset); derr << "fsck error: stray shard 0x" << std::hex << offset << std::dec << dendl; if (expecting_shards.empty()) { derr << "fsck error: " << pretty_binary_string(it->key()) << " is unexpected" << dendl; ++errors; continue; } while (expecting_shards.front() > it->key()) { derr << "fsck error: saw " << pretty_binary_string(it->key()) << dendl; derr << "fsck error: exp " << pretty_binary_string(expecting_shards.front()) << dendl; ++errors; expecting_shards.pop_front(); if (expecting_shards.empty()) { break; } } continue; } ghobject_t oid; int r = get_key_object(it->key(), &oid); if (r < 0) { derr << "fsck error: bad object key " << pretty_binary_string(it->key()) << dendl; ++errors; continue; } if (!c || oid.shard_id != pgid.shard || oid.hobj.get_logical_pool() != (int64_t)pgid.pool() || !c->contains(oid)) { c = nullptr; for (auto& p : coll_map) { if (p.second->contains(oid)) { c = p.second; break; } } if (!c) { derr << "fsck error: stray object " << oid << " not owned by any collection" << dendl; ++errors; continue; } pool_id = c->cid.is_pg(&pgid) ? 
pgid.pool() : META_POOL_ID; dout(20) << __func__ << " collection " << c->cid << " " << c->cnode << dendl; } if (depth != FSCK_SHALLOW && !expecting_shards.empty()) { for (auto& k : expecting_shards) { derr << "fsck error: missing shard key " << pretty_binary_string(k) << dendl; } ++errors; expecting_shards.clear(); } bool queued = false; if (depth == FSCK_SHALLOW && thread_count > 0) { queued = wq->queue( pool_id, c, oid, it->key(), it->value()); } OnodeRef o; map referenced; if (!queued) { ++processed_myself; o = fsck_check_objects_shallow( depth, pool_id, c, oid, it->key(), it->value(), &expecting_shards, &referenced, ctx); } if (depth != FSCK_SHALLOW) { ceph_assert(o != nullptr); if (o->onode.nid) { if (o->onode.nid > nid_max) { derr << "fsck error: " << oid << " nid " << o->onode.nid << " > nid_max " << nid_max << dendl; ++errors; } if (used_nids.count(o->onode.nid)) { derr << "fsck error: " << oid << " nid " << o->onode.nid << " already in use" << dendl; ++errors; continue; // go for next object } used_nids.insert(o->onode.nid); } for (auto& i : referenced) { dout(20) << __func__ << " referenced 0x" << std::hex << i.second << std::dec << " for " << *i.first << dendl; const bluestore_blob_t& blob = i.first->get_blob(); if (i.second & blob.unused) { derr << "fsck error: " << oid << " blob claims unused 0x" << std::hex << blob.unused << " but extents reference 0x" << i.second << std::dec << " on blob " << *i.first << dendl; ++errors; } if (blob.has_csum()) { uint64_t blob_len = blob.get_logical_length(); uint64_t unused_chunk_size = blob_len / (sizeof(blob.unused) * 8); unsigned csum_count = blob.get_csum_count(); unsigned csum_chunk_size = blob.get_csum_chunk_size(); for (unsigned p = 0; p < csum_count; ++p) { unsigned pos = p * csum_chunk_size; unsigned firstbit = pos / unused_chunk_size; // [firstbit,lastbit] unsigned lastbit = (pos + csum_chunk_size - 1) / unused_chunk_size; unsigned mask = 1u << firstbit; for (unsigned b = firstbit + 1; b <= lastbit; ++b) { mask |= 1u << b; } if ((blob.unused & mask) == mask) { // this csum chunk region is marked unused if (blob.get_csum_item(p) != 0) { derr << "fsck error: " << oid << " blob claims csum chunk 0x" << std::hex << pos << "~" << csum_chunk_size << " is unused (mask 0x" << mask << " of unused 0x" << blob.unused << ") but csum is non-zero 0x" << blob.get_csum_item(p) << std::dec << " on blob " << *i.first << dendl; ++errors; } } } } } // omap if (o->onode.has_omap()) { ceph_assert(ctx.used_omap_head); if (ctx.used_omap_head->count(o->onode.nid)) { derr << "fsck error: " << o->oid << " omap_head " << o->onode.nid << " already in use" << dendl; ++errors; } else { ctx.used_omap_head->insert(o->onode.nid); } } // if (o->onode.has_omap()) if (depth == FSCK_DEEP) { bufferlist bl; uint64_t max_read_block = cct->_conf->bluestore_fsck_read_bytes_cap; uint64_t offset = 0; do { uint64_t l = std::min(uint64_t(o->onode.size - offset), max_read_block); int r = _do_read(c.get(), o, offset, l, bl, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE); if (r < 0) { ++errors; derr << "fsck error: " << oid << std::hex << " error during read: " << " " << offset << "~" << l << " " << cpp_strerror(r) << std::dec << dendl; break; } offset += l; } while (offset < o->onode.size); } // deep } //if (depth != FSCK_SHALLOW) } // for (it->lower_bound(string()); it->valid(); it->next()) if (depth == FSCK_SHALLOW && thread_count > 0) { wq->finalize(thread_pool, ctx); if (processed_myself) { // may be needs more threads? 
dout(0) << __func__ << " partial offload" << ", done myself " << processed_myself << " of " << ctx.num_objects << "objects, threads " << thread_count << dendl; } } } // if (it) } /** An overview for currently implemented repair logics performed in fsck in two stages: detection(+preparation) and commit. Detection stage (in processing order): (Issue -> Repair action to schedule) - Detect undecodable keys for Shared Blobs -> Remove - Detect undecodable records for Shared Blobs -> Remove (might trigger missed Shared Blob detection below) - Detect stray records for Shared Blobs -> Remove - Detect misreferenced pextents -> Fix Prepare Bloom-like filter to track cid/oid -> pextent Prepare list of extents that are improperly referenced Enumerate Onode records that might use 'misreferenced' pextents (Bloom-like filter applied to reduce computation) Per each questinable Onode enumerate all blobs and identify broken ones (i.e. blobs having 'misreferences') Rewrite each broken blob data by allocating another extents and copying data there If blob is shared - unshare it and mark corresponding Shared Blob for removal Release previously allocated space Update Extent Map - Detect missed Shared Blobs -> Recreate - Detect undecodable deferred transaction -> Remove - Detect Freelist Manager's 'false free' entries -> Mark as used - Detect Freelist Manager's leaked entries -> Mark as free - Detect statfs inconsistency - Update Commit stage (separate DB commit per each step): - Apply leaked FM entries fix - Apply 'false free' FM entries fix - Apply 'Remove' actions - Apply fix for misreference pextents - Apply Shared Blob recreate (can be merged with the step above if misreferences were dectected) - Apply StatFS update */ int BlueStore::_fsck(BlueStore::FSCKDepth depth, bool repair) { dout(1) << __func__ << (repair ? " repair" : " check") << (depth == FSCK_DEEP ? " (deep)" : depth == FSCK_SHALLOW ? " (shallow)" : " (regular)") << dendl; // in deep mode we need R/W write access to be able to replay deferred ops bool read_only = !(repair || depth == FSCK_DEEP); int r = _open_db_and_around(read_only); if (r < 0) return r; if (!read_only) { r = _upgrade_super(); if (r < 0) { goto out_db; } } r = _open_collections(); if (r < 0) goto out_db; mempool_thread.init(); // we need finisher and kv_{sync,finalize}_thread *just* for replay // enable in repair or deep mode modes only if (!read_only) { _kv_start(); r = _deferred_replay(); _kv_stop(); } if (r < 0) goto out_scan; r = _fsck_on_open(depth, repair); out_scan: mempool_thread.shutdown(); _shutdown_cache(); out_db: _close_db_and_around(false); return r; } int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair) { uint64_t sb_hash_size = uint64_t( cct->_conf.get_val("osd_memory_target") * cct->_conf.get_val( "bluestore_fsck_shared_blob_tracker_size")); dout(1) << __func__ << " <<>>" << (repair ? " repair" : " check") << (depth == FSCK_DEEP ? " (deep)" : depth == FSCK_SHALLOW ? 
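/*
 * As the overview above describes, fixes are mostly *scheduled* on a
 * BlueStoreRepairer during the detection passes and written out in the
 * commit stage.  A rough sketch of that flow (bad_key/statfs values are
 * placeholders):
 *
 *   BlueStoreRepairer repairer;
 *   // detection: record what needs fixing while scanning
 *   repairer.remove_key(db, PREFIX_STAT, bad_key);
 *   repairer.fix_statfs(db, statfs_key, expected_statfs);
 *   // commit: apply the scheduled fixes
 *   unsigned repaired = repairer.apply(db);
 *
 * _fsck_on_open() drives its repairs through this pattern.
 */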
" (shallow)" : " (regular)") << " start sb_tracker_hash_size:" << sb_hash_size << dendl; int64_t errors = 0; int64_t warnings = 0; unsigned repaired = 0; uint64_t_btree_t used_omap_head; uint64_t_btree_t used_sbids; mempool_dynamic_bitset used_blocks, bluefs_used_blocks; KeyValueDB::Iterator it; store_statfs_t expected_store_statfs, actual_statfs; per_pool_statfs expected_pool_statfs; sb_info_space_efficient_map_t sb_info; shared_blob_2hash_tracker_t sb_ref_counts( sb_hash_size, min_alloc_size); size_t sb_ref_mismatches = 0; uint64_t num_objects = 0; uint64_t num_extents = 0; uint64_t num_blobs = 0; uint64_t num_spanning_blobs = 0; uint64_t num_shared_blobs = 0; uint64_t num_sharded_objects = 0; BlueStoreRepairer repairer; auto alloc_size = fm->get_alloc_size(); utime_t start = ceph_clock_now(); _fsck_collections(&errors); used_blocks.resize(fm->get_alloc_units()); if (bluefs) { interval_set bluefs_extents; int r = bluefs->get_block_extents(bluefs_layout.shared_bdev, &bluefs_extents); ceph_assert(r == 0); for (auto [start, len] : bluefs_extents) { apply_for_bitset_range(start, len, alloc_size, used_blocks, [&](uint64_t pos, mempool_dynamic_bitset& bs) { ceph_assert(pos < bs.size()); bs.set(pos); } ); } } bluefs_used_blocks = used_blocks; apply_for_bitset_range( 0, std::max(min_alloc_size, SUPER_RESERVED), alloc_size, used_blocks, [&](uint64_t pos, mempool_dynamic_bitset &bs) { bs.set(pos); } ); if (repair) { repairer.init_space_usage_tracker( bdev->get_size(), min_alloc_size); } if (bluefs) { int r = bluefs->fsck(); if (r < 0) { return r; } if (r > 0) errors += r; } if (!per_pool_stat_collection) { const char *w; if (cct->_conf->bluestore_fsck_error_on_no_per_pool_stats) { w = "error"; ++errors; } else { w = "warning"; ++warnings; } derr << "fsck " << w << ": store not yet converted to per-pool stats" << dendl; } if (per_pool_omap != OMAP_PER_PG) { const char *w; if (cct->_conf->bluestore_fsck_error_on_no_per_pool_omap) { w = "error"; ++errors; } else { w = "warning"; ++warnings; } derr << "fsck " << w << ": store not yet converted to per-pg omap" << dendl; } // get expected statfs; reset unaffected fields to be able to compare // structs statfs(&actual_statfs); actual_statfs.total = 0; actual_statfs.internally_reserved = 0; actual_statfs.available = 0; actual_statfs.internal_metadata = 0; actual_statfs.omap_allocated = 0; if (g_conf()->bluestore_debug_fsck_abort) { dout(1) << __func__ << " debug abort" << dendl; goto out_scan; } dout(1) << __func__ << " checking shared_blobs (phase 1)" << dendl; it = db->get_iterator(PREFIX_SHARED_BLOB, KeyValueDB::ITERATOR_NOCACHE); if (it) { for (it->lower_bound(string()); it->valid(); it->next()) { string key = it->key(); uint64_t sbid; if (get_key_shared_blob(key, &sbid) < 0) { // Failed to parse the key. 
// This gonna to be handled at the second stage continue; } bluestore_shared_blob_t shared_blob(sbid); bufferlist bl = it->value(); auto blp = bl.cbegin(); try { decode(shared_blob, blp); } catch (ceph::buffer::error& e) { // this gonna to be handled at the second stage continue; } dout(20) << __func__ << " " << shared_blob << dendl; auto& sbi = sb_info.add_maybe_stray(sbid); // primarily to silent the 'unused' warning ceph_assert(sbi.pool_id == sb_info_t::INVALID_POOL_ID); for (auto& r : shared_blob.ref_map.ref_map) { sb_ref_counts.inc_range( sbid, r.first, r.second.length, -r.second.refs); } } } // if (it) //checking shared_blobs (phase1) // walk PREFIX_OBJ { dout(1) << __func__ << " walking object keyspace" << dendl; ceph::mutex sb_info_lock = ceph::make_mutex("BlueStore::fsck::sbinfo_lock"); BlueStore::FSCK_ObjectCtx ctx( errors, warnings, num_objects, num_extents, num_blobs, num_sharded_objects, num_spanning_blobs, &used_blocks, &used_omap_head, //no need for the below lock when in non-shallow mode as // there is no multithreading in this case depth == FSCK_SHALLOW ? &sb_info_lock : nullptr, sb_info, sb_ref_counts, expected_store_statfs, expected_pool_statfs, repair ? &repairer : nullptr); _fsck_check_objects(depth, ctx); } sb_ref_mismatches = sb_ref_counts.count_non_zero(); if (sb_ref_mismatches != 0) { derr << "fsck error: shared blob references aren't matching, at least " << sb_ref_mismatches << " found" << dendl; errors += sb_ref_mismatches; } if (depth != FSCK_SHALLOW && repair) { _fsck_repair_shared_blobs(repairer, sb_ref_counts, sb_info); } dout(1) << __func__ << " checking shared_blobs (phase 2)" << dendl; it = db->get_iterator(PREFIX_SHARED_BLOB, KeyValueDB::ITERATOR_NOCACHE); if (it) { // FIXME minor: perhaps simplify for shallow mode? // fill global if not overriden below auto expected_statfs = &expected_store_statfs; for (it->lower_bound(string()); it->valid(); it->next()) { string key = it->key(); uint64_t sbid; if (get_key_shared_blob(key, &sbid)) { derr << "fsck error: bad key '" << key << "' in shared blob namespace" << dendl; if (repair) { repairer.remove_key(db, PREFIX_SHARED_BLOB, key); } ++errors; continue; } auto p = sb_info.find(sbid); if (p == sb_info.end()) { if (sb_ref_mismatches > 0) { // highly likely this has been already reported before, ignoring... dout(5) << __func__ << " found duplicate(?) 
stray shared blob data for sbid 0x" << std::hex << sbid << std::dec << dendl; } else { derr<< "fsck error: found stray shared blob data for sbid 0x" << std::hex << sbid << std::dec << dendl; ++errors; if (repair) { repairer.remove_key(db, PREFIX_SHARED_BLOB, key); } } } else { ++num_shared_blobs; sb_info_t& sbi = *p; bluestore_shared_blob_t shared_blob(sbid); bufferlist bl = it->value(); auto blp = bl.cbegin(); try { decode(shared_blob, blp); } catch (ceph::buffer::error& e) { ++errors; derr << "fsck error: failed to decode Shared Blob" << pretty_binary_string(key) << dendl; if (repair) { dout(20) << __func__ << " undecodable Shared Blob, key:'" << pretty_binary_string(key) << "', removing" << dendl; repairer.remove_key(db, PREFIX_SHARED_BLOB, key); } continue; } dout(20) << __func__ << " " << shared_blob << dendl; PExtentVector extents; for (auto& r : shared_blob.ref_map.ref_map) { extents.emplace_back(bluestore_pextent_t(r.first, r.second.length)); } if (sbi.pool_id != sb_info_t::INVALID_POOL_ID && (per_pool_stat_collection || repair)) { expected_statfs = &expected_pool_statfs[sbi.pool_id]; } std::stringstream ss; ss << "sbid 0x" << std::hex << sbid << std::dec; errors += _fsck_check_extents(ss.str(), extents, sbi.allocated_chunks < 0, used_blocks, fm->get_alloc_size(), repair ? &repairer : nullptr, *expected_statfs, depth); } } } // if (it) /* checking shared_blobs (phase 2)*/ if (repair && repairer.preprocess_misreference(db)) { dout(1) << __func__ << " sorting out misreferenced extents" << dendl; auto& misref_extents = repairer.get_misreferences(); interval_set to_release; it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE); if (it) { // fill global if not overriden below auto expected_statfs = &expected_store_statfs; CollectionRef c; spg_t pgid; KeyValueDB::Transaction txn = repairer.get_fix_misreferences_txn(); bool bypass_rest = false; for (it->lower_bound(string()); it->valid() && !bypass_rest; it->next()) { dout(30) << __func__ << " key " << pretty_binary_string(it->key()) << dendl; if (is_extent_shard_key(it->key())) { continue; } ghobject_t oid; int r = get_key_object(it->key(), &oid); if (r < 0 || !repairer.is_used(oid)) { continue; } if (!c || oid.shard_id != pgid.shard || oid.hobj.get_logical_pool() != (int64_t)pgid.pool() || !c->contains(oid)) { c = nullptr; for (auto& p : coll_map) { if (p.second->contains(oid)) { c = p.second; break; } } if (!c) { continue; } if (per_pool_stat_collection || repair) { auto pool_id = c->cid.is_pg(&pgid) ? pgid.pool() : META_POOL_ID; expected_statfs = &expected_pool_statfs[pool_id]; } } if (!repairer.is_used(c->cid)) { continue; } dout(20) << __func__ << " check misreference for col:" << c->cid << " obj:" << oid << dendl; OnodeRef o; o.reset(Onode::decode(c, oid, it->key(), it->value())); o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE); mempool::bluestore_fsck::set blobs; for (auto& e : o->extent_map.extent_map) { blobs.insert(e.blob); } bool need_onode_update = false; bool first_dump = true; for(auto b : blobs) { bool broken_blob = false; auto& pextents = b->dirty_blob().dirty_extents(); for (auto& e : pextents) { if (!e.is_valid()) { continue; } // for the sake of simplicity and proper shared blob handling // always rewrite the whole blob even when it's partially // misreferenced. 
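/*
 * Condensed view of the rewrite performed below for a blob touching a
 * misreferenced region (illustrative, error handling omitted):
 *
 *   PExtentVector exts;
 *   shared_alloc.a->allocate(e->length, min_alloc_size, 0, 0, &exts); // new space
 *   bdev->read(e->offset, e->length, &bl, &ioc, false);               // old data
 *   // splice exts into the blob in place of the old pextent and write the
 *   // data to the new location via map_bl()/bdev->write()
 *   for (auto& p : exts) {
 *     fm->allocate(p.offset, p.length, txn);                          // mark used
 *   }
 *   // the replaced pextents are collected in pext_to_release and released
 *   // once the blob (and its shared-blob record, if any) has been dealt with
 */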
if (misref_extents.intersects(e.offset, e.length)) { if (first_dump) { first_dump = false; _dump_onode<10>(cct, *o); } broken_blob = true; break; } } if (!broken_blob) continue; bool compressed = b->get_blob().is_compressed(); need_onode_update = true; dout(10) << __func__ << " fix misreferences in oid:" << oid << " " << *b << dendl; uint64_t b_off = 0; PExtentVector pext_to_release; pext_to_release.reserve(pextents.size()); // rewriting all valid pextents for (auto e = pextents.begin(); e != pextents.end(); e++) { auto b_off_cur = b_off; b_off += e->length; if (!e->is_valid()) { continue; } PExtentVector exts; int64_t alloc_len = shared_alloc.a->allocate(e->length, min_alloc_size, 0, 0, &exts); if (alloc_len < 0 || alloc_len < (int64_t)e->length) { derr << __func__ << " failed to allocate 0x" << std::hex << e->length << " allocated 0x " << (alloc_len < 0 ? 0 : alloc_len) << " min_alloc_size 0x" << min_alloc_size << " available 0x " << shared_alloc.a->get_free() << std::dec << dendl; if (alloc_len > 0) { shared_alloc.a->release(exts); } bypass_rest = true; break; } expected_statfs->allocated += e->length; if (compressed) { expected_statfs->data_compressed_allocated += e->length; } bufferlist bl; IOContext ioc(cct, NULL, true); // allow EIO r = bdev->read(e->offset, e->length, &bl, &ioc, false); if (r < 0) { derr << __func__ << " failed to read from 0x" << std::hex << e->offset <<"~" << e->length << std::dec << dendl; ceph_abort_msg("read failed, wtf"); } pext_to_release.push_back(*e); e = pextents.erase(e); e = pextents.insert(e, exts.begin(), exts.end()); b->get_blob().map_bl( b_off_cur, bl, [&](uint64_t offset, bufferlist& t) { int r = bdev->write(offset, t, false); ceph_assert(r == 0); }); e += exts.size() - 1; for (auto& p : exts) { fm->allocate(p.offset, p.length, txn); } } // for (auto e = pextents.begin(); e != pextents.end(); e++) { if (b->get_blob().is_shared()) { b->dirty_blob().clear_flag(bluestore_blob_t::FLAG_SHARED); auto sbid = b->shared_blob->get_sbid(); auto sb_it = sb_info.find(sbid); ceph_assert(sb_it != sb_info.end()); sb_info_t& sbi = *sb_it; if (sbi.allocated_chunks < 0) { // NB: it's crucial to use compressed_allocated_chunks from sb_info_t // as we originally used that value while accumulating // expected_statfs expected_statfs->allocated -= uint64_t(-sbi.allocated_chunks) << min_alloc_size_order; expected_statfs->data_compressed_allocated -= uint64_t(-sbi.allocated_chunks) << min_alloc_size_order; } else { expected_statfs->allocated -= uint64_t(sbi.allocated_chunks) << min_alloc_size_order; } sbi.allocated_chunks = 0; repairer.fix_shared_blob(txn, sbid, nullptr, 0); // relying on blob's pextents to decide what to release. 
for (auto& p : pext_to_release) { to_release.union_insert(p.offset, p.length); } } else { for (auto& p : pext_to_release) { expected_statfs->allocated -= p.length; if (compressed) { expected_statfs->data_compressed_allocated -= p.length; } to_release.union_insert(p.offset, p.length); } } if (bypass_rest) { break; } } // for(auto b : blobs) if (need_onode_update) { o->extent_map.dirty_range(0, OBJECT_MAX_SIZE); _record_onode(o, txn); } } // for (it->lower_bound(string()); it->valid(); it->next()) for (auto it = to_release.begin(); it != to_release.end(); ++it) { dout(10) << __func__ << " release 0x" << std::hex << it.get_start() << "~" << it.get_len() << std::dec << dendl; fm->release(it.get_start(), it.get_len(), txn); } shared_alloc.a->release(to_release); to_release.clear(); } // if (it) { } //if (repair && repairer.preprocess_misreference()) { sb_info.clear(); sb_ref_counts.reset(); // check global stats only if fscking (not repairing) w/o per-pool stats if (!per_pool_stat_collection && !repair && !(actual_statfs == expected_store_statfs)) { derr << "fsck error: actual " << actual_statfs << " != expected " << expected_store_statfs << dendl; if (repair) { repairer.fix_statfs(db, BLUESTORE_GLOBAL_STATFS_KEY, expected_store_statfs); } ++errors; } dout(1) << __func__ << " checking pool_statfs" << dendl; _fsck_check_pool_statfs(expected_pool_statfs, errors, warnings, repair ? &repairer : nullptr); if (depth != FSCK_SHALLOW) { dout(1) << __func__ << " checking for stray omap data " << dendl; it = db->get_iterator(PREFIX_OMAP, KeyValueDB::ITERATOR_NOCACHE); if (it) { uint64_t last_omap_head = 0; for (it->lower_bound(string()); it->valid(); it->next()) { uint64_t omap_head; _key_decode_u64(it->key().c_str(), &omap_head); if (used_omap_head.count(omap_head) == 0 && omap_head != last_omap_head) { fsck_derr(errors, MAX_FSCK_ERROR_LINES) << "fsck error: found stray omap data on omap_head " << omap_head << " " << last_omap_head << " " << used_omap_head.count(omap_head) << fsck_dendl; ++errors; last_omap_head = omap_head; } } } it = db->get_iterator(PREFIX_PGMETA_OMAP, KeyValueDB::ITERATOR_NOCACHE); if (it) { uint64_t last_omap_head = 0; for (it->lower_bound(string()); it->valid(); it->next()) { uint64_t omap_head; _key_decode_u64(it->key().c_str(), &omap_head); if (used_omap_head.count(omap_head) == 0 && omap_head != last_omap_head) { fsck_derr(errors, MAX_FSCK_ERROR_LINES) << "fsck error: found stray (pgmeta) omap data on omap_head " << omap_head << " " << last_omap_head << " " << used_omap_head.count(omap_head) << fsck_dendl; last_omap_head = omap_head; ++errors; } } } it = db->get_iterator(PREFIX_PERPOOL_OMAP, KeyValueDB::ITERATOR_NOCACHE); if (it) { uint64_t last_omap_head = 0; for (it->lower_bound(string()); it->valid(); it->next()) { uint64_t pool; uint64_t omap_head; string k = it->key(); const char *c = k.c_str(); c = _key_decode_u64(c, &pool); c = _key_decode_u64(c, &omap_head); if (used_omap_head.count(omap_head) == 0 && omap_head != last_omap_head) { fsck_derr(errors, MAX_FSCK_ERROR_LINES) << "fsck error: found stray (per-pool) omap data on omap_head " << omap_head << " " << last_omap_head << " " << used_omap_head.count(omap_head) << fsck_dendl; ++errors; last_omap_head = omap_head; } } } it = db->get_iterator(PREFIX_PERPG_OMAP, KeyValueDB::ITERATOR_NOCACHE); if (it) { uint64_t last_omap_head = 0; for (it->lower_bound(string()); it->valid(); it->next()) { uint64_t pool; uint32_t hash; uint64_t omap_head; string k = it->key(); const char* c = k.c_str(); c = _key_decode_u64(c, &pool); c = 
_key_decode_u32(c, &hash); c = _key_decode_u64(c, &omap_head); if (used_omap_head.count(omap_head) == 0 && omap_head != last_omap_head) { fsck_derr(errors, MAX_FSCK_ERROR_LINES) << "fsck error: found stray (per-pg) omap data on omap_head " << omap_head << " " << last_omap_head << " " << used_omap_head.count(omap_head) << fsck_dendl; ++errors; last_omap_head = omap_head; } } } dout(1) << __func__ << " checking deferred events" << dendl; it = db->get_iterator(PREFIX_DEFERRED, KeyValueDB::ITERATOR_NOCACHE); if (it) { for (it->lower_bound(string()); it->valid(); it->next()) { bufferlist bl = it->value(); auto p = bl.cbegin(); bluestore_deferred_transaction_t wt; try { decode(wt, p); } catch (ceph::buffer::error& e) { derr << "fsck error: failed to decode deferred txn " << pretty_binary_string(it->key()) << dendl; if (repair) { dout(20) << __func__ << " undecodable deferred TXN record, key: '" << pretty_binary_string(it->key()) << "', removing" << dendl; repairer.remove_key(db, PREFIX_DEFERRED, it->key()); } continue; } dout(20) << __func__ << " deferred " << wt.seq << " ops " << wt.ops.size() << " released 0x" << std::hex << wt.released << std::dec << dendl; for (auto e = wt.released.begin(); e != wt.released.end(); ++e) { apply_for_bitset_range( e.get_start(), e.get_len(), alloc_size, used_blocks, [&](uint64_t pos, mempool_dynamic_bitset &bs) { bs.set(pos); } ); } } } dout(1) << __func__ << " checking freelist vs allocated" << dendl; { fm->enumerate_reset(); uint64_t offset, length; while (fm->enumerate_next(db, &offset, &length)) { bool intersects = false; apply_for_bitset_range( offset, length, alloc_size, used_blocks, [&](uint64_t pos, mempool_dynamic_bitset &bs) { ceph_assert(pos < bs.size()); if (bs.test(pos) && !bluefs_used_blocks.test(pos)) { if (offset == SUPER_RESERVED && length == min_alloc_size - SUPER_RESERVED) { // this is due to the change just after luminous to min_alloc_size // granularity allocations, and our baked in assumption at the top // of _fsck that 0~round_up_to(SUPER_RESERVED,min_alloc_size) is used // (vs luminous's round_up_to(SUPER_RESERVED,block_size)). harmless, // since we will never allocate this region below min_alloc_size. 
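/*
 * Worked example of the special case above, assuming a 0x2000 (8 KiB)
 * SUPER_RESERVED and a 0x10000 (64 KiB) min_alloc_size: fsck marks 0x0~0x10000
 * as used up front, while a store created before that change may legitimately
 * have 0x2000~0xe000 recorded as free in the freelist.  That single extent
 * (offset == SUPER_RESERVED, length == min_alloc_size - SUPER_RESERVED) is
 * ignored here; any other free extent overlapping used blocks is reported as
 * a "false free" error and, in repair mode, fixed via repairer.fix_false_free().
 */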
dout(10) << __func__ << " ignoring free extent between SUPER_RESERVED" << " and min_alloc_size, 0x" << std::hex << offset << "~" << length << std::dec << dendl; } else { intersects = true; if (repair) { repairer.fix_false_free(db, fm, pos * min_alloc_size, min_alloc_size); } } } else { bs.set(pos); } } ); if (intersects) { derr << "fsck error: free extent 0x" << std::hex << offset << "~" << length << std::dec << " intersects allocated blocks" << dendl; ++errors; } } fm->enumerate_reset(); size_t count = used_blocks.count(); if (used_blocks.size() != count) { ceph_assert(used_blocks.size() > count); used_blocks.flip(); size_t start = used_blocks.find_first(); while (start != decltype(used_blocks)::npos) { size_t cur = start; while (true) { size_t next = used_blocks.find_next(cur); if (next != cur + 1) { ++errors; derr << "fsck error: leaked extent 0x" << std::hex << ((uint64_t)start * fm->get_alloc_size()) << "~" << ((cur + 1 - start) * fm->get_alloc_size()) << std::dec << dendl; if (repair) { repairer.fix_leaked(db, fm, start * min_alloc_size, (cur + 1 - start) * min_alloc_size); } start = next; break; } cur = next; } } used_blocks.flip(); } } } if (repair) { if (per_pool_omap != OMAP_PER_PG) { dout(5) << __func__ << " fixing per_pg_omap" << dendl; repairer.fix_per_pool_omap(db, OMAP_PER_PG); } dout(5) << __func__ << " applying repair results" << dendl; repaired = repairer.apply(db); dout(5) << __func__ << " repair applied" << dendl; } out_scan: dout(2) << __func__ << " " << num_objects << " objects, " << num_sharded_objects << " of them sharded. " << dendl; dout(2) << __func__ << " " << num_extents << " extents to " << num_blobs << " blobs, " << num_spanning_blobs << " spanning, " << num_shared_blobs << " shared." << dendl; utime_t duration = ceph_clock_now() - start; dout(1) << __func__ << " <<>> with " << errors << " errors, " << warnings << " warnings, " << repaired << " repaired, " << (errors + warnings - (int)repaired) << " remaining in " << duration << " seconds" << dendl; // In non-repair mode we should return error count only as // it indicates if store status is OK. // In repair mode both errors and warnings are taken into account // since repaired counter relates to them both. return repair ? 
    errors + warnings - (int)repaired : errors;
}

/// methods to inject various errors fsck can repair
void BlueStore::inject_broken_shared_blob_key(const string& key,
  const bufferlist& bl)
{
  KeyValueDB::Transaction txn;
  txn = db->get_transaction();
  txn->set(PREFIX_SHARED_BLOB, key, bl);
  db->submit_transaction_sync(txn);
};

void BlueStore::inject_no_shared_blob_key()
{
  KeyValueDB::Transaction txn;
  txn = db->get_transaction();
  ceph_assert(blobid_last > 0);
  // kill the last used sbid, this can be broken due to blobid preallocation
  // in rare cases, leaving as-is for the sake of simplicity
  uint64_t sbid = blobid_last;
  string key;
  dout(5) << __func__ << " " << sbid << dendl;
  get_shared_blob_key(sbid, &key);
  txn->rmkey(PREFIX_SHARED_BLOB, key);
  db->submit_transaction_sync(txn);
};

void BlueStore::inject_stray_shared_blob_key(uint64_t sbid)
{
  KeyValueDB::Transaction txn;
  txn = db->get_transaction();

  dout(5) << __func__ << " " << sbid << dendl;

  string key;
  get_shared_blob_key(sbid, &key);
  bluestore_shared_blob_t persistent(sbid);
  persistent.ref_map.get(0xdead0000, 0x1000);
  bufferlist bl;
  encode(persistent, bl);
  dout(20) << __func__ << " sbid " << sbid
           << " takes " << bl.length() << " bytes, updating"
           << dendl;
  txn->set(PREFIX_SHARED_BLOB, key, bl);
  db->submit_transaction_sync(txn);
};

void BlueStore::inject_leaked(uint64_t len)
{
  KeyValueDB::Transaction txn;
  txn = db->get_transaction();

  PExtentVector exts;
  int64_t alloc_len = shared_alloc.a->allocate(len, min_alloc_size,
                                               min_alloc_size * 256, 0, &exts);
  ceph_assert(alloc_len >= (int64_t)len);
  for (auto& p : exts) {
    fm->allocate(p.offset, p.length, txn);
  }
  db->submit_transaction_sync(txn);
}

void BlueStore::inject_false_free(coll_t cid, ghobject_t oid)
{
  KeyValueDB::Transaction txn;
  OnodeRef o;
  CollectionRef c = _get_collection(cid);
  ceph_assert(c);
  {
    std::unique_lock l{c->lock}; // just to avoid internal asserts
    o = c->get_onode(oid, false);
    ceph_assert(o);
    o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
  }

  bool injected = false;
  txn = db->get_transaction();
  auto& em = o->extent_map.extent_map;
  std::vector<const PExtentVector*> v;
  if (em.size()) {
    v.push_back(&em.begin()->blob->get_blob().get_extents());
  }
  if (em.size() > 1) {
    auto it = em.end();
    --it;
    v.push_back(&(it->blob->get_blob().get_extents()));
  }
  for (auto pext : v) {
    if (pext->size()) {
      auto p = pext->begin();
      while (p != pext->end()) {
        if (p->is_valid()) {
          dout(20) << __func__ << " release 0x" << std::hex
                   << p->offset << "~" << p->length << std::dec << dendl;
          fm->release(p->offset, p->length, txn);
          injected = true;
          break;
        }
        ++p;
      }
    }
  }
  ceph_assert(injected);
  db->submit_transaction_sync(txn);
}

void BlueStore::inject_legacy_omap()
{
  dout(1) << __func__ << dendl;
  per_pool_omap = OMAP_BULK;
  KeyValueDB::Transaction txn;
  txn = db->get_transaction();
  txn->rmkey(PREFIX_SUPER, "per_pool_omap");
  db->submit_transaction_sync(txn);
}

void BlueStore::inject_legacy_omap(coll_t cid, ghobject_t oid)
{
  dout(1) << __func__ << " " << cid << " " << oid << dendl;
  KeyValueDB::Transaction txn;
  OnodeRef o;
  CollectionRef c = _get_collection(cid);
  ceph_assert(c);
  {
    std::unique_lock l{c->lock}; // just to avoid internal asserts
    o = c->get_onode(oid, false);
    ceph_assert(o);
  }
  o->onode.clear_flag(
    bluestore_onode_t::FLAG_PERPG_OMAP |
    bluestore_onode_t::FLAG_PERPOOL_OMAP |
    bluestore_onode_t::FLAG_PGMETA_OMAP);
  txn = db->get_transaction();
  _record_onode(o, txn);
  db->submit_transaction_sync(txn);
}

void BlueStore::inject_statfs(const string& key, const store_statfs_t& new_statfs)
{
  BlueStoreRepairer repairer;
  repairer.fix_statfs(db, key, new_statfs);
  repairer.apply(db);
}

void BlueStore::inject_global_statfs(const store_statfs_t& new_statfs)
{
  KeyValueDB::Transaction t
= db->get_transaction(); volatile_statfs v; v = new_statfs; bufferlist bl; v.encode(bl); t->set(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl); db->submit_transaction_sync(t); } void BlueStore::inject_misreference(coll_t cid1, ghobject_t oid1, coll_t cid2, ghobject_t oid2, uint64_t offset) { OnodeRef o1; CollectionRef c1 = _get_collection(cid1); ceph_assert(c1); { std::unique_lock l{c1->lock}; // just to avoid internal asserts o1 = c1->get_onode(oid1, false); ceph_assert(o1); o1->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE); } OnodeRef o2; CollectionRef c2 = _get_collection(cid2); ceph_assert(c2); { std::unique_lock l{c2->lock}; // just to avoid internal asserts o2 = c2->get_onode(oid2, false); ceph_assert(o2); o2->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE); } Extent& e1 = *(o1->extent_map.seek_lextent(offset)); Extent& e2 = *(o2->extent_map.seek_lextent(offset)); // require onode/extent layout to be the same (and simple) // to make things easier ceph_assert(o1->onode.extent_map_shards.empty()); ceph_assert(o2->onode.extent_map_shards.empty()); ceph_assert(o1->extent_map.spanning_blob_map.size() == 0); ceph_assert(o2->extent_map.spanning_blob_map.size() == 0); ceph_assert(e1.logical_offset == e2.logical_offset); ceph_assert(e1.length == e2.length); ceph_assert(e1.blob_offset == e2.blob_offset); KeyValueDB::Transaction txn; txn = db->get_transaction(); // along with misreference error this will create space leaks errors e2.blob->dirty_blob() = e1.blob->get_blob(); o2->extent_map.dirty_range(offset, e2.length); o2->extent_map.update(txn, false); _record_onode(o2, txn); db->submit_transaction_sync(txn); } void BlueStore::inject_zombie_spanning_blob(coll_t cid, ghobject_t oid, int16_t blob_id) { OnodeRef o; CollectionRef c = _get_collection(cid); ceph_assert(c); { std::unique_lock l{ c->lock }; // just to avoid internal asserts o = c->get_onode(oid, false); ceph_assert(o); o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE); } BlobRef b = c->new_blob(); b->id = blob_id; o->extent_map.spanning_blob_map[blob_id] = b; KeyValueDB::Transaction txn; txn = db->get_transaction(); _record_onode(o, txn); db->submit_transaction_sync(txn); } void BlueStore::inject_bluefs_file(std::string_view dir, std::string_view name, size_t new_size) { ceph_assert(bluefs); BlueFS::FileWriter* p_handle = nullptr; auto ret = bluefs->open_for_write(dir, name, &p_handle, false); ceph_assert(ret == 0); std::string s('0', new_size); bufferlist bl; bl.append(s); p_handle->append(bl); bluefs->fsync(p_handle); bluefs->close_writer(p_handle); } void BlueStore::collect_metadata(map *pm) { dout(10) << __func__ << dendl; bdev->collect_metadata("bluestore_bdev_", pm); if (bluefs) { (*pm)["bluefs"] = "1"; // this value is for backward compatibility only (*pm)["bluefs_single_shared_device"] = \ stringify((int)bluefs_layout.single_shared_device()); (*pm)["bluefs_dedicated_db"] = \ stringify((int)bluefs_layout.dedicated_db); (*pm)["bluefs_dedicated_wal"] = \ stringify((int)bluefs_layout.dedicated_wal); bluefs->collect_metadata(pm, bluefs_layout.shared_bdev); } else { (*pm)["bluefs"] = "0"; } // report numa mapping for underlying devices int node = -1; set nodes; set failed; int r = get_numa_node(&node, &nodes, &failed); if (r >= 0) { if (!failed.empty()) { (*pm)["objectstore_numa_unknown_devices"] = stringify(failed); } if (!nodes.empty()) { dout(1) << __func__ << " devices span numa nodes " << nodes << dendl; (*pm)["objectstore_numa_nodes"] = stringify(nodes); } if (node >= 0) { (*pm)["objectstore_numa_node"] = 
stringify(node); } } } int BlueStore::get_numa_node( int *final_node, set *out_nodes, set *out_failed) { int node = -1; set devices; get_devices(&devices); set nodes; set failed; for (auto& devname : devices) { int n; BlkDev bdev(devname); int r = bdev.get_numa_node(&n); if (r < 0) { dout(10) << __func__ << " bdev " << devname << " can't detect numa_node" << dendl; failed.insert(devname); continue; } dout(10) << __func__ << " bdev " << devname << " on numa_node " << n << dendl; nodes.insert(n); if (node < 0) { node = n; } } if (node >= 0 && nodes.size() == 1 && failed.empty()) { *final_node = node; } if (out_nodes) { *out_nodes = nodes; } if (out_failed) { *out_failed = failed; } return 0; } int BlueStore::get_devices(set *ls) { if (bdev) { bdev->get_devices(ls); if (bluefs) { bluefs->get_devices(ls); } return 0; } // grumble, we haven't started up yet. int r = _open_path(); if (r < 0) goto out; r = _open_fsid(false); if (r < 0) goto out_path; r = _read_fsid(&fsid); if (r < 0) goto out_fsid; r = _lock_fsid(); if (r < 0) goto out_fsid; r = _open_bdev(false); if (r < 0) goto out_fsid; r = _minimal_open_bluefs(false); if (r < 0) goto out_bdev; bdev->get_devices(ls); if (bluefs) { bluefs->get_devices(ls); } r = 0; _minimal_close_bluefs(); out_bdev: _close_bdev(); out_fsid: _close_fsid(); out_path: _close_path(); out: return r; } void BlueStore::_get_statfs_overall(struct store_statfs_t *buf) { buf->reset(); auto prefix = per_pool_omap == OMAP_BULK ? PREFIX_OMAP : per_pool_omap == OMAP_PER_POOL ? PREFIX_PERPOOL_OMAP : PREFIX_PERPG_OMAP; buf->omap_allocated = db->estimate_prefix_size(prefix, string()); uint64_t bfree = shared_alloc.a->get_free(); if (bluefs) { buf->internally_reserved = 0; // include dedicated db, too, if that isn't the shared device. if (bluefs_layout.shared_bdev != BlueFS::BDEV_DB) { buf->total += bluefs->get_total(BlueFS::BDEV_DB); } // call any non-omap bluefs space "internal metadata" buf->internal_metadata = bluefs->get_used() - buf->omap_allocated; } uint64_t thin_total, thin_avail; if (bdev->get_thin_utilization(&thin_total, &thin_avail)) { buf->total += thin_total; // we are limited by both the size of the virtual device and the // underlying physical device. bfree = std::min(bfree, thin_avail); buf->allocated = thin_total - thin_avail; } else { buf->total += bdev->get_size(); } buf->available = bfree; } int BlueStore::statfs(struct store_statfs_t *buf, osd_alert_list_t* alerts) { if (alerts) { alerts->clear(); _log_alerts(*alerts); } _get_statfs_overall(buf); { std::lock_guard l(vstatfs_lock); buf->allocated = vstatfs.allocated(); buf->data_stored = vstatfs.stored(); buf->data_compressed = vstatfs.compressed(); buf->data_compressed_original = vstatfs.compressed_original(); buf->data_compressed_allocated = vstatfs.compressed_allocated(); } dout(20) << __func__ << " " << *buf << dendl; return 0; } int BlueStore::pool_statfs(uint64_t pool_id, struct store_statfs_t *buf, bool *out_per_pool_omap) { dout(20) << __func__ << " pool " << pool_id<< dendl; if (!per_pool_stat_collection) { dout(20) << __func__ << " not supported in legacy mode " << dendl; return -ENOTSUP; } buf->reset(); { std::lock_guard l(vstatfs_lock); osd_pools[pool_id].publish(buf); } string key_prefix; _key_encode_u64(pool_id, &key_prefix); *out_per_pool_omap = per_pool_omap != OMAP_BULK; if (*out_per_pool_omap) { auto prefix = per_pool_omap == OMAP_PER_POOL ? 
PREFIX_PERPOOL_OMAP : PREFIX_PERPG_OMAP; buf->omap_allocated = db->estimate_prefix_size(prefix, key_prefix); } dout(10) << __func__ << *buf << dendl; return 0; } void BlueStore::_check_legacy_statfs_alert() { string s; if (!per_pool_stat_collection && cct->_conf->bluestore_warn_on_legacy_statfs) { s = "legacy statfs reporting detected, " "suggest to run store repair to get consistent statistic reports"; } std::lock_guard l(qlock); legacy_statfs_alert = s; } void BlueStore::_check_no_per_pg_or_pool_omap_alert() { string per_pg, per_pool; if (per_pool_omap != OMAP_PER_PG) { if (cct->_conf->bluestore_warn_on_no_per_pg_omap) { per_pg = "legacy (not per-pg) omap detected, " "suggest to run store repair to benefit from faster PG removal"; } if (per_pool_omap != OMAP_PER_POOL) { if (cct->_conf->bluestore_warn_on_no_per_pool_omap) { per_pool = "legacy (not per-pool) omap detected, " "suggest to run store repair to benefit from per-pool omap usage statistics"; } } } std::lock_guard l(qlock); no_per_pg_omap_alert = per_pg; no_per_pool_omap_alert = per_pool; } // --------------- // cache BlueStore::CollectionRef BlueStore::_get_collection(const coll_t& cid) { std::shared_lock l(coll_lock); ceph::unordered_map::iterator cp = coll_map.find(cid); if (cp == coll_map.end()) return CollectionRef(); return cp->second; } void BlueStore::_queue_reap_collection(CollectionRef& c) { dout(10) << __func__ << " " << c << " " << c->cid << dendl; // _reap_collections and this in the same thread, // so no need a lock. removed_collections.push_back(c); } void BlueStore::_reap_collections() { list removed_colls; { // _queue_reap_collection and this in the same thread. // So no need a lock. if (!removed_collections.empty()) removed_colls.swap(removed_collections); else return; } list::iterator p = removed_colls.begin(); while (p != removed_colls.end()) { CollectionRef c = *p; dout(10) << __func__ << " " << c << " " << c->cid << dendl; if (c->onode_map.map_any([&](Onode* o) { ceph_assert(!o->exists); if (o->flushing_count.load()) { dout(10) << __func__ << " " << c << " " << c->cid << " " << o->oid << " flush_txns " << o->flushing_count << dendl; return true; } return false; })) { ++p; continue; } c->onode_map.clear(); p = removed_colls.erase(p); dout(10) << __func__ << " " << c << " " << c->cid << " done" << dendl; } if (removed_colls.empty()) { dout(10) << __func__ << " all reaped" << dendl; } else { removed_collections.splice(removed_collections.begin(), removed_colls); } } void BlueStore::_update_cache_logger() { uint64_t num_onodes = 0; uint64_t num_pinned_onodes = 0; uint64_t num_extents = 0; uint64_t num_blobs = 0; uint64_t num_buffers = 0; uint64_t num_buffer_bytes = 0; for (auto c : onode_cache_shards) { c->add_stats(&num_onodes, &num_pinned_onodes); } for (auto c : buffer_cache_shards) { c->add_stats(&num_extents, &num_blobs, &num_buffers, &num_buffer_bytes); } logger->set(l_bluestore_onodes, num_onodes); logger->set(l_bluestore_pinned_onodes, num_pinned_onodes); logger->set(l_bluestore_extents, num_extents); logger->set(l_bluestore_blobs, num_blobs); logger->set(l_bluestore_buffers, num_buffers); logger->set(l_bluestore_buffer_bytes, num_buffer_bytes); } // --------------- // read operations ObjectStore::CollectionHandle BlueStore::open_collection(const coll_t& cid) { return _get_collection(cid); } ObjectStore::CollectionHandle BlueStore::create_new_collection( const coll_t& cid) { std::unique_lock l{coll_lock}; auto c = ceph::make_ref( this, onode_cache_shards[cid.hash_to_shard(onode_cache_shards.size())], 
buffer_cache_shards[cid.hash_to_shard(buffer_cache_shards.size())], cid); new_coll_map[cid] = c; _osr_attach(c.get()); return c; } void BlueStore::set_collection_commit_queue( const coll_t& cid, ContextQueue *commit_queue) { if (commit_queue) { std::shared_lock l(coll_lock); if (coll_map.count(cid)) { coll_map[cid]->commit_queue = commit_queue; } else if (new_coll_map.count(cid)) { new_coll_map[cid]->commit_queue = commit_queue; } } } bool BlueStore::exists(CollectionHandle &c_, const ghobject_t& oid) { Collection *c = static_cast(c_.get()); dout(10) << __func__ << " " << c->cid << " " << oid << dendl; if (!c->exists) return false; bool r = true; { std::shared_lock l(c->lock); OnodeRef o = c->get_onode(oid, false); if (!o || !o->exists) r = false; } return r; } int BlueStore::stat( CollectionHandle &c_, const ghobject_t& oid, struct stat *st, bool allow_eio) { Collection *c = static_cast(c_.get()); if (!c->exists) return -ENOENT; dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl; { std::shared_lock l(c->lock); OnodeRef o = c->get_onode(oid, false); if (!o || !o->exists) return -ENOENT; st->st_size = o->onode.size; st->st_blksize = 4096; st->st_blocks = (st->st_size + st->st_blksize - 1) / st->st_blksize; st->st_nlink = 1; } int r = 0; if (_debug_mdata_eio(oid)) { r = -EIO; derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl; } return r; } int BlueStore::set_collection_opts( CollectionHandle& ch, const pool_opts_t& opts) { Collection *c = static_cast(ch.get()); dout(15) << __func__ << " " << ch->cid << " options " << opts << dendl; if (!c->exists) return -ENOENT; std::unique_lock l{c->lock}; c->pool_opts = opts; return 0; } int BlueStore::read( CollectionHandle &c_, const ghobject_t& oid, uint64_t offset, size_t length, bufferlist& bl, uint32_t op_flags) { auto start = mono_clock::now(); Collection *c = static_cast(c_.get()); const coll_t &cid = c->get_cid(); dout(15) << __func__ << " " << cid << " " << oid << " 0x" << std::hex << offset << "~" << length << std::dec << dendl; if (!c->exists) return -ENOENT; bl.clear(); int r; { std::shared_lock l(c->lock); auto start1 = mono_clock::now(); OnodeRef o = c->get_onode(oid, false); log_latency("get_onode@read", l_bluestore_read_onode_meta_lat, mono_clock::now() - start1, cct->_conf->bluestore_log_op_age); if (!o || !o->exists) { r = -ENOENT; goto out; } if (offset == length && offset == 0) length = o->onode.size; r = _do_read(c, o, offset, length, bl, op_flags); if (r == -EIO) { logger->inc(l_bluestore_read_eio); } } out: if (r >= 0 && _debug_data_eio(oid)) { r = -EIO; derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl; } else if (oid.hobj.pool > 0 && /* FIXME, see #23029 */ cct->_conf->bluestore_debug_random_read_err && (rand() % (int)(cct->_conf->bluestore_debug_random_read_err * 100.0)) == 0) { dout(0) << __func__ << ": inject random EIO" << dendl; r = -EIO; } dout(10) << __func__ << " " << cid << " " << oid << " 0x" << std::hex << offset << "~" << length << std::dec << " = " << r << dendl; log_latency(__func__, l_bluestore_read_lat, mono_clock::now() - start, cct->_conf->bluestore_log_op_age); return r; } void BlueStore::_read_cache( OnodeRef o, uint64_t offset, size_t length, int read_cache_policy, ready_regions_t& ready_regions, blobs2read_t& blobs2read) { // build blob-wise list to of stuff read (that isn't cached) unsigned left = length; uint64_t pos = offset; auto lp = o->extent_map.seek_lextent(offset); while (left > 0 && lp != o->extent_map.extent_map.end()) { if (pos < 
lp->logical_offset) { unsigned hole = lp->logical_offset - pos; if (hole >= left) { break; } dout(30) << __func__ << " hole 0x" << std::hex << pos << "~" << hole << std::dec << dendl; pos += hole; left -= hole; } BlobRef& bptr = lp->blob; unsigned l_off = pos - lp->logical_offset; unsigned b_off = l_off + lp->blob_offset; unsigned b_len = std::min(left, lp->length - l_off); ready_regions_t cache_res; interval_set cache_interval; bptr->shared_blob->bc.read( bptr->shared_blob->get_cache(), b_off, b_len, cache_res, cache_interval, read_cache_policy); dout(20) << __func__ << " blob " << *bptr << std::hex << " need 0x" << b_off << "~" << b_len << " cache has 0x" << cache_interval << std::dec << dendl; auto pc = cache_res.begin(); uint64_t chunk_size = bptr->get_blob().get_chunk_size(block_size); while (b_len > 0) { unsigned l; if (pc != cache_res.end() && pc->first == b_off) { l = pc->second.length(); ready_regions[pos] = std::move(pc->second); dout(30) << __func__ << " use cache 0x" << std::hex << pos << ": 0x" << b_off << "~" << l << std::dec << dendl; ++pc; } else { l = b_len; if (pc != cache_res.end()) { ceph_assert(pc->first > b_off); l = pc->first - b_off; } dout(30) << __func__ << " will read 0x" << std::hex << pos << ": 0x" << b_off << "~" << l << std::dec << dendl; // merge regions { uint64_t r_off = b_off; uint64_t r_len = l; uint64_t front = r_off % chunk_size; if (front) { r_off -= front; r_len += front; } unsigned tail = r_len % chunk_size; if (tail) { r_len += chunk_size - tail; } bool merged = false; regions2read_t& r2r = blobs2read[bptr]; if (r2r.size()) { read_req_t& pre = r2r.back(); if (r_off <= (pre.r_off + pre.r_len)) { front += (r_off - pre.r_off); pre.r_len += (r_off + r_len - pre.r_off - pre.r_len); pre.regs.emplace_back(region_t(pos, b_off, l, front)); merged = true; } } if (!merged) { read_req_t req(r_off, r_len); req.regs.emplace_back(region_t(pos, b_off, l, front)); r2r.emplace_back(std::move(req)); } } } pos += l; b_off += l; left -= l; b_len -= l; } ++lp; } } int BlueStore::_prepare_read_ioc( blobs2read_t& blobs2read, vector* compressed_blob_bls, IOContext* ioc) { for (auto& p : blobs2read) { const BlobRef& bptr = p.first; regions2read_t& r2r = p.second; dout(20) << __func__ << " blob " << *bptr << std::hex << " need " << r2r << std::dec << dendl; if (bptr->get_blob().is_compressed()) { // read the whole thing if (compressed_blob_bls->empty()) { // ensure we avoid any reallocation on subsequent blobs compressed_blob_bls->reserve(blobs2read.size()); } compressed_blob_bls->push_back(bufferlist()); bufferlist& bl = compressed_blob_bls->back(); auto r = bptr->get_blob().map( 0, bptr->get_blob().get_ondisk_length(), [&](uint64_t offset, uint64_t length) { int r = bdev->aio_read(offset, length, &bl, ioc); if (r < 0) return r; return 0; }); if (r < 0) { derr << __func__ << " bdev-read failed: " << cpp_strerror(r) << dendl; if (r == -EIO) { // propagate EIO to caller return r; } ceph_assert(r == 0); } } else { // read the pieces for (auto& req : r2r) { dout(20) << __func__ << " region 0x" << std::hex << req.regs.front().logical_offset << ": 0x" << req.regs.front().blob_xoffset << " reading 0x" << req.r_off << "~" << req.r_len << std::dec << dendl; // read it auto r = bptr->get_blob().map( req.r_off, req.r_len, [&](uint64_t offset, uint64_t length) { int r = bdev->aio_read(offset, length, &req.bl, ioc); if (r < 0) return r; return 0; }); if (r < 0) { derr << __func__ << " bdev-read failed: " << cpp_strerror(r) << dendl; if (r == -EIO) { // propagate EIO to caller return r; 
} ceph_assert(r == 0); } ceph_assert(req.bl.length() == req.r_len); } } } return 0; } int BlueStore::_generate_read_result_bl( OnodeRef o, uint64_t offset, size_t length, ready_regions_t& ready_regions, vector& compressed_blob_bls, blobs2read_t& blobs2read, bool buffered, bool* csum_error, bufferlist& bl) { // enumerate and decompress desired blobs auto p = compressed_blob_bls.begin(); blobs2read_t::iterator b2r_it = blobs2read.begin(); while (b2r_it != blobs2read.end()) { const BlobRef& bptr = b2r_it->first; regions2read_t& r2r = b2r_it->second; dout(20) << __func__ << " blob " << *bptr << std::hex << " need 0x" << r2r << std::dec << dendl; if (bptr->get_blob().is_compressed()) { ceph_assert(p != compressed_blob_bls.end()); bufferlist& compressed_bl = *p++; if (_verify_csum(o, &bptr->get_blob(), 0, compressed_bl, r2r.front().regs.front().logical_offset) < 0) { *csum_error = true; return -EIO; } bufferlist raw_bl; auto r = _decompress(compressed_bl, &raw_bl); if (r < 0) return r; if (buffered) { bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(), 0, raw_bl); } for (auto& req : r2r) { for (auto& r : req.regs) { ready_regions[r.logical_offset].substr_of( raw_bl, r.blob_xoffset, r.length); } } } else { for (auto& req : r2r) { if (_verify_csum(o, &bptr->get_blob(), req.r_off, req.bl, req.regs.front().logical_offset) < 0) { *csum_error = true; return -EIO; } if (buffered) { bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(), req.r_off, req.bl); } // prune and keep result for (const auto& r : req.regs) { ready_regions[r.logical_offset].substr_of(req.bl, r.front, r.length); } } } ++b2r_it; } // generate a resulting buffer auto pr = ready_regions.begin(); auto pr_end = ready_regions.end(); uint64_t pos = 0; while (pos < length) { if (pr != pr_end && pr->first == pos + offset) { dout(30) << __func__ << " assemble 0x" << std::hex << pos << ": data from 0x" << pr->first << "~" << pr->second.length() << std::dec << dendl; pos += pr->second.length(); bl.claim_append(pr->second); ++pr; } else { uint64_t l = length - pos; if (pr != pr_end) { ceph_assert(pr->first > pos + offset); l = pr->first - (pos + offset); } dout(30) << __func__ << " assemble 0x" << std::hex << pos << ": zeros for 0x" << (pos + offset) << "~" << l << std::dec << dendl; bl.append_zero(l); pos += l; } } ceph_assert(bl.length() == length); ceph_assert(pos == length); ceph_assert(pr == pr_end); return 0; } int BlueStore::_do_read( Collection *c, OnodeRef o, uint64_t offset, size_t length, bufferlist& bl, uint32_t op_flags, uint64_t retry_count) { FUNCTRACE(cct); int r = 0; int read_cache_policy = 0; // do not bypass clean or dirty cache dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length << " size 0x" << o->onode.size << " (" << std::dec << o->onode.size << ")" << dendl; bl.clear(); if (offset >= o->onode.size) { return r; } // generally, don't buffer anything, unless the client explicitly requests // it. 
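// Illustrative sketch (not upstream code; store and ch are hypothetical
// handles): how a caller's fadvise hints interact with the
// bluestore_default_buffered_read option in the logic below, using the
// BlueStore::read() signature shown above:
//
//   bufferlist bl;
//   // WILLNEED -> buffered read regardless of the config default
//   store->read(ch, oid, off, len, bl, CEPH_OSD_OP_FLAG_FADVISE_WILLNEED);
//   // DONTNEED or NOCACHE -> suppresses the config-driven default buffering
//   store->read(ch, oid, off, len, bl, CEPH_OSD_OP_FLAG_FADVISE_DONTNEED);
//   // no hint -> bluestore_default_buffered_read decides
//   store->read(ch, oid, off, len, bl, 0);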
bool buffered = false; if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) { dout(20) << __func__ << " will do buffered read" << dendl; buffered = true; } else if (cct->_conf->bluestore_default_buffered_read && (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED | CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) { dout(20) << __func__ << " defaulting to buffered read" << dendl; buffered = true; } if (offset + length > o->onode.size) { length = o->onode.size - offset; } auto start = mono_clock::now(); o->extent_map.fault_range(db, offset, length); log_latency(__func__, l_bluestore_read_onode_meta_lat, mono_clock::now() - start, cct->_conf->bluestore_log_op_age); _dump_onode<30>(cct, *o); // for deep-scrub, we only read dirty cache and bypass clean cache in // order to read underlying block device in case there are silent disk errors. if (op_flags & CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE) { dout(20) << __func__ << " will bypass cache and do direct read" << dendl; read_cache_policy = BufferSpace::BYPASS_CLEAN_CACHE; } // build blob-wise list to of stuff read (that isn't cached) ready_regions_t ready_regions; blobs2read_t blobs2read; _read_cache(o, offset, length, read_cache_policy, ready_regions, blobs2read); // read raw blob data. start = mono_clock::now(); // for the sake of simplicity // measure the whole block below. // The error isn't that much... vector compressed_blob_bls; IOContext ioc(cct, NULL, true); // allow EIO r = _prepare_read_ioc(blobs2read, &compressed_blob_bls, &ioc); // we always issue aio for reading, so errors other than EIO are not allowed if (r < 0) return r; int64_t num_ios = blobs2read.size(); if (ioc.has_pending_aios()) { num_ios = ioc.get_num_ios(); bdev->aio_submit(&ioc); dout(20) << __func__ << " waiting for aio" << dendl; ioc.aio_wait(); r = ioc.get_return_value(); if (r < 0) { ceph_assert(r == -EIO); // no other errors allowed return -EIO; } } log_latency_fn(__func__, l_bluestore_read_wait_aio_lat, mono_clock::now() - start, cct->_conf->bluestore_log_op_age, [&](auto lat) { return ", num_ios = " + stringify(num_ios); } ); bool csum_error = false; r = _generate_read_result_bl(o, offset, length, ready_regions, compressed_blob_bls, blobs2read, buffered, &csum_error, bl); if (csum_error) { // Handles spurious read errors caused by a kernel bug. // We sometimes get all-zero pages as a result of the read under // high memory pressure. Retrying the failing read succeeds in most // cases. 
// See also: http://tracker.ceph.com/issues/22464 if (retry_count >= cct->_conf->bluestore_retry_disk_reads) { return -EIO; } return _do_read(c, o, offset, length, bl, op_flags, retry_count + 1); } r = bl.length(); if (retry_count) { logger->inc(l_bluestore_reads_with_retries); dout(5) << __func__ << " read at 0x" << std::hex << offset << "~" << length << " failed " << std::dec << retry_count << " times before succeeding" << dendl; stringstream s; s << " reads with retries: " << logger->get(l_bluestore_reads_with_retries); _set_spurious_read_errors_alert(s.str()); } return r; } int BlueStore::_verify_csum(OnodeRef& o, const bluestore_blob_t* blob, uint64_t blob_xoffset, const bufferlist& bl, uint64_t logical_offset) const { int bad; uint64_t bad_csum; auto start = mono_clock::now(); int r = blob->verify_csum(blob_xoffset, bl, &bad, &bad_csum); if (cct->_conf->bluestore_debug_inject_csum_err_probability > 0 && (rand() % 10000) < cct->_conf->bluestore_debug_inject_csum_err_probability * 10000.0) { derr << __func__ << " injecting bluestore checksum verifcation error" << dendl; bad = blob_xoffset; r = -1; bad_csum = 0xDEADBEEF; } if (r < 0) { if (r == -1) { PExtentVector pex; blob->map( bad, blob->get_csum_chunk_size(), [&](uint64_t offset, uint64_t length) { pex.emplace_back(bluestore_pextent_t(offset, length)); return 0; }); derr << __func__ << " bad " << Checksummer::get_csum_type_string(blob->csum_type) << "/0x" << std::hex << blob->get_csum_chunk_size() << " checksum at blob offset 0x" << bad << ", got 0x" << bad_csum << ", expected 0x" << blob->get_csum_item(bad / blob->get_csum_chunk_size()) << std::dec << ", device location " << pex << ", logical extent 0x" << std::hex << (logical_offset + bad - blob_xoffset) << "~" << blob->get_csum_chunk_size() << std::dec << ", object " << o->oid << dendl; } else { derr << __func__ << " failed with exit code: " << cpp_strerror(r) << dendl; } } log_latency(__func__, l_bluestore_csum_lat, mono_clock::now() - start, cct->_conf->bluestore_log_op_age); if (cct->_conf->bluestore_ignore_data_csum) { return 0; } return r; } int BlueStore::_decompress(bufferlist& source, bufferlist* result) { int r = 0; auto start = mono_clock::now(); auto i = source.cbegin(); bluestore_compression_header_t chdr; decode(chdr, i); int alg = int(chdr.type); CompressorRef cp = compressor; if (!cp || (int)cp->get_type() != alg) { cp = Compressor::create(cct, alg); } if (!cp.get()) { // if compressor isn't available - error, because cannot return // decompressed data? 
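// Hypothetical example of how this branch is reached: the
// bluestore_compression_header_t decoded above records the algorithm the
// data was written with (say lz4, because the pool had
// compression_algorithm=lz4 at write time).  If the OSD-wide compressor is
// now a different codec, Compressor::create(cct, alg) above builds the
// needed one on demand; only when that plugin cannot be loaded at all do we
// end up here and fail the read with EIO, since handing the still-compressed
// bytes back to the caller would be wrong.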
const char* alg_name = Compressor::get_comp_alg_name(alg); derr << __func__ << " can't load decompressor " << alg_name << dendl; _set_compression_alert(false, alg_name); r = -EIO; } else { r = cp->decompress(i, chdr.length, *result, chdr.compressor_message); if (r < 0) { derr << __func__ << " decompression failed with exit code " << r << dendl; r = -EIO; } } log_latency(__func__, l_bluestore_decompress_lat, mono_clock::now() - start, cct->_conf->bluestore_log_op_age); return r; } // this stores fiemap into interval_set, other variations // use it internally int BlueStore::_fiemap( CollectionHandle &c_, const ghobject_t& oid, uint64_t offset, size_t length, interval_set& destset) { Collection *c = static_cast(c_.get()); if (!c->exists) return -ENOENT; { std::shared_lock l(c->lock); OnodeRef o = c->get_onode(oid, false); if (!o || !o->exists) { return -ENOENT; } _dump_onode<30>(cct, *o); dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length << " size 0x" << o->onode.size << std::dec << dendl; boost::intrusive::set::iterator ep, eend; if (offset >= o->onode.size) goto out; if (offset + length > o->onode.size) { length = o->onode.size - offset; } o->extent_map.fault_range(db, offset, length); eend = o->extent_map.extent_map.end(); ep = o->extent_map.seek_lextent(offset); while (length > 0) { dout(20) << __func__ << " offset " << offset << dendl; if (ep != eend && ep->logical_offset + ep->length <= offset) { ++ep; continue; } uint64_t x_len = length; if (ep != eend && ep->logical_offset <= offset) { uint64_t x_off = offset - ep->logical_offset; x_len = std::min(x_len, ep->length - x_off); dout(30) << __func__ << " lextent 0x" << std::hex << offset << "~" << x_len << std::dec << " blob " << ep->blob << dendl; destset.insert(offset, x_len); length -= x_len; offset += x_len; if (x_off + x_len == ep->length) ++ep; continue; } if (ep != eend && ep->logical_offset > offset && ep->logical_offset - offset < x_len) { x_len = ep->logical_offset - offset; } offset += x_len; length -= x_len; } } out: dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length << " size = 0x(" << destset << ")" << std::dec << dendl; return 0; } int BlueStore::fiemap( CollectionHandle &c_, const ghobject_t& oid, uint64_t offset, size_t length, bufferlist& bl) { interval_set m; int r = _fiemap(c_, oid, offset, length, m); if (r >= 0) { encode(m, bl); } return r; } int BlueStore::fiemap( CollectionHandle &c_, const ghobject_t& oid, uint64_t offset, size_t length, map& destmap) { interval_set m; int r = _fiemap(c_, oid, offset, length, m); if (r >= 0) { destmap = std::move(m).detach(); } return r; } int BlueStore::readv( CollectionHandle &c_, const ghobject_t& oid, interval_set& m, bufferlist& bl, uint32_t op_flags) { auto start = mono_clock::now(); Collection *c = static_cast(c_.get()); const coll_t &cid = c->get_cid(); dout(15) << __func__ << " " << cid << " " << oid << " fiemap " << m << dendl; if (!c->exists) return -ENOENT; bl.clear(); int r; { std::shared_lock l(c->lock); auto start1 = mono_clock::now(); OnodeRef o = c->get_onode(oid, false); log_latency("get_onode@read", l_bluestore_read_onode_meta_lat, mono_clock::now() - start1, cct->_conf->bluestore_log_op_age); if (!o || !o->exists) { r = -ENOENT; goto out; } if (m.empty()) { r = 0; goto out; } r = _do_readv(c, o, m, bl, op_flags); if (r == -EIO) { logger->inc(l_bluestore_read_eio); } } out: if (r >= 0 && _debug_data_eio(oid)) { r = -EIO; derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl; } else if (oid.hobj.pool > 0 && 
/* FIXME, see #23029 */ cct->_conf->bluestore_debug_random_read_err && (rand() % (int)(cct->_conf->bluestore_debug_random_read_err * 100.0)) == 0) { dout(0) << __func__ << ": inject random EIO" << dendl; r = -EIO; } dout(10) << __func__ << " " << cid << " " << oid << " fiemap " << m << std::dec << " = " << r << dendl; log_latency(__func__, l_bluestore_read_lat, mono_clock::now() - start, cct->_conf->bluestore_log_op_age); return r; } int BlueStore::_do_readv( Collection *c, OnodeRef o, const interval_set& m, bufferlist& bl, uint32_t op_flags, uint64_t retry_count) { FUNCTRACE(cct); int r = 0; int read_cache_policy = 0; // do not bypass clean or dirty cache dout(20) << __func__ << " fiemap " << m << std::hex << " size 0x" << o->onode.size << " (" << std::dec << o->onode.size << ")" << dendl; // generally, don't buffer anything, unless the client explicitly requests // it. bool buffered = false; if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) { dout(20) << __func__ << " will do buffered read" << dendl; buffered = true; } else if (cct->_conf->bluestore_default_buffered_read && (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED | CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) { dout(20) << __func__ << " defaulting to buffered read" << dendl; buffered = true; } // this method must be idempotent since we may call it several times // before we finally read the expected result. bl.clear(); // call fiemap first! ceph_assert(m.range_start() <= o->onode.size); ceph_assert(m.range_end() <= o->onode.size); auto start = mono_clock::now(); o->extent_map.fault_range(db, m.range_start(), m.range_end() - m.range_start()); log_latency(__func__, l_bluestore_read_onode_meta_lat, mono_clock::now() - start, cct->_conf->bluestore_log_op_age); _dump_onode<30>(cct, *o); IOContext ioc(cct, NULL, true); // allow EIO vector, blobs2read_t>> raw_results; raw_results.reserve(m.num_intervals()); int i = 0; for (auto p = m.begin(); p != m.end(); p++, i++) { raw_results.push_back({}); _read_cache(o, p.get_start(), p.get_len(), read_cache_policy, std::get<0>(raw_results[i]), std::get<2>(raw_results[i])); r = _prepare_read_ioc(std::get<2>(raw_results[i]), &std::get<1>(raw_results[i]), &ioc); // we always issue aio for reading, so errors other than EIO are not allowed if (r < 0) return r; } auto num_ios = m.size(); if (ioc.has_pending_aios()) { num_ios = ioc.get_num_ios(); bdev->aio_submit(&ioc); dout(20) << __func__ << " waiting for aio" << dendl; ioc.aio_wait(); r = ioc.get_return_value(); if (r < 0) { ceph_assert(r == -EIO); // no other errors allowed return -EIO; } } log_latency_fn(__func__, l_bluestore_read_wait_aio_lat, mono_clock::now() - start, cct->_conf->bluestore_log_op_age, [&](auto lat) { return ", num_ios = " + stringify(num_ios); } ); ceph_assert(raw_results.size() == (size_t)m.num_intervals()); i = 0; for (auto p = m.begin(); p != m.end(); p++, i++) { bool csum_error = false; bufferlist t; r = _generate_read_result_bl(o, p.get_start(), p.get_len(), std::get<0>(raw_results[i]), std::get<1>(raw_results[i]), std::get<2>(raw_results[i]), buffered, &csum_error, t); if (csum_error) { // Handles spurious read errors caused by a kernel bug. // We sometimes get all-zero pages as a result of the read under // high memory pressure. Retrying the failing read succeeds in most // cases. 
// See also: http://tracker.ceph.com/issues/22464 if (retry_count >= cct->_conf->bluestore_retry_disk_reads) { return -EIO; } return _do_readv(c, o, m, bl, op_flags, retry_count + 1); } bl.claim_append(t); } if (retry_count) { logger->inc(l_bluestore_reads_with_retries); dout(5) << __func__ << " read fiemap " << m << " failed " << retry_count << " times before succeeding" << dendl; } return bl.length(); } int BlueStore::dump_onode(CollectionHandle &c_, const ghobject_t& oid, const string& section_name, Formatter *f) { Collection *c = static_cast(c_.get()); dout(15) << __func__ << " " << c->cid << " " << oid << dendl; if (!c->exists) return -ENOENT; int r; { std::shared_lock l(c->lock); OnodeRef o = c->get_onode(oid, false); if (!o || !o->exists) { r = -ENOENT; goto out; } // FIXME minor: actually the next line isn't enough to // load shared blobs. Leaving as is for now.. // o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE); _dump_onode<0>(cct, *o); f->open_object_section(section_name.c_str()); o->dump(f); f->close_section(); r = 0; } out: dout(10) << __func__ << " " << c->cid << " " << oid << " = " << r << dendl; return r; } int BlueStore::getattr( CollectionHandle &c_, const ghobject_t& oid, const char *name, bufferptr& value) { Collection *c = static_cast(c_.get()); dout(15) << __func__ << " " << c->cid << " " << oid << " " << name << dendl; if (!c->exists) return -ENOENT; int r; { std::shared_lock l(c->lock); mempool::bluestore_cache_meta::string k(name); OnodeRef o = c->get_onode(oid, false); if (!o || !o->exists) { r = -ENOENT; goto out; } if (!o->onode.attrs.count(k)) { r = -ENODATA; goto out; } value = o->onode.attrs[k]; r = 0; } out: if (r == 0 && _debug_mdata_eio(oid)) { r = -EIO; derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl; } dout(10) << __func__ << " " << c->cid << " " << oid << " " << name << " = " << r << dendl; return r; } int BlueStore::getattrs( CollectionHandle &c_, const ghobject_t& oid, map& aset) { Collection *c = static_cast(c_.get()); dout(15) << __func__ << " " << c->cid << " " << oid << dendl; if (!c->exists) return -ENOENT; int r; { std::shared_lock l(c->lock); OnodeRef o = c->get_onode(oid, false); if (!o || !o->exists) { r = -ENOENT; goto out; } for (auto& i : o->onode.attrs) { aset.emplace(i.first.c_str(), i.second); } r = 0; } out: if (r == 0 && _debug_mdata_eio(oid)) { r = -EIO; derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl; } dout(10) << __func__ << " " << c->cid << " " << oid << " = " << r << dendl; return r; } int BlueStore::list_collections(vector& ls) { std::shared_lock l(coll_lock); ls.reserve(coll_map.size()); for (ceph::unordered_map::iterator p = coll_map.begin(); p != coll_map.end(); ++p) ls.push_back(p->first); return 0; } bool BlueStore::collection_exists(const coll_t& c) { std::shared_lock l(coll_lock); return coll_map.count(c); } int BlueStore::collection_empty(CollectionHandle& ch, bool *empty) { dout(15) << __func__ << " " << ch->cid << dendl; vector ls; ghobject_t next; int r = collection_list(ch, ghobject_t(), ghobject_t::get_max(), 1, &ls, &next); if (r < 0) { derr << __func__ << " collection_list returned: " << cpp_strerror(r) << dendl; return r; } *empty = ls.empty(); dout(10) << __func__ << " " << ch->cid << " = " << (int)(*empty) << dendl; return 0; } int BlueStore::collection_bits(CollectionHandle& ch) { dout(15) << __func__ << " " << ch->cid << dendl; Collection *c = static_cast(ch.get()); std::shared_lock l(c->lock); dout(10) << __func__ << " " << ch->cid << " = " << 
c->cnode.bits << dendl; return c->cnode.bits; } int BlueStore::collection_list( CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max, vector *ls, ghobject_t *pnext) { Collection *c = static_cast(c_.get()); c->flush(); dout(15) << __func__ << " " << c->cid << " start " << start << " end " << end << " max " << max << dendl; int r; { std::shared_lock l(c->lock); r = _collection_list(c, start, end, max, false, ls, pnext); } dout(10) << __func__ << " " << c->cid << " start " << start << " end " << end << " max " << max << " = " << r << ", ls.size() = " << ls->size() << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl; return r; } int BlueStore::collection_list_legacy( CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max, vector *ls, ghobject_t *pnext) { Collection *c = static_cast(c_.get()); c->flush(); dout(15) << __func__ << " " << c->cid << " start " << start << " end " << end << " max " << max << dendl; int r; { std::shared_lock l(c->lock); r = _collection_list(c, start, end, max, true, ls, pnext); } dout(10) << __func__ << " " << c->cid << " start " << start << " end " << end << " max " << max << " = " << r << ", ls.size() = " << ls->size() << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl; return r; } int BlueStore::_collection_list( Collection *c, const ghobject_t& start, const ghobject_t& end, int max, bool legacy, vector *ls, ghobject_t *pnext) { if (!c->exists) return -ENOENT; ghobject_t static_next; std::unique_ptr it; ghobject_t coll_range_temp_start, coll_range_temp_end; ghobject_t coll_range_start, coll_range_end; ghobject_t pend; bool temp; if (!pnext) pnext = &static_next; auto log_latency = make_scope_guard( [&, start_time = mono_clock::now(), func_name = __func__] { log_latency_fn( func_name, l_bluestore_remove_lat, mono_clock::now() - start_time, cct->_conf->bluestore_log_collection_list_age, [&](const ceph::timespan& lat) { ostringstream ostr; ostr << ", lat = " << timespan_str(lat) << " cid =" << c->cid << " start " << start << " end " << end << " max " << max; return ostr.str(); }); }); if (start.is_max() || start.hobj.is_max()) { *pnext = ghobject_t::get_max(); return 0; } get_coll_range(c->cid, c->cnode.bits, &coll_range_temp_start, &coll_range_temp_end, &coll_range_start, &coll_range_end, legacy); dout(20) << __func__ << " range " << coll_range_temp_start << " to " << coll_range_temp_end << " and " << coll_range_start << " to " << coll_range_end << " start " << start << dendl; if (legacy) { it = std::make_unique( cct, db->get_iterator(PREFIX_OBJ)); } else { it = std::make_unique( db->get_iterator(PREFIX_OBJ)); } if (start == ghobject_t() || start.hobj == hobject_t() || start == c->cid.get_min_hobj()) { it->upper_bound(coll_range_temp_start); temp = true; } else { if (start.hobj.is_temp()) { temp = true; ceph_assert(start >= coll_range_temp_start && start < coll_range_temp_end); } else { temp = false; ceph_assert(start >= coll_range_start && start < coll_range_end); } dout(20) << __func__ << " temp=" << (int)temp << dendl; it->lower_bound(start); } if (end.hobj.is_max()) { pend = temp ? coll_range_temp_end : coll_range_end; } else { if (end.hobj.is_temp()) { if (temp) { pend = end; } else { *pnext = ghobject_t::get_max(); return 0; } } else { pend = temp ? 
coll_range_temp_end : end; } } dout(20) << __func__ << " pend " << pend << dendl; while (true) { if (!it->valid() || it->is_ge(pend)) { if (!it->valid()) dout(20) << __func__ << " iterator not valid (end of db?)" << dendl; else dout(20) << __func__ << " oid " << it->oid() << " >= " << pend << dendl; if (temp) { if (end.hobj.is_temp()) { if (it->valid() && it->is_lt(coll_range_temp_end)) { *pnext = it->oid(); return 0; } break; } dout(30) << __func__ << " switch to non-temp namespace" << dendl; temp = false; it->upper_bound(coll_range_start); if (end.hobj.is_max()) pend = coll_range_end; else pend = end; dout(30) << __func__ << " pend " << pend << dendl; continue; } if (it->valid() && it->is_lt(coll_range_end)) { *pnext = it->oid(); return 0; } break; } dout(20) << __func__ << " oid " << it->oid() << " end " << end << dendl; if (ls->size() >= (unsigned)max) { dout(20) << __func__ << " reached max " << max << dendl; *pnext = it->oid(); return 0; } ls->push_back(it->oid()); it->next(); } *pnext = ghobject_t::get_max(); return 0; } int BlueStore::omap_get( CollectionHandle &c_, ///< [in] Collection containing oid const ghobject_t &oid, ///< [in] Object containing omap bufferlist *header, ///< [out] omap header map *out /// < [out] Key to value map ) { Collection *c = static_cast(c_.get()); return _omap_get(c, oid, header, out); } int BlueStore::_omap_get( Collection *c, ///< [in] Collection containing oid const ghobject_t &oid, ///< [in] Object containing omap bufferlist *header, ///< [out] omap header map *out /// < [out] Key to value map ) { dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl; if (!c->exists) return -ENOENT; std::shared_lock l(c->lock); int r = 0; OnodeRef o = c->get_onode(oid, false); if (!o || !o->exists) { r = -ENOENT; goto out; } r = _onode_omap_get(o, header, out); out: dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r << dendl; return r; } int BlueStore::_onode_omap_get( const OnodeRef &o, ///< [in] Object containing omap bufferlist *header, ///< [out] omap header map *out /// < [out] Key to value map ) { int r = 0; if (!o || !o->exists) { r = -ENOENT; goto out; } if (!o->onode.has_omap()) goto out; o->flush(); { const string& prefix = o->get_omap_prefix(); string head, tail; o->get_omap_header(&head); o->get_omap_tail(&tail); KeyValueDB::Iterator it = db->get_iterator(prefix, 0, KeyValueDB::IteratorBounds{head, tail}); it->lower_bound(head); while (it->valid()) { if (it->key() == head) { dout(30) << __func__ << " got header" << dendl; *header = it->value(); } else if (it->key() >= tail) { dout(30) << __func__ << " reached tail" << dendl; break; } else { string user_key; o->decode_omap_key(it->key(), &user_key); dout(20) << __func__ << " got " << pretty_binary_string(it->key()) << " -> " << user_key << dendl; (*out)[user_key] = it->value(); } it->next(); } } out: return r; } int BlueStore::omap_get_header( CollectionHandle &c_, ///< [in] Collection containing oid const ghobject_t &oid, ///< [in] Object containing omap bufferlist *header, ///< [out] omap header bool allow_eio ///< [in] don't assert on eio ) { Collection *c = static_cast(c_.get()); dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl; if (!c->exists) return -ENOENT; std::shared_lock l(c->lock); int r = 0; OnodeRef o = c->get_onode(oid, false); if (!o || !o->exists) { r = -ENOENT; goto out; } if (!o->onode.has_omap()) goto out; o->flush(); { string head; o->get_omap_header(&head); if (db->get(o->get_omap_prefix(), head, header) >= 0) { 
dout(30) << __func__ << " got header" << dendl; } else { dout(30) << __func__ << " no header" << dendl; } } out: dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r << dendl; return r; } int BlueStore::omap_get_keys( CollectionHandle &c_, ///< [in] Collection containing oid const ghobject_t &oid, ///< [in] Object containing omap set *keys ///< [out] Keys defined on oid ) { Collection *c = static_cast(c_.get()); dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl; if (!c->exists) return -ENOENT; auto start1 = mono_clock::now(); std::shared_lock l(c->lock); int r = 0; OnodeRef o = c->get_onode(oid, false); if (!o || !o->exists) { r = -ENOENT; goto out; } if (!o->onode.has_omap()) goto out; o->flush(); { const string& prefix = o->get_omap_prefix(); string head, tail; o->get_omap_key(string(), &head); o->get_omap_tail(&tail); KeyValueDB::Iterator it = db->get_iterator(prefix, 0, KeyValueDB::IteratorBounds{head, tail}); it->lower_bound(head); while (it->valid()) { if (it->key() >= tail) { dout(30) << __func__ << " reached tail" << dendl; break; } string user_key; o->decode_omap_key(it->key(), &user_key); dout(20) << __func__ << " got " << pretty_binary_string(it->key()) << " -> " << user_key << dendl; keys->insert(user_key); it->next(); } } out: c->store->log_latency( __func__, l_bluestore_omap_get_keys_lat, mono_clock::now() - start1, c->store->cct->_conf->bluestore_log_omap_iterator_age); dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r << dendl; return r; } int BlueStore::omap_get_values( CollectionHandle &c_, ///< [in] Collection containing oid const ghobject_t &oid, ///< [in] Object containing omap const set &keys, ///< [in] Keys to get map *out ///< [out] Returned keys and values ) { Collection *c = static_cast(c_.get()); dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl; if (!c->exists) return -ENOENT; std::shared_lock l(c->lock); auto start1 = mono_clock::now(); int r = 0; string final_key; OnodeRef o = c->get_onode(oid, false); if (!o || !o->exists) { r = -ENOENT; goto out; } if (!o->onode.has_omap()) { goto out; } o->flush(); { const string& prefix = o->get_omap_prefix(); o->get_omap_key(string(), &final_key); size_t base_key_len = final_key.size(); for (set::const_iterator p = keys.begin(); p != keys.end(); ++p) { final_key.resize(base_key_len); // keep prefix final_key += *p; bufferlist val; if (db->get(prefix, final_key, &val) >= 0) { dout(30) << __func__ << " got " << pretty_binary_string(final_key) << " -> " << *p << dendl; out->insert(make_pair(*p, val)); } } } out: c->store->log_latency( __func__, l_bluestore_omap_get_values_lat, mono_clock::now() - start1, c->store->cct->_conf->bluestore_log_omap_iterator_age); dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r << dendl; return r; } #ifdef WITH_SEASTAR int BlueStore::omap_get_values( CollectionHandle &c_, ///< [in] Collection containing oid const ghobject_t &oid, ///< [in] Object containing omap const std::optional &start_after, ///< [in] Keys to get map *output ///< [out] Returned keys and values ) { Collection *c = static_cast(c_.get()); dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl; if (!c->exists) return -ENOENT; std::shared_lock l(c->lock); int r = 0; OnodeRef o = c->get_onode(oid, false); if (!o || !o->exists) { r = -ENOENT; goto out; } if (!o->onode.has_omap()) { goto out; } o->flush(); { ObjectMap::ObjectMapIterator iter = get_omap_iterator(c_, oid); if (!iter) { r = -ENOENT; goto 
out; } iter->upper_bound(*start_after); for (; iter->valid(); iter->next()) { output->insert(make_pair(iter->key(), iter->value())); } } out: dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r << dendl; return r; } #endif int BlueStore::omap_check_keys( CollectionHandle &c_, ///< [in] Collection containing oid const ghobject_t &oid, ///< [in] Object containing omap const set<string> &keys, ///< [in] Keys to check set<string> *out ///< [out] Subset of keys defined on oid ) { Collection *c = static_cast<Collection *>(c_.get()); dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl; if (!c->exists) return -ENOENT; std::shared_lock l(c->lock); int r = 0; string final_key; OnodeRef o = c->get_onode(oid, false); if (!o || !o->exists) { r = -ENOENT; goto out; } if (!o->onode.has_omap()) { goto out; } o->flush(); { const string& prefix = o->get_omap_prefix(); o->get_omap_key(string(), &final_key); size_t base_key_len = final_key.size(); for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) { final_key.resize(base_key_len); // keep prefix final_key += *p; bufferlist val; if (db->get(prefix, final_key, &val) >= 0) { dout(30) << __func__ << " have " << pretty_binary_string(final_key) << " -> " << *p << dendl; out->insert(*p); } else { dout(30) << __func__ << " miss " << pretty_binary_string(final_key) << " -> " << *p << dendl; } } } out: dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r << dendl; return r; } ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator( CollectionHandle &c_, ///< [in] collection const ghobject_t &oid ///< [in] object ) { Collection *c = static_cast<Collection *>(c_.get()); dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl; if (!c->exists) { return ObjectMap::ObjectMapIterator(); } std::shared_lock l(c->lock); OnodeRef o = c->get_onode(oid, false); if (!o || !o->exists) { dout(10) << __func__ << " " << oid << " doesn't exist" << dendl; return ObjectMap::ObjectMapIterator(); } o->flush(); dout(10) << __func__ << " has_omap = " << (int)o->onode.has_omap() << dendl; auto bounds = KeyValueDB::IteratorBounds(); if (o->onode.has_omap()) { std::string lower_bound, upper_bound; o->get_omap_key(string(), &lower_bound); o->get_omap_tail(&upper_bound); bounds.lower_bound = std::move(lower_bound); bounds.upper_bound = std::move(upper_bound); } KeyValueDB::Iterator it = db->get_iterator(o->get_omap_prefix(), 0, std::move(bounds)); return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c, o, it)); } // ----------------- // write helpers uint64_t BlueStore::_get_ondisk_reserved() const { ceph_assert(min_alloc_size); return round_up_to( std::max<uint64_t>(SUPER_RESERVED, min_alloc_size), min_alloc_size); } void BlueStore::_prepare_ondisk_format_super(KeyValueDB::Transaction& t) { dout(10) << __func__ << " ondisk_format " << ondisk_format << " min_compat_ondisk_format " << min_compat_ondisk_format << dendl; ceph_assert(ondisk_format == latest_ondisk_format); { bufferlist bl; encode(ondisk_format, bl); t->set(PREFIX_SUPER, "ondisk_format", bl); } { bufferlist bl; encode(min_compat_ondisk_format, bl); t->set(PREFIX_SUPER, "min_compat_ondisk_format", bl); } } int BlueStore::_open_super_meta() { // nid { nid_max = 0; bufferlist bl; db->get(PREFIX_SUPER, "nid_max", &bl); auto p = bl.cbegin(); try { uint64_t v; decode(v, p); nid_max = v; } catch (ceph::buffer::error& e) { derr << __func__ << " unable to read nid_max" << dendl; return -EIO; } dout(1) << __func__ << " old nid_max " << nid_max << dendl; nid_last = nid_max.load(); } // blobid { blobid_max = 0; bufferlist bl; db->get(PREFIX_SUPER, "blobid_max", &bl); auto p = bl.cbegin(); try { uint64_t v; decode(v,
p); blobid_max = v; } catch (ceph::buffer::error& e) { derr << __func__ << " unable to read blobid_max" << dendl; return -EIO; } dout(1) << __func__ << " old blobid_max " << blobid_max << dendl; blobid_last = blobid_max.load(); } // freelist { bufferlist bl; db->get(PREFIX_SUPER, "freelist_type", &bl); if (bl.length()) { freelist_type = std::string(bl.c_str(), bl.length()); dout(1) << __func__ << " freelist_type " << freelist_type << dendl; } else { ceph_abort_msg("Not Support extent freelist manager"); } } // ondisk format int32_t compat_ondisk_format = 0; { bufferlist bl; int r = db->get(PREFIX_SUPER, "ondisk_format", &bl); if (r < 0) { // base case: kraken bluestore is v1 and readable by v1 dout(20) << __func__ << " missing ondisk_format; assuming kraken" << dendl; ondisk_format = 1; compat_ondisk_format = 1; } else { auto p = bl.cbegin(); try { decode(ondisk_format, p); } catch (ceph::buffer::error& e) { derr << __func__ << " unable to read ondisk_format" << dendl; return -EIO; } bl.clear(); { r = db->get(PREFIX_SUPER, "min_compat_ondisk_format", &bl); ceph_assert(!r); auto p = bl.cbegin(); try { decode(compat_ondisk_format, p); } catch (ceph::buffer::error& e) { derr << __func__ << " unable to read compat_ondisk_format" << dendl; return -EIO; } } } dout(1) << __func__ << " ondisk_format " << ondisk_format << " compat_ondisk_format " << compat_ondisk_format << dendl; } if (latest_ondisk_format < compat_ondisk_format) { derr << __func__ << " compat_ondisk_format is " << compat_ondisk_format << " but we only understand version " << latest_ondisk_format << dendl; return -EPERM; } { bufferlist bl; db->get(PREFIX_SUPER, "min_alloc_size", &bl); auto p = bl.cbegin(); try { uint64_t val; decode(val, p); min_alloc_size = val; min_alloc_size_order = ctz(val); ceph_assert(min_alloc_size == 1u << min_alloc_size_order); } catch (ceph::buffer::error& e) { derr << __func__ << " unable to read min_alloc_size" << dendl; return -EIO; } dout(1) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size << std::dec << dendl; } _set_per_pool_omap(); _open_statfs(); _set_alloc_sizes(); _set_throttle_params(); _set_csum(); _set_compression(); _set_blob_size(); _validate_bdev(); return 0; } int BlueStore::_upgrade_super() { dout(1) << __func__ << " from " << ondisk_format << ", latest " << latest_ondisk_format << dendl; if (ondisk_format < latest_ondisk_format) { ceph_assert(ondisk_format > 0); ceph_assert(ondisk_format < latest_ondisk_format); KeyValueDB::Transaction t = db->get_transaction(); if (ondisk_format == 1) { // changes: // - super: added ondisk_format // - super: added min_readable_ondisk_format // - super: added min_compat_ondisk_format // - super: added min_alloc_size // - super: removed min_min_alloc_size { bufferlist bl; db->get(PREFIX_SUPER, "min_min_alloc_size", &bl); auto p = bl.cbegin(); try { uint64_t val; decode(val, p); min_alloc_size = val; } catch (ceph::buffer::error& e) { derr << __func__ << " failed to read min_min_alloc_size" << dendl; return -EIO; } t->set(PREFIX_SUPER, "min_alloc_size", bl); t->rmkey(PREFIX_SUPER, "min_min_alloc_size"); } ondisk_format = 2; } if (ondisk_format == 2) { // changes: // - onode has FLAG_PERPOOL_OMAP. Note that we do not know that *all* // oondes are using the per-pool prefix until a repair is run; at that // point the per_pool_omap=1 key will be set. 
// - super: added per_pool_omap key, which indicates that *all* objects // are using the new prefix and key format ondisk_format = 3; } if (ondisk_format == 3) { // changes: // - FreelistManager keeps meta within bdev label int r = _write_out_fm_meta(0); ceph_assert(r == 0); ondisk_format = 4; } // This to be the last operation _prepare_ondisk_format_super(t); int r = db->submit_transaction_sync(t); ceph_assert(r == 0); } // done dout(1) << __func__ << " done" << dendl; return 0; } void BlueStore::_assign_nid(TransContext *txc, OnodeRef o) { if (o->onode.nid) { ceph_assert(o->exists); return; } uint64_t nid = ++nid_last; dout(20) << __func__ << " " << nid << dendl; o->onode.nid = nid; txc->last_nid = nid; o->exists = true; } uint64_t BlueStore::_assign_blobid(TransContext *txc) { uint64_t bid = ++blobid_last; dout(20) << __func__ << " " << bid << dendl; txc->last_blobid = bid; return bid; } void BlueStore::get_db_statistics(Formatter *f) { db->get_statistics(f); } BlueStore::TransContext *BlueStore::_txc_create( Collection *c, OpSequencer *osr, list *on_commits, TrackedOpRef osd_op) { TransContext *txc = new TransContext(cct, c, osr, on_commits); txc->t = db->get_transaction(); #ifdef WITH_BLKIN if (osd_op && osd_op->pg_trace) { txc->trace.init("TransContext", &trace_endpoint, &osd_op->pg_trace); txc->trace.event("txc create"); txc->trace.keyval("txc seq", txc->seq); } #endif osr->queue_new(txc); dout(20) << __func__ << " osr " << osr << " = " << txc << " seq " << txc->seq << dendl; return txc; } void BlueStore::_txc_calc_cost(TransContext *txc) { // one "io" for the kv commit auto ios = 1 + txc->ioc.get_num_ios(); auto cost = throttle_cost_per_io.load(); txc->cost = ios * cost + txc->bytes; txc->ios = ios; dout(10) << __func__ << " " << txc << " cost " << txc->cost << " (" << ios << " ios * " << cost << " + " << txc->bytes << " bytes)" << dendl; } void BlueStore::_txc_update_store_statfs(TransContext *txc) { if (txc->statfs_delta.is_empty()) return; logger->inc(l_bluestore_allocated, txc->statfs_delta.allocated()); logger->inc(l_bluestore_stored, txc->statfs_delta.stored()); logger->inc(l_bluestore_compressed, txc->statfs_delta.compressed()); logger->inc(l_bluestore_compressed_allocated, txc->statfs_delta.compressed_allocated()); logger->inc(l_bluestore_compressed_original, txc->statfs_delta.compressed_original()); bufferlist bl; txc->statfs_delta.encode(bl); if (per_pool_stat_collection) { string key; get_pool_stat_key(txc->osd_pool_id, &key); txc->t->merge(PREFIX_STAT, key, bl); std::lock_guard l(vstatfs_lock); auto& stats = osd_pools[txc->osd_pool_id]; stats += txc->statfs_delta; vstatfs += txc->statfs_delta; //non-persistent in this mode } else { txc->t->merge(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl); std::lock_guard l(vstatfs_lock); vstatfs += txc->statfs_delta; } txc->statfs_delta.reset(); } void BlueStore::_txc_state_proc(TransContext *txc) { while (true) { dout(10) << __func__ << " txc " << txc << " " << txc->get_state_name() << dendl; switch (txc->get_state()) { case TransContext::STATE_PREPARE: throttle.log_state_latency(*txc, logger, l_bluestore_state_prepare_lat); if (txc->ioc.has_pending_aios()) { txc->set_state(TransContext::STATE_AIO_WAIT); #ifdef WITH_BLKIN if (txc->trace) { txc->trace.keyval("pending aios", txc->ioc.num_pending.load()); } #endif txc->had_ios = true; _txc_aio_submit(txc); return; } // ** fall-thru ** case TransContext::STATE_AIO_WAIT: { mono_clock::duration lat = throttle.log_state_latency( *txc, logger, l_bluestore_state_aio_wait_lat); if 
(ceph::to_seconds(lat) >= cct->_conf->bluestore_log_op_age) { dout(0) << __func__ << " slow aio_wait, txc = " << txc << ", latency = " << lat << dendl; } } _txc_finish_io(txc); // may trigger blocked txc's too return; case TransContext::STATE_IO_DONE: ceph_assert(ceph_mutex_is_locked(txc->osr->qlock)); // see _txc_finish_io if (txc->had_ios) { ++txc->osr->txc_with_unstable_io; } throttle.log_state_latency(*txc, logger, l_bluestore_state_io_done_lat); txc->set_state(TransContext::STATE_KV_QUEUED); if (cct->_conf->bluestore_sync_submit_transaction) { if (txc->last_nid >= nid_max || txc->last_blobid >= blobid_max) { dout(20) << __func__ << " last_{nid,blobid} exceeds max, submit via kv thread" << dendl; } else if (txc->osr->kv_committing_serially) { dout(20) << __func__ << " prior txc submitted via kv thread, us too" << dendl; // note: this is starvation-prone. once we have a txc in a busy // sequencer that is committing serially it is possible to keep // submitting new transactions fast enough that we get stuck doing // so. the alternative is to block here... fixme? } else if (txc->osr->txc_with_unstable_io) { dout(20) << __func__ << " prior txc(s) with unstable ios " << txc->osr->txc_with_unstable_io.load() << dendl; } else if (cct->_conf->bluestore_debug_randomize_serial_transaction && rand() % cct->_conf->bluestore_debug_randomize_serial_transaction == 0) { dout(20) << __func__ << " DEBUG randomly forcing submit via kv thread" << dendl; } else { _txc_apply_kv(txc, true); } } { std::lock_guard l(kv_lock); kv_queue.push_back(txc); if (!kv_sync_in_progress) { kv_sync_in_progress = true; kv_cond.notify_one(); } if (txc->get_state() != TransContext::STATE_KV_SUBMITTED) { kv_queue_unsubmitted.push_back(txc); ++txc->osr->kv_committing_serially; } if (txc->had_ios) kv_ios++; kv_throttle_costs += txc->cost; } return; case TransContext::STATE_KV_SUBMITTED: _txc_committed_kv(txc); // ** fall-thru ** case TransContext::STATE_KV_DONE: throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_done_lat); if (txc->deferred_txn) { txc->set_state(TransContext::STATE_DEFERRED_QUEUED); _deferred_queue(txc); return; } txc->set_state(TransContext::STATE_FINISHING); break; case TransContext::STATE_DEFERRED_CLEANUP: throttle.log_state_latency(*txc, logger, l_bluestore_state_deferred_cleanup_lat); txc->set_state(TransContext::STATE_FINISHING); // ** fall-thru ** case TransContext::STATE_FINISHING: throttle.log_state_latency(*txc, logger, l_bluestore_state_finishing_lat); _txc_finish(txc); return; default: derr << __func__ << " unexpected txc " << txc << " state " << txc->get_state_name() << dendl; ceph_abort_msg("unexpected txc state"); return; } } } void BlueStore::_txc_finish_io(TransContext *txc) { dout(20) << __func__ << " " << txc << dendl; /* * we need to preserve the order of kv transactions, * even though aio will complete in any order. 
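 *
 * Worked example (illustrative): txcs A, B and C are queued on one
 * OpSequencer in that order, and B's and C's aios happen to complete
 * before A's.  B and C get marked STATE_IO_DONE here, but the backward
 * scan below finds A still short of STATE_IO_DONE and returns without
 * advancing them; only once A's aio completes does the do/while loop walk
 * forward and feed A, B and C to _txc_state_proc() in submission order.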
*/ OpSequencer *osr = txc->osr.get(); std::lock_guard l(osr->qlock); txc->set_state(TransContext::STATE_IO_DONE); txc->ioc.release_running_aios(); OpSequencer::q_list_t::iterator p = osr->q.iterator_to(*txc); while (p != osr->q.begin()) { --p; if (p->get_state() < TransContext::STATE_IO_DONE) { dout(20) << __func__ << " " << txc << " blocked by " << &*p << " " << p->get_state_name() << dendl; return; } if (p->get_state() > TransContext::STATE_IO_DONE) { ++p; break; } } do { _txc_state_proc(&*p++); } while (p != osr->q.end() && p->get_state() == TransContext::STATE_IO_DONE); if (osr->kv_submitted_waiters) { osr->qcond.notify_all(); } } void BlueStore::_txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t) { dout(20) << __func__ << " txc " << txc << " onodes " << txc->onodes << " shared_blobs " << txc->shared_blobs << dendl; // finalize onodes for (auto o : txc->onodes) { _record_onode(o, t); o->flushing_count++; } // objects we modified but didn't affect the onode auto p = txc->modified_objects.begin(); while (p != txc->modified_objects.end()) { if (txc->onodes.count(*p) == 0) { (*p)->flushing_count++; ++p; } else { // remove dups with onodes list to avoid problems in _txc_finish p = txc->modified_objects.erase(p); } } // finalize shared_blobs for (auto sb : txc->shared_blobs) { string key; auto sbid = sb->get_sbid(); get_shared_blob_key(sbid, &key); if (sb->persistent->empty()) { dout(20) << __func__ << " shared_blob 0x" << std::hex << sbid << std::dec << " is empty" << dendl; t->rmkey(PREFIX_SHARED_BLOB, key); } else { bufferlist bl; encode(*(sb->persistent), bl); dout(20) << __func__ << " shared_blob 0x" << std::hex << sbid << std::dec << " is " << bl.length() << " " << *sb << dendl; t->set(PREFIX_SHARED_BLOB, key, bl); } } } void BlueStore::BSPerfTracker::update_from_perfcounters( PerfCounters &logger) { os_commit_latency_ns.consume_next( logger.get_tavg_ns( l_bluestore_commit_lat)); os_apply_latency_ns.consume_next( logger.get_tavg_ns( l_bluestore_commit_lat)); } // For every object we maintain tuple in the key-value // store. When a new object written to a zone, we insert the corresponding // tuple to the database. When an object is truncated, we remove the // corresponding tuple. When an object is overwritten, we remove the old tuple // and insert a new tuple corresponding to the new location of the object. The // cleaner can now identify live objects within the zone by // enumerating all the keys starting with prefix. void BlueStore::_zoned_update_cleaning_metadata(TransContext *txc) { for (const auto &[o, offsets] : txc->zoned_onode_to_offset_map) { std::string key; get_object_key(cct, o->oid, &key); for (auto offset : offsets) { if (offset > 0) { bufferlist offset_bl; encode(offset, offset_bl); txc->t->set(_zoned_get_prefix(offset), key, offset_bl); } else { txc->t->rmkey(_zoned_get_prefix(-offset), key); } } } } std::string BlueStore::_zoned_get_prefix(uint64_t offset) { uint64_t zone_num = offset / bdev->get_zone_size(); std::string zone_key; _key_encode_u64(zone_num, &zone_key); return PREFIX_ZONED_CL_INFO + zone_key; } // For now, to avoid interface changes we piggyback zone_size (in MiB) and the // first sequential zone number onto min_alloc_size and pass it to functions // Allocator::create and FreelistManager::create. 
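//
// The packed layout used by the function below is:
//
//   bits  0..31  min_alloc_size, in bytes
//   bits 32..47  zone size, in MiB (assuming zone sizes below 64 TiB)
//   bits 48..63  number of the first sequential zone
//
// A quick self-contained sketch of the packing/unpacking, kept out of the
// build via #if 0; the standalone main() and the sample values are
// hypothetical, only the bit layout mirrors the function below:
#if 0
#include <cassert>
#include <cstdint>

int main() {
  const uint64_t min_alloc_size = 64 * 1024;  // 64 KiB
  const uint64_t zone_size_mb   = 256;        // 256 MiB zones
  const uint64_t first_seq_zone = 524;        // first non-conventional zone

  // pack, mirroring _zoned_piggyback_device_parameters_onto()
  uint64_t packed = min_alloc_size;
  packed |= (zone_size_mb << 32);
  packed |= (first_seq_zone << 48);

  // unpack on the receiving side
  assert((packed & 0xffffffffull) == min_alloc_size);
  assert(((packed >> 32) & 0xffffull) == zone_size_mb);
  assert((packed >> 48) == first_seq_zone);
  return 0;
}
#endif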
uint64_t BlueStore::_zoned_piggyback_device_parameters_onto(uint64_t min_alloc_size) { uint64_t zone_size = bdev->get_zone_size(); uint64_t zone_size_mb = zone_size / (1024 * 1024); uint64_t first_seq_zone = bdev->get_conventional_region_size() / zone_size; min_alloc_size |= (zone_size_mb << 32); min_alloc_size |= (first_seq_zone << 48); return min_alloc_size; } int BlueStore::_zoned_check_config_settings() { if (cct->_conf->bluestore_allocator != "zoned") { dout(1) << __func__ << " The drive is HM-SMR but " << cct->_conf->bluestore_allocator << " allocator is specified. " << "Only zoned allocator can be used with HM-SMR drive." << dendl; return -EINVAL; } // At least for now we want to use large min_alloc_size with HM-SMR drives. // Populating used_blocks bitset on a debug build of ceph-osd takes about 5 // minutes with a 14 TB HM-SMR drive and 4 KiB min_alloc_size. if (min_alloc_size < 64 * 1024) { dout(1) << __func__ << " The drive is HM-SMR but min_alloc_size is " << min_alloc_size << ". " << "Please set to at least 64 KiB." << dendl; return -EINVAL; } // We don't want to defer writes with HM-SMR because it violates sequential // write requirement. if (prefer_deferred_size) { dout(1) << __func__ << " The drive is HM-SMR but prefer_deferred_size is " << prefer_deferred_size << ". " << "Please set to 0." << dendl; return -EINVAL; } return 0; } void BlueStore::_txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t) { dout(20) << __func__ << " txc " << txc << std::hex << " allocated 0x" << txc->allocated << " released 0x" << txc->released << std::dec << dendl; // We have to handle the case where we allocate *and* deallocate the // same region in this transaction. The freelist doesn't like that. // (Actually, the only thing that cares is the BitmapFreelistManager // debug check. But that's important.) interval_set tmp_allocated, tmp_released; interval_set *pallocated = &txc->allocated; interval_set *preleased = &txc->released; if (!txc->allocated.empty() && !txc->released.empty()) { interval_set overlap; overlap.intersection_of(txc->allocated, txc->released); if (!overlap.empty()) { tmp_allocated = txc->allocated; tmp_allocated.subtract(overlap); tmp_released = txc->released; tmp_released.subtract(overlap); dout(20) << __func__ << " overlap 0x" << std::hex << overlap << ", new allocated 0x" << tmp_allocated << " released 0x" << tmp_released << std::dec << dendl; pallocated = &tmp_allocated; preleased = &tmp_released; } } // update freelist with non-overlap sets for (interval_set::iterator p = pallocated->begin(); p != pallocated->end(); ++p) { fm->allocate(p.get_start(), p.get_len(), t); } for (interval_set::iterator p = preleased->begin(); p != preleased->end(); ++p) { dout(20) << __func__ << " release 0x" << std::hex << p.get_start() << "~" << p.get_len() << std::dec << dendl; fm->release(p.get_start(), p.get_len(), t); } if (bdev->is_smr()) { _zoned_update_cleaning_metadata(txc); } _txc_update_store_statfs(txc); } void BlueStore::_txc_apply_kv(TransContext *txc, bool sync_submit_transaction) { ceph_assert(txc->get_state() == TransContext::STATE_KV_QUEUED); { #if defined(WITH_LTTNG) auto start = mono_clock::now(); #endif #ifdef WITH_BLKIN if (txc->trace) { txc->trace.event("db async submit"); } #endif int r = cct->_conf->bluestore_debug_omit_kv_commit ? 
0 : db->submit_transaction(txc->t); ceph_assert(r == 0); txc->set_state(TransContext::STATE_KV_SUBMITTED); if (txc->osr->kv_submitted_waiters) { std::lock_guard l(txc->osr->qlock); txc->osr->qcond.notify_all(); } #if defined(WITH_LTTNG) if (txc->tracing) { tracepoint( bluestore, transaction_kv_submit_latency, txc->osr->get_sequencer_id(), txc->seq, sync_submit_transaction, ceph::to_seconds(mono_clock::now() - start)); } #endif } for (auto ls : { &txc->onodes, &txc->modified_objects }) { for (auto& o : *ls) { dout(20) << __func__ << " onode " << o << " had " << o->flushing_count << dendl; if (--o->flushing_count == 0 && o->waiting_count.load()) { std::lock_guard l(o->flush_lock); o->flush_cond.notify_all(); } } } } void BlueStore::_txc_committed_kv(TransContext *txc) { dout(20) << __func__ << " txc " << txc << dendl; throttle.complete_kv(*txc); { std::lock_guard l(txc->osr->qlock); txc->set_state(TransContext::STATE_KV_DONE); if (txc->ch->commit_queue) { txc->ch->commit_queue->queue(txc->oncommits); } else { finisher.queue(txc->oncommits); } } throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_committing_lat); log_latency_fn( __func__, l_bluestore_commit_lat, mono_clock::now() - txc->start, cct->_conf->bluestore_log_op_age, [&](auto lat) { return ", txc = " + stringify(txc); } ); } void BlueStore::_txc_finish(TransContext *txc) { dout(20) << __func__ << " " << txc << " onodes " << txc->onodes << dendl; ceph_assert(txc->get_state() == TransContext::STATE_FINISHING); for (auto& sb : txc->shared_blobs_written) { sb->finish_write(txc->seq); } txc->shared_blobs_written.clear(); while (!txc->removed_collections.empty()) { _queue_reap_collection(txc->removed_collections.front()); txc->removed_collections.pop_front(); } OpSequencerRef osr = txc->osr; bool empty = false; bool submit_deferred = false; OpSequencer::q_list_t releasing_txc; { std::lock_guard l(osr->qlock); txc->set_state(TransContext::STATE_DONE); bool notify = false; while (!osr->q.empty()) { TransContext *txc = &osr->q.front(); dout(20) << __func__ << " txc " << txc << " " << txc->get_state_name() << dendl; if (txc->get_state() != TransContext::STATE_DONE) { if (txc->get_state() == TransContext::STATE_PREPARE && deferred_aggressive) { // for _osr_drain_preceding() notify = true; } if (txc->get_state() == TransContext::STATE_DEFERRED_QUEUED && osr->q.size() > g_conf()->bluestore_max_deferred_txc) { submit_deferred = true; } break; } osr->q.pop_front(); releasing_txc.push_back(*txc); } if (osr->q.empty()) { dout(20) << __func__ << " osr " << osr << " q now empty" << dendl; empty = true; } // only drain()/drain_preceding() need wakeup, // other cases use kv_submitted_waiters if (notify || empty) { osr->qcond.notify_all(); } } while (!releasing_txc.empty()) { // release to allocator only after all preceding txc's have also // finished any deferred writes that potentially land in these // blocks auto txc = &releasing_txc.front(); _txc_release_alloc(txc); releasing_txc.pop_front(); throttle.log_state_latency(*txc, logger, l_bluestore_state_done_lat); throttle.complete(*txc); delete txc; } if (submit_deferred) { // we're pinning memory; flush! we could be more fine-grained here but // i'm not sure it's worth the bother. 
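    // Note: the memory pinned here is, at least, the write data buffered in
    // the pending DeferredBatch iomaps; submitting them lets those buffers
    // go once the deferred I/O completes and becomes stable.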
deferred_try_submit(); } if (empty && osr->zombie) { std::lock_guard l(zombie_osr_lock); if (zombie_osr_set.erase(osr->cid)) { dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl; } else { dout(10) << __func__ << " empty zombie osr " << osr << " already reaped" << dendl; } } } void BlueStore::_txc_release_alloc(TransContext *txc) { // it's expected we're called with lazy_release_lock already taken! if (likely(!cct->_conf->bluestore_debug_no_reuse_blocks)) { int r = 0; if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) { r = bdev->queue_discard(txc->released); if (r == 0) { dout(10) << __func__ << "(queued) " << txc << " " << std::hex << txc->released << std::dec << dendl; goto out; } } else if (cct->_conf->bdev_enable_discard) { for (auto p = txc->released.begin(); p != txc->released.end(); ++p) { bdev->discard(p.get_start(), p.get_len()); } } dout(10) << __func__ << "(sync) " << txc << " " << std::hex << txc->released << std::dec << dendl; shared_alloc.a->release(txc->released); } out: txc->allocated.clear(); txc->released.clear(); } void BlueStore::_osr_attach(Collection *c) { // note: caller has RWLock on coll_map auto q = coll_map.find(c->cid); if (q != coll_map.end()) { c->osr = q->second->osr; ldout(cct, 10) << __func__ << " " << c->cid << " reusing osr " << c->osr << " from existing coll " << q->second << dendl; } else { std::lock_guard l(zombie_osr_lock); auto p = zombie_osr_set.find(c->cid); if (p == zombie_osr_set.end()) { c->osr = ceph::make_ref(this, next_sequencer_id++, c->cid); ldout(cct, 10) << __func__ << " " << c->cid << " fresh osr " << c->osr << dendl; } else { c->osr = p->second; zombie_osr_set.erase(p); ldout(cct, 10) << __func__ << " " << c->cid << " resurrecting zombie osr " << c->osr << dendl; c->osr->zombie = false; } } } void BlueStore::_osr_register_zombie(OpSequencer *osr) { std::lock_guard l(zombie_osr_lock); dout(10) << __func__ << " " << osr << " " << osr->cid << dendl; osr->zombie = true; auto i = zombie_osr_set.emplace(osr->cid, osr); // this is either a new insertion or the same osr is already there ceph_assert(i.second || i.first->second == osr); } void BlueStore::_osr_drain_preceding(TransContext *txc) { OpSequencer *osr = txc->osr.get(); dout(10) << __func__ << " " << txc << " osr " << osr << dendl; ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag? { // submit anything pending osr->deferred_lock.lock(); if (osr->deferred_pending && !osr->deferred_running) { _deferred_submit_unlock(osr); } else { osr->deferred_lock.unlock(); } } { // wake up any previously finished deferred events std::lock_guard l(kv_lock); if (!kv_sync_in_progress) { kv_sync_in_progress = true; kv_cond.notify_one(); } } osr->drain_preceding(txc); --deferred_aggressive; dout(10) << __func__ << " " << osr << " done" << dendl; } void BlueStore::_osr_drain(OpSequencer *osr) { dout(10) << __func__ << " " << osr << dendl; ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag? 
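  // Note: while deferred_aggressive is non-zero, newly queued deferred
  // batches are submitted immediately in _deferred_queue() and completed
  // ones trigger deferred_try_submit() from _deferred_aio_finish(), so the
  // drain below makes progress instead of waiting for a batch threshold.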
{ // submit anything pending osr->deferred_lock.lock(); if (osr->deferred_pending && !osr->deferred_running) { _deferred_submit_unlock(osr); } else { osr->deferred_lock.unlock(); } } { // wake up any previously finished deferred events std::lock_guard l(kv_lock); if (!kv_sync_in_progress) { kv_sync_in_progress = true; kv_cond.notify_one(); } } osr->drain(); --deferred_aggressive; dout(10) << __func__ << " " << osr << " done" << dendl; } void BlueStore::_osr_drain_all() { dout(10) << __func__ << dendl; set s; vector zombies; { std::shared_lock l(coll_lock); for (auto& i : coll_map) { s.insert(i.second->osr); } } { std::lock_guard l(zombie_osr_lock); for (auto& i : zombie_osr_set) { s.insert(i.second); zombies.push_back(i.second); } } dout(20) << __func__ << " osr_set " << s << dendl; ++deferred_aggressive; { // submit anything pending deferred_try_submit(); } { // wake up any previously finished deferred events std::lock_guard l(kv_lock); kv_cond.notify_one(); } { std::lock_guard l(kv_finalize_lock); kv_finalize_cond.notify_one(); } for (auto osr : s) { dout(20) << __func__ << " drain " << osr << dendl; osr->drain(); } --deferred_aggressive; { std::lock_guard l(zombie_osr_lock); for (auto& osr : zombies) { if (zombie_osr_set.erase(osr->cid)) { dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl; ceph_assert(osr->q.empty()); } else if (osr->zombie) { dout(10) << __func__ << " empty zombie osr " << osr << " already reaped" << dendl; ceph_assert(osr->q.empty()); } else { dout(10) << __func__ << " empty zombie osr " << osr << " resurrected" << dendl; } } } dout(10) << __func__ << " done" << dendl; } void BlueStore::_kv_start() { dout(10) << __func__ << dendl; finisher.start(); kv_sync_thread.create("bstore_kv_sync"); kv_finalize_thread.create("bstore_kv_final"); } void BlueStore::_kv_stop() { dout(10) << __func__ << dendl; { std::unique_lock l{kv_lock}; while (!kv_sync_started) { kv_cond.wait(l); } kv_stop = true; kv_cond.notify_all(); } { std::unique_lock l{kv_finalize_lock}; while (!kv_finalize_started) { kv_finalize_cond.wait(l); } kv_finalize_stop = true; kv_finalize_cond.notify_all(); } kv_sync_thread.join(); kv_finalize_thread.join(); ceph_assert(removed_collections.empty()); { std::lock_guard l(kv_lock); kv_stop = false; } { std::lock_guard l(kv_finalize_lock); kv_finalize_stop = false; } dout(10) << __func__ << " stopping finishers" << dendl; finisher.wait_for_empty(); finisher.stop(); dout(10) << __func__ << " stopped" << dendl; } void BlueStore::_kv_sync_thread() { dout(10) << __func__ << " start" << dendl; deque deferred_stable_queue; ///< deferred ios done + stable std::unique_lock l{kv_lock}; ceph_assert(!kv_sync_started); kv_sync_started = true; kv_cond.notify_all(); auto t0 = mono_clock::now(); timespan twait = ceph::make_timespan(0); size_t kv_submitted = 0; while (true) { auto period = cct->_conf->bluestore_kv_sync_util_logging_s; auto observation_period = ceph::make_timespan(period); auto elapsed = mono_clock::now() - t0; if (period && elapsed >= observation_period) { dout(5) << __func__ << " utilization: idle " << twait << " of " << elapsed << ", submitted: " << kv_submitted < kv_submitting; deque deferred_done, deferred_stable; uint64_t aios = 0, costs = 0; dout(20) << __func__ << " committing " << kv_queue.size() << " submitting " << kv_queue_unsubmitted.size() << " deferred done " << deferred_done_queue.size() << " stable " << deferred_stable_queue.size() << dendl; kv_committing.swap(kv_queue); kv_submitting.swap(kv_queue_unsubmitted); 
deferred_done.swap(deferred_done_queue); deferred_stable.swap(deferred_stable_queue); aios = kv_ios; costs = kv_throttle_costs; kv_ios = 0; kv_throttle_costs = 0; l.unlock(); dout(30) << __func__ << " committing " << kv_committing << dendl; dout(30) << __func__ << " submitting " << kv_submitting << dendl; dout(30) << __func__ << " deferred_done " << deferred_done << dendl; dout(30) << __func__ << " deferred_stable " << deferred_stable << dendl; auto start = mono_clock::now(); bool force_flush = false; // if bluefs is sharing the same device as data (only), then we // can rely on the bluefs commit to flush the device and make // deferred aios stable. that means that if we do have done deferred // txcs AND we are not on a single device, we need to force a flush. if (bluefs && bluefs_layout.single_shared_device()) { if (aios) { force_flush = true; } else if (kv_committing.empty() && deferred_stable.empty()) { force_flush = true; // there's nothing else to commit! } else if (deferred_aggressive) { force_flush = true; } } else { if (aios || !deferred_done.empty()) { force_flush = true; } else { dout(20) << __func__ << " skipping flush (no aios, no deferred_done)" << dendl; } } if (force_flush) { dout(20) << __func__ << " num_aios=" << aios << " force_flush=" << (int)force_flush << ", flushing, deferred done->stable" << dendl; // flush/barrier on block device bdev->flush(); // if we flush then deferred done are now deferred stable deferred_stable.insert(deferred_stable.end(), deferred_done.begin(), deferred_done.end()); deferred_done.clear(); } auto after_flush = mono_clock::now(); // we will use one final transaction to force a sync KeyValueDB::Transaction synct = db->get_transaction(); // increase {nid,blobid}_max? note that this covers both the // case where we are approaching the max and the case we passed // it. in either case, we increase the max in the earlier txn // we submit. uint64_t new_nid_max = 0, new_blobid_max = 0; if (nid_last + cct->_conf->bluestore_nid_prealloc/2 > nid_max) { KeyValueDB::Transaction t = kv_submitting.empty() ? synct : kv_submitting.front()->t; new_nid_max = nid_last + cct->_conf->bluestore_nid_prealloc; bufferlist bl; encode(new_nid_max, bl); t->set(PREFIX_SUPER, "nid_max", bl); dout(10) << __func__ << " new_nid_max " << new_nid_max << dendl; } if (blobid_last + cct->_conf->bluestore_blobid_prealloc/2 > blobid_max) { KeyValueDB::Transaction t = kv_submitting.empty() ? synct : kv_submitting.front()->t; new_blobid_max = blobid_last + cct->_conf->bluestore_blobid_prealloc; bufferlist bl; encode(new_blobid_max, bl); t->set(PREFIX_SUPER, "blobid_max", bl); dout(10) << __func__ << " new_blobid_max " << new_blobid_max << dendl; } for (auto txc : kv_committing) { throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_queued_lat); if (txc->get_state() == TransContext::STATE_KV_QUEUED) { ++kv_submitted; _txc_apply_kv(txc, false); --txc->osr->kv_committing_serially; } else { ceph_assert(txc->get_state() == TransContext::STATE_KV_SUBMITTED); } if (txc->had_ios) { --txc->osr->txc_with_unstable_io; } } // release throttle *before* we commit. this allows new ops // to be prepared and enter pipeline while we are waiting on // the kv commit sync/flush. then hopefully on the next // iteration there will already be ops awake. otherwise, we // end up going to sleep, and then wake up when the very first // transaction is ready for commit. 
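      // Note: 'costs' is the kv_throttle_costs total captured above, i.e.
      // the sum of txc->cost for the transactions swapped off kv_queue;
      // releasing it before the synchronous submit below lets
      // queue_transactions() admit new work while this thread is blocked.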
throttle.release_kv_throttle(costs); // cleanup sync deferred keys for (auto b : deferred_stable) { for (auto& txc : b->txcs) { bluestore_deferred_transaction_t& wt = *txc.deferred_txn; ceph_assert(wt.released.empty()); // only kraken did this string key; get_deferred_key(wt.seq, &key); synct->rm_single_key(PREFIX_DEFERRED, key); } } #if defined(WITH_LTTNG) auto sync_start = mono_clock::now(); #endif // submit synct synchronously (block and wait for it to commit) int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction_sync(synct); ceph_assert(r == 0); #ifdef WITH_BLKIN for (auto txc : kv_committing) { if (txc->trace) { txc->trace.event("db sync submit"); txc->trace.keyval("kv_committing size", kv_committing.size()); } } #endif int committing_size = kv_committing.size(); int deferred_size = deferred_stable.size(); #if defined(WITH_LTTNG) double sync_latency = ceph::to_seconds(mono_clock::now() - sync_start); for (auto txc: kv_committing) { if (txc->tracing) { tracepoint( bluestore, transaction_kv_sync_latency, txc->osr->get_sequencer_id(), txc->seq, kv_committing.size(), deferred_done.size(), deferred_stable.size(), sync_latency); } } #endif { std::unique_lock m{kv_finalize_lock}; if (kv_committing_to_finalize.empty()) { kv_committing_to_finalize.swap(kv_committing); } else { kv_committing_to_finalize.insert( kv_committing_to_finalize.end(), kv_committing.begin(), kv_committing.end()); kv_committing.clear(); } if (deferred_stable_to_finalize.empty()) { deferred_stable_to_finalize.swap(deferred_stable); } else { deferred_stable_to_finalize.insert( deferred_stable_to_finalize.end(), deferred_stable.begin(), deferred_stable.end()); deferred_stable.clear(); } if (!kv_finalize_in_progress) { kv_finalize_in_progress = true; kv_finalize_cond.notify_one(); } } if (new_nid_max) { nid_max = new_nid_max; dout(10) << __func__ << " nid_max now " << nid_max << dendl; } if (new_blobid_max) { blobid_max = new_blobid_max; dout(10) << __func__ << " blobid_max now " << blobid_max << dendl; } { auto finish = mono_clock::now(); ceph::timespan dur_flush = after_flush - start; ceph::timespan dur_kv = finish - after_flush; ceph::timespan dur = finish - start; dout(20) << __func__ << " committed " << committing_size << " cleaned " << deferred_size << " in " << dur << " (" << dur_flush << " flush + " << dur_kv << " kv commit)" << dendl; log_latency("kv_flush", l_bluestore_kv_flush_lat, dur_flush, cct->_conf->bluestore_log_op_age); log_latency("kv_commit", l_bluestore_kv_commit_lat, dur_kv, cct->_conf->bluestore_log_op_age); log_latency("kv_sync", l_bluestore_kv_sync_lat, dur, cct->_conf->bluestore_log_op_age); } l.lock(); // previously deferred "done" are now "stable" by virtue of this // commit cycle. 
      deferred_stable_queue.swap(deferred_done);
    }
  }
  dout(10) << __func__ << " finish" << dendl;
  kv_sync_started = false;
}

void BlueStore::_kv_finalize_thread()
{
  deque<TransContext*> kv_committed;
  deque<DeferredBatch*> deferred_stable;
  dout(10) << __func__ << " start" << dendl;
  std::unique_lock l(kv_finalize_lock);
  ceph_assert(!kv_finalize_started);
  kv_finalize_started = true;
  kv_finalize_cond.notify_all();
  while (true) {
    ceph_assert(kv_committed.empty());
    ceph_assert(deferred_stable.empty());
    if (kv_committing_to_finalize.empty() &&
        deferred_stable_to_finalize.empty()) {
      if (kv_finalize_stop)
        break;
      dout(20) << __func__ << " sleep" << dendl;
      kv_finalize_in_progress = false;
      kv_finalize_cond.wait(l);
      dout(20) << __func__ << " wake" << dendl;
    } else {
      kv_committed.swap(kv_committing_to_finalize);
      deferred_stable.swap(deferred_stable_to_finalize);
      l.unlock();
      dout(20) << __func__ << " kv_committed " << kv_committed << dendl;
      dout(20) << __func__ << " deferred_stable " << deferred_stable << dendl;

      auto start = mono_clock::now();

      while (!kv_committed.empty()) {
        TransContext *txc = kv_committed.front();
        ceph_assert(txc->get_state() == TransContext::STATE_KV_SUBMITTED);
        _txc_state_proc(txc);
        kv_committed.pop_front();
      }

      for (auto b : deferred_stable) {
        auto p = b->txcs.begin();
        while (p != b->txcs.end()) {
          TransContext *txc = &*p;
          p = b->txcs.erase(p); // unlink here because
          _txc_state_proc(txc); // this may destroy txc
        }
        delete b;
      }
      deferred_stable.clear();

      if (!deferred_aggressive) {
        if (deferred_queue_size >= deferred_batch_ops.load() ||
            throttle.should_submit_deferred()) {
          deferred_try_submit();
        }
      }

      // this is as good a place as any ...
      _reap_collections();

      logger->set(l_bluestore_fragmentation,
        (uint64_t)(shared_alloc.a->get_fragmentation() * 1000));

      log_latency("kv_final",
        l_bluestore_kv_final_lat,
        mono_clock::now() - start,
        cct->_conf->bluestore_log_op_age);

      l.lock();
    }
  }
  dout(10) << __func__ << " finish" << dendl;
  kv_finalize_started = false;
}

void BlueStore::_zoned_cleaner_start() {
  dout(10) << __func__ << dendl;
  zoned_cleaner_thread.create("bstore_zcleaner");
}

void BlueStore::_zoned_cleaner_stop() {
  dout(10) << __func__ << dendl;
  {
    std::unique_lock l{zoned_cleaner_lock};
    while (!zoned_cleaner_started) {
      zoned_cleaner_cond.wait(l);
    }
    zoned_cleaner_stop = true;
    zoned_cleaner_cond.notify_all();
  }
  zoned_cleaner_thread.join();
  {
    std::lock_guard l{zoned_cleaner_lock};
    zoned_cleaner_stop = false;
  }
  dout(10) << __func__ << " done" << dendl;
}

void BlueStore::_zoned_cleaner_thread() {
  dout(10) << __func__ << " start" << dendl;
  std::unique_lock l{zoned_cleaner_lock};
  ceph_assert(!zoned_cleaner_started);
  zoned_cleaner_started = true;
  zoned_cleaner_cond.notify_all();
  std::deque<uint64_t> zones_to_clean;
  while (true) {
    if (zoned_cleaner_queue.empty()) {
      if (zoned_cleaner_stop) {
        break;
      }
      dout(20) << __func__ << " sleep" << dendl;
      zoned_cleaner_cond.wait(l);
      dout(20) << __func__ << " wake" << dendl;
    } else {
      zones_to_clean.swap(zoned_cleaner_queue);
      l.unlock();
      while (!zones_to_clean.empty()) {
        _zoned_clean_zone(zones_to_clean.front());
        zones_to_clean.pop_front();
      }
      l.lock();
    }
  }
  dout(10) << __func__ << " finish" << dendl;
  zoned_cleaner_started = false;
}

void BlueStore::_zoned_clean_zone(uint64_t zone_num) {
  dout(10) << __func__ << " cleaning zone " << zone_num << dendl;
}

bluestore_deferred_op_t *BlueStore::_get_deferred_op(
  TransContext *txc, uint64_t len)
{
  if (!txc->deferred_txn) {
    txc->deferred_txn = new bluestore_deferred_transaction_t;
  }
  txc->deferred_txn->ops.push_back(bluestore_deferred_op_t());
logger->inc(l_bluestore_write_deferred); logger->inc(l_bluestore_write_deferred_bytes, len); return &txc->deferred_txn->ops.back(); } void BlueStore::_deferred_queue(TransContext *txc) { dout(20) << __func__ << " txc " << txc << " osr " << txc->osr << dendl; DeferredBatch *tmp; txc->osr->deferred_lock.lock(); { if (!txc->osr->deferred_pending) { tmp = new DeferredBatch(cct, txc->osr.get()); } else { tmp = txc->osr->deferred_pending; } } tmp->txcs.push_back(*txc); bluestore_deferred_transaction_t& wt = *txc->deferred_txn; for (auto opi = wt.ops.begin(); opi != wt.ops.end(); ++opi) { const auto& op = *opi; ceph_assert(op.op == bluestore_deferred_op_t::OP_WRITE); bufferlist::const_iterator p = op.data.begin(); for (auto e : op.extents) { tmp->prepare_write(cct, wt.seq, e.offset, e.length, p); } } { ++deferred_queue_size; txc->osr->deferred_pending = tmp; // condition "tmp->txcs.size() == 1" mean deferred_pending was originally empty. // So we should add osr into deferred_queue. if (!txc->osr->deferred_running && (tmp->txcs.size() == 1)) { deferred_lock.lock(); deferred_queue.push_back(*txc->osr); deferred_lock.unlock(); } if (deferred_aggressive && !txc->osr->deferred_running) { _deferred_submit_unlock(txc->osr.get()); } else { txc->osr->deferred_lock.unlock(); } } } void BlueStore::deferred_try_submit() { dout(20) << __func__ << " " << deferred_queue.size() << " osrs, " << deferred_queue_size << " txcs" << dendl; vector osrs; { std::lock_guard l(deferred_lock); osrs.reserve(deferred_queue.size()); for (auto& osr : deferred_queue) { osrs.push_back(&osr); } } for (auto& osr : osrs) { osr->deferred_lock.lock(); if (osr->deferred_pending) { if (!osr->deferred_running) { _deferred_submit_unlock(osr.get()); } else { osr->deferred_lock.unlock(); dout(20) << __func__ << " osr " << osr << " already has running" << dendl; } } else { osr->deferred_lock.unlock(); dout(20) << __func__ << " osr " << osr << " has no pending" << dendl; } } { std::lock_guard l(deferred_lock); deferred_last_submitted = ceph_clock_now(); } } void BlueStore::_deferred_submit_unlock(OpSequencer *osr) { dout(10) << __func__ << " osr " << osr << " " << osr->deferred_pending->iomap.size() << " ios pending " << dendl; ceph_assert(osr->deferred_pending); ceph_assert(!osr->deferred_running); auto b = osr->deferred_pending; deferred_queue_size -= b->seq_bytes.size(); ceph_assert(deferred_queue_size >= 0); osr->deferred_running = osr->deferred_pending; osr->deferred_pending = nullptr; osr->deferred_lock.unlock(); for (auto& txc : b->txcs) { throttle.log_state_latency(txc, logger, l_bluestore_state_deferred_queued_lat); } uint64_t start = 0, pos = 0; bufferlist bl; auto i = b->iomap.begin(); while (true) { if (i == b->iomap.end() || i->first != pos) { if (bl.length()) { dout(20) << __func__ << " write 0x" << std::hex << start << "~" << bl.length() << " crc " << bl.crc32c(-1) << std::dec << dendl; if (!g_conf()->bluestore_debug_omit_block_device_write) { logger->inc(l_bluestore_deferred_write_ops); logger->inc(l_bluestore_deferred_write_bytes, bl.length()); int r = bdev->aio_write(start, bl, &b->ioc, false); ceph_assert(r == 0); } } if (i == b->iomap.end()) { break; } start = 0; pos = i->first; bl.clear(); } dout(20) << __func__ << " seq " << i->second.seq << " 0x" << std::hex << pos << "~" << i->second.bl.length() << std::dec << dendl; if (!bl.length()) { start = pos; } pos += i->second.bl.length(); bl.claim_append(i->second.bl); ++i; } bdev->aio_submit(&b->ioc); } struct C_DeferredTrySubmit : public Context { BlueStore *store; 
C_DeferredTrySubmit(BlueStore *s) : store(s) {} void finish(int r) { store->deferred_try_submit(); } }; void BlueStore::_deferred_aio_finish(OpSequencer *osr) { dout(10) << __func__ << " osr " << osr << dendl; ceph_assert(osr->deferred_running); DeferredBatch *b = osr->deferred_running; { osr->deferred_lock.lock(); ceph_assert(osr->deferred_running == b); osr->deferred_running = nullptr; if (!osr->deferred_pending) { dout(20) << __func__ << " dequeueing" << dendl; { deferred_lock.lock(); auto q = deferred_queue.iterator_to(*osr); deferred_queue.erase(q); deferred_lock.unlock(); } osr->deferred_lock.unlock(); } else { osr->deferred_lock.unlock(); if (deferred_aggressive) { dout(20) << __func__ << " queuing async deferred_try_submit" << dendl; finisher.queue(new C_DeferredTrySubmit(this)); } else { dout(20) << __func__ << " leaving queued, more pending" << dendl; } } } { uint64_t costs = 0; { for (auto& i : b->txcs) { TransContext *txc = &i; throttle.log_state_latency(*txc, logger, l_bluestore_state_deferred_aio_wait_lat); txc->set_state(TransContext::STATE_DEFERRED_CLEANUP); costs += txc->cost; } } throttle.release_deferred_throttle(costs); } { std::lock_guard l(kv_lock); deferred_done_queue.emplace_back(b); // in the normal case, do not bother waking up the kv thread; it will // catch us on the next commit anyway. if (deferred_aggressive && !kv_sync_in_progress) { kv_sync_in_progress = true; kv_cond.notify_one(); } } } int BlueStore::_deferred_replay() { dout(10) << __func__ << " start" << dendl; int count = 0; int r = 0; interval_set bluefs_extents; if (bluefs) { bluefs->get_block_extents(bluefs_layout.shared_bdev, &bluefs_extents); } CollectionRef ch = _get_collection(coll_t::meta()); bool fake_ch = false; if (!ch) { // hmm, replaying initial mkfs? 
    ch = static_cast<Collection*>(create_new_collection(coll_t::meta()).get());
    fake_ch = true;
  }
  OpSequencer *osr = static_cast<OpSequencer*>(ch->osr.get());
  KeyValueDB::Iterator it = db->get_iterator(PREFIX_DEFERRED);
  for (it->lower_bound(string()); it->valid(); it->next(), ++count) {
    dout(20) << __func__ << " replay " << pretty_binary_string(it->key())
             << dendl;
    bluestore_deferred_transaction_t *deferred_txn =
      new bluestore_deferred_transaction_t;
    bufferlist bl = it->value();
    auto p = bl.cbegin();
    try {
      decode(*deferred_txn, p);
    } catch (ceph::buffer::error& e) {
      derr << __func__ << " failed to decode deferred txn "
           << pretty_binary_string(it->key()) << dendl;
      delete deferred_txn;
      r = -EIO;
      goto out;
    }
    bool has_some = _eliminate_outdated_deferred(deferred_txn, bluefs_extents);
    if (has_some) {
      TransContext *txc = _txc_create(ch.get(), osr, nullptr);
      txc->deferred_txn = deferred_txn;
      txc->set_state(TransContext::STATE_KV_DONE);
      _txc_state_proc(txc);
    } else {
      delete deferred_txn;
    }
  }
 out:
  dout(20) << __func__ << " draining osr" << dendl;
  _osr_register_zombie(osr);
  _osr_drain_all();
  if (fake_ch) {
    new_coll_map.clear();
  }
  dout(10) << __func__ << " completed " << count << " events" << dendl;
  return r;
}

bool BlueStore::_eliminate_outdated_deferred(
  bluestore_deferred_transaction_t* deferred_txn,
  interval_set<uint64_t>& bluefs_extents)
{
  bool has_some = false;
  dout(30) << __func__ << " bluefs_extents: " << std::hex << bluefs_extents
           << std::dec << dendl;
  auto it = deferred_txn->ops.begin();
  while (it != deferred_txn->ops.end()) {
    // We process a pair of _data_/_extents_ (here: it->data/it->extents)
    // by eliminating the _extents_ that belong to bluefs and removing the
    // corresponding parts of _data_.
    // example:
    // +------------+---------------+---------------+---------------+
    // | data       | aaaaaaaabbbbb | bbbbcccccdddd | ddddeeeeeefff |
    // | extent     | 40000 - 44000 | 50000 - 58000 | 58000 - 60000 |
    // | in bluefs? | no            | yes           | no            |
    // +------------+---------------+---------------+---------------+
    // result:
    // +------------+---------------+---------------+
    // | data       | aaaaaaaabbbbb | ddddeeeeeefff |
    // | extent     | 40000 - 44000 | 58000 - 60000 |
    // +------------+---------------+---------------+
    PExtentVector new_extents;
    ceph::buffer::list new_data;
    uint32_t data_offset = 0; // tracks the location of extent 'e' inside it->data

    dout(30) << __func__ << " input extents: " << it->extents << dendl;
    for (auto& e: it->extents) {
      interval_set<uint64_t> region;
      region.insert(e.offset, e.length);

      auto mi = bluefs_extents.lower_bound(e.offset);
      if (mi != bluefs_extents.begin()) {
        --mi;
        if (mi.get_end() <= e.offset) {
          ++mi;
        }
      }
      while (mi != bluefs_extents.end() && mi.get_start() < e.offset + e.length) {
        // The interval_set does not like it (asserts) when we erase an
        // interval that does not exist, hence we implement (region - mi)
        // as ((region + mi) - mi).
        region.union_insert(mi.get_start(), mi.get_len());
        region.erase(mi.get_start(), mi.get_len());
        ++mi;
      }
      // 'region' is now a subset of e, without the parts used by bluefs.
      // We trim the corresponding parts from it->data (actually constructing
      // new_data / new_extents).
      for (auto ki = region.begin(); ki != region.end(); ki++) {
        ceph::buffer::list chunk;
        // A chunk from it->data; data_offset is the offset where 'e' was
        // located; 'ki.get_start() - e.offset' is the offset of ki inside 'e'.
chunk.substr_of(it->data, data_offset + (ki.get_start() - e.offset), ki.get_len()); new_data.claim_append(chunk); new_extents.emplace_back(bluestore_pextent_t(ki.get_start(), ki.get_len())); } data_offset += e.length; } dout(30) << __func__ << " output extents: " << new_extents << dendl; if (it->data.length() != new_data.length()) { dout(10) << __func__ << " trimmed deferred extents: " << it->extents << "->" << new_extents << dendl; } if (new_extents.size() == 0) { it = deferred_txn->ops.erase(it); } else { has_some = true; std::swap(it->extents, new_extents); std::swap(it->data, new_data); ++it; } } return has_some; } // --------------------------- // transactions int BlueStore::queue_transactions( CollectionHandle& ch, vector& tls, TrackedOpRef op, ThreadPool::TPHandle *handle) { FUNCTRACE(cct); list on_applied, on_commit, on_applied_sync; ObjectStore::Transaction::collect_contexts( tls, &on_applied, &on_commit, &on_applied_sync); auto start = mono_clock::now(); Collection *c = static_cast(ch.get()); OpSequencer *osr = c->osr.get(); dout(10) << __func__ << " ch " << c << " " << c->cid << dendl; // prepare TransContext *txc = _txc_create(static_cast(ch.get()), osr, &on_commit, op); // With HM-SMR drives (and ZNS SSDs) we want the I/O allocation and I/O // submission to happen atomically because if I/O submission happens in a // different order than I/O allocation, we end up issuing non-sequential // writes to the drive. This is a temporary solution until ZONE APPEND // support matures in the kernel. For more information please see: // https://www.usenix.org/conference/vault20/presentation/bjorling if (bdev->is_smr()) { atomic_alloc_and_submit_lock.lock(); } for (vector::iterator p = tls.begin(); p != tls.end(); ++p) { txc->bytes += (*p).get_num_bytes(); _txc_add_transaction(txc, &(*p)); } _txc_calc_cost(txc); _txc_write_nodes(txc, txc->t); // journal deferred items if (txc->deferred_txn) { txc->deferred_txn->seq = ++deferred_seq; bufferlist bl; encode(*txc->deferred_txn, bl); string key; get_deferred_key(txc->deferred_txn->seq, &key); txc->t->set(PREFIX_DEFERRED, key, bl); } _txc_finalize_kv(txc, txc->t); #ifdef WITH_BLKIN if (txc->trace) { txc->trace.event("txc encode finished"); } #endif if (handle) handle->suspend_tp_timeout(); auto tstart = mono_clock::now(); if (!throttle.try_start_transaction( *db, *txc, tstart)) { // ensure we do not block here because of deferred writes dout(10) << __func__ << " failed get throttle_deferred_bytes, aggressive" << dendl; ++deferred_aggressive; deferred_try_submit(); { // wake up any previously finished deferred events std::lock_guard l(kv_lock); if (!kv_sync_in_progress) { kv_sync_in_progress = true; kv_cond.notify_one(); } } throttle.finish_start_transaction(*db, *txc, tstart); --deferred_aggressive; } auto tend = mono_clock::now(); if (handle) handle->reset_tp_timeout(); logger->inc(l_bluestore_txc); // execute (start) _txc_state_proc(txc); if (bdev->is_smr()) { atomic_alloc_and_submit_lock.unlock(); } // we're immediately readable (unlike FileStore) for (auto c : on_applied_sync) { c->complete(0); } if (!on_applied.empty()) { if (c->commit_queue) { c->commit_queue->queue(on_applied); } else { finisher.queue(on_applied); } } #ifdef WITH_BLKIN if (txc->trace) { txc->trace.event("txc applied"); } #endif log_latency("submit_transact", l_bluestore_submit_lat, mono_clock::now() - start, cct->_conf->bluestore_log_op_age); log_latency("throttle_transact", l_bluestore_throttle_lat, tend - tstart, cct->_conf->bluestore_log_op_age); return 0; } void 
BlueStore::_txc_aio_submit(TransContext *txc) { dout(10) << __func__ << " txc " << txc << dendl; bdev->aio_submit(&txc->ioc); } void BlueStore::_txc_add_transaction(TransContext *txc, Transaction *t) { Transaction::iterator i = t->begin(); _dump_transaction<30>(cct, t); vector cvec(i.colls.size()); unsigned j = 0; for (vector::iterator p = i.colls.begin(); p != i.colls.end(); ++p, ++j) { cvec[j] = _get_collection(*p); } vector ovec(i.objects.size()); for (int pos = 0; i.have_op(); ++pos) { Transaction::Op *op = i.decode_op(); int r = 0; // no coll or obj if (op->op == Transaction::OP_NOP) continue; // collection operations CollectionRef &c = cvec[op->cid]; // initialize osd_pool_id and do a smoke test that all collections belong // to the same pool spg_t pgid; if (!!c ? c->cid.is_pg(&pgid) : false) { ceph_assert(txc->osd_pool_id == META_POOL_ID || txc->osd_pool_id == pgid.pool()); txc->osd_pool_id = pgid.pool(); } switch (op->op) { case Transaction::OP_RMCOLL: { const coll_t &cid = i.get_cid(op->cid); r = _remove_collection(txc, cid, &c); if (!r) continue; } break; case Transaction::OP_MKCOLL: { ceph_assert(!c); const coll_t &cid = i.get_cid(op->cid); r = _create_collection(txc, cid, op->split_bits, &c); if (!r) continue; } break; case Transaction::OP_SPLIT_COLLECTION: ceph_abort_msg("deprecated"); break; case Transaction::OP_SPLIT_COLLECTION2: { uint32_t bits = op->split_bits; uint32_t rem = op->split_rem; r = _split_collection(txc, c, cvec[op->dest_cid], bits, rem); if (!r) continue; } break; case Transaction::OP_MERGE_COLLECTION: { uint32_t bits = op->split_bits; r = _merge_collection(txc, &c, cvec[op->dest_cid], bits); if (!r) continue; } break; case Transaction::OP_COLL_HINT: { uint32_t type = op->hint; bufferlist hint; i.decode_bl(hint); auto hiter = hint.cbegin(); if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) { uint32_t pg_num; uint64_t num_objs; decode(pg_num, hiter); decode(num_objs, hiter); dout(10) << __func__ << " collection hint objects is a no-op, " << " pg_num " << pg_num << " num_objects " << num_objs << dendl; } else { // Ignore the hint dout(10) << __func__ << " unknown collection hint " << type << dendl; } continue; } break; case Transaction::OP_COLL_SETATTR: r = -EOPNOTSUPP; break; case Transaction::OP_COLL_RMATTR: r = -EOPNOTSUPP; break; case Transaction::OP_COLL_RENAME: ceph_abort_msg("not implemented"); break; } if (r < 0) { derr << __func__ << " error " << cpp_strerror(r) << " not handled on operation " << op->op << " (op " << pos << ", counting from 0)" << dendl; _dump_transaction<0>(cct, t); ceph_abort_msg("unexpected error"); } // these operations implicity create the object bool create = false; if (op->op == Transaction::OP_TOUCH || op->op == Transaction::OP_CREATE || op->op == Transaction::OP_WRITE || op->op == Transaction::OP_ZERO) { create = true; } // object operations std::unique_lock l(c->lock); OnodeRef &o = ovec[op->oid]; if (!o) { ghobject_t oid = i.get_oid(op->oid); o = c->get_onode(oid, create, op->op == Transaction::OP_CREATE); } if (!create && (!o || !o->exists)) { dout(10) << __func__ << " op " << op->op << " got ENOENT on " << i.get_oid(op->oid) << dendl; r = -ENOENT; goto endop; } switch (op->op) { case Transaction::OP_CREATE: case Transaction::OP_TOUCH: r = _touch(txc, c, o); break; case Transaction::OP_WRITE: { uint64_t off = op->off; uint64_t len = op->len; uint32_t fadvise_flags = i.get_fadvise_flags(); bufferlist bl; i.decode_bl(bl); r = _write(txc, c, o, off, len, bl, fadvise_flags); } break; case Transaction::OP_ZERO: { uint64_t 
off = op->off; uint64_t len = op->len; r = _zero(txc, c, o, off, len); } break; case Transaction::OP_TRIMCACHE: { // deprecated, no-op } break; case Transaction::OP_TRUNCATE: { uint64_t off = op->off; r = _truncate(txc, c, o, off); } break; case Transaction::OP_REMOVE: { r = _remove(txc, c, o); } break; case Transaction::OP_SETATTR: { string name = i.decode_string(); bufferptr bp; i.decode_bp(bp); r = _setattr(txc, c, o, name, bp); } break; case Transaction::OP_SETATTRS: { map aset; i.decode_attrset(aset); r = _setattrs(txc, c, o, aset); } break; case Transaction::OP_RMATTR: { string name = i.decode_string(); r = _rmattr(txc, c, o, name); } break; case Transaction::OP_RMATTRS: { r = _rmattrs(txc, c, o); } break; case Transaction::OP_CLONE: { OnodeRef& no = ovec[op->dest_oid]; if (!no) { const ghobject_t& noid = i.get_oid(op->dest_oid); no = c->get_onode(noid, true); } r = _clone(txc, c, o, no); } break; case Transaction::OP_CLONERANGE: ceph_abort_msg("deprecated"); break; case Transaction::OP_CLONERANGE2: { OnodeRef& no = ovec[op->dest_oid]; if (!no) { const ghobject_t& noid = i.get_oid(op->dest_oid); no = c->get_onode(noid, true); } uint64_t srcoff = op->off; uint64_t len = op->len; uint64_t dstoff = op->dest_off; r = _clone_range(txc, c, o, no, srcoff, len, dstoff); } break; case Transaction::OP_COLL_ADD: ceph_abort_msg("not implemented"); break; case Transaction::OP_COLL_REMOVE: ceph_abort_msg("not implemented"); break; case Transaction::OP_COLL_MOVE: ceph_abort_msg("deprecated"); break; case Transaction::OP_COLL_MOVE_RENAME: case Transaction::OP_TRY_RENAME: { ceph_assert(op->cid == op->dest_cid); const ghobject_t& noid = i.get_oid(op->dest_oid); OnodeRef& no = ovec[op->dest_oid]; if (!no) { no = c->get_onode(noid, false); } r = _rename(txc, c, o, no, noid); } break; case Transaction::OP_OMAP_CLEAR: { r = _omap_clear(txc, c, o); } break; case Transaction::OP_OMAP_SETKEYS: { bufferlist aset_bl; i.decode_attrset_bl(&aset_bl); r = _omap_setkeys(txc, c, o, aset_bl); } break; case Transaction::OP_OMAP_RMKEYS: { bufferlist keys_bl; i.decode_keyset_bl(&keys_bl); r = _omap_rmkeys(txc, c, o, keys_bl); } break; case Transaction::OP_OMAP_RMKEYRANGE: { string first, last; first = i.decode_string(); last = i.decode_string(); r = _omap_rmkey_range(txc, c, o, first, last); } break; case Transaction::OP_OMAP_SETHEADER: { bufferlist bl; i.decode_bl(bl); r = _omap_setheader(txc, c, o, bl); } break; case Transaction::OP_SETALLOCHINT: { r = _set_alloc_hint(txc, c, o, op->expected_object_size, op->expected_write_size, op->hint); } break; default: derr << __func__ << " bad op " << op->op << dendl; ceph_abort(); } endop: if (r < 0) { bool ok = false; if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE || op->op == Transaction::OP_CLONE || op->op == Transaction::OP_CLONERANGE2 || op->op == Transaction::OP_COLL_ADD || op->op == Transaction::OP_SETATTR || op->op == Transaction::OP_SETATTRS || op->op == Transaction::OP_RMATTR || op->op == Transaction::OP_OMAP_SETKEYS || op->op == Transaction::OP_OMAP_RMKEYS || op->op == Transaction::OP_OMAP_RMKEYRANGE || op->op == Transaction::OP_OMAP_SETHEADER)) // -ENOENT is usually okay ok = true; if (r == -ENODATA) ok = true; if (!ok) { const char *msg = "unexpected error code"; if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE || op->op == Transaction::OP_CLONE || op->op == Transaction::OP_CLONERANGE2)) msg = "ENOENT on clone suggests osd bug"; if (r == -ENOSPC) // For now, if we hit _any_ ENOSPC, crash, before we do any damage // by partially applying 
transactions. msg = "ENOSPC from bluestore, misconfigured cluster"; if (r == -ENOTEMPTY) { msg = "ENOTEMPTY suggests garbage data in osd data dir"; } derr << __func__ << " error " << cpp_strerror(r) << " not handled on operation " << op->op << " (op " << pos << ", counting from 0)" << dendl; derr << msg << dendl; _dump_transaction<0>(cct, t); ceph_abort_msg("unexpected error"); } } } } // ----------------- // write operations int BlueStore::_touch(TransContext *txc, CollectionRef& c, OnodeRef &o) { dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl; int r = 0; _assign_nid(txc, o); txc->write_onode(o); dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl; return r; } void BlueStore::_pad_zeros( bufferlist *bl, uint64_t *offset, uint64_t chunk_size) { auto length = bl->length(); dout(30) << __func__ << " 0x" << std::hex << *offset << "~" << length << " chunk_size 0x" << chunk_size << std::dec << dendl; dout(40) << "before:\n"; bl->hexdump(*_dout); *_dout << dendl; // front size_t front_pad = *offset % chunk_size; size_t back_pad = 0; size_t pad_count = 0; if (front_pad) { size_t front_copy = std::min(chunk_size - front_pad, length); bufferptr z = ceph::buffer::create_small_page_aligned(chunk_size); z.zero(0, front_pad, false); pad_count += front_pad; bl->begin().copy(front_copy, z.c_str() + front_pad); if (front_copy + front_pad < chunk_size) { back_pad = chunk_size - (length + front_pad); z.zero(front_pad + length, back_pad, false); pad_count += back_pad; } bufferlist old, t; old.swap(*bl); t.substr_of(old, front_copy, length - front_copy); bl->append(z); bl->claim_append(t); *offset -= front_pad; length += pad_count; } // back uint64_t end = *offset + length; unsigned back_copy = end % chunk_size; if (back_copy) { ceph_assert(back_pad == 0); back_pad = chunk_size - back_copy; ceph_assert(back_copy <= length); bufferptr tail(chunk_size); bl->begin(length - back_copy).copy(back_copy, tail.c_str()); tail.zero(back_copy, back_pad, false); bufferlist old; old.swap(*bl); bl->substr_of(old, 0, length - back_copy); bl->append(tail); length += back_pad; pad_count += back_pad; } dout(20) << __func__ << " pad 0x" << std::hex << front_pad << " + 0x" << back_pad << " on front/back, now 0x" << *offset << "~" << length << std::dec << dendl; dout(40) << "after:\n"; bl->hexdump(*_dout); *_dout << dendl; if (pad_count) logger->inc(l_bluestore_write_pad_bytes, pad_count); ceph_assert(bl->length() == length); } void BlueStore::_do_write_small( TransContext *txc, CollectionRef &c, OnodeRef o, uint64_t offset, uint64_t length, bufferlist::iterator& blp, WriteContext *wctx) { dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length << std::dec << dendl; ceph_assert(length < min_alloc_size); uint64_t end_offs = offset + length; logger->inc(l_bluestore_write_small); logger->inc(l_bluestore_write_small_bytes, length); bufferlist bl; blp.copy(length, bl); auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size); auto min_off = offset >= max_bsize ? offset - max_bsize : 0; uint32_t alloc_len = min_alloc_size; auto offset0 = p2align(offset, alloc_len); bool any_change; // search suitable extent in both forward and reverse direction in // [offset - target_max_blob_size, offset + target_max_blob_size] range // then check if blob can be reused via can_reuse_blob func or apply // direct/deferred write (the latter for extents including or higher // than 'offset' only). 
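  // Illustrative example (hypothetical values): for max_bsize = 0x10000 and
  // offset = 0x11000, min_off is 0x1000, so the call below faults in 0x20000
  // bytes starting at 0x1000, i.e. one max blob on either side of the write.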
o->extent_map.fault_range(db, min_off, offset + max_bsize - min_off); // On zoned devices, the first goal is to support non-overwrite workloads, // such as RGW, with large, aligned objects. Therefore, for user writes // _do_write_small should not trigger. OSDs, however, write and update a tiny // amount of metadata, such as OSD maps, to disk. For those cases, we // temporarily just pad them to min_alloc_size and write them to a new place // on every update. if (bdev->is_smr()) { BlobRef b = c->new_blob(); uint64_t b_off = p2phase(offset, alloc_len); uint64_t b_off0 = b_off; _pad_zeros(&bl, &b_off0, min_alloc_size); o->extent_map.punch_hole(c, offset, length, &wctx->old_extents); wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length, false, true); return; } // Look for an existing mutable blob we can use. auto begin = o->extent_map.extent_map.begin(); auto end = o->extent_map.extent_map.end(); auto ep = o->extent_map.seek_lextent(offset); if (ep != begin) { --ep; if (ep->blob_end() <= offset) { ++ep; } } auto prev_ep = end; if (ep != begin) { prev_ep = ep; --prev_ep; } boost::container::flat_set inspected_blobs; // We don't want to have more blobs than min alloc units fit // into 2 max blobs size_t blob_threshold = max_blob_size / min_alloc_size * 2 + 1; bool above_blob_threshold = false; inspected_blobs.reserve(blob_threshold); uint64_t max_off = 0; auto start_ep = ep; auto end_ep = ep; // exclusively do { any_change = false; if (ep != end && ep->logical_offset < offset + max_bsize) { BlobRef b = ep->blob; if (!above_blob_threshold) { inspected_blobs.insert(&b->get_blob()); above_blob_threshold = inspected_blobs.size() >= blob_threshold; } max_off = ep->logical_end(); auto bstart = ep->blob_start(); dout(20) << __func__ << " considering " << *b << " bstart 0x" << std::hex << bstart << std::dec << dendl; if (bstart >= end_offs) { dout(20) << __func__ << " ignoring distant " << *b << dendl; } else if (!b->get_blob().is_mutable()) { dout(20) << __func__ << " ignoring immutable " << *b << dendl; } else if (ep->logical_offset % min_alloc_size != ep->blob_offset % min_alloc_size) { dout(20) << __func__ << " ignoring offset-skewed " << *b << dendl; } else { uint64_t chunk_size = b->get_blob().get_chunk_size(block_size); // can we pad our head/tail out with zeros? uint64_t head_pad, tail_pad; head_pad = p2phase(offset, chunk_size); tail_pad = p2nphase(end_offs, chunk_size); if (head_pad || tail_pad) { o->extent_map.fault_range(db, offset - head_pad, end_offs - offset + head_pad + tail_pad); } if (head_pad && o->extent_map.has_any_lextents(offset - head_pad, head_pad)) { head_pad = 0; } if (tail_pad && o->extent_map.has_any_lextents(end_offs, tail_pad)) { tail_pad = 0; } uint64_t b_off = offset - head_pad - bstart; uint64_t b_len = length + head_pad + tail_pad; // direct write into unused blocks of an existing mutable blob? if ((b_off % chunk_size == 0 && b_len % chunk_size == 0) && b->get_blob().get_ondisk_length() >= b_off + b_len && b->get_blob().is_unused(b_off, b_len) && b->get_blob().is_allocated(b_off, b_len)) { _apply_padding(head_pad, tail_pad, bl); dout(20) << __func__ << " write to unused 0x" << std::hex << b_off << "~" << b_len << " pad 0x" << head_pad << " + 0x" << tail_pad << std::dec << " of mutable " << *b << dendl; _buffer_cache_write(txc, b, b_off, bl, wctx->buffered ? 
0 : Buffer::FLAG_NOCACHE); if (!g_conf()->bluestore_debug_omit_block_device_write) { if (b_len < prefer_deferred_size) { dout(20) << __func__ << " deferring small 0x" << std::hex << b_len << std::dec << " unused write via deferred" << dendl; bluestore_deferred_op_t *op = _get_deferred_op(txc, bl.length()); op->op = bluestore_deferred_op_t::OP_WRITE; b->get_blob().map( b_off, b_len, [&](uint64_t offset, uint64_t length) { op->extents.emplace_back(bluestore_pextent_t(offset, length)); return 0; }); op->data = bl; } else { b->get_blob().map_bl( b_off, bl, [&](uint64_t offset, bufferlist& t) { bdev->aio_write(offset, t, &txc->ioc, wctx->buffered); }); } } b->dirty_blob().calc_csum(b_off, bl); dout(20) << __func__ << " lex old " << *ep << dendl; Extent *le = o->extent_map.set_lextent(c, offset, b_off + head_pad, length, b, &wctx->old_extents); b->dirty_blob().mark_used(le->blob_offset, le->length); txc->statfs_delta.stored() += le->length; dout(20) << __func__ << " lex " << *le << dendl; logger->inc(l_bluestore_write_small_unused); return; } // read some data to fill out the chunk? uint64_t head_read = p2phase(b_off, chunk_size); uint64_t tail_read = p2nphase(b_off + b_len, chunk_size); if ((head_read || tail_read) && (b->get_blob().get_ondisk_length() >= b_off + b_len + tail_read) && head_read + tail_read < min_alloc_size) { b_off -= head_read; b_len += head_read + tail_read; } else { head_read = tail_read = 0; } // chunk-aligned deferred overwrite? if (b->get_blob().get_ondisk_length() >= b_off + b_len && b_off % chunk_size == 0 && b_len % chunk_size == 0 && b->get_blob().is_allocated(b_off, b_len)) { _apply_padding(head_pad, tail_pad, bl); dout(20) << __func__ << " reading head 0x" << std::hex << head_read << " and tail 0x" << tail_read << std::dec << dendl; if (head_read) { bufferlist head_bl; int r = _do_read(c.get(), o, offset - head_pad - head_read, head_read, head_bl, 0); ceph_assert(r >= 0 && r <= (int)head_read); size_t zlen = head_read - r; if (zlen) { head_bl.append_zero(zlen); logger->inc(l_bluestore_write_pad_bytes, zlen); } head_bl.claim_append(bl); bl.swap(head_bl); logger->inc(l_bluestore_write_penalty_read_ops); } if (tail_read) { bufferlist tail_bl; int r = _do_read(c.get(), o, offset + length + tail_pad, tail_read, tail_bl, 0); ceph_assert(r >= 0 && r <= (int)tail_read); size_t zlen = tail_read - r; if (zlen) { tail_bl.append_zero(zlen); logger->inc(l_bluestore_write_pad_bytes, zlen); } bl.claim_append(tail_bl); logger->inc(l_bluestore_write_penalty_read_ops); } logger->inc(l_bluestore_write_small_pre_read); _buffer_cache_write(txc, b, b_off, bl, wctx->buffered ? 
0 : Buffer::FLAG_NOCACHE); b->dirty_blob().calc_csum(b_off, bl); if (!g_conf()->bluestore_debug_omit_block_device_write) { bluestore_deferred_op_t *op = _get_deferred_op(txc, bl.length()); op->op = bluestore_deferred_op_t::OP_WRITE; int r = b->get_blob().map( b_off, b_len, [&](uint64_t offset, uint64_t length) { op->extents.emplace_back(bluestore_pextent_t(offset, length)); return 0; }); ceph_assert(r == 0); op->data = std::move(bl); dout(20) << __func__ << " deferred write 0x" << std::hex << b_off << "~" << b_len << std::dec << " of mutable " << *b << " at " << op->extents << dendl; } Extent *le = o->extent_map.set_lextent(c, offset, offset - bstart, length, b, &wctx->old_extents); b->dirty_blob().mark_used(le->blob_offset, le->length); txc->statfs_delta.stored() += le->length; dout(20) << __func__ << " lex " << *le << dendl; return; } // try to reuse blob if we can if (b->can_reuse_blob(min_alloc_size, max_bsize, offset0 - bstart, &alloc_len)) { ceph_assert(alloc_len == min_alloc_size); // expecting data always // fit into reused blob // Need to check for pending writes desiring to // reuse the same pextent. The rationale is that during GC two chunks // from garbage blobs(compressed?) can share logical space within the same // AU. That's in turn might be caused by unaligned len in clone_range2. // Hence the second write will fail in an attempt to reuse blob at // do_alloc_write(). if (!wctx->has_conflict(b, offset0, offset0 + alloc_len, min_alloc_size)) { // we can't reuse pad_head/pad_tail since they might be truncated // due to existent extents uint64_t b_off = offset - bstart; uint64_t b_off0 = b_off; _pad_zeros(&bl, &b_off0, chunk_size); dout(20) << __func__ << " reuse blob " << *b << std::hex << " (0x" << b_off0 << "~" << bl.length() << ")" << " (0x" << b_off << "~" << length << ")" << std::dec << dendl; o->extent_map.punch_hole(c, offset, length, &wctx->old_extents); wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length, false, false); logger->inc(l_bluestore_write_small_unused); return; } } } ++ep; end_ep = ep; any_change = true; } // if (ep != end && ep->logical_offset < offset + max_bsize) // check extent for reuse in reverse order if (prev_ep != end && prev_ep->logical_offset >= min_off) { BlobRef b = prev_ep->blob; if (!above_blob_threshold) { inspected_blobs.insert(&b->get_blob()); above_blob_threshold = inspected_blobs.size() >= blob_threshold; } start_ep = prev_ep; auto bstart = prev_ep->blob_start(); dout(20) << __func__ << " considering " << *b << " bstart 0x" << std::hex << bstart << std::dec << dendl; if (b->can_reuse_blob(min_alloc_size, max_bsize, offset0 - bstart, &alloc_len)) { ceph_assert(alloc_len == min_alloc_size); // expecting data always // fit into reused blob // Need to check for pending writes desiring to // reuse the same pextent. The rationale is that during GC two chunks // from garbage blobs(compressed?) can share logical space within the same // AU. That's in turn might be caused by unaligned len in clone_range2. // Hence the second write will fail in an attempt to reuse blob at // do_alloc_write(). 
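        // Note: offset0 is the write offset aligned down to min_alloc_size,
        // so the conflict check below covers the whole allocation unit the
        // reused blob would occupy.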
if (!wctx->has_conflict(b, offset0, offset0 + alloc_len, min_alloc_size)) { uint64_t chunk_size = b->get_blob().get_chunk_size(block_size); uint64_t b_off = offset - bstart; uint64_t b_off0 = b_off; _pad_zeros(&bl, &b_off0, chunk_size); dout(20) << __func__ << " reuse blob " << *b << std::hex << " (0x" << b_off0 << "~" << bl.length() << ")" << " (0x" << b_off << "~" << length << ")" << std::dec << dendl; o->extent_map.punch_hole(c, offset, length, &wctx->old_extents); wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length, false, false); logger->inc(l_bluestore_write_small_unused); return; } } if (prev_ep != begin) { --prev_ep; any_change = true; } else { prev_ep = end; // to avoid useless first extent re-check } } // if (prev_ep != end && prev_ep->logical_offset >= min_off) } while (any_change); if (above_blob_threshold) { dout(10) << __func__ << " request GC, blobs >= " << inspected_blobs.size() << " " << std::hex << min_off << "~" << max_off << std::dec << dendl; ceph_assert(start_ep != end_ep); for (auto ep = start_ep; ep != end_ep; ++ep) { dout(20) << __func__ << " inserting for GC " << std::hex << ep->logical_offset << "~" << ep->length << std::dec << dendl; wctx->extents_to_gc.union_insert(ep->logical_offset, ep->length); } // insert newly written extent to GC wctx->extents_to_gc.union_insert(offset, length); dout(20) << __func__ << " inserting (last) for GC " << std::hex << offset << "~" << length << std::dec << dendl; } // new blob. BlobRef b = c->new_blob(); uint64_t b_off = p2phase(offset, alloc_len); uint64_t b_off0 = b_off; _pad_zeros(&bl, &b_off0, block_size); o->extent_map.punch_hole(c, offset, length, &wctx->old_extents); wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length, min_alloc_size != block_size, // use 'unused' bitmap when alloc granularity // doesn't match disk one only true); return; } bool BlueStore::BigDeferredWriteContext::can_defer( BlueStore::extent_map_t::iterator ep, uint64_t prefer_deferred_size, uint64_t block_size, uint64_t offset, uint64_t l) { bool res = false; auto& blob = ep->blob->get_blob(); if (offset >= ep->blob_start() && blob.is_mutable()) { off = offset; b_off = offset - ep->blob_start(); uint64_t chunk_size = blob.get_chunk_size(block_size); uint64_t ondisk = blob.get_ondisk_length(); used = std::min(l, ondisk - b_off); // will read some data to fill out the chunk? head_read = p2phase(b_off, chunk_size); tail_read = p2nphase(b_off + used, chunk_size); b_off -= head_read; ceph_assert(b_off % chunk_size == 0); ceph_assert(blob_aligned_len() % chunk_size == 0); res = blob_aligned_len() < prefer_deferred_size && blob_aligned_len() <= ondisk && blob.is_allocated(b_off, blob_aligned_len()); if (res) { blob_ref = ep->blob; blob_start = ep->blob_start(); } } return res; } bool BlueStore::BigDeferredWriteContext::apply_defer() { int r = blob_ref->get_blob().map( b_off, blob_aligned_len(), [&](const bluestore_pextent_t& pext, uint64_t offset, uint64_t length) { // apply deferred if overwrite breaks blob continuity only. 
// if it totally overlaps some pextent - fallback to regular write if (pext.offset < offset || pext.end() > offset + length) { res_extents.emplace_back(bluestore_pextent_t(offset, length)); return 0; } return -1; }); return r >= 0; } void BlueStore::_do_write_big_apply_deferred( TransContext* txc, CollectionRef& c, OnodeRef o, BlueStore::BigDeferredWriteContext& dctx, bufferlist::iterator& blp, WriteContext* wctx) { bufferlist bl; dout(20) << __func__ << " reading head 0x" << std::hex << dctx.head_read << " and tail 0x" << dctx.tail_read << std::dec << dendl; if (dctx.head_read) { int r = _do_read(c.get(), o, dctx.off - dctx.head_read, dctx.head_read, bl, 0); ceph_assert(r >= 0 && r <= (int)dctx.head_read); size_t zlen = dctx.head_read - r; if (zlen) { bl.append_zero(zlen); logger->inc(l_bluestore_write_pad_bytes, zlen); } logger->inc(l_bluestore_write_penalty_read_ops); } blp.copy(dctx.used, bl); if (dctx.tail_read) { bufferlist tail_bl; int r = _do_read(c.get(), o, dctx.off + dctx.used, dctx.tail_read, tail_bl, 0); ceph_assert(r >= 0 && r <= (int)dctx.tail_read); size_t zlen = dctx.tail_read - r; if (zlen) { tail_bl.append_zero(zlen); logger->inc(l_bluestore_write_pad_bytes, zlen); } bl.claim_append(tail_bl); logger->inc(l_bluestore_write_penalty_read_ops); } auto& b0 = dctx.blob_ref; _buffer_cache_write(txc, b0, dctx.b_off, bl, wctx->buffered ? 0 : Buffer::FLAG_NOCACHE); b0->dirty_blob().calc_csum(dctx.b_off, bl); Extent* le = o->extent_map.set_lextent(c, dctx.off, dctx.off - dctx.blob_start, dctx.used, b0, &wctx->old_extents); // in fact this is a no-op for big writes but left here to maintain // uniformity and avoid missing after some refactor. b0->dirty_blob().mark_used(le->blob_offset, le->length); txc->statfs_delta.stored() += le->length; if (!g_conf()->bluestore_debug_omit_block_device_write) { bluestore_deferred_op_t* op = _get_deferred_op(txc, bl.length()); op->op = bluestore_deferred_op_t::OP_WRITE; op->extents.swap(dctx.res_extents); op->data = std::move(bl); } } void BlueStore::_do_write_big( TransContext *txc, CollectionRef &c, OnodeRef o, uint64_t offset, uint64_t length, bufferlist::iterator& blp, WriteContext *wctx) { dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length << " target_blob_size 0x" << wctx->target_blob_size << std::dec << " compress " << (int)wctx->compress << dendl; logger->inc(l_bluestore_write_big); logger->inc(l_bluestore_write_big_bytes, length); auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size); uint64_t prefer_deferred_size_snapshot = prefer_deferred_size.load(); while (length > 0) { bool new_blob = false; BlobRef b; uint32_t b_off = 0; uint32_t l = 0; //attempting to reuse existing blob if (!wctx->compress) { // enforce target blob alignment with max_bsize l = max_bsize - p2phase(offset, max_bsize); l = std::min(uint64_t(l), length); auto end = o->extent_map.extent_map.end(); dout(20) << __func__ << " may be defer: 0x" << std::hex << offset << "~" << l << std::dec << dendl; if (prefer_deferred_size_snapshot && l <= prefer_deferred_size_snapshot * 2) { // Single write that spans two adjusted existing blobs can result // in up to two deferred blocks of 'prefer_deferred_size' // So we're trying to minimize the amount of resulting blobs // and preserve 2 blobs rather than inserting one more in between // E.g. 
write 0x10000~20000 over existing blobs // (0x0~20000 and 0x20000~20000) is better (from subsequent reading // performance point of view) to result in two deferred writes to // existing blobs than having 3 blobs: 0x0~10000, 0x10000~20000, 0x30000~10000 // look for an existing mutable blob we can write into auto ep = o->extent_map.seek_lextent(offset); auto ep_next = end; BigDeferredWriteContext head_info, tail_info; bool will_defer = ep != end ? head_info.can_defer(ep, prefer_deferred_size_snapshot, block_size, offset, l) : false; auto offset_next = offset + head_info.used; auto remaining = l - head_info.used; if (will_defer && remaining) { will_defer = false; if (remaining <= prefer_deferred_size_snapshot) { ep_next = o->extent_map.seek_lextent(offset_next); // check if we can defer remaining totally will_defer = ep_next == end ? false : tail_info.can_defer(ep_next, prefer_deferred_size_snapshot, block_size, offset_next, remaining); will_defer = will_defer && remaining == tail_info.used; } } if (will_defer) { dout(20) << __func__ << " " << *(head_info.blob_ref) << " deferring big " << std::hex << " (0x" << head_info.b_off << "~" << head_info.blob_aligned_len() << ")" << std::dec << " write via deferred" << dendl; if (remaining) { dout(20) << __func__ << " " << *(tail_info.blob_ref) << " deferring big " << std::hex << " (0x" << tail_info.b_off << "~" << tail_info.blob_aligned_len() << ")" << std::dec << " write via deferred" << dendl; } will_defer = head_info.apply_defer(); if (!will_defer) { dout(20) << __func__ << " deferring big fell back, head isn't continuous" << dendl; } else if (remaining) { will_defer = tail_info.apply_defer(); if (!will_defer) { dout(20) << __func__ << " deferring big fell back, tail isn't continuous" << dendl; } } } if (will_defer) { _do_write_big_apply_deferred(txc, c, o, head_info, blp, wctx); if (remaining) { _do_write_big_apply_deferred(txc, c, o, tail_info, blp, wctx); } dout(20) << __func__ << " defer big: 0x" << std::hex << offset << "~" << l << std::dec << dendl; offset += l; length -= l; logger->inc(l_bluestore_write_big_blobs, remaining ? 2 : 1); logger->inc(l_bluestore_write_big_deferred, remaining ? 2 : 1); continue; } } dout(20) << __func__ << " lookup for blocks to reuse..." << dendl; o->extent_map.punch_hole(c, offset, l, &wctx->old_extents); // seek again as punch_hole could invalidate ep auto ep = o->extent_map.seek_lextent(offset); auto begin = o->extent_map.extent_map.begin(); auto prev_ep = end; if (ep != begin) { prev_ep = ep; --prev_ep; } auto min_off = offset >= max_bsize ? offset - max_bsize : 0; // search suitable extent in both forward and reverse direction in // [offset - target_max_blob_size, offset + target_max_blob_size] range // then check if blob can be reused via can_reuse_blob func. 
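      // Rough picture of the reuse scan below (illustrative only):
      //
      //   min_off                     offset               offset + max_bsize
      //     |<-- prev_ep walks back     |     ep walks forward -->|
      //
      // Each pass advances at most one cursor; the do/while keeps going until
      // can_reuse_blob() accepts a candidate on either side or both cursors
      // leave the window.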
bool any_change; do { any_change = false; if (ep != end && ep->logical_offset < offset + max_bsize) { dout(20) << __func__ << " considering " << *ep << " bstart 0x" << std::hex << ep->blob_start() << std::dec << dendl; if (offset >= ep->blob_start() && ep->blob->can_reuse_blob(min_alloc_size, max_bsize, offset - ep->blob_start(), &l)) { b = ep->blob; b_off = offset - ep->blob_start(); prev_ep = end; // to avoid check below dout(20) << __func__ << " reuse blob " << *b << std::hex << " (0x" << b_off << "~" << l << ")" << std::dec << dendl; } else { ++ep; any_change = true; } } if (prev_ep != end && prev_ep->logical_offset >= min_off) { dout(20) << __func__ << " considering rev " << *prev_ep << " bstart 0x" << std::hex << prev_ep->blob_start() << std::dec << dendl; if (prev_ep->blob->can_reuse_blob(min_alloc_size, max_bsize, offset - prev_ep->blob_start(), &l)) { b = prev_ep->blob; b_off = offset - prev_ep->blob_start(); dout(20) << __func__ << " reuse blob " << *b << std::hex << " (0x" << b_off << "~" << l << ")" << std::dec << dendl; } else if (prev_ep != begin) { --prev_ep; any_change = true; } else { prev_ep = end; // to avoid useless first extent re-check } } } while (b == nullptr && any_change); } else { // try to utilize as long a chunk as permitted in case of compression. l = std::min(max_bsize, length); o->extent_map.punch_hole(c, offset, l, &wctx->old_extents); } // if (!wctx->compress) if (b == nullptr) { b = c->new_blob(); b_off = 0; new_blob = true; } bufferlist t; blp.copy(l, t); wctx->write(offset, b, l, b_off, t, b_off, l, false, new_blob); dout(20) << __func__ << " schedule write big: 0x" << std::hex << offset << "~" << l << std::dec << (new_blob ? " new " : " reuse ") << *b << dendl; offset += l; length -= l; logger->inc(l_bluestore_write_big_blobs); } } int BlueStore::_do_alloc_write( TransContext *txc, CollectionRef coll, OnodeRef o, WriteContext *wctx) { dout(20) << __func__ << " txc " << txc << " " << wctx->writes.size() << " blobs" << dendl; if (wctx->writes.empty()) { return 0; } CompressorRef c; double crr = 0; if (wctx->compress) { c = select_option( "compression_algorithm", compressor, [&]() { string val; if (coll->pool_opts.get(pool_opts_t::COMPRESSION_ALGORITHM, &val)) { CompressorRef cp = compressor; if (!cp || cp->get_type_name() != val) { cp = Compressor::create(cct, val); if (!cp) { if (_set_compression_alert(false, val.c_str())) { derr << __func__ << " unable to initialize " << val.c_str() << " compressor" << dendl; } } } return boost::optional<CompressorRef>(cp); } return boost::optional<CompressorRef>(); } ); crr = select_option( "compression_required_ratio", cct->_conf->bluestore_compression_required_ratio, [&]() { double val; if (coll->pool_opts.get(pool_opts_t::COMPRESSION_REQUIRED_RATIO, &val)) { return boost::optional<double>(val); } return boost::optional<double>(); } ); } // checksum int64_t csum = csum_type.load(); csum = select_option( "csum_type", csum, [&]() { int64_t val; if (coll->pool_opts.get(pool_opts_t::CSUM_TYPE, &val)) { return boost::optional<int64_t>(val); } return boost::optional<int64_t>(); } ); // compress (as needed) and calc needed space uint64_t need = 0; uint64_t data_size = 0; // 'need' is the amount of space that must be provided by the allocator. // 'data_size' is the size of data that will be transferred to disk. // Note that data_size is always <= need. This comes from: // - write to blob was unaligned, and there is free space // - data has been compressed // // We make one decision and apply it to all blobs. // All blobs will be deferred or none will.
// We assume that allocator does its best to provide contiguous space, // and the condition is : (data_size < deferred). auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size); for (auto& wi : wctx->writes) { if (c && wi.blob_length > min_alloc_size) { auto start = mono_clock::now(); // compress ceph_assert(wi.b_off == 0); ceph_assert(wi.blob_length == wi.bl.length()); // FIXME: memory alignment here is bad bufferlist t; boost::optional compressor_message; int r = c->compress(wi.bl, t, compressor_message); uint64_t want_len_raw = wi.blob_length * crr; uint64_t want_len = p2roundup(want_len_raw, min_alloc_size); bool rejected = false; uint64_t compressed_len = t.length(); // do an approximate (fast) estimation for resulting blob size // that doesn't take header overhead into account uint64_t result_len = p2roundup(compressed_len, min_alloc_size); if (r == 0 && result_len <= want_len && result_len < wi.blob_length) { bluestore_compression_header_t chdr; chdr.type = c->get_type(); chdr.length = t.length(); chdr.compressor_message = compressor_message; encode(chdr, wi.compressed_bl); wi.compressed_bl.claim_append(t); compressed_len = wi.compressed_bl.length(); result_len = p2roundup(compressed_len, min_alloc_size); if (result_len <= want_len && result_len < wi.blob_length) { // Cool. We compressed at least as much as we were hoping to. // pad out to min_alloc_size wi.compressed_bl.append_zero(result_len - compressed_len); wi.compressed_len = compressed_len; wi.compressed = true; logger->inc(l_bluestore_write_pad_bytes, result_len - compressed_len); dout(20) << __func__ << std::hex << " compressed 0x" << wi.blob_length << " -> 0x" << compressed_len << " => 0x" << result_len << " with " << c->get_type() << std::dec << dendl; txc->statfs_delta.compressed() += compressed_len; txc->statfs_delta.compressed_original() += wi.blob_length; txc->statfs_delta.compressed_allocated() += result_len; logger->inc(l_bluestore_compress_success_count); need += result_len; data_size += result_len; } else { rejected = true; } } else if (r != 0) { dout(5) << __func__ << std::hex << " 0x" << wi.blob_length << " bytes compressed using " << c->get_type_name() << std::dec << " failed with errcode = " << r << ", leaving uncompressed" << dendl; logger->inc(l_bluestore_compress_rejected_count); need += wi.blob_length; data_size += wi.bl.length(); } else { rejected = true; } if (rejected) { dout(20) << __func__ << std::hex << " 0x" << wi.blob_length << " compressed to 0x" << compressed_len << " -> 0x" << result_len << " with " << c->get_type() << ", which is more than required 0x" << want_len_raw << " -> 0x" << want_len << ", leaving uncompressed" << std::dec << dendl; logger->inc(l_bluestore_compress_rejected_count); need += wi.blob_length; data_size += wi.bl.length(); } log_latency("compress@_do_alloc_write", l_bluestore_compress_lat, mono_clock::now() - start, cct->_conf->bluestore_log_op_age ); } else { need += wi.blob_length; data_size += wi.bl.length(); } } PExtentVector prealloc; prealloc.reserve(2 * wctx->writes.size()); int64_t prealloc_left = 0; prealloc_left = shared_alloc.a->allocate( need, min_alloc_size, need, 0, &prealloc); if (prealloc_left < 0 || prealloc_left < (int64_t)need) { derr << __func__ << " failed to allocate 0x" << std::hex << need << " allocated 0x " << (prealloc_left < 0 ? 
0 : prealloc_left) << " min_alloc_size 0x" << min_alloc_size << " available 0x " << shared_alloc.a->get_free() << std::dec << dendl; if (prealloc.size()) { shared_alloc.a->release(prealloc); } return -ENOSPC; } _collect_allocation_stats(need, min_alloc_size, prealloc.size()); if (bdev->is_smr()) { std::deque<uint64_t> zones_to_clean; if (shared_alloc.a->zoned_get_zones_to_clean(&zones_to_clean)) { std::lock_guard l{zoned_cleaner_lock}; zoned_cleaner_queue.swap(zones_to_clean); zoned_cleaner_cond.notify_one(); } } dout(20) << __func__ << std::hex << " need=0x" << need << " data=0x" << data_size << " prealloc " << prealloc << dendl; auto prealloc_pos = prealloc.begin(); ceph_assert(prealloc_pos != prealloc.end()); for (auto& wi : wctx->writes) { bluestore_blob_t& dblob = wi.b->dirty_blob(); uint64_t b_off = wi.b_off; bufferlist *l = &wi.bl; uint64_t final_length = wi.blob_length; uint64_t csum_length = wi.blob_length; if (wi.compressed) { final_length = wi.compressed_bl.length(); csum_length = final_length; unsigned csum_order = ctz(csum_length); l = &wi.compressed_bl; dblob.set_compressed(wi.blob_length, wi.compressed_len); if (csum != Checksummer::CSUM_NONE) { dout(20) << __func__ << " initialize csum setting for compressed blob " << *wi.b << " csum_type " << Checksummer::get_csum_type_string(csum) << " csum_order " << csum_order << " csum_length 0x" << std::hex << csum_length << " blob_length 0x" << wi.blob_length << " compressed_length 0x" << wi.compressed_len << std::dec << dendl; dblob.init_csum(csum, csum_order, csum_length); } } else if (wi.new_blob) { unsigned csum_order; // initialize newly created blob only ceph_assert(dblob.is_mutable()); if (l->length() != wi.blob_length) { // hrm, maybe we could do better here, but let's not bother. dout(20) << __func__ << " forcing csum_order to block_size_order " << block_size_order << dendl; csum_order = block_size_order; } else { csum_order = std::min(wctx->csum_order, ctz(l->length())); } // try to align blob with max_blob_size to improve // its reuse ratio, e.g.
in case of reverse write uint32_t suggested_boff = (wi.logical_offset - (wi.b_off0 - wi.b_off)) % max_bsize; if ((suggested_boff % (1 << csum_order)) == 0 && suggested_boff + final_length <= max_bsize && suggested_boff > b_off) { dout(20) << __func__ << " forcing blob_offset to 0x" << std::hex << suggested_boff << std::dec << dendl; ceph_assert(suggested_boff >= b_off); csum_length += suggested_boff - b_off; b_off = suggested_boff; } if (csum != Checksummer::CSUM_NONE) { dout(20) << __func__ << " initialize csum setting for new blob " << *wi.b << " csum_type " << Checksummer::get_csum_type_string(csum) << " csum_order " << csum_order << " csum_length 0x" << std::hex << csum_length << std::dec << dendl; dblob.init_csum(csum, csum_order, csum_length); } } PExtentVector extents; int64_t left = final_length; auto prefer_deferred_size_snapshot = prefer_deferred_size.load(); while (left > 0) { ceph_assert(prealloc_left > 0); if (prealloc_pos->length <= left) { prealloc_left -= prealloc_pos->length; left -= prealloc_pos->length; txc->statfs_delta.allocated() += prealloc_pos->length; extents.push_back(*prealloc_pos); ++prealloc_pos; } else { extents.emplace_back(prealloc_pos->offset, left); prealloc_pos->offset += left; prealloc_pos->length -= left; prealloc_left -= left; txc->statfs_delta.allocated() += left; left = 0; break; } } for (auto& p : extents) { txc->allocated.insert(p.offset, p.length); } dblob.allocated(p2align(b_off, min_alloc_size), final_length, extents); dout(20) << __func__ << " blob " << *wi.b << dendl; if (dblob.has_csum()) { dblob.calc_csum(b_off, *l); } if (wi.mark_unused) { ceph_assert(!dblob.is_compressed()); auto b_end = b_off + wi.bl.length(); if (b_off) { dblob.add_unused(0, b_off); } uint64_t llen = dblob.get_logical_length(); if (b_end < llen) { dblob.add_unused(b_end, llen - b_end); } } Extent *le = o->extent_map.set_lextent(coll, wi.logical_offset, b_off + (wi.b_off0 - wi.b_off), wi.length0, wi.b, nullptr); wi.b->dirty_blob().mark_used(le->blob_offset, le->length); txc->statfs_delta.stored() += le->length; dout(20) << __func__ << " lex " << *le << dendl; _buffer_cache_write(txc, wi.b, b_off, wi.bl, wctx->buffered ? 
0 : Buffer::FLAG_NOCACHE); // queue io if (!g_conf()->bluestore_debug_omit_block_device_write) { if (data_size < prefer_deferred_size_snapshot) { dout(20) << __func__ << " deferring 0x" << std::hex << l->length() << std::dec << " write via deferred" << dendl; bluestore_deferred_op_t *op = _get_deferred_op(txc, l->length()); op->op = bluestore_deferred_op_t::OP_WRITE; int r = wi.b->get_blob().map( b_off, l->length(), [&](uint64_t offset, uint64_t length) { op->extents.emplace_back(bluestore_pextent_t(offset, length)); return 0; }); ceph_assert(r == 0); op->data = *l; } else { wi.b->get_blob().map_bl( b_off, *l, [&](uint64_t offset, bufferlist& t) { bdev->aio_write(offset, t, &txc->ioc, false); }); logger->inc(l_bluestore_write_new); } } } ceph_assert(prealloc_pos == prealloc.end()); ceph_assert(prealloc_left == 0); return 0; } void BlueStore::_wctx_finish( TransContext *txc, CollectionRef& c, OnodeRef o, WriteContext *wctx, set<SharedBlob*> *maybe_unshared_blobs) { auto oep = wctx->old_extents.begin(); while (oep != wctx->old_extents.end()) { auto &lo = *oep; oep = wctx->old_extents.erase(oep); dout(20) << __func__ << " lex_old " << lo.e << dendl; BlobRef b = lo.e.blob; const bluestore_blob_t& blob = b->get_blob(); if (blob.is_compressed()) { if (lo.blob_empty) { txc->statfs_delta.compressed() -= blob.get_compressed_payload_length(); } txc->statfs_delta.compressed_original() -= lo.e.length; } auto& r = lo.r; txc->statfs_delta.stored() -= lo.e.length; if (!r.empty()) { dout(20) << __func__ << " blob " << *b << " release " << r << dendl; if (blob.is_shared()) { PExtentVector final; c->load_shared_blob(b->shared_blob); bool unshare = false; bool* unshare_ptr = !maybe_unshared_blobs || b->is_referenced() ? nullptr : &unshare; for (auto e : r) { b->shared_blob->put_ref( e.offset, e.length, &final, unshare_ptr); } if (unshare) { ceph_assert(maybe_unshared_blobs); maybe_unshared_blobs->insert(b->shared_blob.get()); } dout(20) << __func__ << " shared_blob release " << final << " from " << *b->shared_blob << dendl; txc->write_shared_blob(b->shared_blob); r.clear(); r.swap(final); } } // we can't invalidate our logical extents as we drop them because // other lextents (either in our onode or others) may still // reference them. but we can throw out anything that is no // longer allocated. Note that this will leave behind edge bits // that are no longer referenced but not deallocated (until they // age out of the cache naturally).
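    // What follows, summarized (informal): discard_unallocated() drops cached
    // buffers for regions this blob no longer backs, each extent in 'r' is
    // queued in txc->released so the allocator reclaims it at commit, and the
    // allocated/compressed_allocated statfs counters are decremented to match.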
b->discard_unallocated(c.get()); for (auto e : r) { dout(20) << __func__ << " release " << e << dendl; txc->released.insert(e.offset, e.length); txc->statfs_delta.allocated() -= e.length; if (blob.is_compressed()) { txc->statfs_delta.compressed_allocated() -= e.length; } } if (b->is_spanning() && !b->is_referenced() && lo.blob_empty) { dout(20) << __func__ << " spanning_blob_map removing empty " << *b << dendl; o->extent_map.spanning_blob_map.erase(b->id); } delete &lo; } } void BlueStore::_do_write_data( TransContext *txc, CollectionRef& c, OnodeRef o, uint64_t offset, uint64_t length, bufferlist& bl, WriteContext *wctx) { uint64_t end = offset + length; bufferlist::iterator p = bl.begin(); if (offset / min_alloc_size == (end - 1) / min_alloc_size && (length != min_alloc_size)) { // we fall within the same block _do_write_small(txc, c, o, offset, length, p, wctx); } else { uint64_t head_offset, head_length; uint64_t middle_offset, middle_length; uint64_t tail_offset, tail_length; head_offset = offset; head_length = p2nphase(offset, min_alloc_size); tail_offset = p2align(end, min_alloc_size); tail_length = p2phase(end, min_alloc_size); middle_offset = head_offset + head_length; middle_length = length - head_length - tail_length; if (head_length) { _do_write_small(txc, c, o, head_offset, head_length, p, wctx); } _do_write_big(txc, c, o, middle_offset, middle_length, p, wctx); if (tail_length) { _do_write_small(txc, c, o, tail_offset, tail_length, p, wctx); } } } void BlueStore::_choose_write_options( CollectionRef& c, OnodeRef o, uint32_t fadvise_flags, WriteContext *wctx) { if (fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) { dout(20) << __func__ << " will do buffered write" << dendl; wctx->buffered = true; } else if (cct->_conf->bluestore_default_buffered_write && (fadvise_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED | CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) { dout(20) << __func__ << " defaulting to buffered write" << dendl; wctx->buffered = true; } // apply basic csum block size wctx->csum_order = block_size_order; // compression parameters unsigned alloc_hints = o->onode.alloc_hint_flags; auto cm = select_option( "compression_mode", comp_mode.load(), [&]() { string val; if (c->pool_opts.get(pool_opts_t::COMPRESSION_MODE, &val)) { return boost::optional<Compressor::CompressionMode>( Compressor::get_comp_mode_type(val)); } return boost::optional<Compressor::CompressionMode>(); } ); wctx->compress = (cm != Compressor::COMP_NONE) && ((cm == Compressor::COMP_FORCE) || (cm == Compressor::COMP_AGGRESSIVE && (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE) == 0) || (cm == Compressor::COMP_PASSIVE && (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE))); if ((alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ) && (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ) == 0 && (alloc_hints & (CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE | CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY)) && (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE) == 0) { dout(20) << __func__ << " will prefer large blob and csum sizes" << dendl; if (o->onode.expected_write_size) { wctx->csum_order = std::max(min_alloc_size_order, (uint8_t)ctz(o->onode.expected_write_size)); } else { wctx->csum_order = min_alloc_size_order; } if (wctx->compress) { wctx->target_blob_size = select_option( "compression_max_blob_size", comp_max_blob_size.load(), [&]() { int64_t val; if (c->pool_opts.get(pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, &val)) { return boost::optional<uint64_t>((uint64_t)val); } return boost::optional<uint64_t>(); } ); } } else { if (wctx->compress) { wctx->target_blob_size = select_option(
"compression_min_blob_size", comp_min_blob_size.load(), [&]() { int64_t val; if (c->pool_opts.get(pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, &val)) { return boost::optional((uint64_t)val); } return boost::optional(); } ); } } uint64_t max_bsize = max_blob_size.load(); if (wctx->target_blob_size == 0 || wctx->target_blob_size > max_bsize) { wctx->target_blob_size = max_bsize; } // set the min blob size floor at 2x the min_alloc_size, or else we // won't be able to allocate a smaller extent for the compressed // data. if (wctx->compress && wctx->target_blob_size < min_alloc_size * 2) { wctx->target_blob_size = min_alloc_size * 2; } dout(20) << __func__ << " prefer csum_order " << wctx->csum_order << " target_blob_size 0x" << std::hex << wctx->target_blob_size << " compress=" << (int)wctx->compress << " buffered=" << (int)wctx->buffered << std::dec << dendl; } int BlueStore::_do_gc( TransContext *txc, CollectionRef& c, OnodeRef o, const WriteContext& wctx, uint64_t *dirty_start, uint64_t *dirty_end) { bool dirty_range_updated = false; WriteContext wctx_gc; wctx_gc.fork(wctx); // make a clone for garbage collection auto & extents_to_collect = wctx.extents_to_gc; for (auto it = extents_to_collect.begin(); it != extents_to_collect.end(); ++it) { bufferlist bl; auto offset = (*it).first; auto length = (*it).second; dout(20) << __func__ << " processing " << std::hex << offset << "~" << length << std::dec << dendl; int r = _do_read(c.get(), o, offset, length, bl, 0); ceph_assert(r == (int)length); _do_write_data(txc, c, o, offset, length, bl, &wctx_gc); logger->inc(l_bluestore_gc_merged, length); if (*dirty_start > offset) { *dirty_start = offset; dirty_range_updated = true; } if (*dirty_end < offset + length) { *dirty_end = offset + length; dirty_range_updated = true; } } if (dirty_range_updated) { o->extent_map.fault_range(db, *dirty_start, *dirty_end); } dout(30) << __func__ << " alloc write" << dendl; int r = _do_alloc_write(txc, c, o, &wctx_gc); if (r < 0) { derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r) << dendl; return r; } _wctx_finish(txc, c, o, &wctx_gc); return 0; } int BlueStore::_do_write( TransContext *txc, CollectionRef& c, OnodeRef o, uint64_t offset, uint64_t length, bufferlist& bl, uint32_t fadvise_flags) { int r = 0; dout(20) << __func__ << " " << o->oid << " 0x" << std::hex << offset << "~" << length << " - have 0x" << o->onode.size << " (" << std::dec << o->onode.size << ")" << " bytes" << std::hex << " fadvise_flags 0x" << fadvise_flags << " alloc_hint 0x" << o->onode.alloc_hint_flags << " expected_object_size " << o->onode.expected_object_size << " expected_write_size " << o->onode.expected_write_size << std::dec << dendl; _dump_onode<30>(cct, *o); if (length == 0) { return 0; } uint64_t end = offset + length; GarbageCollector gc(c->store->cct); int64_t benefit = 0; auto dirty_start = offset; auto dirty_end = end; WriteContext wctx; _choose_write_options(c, o, fadvise_flags, &wctx); o->extent_map.fault_range(db, offset, length); _do_write_data(txc, c, o, offset, length, bl, &wctx); r = _do_alloc_write(txc, c, o, &wctx); if (r < 0) { derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r) << dendl; goto out; } if (wctx.extents_to_gc.empty() || wctx.extents_to_gc.range_start() > offset || wctx.extents_to_gc.range_end() < offset + length) { benefit = gc.estimate(offset, length, o->extent_map, wctx.old_extents, min_alloc_size); } if (bdev->is_smr()) { if (wctx.old_extents.empty()) { txc->zoned_note_new_object(o); } else { int64_t 
old_ondisk_offset = wctx.old_extents.begin()->r.begin()->offset; txc->zoned_note_updated_object(o, old_ondisk_offset); } } // NB: _wctx_finish() will empty old_extents // so we must do gc estimation before that _wctx_finish(txc, c, o, &wctx); if (end > o->onode.size) { dout(20) << __func__ << " extending size to 0x" << std::hex << end << std::dec << dendl; o->onode.size = end; } if (benefit >= g_conf()->bluestore_gc_enable_total_threshold) { wctx.extents_to_gc.union_of(gc.get_extents_to_collect()); dout(20) << __func__ << " perform garbage collection for compressed extents, " << "expected benefit = " << benefit << " AUs" << dendl; } if (!wctx.extents_to_gc.empty()) { dout(20) << __func__ << " perform garbage collection" << dendl; r = _do_gc(txc, c, o, wctx, &dirty_start, &dirty_end); if (r < 0) { derr << __func__ << " _do_gc failed with " << cpp_strerror(r) << dendl; goto out; } dout(20)<<__func__<<" gc range is " << std::hex << dirty_start << "~" << dirty_end - dirty_start << std::dec << dendl; } o->extent_map.compress_extent_map(dirty_start, dirty_end - dirty_start); o->extent_map.dirty_range(dirty_start, dirty_end - dirty_start); r = 0; out: return r; } int BlueStore::_write(TransContext *txc, CollectionRef& c, OnodeRef& o, uint64_t offset, size_t length, bufferlist& bl, uint32_t fadvise_flags) { dout(15) << __func__ << " " << c->cid << " " << o->oid << " 0x" << std::hex << offset << "~" << length << std::dec << dendl; int r = 0; if (offset + length >= OBJECT_MAX_SIZE) { r = -E2BIG; } else { _assign_nid(txc, o); r = _do_write(txc, c, o, offset, length, bl, fadvise_flags); txc->write_onode(o); } dout(10) << __func__ << " " << c->cid << " " << o->oid << " 0x" << std::hex << offset << "~" << length << std::dec << " = " << r << dendl; return r; } int BlueStore::_zero(TransContext *txc, CollectionRef& c, OnodeRef& o, uint64_t offset, size_t length) { dout(15) << __func__ << " " << c->cid << " " << o->oid << " 0x" << std::hex << offset << "~" << length << std::dec << dendl; int r = 0; if (offset + length >= OBJECT_MAX_SIZE) { r = -E2BIG; } else { _assign_nid(txc, o); r = _do_zero(txc, c, o, offset, length); } dout(10) << __func__ << " " << c->cid << " " << o->oid << " 0x" << std::hex << offset << "~" << length << std::dec << " = " << r << dendl; return r; } int BlueStore::_do_zero(TransContext *txc, CollectionRef& c, OnodeRef& o, uint64_t offset, size_t length) { dout(15) << __func__ << " " << c->cid << " " << o->oid << " 0x" << std::hex << offset << "~" << length << std::dec << dendl; int r = 0; _dump_onode<30>(cct, *o); WriteContext wctx; o->extent_map.fault_range(db, offset, length); o->extent_map.punch_hole(c, offset, length, &wctx.old_extents); o->extent_map.dirty_range(offset, length); _wctx_finish(txc, c, o, &wctx); if (length > 0 && offset + length > o->onode.size) { o->onode.size = offset + length; dout(20) << __func__ << " extending size to " << offset + length << dendl; } txc->write_onode(o); dout(10) << __func__ << " " << c->cid << " " << o->oid << " 0x" << std::hex << offset << "~" << length << std::dec << " = " << r << dendl; return r; } void BlueStore::_do_truncate( TransContext *txc, CollectionRef& c, OnodeRef o, uint64_t offset, set *maybe_unshared_blobs) { dout(15) << __func__ << " " << c->cid << " " << o->oid << " 0x" << std::hex << offset << std::dec << dendl; _dump_onode<30>(cct, *o); if (offset == o->onode.size) return; WriteContext wctx; if (offset < o->onode.size) { uint64_t length = o->onode.size - offset; o->extent_map.fault_range(db, offset, length); 
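    // Sketch of the remaining truncate path (informal): punch_hole() detaches
    // the lextents beyond 'offset' into wctx.old_extents, the affected range is
    // marked dirty, and _wctx_finish() releases the physical space they held,
    // optionally collecting shared blobs that may no longer need to be shared.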
o->extent_map.punch_hole(c, offset, length, &wctx.old_extents); o->extent_map.dirty_range(offset, length); _wctx_finish(txc, c, o, &wctx, maybe_unshared_blobs); // if we have shards past EOF, ask for a reshard if (!o->onode.extent_map_shards.empty() && o->onode.extent_map_shards.back().offset >= offset) { dout(10) << __func__ << " request reshard past EOF" << dendl; if (offset) { o->extent_map.request_reshard(offset - 1, offset + length); } else { o->extent_map.request_reshard(0, length); } } } o->onode.size = offset; if (bdev->is_smr()) { // On zoned devices, we currently support only removing an object or // truncating it to zero size, both of which fall through this code path. ceph_assert(offset == 0 && !wctx.old_extents.empty()); int64_t ondisk_offset = wctx.old_extents.begin()->r.begin()->offset; txc->zoned_note_truncated_object(o, ondisk_offset); } txc->write_onode(o); } int BlueStore::_truncate(TransContext *txc, CollectionRef& c, OnodeRef& o, uint64_t offset) { dout(15) << __func__ << " " << c->cid << " " << o->oid << " 0x" << std::hex << offset << std::dec << dendl; int r = 0; if (offset >= OBJECT_MAX_SIZE) { r = -E2BIG; } else { _do_truncate(txc, c, o, offset); } dout(10) << __func__ << " " << c->cid << " " << o->oid << " 0x" << std::hex << offset << std::dec << " = " << r << dendl; return r; } int BlueStore::_do_remove( TransContext *txc, CollectionRef& c, OnodeRef o) { set<SharedBlob*> maybe_unshared_blobs; bool is_gen = !o->oid.is_no_gen(); _do_truncate(txc, c, o, 0, is_gen ? &maybe_unshared_blobs : nullptr); if (o->onode.has_omap()) { o->flush(); _do_omap_clear(txc, o); } o->exists = false; string key; for (auto &s : o->extent_map.shards) { dout(20) << __func__ << " removing shard 0x" << std::hex << s.shard_info->offset << std::dec << dendl; generate_extent_shard_key_and_apply(o->key, s.shard_info->offset, &key, [&](const string& final_key) { txc->t->rmkey(PREFIX_OBJ, final_key); } ); } txc->t->rmkey(PREFIX_OBJ, o->key.c_str(), o->key.size()); txc->note_removed_object(o); o->extent_map.clear(); o->onode = bluestore_onode_t(); _debug_obj_on_delete(o->oid); if (!is_gen || maybe_unshared_blobs.empty()) { return 0; } // see if we can unshare blobs still referenced by the head dout(10) << __func__ << " gen and maybe_unshared_blobs " << maybe_unshared_blobs << dendl; ghobject_t nogen = o->oid; nogen.generation = ghobject_t::NO_GEN; OnodeRef h = c->get_onode(nogen, false); if (!h || !h->exists) { return 0; } dout(20) << __func__ << " checking for unshareable blobs on " << h << " " << h->oid << dendl; map<SharedBlob*, bluestore_extent_ref_map_t> expect; for (auto& e : h->extent_map.extent_map) { const bluestore_blob_t& b = e.blob->get_blob(); SharedBlob *sb = e.blob->shared_blob.get(); if (b.is_shared() && sb->loaded && maybe_unshared_blobs.count(sb)) { if (b.is_compressed()) { expect[sb].get(0, b.get_ondisk_length()); } else { b.map(e.blob_offset, e.length, [&](uint64_t off, uint64_t len) { expect[sb].get(off, len); return 0; }); } } } vector<SharedBlob*> unshared_blobs; unshared_blobs.reserve(maybe_unshared_blobs.size()); for (auto& p : expect) { dout(20) << " ? 
" << *p.first << " vs " << p.second << dendl; if (p.first->persistent->ref_map == p.second) { SharedBlob *sb = p.first; dout(20) << __func__ << " unsharing " << *sb << dendl; unshared_blobs.push_back(sb); txc->unshare_blob(sb); uint64_t sbid = c->make_blob_unshared(sb); string key; get_shared_blob_key(sbid, &key); txc->t->rmkey(PREFIX_SHARED_BLOB, key); } } if (unshared_blobs.empty()) { return 0; } for (auto& e : h->extent_map.extent_map) { const bluestore_blob_t& b = e.blob->get_blob(); SharedBlob *sb = e.blob->shared_blob.get(); if (b.is_shared() && std::find(unshared_blobs.begin(), unshared_blobs.end(), sb) != unshared_blobs.end()) { dout(20) << __func__ << " unsharing " << e << dendl; bluestore_blob_t& blob = e.blob->dirty_blob(); blob.clear_flag(bluestore_blob_t::FLAG_SHARED); h->extent_map.dirty_range(e.logical_offset, 1); } } txc->write_onode(h); return 0; } int BlueStore::_remove(TransContext *txc, CollectionRef& c, OnodeRef &o) { dout(15) << __func__ << " " << c->cid << " " << o->oid << " onode " << o.get() << " txc "<< txc << dendl; auto start_time = mono_clock::now(); int r = _do_remove(txc, c, o); log_latency_fn( __func__, l_bluestore_remove_lat, mono_clock::now() - start_time, cct->_conf->bluestore_log_op_age, [&](const ceph::timespan& lat) { ostringstream ostr; ostr << ", lat = " << timespan_str(lat) << " cid =" << c->cid << " oid =" << o->oid; return ostr.str(); } ); dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl; return r; } int BlueStore::_setattr(TransContext *txc, CollectionRef& c, OnodeRef& o, const string& name, bufferptr& val) { dout(15) << __func__ << " " << c->cid << " " << o->oid << " " << name << " (" << val.length() << " bytes)" << dendl; int r = 0; if (val.is_partial()) { auto& b = o->onode.attrs[name.c_str()] = bufferptr(val.c_str(), val.length()); b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta); } else { auto& b = o->onode.attrs[name.c_str()] = val; b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta); } txc->write_onode(o); dout(10) << __func__ << " " << c->cid << " " << o->oid << " " << name << " (" << val.length() << " bytes)" << " = " << r << dendl; return r; } int BlueStore::_setattrs(TransContext *txc, CollectionRef& c, OnodeRef& o, const map& aset) { dout(15) << __func__ << " " << c->cid << " " << o->oid << " " << aset.size() << " keys" << dendl; int r = 0; for (map::const_iterator p = aset.begin(); p != aset.end(); ++p) { if (p->second.is_partial()) { auto& b = o->onode.attrs[p->first.c_str()] = bufferptr(p->second.c_str(), p->second.length()); b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta); } else { auto& b = o->onode.attrs[p->first.c_str()] = p->second; b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta); } } txc->write_onode(o); dout(10) << __func__ << " " << c->cid << " " << o->oid << " " << aset.size() << " keys" << " = " << r << dendl; return r; } int BlueStore::_rmattr(TransContext *txc, CollectionRef& c, OnodeRef& o, const string& name) { dout(15) << __func__ << " " << c->cid << " " << o->oid << " " << name << dendl; int r = 0; auto it = o->onode.attrs.find(name.c_str()); if (it == o->onode.attrs.end()) goto out; o->onode.attrs.erase(it); txc->write_onode(o); out: dout(10) << __func__ << " " << c->cid << " " << o->oid << " " << name << " = " << r << dendl; return r; } int BlueStore::_rmattrs(TransContext *txc, CollectionRef& c, OnodeRef& o) { dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl; int r = 0; if (o->onode.attrs.empty()) goto out; 
o->onode.attrs.clear(); txc->write_onode(o); out: dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl; return r; } void BlueStore::_do_omap_clear(TransContext *txc, OnodeRef& o) { const string& omap_prefix = o->get_omap_prefix(); string prefix, tail; o->get_omap_header(&prefix); o->get_omap_tail(&tail); txc->t->rm_range_keys(omap_prefix, prefix, tail); txc->t->rmkey(omap_prefix, tail); dout(20) << __func__ << " remove range start: " << pretty_binary_string(prefix) << " end: " << pretty_binary_string(tail) << dendl; } int BlueStore::_omap_clear(TransContext *txc, CollectionRef& c, OnodeRef& o) { dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl; int r = 0; if (o->onode.has_omap()) { o->flush(); _do_omap_clear(txc, o); o->onode.clear_omap_flag(); txc->write_onode(o); } dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl; return r; } int BlueStore::_omap_setkeys(TransContext *txc, CollectionRef& c, OnodeRef& o, bufferlist &bl) { dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl; int r; auto p = bl.cbegin(); __u32 num; if (!o->onode.has_omap()) { if (o->oid.is_pgmeta()) { o->onode.set_omap_flags_pgmeta(); } else { o->onode.set_omap_flags(per_pool_omap == OMAP_BULK); } txc->write_onode(o); const string& prefix = o->get_omap_prefix(); string key_tail; bufferlist tail; o->get_omap_tail(&key_tail); txc->t->set(prefix, key_tail, tail); } else { txc->note_modified_object(o); } const string& prefix = o->get_omap_prefix(); string final_key; o->get_omap_key(string(), &final_key); size_t base_key_len = final_key.size(); decode(num, p); while (num--) { string key; bufferlist value; decode(key, p); decode(value, p); final_key.resize(base_key_len); // keep prefix final_key += key; dout(20) << __func__ << " " << pretty_binary_string(final_key) << " <- " << key << dendl; txc->t->set(prefix, final_key, value); } r = 0; dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl; return r; } int BlueStore::_omap_setheader(TransContext *txc, CollectionRef& c, OnodeRef &o, bufferlist& bl) { dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl; int r; string key; if (!o->onode.has_omap()) { if (o->oid.is_pgmeta()) { o->onode.set_omap_flags_pgmeta(); } else { o->onode.set_omap_flags(per_pool_omap == OMAP_BULK); } txc->write_onode(o); const string& prefix = o->get_omap_prefix(); string key_tail; bufferlist tail; o->get_omap_tail(&key_tail); txc->t->set(prefix, key_tail, tail); } else { txc->note_modified_object(o); } const string& prefix = o->get_omap_prefix(); o->get_omap_header(&key); txc->t->set(prefix, key, bl); r = 0; dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl; return r; } int BlueStore::_omap_rmkeys(TransContext *txc, CollectionRef& c, OnodeRef& o, bufferlist& bl) { dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl; int r = 0; auto p = bl.cbegin(); __u32 num; string final_key; if (!o->onode.has_omap()) { goto out; } { const string& prefix = o->get_omap_prefix(); o->get_omap_key(string(), &final_key); size_t base_key_len = final_key.size(); decode(num, p); while (num--) { string key; decode(key, p); final_key.resize(base_key_len); // keep prefix final_key += key; dout(20) << __func__ << " rm " << pretty_binary_string(final_key) << " <- " << key << dendl; txc->t->rmkey(prefix, final_key); } } txc->note_modified_object(o); out: dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl; return r; } int 
BlueStore::_omap_rmkey_range(TransContext *txc, CollectionRef& c, OnodeRef& o, const string& first, const string& last) { dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl; string key_first, key_last; int r = 0; if (!o->onode.has_omap()) { goto out; } { const string& prefix = o->get_omap_prefix(); o->flush(); o->get_omap_key(first, &key_first); o->get_omap_key(last, &key_last); txc->t->rm_range_keys(prefix, key_first, key_last); dout(20) << __func__ << " remove range start: " << pretty_binary_string(key_first) << " end: " << pretty_binary_string(key_last) << dendl; } txc->note_modified_object(o); out: dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl; return r; } int BlueStore::_set_alloc_hint( TransContext *txc, CollectionRef& c, OnodeRef& o, uint64_t expected_object_size, uint64_t expected_write_size, uint32_t flags) { dout(15) << __func__ << " " << c->cid << " " << o->oid << " object_size " << expected_object_size << " write_size " << expected_write_size << " flags " << ceph_osd_alloc_hint_flag_string(flags) << dendl; int r = 0; o->onode.expected_object_size = expected_object_size; o->onode.expected_write_size = expected_write_size; o->onode.alloc_hint_flags = flags; txc->write_onode(o); dout(10) << __func__ << " " << c->cid << " " << o->oid << " object_size " << expected_object_size << " write_size " << expected_write_size << " flags " << ceph_osd_alloc_hint_flag_string(flags) << " = " << r << dendl; return r; } int BlueStore::_clone(TransContext *txc, CollectionRef& c, OnodeRef& oldo, OnodeRef& newo) { dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> " << newo->oid << dendl; int r = 0; if (oldo->oid.hobj.get_hash() != newo->oid.hobj.get_hash()) { derr << __func__ << " mismatched hash on " << oldo->oid << " and " << newo->oid << dendl; return -EINVAL; } _assign_nid(txc, newo); // clone data oldo->flush(); _do_truncate(txc, c, newo, 0); if (cct->_conf->bluestore_clone_cow) { _do_clone_range(txc, c, oldo, newo, 0, oldo->onode.size, 0); } else { bufferlist bl; r = _do_read(c.get(), oldo, 0, oldo->onode.size, bl, 0); if (r < 0) goto out; r = _do_write(txc, c, newo, 0, oldo->onode.size, bl, 0); if (r < 0) goto out; } // clone attrs newo->onode.attrs = oldo->onode.attrs; // clone omap if (newo->onode.has_omap()) { dout(20) << __func__ << " clearing old omap data" << dendl; newo->flush(); _do_omap_clear(txc, newo); newo->onode.clear_omap_flag(); } if (oldo->onode.has_omap()) { dout(20) << __func__ << " copying omap data" << dendl; if (newo->oid.is_pgmeta()) { newo->onode.set_omap_flags_pgmeta(); } else { newo->onode.set_omap_flags(per_pool_omap == OMAP_BULK); } const string& prefix = newo->get_omap_prefix(); string head, tail; oldo->get_omap_header(&head); oldo->get_omap_tail(&tail); KeyValueDB::Iterator it = db->get_iterator(prefix, 0, KeyValueDB::IteratorBounds{head, tail}); it->lower_bound(head); while (it->valid()) { if (it->key() >= tail) { dout(30) << __func__ << " reached tail" << dendl; break; } else { dout(30) << __func__ << " got header/data " << pretty_binary_string(it->key()) << dendl; string key; newo->rewrite_omap_key(it->key(), &key); txc->t->set(prefix, key, it->value()); } it->next(); } string new_tail; bufferlist new_tail_value; newo->get_omap_tail(&new_tail); txc->t->set(prefix, new_tail, new_tail_value); } txc->write_onode(newo); r = 0; out: dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> " << newo->oid << " = " << r << dendl; return r; } int BlueStore::_do_clone_range( TransContext *txc, 
CollectionRef& c, OnodeRef& oldo, OnodeRef& newo, uint64_t srcoff, uint64_t length, uint64_t dstoff) { dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> " << newo->oid << " 0x" << std::hex << srcoff << "~" << length << " -> " << " 0x" << dstoff << "~" << length << std::dec << dendl; oldo->extent_map.fault_range(db, srcoff, length); newo->extent_map.fault_range(db, dstoff, length); _dump_onode<30>(cct, *oldo); _dump_onode<30>(cct, *newo); oldo->extent_map.dup(this, txc, c, oldo, newo, srcoff, length, dstoff); _dump_onode<30>(cct, *oldo); _dump_onode<30>(cct, *newo); return 0; } int BlueStore::_clone_range(TransContext *txc, CollectionRef& c, OnodeRef& oldo, OnodeRef& newo, uint64_t srcoff, uint64_t length, uint64_t dstoff) { dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> " << newo->oid << " from 0x" << std::hex << srcoff << "~" << length << " to offset 0x" << dstoff << std::dec << dendl; int r = 0; if (srcoff + length >= OBJECT_MAX_SIZE || dstoff + length >= OBJECT_MAX_SIZE) { r = -E2BIG; goto out; } if (srcoff + length > oldo->onode.size) { r = -EINVAL; goto out; } _assign_nid(txc, newo); if (length > 0) { if (cct->_conf->bluestore_clone_cow) { _do_zero(txc, c, newo, dstoff, length); _do_clone_range(txc, c, oldo, newo, srcoff, length, dstoff); } else { bufferlist bl; r = _do_read(c.get(), oldo, srcoff, length, bl, 0); if (r < 0) goto out; r = _do_write(txc, c, newo, dstoff, bl.length(), bl, 0); if (r < 0) goto out; } } txc->write_onode(newo); r = 0; out: dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> " << newo->oid << " from 0x" << std::hex << srcoff << "~" << length << " to offset 0x" << dstoff << std::dec << " = " << r << dendl; return r; } int BlueStore::_rename(TransContext *txc, CollectionRef& c, OnodeRef& oldo, OnodeRef& newo, const ghobject_t& new_oid) { dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> " << new_oid << dendl; int r; ghobject_t old_oid = oldo->oid; mempool::bluestore_cache_meta::string new_okey; if (newo) { if (newo->exists) { r = -EEXIST; goto out; } ceph_assert(txc->onodes.count(newo) == 0); } txc->t->rmkey(PREFIX_OBJ, oldo->key.c_str(), oldo->key.size()); // rewrite shards { oldo->extent_map.fault_range(db, 0, oldo->onode.size); get_object_key(cct, new_oid, &new_okey); string key; for (auto &s : oldo->extent_map.shards) { generate_extent_shard_key_and_apply(oldo->key, s.shard_info->offset, &key, [&](const string& final_key) { txc->t->rmkey(PREFIX_OBJ, final_key); } ); s.dirty = true; } } newo = oldo; txc->write_onode(newo); // this adjusts oldo->{oid,key}, and reset oldo to a fresh empty // Onode in the old slot c->onode_map.rename(oldo, old_oid, new_oid, new_okey); r = 0; // hold a ref to new Onode in old name position, to ensure we don't drop // it from the cache before this txc commits (or else someone may come along // and read newo's metadata via the old name). 
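  // (note_modified_object() below is what takes that ref: it stashes the onode
  // in the TransContext so it stays pinned until the transaction commits -- a
  // paraphrase of the intent described above.)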
txc->note_modified_object(oldo); out: dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> " << new_oid << " = " << r << dendl; return r; } // collections int BlueStore::_create_collection( TransContext *txc, const coll_t &cid, unsigned bits, CollectionRef *c) { dout(15) << __func__ << " " << cid << " bits " << bits << dendl; int r; bufferlist bl; { std::unique_lock l(coll_lock); if (*c) { r = -EEXIST; goto out; } auto p = new_coll_map.find(cid); ceph_assert(p != new_coll_map.end()); *c = p->second; (*c)->cnode.bits = bits; coll_map[cid] = *c; new_coll_map.erase(p); } encode((*c)->cnode, bl); txc->t->set(PREFIX_COLL, stringify(cid), bl); r = 0; out: dout(10) << __func__ << " " << cid << " bits " << bits << " = " << r << dendl; return r; } int BlueStore::_remove_collection(TransContext *txc, const coll_t &cid, CollectionRef *c) { dout(15) << __func__ << " " << cid << dendl; int r; (*c)->flush_all_but_last(); { std::unique_lock l(coll_lock); if (!*c) { r = -ENOENT; goto out; } size_t nonexistent_count = 0; ceph_assert((*c)->exists); if ((*c)->onode_map.map_any([&](Onode* o) { if (o->exists) { dout(1) << __func__ << " " << o->oid << " " << o << " exists in onode_map" << dendl; return true; } ++nonexistent_count; return false; })) { r = -ENOTEMPTY; goto out; } vector<ghobject_t> ls; ghobject_t next; // Enumerate onodes in db, up to nonexistent_count + 1 // then check if all of them are marked as non-existent. // Bypass the check if (next != ghobject_t::get_max()) r = _collection_list(c->get(), ghobject_t(), ghobject_t::get_max(), nonexistent_count + 1, false, &ls, &next); if (r >= 0) { // If true, the collection has more objects than nonexistent_count, // so bypass the check. bool exists = (!next.is_max()); for (auto it = ls.begin(); !exists && it < ls.end(); ++it) { dout(10) << __func__ << " oid " << *it << dendl; auto onode = (*c)->onode_map.lookup(*it); exists = !onode || onode->exists; if (exists) { dout(1) << __func__ << " " << *it << " exists in db, " << (!onode ? "not present in ram" : "present in ram") << dendl; } } if (!exists) { _do_remove_collection(txc, c); r = 0; } else { dout(10) << __func__ << " " << cid << " is non-empty" << dendl; r = -ENOTEMPTY; } } } out: dout(10) << __func__ << " " << cid << " = " << r << dendl; return r; } void BlueStore::_do_remove_collection(TransContext *txc, CollectionRef *c) { coll_map.erase((*c)->cid); txc->removed_collections.push_back(*c); (*c)->exists = false; _osr_register_zombie((*c)->osr.get()); txc->t->rmkey(PREFIX_COLL, stringify((*c)->cid)); c->reset(); } int BlueStore::_split_collection(TransContext *txc, CollectionRef& c, CollectionRef& d, unsigned bits, int rem) { dout(15) << __func__ << " " << c->cid << " to " << d->cid << " " << " bits " << bits << dendl; std::unique_lock l(c->lock); std::unique_lock l2(d->lock); int r; // flush all previous deferred writes on this sequencer. this is a bit // heavyweight, but we need to make sure all deferred writes complete // before we split as the new collection's sequencer may need to order // this after those writes, and we don't bother with the complexity of // moving those TransContexts over to the new osr. _osr_drain_preceding(txc); // move any cached items (onodes and referenced shared blobs) that will // belong to the child collection post-split. leave everything else behind. // this may include things that don't strictly belong to the now-smaller // parent split, but the OSD will always send us a split for every new // child.
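  // Hypothetical example for orientation (not taken from the code): if the
  // parent had cnode.bits == 2 and this split raises it to 3, split_cache()
  // below hands the cached onodes and shared blobs whose hashes match the
  // child's 3-bit prefix over to d, while the parent keeps the rest and
  // simply records the new bit count.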
spg_t pgid, dest_pgid; bool is_pg = c->cid.is_pg(&pgid); ceph_assert(is_pg); is_pg = d->cid.is_pg(&dest_pgid); ceph_assert(is_pg); // the destination should initially be empty. ceph_assert(d->onode_map.empty()); ceph_assert(d->shared_blob_set.empty()); ceph_assert(d->cnode.bits == bits); c->split_cache(d.get()); // adjust bits. note that this will be redundant for all but the first // split call for this parent (first child). c->cnode.bits = bits; ceph_assert(d->cnode.bits == bits); r = 0; bufferlist bl; encode(c->cnode, bl); txc->t->set(PREFIX_COLL, stringify(c->cid), bl); dout(10) << __func__ << " " << c->cid << " to " << d->cid << " " << " bits " << bits << " = " << r << dendl; return r; } int BlueStore::_merge_collection( TransContext *txc, CollectionRef *c, CollectionRef& d, unsigned bits) { dout(15) << __func__ << " " << (*c)->cid << " to " << d->cid << " bits " << bits << dendl; std::unique_lock l((*c)->lock); std::unique_lock l2(d->lock); int r; coll_t cid = (*c)->cid; // flush all previous deferred writes on the source collection to ensure // that all deferred writes complete before we merge as the target collection's // sequencer may need to order new ops after those writes. _osr_drain((*c)->osr.get()); // move any cached items (onodes and referenced shared blobs) that will // belong to the child collection post-split. leave everything else behind. // this may include things that don't strictly belong to the now-smaller // parent split, but the OSD will always send us a split for every new // child. spg_t pgid, dest_pgid; bool is_pg = cid.is_pg(&pgid); ceph_assert(is_pg); is_pg = d->cid.is_pg(&dest_pgid); ceph_assert(is_pg); // adjust bits. note that this will be redundant for all but the first // merge call for the parent/target. d->cnode.bits = bits; // behavior depends on target (d) bits, so this after that is updated. 
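  // (Informal note: in the merge case the target's bit count just shrank, so
  // every cached onode and shared blob in the source now maps to d and
  // split_cache() effectively migrates the whole cache before the source
  // collection is removed below.)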
(*c)->split_cache(d.get()); // remove source collection { std::unique_lock l3(coll_lock); _do_remove_collection(txc, c); } r = 0; bufferlist bl; encode(d->cnode, bl); txc->t->set(PREFIX_COLL, stringify(d->cid), bl); dout(10) << __func__ << " " << cid << " to " << d->cid << " " << " bits " << bits << " = " << r << dendl; return r; } void BlueStore::log_latency( const char* name, int idx, const ceph::timespan& l, double lat_threshold, const char* info) const { logger->tinc(idx, l); if (lat_threshold > 0.0 && l >= make_timespan(lat_threshold)) { dout(0) << __func__ << " slow operation observed for " << name << ", latency = " << l << info << dendl; } } void BlueStore::log_latency_fn( const char* name, int idx, const ceph::timespan& l, double lat_threshold, std::function fn) const { logger->tinc(idx, l); if (lat_threshold > 0.0 && l >= make_timespan(lat_threshold)) { dout(0) << __func__ << " slow operation observed for " << name << ", latency = " << l << fn(l) << dendl; } } #if defined(WITH_LTTNG) void BlueStore::BlueStoreThrottle::emit_initial_tracepoint( KeyValueDB &db, TransContext &txc, mono_clock::time_point start_throttle_acquire) { pending_kv_ios += txc.ios; if (txc.deferred_txn) { pending_deferred_ios += txc.ios; } uint64_t started = 0; uint64_t completed = 0; if (should_trace(&started, &completed)) { txc.tracing = true; uint64_t rocksdb_base_level, rocksdb_estimate_pending_compaction_bytes, rocksdb_cur_size_all_mem_tables, rocksdb_compaction_pending, rocksdb_mem_table_flush_pending, rocksdb_num_running_compactions, rocksdb_num_running_flushes, rocksdb_actual_delayed_write_rate; db.get_property( "rocksdb.base-level", &rocksdb_base_level); db.get_property( "rocksdb.estimate-pending-compaction-bytes", &rocksdb_estimate_pending_compaction_bytes); db.get_property( "rocksdb.cur-size-all-mem-tables", &rocksdb_cur_size_all_mem_tables); db.get_property( "rocksdb.compaction-pending", &rocksdb_compaction_pending); db.get_property( "rocksdb.mem-table-flush-pending", &rocksdb_mem_table_flush_pending); db.get_property( "rocksdb.num-running-compactions", &rocksdb_num_running_compactions); db.get_property( "rocksdb.num-running-flushes", &rocksdb_num_running_flushes); db.get_property( "rocksdb.actual-delayed-write-rate", &rocksdb_actual_delayed_write_rate); tracepoint( bluestore, transaction_initial_state, txc.osr->get_sequencer_id(), txc.seq, throttle_bytes.get_current(), throttle_deferred_bytes.get_current(), pending_kv_ios, pending_deferred_ios, started, completed, ceph::to_seconds(mono_clock::now() - start_throttle_acquire)); tracepoint( bluestore, transaction_initial_state_rocksdb, txc.osr->get_sequencer_id(), txc.seq, rocksdb_base_level, rocksdb_estimate_pending_compaction_bytes, rocksdb_cur_size_all_mem_tables, rocksdb_compaction_pending, rocksdb_mem_table_flush_pending, rocksdb_num_running_compactions, rocksdb_num_running_flushes, rocksdb_actual_delayed_write_rate); } } #endif mono_clock::duration BlueStore::BlueStoreThrottle::log_state_latency( TransContext &txc, PerfCounters *logger, int state) { mono_clock::time_point now = mono_clock::now(); mono_clock::duration lat = now - txc.last_stamp; logger->tinc(state, lat); #if defined(WITH_LTTNG) if (txc.tracing && state >= l_bluestore_state_prepare_lat && state <= l_bluestore_state_done_lat) { OID_ELAPSED("", lat.to_nsec() / 1000.0, txc.get_state_latency_name(state)); tracepoint( bluestore, transaction_state_duration, txc.osr->get_sequencer_id(), txc.seq, state, ceph::to_seconds(lat)); } #endif txc.last_stamp = now; return lat; } bool 
bool BlueStore::BlueStoreThrottle::try_start_transaction(
  KeyValueDB &db,
  TransContext &txc,
  mono_clock::time_point start_throttle_acquire)
{
  throttle_bytes.get(txc.cost);
  if (!txc.deferred_txn ||
      throttle_deferred_bytes.get_or_fail(txc.cost)) {
    emit_initial_tracepoint(db, txc, start_throttle_acquire);
    return true;
  } else {
    return false;
  }
}

void BlueStore::BlueStoreThrottle::finish_start_transaction(
  KeyValueDB &db,
  TransContext &txc,
  mono_clock::time_point start_throttle_acquire)
{
  ceph_assert(txc.deferred_txn);
  throttle_deferred_bytes.get(txc.cost);
  emit_initial_tracepoint(db, txc, start_throttle_acquire);
}

#if defined(WITH_LTTNG)
void BlueStore::BlueStoreThrottle::complete_kv(TransContext &txc)
{
  pending_kv_ios -= 1;
  ios_completed_since_last_traced++;
  if (txc.tracing) {
    tracepoint(
      bluestore,
      transaction_commit_latency,
      txc.osr->get_sequencer_id(),
      txc.seq,
      ceph::to_seconds(mono_clock::now() - txc.start));
  }
}
#endif

#if defined(WITH_LTTNG)
void BlueStore::BlueStoreThrottle::complete(TransContext &txc)
{
  if (txc.deferred_txn) {
    pending_deferred_ios -= 1;
  }
  if (txc.tracing) {
    mono_clock::time_point now = mono_clock::now();
    mono_clock::duration lat = now - txc.start;
    tracepoint(
      bluestore,
      transaction_total_duration,
      txc.osr->get_sequencer_id(),
      txc.seq,
      ceph::to_seconds(lat));
  }
}
#endif

// DB key value Histogram
#define KEY_SLAB 32
#define VALUE_SLAB 64

const string prefix_onode = "o";
const string prefix_onode_shard = "x";
const string prefix_other = "Z";

int BlueStore::DBHistogram::get_key_slab(size_t sz)
{
  return (sz/KEY_SLAB);
}

string BlueStore::DBHistogram::get_key_slab_to_range(int slab)
{
  int lower_bound = slab * KEY_SLAB;
  int upper_bound = (slab + 1) * KEY_SLAB;
  string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
  return ret;
}

int BlueStore::DBHistogram::get_value_slab(size_t sz)
{
  return (sz/VALUE_SLAB);
}

string BlueStore::DBHistogram::get_value_slab_to_range(int slab)
{
  int lower_bound = slab * VALUE_SLAB;
  int upper_bound = (slab + 1) * VALUE_SLAB;
  string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
  return ret;
}

void BlueStore::DBHistogram::update_hist_entry(
  map<string, map<int, struct key_dist> > &key_hist,
  const string &prefix, size_t key_size, size_t value_size)
{
  uint32_t key_slab = get_key_slab(key_size);
  uint32_t value_slab = get_value_slab(value_size);
  key_hist[prefix][key_slab].count++;
  key_hist[prefix][key_slab].max_len =
    std::max(key_size, key_hist[prefix][key_slab].max_len);
  key_hist[prefix][key_slab].val_map[value_slab].count++;
  key_hist[prefix][key_slab].val_map[value_slab].max_len =
    std::max(value_size,
             key_hist[prefix][key_slab].val_map[value_slab].max_len);
}

void BlueStore::DBHistogram::dump(Formatter *f)
{
  f->open_object_section("rocksdb_value_distribution");
  for (auto i : value_hist) {
    f->dump_unsigned(get_value_slab_to_range(i.first).data(), i.second);
  }
  f->close_section();

  f->open_object_section("rocksdb_key_value_histogram");
  for (auto i : key_hist) {
    f->dump_string("prefix", i.first);
    f->open_object_section("key_hist");
    for (auto k : i.second) {
      f->dump_unsigned(get_key_slab_to_range(k.first).data(), k.second.count);
      f->dump_unsigned("max_len", k.second.max_len);
      f->open_object_section("value_hist");
      for (auto j : k.second.val_map) {
        f->dump_unsigned(get_value_slab_to_range(j.first).data(), j.second.count);
        f->dump_unsigned("max_len", j.second.max_len);
      }
      f->close_section();
    }
    f->close_section();
  }
  f->close_section();
}
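// Example of the slab bucketing above, given KEY_SLAB == 32 and
// VALUE_SLAB == 64: a 100-byte key with a 500-byte value falls into
// key slab 100/32 = 3 (range "[96,128)") and value slab 500/64 = 7
// (range "[448,512)"), so update_hist_entry() increments
// key_hist[prefix][3].val_map[7].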
// Iterates through the db and collects the stats
void BlueStore::generate_db_histogram(Formatter *f)
{
  // globals
  uint64_t num_onodes = 0;
  uint64_t num_shards = 0;
  uint64_t num_super = 0;
  uint64_t num_coll = 0;
  uint64_t num_omap = 0;
  uint64_t num_pgmeta_omap = 0;
  uint64_t num_deferred = 0;
  uint64_t num_alloc = 0;
  uint64_t num_stat = 0;
  uint64_t num_others = 0;
  uint64_t num_shared_shards = 0;
  size_t max_key_size = 0, max_value_size = 0;
  uint64_t total_key_size = 0, total_value_size = 0;
  size_t key_size = 0, value_size = 0;
  DBHistogram hist;

  auto start = coarse_mono_clock::now();

  KeyValueDB::WholeSpaceIterator iter = db->get_wholespace_iterator();
  iter->seek_to_first();
  while (iter->valid()) {
    dout(30) << __func__ << " Key: " << iter->key() << dendl;
    key_size = iter->key_size();
    value_size = iter->value_size();
    hist.value_hist[hist.get_value_slab(value_size)]++;
    max_key_size = std::max(max_key_size, key_size);
    max_value_size = std::max(max_value_size, value_size);
    total_key_size += key_size;
    total_value_size += value_size;

    pair<string,string> key(iter->raw_key());

    if (key.first == PREFIX_SUPER) {
      hist.update_hist_entry(hist.key_hist, PREFIX_SUPER, key_size, value_size);
      num_super++;
    } else if (key.first == PREFIX_STAT) {
      hist.update_hist_entry(hist.key_hist, PREFIX_STAT, key_size, value_size);
      num_stat++;
    } else if (key.first == PREFIX_COLL) {
      hist.update_hist_entry(hist.key_hist, PREFIX_COLL, key_size, value_size);
      num_coll++;
    } else if (key.first == PREFIX_OBJ) {
      if (key.second.back() == ONODE_KEY_SUFFIX) {
        hist.update_hist_entry(hist.key_hist, prefix_onode, key_size, value_size);
        num_onodes++;
      } else {
        hist.update_hist_entry(hist.key_hist, prefix_onode_shard, key_size, value_size);
        num_shards++;
      }
    } else if (key.first == PREFIX_OMAP) {
      hist.update_hist_entry(hist.key_hist, PREFIX_OMAP, key_size, value_size);
      num_omap++;
    } else if (key.first == PREFIX_PERPOOL_OMAP) {
      hist.update_hist_entry(hist.key_hist, PREFIX_PERPOOL_OMAP, key_size, value_size);
      num_omap++;
    } else if (key.first == PREFIX_PERPG_OMAP) {
      hist.update_hist_entry(hist.key_hist, PREFIX_PERPG_OMAP, key_size, value_size);
      num_omap++;
    } else if (key.first == PREFIX_PGMETA_OMAP) {
      hist.update_hist_entry(hist.key_hist, PREFIX_PGMETA_OMAP, key_size, value_size);
      num_pgmeta_omap++;
    } else if (key.first == PREFIX_DEFERRED) {
      hist.update_hist_entry(hist.key_hist, PREFIX_DEFERRED, key_size, value_size);
      num_deferred++;
    } else if (key.first == PREFIX_ALLOC || key.first == PREFIX_ALLOC_BITMAP) {
      hist.update_hist_entry(hist.key_hist, PREFIX_ALLOC, key_size, value_size);
      num_alloc++;
    } else if (key.first == PREFIX_SHARED_BLOB) {
      hist.update_hist_entry(hist.key_hist, PREFIX_SHARED_BLOB, key_size, value_size);
      num_shared_shards++;
    } else {
      hist.update_hist_entry(hist.key_hist, prefix_other, key_size, value_size);
      num_others++;
    }
    iter->next();
  }

  ceph::timespan duration = coarse_mono_clock::now() - start;
  f->open_object_section("rocksdb_key_value_stats");
  f->dump_unsigned("num_onodes", num_onodes);
  f->dump_unsigned("num_shards", num_shards);
  f->dump_unsigned("num_super", num_super);
  f->dump_unsigned("num_coll", num_coll);
  f->dump_unsigned("num_omap", num_omap);
  f->dump_unsigned("num_pgmeta_omap", num_pgmeta_omap);
  f->dump_unsigned("num_deferred", num_deferred);
  f->dump_unsigned("num_alloc", num_alloc);
  f->dump_unsigned("num_stat", num_stat);
  f->dump_unsigned("num_shared_shards", num_shared_shards);
  f->dump_unsigned("num_others", num_others);
  f->dump_unsigned("max_key_size", max_key_size);
  f->dump_unsigned("max_value_size", max_value_size);
  f->dump_unsigned("total_key_size", total_key_size);
  f->dump_unsigned("total_value_size", total_value_size);
  f->close_section();

  hist.dump(f);

  dout(20) << __func__ << " finished in " << duration << " seconds" << dendl;
}

void BlueStore::_shutdown_cache()
{
  dout(10) << __func__ << dendl;
  for (auto i : buffer_cache_shards) {
    i->flush();
    ceph_assert(i->empty());
  }
  for (auto& p : coll_map) {
    p.second->onode_map.clear();
    if (!p.second->shared_blob_set.empty()) {
      derr << __func__ << " stray shared blobs on " << p.first << dendl;
      p.second->shared_blob_set.dump<0>(cct);
    }
    ceph_assert(p.second->onode_map.empty());
    ceph_assert(p.second->shared_blob_set.empty());
  }
  coll_map.clear();
  for (auto i : onode_cache_shards) {
    ceph_assert(i->empty());
  }
}

// For external caller.
// We use a best-effort policy instead, e.g.,
// we don't care if there are still some pinned onodes/data in the cache
// after this command is completed.
int BlueStore::flush_cache(ostream *os)
{
  dout(10) << __func__ << dendl;
  for (auto i : onode_cache_shards) {
    i->flush();
  }
  for (auto i : buffer_cache_shards) {
    i->flush();
  }

  return 0;
}

void BlueStore::_apply_padding(uint64_t head_pad,
                               uint64_t tail_pad,
                               bufferlist& padded)
{
  if (head_pad) {
    padded.prepend_zero(head_pad);
  }
  if (tail_pad) {
    padded.append_zero(tail_pad);
  }
  if (head_pad || tail_pad) {
    dout(20) << __func__ << " can pad head 0x" << std::hex << head_pad
             << " tail 0x" << tail_pad << std::dec << dendl;
    logger->inc(l_bluestore_write_pad_bytes, head_pad + tail_pad);
  }
}

void BlueStore::_record_onode(OnodeRef &o, KeyValueDB::Transaction &txn)
{
  // finalize extent_map shards
  o->extent_map.update(txn, false);
  if (o->extent_map.needs_reshard()) {
    o->extent_map.reshard(db, txn);
    o->extent_map.update(txn, true);
    if (o->extent_map.needs_reshard()) {
      dout(20) << __func__ << " warning: still wants reshard, check options?"
               << dendl;
      o->extent_map.clear_needs_reshard();
    }
    logger->inc(l_bluestore_onode_reshard);
  }

  // bound encode
  size_t bound = 0;
  denc(o->onode, bound);
  o->extent_map.bound_encode_spanning_blobs(bound);
  if (o->onode.extent_map_shards.empty()) {
    denc(o->extent_map.inline_bl, bound);
  }

  // encode
  bufferlist bl;
  unsigned onode_part, blob_part, extent_part;
  {
    auto p = bl.get_contiguous_appender(bound, true);
    denc(o->onode, p);
    onode_part = p.get_logical_offset();
    o->extent_map.encode_spanning_blobs(p);
    blob_part = p.get_logical_offset() - onode_part;
    if (o->onode.extent_map_shards.empty()) {
      denc(o->extent_map.inline_bl, p);
    }
    extent_part = p.get_logical_offset() - onode_part - blob_part;
  }

  dout(20) << __func__ << " onode " << o->oid << " is " << bl.length()
           << " (" << onode_part << " bytes onode + "
           << blob_part << " bytes spanning blobs + "
           << extent_part << " bytes inline extents)"
           << dendl;

  txn->set(PREFIX_OBJ, o->key.c_str(), o->key.size(), bl);
}

void BlueStore::_log_alerts(osd_alert_list_t& alerts)
{
  std::lock_guard l(qlock);

  if (!spurious_read_errors_alert.empty() &&
      cct->_conf->bluestore_warn_on_spurious_read_errors) {
    alerts.emplace(
      "BLUESTORE_SPURIOUS_READ_ERRORS",
      spurious_read_errors_alert);
  }
  if (!disk_size_mismatch_alert.empty()) {
    alerts.emplace(
      "BLUESTORE_DISK_SIZE_MISMATCH",
      disk_size_mismatch_alert);
  }
  if (!legacy_statfs_alert.empty()) {
    alerts.emplace(
      "BLUESTORE_LEGACY_STATFS",
      legacy_statfs_alert);
  }
  if (!spillover_alert.empty() &&
      cct->_conf->bluestore_warn_on_bluefs_spillover) {
    alerts.emplace(
      "BLUEFS_SPILLOVER",
      spillover_alert);
  }
  if (!no_per_pg_omap_alert.empty()) {
    alerts.emplace(
      "BLUESTORE_NO_PER_PG_OMAP",
      no_per_pg_omap_alert);
  }
  if (!no_per_pool_omap_alert.empty()) {
    alerts.emplace(
      "BLUESTORE_NO_PER_POOL_OMAP",
      no_per_pool_omap_alert);
  }
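  // combine the failed compression mode (if any) and the names of any
  // compressors that failed to load into a single BLUESTORE_NO_COMPRESSION
  // alert string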
  string s0(failed_cmode);

  if (!failed_compressors.empty()) {
    if (!s0.empty()) {
      s0 += ", ";
    }
    s0 += "unable to load:";
    bool first = true;
    for (auto& s : failed_compressors) {
      if (first) {
        first = false;
      } else {
        s0 += ", ";
      }
      s0 += s;
    }
    alerts.emplace(
      "BLUESTORE_NO_COMPRESSION",
      s0);
  }
}

void BlueStore::_collect_allocation_stats(uint64_t need, uint32_t alloc_size,
                                          size_t extents)
{
  alloc_stats_count++;
  alloc_stats_fragments += extents;
  alloc_stats_size += need;
}

void BlueStore::_record_allocation_stats()
{
  // don't care about data consistency,
  // fields can be partially modified while making the tuple
  auto t0 = std::make_tuple(
    alloc_stats_count.exchange(0),
    alloc_stats_fragments.exchange(0),
    alloc_stats_size.exchange(0));

  dout(0) << " allocation stats probe "
          << probe_count << ":"
          << " cnt: " << std::get<0>(t0)
          << " frags: " << std::get<1>(t0)
          << " size: " << std::get<2>(t0)
          << dendl;

  //
  // Keep the history for probes from the power-of-two sequence:
  // -1, -2, -4, -8, -16
  //
  size_t base = 1;
  for (auto& t : alloc_stats_history) {
    dout(0) << " probe -"
            << base + (probe_count % base) << ": "
            << std::get<0>(t)
            << ", " << std::get<1>(t)
            << ", " << std::get<2>(t)
            << dendl;
    base <<= 1;
  }
  dout(0) << "------------" << dendl;

  ++probe_count;

  for (ssize_t i = alloc_stats_history.size() - 1 ; i > 0 ; --i) {
    if ((probe_count % (1 << i)) == 0) {
      alloc_stats_history[i] = alloc_stats_history[i - 1];
    }
  }
  alloc_stats_history[0].swap(t0);
}

// ===========================================
// BlueStoreRepairer

size_t BlueStoreRepairer::StoreSpaceTracker::filter_out(
  const interval_set<uint64_t>& extents)
{
  ceph_assert(granularity); // initialized
  // can't call for the second time
  ceph_assert(!was_filtered_out);
  ceph_assert(collections_bfs.size() == objects_bfs.size());

  uint64_t prev_pos = 0;
  uint64_t npos = collections_bfs.size();

  bloom_vector collections_reduced;
  bloom_vector objects_reduced;

  for (auto e : extents) {
    if (e.second == 0) {
      continue;
    }
    uint64_t pos = max(e.first / granularity, prev_pos);
    uint64_t end_pos = 1 + (e.first + e.second - 1) / granularity;
    while (pos != npos && pos < end_pos) {
      ceph_assert(collections_bfs[pos].element_count() ==
                  objects_bfs[pos].element_count());
      if (collections_bfs[pos].element_count()) {
        collections_reduced.push_back(std::move(collections_bfs[pos]));
        objects_reduced.push_back(std::move(objects_bfs[pos]));
      }
      ++pos;
    }
    prev_pos = end_pos;
  }
  collections_reduced.swap(collections_bfs);
  objects_reduced.swap(objects_bfs);
  was_filtered_out = true;
  return collections_bfs.size();
}

bool BlueStoreRepairer::remove_key(KeyValueDB *db,
                                   const string& prefix,
                                   const string& key)
{
  std::lock_guard l(lock);
  if (!remove_key_txn) {
    remove_key_txn = db->get_transaction();
  }
  ++to_repair_cnt;
  remove_key_txn->rmkey(prefix, key);

  return true;
}

void BlueStoreRepairer::fix_per_pool_omap(KeyValueDB *db, int val)
{
  std::lock_guard l(lock); // possibly redundant
  ceph_assert(fix_per_pool_omap_txn == nullptr);
  fix_per_pool_omap_txn = db->get_transaction();
  ++to_repair_cnt;
  bufferlist bl;
  bl.append(stringify(val));
  fix_per_pool_omap_txn->set(PREFIX_SUPER, "per_pool_omap", bl);
}

bool BlueStoreRepairer::fix_shared_blob(
  KeyValueDB::Transaction txn,
  uint64_t sbid,
  bluestore_extent_ref_map_t* ref_map,
  size_t repaired)
{
  string key;
  get_shared_blob_key(sbid, &key);
  if (ref_map) {
    bluestore_shared_blob_t persistent(sbid, std::move(*ref_map));
    bufferlist bl;
    encode(persistent, bl);
    txn->set(PREFIX_SHARED_BLOB, key, bl);
  } else {
    txn->rmkey(PREFIX_SHARED_BLOB, key);
  }
  to_repair_cnt += repaired;
  return true;
}
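// Note on the repair flow (illustrative only): fsck/repair collects fixes
// through the fix_*() and remove_key() helpers, most of which lazily allocate
// a dedicated KeyValueDB transaction on first use and bump to_repair_cnt;
// nothing touches the store until apply() (further below) submits each
// accumulated transaction synchronously.  A hypothetical caller sketch,
// with expected_statfs standing in for whatever value fsck computed:
//
//   BlueStoreRepairer repairer;
//   repairer.fix_statfs(db, BLUESTORE_GLOBAL_STATFS_KEY, expected_statfs);
//   ...
//   unsigned repaired = repairer.apply(db);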
bool BlueStoreRepairer::fix_statfs(KeyValueDB *db,
                                   const string& key,
                                   const store_statfs_t& new_statfs)
{
  std::lock_guard l(lock);
  if (!fix_statfs_txn) {
    fix_statfs_txn = db->get_transaction();
  }
  BlueStore::volatile_statfs vstatfs;
  vstatfs = new_statfs;
  bufferlist bl;
  vstatfs.encode(bl);
  ++to_repair_cnt;
  fix_statfs_txn->set(PREFIX_STAT, key, bl);
  return true;
}

bool BlueStoreRepairer::fix_leaked(KeyValueDB *db,
                                   FreelistManager* fm,
                                   uint64_t offset, uint64_t len)
{
  std::lock_guard l(lock);
  if (!fix_fm_leaked_txn) {
    fix_fm_leaked_txn = db->get_transaction();
  }
  ++to_repair_cnt;
  fm->release(offset, len, fix_fm_leaked_txn);
  return true;
}

bool BlueStoreRepairer::fix_false_free(KeyValueDB *db,
                                       FreelistManager* fm,
                                       uint64_t offset, uint64_t len)
{
  std::lock_guard l(lock);
  if (!fix_fm_false_free_txn) {
    fix_fm_false_free_txn = db->get_transaction();
  }
  ++to_repair_cnt;
  fm->allocate(offset, len, fix_fm_false_free_txn);
  return true;
}

bool BlueStoreRepairer::fix_spanning_blobs(
  KeyValueDB* db,
  std::function<void(KeyValueDB::Transaction)> f)
{
  std::lock_guard l(lock);
  if (!fix_onode_txn) {
    fix_onode_txn = db->get_transaction();
  }
  f(fix_onode_txn);
  ++to_repair_cnt;
  return true;
}

bool BlueStoreRepairer::preprocess_misreference(KeyValueDB *db)
{
  //NB: not for use in multithreading mode!!!
  if (misreferenced_extents.size()) {
    size_t n = space_usage_tracker.filter_out(misreferenced_extents);
    ceph_assert(n > 0);
    if (!fix_misreferences_txn) {
      fix_misreferences_txn = db->get_transaction();
    }
    return true;
  }
  return false;
}

unsigned BlueStoreRepairer::apply(KeyValueDB* db)
{
  //NB: not for use in multithreading mode!!!
  if (fix_per_pool_omap_txn) {
    db->submit_transaction_sync(fix_per_pool_omap_txn);
    fix_per_pool_omap_txn = nullptr;
  }
  if (fix_fm_leaked_txn) {
    db->submit_transaction_sync(fix_fm_leaked_txn);
    fix_fm_leaked_txn = nullptr;
  }
  if (fix_fm_false_free_txn) {
    db->submit_transaction_sync(fix_fm_false_free_txn);
    fix_fm_false_free_txn = nullptr;
  }
  if (remove_key_txn) {
    db->submit_transaction_sync(remove_key_txn);
    remove_key_txn = nullptr;
  }
  if (fix_misreferences_txn) {
    db->submit_transaction_sync(fix_misreferences_txn);
    fix_misreferences_txn = nullptr;
  }
  if (fix_onode_txn) {
    db->submit_transaction_sync(fix_onode_txn);
    fix_onode_txn = nullptr;
  }
  if (fix_shared_blob_txn) {
    db->submit_transaction_sync(fix_shared_blob_txn);
    fix_shared_blob_txn = nullptr;
  }
  if (fix_statfs_txn) {
    db->submit_transaction_sync(fix_statfs_txn);
    fix_statfs_txn = nullptr;
  }
  if (need_compact) {
    db->compact();
    need_compact = false;
  }
  unsigned repaired = to_repair_cnt;
  to_repair_cnt = 0;
  return repaired;
}

// =======================================================
// RocksDBBlueFSVolumeSelector
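// The selector below steers LEVEL_SLOW data onto the fast DB device while
// there is headroom: db_avail4slow caps how much may be borrowed, and the
// observed per-level maxima estimate how much of the DB device RocksDB
// itself may still need.  Illustrative numbers (assumed, not defaults):
// with db_total = 60 GiB, max_db_use = 40 GiB and db_avail4slow = 10 GiB,
// avail = min(10 GiB, 60 GiB - 40 GiB) = 10 GiB, so SLOW-level hints keep
// resolving to BDEV_DB until SLOW usage on that device reaches 10 GiB.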
uint8_t RocksDBBlueFSVolumeSelector::select_prefer_bdev(void* h)
{
  ceph_assert(h != nullptr);
  uint64_t hint = reinterpret_cast<uint64_t>(h);
  uint8_t res;
  switch (hint) {
  case LEVEL_SLOW:
    res = BlueFS::BDEV_SLOW;
    if (db_avail4slow > 0) {
      // considering statically available db space vs.
      // - observed maximums on DB dev for DB/WAL/UNSORTED data
      // - observed maximum spillovers
      uint64_t max_db_use = 0; // max db usage we potentially observed
      max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB,
                                             LEVEL_LOG - LEVEL_FIRST);
      max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB,
                                             LEVEL_WAL - LEVEL_FIRST);
      max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB,
                                             LEVEL_DB - LEVEL_FIRST);
      // this could go to db hence using it in the estimation
      max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_SLOW,
                                             LEVEL_DB - LEVEL_FIRST);

      auto db_total = l_totals[LEVEL_DB - LEVEL_FIRST];
      uint64_t avail = min(
        db_avail4slow,
        max_db_use < db_total ? db_total - max_db_use : 0);

      // considering current DB dev usage for SLOW data
      if (avail > per_level_per_dev_usage.at(BlueFS::BDEV_DB,
                                             LEVEL_SLOW - LEVEL_FIRST)) {
        res = BlueFS::BDEV_DB;
      }
    }
    break;
  case LEVEL_LOG:
  case LEVEL_WAL:
    res = BlueFS::BDEV_WAL;
    break;
  case LEVEL_DB:
  default:
    res = BlueFS::BDEV_DB;
    break;
  }
  return res;
}

void RocksDBBlueFSVolumeSelector::get_paths(const std::string& base,
                                            paths& res) const
{
  auto db_size = l_totals[LEVEL_DB - LEVEL_FIRST];
  res.emplace_back(base, db_size);
  auto slow_size = l_totals[LEVEL_SLOW - LEVEL_FIRST];
  if (slow_size == 0) {
    slow_size = db_size;
  }
  res.emplace_back(base + ".slow", slow_size);
}

void* RocksDBBlueFSVolumeSelector::get_hint_by_dir(
  std::string_view dirname) const
{
  uint8_t res = LEVEL_DB;
  if (dirname.length() > 5) {
    // the "db.slow" and "db.wal" directory names are hard-coded to
    // match up with bluestore.  the slow device is always the second
    // one (when a dedicated block.db device is present and used at
    // bdev 0).  the wal device is always last.
    if (boost::algorithm::ends_with(dirname, ".slow")) {
      res = LEVEL_SLOW;
    } else if (boost::algorithm::ends_with(dirname, ".wal")) {
      res = LEVEL_WAL;
    }
  }
  return reinterpret_cast<void*>(res);
}
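// Example of the directory -> hint mapping above:
//   "db"      -> LEVEL_DB   (default; name too short to carry a suffix)
//   "db.wal"  -> LEVEL_WAL  (ends with ".wal")
//   "db.slow" -> LEVEL_SLOW (ends with ".slow")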
sout << "DB"; break; case LEVEL_SLOW: sout << "SLOW"; break; case LEVEL_MAX: sout << "TOTALS"; break; } for (size_t d = 0; d < max_x - 1; d++) { sout.setf(std::ios::left, std::ios::adjustfield); sout.width(width); sout << stringify(byte_u_t(per_level_per_dev_max.at(d, l))); } sout.setf(std::ios::left, std::ios::adjustfield); sout.width(width); sout << stringify(byte_u_t(per_level_per_dev_max.at(max_x - 1, l))); if (l < max_y - 1) { sout << std::endl; } } } // ======================================================= // =======================================================