Diffstat (limited to 'src/os/bluestore/bluestore_types.h')
-rw-r--r--  src/os/bluestore/bluestore_types.h  1368
1 file changed, 1368 insertions, 0 deletions
diff --git a/src/os/bluestore/bluestore_types.h b/src/os/bluestore/bluestore_types.h
new file mode 100644
index 000000000..b21531bfe
--- /dev/null
+++ b/src/os/bluestore/bluestore_types.h
@@ -0,0 +1,1368 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_OSD_BLUESTORE_BLUESTORE_TYPES_H
+#define CEPH_OSD_BLUESTORE_BLUESTORE_TYPES_H
+
+#include <ostream>
+#include <type_traits>
+#include <vector>
+#include <array>
+#include "include/mempool.h"
+#include "include/types.h"
+#include "include/interval_set.h"
+#include "include/utime.h"
+#include "common/hobject.h"
+#include "compressor/Compressor.h"
+#include "common/Checksummer.h"
+#include "include/mempool.h"
+#include "include/ceph_hash.h"
+
+namespace ceph {
+ class Formatter;
+}
+
+/// label for block device
+struct bluestore_bdev_label_t {
+ uuid_d osd_uuid; ///< osd uuid
+ uint64_t size = 0; ///< device size
+ utime_t btime; ///< birth time
+ std::string description; ///< device description
+
+ std::map<std::string,std::string> meta; ///< {read,write}_meta() content from ObjectStore
+
+ void encode(ceph::buffer::list& bl) const;
+ void decode(ceph::buffer::list::const_iterator& p);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<bluestore_bdev_label_t*>& o);
+};
+WRITE_CLASS_ENCODER(bluestore_bdev_label_t)
+
+std::ostream& operator<<(std::ostream& out, const bluestore_bdev_label_t& l);
+
+/// collection metadata
+struct bluestore_cnode_t {
+ uint32_t bits; ///< how many bits of coll pgid are significant
+
+ explicit bluestore_cnode_t(int b=0) : bits(b) {}
+
+ DENC(bluestore_cnode_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.bits, p);
+ DENC_FINISH(p);
+ }
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<bluestore_cnode_t*>& o);
+};
+WRITE_CLASS_DENC(bluestore_cnode_t)
+
+std::ostream& operator<<(std::ostream& out, const bluestore_cnode_t& l);
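+
+// Illustrative note (not authoritative): bits is consulted on collection
+// split/merge; an object belongs to the collection when the significant
+// bits of its hash match the collection's pgid, e.g.
+//
+//   bluestore_cnode_t cnode(4);  // 4 bits of the pgid are significant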
+
+template <typename OFFS_TYPE, typename LEN_TYPE>
+struct bluestore_interval_t
+{
+ static const uint64_t INVALID_OFFSET = ~0ull;
+
+ OFFS_TYPE offset = 0;
+ LEN_TYPE length = 0;
+
+ bluestore_interval_t(){}
+ bluestore_interval_t(uint64_t o, uint64_t l) : offset(o), length(l) {}
+
+ bool is_valid() const {
+ return offset != INVALID_OFFSET;
+ }
+ uint64_t end() const {
+ return offset != INVALID_OFFSET ? offset + length : INVALID_OFFSET;
+ }
+
+ bool operator==(const bluestore_interval_t& other) const {
+ return offset == other.offset && length == other.length;
+ }
+
+};
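+
+// A minimal usage sketch (illustrative only):
+//
+//   bluestore_interval_t<uint64_t, uint32_t> iv(0x1000, 0x2000);
+//   // iv.is_valid() == true,  iv.end() == 0x3000
+//   bluestore_interval_t<uint64_t, uint32_t> hole(
+//     bluestore_interval_t<uint64_t, uint32_t>::INVALID_OFFSET, 0x2000);
+//   // hole.is_valid() == false, hole.end() == INVALID_OFFSET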
+
+/// pextent: physical extent
+struct bluestore_pextent_t : public bluestore_interval_t<uint64_t, uint32_t>
+{
+ bluestore_pextent_t() {}
+ bluestore_pextent_t(uint64_t o, uint64_t l) : bluestore_interval_t(o, l) {}
+ bluestore_pextent_t(const bluestore_interval_t &ext) :
+ bluestore_interval_t(ext.offset, ext.length) {}
+
+ DENC(bluestore_pextent_t, v, p) {
+ denc_lba(v.offset, p);
+ denc_varint_lowz(v.length, p);
+ }
+
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<bluestore_pextent_t*>& ls);
+};
+WRITE_CLASS_DENC(bluestore_pextent_t)
+
+std::ostream& operator<<(std::ostream& out, const bluestore_pextent_t& o);
+
+typedef mempool::bluestore_cache_other::vector<bluestore_pextent_t> PExtentVector;
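+
+// Sketch (illustrative): a blob whose first 64 KiB sit at disk offset
+// 0x10000 and whose next 64 KiB were never allocated could be described as
+//
+//   PExtentVector ev;
+//   ev.emplace_back(0x10000, 0x10000);  // allocated extent
+//   ev.emplace_back(bluestore_pextent_t::INVALID_OFFSET, 0x10000);  // hole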
+
+template<>
+struct denc_traits<PExtentVector> {
+ static constexpr bool supported = true;
+ static constexpr bool bounded = false;
+ static constexpr bool featured = false;
+ static constexpr bool need_contiguous = true;
+ static void bound_encode(const PExtentVector& v, size_t& p) {
+ p += sizeof(uint32_t);
+ const auto size = v.size();
+ if (size) {
+ size_t per = 0;
+ denc(v.front(), per);
+ p += per * size;
+ }
+ }
+ static void encode(const PExtentVector& v,
+ ceph::buffer::list::contiguous_appender& p) {
+ denc_varint(v.size(), p);
+ for (auto& i : v) {
+ denc(i, p);
+ }
+ }
+ static void decode(PExtentVector& v, ceph::buffer::ptr::const_iterator& p) {
+ unsigned num;
+ denc_varint(num, p);
+ v.clear();
+ v.resize(num);
+ for (unsigned i=0; i<num; ++i) {
+ denc(v[i], p);
+ }
+ }
+};
+
+/// extent_map: a map of reference-counted extents
+struct bluestore_extent_ref_map_t {
+ struct record_t {
+ uint32_t length;
+ uint32_t refs;
+ record_t(uint32_t l=0, uint32_t r=0) : length(l), refs(r) {}
+ DENC(bluestore_extent_ref_map_t::record_t, v, p) {
+ denc_varint_lowz(v.length, p);
+ denc_varint(v.refs, p);
+ }
+ };
+
+ typedef mempool::bluestore_cache_other::map<uint64_t,record_t> map_t;
+ map_t ref_map;
+
+ void _check() const;
+ void _maybe_merge_left(map_t::iterator& p);
+
+ void clear() {
+ ref_map.clear();
+ }
+ bool empty() const {
+ return ref_map.empty();
+ }
+
+ void get(uint64_t offset, uint32_t len);
+ void put(uint64_t offset, uint32_t len, PExtentVector *release,
+ bool *maybe_unshared);
+
+ bool contains(uint64_t offset, uint32_t len) const;
+ bool intersects(uint64_t offset, uint32_t len) const;
+
+ void bound_encode(size_t& p) const {
+ denc_varint((uint32_t)0, p);
+ if (!ref_map.empty()) {
+ size_t elem_size = 0;
+ denc_varint_lowz((uint64_t)0, elem_size);
+ ref_map.begin()->second.bound_encode(elem_size);
+ p += elem_size * ref_map.size();
+ }
+ }
+ void encode(ceph::buffer::list::contiguous_appender& p) const {
+ const uint32_t n = ref_map.size();
+ denc_varint(n, p);
+ if (n) {
+ auto i = ref_map.begin();
+ denc_varint_lowz(i->first, p);
+ i->second.encode(p);
+ int64_t pos = i->first;
+ while (++i != ref_map.end()) {
+ denc_varint_lowz((int64_t)i->first - pos, p);
+ i->second.encode(p);
+ pos = i->first;
+ }
+ }
+ }
+ void decode(ceph::buffer::ptr::const_iterator& p) {
+ uint32_t n;
+ denc_varint(n, p);
+ if (n) {
+ int64_t pos;
+ denc_varint_lowz(pos, p);
+ ref_map[pos].decode(p);
+ while (--n) {
+ int64_t delta;
+ denc_varint_lowz(delta, p);
+ pos += delta;
+ ref_map[pos].decode(p);
+ }
+ }
+ }
+
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<bluestore_extent_ref_map_t*>& o);
+};
+WRITE_CLASS_DENC(bluestore_extent_ref_map_t)
+
+
+std::ostream& operator<<(std::ostream& out, const bluestore_extent_ref_map_t& rm);
+static inline bool operator==(const bluestore_extent_ref_map_t::record_t& l,
+ const bluestore_extent_ref_map_t::record_t& r) {
+ return l.length == r.length && l.refs == r.refs;
+}
+static inline bool operator==(const bluestore_extent_ref_map_t& l,
+ const bluestore_extent_ref_map_t& r) {
+ return l.ref_map == r.ref_map;
+}
+static inline bool operator!=(const bluestore_extent_ref_map_t& l,
+ const bluestore_extent_ref_map_t& r) {
+ return !(l == r);
+}
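+
+// Reference-counting sketch (illustrative; get()/put() are defined in the
+// .cc file and are authoritative):
+//
+//   bluestore_extent_ref_map_t m;
+//   m.get(0, 0x1000);   // range 0..0x1000 now has one reference
+//   m.get(0, 0x1000);   // ... and now two
+//   PExtentVector release;
+//   bool maybe_unshared = false;
+//   m.put(0, 0x1000, &release, &maybe_unshared);  // back to one reference;
+//                                                 // nothing released
+//   m.put(0, 0x1000, &release, &maybe_unshared);  // last reference dropped;
+//                                                 // the range lands in release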
+
+/// blob_use_tracker: a set of per-alloc unit ref buckets to track blob usage
+struct bluestore_blob_use_tracker_t {
+  // N.B.: There is no need to shrink au_size/num_au as much as possible
+  // (e.g. down to a single byte for au_size) since:
+  // 1) The struct isn't packed, hence it's padded anyway. And even if it
+  //    were packed, see 2).
+  // 2) The memory manager has its own allocation granularity, most
+  //    probably >= 8 bytes.
+  //
+  uint32_t au_size;  // Allocation (= tracking) unit size;
+                     // == 0 if uninitialized
+  uint32_t num_au;   // Number of allocation units tracked;
+                     // == 0 if a single unit or the whole blob is tracked
+  uint32_t alloc_au; // Number of allocation units allocated
+
+ union {
+ uint32_t* bytes_per_au;
+ uint32_t total_bytes;
+ };
+
+ bluestore_blob_use_tracker_t()
+ : au_size(0), num_au(0), alloc_au(0), bytes_per_au(nullptr) {
+ }
+ bluestore_blob_use_tracker_t(const bluestore_blob_use_tracker_t& tracker);
+ bluestore_blob_use_tracker_t& operator=(const bluestore_blob_use_tracker_t& rhs);
+ ~bluestore_blob_use_tracker_t() {
+ clear();
+ }
+
+ void clear() {
+ release(alloc_au, bytes_per_au);
+ num_au = 0;
+ alloc_au = 0;
+    bytes_per_au = nullptr;
+ au_size = 0;
+ }
+
+ uint32_t get_referenced_bytes() const {
+ uint32_t total = 0;
+ if (!num_au) {
+ total = total_bytes;
+ } else {
+ for (size_t i = 0; i < num_au; ++i) {
+ total += bytes_per_au[i];
+ }
+ }
+ return total;
+ }
+ bool is_not_empty() const {
+ if (!num_au) {
+ return total_bytes != 0;
+ } else {
+ for (size_t i = 0; i < num_au; ++i) {
+ if (bytes_per_au[i]) {
+ return true;
+ }
+ }
+ }
+ return false;
+ }
+ bool is_empty() const {
+ return !is_not_empty();
+ }
+ void prune_tail(uint32_t new_len) {
+ if (num_au) {
+ new_len = round_up_to(new_len, au_size);
+ uint32_t _num_au = new_len / au_size;
+ ceph_assert(_num_au <= num_au);
+ if (_num_au) {
+ num_au = _num_au; // bytes_per_au array is left unmodified
+ } else {
+ clear();
+ }
+ }
+ }
+ void add_tail(uint32_t new_len, uint32_t _au_size) {
+ auto full_size = au_size * (num_au ? num_au : 1);
+ ceph_assert(new_len >= full_size);
+ if (new_len == full_size) {
+ return;
+ }
+ if (!num_au) {
+ uint32_t old_total = total_bytes;
+ total_bytes = 0;
+ init(new_len, _au_size);
+ ceph_assert(num_au);
+ bytes_per_au[0] = old_total;
+ } else {
+ ceph_assert(_au_size == au_size);
+ new_len = round_up_to(new_len, au_size);
+ uint32_t _num_au = new_len / au_size;
+ ceph_assert(_num_au >= num_au);
+ if (_num_au > num_au) {
+ auto old_bytes = bytes_per_au;
+ auto old_num_au = num_au;
+ auto old_alloc_au = alloc_au;
+ alloc_au = num_au = 0; // to bypass an assertion in allocate()
+ bytes_per_au = nullptr;
+ allocate(_num_au);
+ for (size_t i = 0; i < old_num_au; i++) {
+ bytes_per_au[i] = old_bytes[i];
+ }
+ for (size_t i = old_num_au; i < num_au; i++) {
+ bytes_per_au[i] = 0;
+ }
+ release(old_alloc_au, old_bytes);
+ }
+ }
+ }
+
+ void init(
+ uint32_t full_length,
+ uint32_t _au_size);
+
+ void get(
+ uint32_t offset,
+ uint32_t len);
+
+  /// put: returns true if the blob has no references any more after the
+  /// call; in that case *release is left unfilled, for performance.
+  /// Returns false if some references to the blob remain; in that case
+  /// *release contains pextents (identified by their offsets relative to
+  /// the blob start) that are no longer used and can be safely
+  /// deallocated.
+ bool put(
+ uint32_t offset,
+ uint32_t len,
+ PExtentVector *release);
+
+ bool can_split() const;
+ bool can_split_at(uint32_t blob_offset) const;
+ void split(
+ uint32_t blob_offset,
+ bluestore_blob_use_tracker_t* r);
+
+ bool equal(
+ const bluestore_blob_use_tracker_t& other) const;
+
+ void bound_encode(size_t& p) const {
+ denc_varint(au_size, p);
+ if (au_size) {
+ denc_varint(num_au, p);
+ if (!num_au) {
+ denc_varint(total_bytes, p);
+ } else {
+ size_t elem_size = 0;
+ denc_varint((uint32_t)0, elem_size);
+ p += elem_size * num_au;
+ }
+ }
+ }
+ void encode(ceph::buffer::list::contiguous_appender& p) const {
+ denc_varint(au_size, p);
+ if (au_size) {
+ denc_varint(num_au, p);
+ if (!num_au) {
+ denc_varint(total_bytes, p);
+      } else {
+ for (size_t i = 0; i < num_au; ++i) {
+ denc_varint(bytes_per_au[i], p);
+ }
+ }
+ }
+ }
+ void decode(ceph::buffer::ptr::const_iterator& p) {
+ clear();
+ denc_varint(au_size, p);
+ if (au_size) {
+ uint32_t _num_au;
+ denc_varint(_num_au, p);
+ if (!_num_au) {
+ num_au = 0;
+ denc_varint(total_bytes, p);
+ } else {
+ allocate(_num_au);
+ for (size_t i = 0; i < _num_au; ++i) {
+ denc_varint(bytes_per_au[i], p);
+ }
+ }
+ }
+ }
+
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<bluestore_blob_use_tracker_t*>& o);
+private:
+ void allocate(uint32_t _num_au);
+ void release(uint32_t _num_au, uint32_t* ptr);
+};
+WRITE_CLASS_DENC(bluestore_blob_use_tracker_t)
+std::ostream& operator<<(std::ostream& out, const bluestore_blob_use_tracker_t& rm);
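+
+// Usage sketch (illustrative): track a 128 KiB blob at 64 KiB granularity.
+//
+//   bluestore_blob_use_tracker_t t;
+//   t.init(0x20000, 0x10000);  // two allocation units
+//   t.get(0, 0x1000);          // 0x1000 bytes referenced in the first AU
+//   PExtentVector release;
+//   bool now_empty = t.put(0, 0x1000, &release);  // true: no refs remain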
+
+/// blob: a piece of data on disk
+struct bluestore_blob_t {
+private:
+ PExtentVector extents; ///< raw data position on device
+ uint32_t logical_length = 0; ///< original length of data stored in the blob
+ uint32_t compressed_length = 0; ///< compressed length if any
+
+public:
+ enum {
+ LEGACY_FLAG_MUTABLE = 1, ///< [legacy] blob can be overwritten or split
+ FLAG_COMPRESSED = 2, ///< blob is compressed
+ FLAG_CSUM = 4, ///< blob has checksums
+    FLAG_HAS_UNUSED = 8, ///< blob has an "unused" bitmap
+ FLAG_SHARED = 16, ///< blob is shared; see external SharedBlob
+ };
+ static std::string get_flags_string(unsigned flags);
+
+ uint32_t flags = 0; ///< FLAG_*
+
+ typedef uint16_t unused_t;
+ unused_t unused = 0; ///< portion that has never been written to (bitmap)
+
+ uint8_t csum_type = Checksummer::CSUM_NONE; ///< CSUM_*
+ uint8_t csum_chunk_order = 0; ///< csum block size is 1<<block_order bytes
+
+  ceph::buffer::ptr csum_data; ///< opaque array of csum data
+
+ bluestore_blob_t(uint32_t f = 0) : flags(f) {}
+
+ const PExtentVector& get_extents() const {
+ return extents;
+ }
+ PExtentVector& dirty_extents() {
+ return extents;
+ }
+
+ DENC_HELPERS;
+ void bound_encode(size_t& p, uint64_t struct_v) const {
+ ceph_assert(struct_v == 1 || struct_v == 2);
+ denc(extents, p);
+ denc_varint(flags, p);
+ denc_varint_lowz(logical_length, p);
+ denc_varint_lowz(compressed_length, p);
+ denc(csum_type, p);
+ denc(csum_chunk_order, p);
+ denc_varint(csum_data.length(), p);
+ p += csum_data.length();
+ p += sizeof(unused_t);
+ }
+
+ void encode(ceph::buffer::list::contiguous_appender& p, uint64_t struct_v) const {
+ ceph_assert(struct_v == 1 || struct_v == 2);
+ denc(extents, p);
+ denc_varint(flags, p);
+ if (is_compressed()) {
+ denc_varint_lowz(logical_length, p);
+ denc_varint_lowz(compressed_length, p);
+ }
+ if (has_csum()) {
+ denc(csum_type, p);
+ denc(csum_chunk_order, p);
+ denc_varint(csum_data.length(), p);
+ memcpy(p.get_pos_add(csum_data.length()), csum_data.c_str(),
+ csum_data.length());
+ }
+ if (has_unused()) {
+ denc(unused, p);
+ }
+ }
+
+ void decode(ceph::buffer::ptr::const_iterator& p, uint64_t struct_v) {
+ ceph_assert(struct_v == 1 || struct_v == 2);
+ denc(extents, p);
+ denc_varint(flags, p);
+ if (is_compressed()) {
+ denc_varint_lowz(logical_length, p);
+ denc_varint_lowz(compressed_length, p);
+ } else {
+ logical_length = get_ondisk_length();
+ }
+ if (has_csum()) {
+ denc(csum_type, p);
+ denc(csum_chunk_order, p);
+ int len;
+ denc_varint(len, p);
+ csum_data = p.get_ptr(len);
+ csum_data.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
+ }
+ if (has_unused()) {
+ denc(unused, p);
+ }
+ }
+
+ bool can_split() const {
+ return
+ !has_flag(FLAG_SHARED) &&
+ !has_flag(FLAG_COMPRESSED) &&
+ !has_flag(FLAG_HAS_UNUSED); // splitting unused set is complex
+ }
+ bool can_split_at(uint32_t blob_offset) const {
+ return !has_csum() || blob_offset % get_csum_chunk_size() == 0;
+ }
+
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<bluestore_blob_t*>& ls);
+
+ bool has_flag(unsigned f) const {
+ return flags & f;
+ }
+ void set_flag(unsigned f) {
+ flags |= f;
+ }
+ void clear_flag(unsigned f) {
+ flags &= ~f;
+ }
+ std::string get_flags_string() const {
+ return get_flags_string(flags);
+ }
+
+ void set_compressed(uint64_t clen_orig, uint64_t clen) {
+ set_flag(FLAG_COMPRESSED);
+ logical_length = clen_orig;
+ compressed_length = clen;
+ }
+ bool is_mutable() const {
+ return !is_compressed() && !is_shared();
+ }
+ bool is_compressed() const {
+ return has_flag(FLAG_COMPRESSED);
+ }
+ bool has_csum() const {
+ return has_flag(FLAG_CSUM);
+ }
+ bool has_unused() const {
+ return has_flag(FLAG_HAS_UNUSED);
+ }
+ bool is_shared() const {
+ return has_flag(FLAG_SHARED);
+ }
+
+ /// return chunk (i.e. min readable block) size for the blob
+ uint64_t get_chunk_size(uint64_t dev_block_size) const {
+ return has_csum() ?
+ std::max<uint64_t>(dev_block_size, get_csum_chunk_size()) : dev_block_size;
+ }
+ uint32_t get_csum_chunk_size() const {
+ return 1 << csum_chunk_order;
+ }
+ uint32_t get_compressed_payload_length() const {
+ return is_compressed() ? compressed_length : 0;
+ }
+ uint64_t calc_offset(uint64_t x_off, uint64_t *plen) const {
+ auto p = extents.begin();
+ ceph_assert(p != extents.end());
+ while (x_off >= p->length) {
+ x_off -= p->length;
+ ++p;
+ ceph_assert(p != extents.end());
+ }
+ if (plen)
+ *plen = p->length - x_off;
+ return p->offset + x_off;
+ }
+
+  // validate whether the status of pextents within the given range
+  // meets the requirement (allocated or unallocated).
+ bool _validate_range(uint64_t b_off, uint64_t b_len,
+ bool require_allocated) const {
+ auto p = extents.begin();
+ ceph_assert(p != extents.end());
+ while (b_off >= p->length) {
+ b_off -= p->length;
+ if (++p == extents.end())
+ return false;
+ }
+ b_len += b_off;
+ while (b_len) {
+ if (require_allocated != p->is_valid()) {
+ return false;
+ }
+ if (p->length >= b_len) {
+ return true;
+ }
+ b_len -= p->length;
+ if (++p == extents.end())
+ return false;
+ }
+ ceph_abort_msg("we should not get here");
+ return false;
+ }
+
+ /// return true if the entire range is allocated
+ /// (mapped to extents on disk)
+ bool is_allocated(uint64_t b_off, uint64_t b_len) const {
+ return _validate_range(b_off, b_len, true);
+ }
+
+ /// return true if the entire range is unallocated
+ /// (not mapped to extents on disk)
+ bool is_unallocated(uint64_t b_off, uint64_t b_len) const {
+ return _validate_range(b_off, b_len, false);
+ }
+
+ /// return true if the logical range has never been used
+ bool is_unused(uint64_t offset, uint64_t length) const {
+ if (!has_unused()) {
+ return false;
+ }
+ ceph_assert(!is_compressed());
+ uint64_t blob_len = get_logical_length();
+ ceph_assert((blob_len % (sizeof(unused)*8)) == 0);
+ ceph_assert(offset + length <= blob_len);
+ uint64_t chunk_size = blob_len / (sizeof(unused)*8);
+ uint64_t start = offset / chunk_size;
+ uint64_t end = round_up_to(offset + length, chunk_size) / chunk_size;
+ auto i = start;
+ while (i < end && (unused & (1u << i))) {
+ i++;
+ }
+ return i >= end;
+ }
+
+ /// mark a range that has never been used
+ void add_unused(uint64_t offset, uint64_t length) {
+ ceph_assert(!is_compressed());
+ uint64_t blob_len = get_logical_length();
+ ceph_assert((blob_len % (sizeof(unused)*8)) == 0);
+ ceph_assert(offset + length <= blob_len);
+ uint64_t chunk_size = blob_len / (sizeof(unused)*8);
+ uint64_t start = round_up_to(offset, chunk_size) / chunk_size;
+ uint64_t end = (offset + length) / chunk_size;
+ for (auto i = start; i < end; ++i) {
+ unused |= (1u << i);
+ }
+ if (start != end) {
+ set_flag(FLAG_HAS_UNUSED);
+ }
+ }
+
+ /// indicate that a range has (now) been used.
+ void mark_used(uint64_t offset, uint64_t length) {
+ if (has_unused()) {
+ ceph_assert(!is_compressed());
+ uint64_t blob_len = get_logical_length();
+ ceph_assert((blob_len % (sizeof(unused)*8)) == 0);
+ ceph_assert(offset + length <= blob_len);
+ uint64_t chunk_size = blob_len / (sizeof(unused)*8);
+ uint64_t start = offset / chunk_size;
+ uint64_t end = round_up_to(offset + length, chunk_size) / chunk_size;
+ for (auto i = start; i < end; ++i) {
+ unused &= ~(1u << i);
+ }
+ if (unused == 0) {
+ clear_flag(FLAG_HAS_UNUSED);
+ }
+ }
+ }
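+
+  // Worked example (illustrative) for the unused bitmap: unused_t is 16
+  // bits wide, so a blob with get_logical_length() == 0x10000 is covered
+  // at 0x1000 (4 KiB) per bit:
+  //
+  //   b.add_unused(0, 0x8000);      // sets bits 0..7
+  //   b.is_unused(0x1000, 0x1000);  // true: bit 1 is set
+  //   b.mark_used(0, 0x2000);       // clears bits 0 and 1
+  //   b.is_unused(0x1000, 0x1000);  // false now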
+
+  // map_f_invoke templates mask the parameters that the provided
+  // callback does not expect
+ template<class F, typename std::enable_if<std::is_invocable_r_v<
+ int,
+ F,
+ uint64_t,
+ uint64_t>>::type* = nullptr>
+ int map_f_invoke(uint64_t lo,
+ const bluestore_pextent_t& p,
+ uint64_t o,
+ uint64_t l, F&& f) const{
+ return f(o, l);
+ }
+
+ template<class F, typename std::enable_if<std::is_invocable_r_v<
+ int,
+ F,
+ uint64_t,
+ uint64_t,
+ uint64_t>>::type * = nullptr>
+ int map_f_invoke(uint64_t lo,
+ const bluestore_pextent_t& p,
+ uint64_t o,
+ uint64_t l, F&& f) const {
+ return f(lo, o, l);
+ }
+
+ template<class F, typename std::enable_if<std::is_invocable_r_v<
+ int,
+ F,
+ const bluestore_pextent_t&,
+ uint64_t,
+ uint64_t>>::type * = nullptr>
+ int map_f_invoke(uint64_t lo,
+ const bluestore_pextent_t& p,
+ uint64_t o,
+ uint64_t l, F&& f) const {
+ return f(p, o, l);
+ }
+
+ template<class F>
+ int map(uint64_t x_off, uint64_t x_len, F&& f) const {
+ auto x_off0 = x_off;
+ auto p = extents.begin();
+ ceph_assert(p != extents.end());
+ while (x_off >= p->length) {
+ x_off -= p->length;
+ ++p;
+ ceph_assert(p != extents.end());
+ }
+ while (x_len > 0 && p != extents.end()) {
+ uint64_t l = std::min(p->length - x_off, x_len);
+ int r = map_f_invoke(x_off0, *p, p->offset + x_off, l, f);
+ if (r < 0)
+ return r;
+ x_off = 0;
+ x_len -= l;
+ x_off0 += l;
+ ++p;
+ }
+ return 0;
+ }
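+
+  // Callback sketch (illustrative): map() accepts any of the three
+  // signatures dispatched by the map_f_invoke helpers above, e.g. for a
+  // blob b:
+  //
+  //   uint64_t disk_bytes = 0;
+  //   b.map(0, b.get_logical_length(),
+  //         [&](uint64_t offset, uint64_t length) {
+  //           disk_bytes += length;  // physical offset/length pair
+  //           return 0;              // non-negative: keep iterating
+  //         });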
+
+ template<class F>
+ void map_bl(uint64_t x_off,
+ ceph::buffer::list& bl,
+ F&& f) const {
+ static_assert(std::is_invocable_v<F, uint64_t, ceph::buffer::list&>);
+
+ auto p = extents.begin();
+ ceph_assert(p != extents.end());
+ while (x_off >= p->length) {
+ x_off -= p->length;
+ ++p;
+ ceph_assert(p != extents.end());
+ }
+ ceph::buffer::list::iterator it = bl.begin();
+ uint64_t x_len = bl.length();
+ while (x_len > 0) {
+ ceph_assert(p != extents.end());
+ uint64_t l = std::min(p->length - x_off, x_len);
+ ceph::buffer::list t;
+ it.copy(l, t);
+ f(p->offset + x_off, t);
+ x_off = 0;
+ x_len -= l;
+ ++p;
+ }
+ }
+
+ uint32_t get_ondisk_length() const {
+ uint32_t len = 0;
+ for (auto &p : extents) {
+ len += p.length;
+ }
+ return len;
+ }
+
+ uint32_t get_logical_length() const {
+ return logical_length;
+ }
+ size_t get_csum_value_size() const;
+
+ size_t get_csum_count() const {
+ size_t vs = get_csum_value_size();
+ if (!vs)
+ return 0;
+ return csum_data.length() / vs;
+ }
+ uint64_t get_csum_item(unsigned i) const {
+ size_t cs = get_csum_value_size();
+ const char *p = csum_data.c_str();
+ switch (cs) {
+ case 0:
+ ceph_abort_msg("no csum data, bad index");
+ case 1:
+ return reinterpret_cast<const uint8_t*>(p)[i];
+ case 2:
+ return reinterpret_cast<const ceph_le16*>(p)[i];
+ case 4:
+ return reinterpret_cast<const ceph_le32*>(p)[i];
+ case 8:
+ return reinterpret_cast<const ceph_le64*>(p)[i];
+ default:
+ ceph_abort_msg("unrecognized csum word size");
+ }
+ }
+ const char *get_csum_item_ptr(unsigned i) const {
+ size_t cs = get_csum_value_size();
+ return csum_data.c_str() + (cs * i);
+ }
+ char *get_csum_item_ptr(unsigned i) {
+ size_t cs = get_csum_value_size();
+ return csum_data.c_str() + (cs * i);
+ }
+
+ void init_csum(unsigned type, unsigned order, unsigned len) {
+ flags |= FLAG_CSUM;
+ csum_type = type;
+ csum_chunk_order = order;
+ csum_data = ceph::buffer::create(get_csum_value_size() * len / get_csum_chunk_size());
+ csum_data.zero();
+ csum_data.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
+ }
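+
+  // Sizing sketch (illustrative): crc32c values are 4 bytes wide, so a
+  // 64 KiB blob with order-12 (4 KiB) csum chunks gets
+  // 4 * 0x10000 / 0x1000 = 64 bytes of csum_data:
+  //
+  //   b.init_csum(Checksummer::CSUM_CRC32C, 12, 0x10000);
+  //   // b.get_csum_chunk_size() == 0x1000, b.get_csum_count() == 16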
+
+ /// calculate csum for the buffer at the given b_off
+ void calc_csum(uint64_t b_off, const ceph::buffer::list& bl);
+
+  /// verify csum: return -EOPNOTSUPP for an unsupported checksum type;
+  /// return -1 and a valid (non-negative) b_bad_off for a checksum error;
+  /// return 0 if all is well.
+ int verify_csum(uint64_t b_off, const ceph::buffer::list& bl, int* b_bad_off,
+ uint64_t *bad_csum) const;
+
+ bool can_prune_tail() const {
+ return
+ extents.size() > 1 && // if it's all invalid it's not pruning.
+ !extents.back().is_valid() &&
+ !has_unused();
+ }
+ void prune_tail() {
+ const auto &p = extents.back();
+ logical_length -= p.length;
+ extents.pop_back();
+ if (has_csum()) {
+ ceph::buffer::ptr t;
+ t.swap(csum_data);
+ csum_data = ceph::buffer::ptr(t.c_str(),
+ get_logical_length() / get_csum_chunk_size() *
+ get_csum_value_size());
+ }
+ }
+ void add_tail(uint32_t new_len) {
+ ceph_assert(is_mutable());
+ ceph_assert(!has_unused());
+ ceph_assert(new_len > logical_length);
+ extents.emplace_back(
+ bluestore_pextent_t(
+ bluestore_pextent_t::INVALID_OFFSET,
+ new_len - logical_length));
+ logical_length = new_len;
+ if (has_csum()) {
+ ceph::buffer::ptr t;
+ t.swap(csum_data);
+ csum_data = ceph::buffer::create(
+ get_csum_value_size() * logical_length / get_csum_chunk_size());
+ csum_data.copy_in(0, t.length(), t.c_str());
+ csum_data.zero(t.length(), csum_data.length() - t.length());
+ }
+ }
+ uint32_t get_release_size(uint32_t min_alloc_size) const {
+ if (is_compressed()) {
+ return get_logical_length();
+ }
+ uint32_t res = get_csum_chunk_size();
+ if (!has_csum() || res < min_alloc_size) {
+ res = min_alloc_size;
+ }
+ return res;
+ }
+
+ void split(uint32_t blob_offset, bluestore_blob_t& rb);
+ void allocated(uint32_t b_off, uint32_t length, const PExtentVector& allocs);
+ void allocated_test(const bluestore_pextent_t& alloc); // intended for UT only
+
+  /// updates the blob's pextents container and returns unused pextents
+  /// eligible for release.
+  /// all - indicates that the whole blob is to be released.
+  /// logical - specifies the set of logical extents within the blob
+  /// that are to be released.
+  /// Returns true if the blob has no more valid pextents.
+ bool release_extents(
+ bool all,
+ const PExtentVector& logical,
+ PExtentVector* r);
+};
+WRITE_CLASS_DENC_FEATURED(bluestore_blob_t)
+
+std::ostream& operator<<(std::ostream& out, const bluestore_blob_t& o);
+
+
+/// shared blob state
+struct bluestore_shared_blob_t {
+ MEMPOOL_CLASS_HELPERS();
+  uint64_t sbid; ///< shared blob id
+ bluestore_extent_ref_map_t ref_map; ///< shared blob extents
+
+ bluestore_shared_blob_t(uint64_t _sbid) : sbid(_sbid) {}
+ bluestore_shared_blob_t(uint64_t _sbid,
+ bluestore_extent_ref_map_t&& _ref_map )
+ : sbid(_sbid), ref_map(std::move(_ref_map)) {}
+
+ DENC(bluestore_shared_blob_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.ref_map, p);
+ DENC_FINISH(p);
+ }
+
+
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<bluestore_shared_blob_t*>& ls);
+
+ bool empty() const {
+ return ref_map.empty();
+ }
+};
+WRITE_CLASS_DENC(bluestore_shared_blob_t)
+
+std::ostream& operator<<(std::ostream& out, const bluestore_shared_blob_t& o);
+
+/// onode: per-object metadata
+struct bluestore_onode_t {
+ uint64_t nid = 0; ///< numeric id (locally unique)
+ uint64_t size = 0; ///< object size
+ // mempool to be assigned to buffer::ptr manually
+ std::map<mempool::bluestore_cache_meta::string, ceph::buffer::ptr> attrs;
+
+ struct shard_info {
+ uint32_t offset = 0; ///< logical offset for start of shard
+ uint32_t bytes = 0; ///< encoded bytes
+ DENC(shard_info, v, p) {
+ denc_varint(v.offset, p);
+ denc_varint(v.bytes, p);
+ }
+ void dump(ceph::Formatter *f) const;
+ };
+  std::vector<shard_info> extent_map_shards; ///< extent map shards (if any)
+
+ uint32_t expected_object_size = 0;
+ uint32_t expected_write_size = 0;
+ uint32_t alloc_hint_flags = 0;
+
+ uint8_t flags = 0;
+
+ enum {
+ FLAG_OMAP = 1, ///< object may have omap data
+ FLAG_PGMETA_OMAP = 2, ///< omap data is in meta omap prefix
+ FLAG_PERPOOL_OMAP = 4, ///< omap data is in per-pool prefix; per-pool keys
+ FLAG_PERPG_OMAP = 8, ///< omap data is in per-pg prefix; per-pg keys
+ };
+
+ std::string get_flags_string() const {
+ std::string s;
+ if (flags & FLAG_OMAP) {
+ s = "omap";
+ }
+ if (flags & FLAG_PGMETA_OMAP) {
+ s += "+pgmeta_omap";
+ }
+ if (flags & FLAG_PERPOOL_OMAP) {
+ s += "+per_pool_omap";
+ }
+ if (flags & FLAG_PERPG_OMAP) {
+ s += "+per_pg_omap";
+ }
+ return s;
+ }
+
+ bool has_flag(unsigned f) const {
+ return flags & f;
+ }
+
+ void set_flag(unsigned f) {
+ flags |= f;
+ }
+
+ void clear_flag(unsigned f) {
+ flags &= ~f;
+ }
+
+ bool has_omap() const {
+ return has_flag(FLAG_OMAP);
+ }
+
+ static bool is_pgmeta_omap(uint8_t flags) {
+ return flags & FLAG_PGMETA_OMAP;
+ }
+ static bool is_perpool_omap(uint8_t flags) {
+ return flags & FLAG_PERPOOL_OMAP;
+ }
+ static bool is_perpg_omap(uint8_t flags) {
+ return flags & FLAG_PERPG_OMAP;
+ }
+ bool is_pgmeta_omap() const {
+ return has_flag(FLAG_PGMETA_OMAP);
+ }
+ bool is_perpool_omap() const {
+ return has_flag(FLAG_PERPOOL_OMAP);
+ }
+ bool is_perpg_omap() const {
+ return has_flag(FLAG_PERPG_OMAP);
+ }
+
+ void set_omap_flags(bool legacy) {
+ set_flag(FLAG_OMAP | (legacy ? 0 : (FLAG_PERPOOL_OMAP | FLAG_PERPG_OMAP)));
+ }
+ void set_omap_flags_pgmeta() {
+ set_flag(FLAG_OMAP | FLAG_PGMETA_OMAP);
+ }
+
+ void clear_omap_flag() {
+ clear_flag(FLAG_OMAP);
+ }
+
+ DENC(bluestore_onode_t, v, p) {
+ DENC_START(1, 1, p);
+ denc_varint(v.nid, p);
+ denc_varint(v.size, p);
+ denc(v.attrs, p);
+ denc(v.flags, p);
+ denc(v.extent_map_shards, p);
+ denc_varint(v.expected_object_size, p);
+ denc_varint(v.expected_write_size, p);
+ denc_varint(v.alloc_hint_flags, p);
+ DENC_FINISH(p);
+ }
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<bluestore_onode_t*>& o);
+};
+WRITE_CLASS_DENC(bluestore_onode_t::shard_info)
+WRITE_CLASS_DENC(bluestore_onode_t)
+
+std::ostream& operator<<(std::ostream& out, const bluestore_onode_t::shard_info& si);
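+
+// Flag usage sketch (illustrative):
+//
+//   bluestore_onode_t o;
+//   o.set_omap_flags(false);  // modern layout: omap + per-pool + per-pg
+//   // o.get_flags_string() == "omap+per_pool_omap+per_pg_omap"
+//   o.clear_omap_flag();      // drops FLAG_OMAP only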
+
+/// writeahead-logged op
+struct bluestore_deferred_op_t {
+ typedef enum {
+ OP_WRITE = 1,
+ } type_t;
+ __u8 op = 0;
+
+ PExtentVector extents;
+ ceph::buffer::list data;
+
+ DENC(bluestore_deferred_op_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.op, p);
+ denc(v.extents, p);
+ denc(v.data, p);
+ DENC_FINISH(p);
+ }
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<bluestore_deferred_op_t*>& o);
+};
+WRITE_CLASS_DENC(bluestore_deferred_op_t)
+
+
+/// writeahead-logged transaction
+struct bluestore_deferred_transaction_t {
+ uint64_t seq = 0;
+ std::list<bluestore_deferred_op_t> ops;
+ interval_set<uint64_t> released; ///< allocations to release after tx
+
+ bluestore_deferred_transaction_t() : seq(0) {}
+
+ DENC(bluestore_deferred_transaction_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.seq, p);
+ denc(v.ops, p);
+ denc(v.released, p);
+ DENC_FINISH(p);
+ }
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<bluestore_deferred_transaction_t*>& o);
+};
+WRITE_CLASS_DENC(bluestore_deferred_transaction_t)
+
+struct bluestore_compression_header_t {
+ uint8_t type = Compressor::COMP_ALG_NONE;
+ uint32_t length = 0;
+ boost::optional<int32_t> compressor_message;
+
+ bluestore_compression_header_t() {}
+ bluestore_compression_header_t(uint8_t _type)
+ : type(_type) {}
+
+ DENC(bluestore_compression_header_t, v, p) {
+ DENC_START(2, 1, p);
+ denc(v.type, p);
+ denc(v.length, p);
+ if (struct_v >= 2) {
+ denc(v.compressor_message, p);
+ }
+ DENC_FINISH(p);
+ }
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<bluestore_compression_header_t*>& o);
+};
+WRITE_CLASS_DENC(bluestore_compression_header_t)
+
+template <template <typename> typename V, class COUNTER_TYPE = int32_t>
+class ref_counter_2hash_tracker_t {
+ size_t num_non_zero = 0;
+ size_t num_buckets = 0;
+ V<COUNTER_TYPE> buckets1;
+ V<COUNTER_TYPE> buckets2;
+
+public:
+ ref_counter_2hash_tracker_t(uint64_t mem_cap) {
+ num_buckets = mem_cap / sizeof(COUNTER_TYPE) / 2;
+ ceph_assert(num_buckets);
+ buckets1.resize(num_buckets);
+ buckets2.resize(num_buckets);
+ reset();
+ }
+
+ size_t get_num_buckets() const {
+ return num_buckets;
+ }
+
+ void inc(const char* hash_val, size_t hash_val_len, int n) {
+ auto h = ceph_str_hash_rjenkins((const char*)hash_val, hash_val_len) %
+ num_buckets;
+ if (buckets1[h] == 0 && n) {
+ ++num_non_zero;
+ } else if (buckets1[h] == -n) {
+ --num_non_zero;
+ }
+ buckets1[h] += n;
+ h = ceph_str_hash_linux((const char*)hash_val, hash_val_len) % num_buckets;
+ if (buckets2[h] == 0 && n) {
+ ++num_non_zero;
+ } else if (buckets2[h] == -n) {
+ --num_non_zero;
+ }
+ buckets2[h] += n;
+ }
+
+ bool test_hash_conflict(
+ const char* hash_val1,
+ const char* hash_val2,
+ size_t hash_val_len) const {
+
+ auto h1 = ceph_str_hash_rjenkins((const char*)hash_val1, hash_val_len);
+ auto h2 = ceph_str_hash_rjenkins((const char*)hash_val2, hash_val_len);
+ auto h3 = ceph_str_hash_linux((const char*)hash_val1, hash_val_len);
+ auto h4 = ceph_str_hash_linux((const char*)hash_val2, hash_val_len);
+ return ((h1 % num_buckets) == (h2 % num_buckets)) &&
+ ((h3 % num_buckets) == (h4 % num_buckets));
+ }
+
+ bool test_all_zero(const char* hash_val, size_t hash_val_len) const {
+ auto h = ceph_str_hash_rjenkins((const char*)hash_val, hash_val_len);
+ if (buckets1[h % num_buckets] != 0) {
+ return false;
+ }
+ h = ceph_str_hash_linux((const char*)hash_val, hash_val_len);
+ return buckets2[h % num_buckets] == 0;
+ }
+
+  // returns the number of buckets with non-zero counters (i.e. mismatches)
+ size_t count_non_zero() const {
+ return num_non_zero;
+ }
+ void reset() {
+ for (size_t i = 0; i < num_buckets; i++) {
+ buckets1[i] = 0;
+ buckets2[i] = 0;
+ }
+ num_non_zero = 0;
+ }
+};
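+
+// Fsck-style usage sketch (illustrative; key/key_len are placeholders):
+// every expected reference is counted with +n and every observed one with
+// -n; if all buckets return to zero the two sets match, modulo hash
+// collisions:
+//
+//   ref_counter_2hash_tracker_t<mempool::bluestore_fsck::vector> t(1 << 20);
+//   t.inc(key, key_len, 1);    // expected reference
+//   t.inc(key, key_len, -1);   // observed reference
+//   bool ok = (t.count_non_zero() == 0);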
+
+class shared_blob_2hash_tracker_t
+ : public ref_counter_2hash_tracker_t<mempool::bluestore_fsck::vector> {
+
+ static const size_t hash_input_len = 3;
+
+ typedef std::array<uint64_t, hash_input_len> hash_input_t;
+
+ static size_t get_hash_input_size() {
+ return hash_input_len * sizeof(hash_input_t::value_type);
+ }
+
+ inline hash_input_t build_hash_input(uint64_t sbid, uint64_t offset) const;
+
+ size_t au_void_bits = 0;
+
+
+public:
+ shared_blob_2hash_tracker_t(uint64_t mem_cap, size_t alloc_unit)
+ : ref_counter_2hash_tracker_t(mem_cap) {
+ ceph_assert(alloc_unit);
+ ceph_assert(isp2(alloc_unit));
+ au_void_bits = ctz(alloc_unit);
+ }
+ void inc(uint64_t sbid, uint64_t offset, int n);
+ void inc_range(uint64_t sbid, uint64_t offset, uint32_t len, int n);
+
+ bool test_hash_conflict(
+ uint64_t sbid,
+ uint64_t offset,
+ uint64_t sbid2,
+ uint64_t offset2) const;
+ bool test_all_zero(
+ uint64_t sbid,
+ uint64_t offset) const;
+ bool test_all_zero_range(
+ uint64_t sbid,
+ uint64_t offset,
+ uint32_t len) const;
+};
+
+class sb_info_t {
+  // a negative value indicates a (potentially) stray blob,
+  // i.e. a blob that has no real references from onodes
+ int64_t sbid = 0;
+
+public:
+ enum {
+ INVALID_POOL_ID = INT64_MIN
+ };
+
+ int64_t pool_id = INVALID_POOL_ID;
+  // a negative value indicates compressed_allocated as well
+ int32_t allocated_chunks = 0;
+
+ sb_info_t(int64_t _sbid = 0) : sbid(_sbid)
+ {
+ }
+ bool operator< (const sb_info_t& other) const {
+ return std::abs(sbid) < std::abs(other.sbid);
+ }
+ bool operator< (const uint64_t& other_sbid) const {
+ return uint64_t(std::abs(sbid)) < other_sbid;
+ }
+ bool is_stray() const {
+ return sbid < 0;
+ }
+ uint64_t get_sbid() const {
+ return uint64_t(std::abs(sbid));
+ }
+ void adopt() {
+ sbid = std::abs(sbid);
+ }
+} __attribute__((packed));
+
+// Space-efficient container for a set of sb_info structures, given that
+// the majority of entries are appended in proper id-sorted order. Hence
+// they can be kept in a regular vector, with binary search applied
+// whenever a specific entry is to be found.
+// For the rare occasions when an out-of-order append takes place, an
+// auxiliary sorted vector (aux_items) is used.
+struct sb_info_space_efficient_map_t {
+ // large array sorted by the user
+ mempool::bluestore_fsck::vector<sb_info_t> items;
+  // small additional set of items for which we maintain the sort order
+  // ourselves; it never keeps an entry with an id > items.back()'s id
+ mempool::bluestore_fsck::vector<sb_info_t> aux_items;
+
+ sb_info_t& add_maybe_stray(uint64_t sbid) {
+ return _add(-int64_t(sbid));
+ }
+ sb_info_t& add_or_adopt(uint64_t sbid) {
+ auto& r = _add(sbid);
+ r.adopt();
+ return r;
+ }
+ auto find(uint64_t id) {
+ if (items.size() != 0) {
+ auto it = std::lower_bound(
+ items.begin(),
+ items.end() - 1,
+ id,
+ [](const sb_info_t& a, const uint64_t& b) {
+ return a < b;
+ });
+ if (it->get_sbid() == id) {
+ return it;
+ }
+ if (aux_items.size() != 0) {
+ auto it = std::lower_bound(
+ aux_items.begin(),
+ aux_items.end(),
+ id,
+ [](const sb_info_t& a, const uint64_t& b) {
+ return a < b;
+ });
+ if (it->get_sbid() == id) {
+ return it;
+ }
+ }
+ }
+ return items.end();
+ }
+ // enumerates strays, order isn't guaranteed.
+ void foreach_stray(std::function<void(const sb_info_t&)> cb) {
+ for (auto& sbi : items) {
+ if (sbi.is_stray()) {
+ cb(sbi);
+ }
+ }
+ for (auto& sbi : aux_items) {
+ if (sbi.is_stray()) {
+ cb(sbi);
+ }
+ }
+ }
+ auto end() {
+ return items.end();
+ }
+
+ void shrink() {
+ items.shrink_to_fit();
+ aux_items.shrink_to_fit();
+ }
+ void clear() {
+ items.clear();
+ aux_items.clear();
+ shrink();
+ }
+private:
+ sb_info_t& _add(int64_t id) {
+ uint64_t n_id = uint64_t(std::abs(id));
+ if (items.size() == 0 || n_id > items.back().get_sbid()) {
+ return items.emplace_back(id);
+ }
+ auto it = find(n_id);
+ if (it != items.end()) {
+ return *it;
+ }
+ if (aux_items.size() == 0 || n_id > aux_items.back().get_sbid()) {
+ return aux_items.emplace_back(id);
+ }
+ // do sorted insertion, may be expensive!
+ it = std::upper_bound(
+ aux_items.begin(),
+ aux_items.end(),
+ n_id,
+ [](const uint64_t& a, const sb_info_t& b) {
+ return a < b.get_sbid();
+ });
+ return *aux_items.emplace(it, id);
+ }
+};
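+
+// Usage sketch (illustrative) of the stray/adopt life cycle:
+//
+//   sb_info_space_efficient_map_t sbm;
+//   sbm.add_maybe_stray(10);  // seen on disk only; stored with sbid < 0
+//   sbm.add_or_adopt(10);     // referenced by an onode; sign flipped
+//   auto it = sbm.find(10);
+//   // it != sbm.end() && !it->is_stray()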
+
+#endif