From e6918187568dbd01842d8d1d2c808ce16a894239 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 21 Apr 2024 13:54:28 +0200 Subject: Adding upstream version 18.2.2. Signed-off-by: Daniel Baumann --- src/include/frag.h | 615 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 615 insertions(+) create mode 100644 src/include/frag.h (limited to 'src/include/frag.h') diff --git a/src/include/frag.h b/src/include/frag.h new file mode 100644 index 000000000..ec18bddfb --- /dev/null +++ b/src/include/frag.h @@ -0,0 +1,615 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_FRAG_H +#define CEPH_FRAG_H + +#include + +#include + +#include +#include + +#include "buffer.h" +#include "compact_map.h" + +#include "ceph_frag.h" +#include "include/encoding.h" +#include "include/ceph_assert.h" + +#include "common/dout.h" + +/* + * + * the goal here is to use a binary split strategy to partition a namespace. + * frag_t represents a particular fragment. bits() tells you the size of the + * fragment, and value() it's name. this is roughly analogous to an ip address + * and netmask. + * + * fragtree_t represents an entire namespace and it's partition. it essentially + * tells you where fragments are split into other fragments, and by how much + * (i.e. by how many bits, resulting in a power of 2 number of child fragments). + * + * this vaguely resembles a btree, in that when a fragment becomes large or small + * we can split or merge, except that there is no guarantee of being balanced. + * + * presumably we are partitioning the output of a (perhaps specialized) hash + * function. + */ + +/** + * frag_t + * + * description of an individual fragment. that is, a particular piece + * of the overall namespace. + * + * this is conceptually analogous to an ip address and netmask. + * + * a value v falls "within" fragment f iff (v & f.mask()) == f.value(). + * + * we write it as v/b, where v is a value and b is the number of bits. + * 0/0 (bits==0) corresponds to the entire namespace. if we bisect that, + * we get 0/1 and 1/1. quartering gives us 0/2, 1/2, 2/2, 3/2. and so on. + * + * this makes the right most bit of v the "most significant", which is the + * opposite of what we usually see. + */ + +/* + * TODO: + * - get_first_child(), next_sibling(int parent_bits) to make (possibly partial) + * iteration efficient (see, e.g., try_assimilate_children() + * - rework frag_t so that we mask the left-most (most significant) bits instead of + * the right-most (least significant) bits. just because it's more intuitive, and + * matches the network/netmask concept. + */ + +class frag_t { + /* + * encoding is dictated by frag_* functions in ceph_fs.h. use those + * helpers _exclusively_. + */ +public: + using _frag_t = uint32_t; + + frag_t() = default; + frag_t(unsigned v, unsigned b) : _enc(ceph_frag_make(b, v)) { } + frag_t(_frag_t e) : _enc(e) { } + + // constructors + void from_unsigned(unsigned e) { _enc = e; } + + // accessors + unsigned value() const { return ceph_frag_value(_enc); } + unsigned bits() const { return ceph_frag_bits(_enc); } + unsigned mask() const { return ceph_frag_mask(_enc); } + unsigned mask_shift() const { return ceph_frag_mask_shift(_enc); } + + operator _frag_t() const { return _enc; } + + // tests + bool contains(unsigned v) const { return ceph_frag_contains_value(_enc, v); } + bool contains(frag_t sub) const { return ceph_frag_contains_frag(_enc, sub._enc); } + bool is_root() const { return bits() == 0; } + frag_t parent() const { + ceph_assert(bits() > 0); + return frag_t(ceph_frag_parent(_enc)); + } + + // splitting + frag_t make_child(int i, int nb) const { + ceph_assert(i < (1< + void split(int nb, T& fragments) const { + ceph_assert(nb > 0); + unsigned nway = 1 << nb; + for (unsigned i=0; i; + +/** + * fragtree_t -- partition an entire namespace into one or more frag_t's. + */ +class fragtree_t { + // pairs : + // frag_t f is split by b bits. + // if child frag_t does not appear, it is not split. +public: + compact_map _splits; + +public: + // ------------- + // basics + void swap(fragtree_t& other) { + _splits.swap(other._splits); + } + void clear() { + _splits.clear(); + } + + // ------------- + // accessors + bool empty() const { + return _splits.empty(); + } + int get_split(const frag_t hb) const { + compact_map::const_iterator p = _splits.find(hb); + if (p == _splits.end()) + return 0; + else + return p->second; + } + + + bool is_leaf(frag_t x) const { + frag_vec_t s; + get_leaves_under(x, s); + //generic_dout(10) << "is_leaf(" << x << ") -> " << ls << dendl; + return s.size() == 1 && s.front() == x; + } + + /** + * get_leaves -- list all leaves + */ + template + void get_leaves(T& c) const { + return get_leaves_under_split(frag_t(), c); + } + + /** + * get_leaves_under_split -- list all leaves under a known split point (or root) + */ + template + void get_leaves_under_split(frag_t under, T& c) const { + frag_vec_t s; + s.push_back(under); + while (!s.empty()) { + frag_t t = s.back(); + s.pop_back(); + int nb = get_split(t); + if (nb) + t.split(nb, s); // queue up children + else + c.push_back(t); // not spit, it's a leaf. + } + } + + /** + * get_branch -- get branch point at OR above frag @a x + * - may be @a x itself, if @a x is a split + * - may be root (frag_t()) + */ + frag_t get_branch(frag_t x) const { + while (1) { + if (x == frag_t()) return x; // root + if (get_split(x)) return x; // found it! + x = x.parent(); + } + } + + /** + * get_branch_above -- get a branch point above frag @a x + * - may be root (frag_t()) + * - may NOT be @a x, even if @a x is a split. + */ + frag_t get_branch_above(frag_t x) const { + while (1) { + if (x == frag_t()) return x; // root + x = x.parent(); + if (get_split(x)) return x; // found it! + } + } + + + /** + * get_branch_or_leaf -- get branch or leaf point parent for frag @a x + * - may be @a x itself, if @a x is a split or leaf + * - may be root (frag_t()) + */ + frag_t get_branch_or_leaf(frag_t x) const { + frag_t branch = get_branch(x); + int nb = get_split(branch); + if (nb > 0 && // if branch is a split, and + branch.bits() + nb <= x.bits()) // one of the children is or contains x + return frag_t(x.value(), branch.bits()+nb); // then return that child (it's a leaf) + else + return branch; + } + + /** + * get_leaves_under(x, ls) -- search for any leaves fully contained by x + */ + template + void get_leaves_under(frag_t x, T& c) const { + frag_vec_t s; + s.push_back(get_branch_or_leaf(x)); + while (!s.empty()) { + frag_t t = s.back(); + s.pop_back(); + if (t.bits() >= x.bits() && // if t is more specific than x, and + !x.contains(t)) // x does not contain t, + continue; // then skip + int nb = get_split(t); + if (nb) + t.split(nb, s); // queue up children + else if (x.contains(t)) + c.push_back(t); // not spit, it's a leaf. + } + } + + /** + * contains(fg) -- does fragtree contain the specific frag @a x + */ + bool contains(frag_t x) const { + frag_vec_t s; + s.push_back(get_branch(x)); + while (!s.empty()) { + frag_t t = s.back(); + s.pop_back(); + if (t.bits() >= x.bits() && // if t is more specific than x, and + !x.contains(t)) // x does not contain t, + continue; // then skip + int nb = get_split(t); + if (nb) { + if (t == x) return false; // it's split. + t.split(nb, s); // queue up children + } else { + if (t == x) return true; // it's there. + } + } + return false; + } + + /** + * operator[] -- map a (hash?) value to a frag + */ + frag_t operator[](unsigned v) const { + frag_t t; + while (1) { + ceph_assert(t.contains(v)); + int nb = get_split(t); + + // is this a leaf? + if (nb == 0) return t; // done. + + // pick appropriate child fragment. + unsigned nway = 1 << nb; + unsigned i; + for (i=0; i spread); + + // add an intermediary split + merge(parent, nb, false); + split(parent, spread, false); + + frag_vec_t subs; + parent.split(spread, subs); + for (auto& frag : subs) { + lgeneric_dout(cct, 10) << "splitting intermediate " << frag << " by " << (nb-spread) << dendl; + split(frag, nb - spread, false); + } + } + + // x is now a leaf or split. + // hoover up any children. + frag_vec_t s; + s.push_back(x); + while (!s.empty()) { + frag_t t = s.back(); + s.pop_back(); + int nb = get_split(t); + if (nb) { + lgeneric_dout(cct, 10) << "merging child " << t << " by " << nb << dendl; + merge(t, nb, false); // merge this point, and + t.split(nb, s); // queue up children + } + } + + lgeneric_dout(cct, 10) << "force_to_leaf done" << dendl; + ceph_assert(is_leaf(x)); + return true; + } + + // encoding + void encode(ceph::buffer::list& bl) const { + using ceph::encode; + encode(_splits, bl); + } + void decode(ceph::buffer::list::const_iterator& p) { + using ceph::decode; + decode(_splits, p); + } + void encode_nohead(ceph::buffer::list& bl) const { + using ceph::encode; + for (compact_map::const_iterator p = _splits.begin(); + p != _splits.end(); + ++p) { + encode(p->first, bl); + encode(p->second, bl); + } + } + void decode_nohead(int n, ceph::buffer::list::const_iterator& p) { + using ceph::decode; + _splits.clear(); + while (n-- > 0) { + frag_t f; + decode(f, p); + decode(_splits[f], p); + } + } + + void print(std::ostream& out) { + out << "fragtree_t("; + frag_vec_t s; + s.push_back(frag_t()); + while (!s.empty()) { + frag_t t = s.back(); + s.pop_back(); + // newline + indent? + if (t.bits()) { + out << std::endl; + for (unsigned i=0; iopen_array_section("splits"); + for (auto p = _splits.begin(); p != _splits.end(); ++p) { + f->open_object_section("split"); + std::ostringstream frag_str; + frag_str << p->first; + f->dump_string("frag", frag_str.str()); + f->dump_int("children", p->second); + f->close_section(); // split + } + f->close_section(); // splits + } +}; +WRITE_CLASS_ENCODER(fragtree_t) + +inline bool operator==(const fragtree_t& l, const fragtree_t& r) { + return l._splits == r._splits; +} +inline bool operator!=(const fragtree_t& l, const fragtree_t& r) { + return l._splits != r._splits; +} + +inline std::ostream& operator<<(std::ostream& out, const fragtree_t& ft) +{ + out << "fragtree_t("; + + for (compact_map::const_iterator p = ft._splits.begin(); + p != ft._splits.end(); + ++p) { + if (p != ft._splits.begin()) + out << " "; + out << p->first << "^" << p->second; + } + return out << ")"; +} + +/** + * fragset_t -- a set of fragments + */ +class fragset_t { + std::set _set; + +public: + const std::set &get() const { return _set; } + std::set::const_iterator begin() const { return _set.begin(); } + std::set::const_iterator end() const { return _set.end(); } + + bool empty() const { return _set.empty(); } + + bool contains(frag_t f) const { + while (1) { + if (_set.count(f)) return true; + if (f.bits() == 0) return false; + f = f.parent(); + } + } + + void clear() { + _set.clear(); + } + + void insert_raw(frag_t f){ + _set.insert(f); + } + void insert(frag_t f) { + _set.insert(f); + simplify(); + } + + void simplify() { + auto it = _set.begin(); + while (it != _set.end()) { + if (!it->is_root() && + _set.count(it->get_sibling())) { + _set.erase(it->get_sibling()); + auto ret = _set.insert(it->parent()); + _set.erase(it); + it = ret.first; + } else { + ++it; + } + } + } + + void encode(ceph::buffer::list& bl) const { + ceph::encode(_set, bl); + } + void decode(ceph::buffer::list::const_iterator& p) { + ceph::decode(_set, p); + } +}; +WRITE_CLASS_ENCODER(fragset_t) + + +inline std::ostream& operator<<(std::ostream& out, const fragset_t& fs) +{ + return out << "fragset_t(" << fs.get() << ")"; +} + +#endif -- cgit v1.2.3