// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab /* * Ceph - scalable distributed file system * * Copyright (C) 2004-2006 Sage Weil * * This is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License version 2.1, as published by the Free Software * Foundation. See file COPYING. * */ #ifndef CEPH_FRAG_H #define CEPH_FRAG_H #include #include #include #include #include "buffer.h" #include "compact_map.h" #include "ceph_frag.h" #include "include/encoding.h" #include "include/ceph_assert.h" #include "common/dout.h" /* * * the goal here is to use a binary split strategy to partition a namespace. * frag_t represents a particular fragment. bits() tells you the size of the * fragment, and value() it's name. this is roughly analogous to an ip address * and netmask. * * fragtree_t represents an entire namespace and it's partition. it essentially * tells you where fragments are split into other fragments, and by how much * (i.e. by how many bits, resulting in a power of 2 number of child fragments). * * this vaguely resembles a btree, in that when a fragment becomes large or small * we can split or merge, except that there is no guarantee of being balanced. * * presumably we are partitioning the output of a (perhaps specialized) hash * function. */ /** * frag_t * * description of an individual fragment. that is, a particular piece * of the overall namespace. * * this is conceptually analogous to an ip address and netmask. * * a value v falls "within" fragment f iff (v & f.mask()) == f.value(). * * we write it as v/b, where v is a value and b is the number of bits. * 0/0 (bits==0) corresponds to the entire namespace. if we bisect that, * we get 0/1 and 1/1. quartering gives us 0/2, 1/2, 2/2, 3/2. and so on. * * this makes the right most bit of v the "most significant", which is the * opposite of what we usually see. */ /* * TODO: * - get_first_child(), next_sibling(int parent_bits) to make (possibly partial) * iteration efficient (see, e.g., try_assimilate_children() * - rework frag_t so that we mask the left-most (most significant) bits instead of * the right-most (least significant) bits. just because it's more intuitive, and * matches the network/netmask concept. */ class frag_t { /* * encoding is dictated by frag_* functions in ceph_fs.h. use those * helpers _exclusively_. */ public: using _frag_t = uint32_t; frag_t() = default; frag_t(unsigned v, unsigned b) : _enc(ceph_frag_make(b, v)) { } frag_t(_frag_t e) : _enc(e) { } // constructors void from_unsigned(unsigned e) { _enc = e; } // accessors unsigned value() const { return ceph_frag_value(_enc); } unsigned bits() const { return ceph_frag_bits(_enc); } unsigned mask() const { return ceph_frag_mask(_enc); } unsigned mask_shift() const { return ceph_frag_mask_shift(_enc); } operator _frag_t() const { return _enc; } // tests bool contains(unsigned v) const { return ceph_frag_contains_value(_enc, v); } bool contains(frag_t sub) const { return ceph_frag_contains_frag(_enc, sub._enc); } bool is_root() const { return bits() == 0; } frag_t parent() const { ceph_assert(bits() > 0); return frag_t(ceph_frag_parent(_enc)); } // splitting frag_t make_child(int i, int nb) const { ceph_assert(i < (1< void split(int nb, T& fragments) const { ceph_assert(nb > 0); unsigned nway = 1 << nb; for (unsigned i=0; i; /** * fragtree_t -- partition an entire namespace into one or more frag_t's. */ class fragtree_t { // pairs : // frag_t f is split by b bits. // if child frag_t does not appear, it is not split. public: compact_map _splits; public: // ------------- // basics void swap(fragtree_t& other) { _splits.swap(other._splits); } void clear() { _splits.clear(); } // ------------- // accessors bool empty() const { return _splits.empty(); } int get_split(const frag_t hb) const { compact_map::const_iterator p = _splits.find(hb); if (p == _splits.end()) return 0; else return p->second; } bool is_leaf(frag_t x) const { frag_vec_t s; get_leaves_under(x, s); //generic_dout(10) << "is_leaf(" << x << ") -> " << ls << dendl; return s.size() == 1 && s.front() == x; } /** * get_leaves -- list all leaves */ template void get_leaves(T& c) const { return get_leaves_under_split(frag_t(), c); } /** * get_leaves_under_split -- list all leaves under a known split point (or root) */ template void get_leaves_under_split(frag_t under, T& c) const { frag_vec_t s; s.push_back(under); while (!s.empty()) { frag_t t = s.back(); s.pop_back(); int nb = get_split(t); if (nb) t.split(nb, s); // queue up children else c.push_back(t); // not spit, it's a leaf. } } /** * get_branch -- get branch point at OR above frag @a x * - may be @a x itself, if @a x is a split * - may be root (frag_t()) */ frag_t get_branch(frag_t x) const { while (1) { if (x == frag_t()) return x; // root if (get_split(x)) return x; // found it! x = x.parent(); } } /** * get_branch_above -- get a branch point above frag @a x * - may be root (frag_t()) * - may NOT be @a x, even if @a x is a split. */ frag_t get_branch_above(frag_t x) const { while (1) { if (x == frag_t()) return x; // root x = x.parent(); if (get_split(x)) return x; // found it! } } /** * get_branch_or_leaf -- get branch or leaf point parent for frag @a x * - may be @a x itself, if @a x is a split or leaf * - may be root (frag_t()) */ frag_t get_branch_or_leaf(frag_t x) const { frag_t branch = get_branch(x); int nb = get_split(branch); if (nb > 0 && // if branch is a split, and branch.bits() + nb <= x.bits()) // one of the children is or contains x return frag_t(x.value(), branch.bits()+nb); // then return that child (it's a leaf) else return branch; } /** * get_leaves_under(x, ls) -- search for any leaves fully contained by x */ template void get_leaves_under(frag_t x, T& c) const { frag_vec_t s; s.push_back(get_branch_or_leaf(x)); while (!s.empty()) { frag_t t = s.back(); s.pop_back(); if (t.bits() >= x.bits() && // if t is more specific than x, and !x.contains(t)) // x does not contain t, continue; // then skip int nb = get_split(t); if (nb) t.split(nb, s); // queue up children else if (x.contains(t)) c.push_back(t); // not spit, it's a leaf. } } /** * contains(fg) -- does fragtree contain the specific frag @a x */ bool contains(frag_t x) const { frag_vec_t s; s.push_back(get_branch(x)); while (!s.empty()) { frag_t t = s.back(); s.pop_back(); if (t.bits() >= x.bits() && // if t is more specific than x, and !x.contains(t)) // x does not contain t, continue; // then skip int nb = get_split(t); if (nb) { if (t == x) return false; // it's split. t.split(nb, s); // queue up children } else { if (t == x) return true; // it's there. } } return false; } /** * operator[] -- map a (hash?) value to a frag */ frag_t operator[](unsigned v) const { frag_t t; while (1) { ceph_assert(t.contains(v)); int nb = get_split(t); // is this a leaf? if (nb == 0) return t; // done. // pick appropriate child fragment. unsigned nway = 1 << nb; unsigned i; for (i=0; i spread); // add an intermediary split merge(parent, nb, false); split(parent, spread, false); frag_vec_t subs; parent.split(spread, subs); for (auto& frag : subs) { lgeneric_dout(cct, 10) << "splitting intermediate " << frag << " by " << (nb-spread) << dendl; split(frag, nb - spread, false); } } // x is now a leaf or split. // hoover up any children. frag_vec_t s; s.push_back(x); while (!s.empty()) { frag_t t = s.back(); s.pop_back(); int nb = get_split(t); if (nb) { lgeneric_dout(cct, 10) << "merging child " << t << " by " << nb << dendl; merge(t, nb, false); // merge this point, and t.split(nb, s); // queue up children } } lgeneric_dout(cct, 10) << "force_to_leaf done" << dendl; ceph_assert(is_leaf(x)); return true; } // encoding void encode(ceph::buffer::list& bl) const { using ceph::encode; encode(_splits, bl); } void decode(ceph::buffer::list::const_iterator& p) { using ceph::decode; decode(_splits, p); } void encode_nohead(ceph::buffer::list& bl) const { using ceph::encode; for (compact_map::const_iterator p = _splits.begin(); p != _splits.end(); ++p) { encode(p->first, bl); encode(p->second, bl); } } void decode_nohead(int n, ceph::buffer::list::const_iterator& p) { using ceph::decode; _splits.clear(); while (n-- > 0) { frag_t f; decode(f, p); decode(_splits[f], p); } } void print(std::ostream& out) { out << "fragtree_t("; frag_vec_t s; s.push_back(frag_t()); while (!s.empty()) { frag_t t = s.back(); s.pop_back(); // newline + indent? if (t.bits()) { out << std::endl; for (unsigned i=0; iopen_array_section("splits"); for (auto p = _splits.begin(); p != _splits.end(); ++p) { f->open_object_section("split"); std::ostringstream frag_str; frag_str << p->first; f->dump_string("frag", frag_str.str()); f->dump_int("children", p->second); f->close_section(); // split } f->close_section(); // splits } }; WRITE_CLASS_ENCODER(fragtree_t) inline bool operator==(const fragtree_t& l, const fragtree_t& r) { return l._splits == r._splits; } inline bool operator!=(const fragtree_t& l, const fragtree_t& r) { return l._splits != r._splits; } inline std::ostream& operator<<(std::ostream& out, const fragtree_t& ft) { out << "fragtree_t("; for (compact_map::const_iterator p = ft._splits.begin(); p != ft._splits.end(); ++p) { if (p != ft._splits.begin()) out << " "; out << p->first << "^" << p->second; } return out << ")"; } /** * fragset_t -- a set of fragments */ class fragset_t { std::set _set; public: const std::set &get() const { return _set; } std::set::const_iterator begin() const { return _set.begin(); } std::set::const_iterator end() const { return _set.end(); } bool empty() const { return _set.empty(); } bool contains(frag_t f) const { while (1) { if (_set.count(f)) return true; if (f.bits() == 0) return false; f = f.parent(); } } void clear() { _set.clear(); } void insert_raw(frag_t f){ _set.insert(f); } void insert(frag_t f) { _set.insert(f); simplify(); } void simplify() { auto it = _set.begin(); while (it != _set.end()) { if (!it->is_root() && _set.count(it->get_sibling())) { _set.erase(it->get_sibling()); auto ret = _set.insert(it->parent()); _set.erase(it); it = ret.first; } else { ++it; } } } void encode(ceph::buffer::list& bl) const { ceph::encode(_set, bl); } void decode(ceph::buffer::list::const_iterator& p) { ceph::decode(_set, p); } }; WRITE_CLASS_ENCODER(fragset_t) inline std::ostream& operator<<(std::ostream& out, const fragset_t& fs) { return out << "fragset_t(" << fs.get() << ")"; } #endif