summaryrefslogtreecommitdiffstats
path: root/src/include/frag.h
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--src/include/frag.h615
1 files changed, 615 insertions, 0 deletions
diff --git a/src/include/frag.h b/src/include/frag.h
new file mode 100644
index 000000000..ec18bddfb
--- /dev/null
+++ b/src/include/frag.h
@@ -0,0 +1,615 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_FRAG_H
+#define CEPH_FRAG_H
+
+#include <boost/container/small_vector.hpp>
+
+#include <iostream>
+
+#include <stdint.h>
+#include <stdio.h>
+
+#include "buffer.h"
+#include "compact_map.h"
+
+#include "ceph_frag.h"
+#include "include/encoding.h"
+#include "include/ceph_assert.h"
+
+#include "common/dout.h"
+
+/*
+ *
+ * the goal here is to use a binary split strategy to partition a namespace.
+ * frag_t represents a particular fragment. bits() tells you the size of the
+ * fragment, and value() it's name. this is roughly analogous to an ip address
+ * and netmask.
+ *
+ * fragtree_t represents an entire namespace and it's partition. it essentially
+ * tells you where fragments are split into other fragments, and by how much
+ * (i.e. by how many bits, resulting in a power of 2 number of child fragments).
+ *
+ * this vaguely resembles a btree, in that when a fragment becomes large or small
+ * we can split or merge, except that there is no guarantee of being balanced.
+ *
+ * presumably we are partitioning the output of a (perhaps specialized) hash
+ * function.
+ */
+
+/**
+ * frag_t
+ *
+ * description of an individual fragment. that is, a particular piece
+ * of the overall namespace.
+ *
+ * this is conceptually analogous to an ip address and netmask.
+ *
+ * a value v falls "within" fragment f iff (v & f.mask()) == f.value().
+ *
+ * we write it as v/b, where v is a value and b is the number of bits.
+ * 0/0 (bits==0) corresponds to the entire namespace. if we bisect that,
+ * we get 0/1 and 1/1. quartering gives us 0/2, 1/2, 2/2, 3/2. and so on.
+ *
+ * this makes the right most bit of v the "most significant", which is the
+ * opposite of what we usually see.
+ */
+
+/*
+ * TODO:
+ * - get_first_child(), next_sibling(int parent_bits) to make (possibly partial)
+ * iteration efficient (see, e.g., try_assimilate_children()
+ * - rework frag_t so that we mask the left-most (most significant) bits instead of
+ * the right-most (least significant) bits. just because it's more intuitive, and
+ * matches the network/netmask concept.
+ */
+
+class frag_t {
+ /*
+ * encoding is dictated by frag_* functions in ceph_fs.h. use those
+ * helpers _exclusively_.
+ */
+public:
+ using _frag_t = uint32_t;
+
+ frag_t() = default;
+ frag_t(unsigned v, unsigned b) : _enc(ceph_frag_make(b, v)) { }
+ frag_t(_frag_t e) : _enc(e) { }
+
+ // constructors
+ void from_unsigned(unsigned e) { _enc = e; }
+
+ // accessors
+ unsigned value() const { return ceph_frag_value(_enc); }
+ unsigned bits() const { return ceph_frag_bits(_enc); }
+ unsigned mask() const { return ceph_frag_mask(_enc); }
+ unsigned mask_shift() const { return ceph_frag_mask_shift(_enc); }
+
+ operator _frag_t() const { return _enc; }
+
+ // tests
+ bool contains(unsigned v) const { return ceph_frag_contains_value(_enc, v); }
+ bool contains(frag_t sub) const { return ceph_frag_contains_frag(_enc, sub._enc); }
+ bool is_root() const { return bits() == 0; }
+ frag_t parent() const {
+ ceph_assert(bits() > 0);
+ return frag_t(ceph_frag_parent(_enc));
+ }
+
+ // splitting
+ frag_t make_child(int i, int nb) const {
+ ceph_assert(i < (1<<nb));
+ return frag_t(ceph_frag_make_child(_enc, nb, i));
+ }
+ template<typename T>
+ void split(int nb, T& fragments) const {
+ ceph_assert(nb > 0);
+ unsigned nway = 1 << nb;
+ for (unsigned i=0; i<nway; i++)
+ fragments.push_back(make_child(i, nb));
+ }
+
+ // binary splitting
+ frag_t left_child() const { return frag_t(ceph_frag_left_child(_enc)); }
+ frag_t right_child() const { return frag_t(ceph_frag_right_child(_enc)); }
+
+ bool is_left() const { return ceph_frag_is_left_child(_enc); }
+ bool is_right() const { return ceph_frag_is_right_child(_enc); }
+ frag_t get_sibling() const {
+ ceph_assert(!is_root());
+ return frag_t(ceph_frag_sibling(_enc));
+ }
+
+ // sequencing
+ bool is_leftmost() const { return ceph_frag_is_leftmost(_enc); }
+ bool is_rightmost() const { return ceph_frag_is_rightmost(_enc); }
+ frag_t next() const {
+ ceph_assert(!is_rightmost());
+ return frag_t(ceph_frag_next(_enc));
+ }
+
+ // parse
+ bool parse(const char *s) {
+ int pvalue, pbits;
+ int r = sscanf(s, "%x/%d", &pvalue, &pbits);
+ if (r == 2) {
+ *this = frag_t(pvalue, pbits);
+ return true;
+ }
+ return false;
+ }
+
+ void encode(ceph::buffer::list& bl) const {
+ ceph::encode_raw(_enc, bl);
+ }
+ void decode(ceph::buffer::list::const_iterator& p) {
+ __u32 v;
+ ceph::decode_raw(v, p);
+ _enc = v;
+ }
+ bool operator<(const frag_t& b) const
+ {
+ if (value() != b.value())
+ return value() < b.value();
+ else
+ return bits() < b.bits();
+ }
+private:
+ _frag_t _enc = 0;
+};
+WRITE_CLASS_ENCODER(frag_t)
+
+inline std::ostream& operator<<(std::ostream& out, const frag_t& hb)
+{
+ //out << std::hex << hb.value() << std::dec << "/" << hb.bits() << '=';
+ unsigned num = hb.bits();
+ if (num) {
+ unsigned val = hb.value();
+ for (unsigned bit = 23; num; num--, bit--)
+ out << ((val & (1<<bit)) ? '1':'0');
+ }
+ return out << '*';
+}
+
+
+using frag_vec_t = boost::container::small_vector<frag_t, 4>;
+
+/**
+ * fragtree_t -- partition an entire namespace into one or more frag_t's.
+ */
+class fragtree_t {
+ // pairs <f, b>:
+ // frag_t f is split by b bits.
+ // if child frag_t does not appear, it is not split.
+public:
+ compact_map<frag_t,int32_t> _splits;
+
+public:
+ // -------------
+ // basics
+ void swap(fragtree_t& other) {
+ _splits.swap(other._splits);
+ }
+ void clear() {
+ _splits.clear();
+ }
+
+ // -------------
+ // accessors
+ bool empty() const {
+ return _splits.empty();
+ }
+ int get_split(const frag_t hb) const {
+ compact_map<frag_t,int32_t>::const_iterator p = _splits.find(hb);
+ if (p == _splits.end())
+ return 0;
+ else
+ return p->second;
+ }
+
+
+ bool is_leaf(frag_t x) const {
+ frag_vec_t s;
+ get_leaves_under(x, s);
+ //generic_dout(10) << "is_leaf(" << x << ") -> " << ls << dendl;
+ return s.size() == 1 && s.front() == x;
+ }
+
+ /**
+ * get_leaves -- list all leaves
+ */
+ template<typename T>
+ void get_leaves(T& c) const {
+ return get_leaves_under_split(frag_t(), c);
+ }
+
+ /**
+ * get_leaves_under_split -- list all leaves under a known split point (or root)
+ */
+ template<typename T>
+ void get_leaves_under_split(frag_t under, T& c) const {
+ frag_vec_t s;
+ s.push_back(under);
+ while (!s.empty()) {
+ frag_t t = s.back();
+ s.pop_back();
+ int nb = get_split(t);
+ if (nb)
+ t.split(nb, s); // queue up children
+ else
+ c.push_back(t); // not spit, it's a leaf.
+ }
+ }
+
+ /**
+ * get_branch -- get branch point at OR above frag @a x
+ * - may be @a x itself, if @a x is a split
+ * - may be root (frag_t())
+ */
+ frag_t get_branch(frag_t x) const {
+ while (1) {
+ if (x == frag_t()) return x; // root
+ if (get_split(x)) return x; // found it!
+ x = x.parent();
+ }
+ }
+
+ /**
+ * get_branch_above -- get a branch point above frag @a x
+ * - may be root (frag_t())
+ * - may NOT be @a x, even if @a x is a split.
+ */
+ frag_t get_branch_above(frag_t x) const {
+ while (1) {
+ if (x == frag_t()) return x; // root
+ x = x.parent();
+ if (get_split(x)) return x; // found it!
+ }
+ }
+
+
+ /**
+ * get_branch_or_leaf -- get branch or leaf point parent for frag @a x
+ * - may be @a x itself, if @a x is a split or leaf
+ * - may be root (frag_t())
+ */
+ frag_t get_branch_or_leaf(frag_t x) const {
+ frag_t branch = get_branch(x);
+ int nb = get_split(branch);
+ if (nb > 0 && // if branch is a split, and
+ branch.bits() + nb <= x.bits()) // one of the children is or contains x
+ return frag_t(x.value(), branch.bits()+nb); // then return that child (it's a leaf)
+ else
+ return branch;
+ }
+
+ /**
+ * get_leaves_under(x, ls) -- search for any leaves fully contained by x
+ */
+ template<typename T>
+ void get_leaves_under(frag_t x, T& c) const {
+ frag_vec_t s;
+ s.push_back(get_branch_or_leaf(x));
+ while (!s.empty()) {
+ frag_t t = s.back();
+ s.pop_back();
+ if (t.bits() >= x.bits() && // if t is more specific than x, and
+ !x.contains(t)) // x does not contain t,
+ continue; // then skip
+ int nb = get_split(t);
+ if (nb)
+ t.split(nb, s); // queue up children
+ else if (x.contains(t))
+ c.push_back(t); // not spit, it's a leaf.
+ }
+ }
+
+ /**
+ * contains(fg) -- does fragtree contain the specific frag @a x
+ */
+ bool contains(frag_t x) const {
+ frag_vec_t s;
+ s.push_back(get_branch(x));
+ while (!s.empty()) {
+ frag_t t = s.back();
+ s.pop_back();
+ if (t.bits() >= x.bits() && // if t is more specific than x, and
+ !x.contains(t)) // x does not contain t,
+ continue; // then skip
+ int nb = get_split(t);
+ if (nb) {
+ if (t == x) return false; // it's split.
+ t.split(nb, s); // queue up children
+ } else {
+ if (t == x) return true; // it's there.
+ }
+ }
+ return false;
+ }
+
+ /**
+ * operator[] -- map a (hash?) value to a frag
+ */
+ frag_t operator[](unsigned v) const {
+ frag_t t;
+ while (1) {
+ ceph_assert(t.contains(v));
+ int nb = get_split(t);
+
+ // is this a leaf?
+ if (nb == 0) return t; // done.
+
+ // pick appropriate child fragment.
+ unsigned nway = 1 << nb;
+ unsigned i;
+ for (i=0; i<nway; i++) {
+ frag_t n = t.make_child(i, nb);
+ if (n.contains(v)) {
+ t = n;
+ break;
+ }
+ }
+ ceph_assert(i < nway);
+ }
+ }
+
+
+ // ---------------
+ // modifiers
+ void split(frag_t x, int b, bool simplify=true) {
+ ceph_assert(is_leaf(x));
+ _splits[x] = b;
+
+ if (simplify)
+ try_assimilate_children(get_branch_above(x));
+ }
+ void merge(frag_t x, int b, bool simplify=true) {
+ ceph_assert(!is_leaf(x));
+ ceph_assert(_splits[x] == b);
+ _splits.erase(x);
+
+ if (simplify)
+ try_assimilate_children(get_branch_above(x));
+ }
+
+ /*
+ * if all of a given split's children are identically split,
+ * then the children can be assimilated.
+ */
+ void try_assimilate_children(frag_t x) {
+ int nb = get_split(x);
+ if (!nb) return;
+ frag_vec_t children;
+ x.split(nb, children);
+ int childbits = 0;
+ for (auto& frag : children) {
+ int cb = get_split(frag);
+ if (!cb) return; // nope.
+ if (childbits && cb != childbits) return; // not the same
+ childbits = cb;
+ }
+ // all children are split with childbits!
+ for (auto& frag : children)
+ _splits.erase(frag);
+ _splits[x] += childbits;
+ }
+
+ bool force_to_leaf(CephContext *cct, frag_t x) {
+ if (is_leaf(x))
+ return false;
+
+ lgeneric_dout(cct, 10) << "force_to_leaf " << x << " on " << _splits << dendl;
+
+ frag_t parent = get_branch_or_leaf(x);
+ ceph_assert(parent.bits() <= x.bits());
+ lgeneric_dout(cct, 10) << "parent is " << parent << dendl;
+
+ // do we need to split from parent to x?
+ if (parent.bits() < x.bits()) {
+ int spread = x.bits() - parent.bits();
+ int nb = get_split(parent);
+ lgeneric_dout(cct, 10) << "spread " << spread << ", parent splits by " << nb << dendl;
+ if (nb == 0) {
+ // easy: split parent (a leaf) by the difference
+ lgeneric_dout(cct, 10) << "splitting parent " << parent << " by spread " << spread << dendl;
+ split(parent, spread);
+ ceph_assert(is_leaf(x));
+ return true;
+ }
+ ceph_assert(nb > spread);
+
+ // add an intermediary split
+ merge(parent, nb, false);
+ split(parent, spread, false);
+
+ frag_vec_t subs;
+ parent.split(spread, subs);
+ for (auto& frag : subs) {
+ lgeneric_dout(cct, 10) << "splitting intermediate " << frag << " by " << (nb-spread) << dendl;
+ split(frag, nb - spread, false);
+ }
+ }
+
+ // x is now a leaf or split.
+ // hoover up any children.
+ frag_vec_t s;
+ s.push_back(x);
+ while (!s.empty()) {
+ frag_t t = s.back();
+ s.pop_back();
+ int nb = get_split(t);
+ if (nb) {
+ lgeneric_dout(cct, 10) << "merging child " << t << " by " << nb << dendl;
+ merge(t, nb, false); // merge this point, and
+ t.split(nb, s); // queue up children
+ }
+ }
+
+ lgeneric_dout(cct, 10) << "force_to_leaf done" << dendl;
+ ceph_assert(is_leaf(x));
+ return true;
+ }
+
+ // encoding
+ void encode(ceph::buffer::list& bl) const {
+ using ceph::encode;
+ encode(_splits, bl);
+ }
+ void decode(ceph::buffer::list::const_iterator& p) {
+ using ceph::decode;
+ decode(_splits, p);
+ }
+ void encode_nohead(ceph::buffer::list& bl) const {
+ using ceph::encode;
+ for (compact_map<frag_t,int32_t>::const_iterator p = _splits.begin();
+ p != _splits.end();
+ ++p) {
+ encode(p->first, bl);
+ encode(p->second, bl);
+ }
+ }
+ void decode_nohead(int n, ceph::buffer::list::const_iterator& p) {
+ using ceph::decode;
+ _splits.clear();
+ while (n-- > 0) {
+ frag_t f;
+ decode(f, p);
+ decode(_splits[f], p);
+ }
+ }
+
+ void print(std::ostream& out) {
+ out << "fragtree_t(";
+ frag_vec_t s;
+ s.push_back(frag_t());
+ while (!s.empty()) {
+ frag_t t = s.back();
+ s.pop_back();
+ // newline + indent?
+ if (t.bits()) {
+ out << std::endl;
+ for (unsigned i=0; i<t.bits(); i++) out << ' ';
+ }
+ int nb = get_split(t);
+ if (nb) {
+ out << t << " %" << nb;
+ t.split(nb, s); // queue up children
+ } else {
+ out << t;
+ }
+ }
+ out << ")";
+ }
+
+ void dump(ceph::Formatter *f) const {
+ f->open_array_section("splits");
+ for (auto p = _splits.begin(); p != _splits.end(); ++p) {
+ f->open_object_section("split");
+ std::ostringstream frag_str;
+ frag_str << p->first;
+ f->dump_string("frag", frag_str.str());
+ f->dump_int("children", p->second);
+ f->close_section(); // split
+ }
+ f->close_section(); // splits
+ }
+};
+WRITE_CLASS_ENCODER(fragtree_t)
+
+inline bool operator==(const fragtree_t& l, const fragtree_t& r) {
+ return l._splits == r._splits;
+}
+inline bool operator!=(const fragtree_t& l, const fragtree_t& r) {
+ return l._splits != r._splits;
+}
+
+inline std::ostream& operator<<(std::ostream& out, const fragtree_t& ft)
+{
+ out << "fragtree_t(";
+
+ for (compact_map<frag_t,int32_t>::const_iterator p = ft._splits.begin();
+ p != ft._splits.end();
+ ++p) {
+ if (p != ft._splits.begin())
+ out << " ";
+ out << p->first << "^" << p->second;
+ }
+ return out << ")";
+}
+
+/**
+ * fragset_t -- a set of fragments
+ */
+class fragset_t {
+ std::set<frag_t> _set;
+
+public:
+ const std::set<frag_t> &get() const { return _set; }
+ std::set<frag_t>::const_iterator begin() const { return _set.begin(); }
+ std::set<frag_t>::const_iterator end() const { return _set.end(); }
+
+ bool empty() const { return _set.empty(); }
+
+ bool contains(frag_t f) const {
+ while (1) {
+ if (_set.count(f)) return true;
+ if (f.bits() == 0) return false;
+ f = f.parent();
+ }
+ }
+
+ void clear() {
+ _set.clear();
+ }
+
+ void insert_raw(frag_t f){
+ _set.insert(f);
+ }
+ void insert(frag_t f) {
+ _set.insert(f);
+ simplify();
+ }
+
+ void simplify() {
+ auto it = _set.begin();
+ while (it != _set.end()) {
+ if (!it->is_root() &&
+ _set.count(it->get_sibling())) {
+ _set.erase(it->get_sibling());
+ auto ret = _set.insert(it->parent());
+ _set.erase(it);
+ it = ret.first;
+ } else {
+ ++it;
+ }
+ }
+ }
+
+ void encode(ceph::buffer::list& bl) const {
+ ceph::encode(_set, bl);
+ }
+ void decode(ceph::buffer::list::const_iterator& p) {
+ ceph::decode(_set, p);
+ }
+};
+WRITE_CLASS_ENCODER(fragset_t)
+
+
+inline std::ostream& operator<<(std::ostream& out, const fragset_t& fs)
+{
+ return out << "fragset_t(" << fs.get() << ")";
+}
+
+#endif