diff options
Diffstat (limited to 'src/crush/crush.h')
-rw-r--r-- | src/crush/crush.h | 549 |
1 files changed, 549 insertions, 0 deletions
#ifndef CEPH_CRUSH_CRUSH_H
#define CEPH_CRUSH_CRUSH_H

#ifdef __KERNEL__
# include <linux/types.h>
#else
# include "crush_compat.h"
#endif

/*
 * CRUSH is a pseudo-random data distribution algorithm that
 * efficiently distributes input values (typically, data objects)
 * across a heterogeneous, structured storage cluster.
 *
 * The algorithm was originally described in detail in this paper
 * (although the algorithm has evolved somewhat since then):
 *
 *     http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf
 *
 * LGPL2.1
 */


#define CRUSH_MAGIC 0x00010000ul   /* for detecting algorithm revisions */

#define CRUSH_MAX_DEPTH 10  /* max crush hierarchy depth */
#define CRUSH_MAX_RULESET (1<<8)  /* max crush ruleset number */
#define CRUSH_MAX_RULES CRUSH_MAX_RULESET  /* should be the same as max rulesets */

#define CRUSH_MAX_DEVICE_WEIGHT (100u * 0x10000u)
#define CRUSH_MAX_BUCKET_WEIGHT (65535u * 0x10000u)

#define CRUSH_ITEM_UNDEF  0x7ffffffe  /* undefined result (internal use only) */
/** @ingroup API
 * The equivalent of NULL for an item, i.e. the absence of an item.
 */
#define CRUSH_ITEM_NONE   0x7fffffff

/*
 * CRUSH uses user-defined "rules" to describe how inputs should be
 * mapped to devices.  A rule consists of a sequence of steps to perform
 * to generate the set of output devices.
 */
struct crush_rule_step {
	__u32 op;
	__s32 arg1;
	__s32 arg2;
};

/** @ingroup API
 */
enum crush_opcodes {
	/*! do nothing
	 */
	CRUSH_RULE_NOOP = 0,
	CRUSH_RULE_TAKE = 1,          /* arg1 = value to start with */
	CRUSH_RULE_CHOOSE_FIRSTN = 2, /* arg1 = num items to pick */
				      /* arg2 = type */
	CRUSH_RULE_CHOOSE_INDEP = 3,  /* same */
	CRUSH_RULE_EMIT = 4,          /* no args */
	CRUSH_RULE_CHOOSELEAF_FIRSTN = 6,
	CRUSH_RULE_CHOOSELEAF_INDEP = 7,

	CRUSH_RULE_SET_CHOOSE_TRIES = 8, /* override choose_total_tries */
	CRUSH_RULE_SET_CHOOSELEAF_TRIES = 9, /* override chooseleaf_descend_once */
	CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES = 10,
	CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES = 11,
	CRUSH_RULE_SET_CHOOSELEAF_VARY_R = 12,
	CRUSH_RULE_SET_CHOOSELEAF_STABLE = 13
};

/*
 * for specifying choose num (arg1) relative to the max parameter
 * passed to do_rule
 */
#define CRUSH_CHOOSE_N            0
#define CRUSH_CHOOSE_N_MINUS(x)   (-(x))

/*
 * The rule mask is used to describe what the rule is intended for.
 * Given a ruleset and size of output set, we search through the
 * rule list for a matching rule_mask.
 */
struct crush_rule_mask {
	__u8 ruleset;
	__u8 type;
	__u8 min_size;
	__u8 max_size;
};

struct crush_rule {
	__u32 len;
	struct crush_rule_mask mask;
	/*
	 * Trailing variable-length step list, declared as a C99 flexible
	 * array member (the zero-length "[0]" form is a GNU extension).
	 * sizeof(struct crush_rule) is unaffected; use crush_rule_size()
	 * to compute the allocation size for a given number of steps.
	 */
	struct crush_rule_step steps[];
};

#define crush_rule_size(len) (sizeof(struct crush_rule) + \
			      (len)*sizeof(struct crush_rule_step))



/*
 * A bucket is a named container of other items (either devices or
 * other buckets).
 */

/** @ingroup API
 *
 * Items within a bucket are chosen with crush_do_rule() using one of
 * three algorithms representing a tradeoff between performance and
 * reorganization efficiency. If you are unsure of which bucket type
 * to use, we recommend using ::CRUSH_BUCKET_STRAW2.
 *
 * The table summarizes how the speed of each option measures up
 * against mapping stability when items are added or removed.
 *
 * 	Bucket Alg     Speed       Additions    Removals
 * 	------------------------------------------------
 * 	uniform         O(1)       poor         poor
 * 	list            O(n)       optimal      poor
 * 	straw2          O(n)       optimal      optimal
 */
enum crush_algorithm {
	/*!
	 * Devices are rarely added individually in a large system.
	 * Instead, new storage is typically deployed in blocks of identical
	 * devices, often as an additional shelf in a server rack or perhaps
	 * an entire cabinet. Devices reaching their end of life are often
	 * similarly decommissioned as a set (individual failures aside),
	 * making it natural to treat them as a unit.  CRUSH uniform buckets
	 * are used to represent an identical set of devices in such
	 * circumstances. The key advantage in doing so is performance
	 * related: CRUSH can map replicas into uniform buckets in constant
	 * time. In cases where the uniformity restrictions are not
	 * appropriate, other bucket types can be used.  If the size of a
	 * uniform bucket changes, there is a complete reshuffling of data
	 * between devices, much like conventional hash-based distribution
	 * strategies.
	 */
	CRUSH_BUCKET_UNIFORM = 1,
	/*!
	 * List buckets structure their contents as a linked list, and
	 * can contain items with arbitrary weights.  To place a
	 * replica, CRUSH begins at the head of the list with the most
	 * recently added item and compares its weight to the sum of
	 * all remaining items' weights.  Depending on the value of
	 * hash( x , r , item), either the current item is chosen with
	 * the appropriate probability, or the process continues
	 * recursively down the list.  This is a natural and intuitive
	 * choice for an expanding cluster: either an object is
	 * relocated to the newest device with some appropriate
	 * probability, or it remains on the older devices as before.
	 * The result is optimal data migration when items are added
	 * to the bucket. Items removed from the middle or tail of the
	 * list, however, can result in a significant amount of
	 * unnecessary movement, making list buckets most suitable for
	 * circumstances in which they never (or very rarely) shrink.
	 */
	CRUSH_BUCKET_LIST = 2,
	/*! @cond INTERNAL */
	CRUSH_BUCKET_TREE = 3,
	CRUSH_BUCKET_STRAW = 4,
	/*! @endcond */
	/*!
	 * List and tree buckets are structured such that a limited
	 * number of hash values need to be calculated and compared to
	 * weights in order to select a bucket item.  In doing so,
	 * they divide and conquer in a way that either gives certain
	 * items precedence (e. g., those at the beginning of a list)
	 * or obviates the need to consider entire subtrees of items
	 * at all. That improves the performance of the replica
	 * placement process, but can also introduce suboptimal
	 * reorganization behavior when the contents of a bucket
	 * change due to an addition, removal, or re-weighting of an
	 * item.
	 *
	 * The straw2 bucket type allows all items to fairly "compete"
	 * against each other for replica placement through a process
	 * analogous to a draw of straws.  To place a replica, a straw
	 * of random length is drawn for each item in the bucket.  The
	 * item with the longest straw wins.  The length of each straw
	 * is initially a value in a fixed range.  Each straw length
	 * is scaled by a factor based on the item's weight so that
	 * heavily weighted items are more likely to win the draw.
	 * Although this process is almost twice as slow (on average)
	 * as a list bucket and even slower than a tree bucket
	 * (which scales logarithmically), straw2 buckets result in
	 * optimal data movement between nested items when modified.
	 */
	CRUSH_BUCKET_STRAW2 = 5,
};
extern const char *crush_bucket_alg_name(int alg);

/*
 * although tree was a legacy algorithm, it has been buggy, so
 * exclude it.
 */
#define CRUSH_LEGACY_ALLOWED_BUCKET_ALGS (	\
		(1 << CRUSH_BUCKET_UNIFORM) |	\
		(1 << CRUSH_BUCKET_LIST) |	\
		(1 << CRUSH_BUCKET_STRAW))

/** @ingroup API
 *
 * A bucket contains __size__ __items__ which are either positive
 * numbers or negative numbers that reference other buckets and is
 * uniquely identified with __id__ which is a negative number.  The
 * __weight__ of a bucket is the cumulative weight of all its
 * children.  A bucket is assigned a ::crush_algorithm that is used by
 * crush_do_rule() to draw an item depending on its weight.  A bucket
 * can be assigned a strictly positive (> 0) __type__ defined by the
 * caller. The __type__ can be used by crush_do_rule(), when it is
 * given as an argument of a rule step.
 *
 * A pointer to crush_bucket can safely be cast into the following
 * structure, depending on the value of __alg__:
 *
 * - __alg__ == ::CRUSH_BUCKET_UNIFORM cast to crush_bucket_uniform
 * - __alg__ == ::CRUSH_BUCKET_LIST cast to crush_bucket_list
 * - __alg__ == ::CRUSH_BUCKET_STRAW2 cast to crush_bucket_straw2
 *
 * The weight of each item depends on the algorithm and the
 * information about it is available in the corresponding structure
 * (crush_bucket_uniform, crush_bucket_list or crush_bucket_straw2).
 *
 * See crush_map for more information on how __id__ is used
 * to reference the bucket.
 */
struct crush_bucket {
	__s32 id;        /*!< bucket identifier, < 0 and unique within a crush_map */
	__u16 type;      /*!< > 0 bucket type, defined by the caller */
	__u8 alg;        /*!< the item selection ::crush_algorithm */
	/*! @cond INTERNAL */
	__u8 hash;       /* which hash function to use, CRUSH_HASH_* */
	/*! @endcond */
	__u32 weight;    /*!< 16.16 fixed point cumulated children weight */
	__u32 size;      /*!< size of the __items__ array */
	__s32 *items;    /*!< array of children: < 0 are buckets, >= 0 items */
};

/** @ingroup API
 *
 * Replacement weights for each item in a bucket. The size of the
 * array must be exactly the size of the straw2 bucket, just as the
 * item_weights array.
 *
 */
struct crush_weight_set {
	__u32 *weights; /*!< 16.16 fixed point weights in the same order as items */
	__u32 size;     /*!< size of the __weights__ array */
};

/** @ingroup API
 *
 * Replacement weights and ids for a given straw2 bucket, for
 * placement purposes.
 *
 * When crush_do_rule() chooses the Nth item from a straw2 bucket, the
 * replacement weights found at __weight_set[N]__ are used instead of
 * the weights from __item_weights__. If __N__ is greater than or equal
 * to __weight_set_positions__, the weights found at
 * __weight_set_positions-1__ are used instead. For instance if
 * __weight_set__ is:
 *
 *    [ [ 0x10000, 0x20000 ],   // position 0
 *      [ 0x20000, 0x40000 ] ]  // position 1
 *
 * choosing the 0th item will use position 0 weights [ 0x10000, 0x20000 ]
 * choosing the 1st item will use position 1 weights [ 0x20000, 0x40000 ]
 * choosing the 2nd item will use position 1 weights [ 0x20000, 0x40000 ]
 * etc.
 *
 */
struct crush_choose_arg {
	__s32 *ids;                           /*!< values to use instead of items */
	__u32 ids_size;                       /*!< size of the __ids__ array */
	struct crush_weight_set *weight_set;  /*!< weight replacements for a given position */
	__u32 weight_set_positions;           /*!< size of the __weight_set__ array */
};

/** @ingroup API
 *
 * Replacement weights and ids for each bucket in the crushmap. The
 * __size__ of the __args__ array must be exactly the same as the
 * __map->max_buckets__.
 *
 * The __crush_choose_arg__ at index N will be used when choosing
 * an item from the bucket __map->buckets[N]__ bucket, provided it
 * is a straw2 bucket.
 *
 */
struct crush_choose_arg_map {
	struct crush_choose_arg *args; /*!< replacement for each bucket in the crushmap */
	__u32 size;                    /*!< size of the __args__ array */
};

/** @ingroup API
 * The weight of each item in the bucket when
 * __h.alg__ == ::CRUSH_BUCKET_UNIFORM.
 */
struct crush_bucket_uniform {
	struct crush_bucket h; /*!< generic bucket information */
	__u32 item_weight;  /*!< 16.16 fixed point weight for each item */
};

/** @ingroup API
 * The weight of each item in the bucket when
 * __h.alg__ == ::CRUSH_BUCKET_LIST.
 *
 * The weight of __h.items[i]__ is __item_weights[i]__ for i in
 * [0,__h.size__[. The __sum_weight__[i] is the sum of the __item_weights[j]__
 * for j in [0,i[.
 *
 */
struct crush_bucket_list {
	struct crush_bucket h; /*!< generic bucket information */
	__u32 *item_weights;  /*!< 16.16 fixed point weight for each item */
	__u32 *sum_weights;   /*!< 16.16 fixed point sum of the weights */
};

/*
 * Bucket layout used when h.alg == CRUSH_BUCKET_TREE (an algorithm the
 * public documentation above marks as internal).
 */
struct crush_bucket_tree {
	struct crush_bucket h;  /* note: h.size is _tree_ size, not number of
				   actual items */
	__u8 num_nodes;
	__u32 *node_weights;
};

/*
 * Bucket layout used when h.alg == CRUSH_BUCKET_STRAW (an algorithm the
 * public documentation above marks as internal).
 */
struct crush_bucket_straw {
	struct crush_bucket h;
	__u32 *item_weights;   /* 16-bit fixed point */
	__u32 *straws;         /* 16-bit fixed point */
};

/** @ingroup API
 * The weight of each item in the bucket when
 * __h.alg__ == ::CRUSH_BUCKET_STRAW2.
 *
 * The weight of __h.items[i]__ is __item_weights[i]__ for i in
 * [0,__h.size__[.
 */
struct crush_bucket_straw2 {
	struct crush_bucket h; /*!< generic bucket information */
	__u32 *item_weights;   /*!< 16.16 fixed point weight for each item */
};



/** @ingroup API
 *
 * A crush map defines a hierarchy of crush_bucket that end with leaves
 * (buckets and leaves are called items) and a set of crush_rule to
 * map an integer to items with the crush_do_rule() function.
 *
 */
struct crush_map {
	/*! An array of crush_bucket pointers of size __max_buckets__.
	 * An element of the array may be NULL if the bucket was removed with
	 * crush_remove_bucket(). The buckets must be added with crush_add_bucket().
	 * The bucket found at __buckets[i]__ must have a crush_bucket.id == -1-i.
	 */
	struct crush_bucket **buckets;
	/*! An array of crush_rule pointers of size __max_rules__.
	 * An element of the array may be NULL if the rule was removed (there is
	 * no API to do so but there may be one in the future). The rules must be added
	 * with crush_add_rule().
	 */
	struct crush_rule **rules;
	__s32 max_buckets; /*!< the size of __buckets__ */
	__u32 max_rules; /*!< the size of __rules__ */
	/*! The value of the highest item stored in the crush_map + 1
	 */
	__s32 max_devices;

	/*! Backward compatibility tunable. It implements a bad solution
	 * and must always be set to 0 except for backward compatibility
	 * purposes
	 */
	__u32 choose_local_tries;
	/*! Backward compatibility tunable. It implements a bad solution
	 * and must always be set to 0 except for backward compatibility
	 * purposes
	 */
	__u32 choose_local_fallback_tries;
	/*! Tunable. The default value when the CHOOSE_TRIES or
	 * CHOOSELEAF_TRIES steps are omitted in a rule. See the
	 * documentation for crush_rule_set_step() for more
	 * information
	 */
	__u32 choose_total_tries;
	/*! Backward compatibility tunable. It should always be set
	 *  to 1 except for backward compatibility. Implemented in 2012
	 *  it was generalized late 2013 and is mostly unused except
	 *  in one border case, reason why it must be set to 1.
	 *
	 *  Attempt chooseleaf inner descent once for firstn mode; on
	 *  reject retry outer descent.  Note that this does *not*
	 *  apply to a collision: in that case we will retry as we
	 *  used to.
	 */
	__u32 chooseleaf_descend_once;
	/*! Backward compatibility tunable. It is a fix for bad
	 *  mappings implemented in 2014 at
	 *  https://github.com/ceph/ceph/pull/1185. It should always
	 *  be set to 1 except for backward compatibility.
	 *
	 *  If non-zero, feed r into chooseleaf, bit-shifted right by
	 *  (r-1) bits.  a value of 1 is best for new clusters.  for
	 *  legacy clusters that want to limit reshuffling, a value of
	 *  3 or 4 will make the mappings line up a bit better with
	 *  previous mappings.
	 */
	__u8 chooseleaf_vary_r;

	/*! Backward compatibility tunable. It is an improvement that
	 *  avoids unnecessary mapping changes, implemented at
	 *  https://github.com/ceph/ceph/pull/6572 and explained in
	 *  this post: "chooseleaf may cause some unnecessary pg
	 *  migrations" in October 2015
	 *  https://www.mail-archive.com/ceph-devel@vger.kernel.org/msg26075.html
	 *  It should always be set to 1 except for backward compatibility.
	 */
	__u8 chooseleaf_stable;

	/*! @cond INTERNAL */
	/* This value is calculated after decode or construction by
	   the builder. It is exposed here (rather than having a
	   'build CRUSH working space' function) so that callers can
	   reserve a static buffer, allocate space on the stack, or
	   otherwise avoid calling into the heap allocator if they
	   want to. The size of the working space depends on the map,
	   while the size of the scratch vector passed to the mapper
	   depends on the size of the desired result set.

	   Nothing stops the caller from allocating both in one swell
	   foop and passing in two pointers, though. */
	size_t working_size;

#ifndef __KERNEL__
	/*! @endcond */
	/*! Backward compatibility tunable. It is a fix for the straw
	 *  scaler values for the straw algorithm which is deprecated
	 *  (straw2 replaces it) implemented at
	 *  https://github.com/ceph/ceph/pull/3057. It should always
	 *  be set to 1 except for backward compatibility.
	 *
	 */
	__u8 straw_calc_version;

	/*! @cond INTERNAL */
	/*
	 * allowed bucket algs is a bitmask, here the bit positions
	 * are CRUSH_BUCKET_*.  note that these are *bits* and
	 * CRUSH_BUCKET_* values are not, so we need to or together (1
	 * << CRUSH_BUCKET_WHATEVER).  The 0th bit is not used to
	 * minimize confusion (bucket type values start at 1).
	 */
	__u32 allowed_bucket_algs;

	__u32 *choose_tries;
#endif
	/*! @endcond */
};


/* crush.c */
/** @ingroup API
 *
 * Return the 16.16 fixed point weight of the item at __pos__ (zero
 * based index) within the bucket __b__. If __pos__ is negative or
 * greater than or equal to the number of items in the bucket, return 0.
 *
 * @param b the bucket containing items
 * @param pos the zero based index of the item
 *
 * @returns the 16.16 fixed point item weight
 */
extern int crush_get_bucket_item_weight(const struct crush_bucket *b, int pos);
extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b);
extern void crush_destroy_bucket_list(struct crush_bucket_list *b);
extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b);
extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b);
extern void crush_destroy_bucket_straw2(struct crush_bucket_straw2 *b);
/** @ingroup API
 *
 * Deallocate a bucket created via crush_add_bucket().
 *
 * @param b the bucket to deallocate
 */
extern void crush_destroy_bucket(struct crush_bucket *b);
/** @ingroup API
 *
 * Deallocate a rule created via crush_add_rule().
 *
 * @param r the rule to deallocate
 */
extern void crush_destroy_rule(struct crush_rule *r);
/** @ingroup API
 *
 * Deallocate the __map__, previously allocated with crush_create.
 *
 * @param map the crush map
 */
extern void crush_destroy(struct crush_map *map);

/*
 * Compute ((i+1) << 1) - 1, i.e. map 0, 1, 2, ... to the odd numbers
 * 1, 3, 5, ...
 *
 * NOTE(review): presumably this is the node index of the i-th item in a
 * tree bucket's node array -- confirm against the tree bucket builder.
 */
static inline int crush_calc_tree_node(int i)
{
	return ((i+1) << 1)-1;
}

/*
 * Return a constant, human-readable name for the bucket algorithm
 * __alg__ (one of the CRUSH_BUCKET_* values), or "unknown" when
 * __alg__ is not a known algorithm. The returned string is a literal
 * and must not be freed or modified.
 */
static inline const char *crush_alg_name(int alg)
{
	switch (alg) {
	case CRUSH_BUCKET_UNIFORM:
		return "uniform";
	case CRUSH_BUCKET_LIST:
		return "list";
	case CRUSH_BUCKET_TREE:
		return "tree";
	case CRUSH_BUCKET_STRAW:
		return "straw";
	case CRUSH_BUCKET_STRAW2:
		return "straw2";
	default:
		return "unknown";
	}
}

/* ---------------------------------------------------------------------
			       Private
   --------------------------------------------------------------------- */

/* These data structures are private to the CRUSH implementation. They
   are exposed in this header file because builder needs their
   definitions to calculate the total working size.

   Moving this out of the crush map allows us to treat the CRUSH map as
   immutable within the mapper and removes the requirement for a CRUSH
   map lock. */

struct crush_work_bucket {
	__u32 perm_x; /* @x for which *perm is defined */
	__u32 perm_n; /* num elements of *perm that are permuted/defined */
	__u32 *perm;  /* Permutation of the bucket's items */
};

struct crush_work {
	struct crush_work_bucket **work; /* Per-bucket working store */
};

#endif