diff options
Diffstat (limited to 'src/os/bluestore/AvlAllocator.cc')
-rw-r--r-- | src/os/bluestore/AvlAllocator.cc | 474 |
1 files changed, 474 insertions, 0 deletions
diff --git a/src/os/bluestore/AvlAllocator.cc b/src/os/bluestore/AvlAllocator.cc new file mode 100644 index 000000000..5f17a3689 --- /dev/null +++ b/src/os/bluestore/AvlAllocator.cc @@ -0,0 +1,474 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "AvlAllocator.h" + +#include <limits> + +#include "common/config_proxy.h" +#include "common/debug.h" + +#define dout_context cct +#define dout_subsys ceph_subsys_bluestore +#undef dout_prefix +#define dout_prefix *_dout << "AvlAllocator " + +MEMPOOL_DEFINE_OBJECT_FACTORY(range_seg_t, range_seg_t, bluestore_alloc); + +namespace { + // a light-weight "range_seg_t", which only used as the key when searching in + // range_tree and range_size_tree + struct range_t { + uint64_t start; + uint64_t end; + }; +} + +/* + * This is a helper function that can be used by the allocator to find + * a suitable block to allocate. This will search the specified AVL + * tree looking for a block that matches the specified criteria. + */ +uint64_t AvlAllocator::_pick_block_after(uint64_t *cursor, + uint64_t size, + uint64_t align) +{ + const auto compare = range_tree.key_comp(); + uint32_t search_count = 0; + uint64_t search_bytes = 0; + auto rs_start = range_tree.lower_bound(range_t{*cursor, size}, compare); + for (auto rs = rs_start; rs != range_tree.end(); ++rs) { + uint64_t offset = p2roundup(rs->start, align); + *cursor = offset + size; + if (offset + size <= rs->end) { + return offset; + } + if (max_search_count > 0 && ++search_count > max_search_count) { + return -1ULL; + } + if (search_bytes = rs->start - rs_start->start; + max_search_bytes > 0 && search_bytes > max_search_bytes) { + return -1ULL; + } + } + + if (*cursor == 0) { + // If we already started from beginning, don't bother with searching from beginning + return -1ULL; + } + // If we reached end, start from beginning till cursor. + for (auto rs = range_tree.begin(); rs != rs_start; ++rs) { + uint64_t offset = p2roundup(rs->start, align); + *cursor = offset + size; + if (offset + size <= rs->end) { + return offset; + } + if (max_search_count > 0 && ++search_count > max_search_count) { + return -1ULL; + } + if (max_search_bytes > 0 && search_bytes + rs->start > max_search_bytes) { + return -1ULL; + } + } + return -1ULL; +} + +uint64_t AvlAllocator::_pick_block_fits(uint64_t size, + uint64_t align) +{ + // instead of searching from cursor, just pick the smallest range which fits + // the needs + const auto compare = range_size_tree.key_comp(); + auto rs_start = range_size_tree.lower_bound(range_t{0, size}, compare); + for (auto rs = rs_start; rs != range_size_tree.end(); ++rs) { + uint64_t offset = p2roundup(rs->start, align); + if (offset + size <= rs->end) { + return offset; + } + } + return -1ULL; +} + +void AvlAllocator::_add_to_tree(uint64_t start, uint64_t size) +{ + ceph_assert(size != 0); + + uint64_t end = start + size; + + auto rs_after = range_tree.upper_bound(range_t{start, end}, + range_tree.key_comp()); + + /* Make sure we don't overlap with either of our neighbors */ + auto rs_before = range_tree.end(); + if (rs_after != range_tree.begin()) { + rs_before = std::prev(rs_after); + } + + bool merge_before = (rs_before != range_tree.end() && rs_before->end == start); + bool merge_after = (rs_after != range_tree.end() && rs_after->start == end); + + if (merge_before && merge_after) { + _range_size_tree_rm(*rs_before); + _range_size_tree_rm(*rs_after); + rs_after->start = rs_before->start; + range_tree.erase_and_dispose(rs_before, dispose_rs{}); + _range_size_tree_try_insert(*rs_after); + } else if (merge_before) { + _range_size_tree_rm(*rs_before); + rs_before->end = end; + _range_size_tree_try_insert(*rs_before); + } else if (merge_after) { + _range_size_tree_rm(*rs_after); + rs_after->start = start; + _range_size_tree_try_insert(*rs_after); + } else { + _try_insert_range(start, end, &rs_after); + } +} + +void AvlAllocator::_process_range_removal(uint64_t start, uint64_t end, + AvlAllocator::range_tree_t::iterator& rs) +{ + bool left_over = (rs->start != start); + bool right_over = (rs->end != end); + + _range_size_tree_rm(*rs); + + if (left_over && right_over) { + auto old_right_end = rs->end; + auto insert_pos = rs; + ceph_assert(insert_pos != range_tree.end()); + ++insert_pos; + rs->end = start; + + // Insert tail first to be sure insert_pos hasn't been disposed. + // This woulnd't dispose rs though since it's out of range_size_tree. + // Don't care about a small chance of 'not-the-best-choice-for-removal' case + // which might happen if rs has the lowest size. + _try_insert_range(end, old_right_end, &insert_pos); + _range_size_tree_try_insert(*rs); + + } else if (left_over) { + rs->end = start; + _range_size_tree_try_insert(*rs); + } else if (right_over) { + rs->start = end; + _range_size_tree_try_insert(*rs); + } else { + range_tree.erase_and_dispose(rs, dispose_rs{}); + } +} + +void AvlAllocator::_remove_from_tree(uint64_t start, uint64_t size) +{ + uint64_t end = start + size; + + ceph_assert(size != 0); + ceph_assert(size <= num_free); + + auto rs = range_tree.find(range_t{start, end}, range_tree.key_comp()); + /* Make sure we completely overlap with someone */ + ceph_assert(rs != range_tree.end()); + ceph_assert(rs->start <= start); + ceph_assert(rs->end >= end); + + _process_range_removal(start, end, rs); +} + +void AvlAllocator::_try_remove_from_tree(uint64_t start, uint64_t size, + std::function<void(uint64_t, uint64_t, bool)> cb) +{ + uint64_t end = start + size; + + ceph_assert(size != 0); + + auto rs = range_tree.find(range_t{ start, end }, + range_tree.key_comp()); + + if (rs == range_tree.end() || rs->start >= end) { + cb(start, size, false); + return; + } + + do { + + auto next_rs = rs; + ++next_rs; + + if (start < rs->start) { + cb(start, rs->start - start, false); + start = rs->start; + } + auto range_end = std::min(rs->end, end); + _process_range_removal(start, range_end, rs); + cb(start, range_end - start, true); + start = range_end; + + rs = next_rs; + } while (rs != range_tree.end() && rs->start < end && start < end); + if (start < end) { + cb(start, end - start, false); + } +} + +int64_t AvlAllocator::_allocate( + uint64_t want, + uint64_t unit, + uint64_t max_alloc_size, + int64_t hint, // unused, for now! + PExtentVector* extents) +{ + uint64_t allocated = 0; + while (allocated < want) { + uint64_t offset, length; + int r = _allocate(std::min(max_alloc_size, want - allocated), + unit, &offset, &length); + if (r < 0) { + // Allocation failed. + break; + } + extents->emplace_back(offset, length); + allocated += length; + } + return allocated ? allocated : -ENOSPC; +} + +int AvlAllocator::_allocate( + uint64_t size, + uint64_t unit, + uint64_t *offset, + uint64_t *length) +{ + uint64_t max_size = 0; + if (auto p = range_size_tree.rbegin(); p != range_size_tree.rend()) { + max_size = p->end - p->start; + } + + bool force_range_size_alloc = false; + if (max_size < size) { + if (max_size < unit) { + return -ENOSPC; + } + size = p2align(max_size, unit); + ceph_assert(size > 0); + force_range_size_alloc = true; + } + + const int free_pct = num_free * 100 / num_total; + uint64_t start = 0; + // If we're running low on space, find a range by size by looking up in the size + // sorted tree (best-fit), instead of searching in the area pointed by cursor + if (force_range_size_alloc || + max_size < range_size_alloc_threshold || + free_pct < range_size_alloc_free_pct) { + start = -1ULL; + } else { + /* + * Find the largest power of 2 block size that evenly divides the + * requested size. This is used to try to allocate blocks with similar + * alignment from the same area (i.e. same cursor bucket) but it does + * not guarantee that other allocations sizes may exist in the same + * region. + */ + uint64_t align = size & -size; + ceph_assert(align != 0); + uint64_t* cursor = &lbas[cbits(align) - 1]; + start = _pick_block_after(cursor, size, unit); + dout(20) << __func__ << " first fit=" << start << " size=" << size << dendl; + } + if (start == -1ULL) { + do { + start = _pick_block_fits(size, unit); + dout(20) << __func__ << " best fit=" << start << " size=" << size << dendl; + if (start != uint64_t(-1ULL)) { + break; + } + // try to collect smaller extents as we could fail to retrieve + // that large block due to misaligned extents + size = p2align(size >> 1, unit); + } while (size >= unit); + } + if (start == -1ULL) { + return -ENOSPC; + } + + _remove_from_tree(start, size); + + *offset = start; + *length = size; + return 0; +} + +void AvlAllocator::_release(const interval_set<uint64_t>& release_set) +{ + for (auto p = release_set.begin(); p != release_set.end(); ++p) { + const auto offset = p.get_start(); + const auto length = p.get_len(); + ldout(cct, 10) << __func__ << std::hex + << " offset 0x" << offset + << " length 0x" << length + << std::dec << dendl; + _add_to_tree(offset, length); + } +} + +void AvlAllocator::_release(const PExtentVector& release_set) { + for (auto& e : release_set) { + ldout(cct, 10) << __func__ << std::hex + << " offset 0x" << e.offset + << " length 0x" << e.length + << std::dec << dendl; + _add_to_tree(e.offset, e.length); + } +} + +void AvlAllocator::_shutdown() +{ + range_size_tree.clear(); + range_tree.clear_and_dispose(dispose_rs{}); +} + +AvlAllocator::AvlAllocator(CephContext* cct, + int64_t device_size, + int64_t block_size, + uint64_t max_mem, + const std::string& name) : + Allocator(name, device_size, block_size), + num_total(device_size), + block_size(block_size), + range_size_alloc_threshold( + cct->_conf.get_val<uint64_t>("bluestore_avl_alloc_bf_threshold")), + range_size_alloc_free_pct( + cct->_conf.get_val<uint64_t>("bluestore_avl_alloc_bf_free_pct")), + max_search_count( + cct->_conf.get_val<uint64_t>("bluestore_avl_alloc_ff_max_search_count")), + max_search_bytes( + cct->_conf.get_val<Option::size_t>("bluestore_avl_alloc_ff_max_search_bytes")), + range_count_cap(max_mem / sizeof(range_seg_t)), + cct(cct) +{} + +AvlAllocator::AvlAllocator(CephContext* cct, + int64_t device_size, + int64_t block_size, + const std::string& name) : + AvlAllocator(cct, device_size, block_size, 0 /* max_mem */, name) +{} + +AvlAllocator::~AvlAllocator() +{ + shutdown(); +} + +int64_t AvlAllocator::allocate( + uint64_t want, + uint64_t unit, + uint64_t max_alloc_size, + int64_t hint, // unused, for now! + PExtentVector* extents) +{ + ldout(cct, 10) << __func__ << std::hex + << " want 0x" << want + << " unit 0x" << unit + << " max_alloc_size 0x" << max_alloc_size + << " hint 0x" << hint + << std::dec << dendl; + ceph_assert(isp2(unit)); + ceph_assert(want % unit == 0); + + if (max_alloc_size == 0) { + max_alloc_size = want; + } + if (constexpr auto cap = std::numeric_limits<decltype(bluestore_pextent_t::length)>::max(); + max_alloc_size >= cap) { + max_alloc_size = p2align(uint64_t(cap), (uint64_t)block_size); + } + std::lock_guard l(lock); + return _allocate(want, unit, max_alloc_size, hint, extents); +} + +void AvlAllocator::release(const interval_set<uint64_t>& release_set) { + std::lock_guard l(lock); + _release(release_set); +} + +uint64_t AvlAllocator::get_free() +{ + std::lock_guard l(lock); + return num_free; +} + +double AvlAllocator::get_fragmentation() +{ + std::lock_guard l(lock); + return _get_fragmentation(); +} + +void AvlAllocator::dump() +{ + std::lock_guard l(lock); + _dump(); +} + +void AvlAllocator::_dump() const +{ + ldout(cct, 0) << __func__ << " range_tree: " << dendl; + for (auto& rs : range_tree) { + ldout(cct, 0) << std::hex + << "0x" << rs.start << "~" << rs.end + << std::dec + << dendl; + } + ldout(cct, 0) << __func__ << " range_size_tree: " << dendl; + for (auto& rs : range_size_tree) { + ldout(cct, 0) << std::hex + << "0x" << rs.start << "~" << rs.end + << std::dec + << dendl; + } +} + +void AvlAllocator::foreach( + std::function<void(uint64_t offset, uint64_t length)> notify) +{ + std::lock_guard l(lock); + _foreach(notify); +} + +void AvlAllocator::_foreach( + std::function<void(uint64_t offset, uint64_t length)> notify) const +{ + for (auto& rs : range_tree) { + notify(rs.start, rs.end - rs.start); + } +} + +void AvlAllocator::init_add_free(uint64_t offset, uint64_t length) +{ + if (!length) + return; + std::lock_guard l(lock); + ldout(cct, 10) << __func__ << std::hex + << " offset 0x" << offset + << " length 0x" << length + << std::dec << dendl; + _add_to_tree(offset, length); +} + +void AvlAllocator::init_rm_free(uint64_t offset, uint64_t length) +{ + if (!length) + return; + std::lock_guard l(lock); + ldout(cct, 10) << __func__ << std::hex + << " offset 0x" << offset + << " length 0x" << length + << std::dec << dendl; + _remove_from_tree(offset, length); +} + +void AvlAllocator::shutdown() +{ + std::lock_guard l(lock); + _shutdown(); +} |