diff options
Diffstat (limited to 'src/vulkan/malloc.c')
-rw-r--r-- | src/vulkan/malloc.c | 1058 |
1 files changed, 1058 insertions, 0 deletions
diff --git a/src/vulkan/malloc.c b/src/vulkan/malloc.c new file mode 100644 index 0000000..c35183b --- /dev/null +++ b/src/vulkan/malloc.c @@ -0,0 +1,1058 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "malloc.h" +#include "command.h" +#include "utils.h" +#include "pl_thread.h" + +#ifdef PL_HAVE_UNIX +#include <errno.h> +#include <unistd.h> +#endif + +// Controls the page size alignment, to help coalesce allocations into the same +// slab. Pages are rounded up to multiples of this value. (Default: 4 KB) +#define PAGE_SIZE_ALIGN (1LLU << 12) + +// Controls the minimum/maximum number of pages for new slabs. As slabs are +// exhausted of memory, the number of pages per new slab grows exponentially, +// starting with the minimum until the maximum is reached. +// +// Note: The maximum must never exceed the size of `vk_slab.spacemap`. +#define MINIMUM_PAGE_COUNT 4 +#define MAXIMUM_PAGE_COUNT (sizeof(uint64_t) * 8) + +// Controls the maximum page size. Any allocations above this threshold +// (absolute size or fraction of VRAM, whichever is higher) will be served by +// dedicated allocations. (Default: 64 MB or 1/16 of VRAM) +#define MAXIMUM_PAGE_SIZE_ABSOLUTE (1LLU << 26) +#define MAXIMUM_PAGE_SIZE_RELATIVE 16 + +// Controls the minimum slab size, to avoid excessive re-allocation of very +// small slabs. (Default: 256 KB) +#define MINIMUM_SLAB_SIZE (1LLU << 18) + +// How long to wait before garbage collecting empty slabs. Slabs older than +// this many invocations of `vk_malloc_garbage_collect` will be released. +#define MAXIMUM_SLAB_AGE 32 + +// A single slab represents a contiguous region of allocated memory. Actual +// allocations are served as pages of this. Slabs are organized into pools, +// each of which contains a list of slabs of differing page sizes. +struct vk_slab { + pl_mutex lock; + pl_debug_tag debug_tag; // debug tag of the triggering allocation + VkDeviceMemory mem; // underlying device allocation + VkDeviceSize size; // total allocated size of `mem` + VkMemoryType mtype; // underlying memory type + bool dedicated; // slab is allocated specifically for one object + bool imported; // slab represents an imported memory allocation + + // free space accounting (only for non-dedicated slabs) + uint64_t spacemap; // bitset of available pages + size_t pagesize; // size in bytes per page + size_t used; // number of bytes actually in use + uint64_t age; // timestamp of last use + + // optional, depends on the memory type: + VkBuffer buffer; // buffer spanning the entire slab + void *data; // mapped memory corresponding to `mem` + bool coherent; // mapped memory is coherent + union pl_handle handle; // handle associated with this device memory + enum pl_handle_type handle_type; +}; + +// Represents a single memory pool. We keep track of a vk_pool for each +// combination of malloc parameters. This shouldn't actually be that many in +// practice, because some combinations simply never occur, and others will +// generally be the same for the same objects. +// +// Note: `vk_pool` addresses are not immutable, so we mustn't expose any +// dangling references to a `vk_pool` from e.g. `vk_memslice.priv = vk_slab`. +struct vk_pool { + struct vk_malloc_params params; // allocation params (with some fields nulled) + PL_ARRAY(struct vk_slab *) slabs; // array of slabs, unsorted + int index; // running index in `vk_malloc.pools` +}; + +// The overall state of the allocator, which keeps track of a vk_pool for each +// memory type. +struct vk_malloc { + struct vk_ctx *vk; + pl_mutex lock; + VkPhysicalDeviceMemoryProperties props; + size_t maximum_page_size; + PL_ARRAY(struct vk_pool) pools; + uint64_t age; +}; + +static inline float efficiency(size_t used, size_t total) +{ + if (!total) + return 100.0; + + return 100.0f * used / total; +} + +static const char *print_size(char buf[8], size_t size) +{ + const char *suffixes = "\0KMG"; + while (suffixes[1] && size > 9999) { + size >>= 10; + suffixes++; + } + + int ret = *suffixes ? snprintf(buf, 8, "%4zu%c", size, *suffixes) + : snprintf(buf, 8, "%5zu", size); + + return ret >= 0 ? buf : "(error)"; +} + +#define PRINT_SIZE(x) (print_size((char[8]){0}, (size_t) (x))) + +void vk_malloc_print_stats(struct vk_malloc *ma, enum pl_log_level lev) +{ + struct vk_ctx *vk = ma->vk; + size_t total_size = 0; + size_t total_used = 0; + size_t total_res = 0; + + PL_MSG(vk, lev, "Memory heaps supported by device:"); + for (int i = 0; i < ma->props.memoryHeapCount; i++) { + VkMemoryHeap heap = ma->props.memoryHeaps[i]; + PL_MSG(vk, lev, " %d: flags 0x%x size %s", + i, (unsigned) heap.flags, PRINT_SIZE(heap.size)); + } + + PL_DEBUG(vk, "Memory types supported by device:"); + for (int i = 0; i < ma->props.memoryTypeCount; i++) { + VkMemoryType type = ma->props.memoryTypes[i]; + PL_DEBUG(vk, " %d: flags 0x%x heap %d", + i, (unsigned) type.propertyFlags, (int) type.heapIndex); + } + + pl_mutex_lock(&ma->lock); + for (int i = 0; i < ma->pools.num; i++) { + struct vk_pool *pool = &ma->pools.elem[i]; + const struct vk_malloc_params *par = &pool->params; + + PL_MSG(vk, lev, "Memory pool %d:", i); + PL_MSG(vk, lev, " Compatible types: 0x%"PRIx32, par->reqs.memoryTypeBits); + if (par->required) + PL_MSG(vk, lev, " Required flags: 0x%"PRIx32, par->required); + if (par->optimal) + PL_MSG(vk, lev, " Optimal flags: 0x%"PRIx32, par->optimal); + if (par->buf_usage) + PL_MSG(vk, lev, " Buffer flags: 0x%"PRIx32, par->buf_usage); + if (par->export_handle) + PL_MSG(vk, lev, " Export handle: 0x%x", par->export_handle); + + size_t pool_size = 0; + size_t pool_used = 0; + size_t pool_res = 0; + + for (int j = 0; j < pool->slabs.num; j++) { + struct vk_slab *slab = pool->slabs.elem[j]; + pl_mutex_lock(&slab->lock); + + size_t avail = __builtin_popcountll(slab->spacemap) * slab->pagesize; + size_t slab_res = slab->size - avail; + + PL_MSG(vk, lev, " Slab %2d: %8"PRIx64" x %s: " + "%s used %s res %s alloc from heap %d, efficiency %.2f%% [%s]", + j, slab->spacemap, PRINT_SIZE(slab->pagesize), + PRINT_SIZE(slab->used), PRINT_SIZE(slab_res), + PRINT_SIZE(slab->size), (int) slab->mtype.heapIndex, + efficiency(slab->used, slab_res), + PL_DEF(slab->debug_tag, "unknown")); + + pool_size += slab->size; + pool_used += slab->used; + pool_res += slab_res; + pl_mutex_unlock(&slab->lock); + } + + PL_MSG(vk, lev, " Pool summary: %s used %s res %s alloc, " + "efficiency %.2f%%, utilization %.2f%%", + PRINT_SIZE(pool_used), PRINT_SIZE(pool_res), + PRINT_SIZE(pool_size), efficiency(pool_used, pool_res), + efficiency(pool_res, pool_size)); + + total_size += pool_size; + total_used += pool_used; + total_res += pool_res; + } + pl_mutex_unlock(&ma->lock); + + PL_MSG(vk, lev, "Memory summary: %s used %s res %s alloc, " + "efficiency %.2f%%, utilization %.2f%%, max page: %s", + PRINT_SIZE(total_used), PRINT_SIZE(total_res), + PRINT_SIZE(total_size), efficiency(total_used, total_res), + efficiency(total_res, total_size), + PRINT_SIZE(ma->maximum_page_size)); +} + +static void slab_free(struct vk_ctx *vk, struct vk_slab *slab) +{ + if (!slab) + return; + +#ifndef NDEBUG + if (!slab->dedicated && slab->used > 0) { + PL_WARN(vk, "Leaked %zu bytes of vulkan memory!", slab->used); + PL_WARN(vk, "slab total size: %zu bytes, heap: %d, flags: 0x%"PRIX64, + (size_t) slab->size, (int) slab->mtype.heapIndex, + (uint64_t) slab->mtype.propertyFlags); + if (slab->debug_tag) + PL_WARN(vk, "last used for: %s", slab->debug_tag); + pl_log_stack_trace(vk->log, PL_LOG_WARN); + pl_debug_abort(); + } +#endif + + if (slab->imported) { + switch (slab->handle_type) { + case PL_HANDLE_FD: + case PL_HANDLE_DMA_BUF: + PL_TRACE(vk, "Unimporting slab of size %s from fd: %d", + PRINT_SIZE(slab->size), slab->handle.fd); + break; + case PL_HANDLE_WIN32: + case PL_HANDLE_WIN32_KMT: +#ifdef PL_HAVE_WIN32 + PL_TRACE(vk, "Unimporting slab of size %s from handle: %p", + PRINT_SIZE(slab->size), (void *) slab->handle.handle); +#endif + break; + case PL_HANDLE_HOST_PTR: + PL_TRACE(vk, "Unimporting slab of size %s from ptr: %p", + PRINT_SIZE(slab->size), (void *) slab->handle.ptr); + break; + case PL_HANDLE_IOSURFACE: + case PL_HANDLE_MTL_TEX: + pl_unreachable(); + } + } else { + switch (slab->handle_type) { + case PL_HANDLE_FD: + case PL_HANDLE_DMA_BUF: +#ifdef PL_HAVE_UNIX + if (slab->handle.fd > -1) + close(slab->handle.fd); +#endif + break; + case PL_HANDLE_WIN32: +#ifdef PL_HAVE_WIN32 + if (slab->handle.handle != NULL) + CloseHandle(slab->handle.handle); +#endif + break; + case PL_HANDLE_WIN32_KMT: + // PL_HANDLE_WIN32_KMT is just an identifier. It doesn't get closed. + break; + case PL_HANDLE_HOST_PTR: + // Implicitly unmapped + break; + case PL_HANDLE_IOSURFACE: + case PL_HANDLE_MTL_TEX: + pl_unreachable(); + } + + PL_DEBUG(vk, "Freeing slab of size %s", PRINT_SIZE(slab->size)); + } + + vk->DestroyBuffer(vk->dev, slab->buffer, PL_VK_ALLOC); + // also implicitly unmaps the memory if needed + vk->FreeMemory(vk->dev, slab->mem, PL_VK_ALLOC); + + pl_mutex_destroy(&slab->lock); + pl_free(slab); +} + +// type_mask: optional +// thread-safety: safe +static bool find_best_memtype(const struct vk_malloc *ma, uint32_t type_mask, + const struct vk_malloc_params *params, + uint32_t *out_index) +{ + struct vk_ctx *vk = ma->vk; + int best = -1; + + // The vulkan spec requires memory types to be sorted in the "optimal" + // order, so the first matching type we find will be the best/fastest one. + // That being said, we still want to prioritize memory types that have + // better optional flags. + + type_mask &= params->reqs.memoryTypeBits; + for (int i = 0; i < ma->props.memoryTypeCount; i++) { + const VkMemoryType *mtype = &ma->props.memoryTypes[i]; + + // The memory type flags must include our properties + if ((mtype->propertyFlags & params->required) != params->required) + continue; + + // The memory heap must be large enough for the allocation + VkDeviceSize heapSize = ma->props.memoryHeaps[mtype->heapIndex].size; + if (params->reqs.size > heapSize) + continue; + + // The memory type must be supported by the type mask (bitfield) + if (!(type_mask & (1LU << i))) + continue; + + // Calculate the score as the number of optimal property flags matched + int score = __builtin_popcountl(mtype->propertyFlags & params->optimal); + if (score > best) { + *out_index = i; + best = score; + } + } + + if (best < 0) { + PL_ERR(vk, "Found no memory type matching property flags 0x%x and type " + "bits 0x%x!", + (unsigned) params->required, (unsigned) type_mask); + return false; + } + + return true; +} + +static bool buf_external_check(struct vk_ctx *vk, VkBufferUsageFlags usage, + enum pl_handle_type handle_type, bool import) +{ + if (!handle_type) + return true; + + VkPhysicalDeviceExternalBufferInfo info = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_BUFFER_INFO_KHR, + .usage = usage, + .handleType = vk_mem_handle_type(handle_type), + }; + + VkExternalBufferProperties props = { + .sType = VK_STRUCTURE_TYPE_EXTERNAL_BUFFER_PROPERTIES_KHR, + }; + + if (!info.handleType) + return false; + + vk->GetPhysicalDeviceExternalBufferProperties(vk->physd, &info, &props); + return vk_external_mem_check(vk, &props.externalMemoryProperties, + handle_type, import); +} + +// thread-safety: safe +static struct vk_slab *slab_alloc(struct vk_malloc *ma, + const struct vk_malloc_params *params) +{ + struct vk_ctx *vk = ma->vk; + struct vk_slab *slab = pl_alloc_ptr(NULL, slab); + *slab = (struct vk_slab) { + .age = ma->age, + .size = params->reqs.size, + .handle_type = params->export_handle, + .debug_tag = params->debug_tag, + }; + pl_mutex_init(&slab->lock); + + switch (slab->handle_type) { + case PL_HANDLE_FD: + case PL_HANDLE_DMA_BUF: + slab->handle.fd = -1; + break; + case PL_HANDLE_WIN32: + case PL_HANDLE_WIN32_KMT: + case PL_HANDLE_MTL_TEX: + case PL_HANDLE_IOSURFACE: + slab->handle.handle = NULL; + break; + case PL_HANDLE_HOST_PTR: + slab->handle.ptr = NULL; + break; + } + + VkExportMemoryAllocateInfoKHR ext_info = { + .sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO_KHR, + .handleTypes = vk_mem_handle_type(slab->handle_type), + }; + + uint32_t type_mask = UINT32_MAX; + if (params->buf_usage) { + // Queue family sharing modes don't matter for buffers, so we just + // set them as concurrent and stop worrying about it. + uint32_t qfs[3] = {0}; + pl_assert(vk->pools.num <= PL_ARRAY_SIZE(qfs)); + for (int i = 0; i < vk->pools.num; i++) + qfs[i] = vk->pools.elem[i]->qf; + + VkExternalMemoryBufferCreateInfoKHR ext_buf_info = { + .sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_BUFFER_CREATE_INFO_KHR, + .handleTypes = ext_info.handleTypes, + }; + + VkBufferCreateInfo binfo = { + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .pNext = slab->handle_type ? &ext_buf_info : NULL, + .size = slab->size, + .usage = params->buf_usage, + .sharingMode = vk->pools.num > 1 ? VK_SHARING_MODE_CONCURRENT + : VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = vk->pools.num, + .pQueueFamilyIndices = qfs, + }; + + if (!buf_external_check(vk, binfo.usage, slab->handle_type, false)) { + PL_ERR(vk, "Failed allocating shared memory buffer: possibly " + "the handle type is unsupported?"); + goto error; + } + + VK(vk->CreateBuffer(vk->dev, &binfo, PL_VK_ALLOC, &slab->buffer)); + PL_VK_NAME(BUFFER, slab->buffer, "slab"); + + VkMemoryRequirements reqs = {0}; + vk->GetBufferMemoryRequirements(vk->dev, slab->buffer, &reqs); + slab->size = reqs.size; // this can be larger than `slab->size` + type_mask = reqs.memoryTypeBits; + + // Note: we can ignore `reqs.align` because we always bind the buffer + // memory to offset 0 + } + + VkMemoryAllocateInfo minfo = { + .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, + .allocationSize = slab->size, + }; + + if (params->export_handle) + vk_link_struct(&minfo, &ext_info); + + VkMemoryDedicatedAllocateInfoKHR dinfo = { + .sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO_KHR, + .image = params->ded_image, + }; + + if (params->ded_image) + vk_link_struct(&minfo, &dinfo); + + if (!find_best_memtype(ma, type_mask, params, &minfo.memoryTypeIndex)) + goto error; + + const VkMemoryType *mtype = &ma->props.memoryTypes[minfo.memoryTypeIndex]; + PL_DEBUG(vk, "Allocating %zu memory of type 0x%x (id %d) in heap %d: %s", + (size_t) slab->size, (unsigned) mtype->propertyFlags, + (int) minfo.memoryTypeIndex, (int) mtype->heapIndex, + PL_DEF(params->debug_tag, "unknown")); + + pl_clock_t start = pl_clock_now(); + + VkResult res = vk->AllocateMemory(vk->dev, &minfo, PL_VK_ALLOC, &slab->mem); + switch (res) { + case VK_ERROR_OUT_OF_DEVICE_MEMORY: + case VK_ERROR_OUT_OF_HOST_MEMORY: + PL_ERR(vk, "Allocation of size %s failed: %s!", + PRINT_SIZE(slab->size), vk_res_str(res)); + vk_malloc_print_stats(ma, PL_LOG_ERR); + pl_log_stack_trace(vk->log, PL_LOG_ERR); + pl_debug_abort(); + goto error; + + default: + PL_VK_ASSERT(res, "vkAllocateMemory"); + } + + slab->mtype = *mtype; + if (mtype->propertyFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) { + VK(vk->MapMemory(vk->dev, slab->mem, 0, VK_WHOLE_SIZE, 0, &slab->data)); + slab->coherent = mtype->propertyFlags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; + } + + if (slab->buffer) + VK(vk->BindBufferMemory(vk->dev, slab->buffer, slab->mem, 0)); + +#ifdef PL_HAVE_UNIX + if (slab->handle_type == PL_HANDLE_FD || + slab->handle_type == PL_HANDLE_DMA_BUF) + { + VkMemoryGetFdInfoKHR fd_info = { + .sType = VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR, + .memory = slab->mem, + .handleType = ext_info.handleTypes, + }; + + VK(vk->GetMemoryFdKHR(vk->dev, &fd_info, &slab->handle.fd)); + } +#endif + +#ifdef PL_HAVE_WIN32 + if (slab->handle_type == PL_HANDLE_WIN32 || + slab->handle_type == PL_HANDLE_WIN32_KMT) + { + VkMemoryGetWin32HandleInfoKHR handle_info = { + .sType = VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR, + .memory = slab->mem, + .handleType = ext_info.handleTypes, + }; + + VK(vk->GetMemoryWin32HandleKHR(vk->dev, &handle_info, + &slab->handle.handle)); + } +#endif + + pl_log_cpu_time(vk->log, start, pl_clock_now(), "allocating slab"); + + // free space accounting is done by the caller + return slab; + +error: + if (params->debug_tag) + PL_ERR(vk, " for malloc: %s", params->debug_tag); + slab_free(vk, slab); + return NULL; +} + +static void pool_uninit(struct vk_ctx *vk, struct vk_pool *pool) +{ + for (int i = 0; i < pool->slabs.num; i++) + slab_free(vk, pool->slabs.elem[i]); + + pl_free(pool->slabs.elem); + *pool = (struct vk_pool) {0}; +} + +struct vk_malloc *vk_malloc_create(struct vk_ctx *vk) +{ + struct vk_malloc *ma = pl_zalloc_ptr(NULL, ma); + pl_mutex_init(&ma->lock); + vk->GetPhysicalDeviceMemoryProperties(vk->physd, &ma->props); + ma->vk = vk; + + // Determine maximum page size + ma->maximum_page_size = MAXIMUM_PAGE_SIZE_ABSOLUTE; + for (int i = 0; i < ma->props.memoryHeapCount; i++) { + VkMemoryHeap heap = ma->props.memoryHeaps[i]; + if (heap.flags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT) { + size_t size_max = heap.size / MAXIMUM_PAGE_SIZE_RELATIVE; + ma->maximum_page_size = PL_MAX(ma->maximum_page_size, size_max); + } + } + + vk_malloc_print_stats(ma, PL_LOG_INFO); + return ma; +} + +void vk_malloc_destroy(struct vk_malloc **ma_ptr) +{ + struct vk_malloc *ma = *ma_ptr; + if (!ma) + return; + + vk_malloc_print_stats(ma, PL_LOG_DEBUG); + for (int i = 0; i < ma->pools.num; i++) + pool_uninit(ma->vk, &ma->pools.elem[i]); + + pl_mutex_destroy(&ma->lock); + pl_free_ptr(ma_ptr); +} + +void vk_malloc_garbage_collect(struct vk_malloc *ma) +{ + struct vk_ctx *vk = ma->vk; + + pl_mutex_lock(&ma->lock); + ma->age++; + + for (int i = 0; i < ma->pools.num; i++) { + struct vk_pool *pool = &ma->pools.elem[i]; + for (int n = 0; n < pool->slabs.num; n++) { + struct vk_slab *slab = pool->slabs.elem[n]; + pl_mutex_lock(&slab->lock); + if (slab->used || (ma->age - slab->age) <= MAXIMUM_SLAB_AGE) { + pl_mutex_unlock(&slab->lock); + continue; + } + + PL_DEBUG(vk, "Garbage collected slab of size %s from pool %d", + PRINT_SIZE(slab->size), pool->index); + + pl_mutex_unlock(&slab->lock); + slab_free(ma->vk, slab); + PL_ARRAY_REMOVE_AT(pool->slabs, n--); + } + } + + pl_mutex_unlock(&ma->lock); +} + +pl_handle_caps vk_malloc_handle_caps(const struct vk_malloc *ma, bool import) +{ + struct vk_ctx *vk = ma->vk; + pl_handle_caps caps = 0; + + for (int i = 0; vk_mem_handle_list[i]; i++) { + // Try seeing if we could allocate a "basic" buffer using these + // capabilities, with no fancy buffer usage. More specific checks will + // happen down the line at VkBuffer creation time, but this should give + // us a rough idea of what the driver supports. + enum pl_handle_type type = vk_mem_handle_list[i]; + if (buf_external_check(vk, VK_BUFFER_USAGE_TRANSFER_DST_BIT, type, import)) + caps |= type; + } + + return caps; +} + +void vk_malloc_free(struct vk_malloc *ma, struct vk_memslice *slice) +{ + struct vk_ctx *vk = ma->vk; + struct vk_slab *slab = slice->priv; + if (!slab || slab->dedicated) { + slab_free(vk, slab); + goto done; + } + + pl_mutex_lock(&slab->lock); + + int page_idx = slice->offset / slab->pagesize; + slab->spacemap |= 0x1LLU << page_idx; + slab->used -= slice->size; + slab->age = ma->age; + pl_assert(slab->used >= 0); + + pl_mutex_unlock(&slab->lock); + +done: + *slice = (struct vk_memslice) {0}; +} + +static inline bool pool_params_eq(const struct vk_malloc_params *a, + const struct vk_malloc_params *b) +{ + return a->reqs.size == b->reqs.size && + a->reqs.alignment == b->reqs.alignment && + a->reqs.memoryTypeBits == b->reqs.memoryTypeBits && + a->required == b->required && + a->optimal == b->optimal && + a->buf_usage == b->buf_usage && + a->export_handle == b->export_handle; +} + +static struct vk_pool *find_pool(struct vk_malloc *ma, + const struct vk_malloc_params *params) +{ + pl_assert(!params->import_handle); + pl_assert(!params->ded_image); + + struct vk_malloc_params fixed = *params; + fixed.reqs.alignment = 0; + fixed.reqs.size = 0; + fixed.shared_mem = (struct pl_shared_mem) {0}; + + for (int i = 0; i < ma->pools.num; i++) { + if (pool_params_eq(&ma->pools.elem[i].params, &fixed)) + return &ma->pools.elem[i]; + } + + // Not found => add it + PL_ARRAY_GROW(ma, ma->pools); + size_t idx = ma->pools.num++; + ma->pools.elem[idx] = (struct vk_pool) { + .params = fixed, + .index = idx, + }; + return &ma->pools.elem[idx]; +} + +// Returns a suitable memory page from the pool. A new slab will be allocated +// under the hood, if necessary. +// +// Note: This locks the slab it returns +static struct vk_slab *pool_get_page(struct vk_malloc *ma, struct vk_pool *pool, + size_t size, size_t align, + VkDeviceSize *offset) +{ + struct vk_slab *slab = NULL; + int slab_pages = MINIMUM_PAGE_COUNT; + size = PL_ALIGN2(size, PAGE_SIZE_ALIGN); + const size_t pagesize = PL_ALIGN(size, align); + + for (int i = 0; i < pool->slabs.num; i++) { + slab = pool->slabs.elem[i]; + if (slab->pagesize < size) + continue; + if (slab->pagesize > pagesize * MINIMUM_PAGE_COUNT) // rough heuristic + continue; + if (slab->pagesize % align) + continue; + + pl_mutex_lock(&slab->lock); + int page_idx = __builtin_ffsll(slab->spacemap); + if (!page_idx--) { + pl_mutex_unlock(&slab->lock); + // Increase the number of slabs to allocate for new slabs the + // more existing full slabs exist for this size range + slab_pages = PL_MIN(slab_pages << 1, MAXIMUM_PAGE_COUNT); + continue; + } + + slab->spacemap ^= 0x1LLU << page_idx; + *offset = page_idx * slab->pagesize; + return slab; + } + + // Otherwise, allocate a new vk_slab and append it to the list. + VkDeviceSize slab_size = slab_pages * pagesize; + pl_static_assert(MINIMUM_SLAB_SIZE <= PAGE_SIZE_ALIGN * MAXIMUM_PAGE_COUNT); + const VkDeviceSize max_slab_size = ma->maximum_page_size * MINIMUM_PAGE_COUNT; + pl_assert(pagesize <= ma->maximum_page_size); + slab_size = PL_CLAMP(slab_size, MINIMUM_SLAB_SIZE, max_slab_size); + slab_pages = slab_size / pagesize; + slab_size = slab_pages * pagesize; // max_slab_size may be npot2, trim excess + + struct vk_malloc_params params = pool->params; + params.reqs.size = slab_size; + + // Don't hold the lock while allocating the slab, because it can be a + // potentially very costly operation. + pl_mutex_unlock(&ma->lock); + slab = slab_alloc(ma, ¶ms); + pl_mutex_lock(&ma->lock); + if (!slab) + return NULL; + pl_mutex_lock(&slab->lock); + + slab->spacemap = (slab_pages == sizeof(uint64_t) * 8) ? ~0LLU : ~(~0LLU << slab_pages); + slab->pagesize = pagesize; + PL_ARRAY_APPEND(NULL, pool->slabs, slab); + + // Return the first page in this newly allocated slab + slab->spacemap ^= 0x1; + *offset = 0; + return slab; +} + +static bool vk_malloc_import(struct vk_malloc *ma, struct vk_memslice *out, + const struct vk_malloc_params *params) +{ + struct vk_ctx *vk = ma->vk; + VkExternalMemoryHandleTypeFlagBitsKHR vk_handle_type; + vk_handle_type = vk_mem_handle_type(params->import_handle); + + struct vk_slab *slab = NULL; + const struct pl_shared_mem *shmem = ¶ms->shared_mem; + + VkMemoryDedicatedAllocateInfoKHR dinfo = { + .sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO_KHR, + .image = params->ded_image, + }; + + VkImportMemoryFdInfoKHR fdinfo = { + .sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_FD_INFO_KHR, + .handleType = vk_handle_type, + .fd = -1, + }; + + VkImportMemoryHostPointerInfoEXT ptrinfo = { + .sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_HOST_POINTER_INFO_EXT, + .handleType = vk_handle_type, + }; + + VkMemoryAllocateInfo ainfo = { + .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, + .allocationSize = shmem->size, + }; + + if (params->ded_image) + vk_link_struct(&ainfo, &dinfo); + + VkBuffer buffer = VK_NULL_HANDLE; + VkMemoryRequirements reqs = params->reqs; + + if (params->buf_usage) { + uint32_t qfs[3] = {0}; + pl_assert(vk->pools.num <= PL_ARRAY_SIZE(qfs)); + for (int i = 0; i < vk->pools.num; i++) + qfs[i] = vk->pools.elem[i]->qf; + + VkExternalMemoryBufferCreateInfoKHR ext_buf_info = { + .sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_BUFFER_CREATE_INFO_KHR, + .handleTypes = vk_handle_type, + }; + + VkBufferCreateInfo binfo = { + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .pNext = &ext_buf_info, + .size = shmem->size, + .usage = params->buf_usage, + .sharingMode = vk->pools.num > 1 ? VK_SHARING_MODE_CONCURRENT + : VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = vk->pools.num, + .pQueueFamilyIndices = qfs, + }; + + VK(vk->CreateBuffer(vk->dev, &binfo, PL_VK_ALLOC, &buffer)); + PL_VK_NAME(BUFFER, buffer, "imported"); + + vk->GetBufferMemoryRequirements(vk->dev, buffer, &reqs); + } + + if (reqs.size > shmem->size) { + PL_ERR(vk, "Imported object requires %zu bytes, larger than the " + "provided size %zu!", + (size_t) reqs.size, shmem->size); + goto error; + } + + if (shmem->offset % reqs.alignment || shmem->offset % params->reqs.alignment) { + PL_ERR(vk, "Imported object offset %zu conflicts with alignment %zu!", + shmem->offset, pl_lcm(reqs.alignment, params->reqs.alignment)); + goto error; + } + + switch (params->import_handle) { +#ifdef PL_HAVE_UNIX + case PL_HANDLE_DMA_BUF: { + if (!vk->GetMemoryFdPropertiesKHR) { + PL_ERR(vk, "Importing PL_HANDLE_DMA_BUF requires %s.", + VK_EXT_EXTERNAL_MEMORY_DMA_BUF_EXTENSION_NAME); + goto error; + } + + VkMemoryFdPropertiesKHR fdprops = { + .sType = VK_STRUCTURE_TYPE_MEMORY_FD_PROPERTIES_KHR, + }; + + VK(vk->GetMemoryFdPropertiesKHR(vk->dev, + vk_handle_type, + shmem->handle.fd, + &fdprops)); + + // We dup() the fd to make it safe to import the same original fd + // multiple times. + fdinfo.fd = dup(shmem->handle.fd); + if (fdinfo.fd == -1) { + PL_ERR(vk, "Failed to dup() fd (%d) when importing memory: %s", + fdinfo.fd, strerror(errno)); + goto error; + } + + reqs.memoryTypeBits &= fdprops.memoryTypeBits; + vk_link_struct(&ainfo, &fdinfo); + break; + } +#else // !PL_HAVE_UNIX + case PL_HANDLE_DMA_BUF: + PL_ERR(vk, "PL_HANDLE_DMA_BUF requires building with UNIX support!"); + goto error; +#endif + + case PL_HANDLE_HOST_PTR: { + VkMemoryHostPointerPropertiesEXT ptrprops = { + .sType = VK_STRUCTURE_TYPE_MEMORY_HOST_POINTER_PROPERTIES_EXT, + }; + + VK(vk->GetMemoryHostPointerPropertiesEXT(vk->dev, vk_handle_type, + shmem->handle.ptr, + &ptrprops)); + + ptrinfo.pHostPointer = (void *) shmem->handle.ptr; + reqs.memoryTypeBits &= ptrprops.memoryTypeBits; + vk_link_struct(&ainfo, &ptrinfo); + break; + } + + case PL_HANDLE_FD: + case PL_HANDLE_WIN32: + case PL_HANDLE_WIN32_KMT: + case PL_HANDLE_IOSURFACE: + case PL_HANDLE_MTL_TEX: + PL_ERR(vk, "vk_malloc_import: unsupported handle type %d", + params->import_handle); + goto error; + } + + if (!find_best_memtype(ma, reqs.memoryTypeBits, params, &ainfo.memoryTypeIndex)) { + PL_ERR(vk, "No compatible memory types offered for imported memory!"); + goto error; + } + + VkDeviceMemory vkmem = VK_NULL_HANDLE; + VK(vk->AllocateMemory(vk->dev, &ainfo, PL_VK_ALLOC, &vkmem)); + + slab = pl_alloc_ptr(NULL, slab); + *slab = (struct vk_slab) { + .mem = vkmem, + .dedicated = true, + .imported = true, + .buffer = buffer, + .size = shmem->size, + .handle_type = params->import_handle, + }; + pl_mutex_init(&slab->lock); + + *out = (struct vk_memslice) { + .vkmem = vkmem, + .buf = buffer, + .size = shmem->size - shmem->offset, + .offset = shmem->offset, + .shared_mem = *shmem, + .priv = slab, + }; + + switch (params->import_handle) { + case PL_HANDLE_DMA_BUF: + case PL_HANDLE_FD: + PL_TRACE(vk, "Imported %s bytes from fd: %d%s", + PRINT_SIZE(slab->size), shmem->handle.fd, + params->ded_image ? " (dedicated)" : ""); + // fd ownership is transferred at this point. + slab->handle.fd = fdinfo.fd; + fdinfo.fd = -1; + break; + case PL_HANDLE_HOST_PTR: + PL_TRACE(vk, "Imported %s bytes from ptr: %p%s", + PRINT_SIZE(slab->size), shmem->handle.ptr, + params->ded_image ? " (dedicated" : ""); + slab->handle.ptr = ptrinfo.pHostPointer; + break; + case PL_HANDLE_WIN32: + case PL_HANDLE_WIN32_KMT: + case PL_HANDLE_IOSURFACE: + case PL_HANDLE_MTL_TEX: + break; + } + + VkMemoryPropertyFlags flags = ma->props.memoryTypes[ainfo.memoryTypeIndex].propertyFlags; + if (flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) { + VK(vk->MapMemory(vk->dev, slab->mem, 0, VK_WHOLE_SIZE, 0, &slab->data)); + slab->coherent = flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; + out->data = (uint8_t *) slab->data + out->offset; + out->coherent = slab->coherent; + if (!slab->coherent) { + // Use entire buffer range, since this is a dedicated memory + // allocation. This avoids issues with noncoherent atomicity + out->map_offset = 0; + out->map_size = VK_WHOLE_SIZE; + + // Mapping does not implicitly invalidate mapped memory + VK(vk->InvalidateMappedMemoryRanges(vk->dev, 1, &(VkMappedMemoryRange) { + .sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE, + .memory = slab->mem, + .offset = out->map_offset, + .size = out->map_size, + })); + } + } + + if (buffer) + VK(vk->BindBufferMemory(vk->dev, buffer, vkmem, 0)); + + return true; + +error: + if (params->debug_tag) + PL_ERR(vk, " for malloc: %s", params->debug_tag); + vk->DestroyBuffer(vk->dev, buffer, PL_VK_ALLOC); +#ifdef PL_HAVE_UNIX + if (fdinfo.fd > -1) + close(fdinfo.fd); +#endif + pl_free(slab); + *out = (struct vk_memslice) {0}; + return false; +} + +size_t vk_malloc_avail(struct vk_malloc *ma, VkMemoryPropertyFlags flags) +{ + size_t avail = 0; + for (int i = 0; i < ma->props.memoryTypeCount; i++) { + const VkMemoryType *mtype = &ma->props.memoryTypes[i]; + if ((mtype->propertyFlags & flags) != flags) + continue; + avail = PL_MAX(avail, ma->props.memoryHeaps[mtype->heapIndex].size); + } + + return avail; +} + +bool vk_malloc_slice(struct vk_malloc *ma, struct vk_memslice *out, + const struct vk_malloc_params *params) +{ + struct vk_ctx *vk = ma->vk; + pl_assert(!params->import_handle || !params->export_handle); + if (params->import_handle) + return vk_malloc_import(ma, out, params); + + pl_assert(params->reqs.size); + size_t size = params->reqs.size; + size_t align = params->reqs.alignment; + align = pl_lcm(align, vk->props.limits.bufferImageGranularity); + align = pl_lcm(align, vk->props.limits.nonCoherentAtomSize); + + struct vk_slab *slab; + VkDeviceSize offset; + + if (params->ded_image || size > ma->maximum_page_size) { + slab = slab_alloc(ma, params); + if (!slab) + return false; + slab->dedicated = true; + offset = 0; + } else { + pl_mutex_lock(&ma->lock); + struct vk_pool *pool = find_pool(ma, params); + slab = pool_get_page(ma, pool, size, align, &offset); + pl_mutex_unlock(&ma->lock); + if (!slab) { + PL_ERR(ma->vk, "No slab to serve request for %s bytes (with " + "alignment 0x%zx) in pool %d!", + PRINT_SIZE(size), align, pool->index); + return false; + } + + // For accounting, just treat the alignment as part of the used size. + // Doing it this way makes sure that the sizes reported to vk_memslice + // consumers are always aligned properly. + size = PL_ALIGN(size, align); + slab->used += size; + slab->age = ma->age; + if (params->debug_tag) + slab->debug_tag = params->debug_tag; + pl_mutex_unlock(&slab->lock); + } + + pl_assert(offset % align == 0); + *out = (struct vk_memslice) { + .vkmem = slab->mem, + .offset = offset, + .size = size, + .buf = slab->buffer, + .data = slab->data ? (uint8_t *) slab->data + offset : 0x0, + .coherent = slab->coherent, + .map_offset = slab->data ? offset : 0, + .map_size = slab->data ? size : 0, + .priv = slab, + .shared_mem = { + .handle = slab->handle, + .offset = offset, + .size = slab->size, + }, + }; + return true; +} |