summaryrefslogtreecommitdiffstats
path: root/src/vulkan/malloc.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/vulkan/malloc.c')
-rw-r--r-- src/vulkan/malloc.c 1058
1 files changed, 1058 insertions, 0 deletions
diff --git a/src/vulkan/malloc.c b/src/vulkan/malloc.c
new file mode 100644
index 0000000..c35183b
--- /dev/null
+++ b/src/vulkan/malloc.c
@@ -0,0 +1,1058 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "malloc.h"
+#include "command.h"
+#include "utils.h"
+#include "pl_thread.h"
+
+#ifdef PL_HAVE_UNIX
+#include <errno.h>
+#include <unistd.h>
+#endif
+
+// Controls the page size alignment, to help coalesce allocations into the same
+// slab. Page sizes are rounded up to multiples of this value. (Default: 4 KB)
+#define PAGE_SIZE_ALIGN (1LLU << 12)
+
+// Controls the minimum/maximum number of pages for new slabs. The more full
+// slabs already exist for a size range, the more pages a new slab receives:
+// the count doubles per full slab, from the minimum up to the maximum.
+//
+// Note: The maximum must never exceed the bit width of `vk_slab.spacemap`.
+#define MINIMUM_PAGE_COUNT 4
+#define MAXIMUM_PAGE_COUNT (sizeof(uint64_t) * 8) // one spacemap bit per page
+
+// Controls the maximum page size. Any allocations above this threshold
+// (absolute size or fraction of VRAM, whichever is higher) will be served by
+// dedicated allocations. (Default: 64 MB or 1/16 of VRAM)
+#define MAXIMUM_PAGE_SIZE_ABSOLUTE (1LLU << 26)
+#define MAXIMUM_PAGE_SIZE_RELATIVE 16 // divisor applied to device-local heap sizes
+
+// Controls the minimum slab size, to avoid excessive re-allocation of very
+// small slabs. (Default: 256 KB)
+#define MINIMUM_SLAB_SIZE (1LLU << 18)
+
+// How long to wait before garbage collecting empty slabs. Empty slabs unused
+// for more than this many `vk_malloc_garbage_collect` calls will be released.
+#define MAXIMUM_SLAB_AGE 32
+
+// A single slab represents a contiguous region of allocated memory. Actual
+// allocations are served as equally sized pages of this. Slabs are organized
+// into pools, each of which contains a list of slabs of differing page sizes.
+struct vk_slab {
+ pl_mutex lock; // held while updating `spacemap`, `used`, `age`, `debug_tag`
+ pl_debug_tag debug_tag; // debug tag of the triggering allocation
+ VkDeviceMemory mem; // underlying device allocation
+ VkDeviceSize size; // total allocated size of `mem`
+ VkMemoryType mtype; // underlying memory type
+ bool dedicated; // slab is allocated specifically for one object
+ bool imported; // slab represents an imported memory allocation
+
+ // free space accounting (only for non-dedicated slabs)
+ uint64_t spacemap; // bitset of available pages (set bit = page is free)
+ size_t pagesize; // size in bytes per page
+ size_t used; // number of bytes actually in use
+ uint64_t age; // value of `vk_malloc.age` when last used
+
+ // optional, depends on the memory type:
+ VkBuffer buffer; // buffer spanning the entire slab
+ void *data; // mapped memory corresponding to `mem`
+ bool coherent; // mapped memory is coherent
+ union pl_handle handle; // handle associated with this device memory
+ enum pl_handle_type handle_type; // which member of `handle` is meaningful
+};
+
+// Represents a single memory pool. We keep track of a vk_pool for each
+// combination of malloc parameters. This shouldn't actually be that many in
+// practice, because some combinations simply never occur, and others will
+// generally be the same for the same objects.
+//
+// Note: `vk_pool` addresses are not stable (pools are stored by value in a
+// growable array), so never keep a `vk_pool` pointer, e.g. behind `vk_memslice.priv`.
+struct vk_pool {
+ struct vk_malloc_params params; // allocation params (with some fields nulled)
+ PL_ARRAY(struct vk_slab *) slabs; // array of slabs, unsorted
+ int index; // position of this pool inside `vk_malloc.pools`
+};
+
+// The overall state of the allocator, which keeps track of a vk_pool for each
+// distinct combination of allocation parameters.
+struct vk_malloc {
+ struct vk_ctx *vk;
+ pl_mutex lock; // held while accessing `pools` and while bumping `age`
+ VkPhysicalDeviceMemoryProperties props; // queried once at creation
+ size_t maximum_page_size; // larger requests get a dedicated slab
+ PL_ARRAY(struct vk_pool) pools;
+ uint64_t age; // incremented by every `vk_malloc_garbage_collect` call
+};
+
+// Returns `used` as a percentage of `total`. An empty `total` is reported as
+// 100%, so that unused pools do not show up as wasteful.
+static inline float efficiency(size_t used, size_t total)
+{
+    const float percent = total ? 100.0f * used / total : 100.0f;
+    return percent;
+}
+
+// Formats `size` into `buf` as a short human-readable string, scaling by
+// powers of 1024 (K, M, G) until the number has at most four digits or the
+// largest unit is reached. Returns `buf`, or "(error)" if formatting failed.
+static const char *print_size(char buf[8], size_t size)
+{
+    static const char units[] = { '\0', 'K', 'M', 'G' };
+    size_t unit = 0;
+
+    while (unit + 1 < sizeof(units) && size > 9999) {
+        size >>= 10;
+        unit++;
+    }
+
+    int ret;
+    if (units[unit]) {
+        ret = snprintf(buf, 8, "%4zu%c", size, units[unit]);
+    } else {
+        ret = snprintf(buf, 8, "%5zu", size);
+    }
+
+    if (ret < 0)
+        return "(error)";
+    return buf;
+}
+
+// Convenience wrapper that formats into a temporary buffer, valid until the
+// end of the enclosing block.
+#define PRINT_SIZE(x) (print_size((char[8]){0}, (size_t) (x)))
+
+// Logs the device memory heaps and types, followed by a per-pool and per-slab
+// breakdown of the allocator state and a global summary.
+//
+// Everything is emitted at level `lev`, except for the list of memory types,
+// which is always emitted at debug level.
+void vk_malloc_print_stats(struct vk_malloc *ma, enum pl_log_level lev)
+{
+    struct vk_ctx *vk = ma->vk;
+    struct { size_t size, used, res; } total = {0};
+
+    PL_MSG(vk, lev, "Memory heaps supported by device:");
+    for (int n = 0; n < ma->props.memoryHeapCount; n++) {
+        const VkMemoryHeap *heap = &ma->props.memoryHeaps[n];
+        PL_MSG(vk, lev, " %d: flags 0x%x size %s",
+               n, (unsigned) heap->flags, PRINT_SIZE(heap->size));
+    }
+
+    PL_DEBUG(vk, "Memory types supported by device:");
+    for (int n = 0; n < ma->props.memoryTypeCount; n++) {
+        const VkMemoryType *mtype = &ma->props.memoryTypes[n];
+        PL_DEBUG(vk, " %d: flags 0x%x heap %d",
+                 n, (unsigned) mtype->propertyFlags, (int) mtype->heapIndex);
+    }
+
+    pl_mutex_lock(&ma->lock);
+    for (int p = 0; p < ma->pools.num; p++) {
+        struct vk_pool *pool = &ma->pools.elem[p];
+        const struct vk_malloc_params *par = &pool->params;
+        struct { size_t size, used, res; } sum = {0};
+
+        PL_MSG(vk, lev, "Memory pool %d:", p);
+        PL_MSG(vk, lev, " Compatible types: 0x%"PRIx32, par->reqs.memoryTypeBits);
+        if (par->required)
+            PL_MSG(vk, lev, " Required flags: 0x%"PRIx32, par->required);
+        if (par->optimal)
+            PL_MSG(vk, lev, " Optimal flags: 0x%"PRIx32, par->optimal);
+        if (par->buf_usage)
+            PL_MSG(vk, lev, " Buffer flags: 0x%"PRIx32, par->buf_usage);
+        if (par->export_handle)
+            PL_MSG(vk, lev, " Export handle: 0x%x", par->export_handle);
+
+        for (int s = 0; s < pool->slabs.num; s++) {
+            struct vk_slab *slab = pool->slabs.elem[s];
+            pl_mutex_lock(&slab->lock);
+
+            // Reserved bytes = everything not marked free in the page bitmap
+            const size_t free_bytes = __builtin_popcountll(slab->spacemap) * slab->pagesize;
+            const size_t reserved = slab->size - free_bytes;
+
+            PL_MSG(vk, lev, " Slab %2d: %8"PRIx64" x %s: "
+                   "%s used %s res %s alloc from heap %d, efficiency %.2f%% [%s]",
+                   s, slab->spacemap, PRINT_SIZE(slab->pagesize),
+                   PRINT_SIZE(slab->used), PRINT_SIZE(reserved),
+                   PRINT_SIZE(slab->size), (int) slab->mtype.heapIndex,
+                   efficiency(slab->used, reserved),
+                   PL_DEF(slab->debug_tag, "unknown"));
+
+            sum.size += slab->size;
+            sum.used += slab->used;
+            sum.res += reserved;
+            pl_mutex_unlock(&slab->lock);
+        }
+
+        PL_MSG(vk, lev, " Pool summary: %s used %s res %s alloc, "
+               "efficiency %.2f%%, utilization %.2f%%",
+               PRINT_SIZE(sum.used), PRINT_SIZE(sum.res),
+               PRINT_SIZE(sum.size), efficiency(sum.used, sum.res),
+               efficiency(sum.res, sum.size));
+
+        total.size += sum.size;
+        total.used += sum.used;
+        total.res += sum.res;
+    }
+    pl_mutex_unlock(&ma->lock);
+
+    PL_MSG(vk, lev, "Memory summary: %s used %s res %s alloc, "
+           "efficiency %.2f%%, utilization %.2f%%, max page: %s",
+           PRINT_SIZE(total.used), PRINT_SIZE(total.res),
+           PRINT_SIZE(total.size), efficiency(total.used, total.res),
+           efficiency(total.res, total.size),
+           PRINT_SIZE(ma->maximum_page_size));
+}
+
+// Releases everything owned by `slab`: the export handle (for slabs we
+// allocated ourselves), the buffer, the device memory, the lock and the slab
+// struct itself. Passing NULL is a no-op.
+//
+// In debug builds, freeing a non-dedicated slab that still has bytes in use
+// is reported as a leak and triggers a debug abort.
+static void slab_free(struct vk_ctx *vk, struct vk_slab *slab)
+{
+    if (!slab)
+        return;
+
+#ifndef NDEBUG
+    const bool leaked = !slab->dedicated && slab->used > 0;
+    if (leaked) {
+        PL_WARN(vk, "Leaked %zu bytes of vulkan memory!", slab->used);
+        PL_WARN(vk, "slab total size: %zu bytes, heap: %d, flags: 0x%"PRIX64,
+                (size_t) slab->size, (int) slab->mtype.heapIndex,
+                (uint64_t) slab->mtype.propertyFlags);
+        if (slab->debug_tag)
+            PL_WARN(vk, "last used for: %s", slab->debug_tag);
+        pl_log_stack_trace(vk->log, PL_LOG_WARN);
+        pl_debug_abort();
+    }
+#endif
+
+    if (!slab->imported) {
+        // We own the exported handle, so it is our job to close it
+        switch (slab->handle_type) {
+        case PL_HANDLE_FD:
+        case PL_HANDLE_DMA_BUF:
+#ifdef PL_HAVE_UNIX
+            if (slab->handle.fd > -1)
+                close(slab->handle.fd);
+#endif
+            break;
+        case PL_HANDLE_WIN32:
+#ifdef PL_HAVE_WIN32
+            if (slab->handle.handle != NULL)
+                CloseHandle(slab->handle.handle);
+#endif
+            break;
+        case PL_HANDLE_WIN32_KMT:
+            // PL_HANDLE_WIN32_KMT is just an identifier. It doesn't get closed.
+            break;
+        case PL_HANDLE_HOST_PTR:
+            // Implicitly unmapped
+            break;
+        case PL_HANDLE_IOSURFACE:
+        case PL_HANDLE_MTL_TEX:
+            pl_unreachable();
+        }
+
+        PL_DEBUG(vk, "Freeing slab of size %s", PRINT_SIZE(slab->size));
+    } else {
+        // Imported handles are not closed here; only trace the release
+        switch (slab->handle_type) {
+        case PL_HANDLE_FD:
+        case PL_HANDLE_DMA_BUF:
+            PL_TRACE(vk, "Unimporting slab of size %s from fd: %d",
+                     PRINT_SIZE(slab->size), slab->handle.fd);
+            break;
+        case PL_HANDLE_WIN32:
+        case PL_HANDLE_WIN32_KMT:
+#ifdef PL_HAVE_WIN32
+            PL_TRACE(vk, "Unimporting slab of size %s from handle: %p",
+                     PRINT_SIZE(slab->size), (void *) slab->handle.handle);
+#endif
+            break;
+        case PL_HANDLE_HOST_PTR:
+            PL_TRACE(vk, "Unimporting slab of size %s from ptr: %p",
+                     PRINT_SIZE(slab->size), (void *) slab->handle.ptr);
+            break;
+        case PL_HANDLE_IOSURFACE:
+        case PL_HANDLE_MTL_TEX:
+            pl_unreachable();
+        }
+    }
+
+    vk->DestroyBuffer(vk->dev, slab->buffer, PL_VK_ALLOC);
+    // also implicitly unmaps the memory if needed
+    vk->FreeMemory(vk->dev, slab->mem, PL_VK_ALLOC);
+
+    pl_mutex_destroy(&slab->lock);
+    pl_free(slab);
+}
+
+// Picks the memory type to use for an allocation described by `params`.
+//
+// type_mask: bitmask of acceptable memory type indices; it is intersected
+//            with `params->reqs.memoryTypeBits` (UINT32_MAX = no extra limit)
+// out_index: receives the index of the chosen memory type on success
+//
+// Returns false, after logging an error, if no memory type qualifies.
+// thread-safety: safe
+static bool find_best_memtype(const struct vk_malloc *ma, uint32_t type_mask,
+                              const struct vk_malloc_params *params,
+                              uint32_t *out_index)
+{
+    struct vk_ctx *vk = ma->vk;
+    int best_score = -1;
+
+    type_mask &= params->reqs.memoryTypeBits;
+
+    // The vulkan spec requires memory types to be sorted in the "optimal"
+    // order, so on equal scores the earliest candidate is kept (strict `>`).
+    // Beyond that, types matching more of the optional flags are preferred.
+    for (int idx = 0; idx < ma->props.memoryTypeCount; idx++) {
+        const VkMemoryType *mtype = &ma->props.memoryTypes[idx];
+        const VkDeviceSize heap_size = ma->props.memoryHeaps[mtype->heapIndex].size;
+
+        // Must provide every required property flag
+        const bool has_required =
+            (mtype->propertyFlags & params->required) == params->required;
+        // Must live in a heap that could hold the allocation at all
+        const bool fits_heap = params->reqs.size <= heap_size;
+        // Must be permitted by the type mask (bitfield)
+        const bool in_mask = (type_mask & (1LU << idx)) != 0;
+
+        if (!has_required || !fits_heap || !in_mask)
+            continue;
+
+        // Score = number of optimal property flags this type offers
+        const int score = __builtin_popcountl(mtype->propertyFlags & params->optimal);
+        if (score > best_score) {
+            best_score = score;
+            *out_index = idx;
+        }
+    }
+
+    if (best_score >= 0)
+        return true;
+
+    PL_ERR(vk, "Found no memory type matching property flags 0x%x and type "
+           "bits 0x%x!",
+           (unsigned) params->required, (unsigned) type_mask);
+    return false;
+}
+
+// Checks whether a buffer with the given `usage` can be exported (or
+// imported, if `import` is set) using `handle_type`. No handle type at all
+// trivially succeeds; a handle type without a vulkan equivalent fails.
+static bool buf_external_check(struct vk_ctx *vk, VkBufferUsageFlags usage,
+                               enum pl_handle_type handle_type, bool import)
+{
+    if (!handle_type)
+        return true;
+
+    const VkExternalMemoryHandleTypeFlagBitsKHR vk_type =
+        vk_mem_handle_type(handle_type);
+    if (!vk_type)
+        return false;
+
+    VkPhysicalDeviceExternalBufferInfo query = {
+        .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_BUFFER_INFO_KHR,
+        .usage = usage,
+        .handleType = vk_type,
+    };
+
+    VkExternalBufferProperties result = {
+        .sType = VK_STRUCTURE_TYPE_EXTERNAL_BUFFER_PROPERTIES_KHR,
+    };
+
+    vk->GetPhysicalDeviceExternalBufferProperties(vk->physd, &query, &result);
+    return vk_external_mem_check(vk, &result.externalMemoryProperties,
+                                 handle_type, import);
+}
+
+/* Allocates and initializes a new slab as described by `params`: optionally
+ * creates a VkBuffer spanning the slab, allocates the device memory, maps it
+ * when the chosen memory type is host-visible, binds the buffer and fetches
+ * the export handle if one was requested.
+ *
+ * Returns NULL on failure, after releasing the partially constructed slab
+ * via slab_free(). NOTE(review): VK() and PL_VK_ASSERT() presumably jump to
+ * the `error` label on failure; confirm against their definitions.
+ *
+ * thread-safety: safe
+ */
+static struct vk_slab *slab_alloc(struct vk_malloc *ma,
+ const struct vk_malloc_params *params)
+{
+ struct vk_ctx *vk = ma->vk;
+ struct vk_slab *slab = pl_alloc_ptr(NULL, slab);
+ *slab = (struct vk_slab) {
+ .age = ma->age,
+ .size = params->reqs.size,
+ .handle_type = params->export_handle,
+ .debug_tag = params->debug_tag,
+ };
+ pl_mutex_init(&slab->lock);
+
+ switch (slab->handle_type) { // start from the "no handle" value of each kind
+ case PL_HANDLE_FD:
+ case PL_HANDLE_DMA_BUF:
+ slab->handle.fd = -1;
+ break;
+ case PL_HANDLE_WIN32:
+ case PL_HANDLE_WIN32_KMT:
+ case PL_HANDLE_MTL_TEX:
+ case PL_HANDLE_IOSURFACE:
+ slab->handle.handle = NULL;
+ break;
+ case PL_HANDLE_HOST_PTR:
+ slab->handle.ptr = NULL;
+ break;
+ }
+
+ VkExportMemoryAllocateInfoKHR ext_info = {
+ .sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO_KHR,
+ .handleTypes = vk_mem_handle_type(slab->handle_type),
+ };
+
+ uint32_t type_mask = UINT32_MAX; // narrowed below if a buffer is created
+ if (params->buf_usage) {
+ // Queue family sharing modes don't matter for buffers, so we just
+ // set them as concurrent and stop worrying about it.
+ uint32_t qfs[3] = {0};
+ pl_assert(vk->pools.num <= PL_ARRAY_SIZE(qfs));
+ for (int i = 0; i < vk->pools.num; i++)
+ qfs[i] = vk->pools.elem[i]->qf;
+
+ VkExternalMemoryBufferCreateInfoKHR ext_buf_info = {
+ .sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_BUFFER_CREATE_INFO_KHR,
+ .handleTypes = ext_info.handleTypes,
+ };
+
+ VkBufferCreateInfo binfo = {
+ .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
+ .pNext = slab->handle_type ? &ext_buf_info : NULL,
+ .size = slab->size,
+ .usage = params->buf_usage,
+ .sharingMode = vk->pools.num > 1 ? VK_SHARING_MODE_CONCURRENT
+ : VK_SHARING_MODE_EXCLUSIVE,
+ .queueFamilyIndexCount = vk->pools.num,
+ .pQueueFamilyIndices = qfs,
+ };
+
+ if (!buf_external_check(vk, binfo.usage, slab->handle_type, false)) {
+ PL_ERR(vk, "Failed allocating shared memory buffer: possibly "
+ "the handle type is unsupported?");
+ goto error;
+ }
+
+ VK(vk->CreateBuffer(vk->dev, &binfo, PL_VK_ALLOC, &slab->buffer));
+ PL_VK_NAME(BUFFER, slab->buffer, "slab");
+
+ VkMemoryRequirements reqs = {0};
+ vk->GetBufferMemoryRequirements(vk->dev, slab->buffer, &reqs);
+ slab->size = reqs.size; // may exceed the size requested in `params`
+ type_mask = reqs.memoryTypeBits;
+
+ // Note: we can ignore `reqs.align` because we always bind the buffer
+ // memory to offset 0
+ }
+
+ VkMemoryAllocateInfo minfo = {
+ .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
+ .allocationSize = slab->size,
+ };
+
+ if (params->export_handle) // only chain the export info when exporting
+ vk_link_struct(&minfo, &ext_info);
+
+ VkMemoryDedicatedAllocateInfoKHR dinfo = {
+ .sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO_KHR,
+ .image = params->ded_image,
+ };
+
+ if (params->ded_image) // only chain the dedicated info for image allocations
+ vk_link_struct(&minfo, &dinfo);
+
+ if (!find_best_memtype(ma, type_mask, params, &minfo.memoryTypeIndex))
+ goto error;
+
+ const VkMemoryType *mtype = &ma->props.memoryTypes[minfo.memoryTypeIndex];
+ PL_DEBUG(vk, "Allocating %zu memory of type 0x%x (id %d) in heap %d: %s",
+ (size_t) slab->size, (unsigned) mtype->propertyFlags,
+ (int) minfo.memoryTypeIndex, (int) mtype->heapIndex,
+ PL_DEF(params->debug_tag, "unknown"));
+
+ pl_clock_t start = pl_clock_now();
+
+ VkResult res = vk->AllocateMemory(vk->dev, &minfo, PL_VK_ALLOC, &slab->mem);
+ switch (res) {
+ case VK_ERROR_OUT_OF_DEVICE_MEMORY:
+ case VK_ERROR_OUT_OF_HOST_MEMORY:
+ PL_ERR(vk, "Allocation of size %s failed: %s!",
+ PRINT_SIZE(slab->size), vk_res_str(res));
+ vk_malloc_print_stats(ma, PL_LOG_ERR); // dump allocator state for diagnosis
+ pl_log_stack_trace(vk->log, PL_LOG_ERR);
+ pl_debug_abort();
+ goto error;
+
+ default:
+ PL_VK_ASSERT(res, "vkAllocateMemory");
+ }
+
+ slab->mtype = *mtype;
+ if (mtype->propertyFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) {
+ VK(vk->MapMemory(vk->dev, slab->mem, 0, VK_WHOLE_SIZE, 0, &slab->data));
+ slab->coherent = mtype->propertyFlags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
+ }
+
+ if (slab->buffer)
+ VK(vk->BindBufferMemory(vk->dev, slab->buffer, slab->mem, 0));
+
+#ifdef PL_HAVE_UNIX
+ if (slab->handle_type == PL_HANDLE_FD ||
+ slab->handle_type == PL_HANDLE_DMA_BUF)
+ {
+ VkMemoryGetFdInfoKHR fd_info = {
+ .sType = VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR,
+ .memory = slab->mem,
+ .handleType = ext_info.handleTypes,
+ };
+
+ VK(vk->GetMemoryFdKHR(vk->dev, &fd_info, &slab->handle.fd));
+ }
+#endif
+
+#ifdef PL_HAVE_WIN32
+ if (slab->handle_type == PL_HANDLE_WIN32 ||
+ slab->handle_type == PL_HANDLE_WIN32_KMT)
+ {
+ VkMemoryGetWin32HandleInfoKHR handle_info = {
+ .sType = VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR,
+ .memory = slab->mem,
+ .handleType = ext_info.handleTypes,
+ };
+
+ VK(vk->GetMemoryWin32HandleKHR(vk->dev, &handle_info,
+ &slab->handle.handle));
+ }
+#endif
+
+ pl_log_cpu_time(vk->log, start, pl_clock_now(), "allocating slab");
+
+ // free space accounting (`spacemap`, `pagesize`, `used`) is done by the caller
+ return slab;
+
+error:
+ if (params->debug_tag)
+ PL_ERR(vk, " for malloc: %s", params->debug_tag);
+ slab_free(vk, slab); // releases whatever was created so far
+ return NULL;
+}
+
+// Frees every slab owned by `pool` along with the slab array itself, and
+// resets the pool to its zero state.
+static void pool_uninit(struct vk_ctx *vk, struct vk_pool *pool)
+{
+    const int num_slabs = pool->slabs.num;
+    for (int n = 0; n < num_slabs; n++) {
+        struct vk_slab *slab = pool->slabs.elem[n];
+        slab_free(vk, slab);
+    }
+
+    pl_free(pool->slabs.elem);
+    *pool = (struct vk_pool) {0};
+}
+
+// Creates a new allocator for `vk`, queries the device memory properties and
+// derives the maximum page size from them. The initial (empty) statistics are
+// logged at info level.
+struct vk_malloc *vk_malloc_create(struct vk_ctx *vk)
+{
+    struct vk_malloc *ma = pl_zalloc_ptr(NULL, ma);
+    ma->vk = vk;
+    pl_mutex_init(&ma->lock);
+    vk->GetPhysicalDeviceMemoryProperties(vk->physd, &ma->props);
+
+    // The maximum page size is the absolute limit, or the configured fraction
+    // of the largest device-local heap, whichever is higher
+    size_t max_page = MAXIMUM_PAGE_SIZE_ABSOLUTE;
+    for (int n = 0; n < ma->props.memoryHeapCount; n++) {
+        const VkMemoryHeap *heap = &ma->props.memoryHeaps[n];
+        if (!(heap->flags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT))
+            continue;
+
+        const size_t relative = heap->size / MAXIMUM_PAGE_SIZE_RELATIVE;
+        max_page = PL_MAX(max_page, relative);
+    }
+    ma->maximum_page_size = max_page;
+
+    vk_malloc_print_stats(ma, PL_LOG_INFO);
+    return ma;
+}
+
+// Destroys the allocator pointed to by `ma_ptr`, including all of its pools
+// and slabs, and resets `*ma_ptr`. The final statistics are logged at debug
+// level. Does nothing if `*ma_ptr` is NULL.
+void vk_malloc_destroy(struct vk_malloc **ma_ptr)
+{
+    struct vk_malloc *ma = *ma_ptr;
+    if (ma) {
+        vk_malloc_print_stats(ma, PL_LOG_DEBUG);
+
+        const int num_pools = ma->pools.num;
+        for (int n = 0; n < num_pools; n++) {
+            struct vk_pool *pool = &ma->pools.elem[n];
+            pool_uninit(ma->vk, pool);
+        }
+
+        pl_mutex_destroy(&ma->lock);
+        pl_free_ptr(ma_ptr);
+    }
+}
+
+// Advances the allocator clock by one tick and releases every slab that is
+// completely unused and was last touched more than MAXIMUM_SLAB_AGE ticks ago.
+void vk_malloc_garbage_collect(struct vk_malloc *ma)
+{
+    struct vk_ctx *vk = ma->vk;
+
+    pl_mutex_lock(&ma->lock);
+    ma->age++;
+
+    for (int p = 0; p < ma->pools.num; p++) {
+        struct vk_pool *pool = &ma->pools.elem[p];
+
+        for (int s = 0; s < pool->slabs.num; s++) {
+            struct vk_slab *slab = pool->slabs.elem[s];
+
+            pl_mutex_lock(&slab->lock);
+            const bool expired = !slab->used &&
+                                 (ma->age - slab->age) > MAXIMUM_SLAB_AGE;
+            if (expired) {
+                PL_DEBUG(vk, "Garbage collected slab of size %s from pool %d",
+                         PRINT_SIZE(slab->size), pool->index);
+            }
+            pl_mutex_unlock(&slab->lock);
+
+            if (!expired)
+                continue;
+
+            slab_free(ma->vk, slab);
+            // Step back so the element shifted into this position is visited
+            PL_ARRAY_REMOVE_AT(pool->slabs, s--);
+        }
+    }
+
+    pl_mutex_unlock(&ma->lock);
+}
+
+// Returns the set of handle types that can be exported (or imported, if
+// `import` is set), as a bitmask of `enum pl_handle_type` values.
+pl_handle_caps vk_malloc_handle_caps(const struct vk_malloc *ma, bool import)
+{
+    struct vk_ctx *vk = ma->vk;
+    pl_handle_caps supported = 0;
+
+    // Probe each handle type against a "basic" buffer with no fancy buffer
+    // usage. More specific checks happen down the line at VkBuffer creation
+    // time; this only gives a rough idea of what the driver supports.
+    for (int n = 0; vk_mem_handle_list[n]; n++) {
+        const enum pl_handle_type type = vk_mem_handle_list[n];
+        if (!buf_external_check(vk, VK_BUFFER_USAGE_TRANSFER_DST_BIT, type, import))
+            continue;
+        supported |= type;
+    }
+
+    return supported;
+}
+
+// Returns the memory described by `slice` to the allocator and zeroes
+// `*slice`.
+//
+// Slices without a backing slab, or backed by a dedicated slab, release the
+// slab outright. Otherwise the page is marked as free again in its slab and
+// the slab's usage accounting and age are updated.
+void vk_malloc_free(struct vk_malloc *ma, struct vk_memslice *slice)
+{
+    struct vk_ctx *vk = ma->vk;
+    struct vk_slab *slab = slice->priv;
+    if (!slab || slab->dedicated) {
+        slab_free(vk, slab);
+        goto done;
+    }
+
+    pl_mutex_lock(&slab->lock);
+
+    int page_idx = slice->offset / slab->pagesize;
+
+    // `used` is unsigned, so an underflow can only be caught *before* the
+    // subtraction; checking `used >= 0` afterwards would always succeed.
+    pl_assert(slab->used >= slice->size);
+
+    slab->spacemap |= 0x1LLU << page_idx;
+    slab->used -= slice->size;
+    slab->age = ma->age;
+
+    pl_mutex_unlock(&slab->lock);
+
+done:
+    *slice = (struct vk_memslice) {0};
+}
+
+// Compares the subset of allocation parameters that identifies a pool: the
+// memory requirements, the required/optimal property flags, the buffer usage
+// and the export handle type.
+static inline bool pool_params_eq(const struct vk_malloc_params *a,
+                                  const struct vk_malloc_params *b)
+{
+    if (a->reqs.size != b->reqs.size)
+        return false;
+    if (a->reqs.alignment != b->reqs.alignment)
+        return false;
+    if (a->reqs.memoryTypeBits != b->reqs.memoryTypeBits)
+        return false;
+    if (a->required != b->required)
+        return false;
+    if (a->optimal != b->optimal)
+        return false;
+    if (a->buf_usage != b->buf_usage)
+        return false;
+
+    return a->export_handle == b->export_handle;
+}
+
+// Returns the pool matching `params`, creating it if it does not exist yet.
+// Per-allocation fields (size, alignment, shared memory) are cleared before
+// the comparison, so they never distinguish pools.
+//
+// Note: the returned pointer refers into `ma->pools`, which may be moved
+// whenever a new pool is added.
+static struct vk_pool *find_pool(struct vk_malloc *ma,
+                                 const struct vk_malloc_params *params)
+{
+    pl_assert(!params->import_handle);
+    pl_assert(!params->ded_image);
+
+    struct vk_malloc_params key = *params;
+    key.reqs.size = 0;
+    key.reqs.alignment = 0;
+    key.shared_mem = (struct pl_shared_mem) {0};
+
+    for (int n = 0; n < ma->pools.num; n++) {
+        struct vk_pool *candidate = &ma->pools.elem[n];
+        if (pool_params_eq(&candidate->params, &key))
+            return candidate;
+    }
+
+    // Not found => add it
+    PL_ARRAY_GROW(ma, ma->pools);
+    size_t idx = ma->pools.num++;
+    struct vk_pool *created = &ma->pools.elem[idx];
+    *created = (struct vk_pool) {
+        .params = key,
+        .index = idx,
+    };
+    return created;
+}
+
+// Returns a suitable memory page from the pool and stores its byte offset
+// inside the slab in `*offset`. A new slab will be allocated under the hood,
+// if necessary. Returns NULL if that allocation fails.
+//
+// Must be called with `ma->lock` held. The lock is temporarily released while
+// a new slab is being allocated, so `pool` (which points into the relocatable
+// `ma->pools` array) is looked up again by index afterwards.
+//
+// Note: This locks the slab it returns
+static struct vk_slab *pool_get_page(struct vk_malloc *ma, struct vk_pool *pool,
+                                     size_t size, size_t align,
+                                     VkDeviceSize *offset)
+{
+    struct vk_slab *slab = NULL;
+    int slab_pages = MINIMUM_PAGE_COUNT;
+    size = PL_ALIGN2(size, PAGE_SIZE_ALIGN);
+    const size_t pagesize = PL_ALIGN(size, align);
+
+    for (int i = 0; i < pool->slabs.num; i++) {
+        slab = pool->slabs.elem[i];
+        if (slab->pagesize < size)
+            continue;
+        if (slab->pagesize > pagesize * MINIMUM_PAGE_COUNT) // rough heuristic
+            continue;
+        if (slab->pagesize % align)
+            continue;
+
+        pl_mutex_lock(&slab->lock);
+        int page_idx = __builtin_ffsll(slab->spacemap);
+        if (!page_idx--) {
+            pl_mutex_unlock(&slab->lock);
+            // Increase the number of pages to allocate for new slabs the
+            // more existing full slabs exist for this size range
+            slab_pages = PL_MIN(slab_pages << 1, MAXIMUM_PAGE_COUNT);
+            continue;
+        }
+
+        // Claim the first free page; the slab stays locked for the caller
+        slab->spacemap ^= 0x1LLU << page_idx;
+        *offset = page_idx * slab->pagesize;
+        return slab;
+    }
+
+    // Otherwise, allocate a new vk_slab and append it to the list.
+    VkDeviceSize slab_size = slab_pages * pagesize;
+    pl_static_assert(MINIMUM_SLAB_SIZE <= PAGE_SIZE_ALIGN * MAXIMUM_PAGE_COUNT);
+    const VkDeviceSize max_slab_size = ma->maximum_page_size * MINIMUM_PAGE_COUNT;
+    pl_assert(pagesize <= ma->maximum_page_size);
+    slab_size = PL_CLAMP(slab_size, MINIMUM_SLAB_SIZE, max_slab_size);
+    slab_pages = slab_size / pagesize;
+    slab_size = slab_pages * pagesize; // max_slab_size may be npot2, trim excess
+
+    struct vk_malloc_params params = pool->params;
+    params.reqs.size = slab_size;
+
+    // Remember which pool we are serving: while the lock is dropped below,
+    // another thread may add a pool and thereby reallocate `ma->pools`,
+    // which would leave `pool` dangling. Pool indices are stable.
+    const int pool_idx = pool->index;
+
+    // Don't hold the lock while allocating the slab, because it can be a
+    // potentially very costly operation.
+    pl_mutex_unlock(&ma->lock);
+    slab = slab_alloc(ma, &params);
+    pl_mutex_lock(&ma->lock);
+    if (!slab)
+        return NULL;
+    pl_mutex_lock(&slab->lock);
+
+    pool = &ma->pools.elem[pool_idx];
+
+    // Mark all `slab_pages` pages as free (shifting by 64 would be undefined)
+    slab->spacemap = (slab_pages == sizeof(uint64_t) * 8) ? ~0LLU : ~(~0LLU << slab_pages);
+    slab->pagesize = pagesize;
+    PL_ARRAY_APPEND(NULL, pool->slabs, slab);
+
+    // Return the first page in this newly allocated slab
+    slab->spacemap ^= 0x1;
+    *offset = 0;
+    return slab;
+}
+
+// Imports externally allocated memory (`params->shared_mem`, of handle type
+// `params->import_handle`) as a dedicated slab and describes it in `*out`.
+// Only DMA-BUF file descriptors and host pointers are supported.
+//
+// If `params->buf_usage` is set, a buffer spanning the imported memory is
+// created and bound to it. Host-visible memory is mapped, and invalidated
+// once if it is not coherent.
+//
+// Returns false on failure, in which case `*out` is zeroed and everything
+// created along the way (buffer, duplicated fd, device memory, slab) is
+// released again.
+static bool vk_malloc_import(struct vk_malloc *ma, struct vk_memslice *out,
+                             const struct vk_malloc_params *params)
+{
+    struct vk_ctx *vk = ma->vk;
+    VkExternalMemoryHandleTypeFlagBitsKHR vk_handle_type;
+    vk_handle_type = vk_mem_handle_type(params->import_handle);
+
+    struct vk_slab *slab = NULL;
+    const struct pl_shared_mem *shmem = &params->shared_mem;
+
+    // Declared up front so the error path can always release it safely
+    VkDeviceMemory vkmem = VK_NULL_HANDLE;
+
+    VkMemoryDedicatedAllocateInfoKHR dinfo = {
+        .sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO_KHR,
+        .image = params->ded_image,
+    };
+
+    VkImportMemoryFdInfoKHR fdinfo = {
+        .sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_FD_INFO_KHR,
+        .handleType = vk_handle_type,
+        .fd = -1,
+    };
+
+    VkImportMemoryHostPointerInfoEXT ptrinfo = {
+        .sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_HOST_POINTER_INFO_EXT,
+        .handleType = vk_handle_type,
+    };
+
+    VkMemoryAllocateInfo ainfo = {
+        .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
+        .allocationSize = shmem->size,
+    };
+
+    if (params->ded_image)
+        vk_link_struct(&ainfo, &dinfo);
+
+    VkBuffer buffer = VK_NULL_HANDLE;
+    VkMemoryRequirements reqs = params->reqs;
+
+    if (params->buf_usage) {
+        uint32_t qfs[3] = {0};
+        pl_assert(vk->pools.num <= PL_ARRAY_SIZE(qfs));
+        for (int i = 0; i < vk->pools.num; i++)
+            qfs[i] = vk->pools.elem[i]->qf;
+
+        VkExternalMemoryBufferCreateInfoKHR ext_buf_info = {
+            .sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_BUFFER_CREATE_INFO_KHR,
+            .handleTypes = vk_handle_type,
+        };
+
+        VkBufferCreateInfo binfo = {
+            .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
+            .pNext = &ext_buf_info,
+            .size = shmem->size,
+            .usage = params->buf_usage,
+            .sharingMode = vk->pools.num > 1 ? VK_SHARING_MODE_CONCURRENT
+                                             : VK_SHARING_MODE_EXCLUSIVE,
+            .queueFamilyIndexCount = vk->pools.num,
+            .pQueueFamilyIndices = qfs,
+        };
+
+        VK(vk->CreateBuffer(vk->dev, &binfo, PL_VK_ALLOC, &buffer));
+        PL_VK_NAME(BUFFER, buffer, "imported");
+
+        // The requirements of the actual buffer supersede the caller's
+        vk->GetBufferMemoryRequirements(vk->dev, buffer, &reqs);
+    }
+
+    if (reqs.size > shmem->size) {
+        PL_ERR(vk, "Imported object requires %zu bytes, larger than the "
+               "provided size %zu!",
+               (size_t) reqs.size, shmem->size);
+        goto error;
+    }
+
+    if (shmem->offset % reqs.alignment || shmem->offset % params->reqs.alignment) {
+        PL_ERR(vk, "Imported object offset %zu conflicts with alignment %zu!",
+               shmem->offset, pl_lcm(reqs.alignment, params->reqs.alignment));
+        goto error;
+    }
+
+    switch (params->import_handle) {
+#ifdef PL_HAVE_UNIX
+    case PL_HANDLE_DMA_BUF: {
+        if (!vk->GetMemoryFdPropertiesKHR) {
+            PL_ERR(vk, "Importing PL_HANDLE_DMA_BUF requires %s.",
+                   VK_EXT_EXTERNAL_MEMORY_DMA_BUF_EXTENSION_NAME);
+            goto error;
+        }
+
+        VkMemoryFdPropertiesKHR fdprops = {
+            .sType = VK_STRUCTURE_TYPE_MEMORY_FD_PROPERTIES_KHR,
+        };
+
+        VK(vk->GetMemoryFdPropertiesKHR(vk->dev,
+                                        vk_handle_type,
+                                        shmem->handle.fd,
+                                        &fdprops));
+
+        // We dup() the fd to make it safe to import the same original fd
+        // multiple times.
+        fdinfo.fd = dup(shmem->handle.fd);
+        if (fdinfo.fd == -1) {
+            // Report the fd we tried to duplicate, not the failed result
+            PL_ERR(vk, "Failed to dup() fd (%d) when importing memory: %s",
+                   shmem->handle.fd, strerror(errno));
+            goto error;
+        }
+
+        reqs.memoryTypeBits &= fdprops.memoryTypeBits;
+        vk_link_struct(&ainfo, &fdinfo);
+        break;
+    }
+#else // !PL_HAVE_UNIX
+    case PL_HANDLE_DMA_BUF:
+        PL_ERR(vk, "PL_HANDLE_DMA_BUF requires building with UNIX support!");
+        goto error;
+#endif
+
+    case PL_HANDLE_HOST_PTR: {
+        VkMemoryHostPointerPropertiesEXT ptrprops = {
+            .sType = VK_STRUCTURE_TYPE_MEMORY_HOST_POINTER_PROPERTIES_EXT,
+        };
+
+        VK(vk->GetMemoryHostPointerPropertiesEXT(vk->dev, vk_handle_type,
+                                                 shmem->handle.ptr,
+                                                 &ptrprops));
+
+        ptrinfo.pHostPointer = (void *) shmem->handle.ptr;
+        reqs.memoryTypeBits &= ptrprops.memoryTypeBits;
+        vk_link_struct(&ainfo, &ptrinfo);
+        break;
+    }
+
+    case PL_HANDLE_FD:
+    case PL_HANDLE_WIN32:
+    case PL_HANDLE_WIN32_KMT:
+    case PL_HANDLE_IOSURFACE:
+    case PL_HANDLE_MTL_TEX:
+        PL_ERR(vk, "vk_malloc_import: unsupported handle type %d",
+               params->import_handle);
+        goto error;
+    }
+
+    if (!find_best_memtype(ma, reqs.memoryTypeBits, params, &ainfo.memoryTypeIndex)) {
+        PL_ERR(vk, "No compatible memory types offered for imported memory!");
+        goto error;
+    }
+
+    VK(vk->AllocateMemory(vk->dev, &ainfo, PL_VK_ALLOC, &vkmem));
+
+    slab = pl_alloc_ptr(NULL, slab);
+    *slab = (struct vk_slab) {
+        .mem = vkmem,
+        .dedicated = true,
+        .imported = true,
+        .buffer = buffer,
+        .size = shmem->size,
+        .handle_type = params->import_handle,
+    };
+    pl_mutex_init(&slab->lock);
+
+    *out = (struct vk_memslice) {
+        .vkmem = vkmem,
+        .buf = buffer,
+        .size = shmem->size - shmem->offset,
+        .offset = shmem->offset,
+        .shared_mem = *shmem,
+        .priv = slab,
+    };
+
+    switch (params->import_handle) {
+    case PL_HANDLE_DMA_BUF:
+    case PL_HANDLE_FD:
+        PL_TRACE(vk, "Imported %s bytes from fd: %d%s",
+                 PRINT_SIZE(slab->size), shmem->handle.fd,
+                 params->ded_image ? " (dedicated)" : "");
+        // fd ownership is transferred at this point.
+        slab->handle.fd = fdinfo.fd;
+        fdinfo.fd = -1;
+        break;
+    case PL_HANDLE_HOST_PTR:
+        PL_TRACE(vk, "Imported %s bytes from ptr: %p%s",
+                 PRINT_SIZE(slab->size), shmem->handle.ptr,
+                 params->ded_image ? " (dedicated)" : "");
+        slab->handle.ptr = ptrinfo.pHostPointer;
+        break;
+    case PL_HANDLE_WIN32:
+    case PL_HANDLE_WIN32_KMT:
+    case PL_HANDLE_IOSURFACE:
+    case PL_HANDLE_MTL_TEX:
+        break;
+    }
+
+    VkMemoryPropertyFlags flags = ma->props.memoryTypes[ainfo.memoryTypeIndex].propertyFlags;
+    if (flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) {
+        VK(vk->MapMemory(vk->dev, slab->mem, 0, VK_WHOLE_SIZE, 0, &slab->data));
+        slab->coherent = flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
+        out->data = (uint8_t *) slab->data + out->offset;
+        out->coherent = slab->coherent;
+        if (!slab->coherent) {
+            // Use entire buffer range, since this is a dedicated memory
+            // allocation. This avoids issues with noncoherent atomicity
+            out->map_offset = 0;
+            out->map_size = VK_WHOLE_SIZE;
+
+            // Mapping does not implicitly invalidate mapped memory
+            VK(vk->InvalidateMappedMemoryRanges(vk->dev, 1, &(VkMappedMemoryRange) {
+                .sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE,
+                .memory = slab->mem,
+                .offset = out->map_offset,
+                .size = out->map_size,
+            }));
+        }
+    }
+
+    if (buffer)
+        VK(vk->BindBufferMemory(vk->dev, buffer, vkmem, 0));
+
+    return true;
+
+error:
+    if (params->debug_tag)
+        PL_ERR(vk, " for malloc: %s", params->debug_tag);
+    vk->DestroyBuffer(vk->dev, buffer, PL_VK_ALLOC);
+    // Release the imported allocation if a step after AllocateMemory failed;
+    // `vkmem` is still VK_NULL_HANDLE otherwise. This also unmaps the memory.
+    vk->FreeMemory(vk->dev, vkmem, PL_VK_ALLOC);
+#ifdef PL_HAVE_UNIX
+    if (fdinfo.fd > -1)
+        close(fdinfo.fd);
+#endif
+    if (slab) {
+        // The lock is initialized right after the slab is allocated
+        pl_mutex_destroy(&slab->lock);
+        pl_free(slab);
+    }
+    *out = (struct vk_memslice) {0};
+    return false;
+}
+
+// Returns the size of the largest memory heap backing any memory type that
+// provides all of the property `flags`, or 0 if there is no such type.
+size_t vk_malloc_avail(struct vk_malloc *ma, VkMemoryPropertyFlags flags)
+{
+    size_t largest = 0;
+
+    for (int n = 0; n < ma->props.memoryTypeCount; n++) {
+        const VkMemoryType *mtype = &ma->props.memoryTypes[n];
+        const bool matches = (mtype->propertyFlags & flags) == flags;
+        if (matches)
+            largest = PL_MAX(largest, ma->props.memoryHeaps[mtype->heapIndex].size);
+    }
+
+    return largest;
+}
+
+// Allocates a slice of memory according to `params` and describes it in
+// `*out`. Returns false on failure.
+//
+// Import requests are forwarded to vk_malloc_import(). Requests for a
+// dedicated image allocation, or larger than the maximum page size, get a
+// dedicated slab of their own. Everything else is served as a page from the
+// pool matching `params`.
+bool vk_malloc_slice(struct vk_malloc *ma, struct vk_memslice *out,
+                     const struct vk_malloc_params *params)
+{
+    struct vk_ctx *vk = ma->vk;
+    pl_assert(!params->import_handle || !params->export_handle);
+    if (params->import_handle)
+        return vk_malloc_import(ma, out, params);
+
+    pl_assert(params->reqs.size);
+    size_t size = params->reqs.size;
+    size_t align = params->reqs.alignment;
+    align = pl_lcm(align, vk->props.limits.bufferImageGranularity);
+    align = pl_lcm(align, vk->props.limits.nonCoherentAtomSize);
+
+    struct vk_slab *slab;
+    VkDeviceSize offset;
+
+    if (params->ded_image || size > ma->maximum_page_size) {
+        slab = slab_alloc(ma, params);
+        if (!slab)
+            return false;
+        slab->dedicated = true;
+        offset = 0;
+    } else {
+        pl_mutex_lock(&ma->lock);
+        struct vk_pool *pool = find_pool(ma, params);
+        // Copy the index now: `pool` points into a relocatable array and
+        // must not be dereferenced once the lock has been released, which
+        // pool_get_page() may already do internally.
+        const int pool_idx = pool->index;
+        slab = pool_get_page(ma, pool, size, align, &offset);
+        pl_mutex_unlock(&ma->lock);
+        if (!slab) {
+            PL_ERR(ma->vk, "No slab to serve request for %s bytes (with "
+                   "alignment 0x%zx) in pool %d!",
+                   PRINT_SIZE(size), align, pool_idx);
+            return false;
+        }
+
+        // For accounting, just treat the alignment as part of the used size.
+        // Doing it this way makes sure that the sizes reported to vk_memslice
+        // consumers are always aligned properly.
+        size = PL_ALIGN(size, align);
+        slab->used += size;
+        slab->age = ma->age;
+        if (params->debug_tag)
+            slab->debug_tag = params->debug_tag;
+        // pool_get_page() returned the slab locked
+        pl_mutex_unlock(&slab->lock);
+    }
+
+    pl_assert(offset % align == 0);
+    *out = (struct vk_memslice) {
+        .vkmem = slab->mem,
+        .offset = offset,
+        .size = size,
+        .buf = slab->buffer,
+        .data = slab->data ? (uint8_t *) slab->data + offset : 0x0,
+        .coherent = slab->coherent,
+        .map_offset = slab->data ? offset : 0,
+        .map_size = slab->data ? size : 0,
+        .priv = slab,
+        .shared_mem = {
+            .handle = slab->handle,
+            .offset = offset,
+            .size = slab->size,
+        },
+    };
+    return true;
+}