/*
* This file is part of libplacebo.
*
* libplacebo is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* libplacebo is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
*/
#include "malloc.h"
#include "command.h"
#include "utils.h"
#include "pl_thread.h"
#ifdef PL_HAVE_UNIX
#include <errno.h>
#include <unistd.h>
#endif
// Controls the page size alignment, to help coalesce allocations into the same
// slab. Requested sizes are rounded up to a multiple of this value before a
// page size is chosen. (Default: 4 KB)
#define PAGE_SIZE_ALIGN (1LLU << 12)
// Controls the minimum/maximum number of pages for new slabs. As slabs are
// exhausted of memory, the number of pages per new slab grows exponentially
// (doubling for every full slab encountered), starting with the minimum until
// the maximum is reached.
//
// Note: The maximum must never exceed the number of bits in
// `vk_slab.spacemap`, since each page is tracked by one bit of that bitset.
#define MINIMUM_PAGE_COUNT 4
#define MAXIMUM_PAGE_COUNT (sizeof(uint64_t) * 8)
// Controls the maximum page size. Any allocations above this threshold
// (absolute size or fraction of the largest device-local heap, whichever is
// higher) will be served by dedicated allocations.
// (Default: 64 MB or 1/16 of the heap size)
#define MAXIMUM_PAGE_SIZE_ABSOLUTE (1LLU << 26)
#define MAXIMUM_PAGE_SIZE_RELATIVE 16
// Controls the minimum slab size, to avoid excessive re-allocation of very
// small slabs. (Default: 256 KB)
#define MINIMUM_SLAB_SIZE (1LLU << 18)
// How long to wait before garbage collecting empty slabs. Unused slabs whose
// last use is more than this many invocations of `vk_malloc_garbage_collect`
// in the past will be released.
#define MAXIMUM_SLAB_AGE 32
// A single slab represents a contiguous region of allocated memory. Actual
// allocations are served as pages of this. Slabs are organized into pools,
// each of which contains a list of slabs of differing page sizes.
//
// Dedicated and imported slabs are not tracked by any pool; they are owned
// directly by the `vk_memslice` that references them.
struct vk_slab {
    pl_mutex lock; // held while touching `spacemap`, `used`, `age`, `debug_tag`
    pl_debug_tag debug_tag; // debug tag of the triggering allocation
    VkDeviceMemory mem; // underlying device allocation
    VkDeviceSize size; // total allocated size of `mem`
    VkMemoryType mtype; // underlying memory type
    bool dedicated; // slab is allocated specifically for one object
    bool imported; // slab represents an imported memory allocation
    // free space accounting (only for non-dedicated slabs)
    uint64_t spacemap; // bitset of available pages (set bit = free page)
    size_t pagesize; // size in bytes per page
    size_t used; // number of bytes actually in use
    uint64_t age; // value of `vk_malloc.age` at the time of last use
    // optional, depends on the memory type:
    VkBuffer buffer; // buffer spanning the entire slab
    void *data; // mapped memory corresponding to `mem`
    bool coherent; // mapped memory is coherent
    union pl_handle handle; // handle associated with this device memory
    enum pl_handle_type handle_type; // which member of `handle` is meaningful
};
// Represents a single memory pool. We keep track of a vk_pool for each
// combination of malloc parameters. This shouldn't actually be that many in
// practice, because some combinations simply never occur, and others will
// generally be the same for the same objects.
//
// Note: `vk_pool` addresses are not immutable (the containing array in
// `vk_malloc.pools` may be reallocated when a new pool is added), so we
// mustn't expose any dangling references to a `vk_pool` from e.g.
// `vk_memslice.priv = vk_slab`.
struct vk_pool {
    struct vk_malloc_params params; // allocation params (with some fields nulled)
    PL_ARRAY(struct vk_slab *) slabs; // array of slabs, unsorted
    int index; // running index in `vk_malloc.pools`
};
// The overall state of the allocator, which keeps track of a vk_pool for each
// distinct combination of allocation parameters seen so far.
struct vk_malloc {
    struct vk_ctx *vk; // owning vulkan context
    pl_mutex lock; // held while accessing `pools` and while bumping `age`
    VkPhysicalDeviceMemoryProperties props; // queried once at creation
    size_t maximum_page_size; // larger requests get dedicated allocations
    PL_ARRAY(struct vk_pool) pools; // one entry per parameter combination
    uint64_t age; // incremented on every garbage collection pass
};
// Expresses `used` as a percentage of `total`. An empty total is reported as
// fully efficient (100%) rather than dividing by zero.
static inline float efficiency(size_t used, size_t total)
{
    return total ? 100.0f * used / total : 100.0f;
}
// Formats `size` into `buf` as a fixed-width, human readable quantity: plain
// bytes up to 9999, otherwise scaled down by factors of 1024 and suffixed
// with K, M or G (G being the largest unit used). Returns `buf`, or a static
// error string if formatting failed.
static const char *print_size(char buf[8], size_t size)
{
    static const char units[] = { 'K', 'M', 'G' };
    const int num_units = (int) (sizeof(units) / sizeof(units[0]));

    int unit = -1; // -1 means "plain bytes, no suffix"
    while (unit + 1 < num_units && size > 9999) {
        size >>= 10;
        unit++;
    }

    int written;
    if (unit < 0) {
        written = snprintf(buf, 8, "%5zu", size);
    } else {
        written = snprintf(buf, 8, "%4zu%c", size, units[unit]);
    }

    if (written < 0)
        return "(error)";
    return buf;
}
// Convenience wrapper that formats into an anonymous temporary buffer, valid
// until the end of the enclosing block.
#define PRINT_SIZE(x) (print_size((char[8]){0}, (size_t) (x)))
// Logs the device's memory heaps and memory types, followed by usage
// statistics for every pool and slab currently tracked by `ma`, and finally a
// grand total. Everything is logged at level `lev`, except for the memory
// type listing, which always goes to the debug level.
//
// Takes `ma->lock` while walking the pools, and each slab's lock in turn.
void vk_malloc_print_stats(struct vk_malloc *ma, enum pl_log_level lev)
{
    struct vk_ctx *vk = ma->vk;
    size_t grand_alloc = 0, grand_used = 0, grand_res = 0;

    PL_MSG(vk, lev, "Memory heaps supported by device:");
    for (int h = 0; h < ma->props.memoryHeapCount; h++) {
        const VkMemoryHeap *heap = &ma->props.memoryHeaps[h];
        PL_MSG(vk, lev, " %d: flags 0x%x size %s",
               h, (unsigned) heap->flags, PRINT_SIZE(heap->size));
    }

    PL_DEBUG(vk, "Memory types supported by device:");
    for (int t = 0; t < ma->props.memoryTypeCount; t++) {
        const VkMemoryType *mtype = &ma->props.memoryTypes[t];
        PL_DEBUG(vk, " %d: flags 0x%x heap %d",
                 t, (unsigned) mtype->propertyFlags, (int) mtype->heapIndex);
    }

    pl_mutex_lock(&ma->lock);
    for (int p = 0; p < ma->pools.num; p++) {
        struct vk_pool *pool = &ma->pools.elem[p];
        const struct vk_malloc_params *par = &pool->params;

        // Pool header: only mention the parameters that are actually set
        PL_MSG(vk, lev, "Memory pool %d:", p);
        PL_MSG(vk, lev, " Compatible types: 0x%"PRIx32, par->reqs.memoryTypeBits);
        if (par->required) {
            PL_MSG(vk, lev, " Required flags: 0x%"PRIx32, par->required);
        }
        if (par->optimal) {
            PL_MSG(vk, lev, " Optimal flags: 0x%"PRIx32, par->optimal);
        }
        if (par->buf_usage) {
            PL_MSG(vk, lev, " Buffer flags: 0x%"PRIx32, par->buf_usage);
        }
        if (par->export_handle) {
            PL_MSG(vk, lev, " Export handle: 0x%x", par->export_handle);
        }

        size_t sum_alloc = 0, sum_used = 0, sum_res = 0;
        for (int s = 0; s < pool->slabs.num; s++) {
            struct vk_slab *slab = pool->slabs.elem[s];
            pl_mutex_lock(&slab->lock);

            // Reserved = everything not covered by a free page
            const size_t free_bytes = __builtin_popcountll(slab->spacemap) * slab->pagesize;
            const size_t reserved = slab->size - free_bytes;
            PL_MSG(vk, lev, " Slab %2d: %8"PRIx64" x %s: "
                   "%s used %s res %s alloc from heap %d, efficiency %.2f%% [%s]",
                   s, slab->spacemap, PRINT_SIZE(slab->pagesize),
                   PRINT_SIZE(slab->used), PRINT_SIZE(reserved),
                   PRINT_SIZE(slab->size), (int) slab->mtype.heapIndex,
                   efficiency(slab->used, reserved),
                   PL_DEF(slab->debug_tag, "unknown"));

            sum_alloc += slab->size;
            sum_used += slab->used;
            sum_res += reserved;
            pl_mutex_unlock(&slab->lock);
        }

        PL_MSG(vk, lev, " Pool summary: %s used %s res %s alloc, "
               "efficiency %.2f%%, utilization %.2f%%",
               PRINT_SIZE(sum_used), PRINT_SIZE(sum_res),
               PRINT_SIZE(sum_alloc), efficiency(sum_used, sum_res),
               efficiency(sum_res, sum_alloc));

        grand_alloc += sum_alloc;
        grand_used += sum_used;
        grand_res += sum_res;
    }
    pl_mutex_unlock(&ma->lock);

    PL_MSG(vk, lev, "Memory summary: %s used %s res %s alloc, "
           "efficiency %.2f%%, utilization %.2f%%, max page: %s",
           PRINT_SIZE(grand_used), PRINT_SIZE(grand_res),
           PRINT_SIZE(grand_alloc), efficiency(grand_used, grand_res),
           efficiency(grand_res, grand_alloc),
           PRINT_SIZE(ma->maximum_page_size));
}
static void slab_free(struct vk_ctx *vk, struct vk_slab *slab)
{
if (!slab)
return;
#ifndef NDEBUG
if (!slab->dedicated && slab->used > 0) {
PL_WARN(vk, "Leaked %zu bytes of vulkan memory!", slab->used);
PL_WARN(vk, "slab total size: %zu bytes, heap: %d, flags: 0x%"PRIX64,
(size_t) slab->size, (int) slab->mtype.heapIndex,
(uint64_t) slab->mtype.propertyFlags);
if (slab->debug_tag)
PL_WARN(vk, "last used for: %s", slab->debug_tag);
pl_log_stack_trace(vk->log, PL_LOG_WARN);
pl_debug_abort();
}
#endif
if (slab->imported) {
switch (slab->handle_type) {
case PL_HANDLE_FD:
case PL_HANDLE_DMA_BUF:
PL_TRACE(vk, "Unimporting slab of size %s from fd: %d",
PRINT_SIZE(slab->size), slab->handle.fd);
break;
case PL_HANDLE_WIN32:
case PL_HANDLE_WIN32_KMT:
#ifdef PL_HAVE_WIN32
PL_TRACE(vk, "Unimporting slab of size %s from handle: %p",
PRINT_SIZE(slab->size), (void *) slab->handle.handle);
#endif
break;
case PL_HANDLE_HOST_PTR:
PL_TRACE(vk, "Unimporting slab of size %s from ptr: %p",
PRINT_SIZE(slab->size), (void *) slab->handle.ptr);
break;
case PL_HANDLE_IOSURFACE:
case PL_HANDLE_MTL_TEX:
pl_unreachable();
}
} else {
switch (slab->handle_type) {
case PL_HANDLE_FD:
case PL_HANDLE_DMA_BUF:
#ifdef PL_HAVE_UNIX
if (slab->handle.fd > -1)
close(slab->handle.fd);
#endif
break;
case PL_HANDLE_WIN32:
#ifdef PL_HAVE_WIN32
if (slab->handle.handle != NULL)
CloseHandle(slab->handle.handle);
#endif
break;
case PL_HANDLE_WIN32_KMT:
// PL_HANDLE_WIN32_KMT is just an identifier. It doesn't get closed.
break;
case PL_HANDLE_HOST_PTR:
// Implicitly unmapped
break;
case PL_HANDLE_IOSURFACE:
case PL_HANDLE_MTL_TEX:
pl_unreachable();
}
PL_DEBUG(vk, "Freeing slab of size %s", PRINT_SIZE(slab->size));
}
vk->DestroyBuffer(vk->dev, slab->buffer, PL_VK_ALLOC);
// also implicitly unmaps the memory if needed
vk->FreeMemory(vk->dev, slab->mem, PL_VK_ALLOC);
pl_mutex_destroy(&slab->lock);
pl_free(slab);
}
// Picks the memory type best suited for `params` and stores its index in
// `*out_index`. Returns false (after logging an error) if no type qualifies,
// in which case `*out_index` is left untouched.
//
// type_mask: additional restriction on the allowed memory types; pass
//            UINT32_MAX for no extra restriction
// thread-safety: safe
static bool find_best_memtype(const struct vk_malloc *ma, uint32_t type_mask,
                              const struct vk_malloc_params *params,
                              uint32_t *out_index)
{
    struct vk_ctx *vk = ma->vk;
    int top_score = -1;

    // Only types allowed by both the caller's mask and the requirements count
    type_mask &= params->reqs.memoryTypeBits;

    // The vulkan spec requires memory types to be sorted in the "optimal"
    // order, so on equal score the earliest candidate wins (strict `>` below).
    // Among candidates, prefer those matching more of the optional flags.
    for (int idx = 0; idx < ma->props.memoryTypeCount; idx++) {
        const VkMemoryType *cand = &ma->props.memoryTypes[idx];
        const VkMemoryHeap *heap = &ma->props.memoryHeaps[cand->heapIndex];

        const bool has_required =
            (cand->propertyFlags & params->required) == params->required;
        const bool fits_in_heap = params->reqs.size <= heap->size;
        const bool in_type_mask = (type_mask & (1LU << idx)) != 0;
        if (!has_required || !fits_in_heap || !in_type_mask)
            continue;

        // Score = number of optimal property flags this type provides
        const int score = __builtin_popcountl(cand->propertyFlags & params->optimal);
        if (score > top_score) {
            top_score = score;
            *out_index = idx;
        }
    }

    if (top_score >= 0)
        return true;

    PL_ERR(vk, "Found no memory type matching property flags 0x%x and type "
           "bits 0x%x!",
           (unsigned) params->required, (unsigned) type_mask);
    return false;
}
// Checks whether a buffer with the given `usage` can be used together with
// external memory of type `handle_type`, for import or export depending on
// `import`. A zero `handle_type` (no external memory) trivially passes; a
// handle type with no vulkan equivalent fails.
static bool buf_external_check(struct vk_ctx *vk, VkBufferUsageFlags usage,
                               enum pl_handle_type handle_type, bool import)
{
    if (!handle_type)
        return true;

    const VkExternalMemoryHandleTypeFlagBitsKHR vk_type =
        vk_mem_handle_type(handle_type);
    if (!vk_type)
        return false;

    VkPhysicalDeviceExternalBufferInfo query = {
        .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_BUFFER_INFO_KHR,
        .usage = usage,
        .handleType = vk_type,
    };

    VkExternalBufferProperties result = {
        .sType = VK_STRUCTURE_TYPE_EXTERNAL_BUFFER_PROPERTIES_KHR,
    };

    vk->GetPhysicalDeviceExternalBufferProperties(vk->physd, &query, &result);
    return vk_external_mem_check(vk, &result.externalMemoryProperties,
                                 handle_type, import);
}
// Allocates a new slab of (at least) `params->reqs.size` bytes, including the
// backing device memory and, if `params->buf_usage` is set, a buffer spanning
// the whole slab. Host-visible memory is mapped persistently, and the memory
// is exported as an OS handle if `params->export_handle` is set.
//
// Returns NULL on failure. The returned slab has no free space accounting set
// up (`spacemap`, `pagesize`) and is not added to any pool; that is left to
// the caller.
//
// thread-safety: safe
static struct vk_slab *slab_alloc(struct vk_malloc *ma,
                                  const struct vk_malloc_params *params)
{
    struct vk_ctx *vk = ma->vk;
    struct vk_slab *slab = pl_alloc_ptr(NULL, slab);
    *slab = (struct vk_slab) {
        .age = ma->age,
        .size = params->reqs.size,
        .handle_type = params->export_handle,
        .debug_tag = params->debug_tag,
    };
    pl_mutex_init(&slab->lock);
    // Reset the handle to its "invalid" value, so that `slab_free` on the
    // error path does not try to close a handle that was never exported
    switch (slab->handle_type) {
    case PL_HANDLE_FD:
    case PL_HANDLE_DMA_BUF:
        slab->handle.fd = -1;
        break;
    case PL_HANDLE_WIN32:
    case PL_HANDLE_WIN32_KMT:
    case PL_HANDLE_MTL_TEX:
    case PL_HANDLE_IOSURFACE:
        slab->handle.handle = NULL;
        break;
    case PL_HANDLE_HOST_PTR:
        slab->handle.ptr = NULL;
        break;
    }
    // Only linked into the allocation info below if exporting was requested
    VkExportMemoryAllocateInfoKHR ext_info = {
        .sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO_KHR,
        .handleTypes = vk_mem_handle_type(slab->handle_type),
    };
    // Without a buffer, no extra restriction on the memory type applies
    uint32_t type_mask = UINT32_MAX;
    if (params->buf_usage) {
        // Queue family sharing modes don't matter for buffers, so we just
        // set them as concurrent and stop worrying about it.
        uint32_t qfs[3] = {0};
        pl_assert(vk->pools.num <= PL_ARRAY_SIZE(qfs));
        for (int i = 0; i < vk->pools.num; i++)
            qfs[i] = vk->pools.elem[i]->qf;
        VkExternalMemoryBufferCreateInfoKHR ext_buf_info = {
            .sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_BUFFER_CREATE_INFO_KHR,
            .handleTypes = ext_info.handleTypes,
        };
        VkBufferCreateInfo binfo = {
            .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
            .pNext = slab->handle_type ? &ext_buf_info : NULL,
            .size = slab->size,
            .usage = params->buf_usage,
            .sharingMode = vk->pools.num > 1 ? VK_SHARING_MODE_CONCURRENT
                                             : VK_SHARING_MODE_EXCLUSIVE,
            .queueFamilyIndexCount = vk->pools.num,
            .pQueueFamilyIndices = qfs,
        };
        if (!buf_external_check(vk, binfo.usage, slab->handle_type, false)) {
            PL_ERR(vk, "Failed allocating shared memory buffer: possibly "
                   "the handle type is unsupported?");
            goto error;
        }
        // NOTE(review): VK() presumably jumps to `error` on failure - confirm
        // against its definition
        VK(vk->CreateBuffer(vk->dev, &binfo, PL_VK_ALLOC, &slab->buffer));
        PL_VK_NAME(BUFFER, slab->buffer, "slab");
        VkMemoryRequirements reqs = {0};
        vk->GetBufferMemoryRequirements(vk->dev, slab->buffer, &reqs);
        slab->size = reqs.size; // this can be larger than `slab->size`
        type_mask = reqs.memoryTypeBits;
        // Note: we can ignore `reqs.align` because we always bind the buffer
        // memory to offset 0
    }
    VkMemoryAllocateInfo minfo = {
        .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
        .allocationSize = slab->size,
    };
    if (params->export_handle)
        vk_link_struct(&minfo, &ext_info);
    // Only linked in when the allocation is dedicated to a specific image
    VkMemoryDedicatedAllocateInfoKHR dinfo = {
        .sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO_KHR,
        .image = params->ded_image,
    };
    if (params->ded_image)
        vk_link_struct(&minfo, &dinfo);
    if (!find_best_memtype(ma, type_mask, params, &minfo.memoryTypeIndex))
        goto error;
    const VkMemoryType *mtype = &ma->props.memoryTypes[minfo.memoryTypeIndex];
    PL_DEBUG(vk, "Allocating %zu memory of type 0x%x (id %d) in heap %d: %s",
             (size_t) slab->size, (unsigned) mtype->propertyFlags,
             (int) minfo.memoryTypeIndex, (int) mtype->heapIndex,
             PL_DEF(params->debug_tag, "unknown"));
    pl_clock_t start = pl_clock_now();
    VkResult res = vk->AllocateMemory(vk->dev, &minfo, PL_VK_ALLOC, &slab->mem);
    switch (res) {
    // Out-of-memory gets extra diagnostics (allocator stats + stack trace)
    case VK_ERROR_OUT_OF_DEVICE_MEMORY:
    case VK_ERROR_OUT_OF_HOST_MEMORY:
        PL_ERR(vk, "Allocation of size %s failed: %s!",
               PRINT_SIZE(slab->size), vk_res_str(res));
        vk_malloc_print_stats(ma, PL_LOG_ERR);
        pl_log_stack_trace(vk->log, PL_LOG_ERR);
        pl_debug_abort();
        goto error;
    default:
        PL_VK_ASSERT(res, "vkAllocateMemory");
    }
    slab->mtype = *mtype;
    // Host-visible memory stays mapped for the whole lifetime of the slab
    if (mtype->propertyFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) {
        VK(vk->MapMemory(vk->dev, slab->mem, 0, VK_WHOLE_SIZE, 0, &slab->data));
        slab->coherent = mtype->propertyFlags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
    }
    if (slab->buffer)
        VK(vk->BindBufferMemory(vk->dev, slab->buffer, slab->mem, 0));
    // Export the freshly allocated memory as an OS handle, if requested
#ifdef PL_HAVE_UNIX
    if (slab->handle_type == PL_HANDLE_FD ||
        slab->handle_type == PL_HANDLE_DMA_BUF)
    {
        VkMemoryGetFdInfoKHR fd_info = {
            .sType = VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR,
            .memory = slab->mem,
            .handleType = ext_info.handleTypes,
        };
        VK(vk->GetMemoryFdKHR(vk->dev, &fd_info, &slab->handle.fd));
    }
#endif
#ifdef PL_HAVE_WIN32
    if (slab->handle_type == PL_HANDLE_WIN32 ||
        slab->handle_type == PL_HANDLE_WIN32_KMT)
    {
        VkMemoryGetWin32HandleInfoKHR handle_info = {
            .sType = VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR,
            .memory = slab->mem,
            .handleType = ext_info.handleTypes,
        };
        VK(vk->GetMemoryWin32HandleKHR(vk->dev, &handle_info,
                                       &slab->handle.handle));
    }
#endif
    pl_log_cpu_time(vk->log, start, pl_clock_now(), "allocating slab");
    // free space accounting is done by the caller
    return slab;
error:
    // `slab_free` tears down whatever parts of the slab were already created
    if (params->debug_tag)
        PL_ERR(vk, " for malloc: %s", params->debug_tag);
    slab_free(vk, slab);
    return NULL;
}
// Frees every slab owned by `pool`, releases the slab array itself and
// resets the pool to an all-zero state.
static void pool_uninit(struct vk_ctx *vk, struct vk_pool *pool)
{
    const int num_slabs = pool->slabs.num;
    for (int s = 0; s < num_slabs; s++) {
        struct vk_slab *slab = pool->slabs.elem[s];
        slab_free(vk, slab);
    }

    pl_free(pool->slabs.elem);
    *pool = (struct vk_pool) {0};
}
// Creates a new allocator for the device in `vk`. This queries the device's
// memory properties, derives the maximum page size from them and logs an
// initial overview at info level.
struct vk_malloc *vk_malloc_create(struct vk_ctx *vk)
{
    struct vk_malloc *ma = pl_zalloc_ptr(NULL, ma);
    pl_mutex_init(&ma->lock);
    vk->GetPhysicalDeviceMemoryProperties(vk->physd, &ma->props);
    ma->vk = vk;

    // The maximum page size is the absolute limit, or the configured fraction
    // of the largest device-local heap, whichever is higher
    ma->maximum_page_size = MAXIMUM_PAGE_SIZE_ABSOLUTE;
    for (int h = 0; h < ma->props.memoryHeapCount; h++) {
        const VkMemoryHeap *heap = &ma->props.memoryHeaps[h];
        if (!(heap->flags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT))
            continue;

        const size_t relative_max = heap->size / MAXIMUM_PAGE_SIZE_RELATIVE;
        if (relative_max > ma->maximum_page_size)
            ma->maximum_page_size = relative_max;
    }

    vk_malloc_print_stats(ma, PL_LOG_INFO);
    return ma;
}
void vk_malloc_destroy(struct vk_malloc **ma_ptr)
{
struct vk_malloc *ma = *ma_ptr;
if (!ma)
return;
vk_malloc_print_stats(ma, PL_LOG_DEBUG);
for (int i = 0; i < ma->pools.num; i++)
pool_uninit(ma->vk, &ma->pools.elem[i]);
pl_mutex_destroy(&ma->lock);
pl_free_ptr(ma_ptr);
}
void vk_malloc_garbage_collect(struct vk_malloc *ma)
{
struct vk_ctx *vk = ma->vk;
pl_mutex_lock(&ma->lock);
ma->age++;
for (int i = 0; i < ma->pools.num; i++) {
struct vk_pool *pool = &ma->pools.elem[i];
for (int n = 0; n < pool->slabs.num; n++) {
struct vk_slab *slab = pool->slabs.elem[n];
pl_mutex_lock(&slab->lock);
if (slab->used || (ma->age - slab->age) <= MAXIMUM_SLAB_AGE) {
pl_mutex_unlock(&slab->lock);
continue;
}
PL_DEBUG(vk, "Garbage collected slab of size %s from pool %d",
PRINT_SIZE(slab->size), pool->index);
pl_mutex_unlock(&slab->lock);
slab_free(ma->vk, slab);
PL_ARRAY_REMOVE_AT(pool->slabs, n--);
}
}
pl_mutex_unlock(&ma->lock);
}
// Returns the set of handle types that can be used for importing (if
// `import` is true) or exporting memory. Each entry of the zero-terminated
// `vk_mem_handle_list` is probed against a "basic" buffer with no fancy
// usage flags. More specific checks happen later at VkBuffer creation time,
// so this only gives a rough idea of what the driver supports.
pl_handle_caps vk_malloc_handle_caps(const struct vk_malloc *ma, bool import)
{
    struct vk_ctx *vk = ma->vk;
    pl_handle_caps supported = 0;

    int idx = 0;
    while (vk_mem_handle_list[idx]) {
        enum pl_handle_type type = vk_mem_handle_list[idx];
        bool ok = buf_external_check(vk, VK_BUFFER_USAGE_TRANSFER_DST_BIT,
                                     type, import);
        if (ok)
            supported |= type;
        idx++;
    }

    return supported;
}
// Returns the memory described by `slice` to the allocator and resets
// `*slice` to all zeroes.
//
// Slices backed by a dedicated (or imported) slab free that slab right away.
// Slices served from a pooled slab merely mark their page as available again;
// the slab itself is only released later by `vk_malloc_garbage_collect`.
// A slice without a backing slab (`priv == NULL`) is simply cleared.
void vk_malloc_free(struct vk_malloc *ma, struct vk_memslice *slice)
{
    struct vk_ctx *vk = ma->vk;
    struct vk_slab *slab = slice->priv;
    if (!slab || slab->dedicated) {
        slab_free(vk, slab);
        goto done;
    }

    pl_mutex_lock(&slab->lock);
    int page_idx = slice->offset / slab->pagesize;
    // `used` is unsigned, so an underflow has to be caught *before* the
    // subtraction; checking for `>= 0` afterwards could never fail.
    pl_assert(slab->used >= slice->size);
    slab->spacemap |= 0x1LLU << page_idx;
    slab->used -= slice->size;
    slab->age = ma->age; // restart the garbage collection timer
    pl_mutex_unlock(&slab->lock);

done:
    *slice = (struct vk_memslice) {0};
}
static inline bool pool_params_eq(const struct vk_malloc_params *a,
const struct vk_malloc_params *b)
{
return a->reqs.size == b->reqs.size &&
a->reqs.alignment == b->reqs.alignment &&
a->reqs.memoryTypeBits == b->reqs.memoryTypeBits &&
a->required == b->required &&
a->optimal == b->optimal &&
a->buf_usage == b->buf_usage &&
a->export_handle == b->export_handle;
}
static struct vk_pool *find_pool(struct vk_malloc *ma,
const struct vk_malloc_params *params)
{
pl_assert(!params->import_handle);
pl_assert(!params->ded_image);
struct vk_malloc_params fixed = *params;
fixed.reqs.alignment = 0;
fixed.reqs.size = 0;
fixed.shared_mem = (struct pl_shared_mem) {0};
for (int i = 0; i < ma->pools.num; i++) {
if (pool_params_eq(&ma->pools.elem[i].params, &fixed))
return &ma->pools.elem[i];
}
// Not found => add it
PL_ARRAY_GROW(ma, ma->pools);
size_t idx = ma->pools.num++;
ma->pools.elem[idx] = (struct vk_pool) {
.params = fixed,
.index = idx,
};
return &ma->pools.elem[idx];
}
// Returns a suitable memory page from the pool. A new slab will be allocated
// under the hood, if necessary.
//
// `size` and `align` describe the requested allocation; on success, `*offset`
// receives the byte offset of the page inside the returned slab. Returns NULL
// if a new slab was needed but could not be allocated.
//
// Must be called with `ma->lock` held. The lock is temporarily released while
// a new slab is being allocated, so the caller must not rely on any
// `vk_pool` pointer (including `pool`) remaining valid across this call.
//
// Note: This locks the slab it returns
static struct vk_slab *pool_get_page(struct vk_malloc *ma, struct vk_pool *pool,
                                     size_t size, size_t align,
                                     VkDeviceSize *offset)
{
    struct vk_slab *slab = NULL;
    int slab_pages = MINIMUM_PAGE_COUNT;
    size = PL_ALIGN2(size, PAGE_SIZE_ALIGN);
    const size_t pagesize = PL_ALIGN(size, align);

    for (int i = 0; i < pool->slabs.num; i++) {
        slab = pool->slabs.elem[i];
        if (slab->pagesize < size)
            continue;
        if (slab->pagesize > pagesize * MINIMUM_PAGE_COUNT) // rough heuristic
            continue;
        if (slab->pagesize % align)
            continue;

        pl_mutex_lock(&slab->lock);
        // ffs returns the 1-based index of the lowest free page, 0 if none
        int page_idx = __builtin_ffsll(slab->spacemap);
        if (!page_idx--) {
            pl_mutex_unlock(&slab->lock);
            // Increase the number of slabs to allocate for new slabs the
            // more existing full slabs exist for this size range
            slab_pages = PL_MIN(slab_pages << 1, MAXIMUM_PAGE_COUNT);
            continue;
        }

        slab->spacemap ^= 0x1LLU << page_idx;
        *offset = page_idx * slab->pagesize;
        return slab;
    }

    // Otherwise, allocate a new vk_slab and append it to the list.
    VkDeviceSize slab_size = slab_pages * pagesize;
    pl_static_assert(MINIMUM_SLAB_SIZE <= PAGE_SIZE_ALIGN * MAXIMUM_PAGE_COUNT);
    const VkDeviceSize max_slab_size = ma->maximum_page_size * MINIMUM_PAGE_COUNT;
    pl_assert(pagesize <= ma->maximum_page_size);
    slab_size = PL_CLAMP(slab_size, MINIMUM_SLAB_SIZE, max_slab_size);
    slab_pages = slab_size / pagesize;
    slab_size = slab_pages * pagesize; // max_slab_size may be npot2, trim excess

    struct vk_malloc_params params = pool->params;
    params.reqs.size = slab_size;

    // Remember which pool we are serving: `vk_pool` addresses are not stable,
    // since another thread may add a pool (reallocating `ma->pools.elem`)
    // while the lock is released below. Pool indices, on the other hand, never
    // change, because pools are only ever appended.
    const int pool_idx = pool->index;

    // Don't hold the lock while allocating the slab, because it can be a
    // potentially very costly operation.
    pl_mutex_unlock(&ma->lock);
    slab = slab_alloc(ma, &params);
    pl_mutex_lock(&ma->lock);
    if (!slab)
        return NULL;

    // Re-resolve the pool now that we hold the lock again
    pool = &ma->pools.elem[pool_idx];

    pl_mutex_lock(&slab->lock);
    // Mark all `slab_pages` pages as free; shifting by the full width of the
    // type would be undefined, hence the special case
    slab->spacemap = (slab_pages == sizeof(uint64_t) * 8) ? ~0LLU : ~(~0LLU << slab_pages);
    slab->pagesize = pagesize;
    PL_ARRAY_APPEND(NULL, pool->slabs, slab);

    // Return the first page in this newly allocated slab
    slab->spacemap ^= 0x1;
    *offset = 0;
    return slab;
}
// Imports the externally provided memory described by `params->shared_mem`
// (of handle type `params->import_handle`) as a dedicated, imported slab, and
// fills in `*out` to describe it. If `params->buf_usage` is set, a buffer
// spanning the imported memory is created and bound as well.
//
// Only PL_HANDLE_DMA_BUF (when built with UNIX support) and
// PL_HANDLE_HOST_PTR are supported; all other handle types fail.
//
// Returns false on failure, in which case `*out` is zeroed and every
// resource created along the way (buffer, device memory, duplicated fd,
// slab) is released again.
static bool vk_malloc_import(struct vk_malloc *ma, struct vk_memslice *out,
                             const struct vk_malloc_params *params)
{
    struct vk_ctx *vk = ma->vk;
    VkExternalMemoryHandleTypeFlagBitsKHR vk_handle_type;
    vk_handle_type = vk_mem_handle_type(params->import_handle);

    struct vk_slab *slab = NULL;
    const struct pl_shared_mem *shmem = &params->shared_mem;

    VkMemoryDedicatedAllocateInfoKHR dinfo = {
        .sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO_KHR,
        .image = params->ded_image,
    };

    VkImportMemoryFdInfoKHR fdinfo = {
        .sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_FD_INFO_KHR,
        .handleType = vk_handle_type,
        .fd = -1,
    };

    VkImportMemoryHostPointerInfoEXT ptrinfo = {
        .sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_HOST_POINTER_INFO_EXT,
        .handleType = vk_handle_type,
    };

    VkMemoryAllocateInfo ainfo = {
        .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
        .allocationSize = shmem->size,
    };

    if (params->ded_image)
        vk_link_struct(&ainfo, &dinfo);

    // Both handles are declared up front (as null handles) so that the error
    // path can release them unconditionally, no matter how far we got.
    VkBuffer buffer = VK_NULL_HANDLE;
    VkDeviceMemory vkmem = VK_NULL_HANDLE;
    VkMemoryRequirements reqs = params->reqs;

    if (params->buf_usage) {
        uint32_t qfs[3] = {0};
        pl_assert(vk->pools.num <= PL_ARRAY_SIZE(qfs));
        for (int i = 0; i < vk->pools.num; i++)
            qfs[i] = vk->pools.elem[i]->qf;

        VkExternalMemoryBufferCreateInfoKHR ext_buf_info = {
            .sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_BUFFER_CREATE_INFO_KHR,
            .handleTypes = vk_handle_type,
        };

        VkBufferCreateInfo binfo = {
            .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
            .pNext = &ext_buf_info,
            .size = shmem->size,
            .usage = params->buf_usage,
            .sharingMode = vk->pools.num > 1 ? VK_SHARING_MODE_CONCURRENT
                                             : VK_SHARING_MODE_EXCLUSIVE,
            .queueFamilyIndexCount = vk->pools.num,
            .pQueueFamilyIndices = qfs,
        };

        // NOTE(review): VK() presumably jumps to `error` on failure - confirm
        // against its definition
        VK(vk->CreateBuffer(vk->dev, &binfo, PL_VK_ALLOC, &buffer));
        PL_VK_NAME(BUFFER, buffer, "imported");

        // The buffer's own requirements replace the ones passed in by the
        // caller
        vk->GetBufferMemoryRequirements(vk->dev, buffer, &reqs);
    }

    if (reqs.size > shmem->size) {
        PL_ERR(vk, "Imported object requires %zu bytes, larger than the "
               "provided size %zu!",
               (size_t) reqs.size, shmem->size);
        goto error;
    }

    if (shmem->offset % reqs.alignment || shmem->offset % params->reqs.alignment) {
        PL_ERR(vk, "Imported object offset %zu conflicts with alignment %zu!",
               shmem->offset, pl_lcm(reqs.alignment, params->reqs.alignment));
        goto error;
    }

    switch (params->import_handle) {
#ifdef PL_HAVE_UNIX
    case PL_HANDLE_DMA_BUF: {
        if (!vk->GetMemoryFdPropertiesKHR) {
            PL_ERR(vk, "Importing PL_HANDLE_DMA_BUF requires %s.",
                   VK_EXT_EXTERNAL_MEMORY_DMA_BUF_EXTENSION_NAME);
            goto error;
        }

        VkMemoryFdPropertiesKHR fdprops = {
            .sType = VK_STRUCTURE_TYPE_MEMORY_FD_PROPERTIES_KHR,
        };

        VK(vk->GetMemoryFdPropertiesKHR(vk->dev,
                                        vk_handle_type,
                                        shmem->handle.fd,
                                        &fdprops));

        // We dup() the fd to make it safe to import the same original fd
        // multiple times.
        fdinfo.fd = dup(shmem->handle.fd);
        if (fdinfo.fd == -1) {
            // Report the fd we tried to duplicate, not the failed result
            PL_ERR(vk, "Failed to dup() fd (%d) when importing memory: %s",
                   shmem->handle.fd, strerror(errno));
            goto error;
        }

        reqs.memoryTypeBits &= fdprops.memoryTypeBits;
        vk_link_struct(&ainfo, &fdinfo);
        break;
    }
#else // !PL_HAVE_UNIX
    case PL_HANDLE_DMA_BUF:
        PL_ERR(vk, "PL_HANDLE_DMA_BUF requires building with UNIX support!");
        goto error;
#endif
    case PL_HANDLE_HOST_PTR: {
        VkMemoryHostPointerPropertiesEXT ptrprops = {
            .sType = VK_STRUCTURE_TYPE_MEMORY_HOST_POINTER_PROPERTIES_EXT,
        };

        VK(vk->GetMemoryHostPointerPropertiesEXT(vk->dev, vk_handle_type,
                                                 shmem->handle.ptr,
                                                 &ptrprops));

        ptrinfo.pHostPointer = (void *) shmem->handle.ptr;
        reqs.memoryTypeBits &= ptrprops.memoryTypeBits;
        vk_link_struct(&ainfo, &ptrinfo);
        break;
    }
    case PL_HANDLE_FD:
    case PL_HANDLE_WIN32:
    case PL_HANDLE_WIN32_KMT:
    case PL_HANDLE_IOSURFACE:
    case PL_HANDLE_MTL_TEX:
        PL_ERR(vk, "vk_malloc_import: unsupported handle type %d",
               params->import_handle);
        goto error;
    }

    if (!find_best_memtype(ma, reqs.memoryTypeBits, params, &ainfo.memoryTypeIndex)) {
        PL_ERR(vk, "No compatible memory types offered for imported memory!");
        goto error;
    }

    VK(vk->AllocateMemory(vk->dev, &ainfo, PL_VK_ALLOC, &vkmem));

    slab = pl_alloc_ptr(NULL, slab);
    *slab = (struct vk_slab) {
        .mem = vkmem,
        .dedicated = true,
        .imported = true,
        .buffer = buffer,
        .size = shmem->size,
        .handle_type = params->import_handle,
    };
    pl_mutex_init(&slab->lock);

    *out = (struct vk_memslice) {
        .vkmem = vkmem,
        .buf = buffer,
        .size = shmem->size - shmem->offset,
        .offset = shmem->offset,
        .shared_mem = *shmem,
        .priv = slab,
    };

    switch (params->import_handle) {
    case PL_HANDLE_DMA_BUF:
    case PL_HANDLE_FD:
        PL_TRACE(vk, "Imported %s bytes from fd: %d%s",
                 PRINT_SIZE(slab->size), shmem->handle.fd,
                 params->ded_image ? " (dedicated)" : "");
        // fd ownership is transferred at this point.
        slab->handle.fd = fdinfo.fd;
        fdinfo.fd = -1;
        break;
    case PL_HANDLE_HOST_PTR:
        PL_TRACE(vk, "Imported %s bytes from ptr: %p%s",
                 PRINT_SIZE(slab->size), shmem->handle.ptr,
                 params->ded_image ? " (dedicated)" : "");
        slab->handle.ptr = ptrinfo.pHostPointer;
        break;
    case PL_HANDLE_WIN32:
    case PL_HANDLE_WIN32_KMT:
    case PL_HANDLE_IOSURFACE:
    case PL_HANDLE_MTL_TEX:
        break;
    }

    VkMemoryPropertyFlags flags = ma->props.memoryTypes[ainfo.memoryTypeIndex].propertyFlags;
    if (flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) {
        VK(vk->MapMemory(vk->dev, slab->mem, 0, VK_WHOLE_SIZE, 0, &slab->data));
        slab->coherent = flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
        out->data = (uint8_t *) slab->data + out->offset;
        out->coherent = slab->coherent;
        if (!slab->coherent) {
            // Use entire buffer range, since this is a dedicated memory
            // allocation. This avoids issues with noncoherent atomicity
            out->map_offset = 0;
            out->map_size = VK_WHOLE_SIZE;

            // Mapping does not implicitly invalidate mapped memory
            VK(vk->InvalidateMappedMemoryRanges(vk->dev, 1, &(VkMappedMemoryRange) {
                .sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE,
                .memory = slab->mem,
                .offset = out->map_offset,
                .size = out->map_size,
            }));
        }
    }

    if (buffer)
        VK(vk->BindBufferMemory(vk->dev, buffer, vkmem, 0));

    return true;

error:
    if (params->debug_tag)
        PL_ERR(vk, " for malloc: %s", params->debug_tag);
    vk->DestroyBuffer(vk->dev, buffer, PL_VK_ALLOC);
    // A failure after the allocation succeeded (mapping, invalidating,
    // binding) must not leak the device memory. Freeing a null handle is a
    // no-op, and freeing also implicitly unmaps the memory if needed.
    vk->FreeMemory(vk->dev, vkmem, PL_VK_ALLOC);
#ifdef PL_HAVE_UNIX
    // Only still set if ownership of the dup()ed fd was never handed over
    if (fdinfo.fd > -1)
        close(fdinfo.fd);
#endif
    // The slab's mutex is always initialized right after the slab is created
    if (slab)
        pl_mutex_destroy(&slab->lock);
    pl_free(slab);
    *out = (struct vk_memslice) {0};
    return false;
}
// Returns the size of the largest memory heap that backs at least one memory
// type providing all of the given property `flags`, or 0 if there is no such
// memory type. Note that this is the total heap size, not the amount of
// memory currently free.
size_t vk_malloc_avail(struct vk_malloc *ma, VkMemoryPropertyFlags flags)
{
    size_t largest = 0;

    for (int t = 0; t < ma->props.memoryTypeCount; t++) {
        const VkMemoryType *mtype = &ma->props.memoryTypes[t];
        const bool matches = (mtype->propertyFlags & flags) == flags;
        if (matches) {
            largest = PL_MAX(largest, ma->props.memoryHeaps[mtype->heapIndex].size);
        }
    }

    return largest;
}
bool vk_malloc_slice(struct vk_malloc *ma, struct vk_memslice *out,
const struct vk_malloc_params *params)
{
struct vk_ctx *vk = ma->vk;
pl_assert(!params->import_handle || !params->export_handle);
if (params->import_handle)
return vk_malloc_import(ma, out, params);
pl_assert(params->reqs.size);
size_t size = params->reqs.size;
size_t align = params->reqs.alignment;
align = pl_lcm(align, vk->props.limits.bufferImageGranularity);
align = pl_lcm(align, vk->props.limits.nonCoherentAtomSize);
struct vk_slab *slab;
VkDeviceSize offset;
if (params->ded_image || size > ma->maximum_page_size) {
slab = slab_alloc(ma, params);
if (!slab)
return false;
slab->dedicated = true;
offset = 0;
} else {
pl_mutex_lock(&ma->lock);
struct vk_pool *pool = find_pool(ma, params);
slab = pool_get_page(ma, pool, size, align, &offset);
pl_mutex_unlock(&ma->lock);
if (!slab) {
PL_ERR(ma->vk, "No slab to serve request for %s bytes (with "
"alignment 0x%zx) in pool %d!",
PRINT_SIZE(size), align, pool->index);
return false;
}
// For accounting, just treat the alignment as part of the used size.
// Doing it this way makes sure that the sizes reported to vk_memslice
// consumers are always aligned properly.
size = PL_ALIGN(size, align);
slab->used += size;
slab->age = ma->age;
if (params->debug_tag)
slab->debug_tag = params->debug_tag;
pl_mutex_unlock(&slab->lock);
}
pl_assert(offset % align == 0);
*out = (struct vk_memslice) {
.vkmem = slab->mem,
.offset = offset,
.size = size,
.buf = slab->buffer,
.data = slab->data ? (uint8_t *) slab->data + offset : 0x0,
.coherent = slab->coherent,
.map_offset = slab->data ? offset : 0,
.map_size = slab->data ? size : 0,
.priv = slab,
.shared_mem = {
.handle = slab->handle,
.offset = offset,
.size = slab->size,
},
};
return true;
}