/*
 * This file is part of libplacebo.
 *
 * libplacebo is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * libplacebo is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
 */

#include "malloc.h"
#include "command.h"
#include "utils.h"
#include "pl_thread.h"

#ifdef PL_HAVE_UNIX
#include <errno.h>
#include <unistd.h>
#endif

// Controls the page size alignment, to help coalesce allocations into the same
// slab. Pages are rounded up to multiples of this value. (Default: 4 KB)
#define PAGE_SIZE_ALIGN (1LLU << 12)

// Controls the minimum/maximum number of pages for new slabs. As slabs are
// exhausted of memory, the number of pages per new slab grows exponentially,
// starting with the minimum until the maximum is reached.
//
// Note: The maximum must never exceed the size of `vk_slab.spacemap`.
#define MINIMUM_PAGE_COUNT 4
#define MAXIMUM_PAGE_COUNT (sizeof(uint64_t) * 8)

// Controls the maximum page size. Any allocations above this threshold
// (absolute size or fraction of VRAM, whichever is higher) will be served by
// dedicated allocations. (Default: 64 MB or 1/16 of VRAM)
#define MAXIMUM_PAGE_SIZE_ABSOLUTE (1LLU << 26)
#define MAXIMUM_PAGE_SIZE_RELATIVE 16

// Controls the minimum slab size, to avoid excessive re-allocation of very
// small slabs. (Default: 256 KB)
#define MINIMUM_SLAB_SIZE (1LLU << 18)

// How long to wait before garbage collecting empty slabs. Slabs older than
// this many invocations of `vk_malloc_garbage_collect` will be released.
#define MAXIMUM_SLAB_AGE 32

// A single slab represents a contiguous region of allocated memory. Actual
// allocations are served as pages of this. Slabs are organized into pools,
// each of which contains a list of slabs of differing page sizes.
struct vk_slab {
    pl_mutex lock;
    pl_debug_tag debug_tag;     // debug tag of the triggering allocation
    VkDeviceMemory mem;         // underlying device allocation
    VkDeviceSize size;          // total allocated size of `mem`
    VkMemoryType mtype;         // underlying memory type
    bool dedicated;             // slab is allocated specifically for one object
    bool imported;              // slab represents an imported memory allocation

    // free space accounting (only for non-dedicated slabs)
    uint64_t spacemap;          // bitset of available pages
    size_t pagesize;            // size in bytes per page
    size_t used;                // number of bytes actually in use
    uint64_t age;               // timestamp of last use

    // optional, depends on the memory type:
    VkBuffer buffer;            // buffer spanning the entire slab
    void *data;                 // mapped memory corresponding to `mem`
    bool coherent;              // mapped memory is coherent
    union pl_handle handle;     // handle associated with this device memory
    enum pl_handle_type handle_type;
};

// Represents a single memory pool. We keep track of a vk_pool for each
// combination of malloc parameters. This shouldn't actually be that many in
// practice, because some combinations simply never occur, and others will
// generally be the same for the same objects.
//
// Note: `vk_pool` addresses are not immutable, so we mustn't expose any
// dangling references to a `vk_pool` from e.g. `vk_memslice.priv = vk_slab`.
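//
// For example (illustrative, not part of the upstream comments): two
// allocations whose parameters differ only in size and alignment land in the
// same pool, because `find_pool` clears `reqs.size`, `reqs.alignment` and
// `shared_mem` before comparing parameters.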
struct vk_pool {
    struct vk_malloc_params params;     // allocation params (with some fields nulled)
    PL_ARRAY(struct vk_slab *) slabs;   // array of slabs, unsorted
    int index;                          // running index in `vk_malloc.pools`
};

// The overall state of the allocator, which keeps track of a vk_pool for each
// memory type.
struct vk_malloc {
    struct vk_ctx *vk;
    pl_mutex lock;
    VkPhysicalDeviceMemoryProperties props;
    size_t maximum_page_size;
    PL_ARRAY(struct vk_pool) pools;
    uint64_t age;
};

static inline float efficiency(size_t used, size_t total)
{
    if (!total)
        return 100.0;

    return 100.0f * used / total;
}

static const char *print_size(char buf[8], size_t size)
{
    const char *suffixes = "\0KMG";
    while (suffixes[1] && size > 9999) {
        size >>= 10;
        suffixes++;
    }

    int ret = *suffixes ? snprintf(buf, 8, "%4zu%c", size, *suffixes)
                        : snprintf(buf, 8, "%5zu", size);

    return ret >= 0 ? buf : "(error)";
}

#define PRINT_SIZE(x) (print_size((char[8]){0}, (size_t) (x)))

void vk_malloc_print_stats(struct vk_malloc *ma, enum pl_log_level lev)
{
    struct vk_ctx *vk = ma->vk;
    size_t total_size = 0;
    size_t total_used = 0;
    size_t total_res = 0;

    PL_MSG(vk, lev, "Memory heaps supported by device:");
    for (int i = 0; i < ma->props.memoryHeapCount; i++) {
        VkMemoryHeap heap = ma->props.memoryHeaps[i];
        PL_MSG(vk, lev, " %d: flags 0x%x size %s",
               i, (unsigned) heap.flags, PRINT_SIZE(heap.size));
    }

    PL_DEBUG(vk, "Memory types supported by device:");
    for (int i = 0; i < ma->props.memoryTypeCount; i++) {
        VkMemoryType type = ma->props.memoryTypes[i];
        PL_DEBUG(vk, " %d: flags 0x%x heap %d",
                 i, (unsigned) type.propertyFlags, (int) type.heapIndex);
    }

    pl_mutex_lock(&ma->lock);
    for (int i = 0; i < ma->pools.num; i++) {
        struct vk_pool *pool = &ma->pools.elem[i];
        const struct vk_malloc_params *par = &pool->params;

        PL_MSG(vk, lev, "Memory pool %d:", i);
        PL_MSG(vk, lev, " Compatible types: 0x%"PRIx32, par->reqs.memoryTypeBits);
        if (par->required)
            PL_MSG(vk, lev, " Required flags: 0x%"PRIx32, par->required);
        if (par->optimal)
            PL_MSG(vk, lev, " Optimal flags: 0x%"PRIx32, par->optimal);
        if (par->buf_usage)
            PL_MSG(vk, lev, " Buffer flags: 0x%"PRIx32, par->buf_usage);
        if (par->export_handle)
            PL_MSG(vk, lev, " Export handle: 0x%x", par->export_handle);

        size_t pool_size = 0;
        size_t pool_used = 0;
        size_t pool_res = 0;

        for (int j = 0; j < pool->slabs.num; j++) {
            struct vk_slab *slab = pool->slabs.elem[j];
            pl_mutex_lock(&slab->lock);

            size_t avail = __builtin_popcountll(slab->spacemap) * slab->pagesize;
            size_t slab_res = slab->size - avail;

            PL_MSG(vk, lev, " Slab %2d: %8"PRIx64" x %s: "
                   "%s used %s res %s alloc from heap %d, efficiency %.2f%% [%s]",
                   j, slab->spacemap, PRINT_SIZE(slab->pagesize),
                   PRINT_SIZE(slab->used), PRINT_SIZE(slab_res),
                   PRINT_SIZE(slab->size), (int) slab->mtype.heapIndex,
                   efficiency(slab->used, slab_res),
                   PL_DEF(slab->debug_tag, "unknown"));

            pool_size += slab->size;
            pool_used += slab->used;
            pool_res += slab_res;
            pl_mutex_unlock(&slab->lock);
        }

        PL_MSG(vk, lev, " Pool summary: %s used %s res %s alloc, "
               "efficiency %.2f%%, utilization %.2f%%",
               PRINT_SIZE(pool_used), PRINT_SIZE(pool_res),
               PRINT_SIZE(pool_size), efficiency(pool_used, pool_res),
               efficiency(pool_res, pool_size));

        total_size += pool_size;
        total_used += pool_used;
        total_res += pool_res;
    }
    pl_mutex_unlock(&ma->lock);

    PL_MSG(vk, lev, "Memory summary: %s used %s res %s alloc, "
           "efficiency %.2f%%, utilization %.2f%%, max page: %s",
           PRINT_SIZE(total_used), PRINT_SIZE(total_res),
           PRINT_SIZE(total_size), efficiency(total_used, total_res),
           efficiency(total_res, total_size),
           PRINT_SIZE(ma->maximum_page_size));
}

static void slab_free(struct vk_ctx *vk, struct vk_slab *slab)
{
    if (!slab)
        return;

#ifndef NDEBUG
    if (!slab->dedicated && slab->used > 0) {
        PL_WARN(vk, "Leaked %zu bytes of vulkan memory!", slab->used);
        PL_WARN(vk, "slab total size: %zu bytes, heap: %d, flags: 0x%"PRIX64,
                (size_t) slab->size, (int) slab->mtype.heapIndex,
                (uint64_t) slab->mtype.propertyFlags);
        if (slab->debug_tag)
            PL_WARN(vk, "last used for: %s", slab->debug_tag);
        pl_log_stack_trace(vk->log, PL_LOG_WARN);
        pl_debug_abort();
    }
#endif

    if (slab->imported) {
        switch (slab->handle_type) {
        case PL_HANDLE_FD:
        case PL_HANDLE_DMA_BUF:
            PL_TRACE(vk, "Unimporting slab of size %s from fd: %d",
                     PRINT_SIZE(slab->size), slab->handle.fd);
            break;
        case PL_HANDLE_WIN32:
        case PL_HANDLE_WIN32_KMT:
#ifdef PL_HAVE_WIN32
            PL_TRACE(vk, "Unimporting slab of size %s from handle: %p",
                     PRINT_SIZE(slab->size), (void *) slab->handle.handle);
#endif
            break;
        case PL_HANDLE_HOST_PTR:
            PL_TRACE(vk, "Unimporting slab of size %s from ptr: %p",
                     PRINT_SIZE(slab->size), (void *) slab->handle.ptr);
            break;
        case PL_HANDLE_IOSURFACE:
        case PL_HANDLE_MTL_TEX:
            pl_unreachable();
        }
    } else {
        switch (slab->handle_type) {
        case PL_HANDLE_FD:
        case PL_HANDLE_DMA_BUF:
#ifdef PL_HAVE_UNIX
            if (slab->handle.fd > -1)
                close(slab->handle.fd);
#endif
            break;
        case PL_HANDLE_WIN32:
#ifdef PL_HAVE_WIN32
            if (slab->handle.handle != NULL)
                CloseHandle(slab->handle.handle);
#endif
            break;
        case PL_HANDLE_WIN32_KMT:
            // PL_HANDLE_WIN32_KMT is just an identifier. It doesn't get closed.
            break;
        case PL_HANDLE_HOST_PTR:
            // Implicitly unmapped
            break;
        case PL_HANDLE_IOSURFACE:
        case PL_HANDLE_MTL_TEX:
            pl_unreachable();
        }

        PL_DEBUG(vk, "Freeing slab of size %s", PRINT_SIZE(slab->size));
    }

    vk->DestroyBuffer(vk->dev, slab->buffer, PL_VK_ALLOC);
    // also implicitly unmaps the memory if needed
    vk->FreeMemory(vk->dev, slab->mem, PL_VK_ALLOC);

    pl_mutex_destroy(&slab->lock);
    pl_free(slab);
}

// type_mask: optional
// thread-safety: safe
static bool find_best_memtype(const struct vk_malloc *ma, uint32_t type_mask,
                              const struct vk_malloc_params *params,
                              uint32_t *out_index)
{
    struct vk_ctx *vk = ma->vk;
    int best = -1;

    // The vulkan spec requires memory types to be sorted in the "optimal"
    // order, so the first matching type we find will be the best/fastest one.
    // That being said, we still want to prioritize memory types that have
    // better optional flags.
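    // (Illustrative example, not from the original comments: if `optimal`
    // requests HOST_VISIBLE | HOST_CACHED, a type matching both flags scores
    // 2 via the popcount below and beats an earlier type matching only one;
    // on equal scores the earlier, spec-preferred type wins, since the
    // comparison is strict.)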
    type_mask &= params->reqs.memoryTypeBits;
    for (int i = 0; i < ma->props.memoryTypeCount; i++) {
        const VkMemoryType *mtype = &ma->props.memoryTypes[i];

        // The memory type flags must include our properties
        if ((mtype->propertyFlags & params->required) != params->required)
            continue;

        // The memory heap must be large enough for the allocation
        VkDeviceSize heapSize = ma->props.memoryHeaps[mtype->heapIndex].size;
        if (params->reqs.size > heapSize)
            continue;

        // The memory type must be supported by the type mask (bitfield)
        if (!(type_mask & (1LU << i)))
            continue;

        // Calculate the score as the number of optimal property flags matched
        int score = __builtin_popcountl(mtype->propertyFlags & params->optimal);
        if (score > best) {
            *out_index = i;
            best = score;
        }
    }

    if (best < 0) {
        PL_ERR(vk, "Found no memory type matching property flags 0x%x and type "
               "bits 0x%x!",
               (unsigned) params->required, (unsigned) type_mask);
        return false;
    }

    return true;
}

static bool buf_external_check(struct vk_ctx *vk, VkBufferUsageFlags usage,
                               enum pl_handle_type handle_type, bool import)
{
    if (!handle_type)
        return true;

    VkPhysicalDeviceExternalBufferInfo info = {
        .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_BUFFER_INFO_KHR,
        .usage = usage,
        .handleType = vk_mem_handle_type(handle_type),
    };

    VkExternalBufferProperties props = {
        .sType = VK_STRUCTURE_TYPE_EXTERNAL_BUFFER_PROPERTIES_KHR,
    };

    if (!info.handleType)
        return false;

    vk->GetPhysicalDeviceExternalBufferProperties(vk->physd, &info, &props);
    return vk_external_mem_check(vk, &props.externalMemoryProperties,
                                 handle_type, import);
}

// thread-safety: safe
static struct vk_slab *slab_alloc(struct vk_malloc *ma,
                                  const struct vk_malloc_params *params)
{
    struct vk_ctx *vk = ma->vk;
    struct vk_slab *slab = pl_alloc_ptr(NULL, slab);
    *slab = (struct vk_slab) {
        .age = ma->age,
        .size = params->reqs.size,
        .handle_type = params->export_handle,
        .debug_tag = params->debug_tag,
    };
    pl_mutex_init(&slab->lock);

    switch (slab->handle_type) {
    case PL_HANDLE_FD:
    case PL_HANDLE_DMA_BUF:
        slab->handle.fd = -1;
        break;
    case PL_HANDLE_WIN32:
    case PL_HANDLE_WIN32_KMT:
    case PL_HANDLE_MTL_TEX:
    case PL_HANDLE_IOSURFACE:
        slab->handle.handle = NULL;
        break;
    case PL_HANDLE_HOST_PTR:
        slab->handle.ptr = NULL;
        break;
    }

    VkExportMemoryAllocateInfoKHR ext_info = {
        .sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO_KHR,
        .handleTypes = vk_mem_handle_type(slab->handle_type),
    };

    uint32_t type_mask = UINT32_MAX;
    if (params->buf_usage) {
        // Queue family sharing modes don't matter for buffers, so we just
        // set them as concurrent and stop worrying about it.
        uint32_t qfs[3] = {0};
        pl_assert(vk->pools.num <= PL_ARRAY_SIZE(qfs));
        for (int i = 0; i < vk->pools.num; i++)
            qfs[i] = vk->pools.elem[i]->qf;

        VkExternalMemoryBufferCreateInfoKHR ext_buf_info = {
            .sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_BUFFER_CREATE_INFO_KHR,
            .handleTypes = ext_info.handleTypes,
        };

        VkBufferCreateInfo binfo = {
            .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
            .pNext = slab->handle_type ? &ext_buf_info : NULL,
            .size = slab->size,
            .usage = params->buf_usage,
            .sharingMode = vk->pools.num > 1 ?
                           VK_SHARING_MODE_CONCURRENT : VK_SHARING_MODE_EXCLUSIVE,
            .queueFamilyIndexCount = vk->pools.num,
            .pQueueFamilyIndices = qfs,
        };

        if (!buf_external_check(vk, binfo.usage, slab->handle_type, false)) {
            PL_ERR(vk, "Failed allocating shared memory buffer: possibly "
                   "the handle type is unsupported?");
            goto error;
        }

        VK(vk->CreateBuffer(vk->dev, &binfo, PL_VK_ALLOC, &slab->buffer));
        PL_VK_NAME(BUFFER, slab->buffer, "slab");

        VkMemoryRequirements reqs = {0};
        vk->GetBufferMemoryRequirements(vk->dev, slab->buffer, &reqs);
        slab->size = reqs.size; // this can be larger than `slab->size`
        type_mask = reqs.memoryTypeBits;

        // Note: we can ignore `reqs.align` because we always bind the buffer
        // memory to offset 0
    }

    VkMemoryAllocateInfo minfo = {
        .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
        .allocationSize = slab->size,
    };

    if (params->export_handle)
        vk_link_struct(&minfo, &ext_info);

    VkMemoryDedicatedAllocateInfoKHR dinfo = {
        .sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO_KHR,
        .image = params->ded_image,
    };

    if (params->ded_image)
        vk_link_struct(&minfo, &dinfo);

    if (!find_best_memtype(ma, type_mask, params, &minfo.memoryTypeIndex))
        goto error;

    const VkMemoryType *mtype = &ma->props.memoryTypes[minfo.memoryTypeIndex];
    PL_DEBUG(vk, "Allocating %zu memory of type 0x%x (id %d) in heap %d: %s",
             (size_t) slab->size, (unsigned) mtype->propertyFlags,
             (int) minfo.memoryTypeIndex, (int) mtype->heapIndex,
             PL_DEF(params->debug_tag, "unknown"));

    pl_clock_t start = pl_clock_now();

    VkResult res = vk->AllocateMemory(vk->dev, &minfo, PL_VK_ALLOC, &slab->mem);
    switch (res) {
    case VK_ERROR_OUT_OF_DEVICE_MEMORY:
    case VK_ERROR_OUT_OF_HOST_MEMORY:
        PL_ERR(vk, "Allocation of size %s failed: %s!",
               PRINT_SIZE(slab->size), vk_res_str(res));
        vk_malloc_print_stats(ma, PL_LOG_ERR);
        pl_log_stack_trace(vk->log, PL_LOG_ERR);
        pl_debug_abort();
        goto error;

    default:
        PL_VK_ASSERT(res, "vkAllocateMemory");
    }

    slab->mtype = *mtype;
    if (mtype->propertyFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) {
        VK(vk->MapMemory(vk->dev, slab->mem, 0, VK_WHOLE_SIZE, 0, &slab->data));
        slab->coherent = mtype->propertyFlags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
    }

    if (slab->buffer)
        VK(vk->BindBufferMemory(vk->dev, slab->buffer, slab->mem, 0));

#ifdef PL_HAVE_UNIX
    if (slab->handle_type == PL_HANDLE_FD ||
        slab->handle_type == PL_HANDLE_DMA_BUF)
    {
        VkMemoryGetFdInfoKHR fd_info = {
            .sType = VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR,
            .memory = slab->mem,
            .handleType = ext_info.handleTypes,
        };

        VK(vk->GetMemoryFdKHR(vk->dev, &fd_info, &slab->handle.fd));
    }
#endif

#ifdef PL_HAVE_WIN32
    if (slab->handle_type == PL_HANDLE_WIN32 ||
        slab->handle_type == PL_HANDLE_WIN32_KMT)
    {
        VkMemoryGetWin32HandleInfoKHR handle_info = {
            .sType = VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR,
            .memory = slab->mem,
            .handleType = ext_info.handleTypes,
        };

        VK(vk->GetMemoryWin32HandleKHR(vk->dev, &handle_info,
                                       &slab->handle.handle));
    }
#endif

    pl_log_cpu_time(vk->log, start, pl_clock_now(), "allocating slab");

    // free space accounting is done by the caller
    return slab;

error:
    if (params->debug_tag)
        PL_ERR(vk, " for malloc: %s", params->debug_tag);
    slab_free(vk, slab);
    return NULL;
}

static void pool_uninit(struct vk_ctx *vk, struct vk_pool *pool)
{
    for (int i = 0; i < pool->slabs.num; i++)
        slab_free(vk, pool->slabs.elem[i]);

    pl_free(pool->slabs.elem);
    *pool = (struct vk_pool) {0};
}

struct vk_malloc *vk_malloc_create(struct vk_ctx *vk)
{
    struct vk_malloc *ma = pl_zalloc_ptr(NULL, ma);
    pl_mutex_init(&ma->lock);
    vk->GetPhysicalDeviceMemoryProperties(vk->physd, &ma->props);
    ma->vk = vk;
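    // Worked example (illustrative, not from the original source): on a device
    // whose largest DEVICE_LOCAL heap is 8 GiB, the loop below computes
    // max(64 MiB, 8 GiB / 16) = 512 MiB as the threshold above which
    // allocations are served by dedicated slabs instead of pooled pages.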
    // Determine maximum page size
    ma->maximum_page_size = MAXIMUM_PAGE_SIZE_ABSOLUTE;
    for (int i = 0; i < ma->props.memoryHeapCount; i++) {
        VkMemoryHeap heap = ma->props.memoryHeaps[i];
        if (heap.flags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT) {
            size_t size_max = heap.size / MAXIMUM_PAGE_SIZE_RELATIVE;
            ma->maximum_page_size = PL_MAX(ma->maximum_page_size, size_max);
        }
    }

    vk_malloc_print_stats(ma, PL_LOG_INFO);
    return ma;
}

void vk_malloc_destroy(struct vk_malloc **ma_ptr)
{
    struct vk_malloc *ma = *ma_ptr;
    if (!ma)
        return;

    vk_malloc_print_stats(ma, PL_LOG_DEBUG);
    for (int i = 0; i < ma->pools.num; i++)
        pool_uninit(ma->vk, &ma->pools.elem[i]);

    pl_mutex_destroy(&ma->lock);
    pl_free_ptr(ma_ptr);
}

void vk_malloc_garbage_collect(struct vk_malloc *ma)
{
    struct vk_ctx *vk = ma->vk;

    pl_mutex_lock(&ma->lock);
    ma->age++;

    for (int i = 0; i < ma->pools.num; i++) {
        struct vk_pool *pool = &ma->pools.elem[i];
        for (int n = 0; n < pool->slabs.num; n++) {
            struct vk_slab *slab = pool->slabs.elem[n];
            pl_mutex_lock(&slab->lock);
            if (slab->used || (ma->age - slab->age) <= MAXIMUM_SLAB_AGE) {
                pl_mutex_unlock(&slab->lock);
                continue;
            }

            PL_DEBUG(vk, "Garbage collected slab of size %s from pool %d",
                     PRINT_SIZE(slab->size), pool->index);

            pl_mutex_unlock(&slab->lock);
            slab_free(ma->vk, slab);
            PL_ARRAY_REMOVE_AT(pool->slabs, n--);
        }
    }

    pl_mutex_unlock(&ma->lock);
}

pl_handle_caps vk_malloc_handle_caps(const struct vk_malloc *ma, bool import)
{
    struct vk_ctx *vk = ma->vk;
    pl_handle_caps caps = 0;

    for (int i = 0; vk_mem_handle_list[i]; i++) {
        // Try seeing if we could allocate a "basic" buffer using these
        // capabilities, with no fancy buffer usage. More specific checks will
        // happen down the line at VkBuffer creation time, but this should give
        // us a rough idea of what the driver supports.
        enum pl_handle_type type = vk_mem_handle_list[i];
        if (buf_external_check(vk, VK_BUFFER_USAGE_TRANSFER_DST_BIT, type, import))
            caps |= type;
    }

    return caps;
}

void vk_malloc_free(struct vk_malloc *ma, struct vk_memslice *slice)
{
    struct vk_ctx *vk = ma->vk;
    struct vk_slab *slab = slice->priv;
    if (!slab || slab->dedicated) {
        slab_free(vk, slab);
        goto done;
    }

    pl_mutex_lock(&slab->lock);

    int page_idx = slice->offset / slab->pagesize;
    slab->spacemap |= 0x1LLU << page_idx;
    slab->used -= slice->size;
    slab->age = ma->age;
    pl_assert(slab->used >= 0);

    pl_mutex_unlock(&slab->lock);

done:
    *slice = (struct vk_memslice) {0};
}

static inline bool pool_params_eq(const struct vk_malloc_params *a,
                                  const struct vk_malloc_params *b)
{
    return a->reqs.size == b->reqs.size &&
           a->reqs.alignment == b->reqs.alignment &&
           a->reqs.memoryTypeBits == b->reqs.memoryTypeBits &&
           a->required == b->required &&
           a->optimal == b->optimal &&
           a->buf_usage == b->buf_usage &&
           a->export_handle == b->export_handle;
}

static struct vk_pool *find_pool(struct vk_malloc *ma,
                                 const struct vk_malloc_params *params)
{
    pl_assert(!params->import_handle);
    pl_assert(!params->ded_image);

    struct vk_malloc_params fixed = *params;
    fixed.reqs.alignment = 0;
    fixed.reqs.size = 0;
    fixed.shared_mem = (struct pl_shared_mem) {0};

    for (int i = 0; i < ma->pools.num; i++) {
        if (pool_params_eq(&ma->pools.elem[i].params, &fixed))
            return &ma->pools.elem[i];
    }

    // Not found => add it
    PL_ARRAY_GROW(ma, ma->pools);
    size_t idx = ma->pools.num++;
    ma->pools.elem[idx] = (struct vk_pool) {
        .params = fixed,
        .index = idx,
    };

    return &ma->pools.elem[idx];
}

// Returns a suitable memory page from the pool. A new slab will be allocated
// under the hood, if necessary.
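// (Illustrative sizing example, not from the original comments: a 100 KiB
// request with 4 KiB alignment keeps a 100 KiB page size; the first slab
// allocated for that pool then spans MINIMUM_PAGE_COUNT (4) such pages,
// clamped to at least MINIMUM_SLAB_SIZE.)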
//
// Note: This locks the slab it returns
static struct vk_slab *pool_get_page(struct vk_malloc *ma, struct vk_pool *pool,
                                     size_t size, size_t align,
                                     VkDeviceSize *offset)
{
    struct vk_slab *slab = NULL;
    int slab_pages = MINIMUM_PAGE_COUNT;
    size = PL_ALIGN2(size, PAGE_SIZE_ALIGN);
    const size_t pagesize = PL_ALIGN(size, align);

    for (int i = 0; i < pool->slabs.num; i++) {
        slab = pool->slabs.elem[i];
        if (slab->pagesize < size)
            continue;
        if (slab->pagesize > pagesize * MINIMUM_PAGE_COUNT) // rough heuristic
            continue;
        if (slab->pagesize % align)
            continue;

        pl_mutex_lock(&slab->lock);
        int page_idx = __builtin_ffsll(slab->spacemap);
        if (!page_idx--) {
            pl_mutex_unlock(&slab->lock);
            // Increase the number of pages to allocate for new slabs the
            // more existing full slabs exist for this size range
            slab_pages = PL_MIN(slab_pages << 1, MAXIMUM_PAGE_COUNT);
            continue;
        }

        slab->spacemap ^= 0x1LLU << page_idx;
        *offset = page_idx * slab->pagesize;
        return slab;
    }

    // Otherwise, allocate a new vk_slab and append it to the list.
    VkDeviceSize slab_size = slab_pages * pagesize;
    pl_static_assert(MINIMUM_SLAB_SIZE <= PAGE_SIZE_ALIGN * MAXIMUM_PAGE_COUNT);
    const VkDeviceSize max_slab_size = ma->maximum_page_size * MINIMUM_PAGE_COUNT;
    pl_assert(pagesize <= ma->maximum_page_size);
    slab_size = PL_CLAMP(slab_size, MINIMUM_SLAB_SIZE, max_slab_size);
    slab_pages = slab_size / pagesize;
    slab_size = slab_pages * pagesize; // max_slab_size may be npot2, trim excess

    struct vk_malloc_params params = pool->params;
    params.reqs.size = slab_size;

    // Don't hold the lock while allocating the slab, because it can be a
    // potentially very costly operation.
    pl_mutex_unlock(&ma->lock);
    slab = slab_alloc(ma, &params);
    pl_mutex_lock(&ma->lock);
    if (!slab)
        return NULL;
    pl_mutex_lock(&slab->lock);

    slab->spacemap = (slab_pages == sizeof(uint64_t) * 8) ? ~0LLU : ~(~0LLU << slab_pages);
    slab->pagesize = pagesize;
    PL_ARRAY_APPEND(NULL, pool->slabs, slab);

    // Return the first page in this newly allocated slab
    slab->spacemap ^= 0x1;
    *offset = 0;
    return slab;
}

static bool vk_malloc_import(struct vk_malloc *ma, struct vk_memslice *out,
                             const struct vk_malloc_params *params)
{
    struct vk_ctx *vk = ma->vk;
    VkExternalMemoryHandleTypeFlagBitsKHR vk_handle_type;
    vk_handle_type = vk_mem_handle_type(params->import_handle);

    struct vk_slab *slab = NULL;
    const struct pl_shared_mem *shmem = &params->shared_mem;

    VkMemoryDedicatedAllocateInfoKHR dinfo = {
        .sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO_KHR,
        .image = params->ded_image,
    };

    VkImportMemoryFdInfoKHR fdinfo = {
        .sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_FD_INFO_KHR,
        .handleType = vk_handle_type,
        .fd = -1,
    };

    VkImportMemoryHostPointerInfoEXT ptrinfo = {
        .sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_HOST_POINTER_INFO_EXT,
        .handleType = vk_handle_type,
    };

    VkMemoryAllocateInfo ainfo = {
        .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
        .allocationSize = shmem->size,
    };

    if (params->ded_image)
        vk_link_struct(&ainfo, &dinfo);

    VkBuffer buffer = VK_NULL_HANDLE;
    VkMemoryRequirements reqs = params->reqs;

    if (params->buf_usage) {
        uint32_t qfs[3] = {0};
        pl_assert(vk->pools.num <= PL_ARRAY_SIZE(qfs));
        for (int i = 0; i < vk->pools.num; i++)
            qfs[i] = vk->pools.elem[i]->qf;

        VkExternalMemoryBufferCreateInfoKHR ext_buf_info = {
            .sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_BUFFER_CREATE_INFO_KHR,
            .handleTypes = vk_handle_type,
        };

        VkBufferCreateInfo binfo = {
            .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
            .pNext = &ext_buf_info,
            .size = shmem->size,
            .usage = params->buf_usage,
            .sharingMode = vk->pools.num > 1 ?
                           VK_SHARING_MODE_CONCURRENT : VK_SHARING_MODE_EXCLUSIVE,
            .queueFamilyIndexCount = vk->pools.num,
            .pQueueFamilyIndices = qfs,
        };

        VK(vk->CreateBuffer(vk->dev, &binfo, PL_VK_ALLOC, &buffer));
        PL_VK_NAME(BUFFER, buffer, "imported");
        vk->GetBufferMemoryRequirements(vk->dev, buffer, &reqs);
    }

    if (reqs.size > shmem->size) {
        PL_ERR(vk, "Imported object requires %zu bytes, larger than the "
               "provided size %zu!",
               (size_t) reqs.size, shmem->size);
        goto error;
    }

    if (shmem->offset % reqs.alignment || shmem->offset % params->reqs.alignment) {
        PL_ERR(vk, "Imported object offset %zu conflicts with alignment %zu!",
               shmem->offset, pl_lcm(reqs.alignment, params->reqs.alignment));
        goto error;
    }

    switch (params->import_handle) {
#ifdef PL_HAVE_UNIX
    case PL_HANDLE_DMA_BUF: {
        if (!vk->GetMemoryFdPropertiesKHR) {
            PL_ERR(vk, "Importing PL_HANDLE_DMA_BUF requires %s.",
                   VK_EXT_EXTERNAL_MEMORY_DMA_BUF_EXTENSION_NAME);
            goto error;
        }

        VkMemoryFdPropertiesKHR fdprops = {
            .sType = VK_STRUCTURE_TYPE_MEMORY_FD_PROPERTIES_KHR,
        };

        VK(vk->GetMemoryFdPropertiesKHR(vk->dev, vk_handle_type,
                                        shmem->handle.fd, &fdprops));

        // We dup() the fd to make it safe to import the same original fd
        // multiple times.
        fdinfo.fd = dup(shmem->handle.fd);
        if (fdinfo.fd == -1) {
            PL_ERR(vk, "Failed to dup() fd (%d) when importing memory: %s",
                   fdinfo.fd, strerror(errno));
            goto error;
        }

        reqs.memoryTypeBits &= fdprops.memoryTypeBits;
        vk_link_struct(&ainfo, &fdinfo);
        break;
    }
#else // !PL_HAVE_UNIX
    case PL_HANDLE_DMA_BUF:
        PL_ERR(vk, "PL_HANDLE_DMA_BUF requires building with UNIX support!");
        goto error;
#endif

    case PL_HANDLE_HOST_PTR: {
        VkMemoryHostPointerPropertiesEXT ptrprops = {
            .sType = VK_STRUCTURE_TYPE_MEMORY_HOST_POINTER_PROPERTIES_EXT,
        };

        VK(vk->GetMemoryHostPointerPropertiesEXT(vk->dev, vk_handle_type,
                                                 shmem->handle.ptr, &ptrprops));

        ptrinfo.pHostPointer = (void *) shmem->handle.ptr;
        reqs.memoryTypeBits &= ptrprops.memoryTypeBits;
        vk_link_struct(&ainfo, &ptrinfo);
        break;
    }

    case PL_HANDLE_FD:
    case PL_HANDLE_WIN32:
    case PL_HANDLE_WIN32_KMT:
    case PL_HANDLE_IOSURFACE:
    case PL_HANDLE_MTL_TEX:
        PL_ERR(vk, "vk_malloc_import: unsupported handle type %d",
               params->import_handle);
        goto error;
    }

    if (!find_best_memtype(ma, reqs.memoryTypeBits, params, &ainfo.memoryTypeIndex)) {
        PL_ERR(vk, "No compatible memory types offered for imported memory!");
        goto error;
    }

    VkDeviceMemory vkmem = VK_NULL_HANDLE;
    VK(vk->AllocateMemory(vk->dev, &ainfo, PL_VK_ALLOC, &vkmem));

    slab = pl_alloc_ptr(NULL, slab);
    *slab = (struct vk_slab) {
        .mem = vkmem,
        .dedicated = true,
        .imported = true,
        .buffer = buffer,
        .size = shmem->size,
        .handle_type = params->import_handle,
    };
    pl_mutex_init(&slab->lock);

    *out = (struct vk_memslice) {
        .vkmem = vkmem,
        .buf = buffer,
        .size = shmem->size - shmem->offset,
        .offset = shmem->offset,
        .shared_mem = *shmem,
        .priv = slab,
    };

    switch (params->import_handle) {
    case PL_HANDLE_DMA_BUF:
    case PL_HANDLE_FD:
        PL_TRACE(vk, "Imported %s bytes from fd: %d%s",
                 PRINT_SIZE(slab->size), shmem->handle.fd,
                 params->ded_image ? " (dedicated)" : "");
        // fd ownership is transferred at this point.
        slab->handle.fd = fdinfo.fd;
        fdinfo.fd = -1;
        break;
    case PL_HANDLE_HOST_PTR:
        PL_TRACE(vk, "Imported %s bytes from ptr: %p%s",
                 PRINT_SIZE(slab->size), shmem->handle.ptr,
                 params->ded_image ? " (dedicated)" : "");
        slab->handle.ptr = ptrinfo.pHostPointer;
        break;
    case PL_HANDLE_WIN32:
    case PL_HANDLE_WIN32_KMT:
    case PL_HANDLE_IOSURFACE:
    case PL_HANDLE_MTL_TEX:
        break;
    }

    VkMemoryPropertyFlags flags = ma->props.memoryTypes[ainfo.memoryTypeIndex].propertyFlags;
    if (flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) {
        VK(vk->MapMemory(vk->dev, slab->mem, 0, VK_WHOLE_SIZE, 0, &slab->data));
        slab->coherent = flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
        out->data = (uint8_t *) slab->data + out->offset;
        out->coherent = slab->coherent;
        if (!slab->coherent) {
            // Use entire buffer range, since this is a dedicated memory
            // allocation. This avoids issues with noncoherent atomicity
            out->map_offset = 0;
            out->map_size = VK_WHOLE_SIZE;

            // Mapping does not implicitly invalidate mapped memory
            VK(vk->InvalidateMappedMemoryRanges(vk->dev, 1, &(VkMappedMemoryRange) {
                .sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE,
                .memory = slab->mem,
                .offset = out->map_offset,
                .size = out->map_size,
            }));
        }
    }

    if (buffer)
        VK(vk->BindBufferMemory(vk->dev, buffer, vkmem, 0));

    return true;

error:
    if (params->debug_tag)
        PL_ERR(vk, " for malloc: %s", params->debug_tag);
    vk->DestroyBuffer(vk->dev, buffer, PL_VK_ALLOC);
#ifdef PL_HAVE_UNIX
    if (fdinfo.fd > -1)
        close(fdinfo.fd);
#endif
    pl_free(slab);
    *out = (struct vk_memslice) {0};
    return false;
}

size_t vk_malloc_avail(struct vk_malloc *ma, VkMemoryPropertyFlags flags)
{
    size_t avail = 0;
    for (int i = 0; i < ma->props.memoryTypeCount; i++) {
        const VkMemoryType *mtype = &ma->props.memoryTypes[i];
        if ((mtype->propertyFlags & flags) != flags)
            continue;
        avail = PL_MAX(avail, ma->props.memoryHeaps[mtype->heapIndex].size);
    }

    return avail;
}

bool vk_malloc_slice(struct vk_malloc *ma, struct vk_memslice *out,
                     const struct vk_malloc_params *params)
{
    struct vk_ctx *vk = ma->vk;
    pl_assert(!params->import_handle || !params->export_handle);
    if (params->import_handle)
        return vk_malloc_import(ma, out, params);

    pl_assert(params->reqs.size);
    size_t size = params->reqs.size;
    size_t align = params->reqs.alignment;
    align = pl_lcm(align, vk->props.limits.bufferImageGranularity);
    align = pl_lcm(align, vk->props.limits.nonCoherentAtomSize);

    struct vk_slab *slab;
    VkDeviceSize offset;

    if (params->ded_image || size > ma->maximum_page_size) {
        slab = slab_alloc(ma, params);
        if (!slab)
            return false;
        slab->dedicated = true;
        offset = 0;
    } else {
        pl_mutex_lock(&ma->lock);
        struct vk_pool *pool = find_pool(ma, params);
        slab = pool_get_page(ma, pool, size, align, &offset);
        pl_mutex_unlock(&ma->lock);
        if (!slab) {
            PL_ERR(ma->vk, "No slab to serve request for %s bytes (with "
                   "alignment 0x%zx) in pool %d!",
                   PRINT_SIZE(size), align, pool->index);
            return false;
        }

        // For accounting, just treat the alignment as part of the used size.
        // Doing it this way makes sure that the sizes reported to vk_memslice
        // consumers are always aligned properly.
        size = PL_ALIGN(size, align);
        slab->used += size;
        slab->age = ma->age;
        if (params->debug_tag)
            slab->debug_tag = params->debug_tag;
        pl_mutex_unlock(&slab->lock);
    }

    pl_assert(offset % align == 0);
    *out = (struct vk_memslice) {
        .vkmem = slab->mem,
        .offset = offset,
        .size = size,
        .buf = slab->buffer,
        .data = slab->data ? (uint8_t *) slab->data + offset : 0x0,
        .coherent = slab->coherent,
        .map_offset = slab->data ? offset : 0,
        .map_size = slab->data ? size : 0,
        .priv = slab,
        .shared_mem = {
            .handle = slab->handle,
            .offset = offset,
            .size = slab->size,
        },
    };
    return true;
}