/*
 * This file is part of libplacebo.
 *
 * libplacebo is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * libplacebo is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
 */

#include "gpu.h"

void vk_buf_barrier(pl_gpu gpu, struct vk_cmd *cmd, pl_buf buf,
                    VkPipelineStageFlags2 stage, VkAccessFlags2 access,
                    size_t offset, size_t size, bool export)
{
    struct pl_vk *p = PL_PRIV(gpu);
    struct vk_ctx *vk = p->vk;
    struct pl_buf_vk *buf_vk = PL_PRIV(buf);
    pl_assert(!export || !buf_vk->exported); // can't re-export exported buffers
    pl_rc_ref(&buf_vk->rc);

    bool needs_flush = buf_vk->needs_flush || buf->params.host_mapped ||
                       buf->params.import_handle == PL_HANDLE_HOST_PTR;
    bool noncoherent = buf_vk->mem.data && !buf_vk->mem.coherent;
    if (needs_flush && noncoherent) {
        VK(vk->FlushMappedMemoryRanges(vk->dev, 1, &(struct VkMappedMemoryRange) {
            .sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE,
            .memory = buf_vk->mem.vkmem,
            .offset = buf_vk->mem.map_offset,
            .size = buf_vk->mem.map_size,
        }));

        // Just ignore errors, not much we can do about them other than
        // logging them and moving on...
        error: ;
    }

    struct vk_sync_scope last;
    last = vk_sem_barrier(cmd, &buf_vk->sem, stage, access, export);

    // CONCURRENT buffers require transitioning to/from IGNORED, EXCLUSIVE
    // buffers require transitioning to/from the concrete QF index
    uint32_t qf = vk->pools.num > 1 ? VK_QUEUE_FAMILY_IGNORED : cmd->pool->qf;
    uint32_t src_qf = buf_vk->exported ? VK_QUEUE_FAMILY_EXTERNAL_KHR : qf;
    uint32_t dst_qf = export ? VK_QUEUE_FAMILY_EXTERNAL_KHR : qf;

    if (last.access || src_qf != dst_qf) {
        vk_cmd_barrier(cmd, &(VkDependencyInfo) {
            .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
            .bufferMemoryBarrierCount = 1,
            .pBufferMemoryBarriers = &(VkBufferMemoryBarrier2) {
                .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
                .srcStageMask = last.stage,
                .srcAccessMask = last.access,
                .dstStageMask = stage,
                .dstAccessMask = access,
                .srcQueueFamilyIndex = src_qf,
                .dstQueueFamilyIndex = dst_qf,
                .buffer = buf_vk->mem.buf,
                .offset = buf_vk->mem.offset + offset,
                .size = size,
            },
        });
    }

    buf_vk->needs_flush = false;
    buf_vk->exported = export;
    vk_cmd_callback(cmd, (vk_cb) vk_buf_deref, gpu, buf);
}
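
/*
 * Illustrative sketch (not part of the original code): internal callers
 * record a barrier for the affected range before emitting the command that
 * accesses it, mirroring vk_buf_copy() further below, e.g.
 *
 *     vk_buf_barrier(gpu, cmd, buf, VK_PIPELINE_STAGE_2_COPY_BIT,
 *                    VK_ACCESS_2_TRANSFER_WRITE_BIT, offset, size, false);
 *     vk->CmdCopyBuffer(cmd->buf, src_buf, buf_vk->mem.buf, 1, &region);
 *
 * Note that the barrier also takes a reference to the buffer, which is
 * released again through the vk_buf_deref() callback once the command
 * finishes executing.
 */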
void vk_buf_deref(pl_gpu gpu, pl_buf buf)
{
    if (!buf)
        return;

    struct pl_vk *p = PL_PRIV(gpu);
    struct vk_ctx *vk = p->vk;
    struct pl_buf_vk *buf_vk = PL_PRIV(buf);

    if (pl_rc_deref(&buf_vk->rc)) {
        vk->DestroyBufferView(vk->dev, buf_vk->view, PL_VK_ALLOC);
        vk_malloc_free(vk->ma, &buf_vk->mem);
        pl_free((void *) buf);
    }
}

pl_buf vk_buf_create(pl_gpu gpu, const struct pl_buf_params *params)
{
    struct pl_vk *p = PL_PRIV(gpu);
    struct vk_ctx *vk = p->vk;

    struct pl_buf_t *buf = pl_zalloc_obj(NULL, buf, struct pl_buf_vk);
    buf->params = *params;
    buf->params.initial_data = NULL;

    struct pl_buf_vk *buf_vk = PL_PRIV(buf);
    pl_rc_init(&buf_vk->rc);

    struct vk_malloc_params mparams = {
        .reqs = {
            .size = PL_ALIGN2(params->size, 4), // for vk_buf_write
            .memoryTypeBits = UINT32_MAX,
            .alignment = 1,
        },
        // these are always set, because `vk_buf_copy` can always be used
        .buf_usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT |
                     VK_BUFFER_USAGE_TRANSFER_DST_BIT,
        .export_handle = params->export_handle,
        .import_handle = params->import_handle,
        .shared_mem = params->shared_mem,
        .debug_tag = params->debug_tag,
    };

    // Mandatory/optimal buffer offset alignment
    VkDeviceSize *align = &mparams.reqs.alignment;
    VkDeviceSize extra_align = vk->props.limits.optimalBufferCopyOffsetAlignment;

    // Try and align all buffers to the minimum texel alignment, to make sure
    // tex_upload/tex_download always gets aligned buffer copies if possible
    extra_align = pl_lcm(extra_align, p->min_texel_alignment);

    enum pl_buf_mem_type mem_type = params->memory_type;
    bool is_texel = false;

    if (params->uniform) {
        mparams.buf_usage |= VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT;
        *align = pl_lcm(*align, vk->props.limits.minUniformBufferOffsetAlignment);
        mem_type = PL_BUF_MEM_DEVICE;
        if (params->format) {
            mparams.buf_usage |= VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT;
            is_texel = true;
        }
    }

    if (params->storable) {
        mparams.buf_usage |= VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
        *align = pl_lcm(*align, vk->props.limits.minStorageBufferOffsetAlignment);
        buf_vk->update_queue = COMPUTE;
        mem_type = PL_BUF_MEM_DEVICE;
        if (params->format) {
            mparams.buf_usage |= VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT;
            is_texel = true;
        }
    }

    if (is_texel) {
        *align = pl_lcm(*align, vk->props.limits.minTexelBufferOffsetAlignment);
        *align = pl_lcm(*align, params->format->texel_size);
    }

    if (params->drawable) {
        mparams.buf_usage |= VK_BUFFER_USAGE_VERTEX_BUFFER_BIT |
                             VK_BUFFER_USAGE_INDEX_BUFFER_BIT;
        mem_type = PL_BUF_MEM_DEVICE;
    }

    if (params->host_writable || params->initial_data) {
        // Buffers should be written using mapped memory if possible
        mparams.optimal = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;

        // Use the transfer queue for updates on very large buffers (1 MB)
        if (params->size > 1024*1024)
            buf_vk->update_queue = TRANSFER;
    }

    if (params->host_mapped || params->host_readable) {
        mparams.required |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;

        if (params->size > 1024) {
            // Prefer cached memory for large buffers (1 kB) which may be read
            // from, because uncached reads are extremely slow
            mparams.optimal |= VK_MEMORY_PROPERTY_HOST_CACHED_BIT;
        }
    }

    switch (mem_type) {
    case PL_BUF_MEM_AUTO:
        // We generally prefer VRAM since it's faster than RAM, but any number
        // of other requirements could potentially exclude it, so just mark it
        // as optimal by default.
        if (!(mparams.optimal & VK_MEMORY_PROPERTY_HOST_CACHED_BIT))
            mparams.optimal |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
        break;
    case PL_BUF_MEM_DEVICE:
        // Force device local memory.
        mparams.required |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
        break;
    case PL_BUF_MEM_HOST:
        // This isn't a true guarantee, but actually trying to restrict the
        // device-local bit locks out all memory heaps on iGPUs. Requiring
        // the memory be host-mapped is the easiest compromise.
        mparams.required |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
        mparams.optimal |= VK_MEMORY_PROPERTY_HOST_CACHED_BIT;
        break;
    case PL_BUF_MEM_TYPE_COUNT:
        pl_unreachable();
    }

    if (params->import_handle) {
        size_t offset = params->shared_mem.offset;
        if (PL_ALIGN(offset, *align) != offset) {
            PL_ERR(gpu, "Imported memory offset %zu violates minimum alignment "
                   "requirement of enabled usage flags (%zu)!",
                   offset, (size_t) *align);
            goto error;
        }
    } else {
        *align = pl_lcm(*align, extra_align);
    }

    if (!vk_malloc_slice(vk->ma, &buf_vk->mem, &mparams))
        goto error;

    if (params->host_mapped)
        buf->data = buf_vk->mem.data;

    if (params->export_handle) {
        buf->shared_mem = buf_vk->mem.shared_mem;
        buf->shared_mem.drm_format_mod = DRM_FORMAT_MOD_LINEAR;
        buf_vk->exported = true;
    }

    if (is_texel) {
        struct pl_fmt_vk *fmtp = PL_PRIV(params->format);
        VkBufferViewCreateInfo vinfo = {
            .sType = VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO,
            .buffer = buf_vk->mem.buf,
            .format = PL_DEF(fmtp->vk_fmt->bfmt, fmtp->vk_fmt->tfmt),
            .offset = buf_vk->mem.offset,
            .range = buf_vk->mem.size,
        };

        VK(vk->CreateBufferView(vk->dev, &vinfo, PL_VK_ALLOC, &buf_vk->view));
        PL_VK_NAME(BUFFER_VIEW, buf_vk->view, PL_DEF(params->debug_tag, "texel"));
    }

    if (params->initial_data)
        vk_buf_write(gpu, buf, 0, params->initial_data, params->size);

    return buf;

error:
    vk_buf_deref(gpu, buf);
    return NULL;
}
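
/*
 * Illustrative sketch (hypothetical caller, not part of this file): users
 * reach vk_buf_create() through the generic pl_buf_create() entry point,
 * assuming the usual pl_buf_params() helper macro from the public API:
 *
 *     pl_buf buf = pl_buf_create(gpu, pl_buf_params(
 *         .size          = 1024,
 *         .host_writable = true,
 *         .initial_data  = data,
 *     ));
 *
 * All Vulkan-specific decisions (memory type selection, offset alignment,
 * texel buffer views) are made here and remain hidden from the caller.
 */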
static void invalidate_buf(pl_gpu gpu, pl_buf buf)
{
    struct pl_vk *p = PL_PRIV(gpu);
    struct vk_ctx *vk = p->vk;
    struct pl_buf_vk *buf_vk = PL_PRIV(buf);

    if (buf_vk->mem.data && !buf_vk->mem.coherent) {
        VK(vk->InvalidateMappedMemoryRanges(vk->dev, 1, &(VkMappedMemoryRange) {
            .sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE,
            .memory = buf_vk->mem.vkmem,
            .offset = buf_vk->mem.map_offset,
            .size = buf_vk->mem.map_size,
        }));
    }

    // Ignore errors (after logging), nothing useful we can do anyway
    error: ;
    vk_buf_deref(gpu, buf);
}

void vk_buf_flush(pl_gpu gpu, struct vk_cmd *cmd, pl_buf buf,
                  size_t offset, size_t size)
{
    struct pl_buf_vk *buf_vk = PL_PRIV(buf);

    // We need to perform a flush if the host is capable of reading back from
    // the buffer, or if we intend to overwrite it using mapped memory
    bool can_read = buf->params.host_readable;
    bool can_write = buf_vk->mem.data && buf->params.host_writable;
    if (buf->params.host_mapped || buf->params.import_handle == PL_HANDLE_HOST_PTR)
        can_read = can_write = true;

    if (!can_read && !can_write)
        return;

    vk_cmd_barrier(cmd, &(VkDependencyInfo) {
        .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
        .bufferMemoryBarrierCount = 1,
        .pBufferMemoryBarriers = &(VkBufferMemoryBarrier2) {
            .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
            .srcStageMask = buf_vk->sem.write.stage,
            .srcAccessMask = buf_vk->sem.write.access,
            .dstStageMask = VK_PIPELINE_STAGE_2_HOST_BIT,
            .dstAccessMask = (can_read ? VK_ACCESS_2_HOST_READ_BIT : 0) |
                             (can_write ? VK_ACCESS_2_HOST_WRITE_BIT : 0),
            .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
            .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
            .buffer = buf_vk->mem.buf,
            .offset = buf_vk->mem.offset + offset,
            .size = size,
        },
    });

    // We need to hold on to the buffer until this barrier completes
    vk_cmd_callback(cmd, (vk_cb) invalidate_buf, gpu, buf);
    pl_rc_ref(&buf_vk->rc);
}
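
/*
 * Descriptive note (added, not part of the original code): vk_buf_flush() is
 * recorded at the end of any command that writes to a host-visible buffer,
 * so the results become visible to the host. The sequence of events is:
 *
 *     1. The GPU write finishes (synchronized by the write scope above).
 *     2. The host-domain barrier recorded here makes the data host-visible.
 *     3. Once the command retires, invalidate_buf() runs as a callback,
 *        invalidates the mapped range on non-coherent memory, and drops the
 *        extra reference taken by pl_rc_ref() above.
 */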
bool vk_buf_poll(pl_gpu gpu, pl_buf buf, uint64_t timeout)
{
    struct pl_vk *p = PL_PRIV(gpu);
    struct vk_ctx *vk = p->vk;
    struct pl_buf_vk *buf_vk = PL_PRIV(buf);

    // Opportunistically check if we can re-use this buffer without flush
    vk_poll_commands(vk, 0);
    if (pl_rc_count(&buf_vk->rc) == 1)
        return false;

    // Otherwise, we're forced to submit any queued command so that the
    // user is guaranteed to see progress eventually, even if they call
    // this in a tight loop
    CMD_SUBMIT(NULL);
    vk_poll_commands(vk, timeout);

    return pl_rc_count(&buf_vk->rc) > 1;
}

void vk_buf_write(pl_gpu gpu, pl_buf buf, size_t offset,
                  const void *data, size_t size)
{
    struct pl_vk *p = PL_PRIV(gpu);
    struct vk_ctx *vk = p->vk;
    struct pl_buf_vk *buf_vk = PL_PRIV(buf);

    // For host-mapped buffers, we can just directly memcpy the buffer contents.
    // Otherwise, we can update the buffer from the GPU using a command buffer.
    if (buf_vk->mem.data) {
        // ensure no queued operations
        while (vk_buf_poll(gpu, buf, UINT64_MAX))
            ; // do nothing

        uintptr_t addr = (uintptr_t) buf_vk->mem.data + offset;
        memcpy((void *) addr, data, size);
        buf_vk->needs_flush = true;
    } else {
        struct vk_cmd *cmd = CMD_BEGIN(buf_vk->update_queue);
        if (!cmd) {
            PL_ERR(gpu, "Failed updating buffer!");
            return;
        }

        vk_buf_barrier(gpu, cmd, buf, VK_PIPELINE_STAGE_2_COPY_BIT,
                       VK_ACCESS_2_TRANSFER_WRITE_BIT, offset, size, false);

        // Vulkan requires `size` to be a multiple of 4, so we need to make
        // sure to handle the end separately if the original data is not
        const size_t max_transfer = 64 * 1024;
        size_t size_rem = size % 4;
        size_t size_base = size - size_rem;
        VkDeviceSize buf_offset = buf_vk->mem.offset + offset;

        if (size_base > max_transfer) {
            PL_TRACE(gpu, "Using multiple vkCmdUpdateBuffer calls to upload "
                     "large buffer. Consider using buffer-buffer transfers "
                     "instead!");
        }

        for (size_t xfer = 0; xfer < size_base; xfer += max_transfer) {
            vk->CmdUpdateBuffer(cmd->buf, buf_vk->mem.buf, buf_offset + xfer,
                                PL_MIN(size_base - xfer, max_transfer),
                                (void *) ((uint8_t *) data + xfer));
        }

        if (size_rem) {
            // copy the trailing (unaligned) bytes into a zero-padded tail
            uint8_t tail[4] = {0};
            memcpy(tail, (const uint8_t *) data + size_base, size_rem);
            vk->CmdUpdateBuffer(cmd->buf, buf_vk->mem.buf, buf_offset + size_base,
                                sizeof(tail), tail);
        }

        pl_assert(!buf->params.host_readable); // no flush needed due to this
        CMD_FINISH(&cmd);
    }
}
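
/*
 * Illustrative sketch (not part of the original code): a host read-back can
 * first drain pending GPU work via vk_buf_poll(), e.g.
 *
 *     while (vk_buf_poll(gpu, buf, UINT64_MAX))
 *         ; // spin until all queued commands touching `buf` have completed
 *     vk_buf_read(gpu, buf, 0, dest, size);
 *
 * This is an optimization rather than a requirement: vk_buf_read() below
 * also waits on the buffer's write semaphore before touching the mapping.
 */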
bool vk_buf_read(pl_gpu gpu, pl_buf buf, size_t offset, void *dest, size_t size)
{
    struct pl_vk *p = PL_PRIV(gpu);
    struct vk_ctx *vk = p->vk;
    struct pl_buf_vk *buf_vk = PL_PRIV(buf);
    pl_assert(buf_vk->mem.data);

    if (vk_buf_poll(gpu, buf, 0) && buf_vk->sem.write.sync.sem) {
        // ensure no more queued writes
        VK(vk->WaitSemaphores(vk->dev, &(VkSemaphoreWaitInfo) {
            .sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO,
            .semaphoreCount = 1,
            .pSemaphores = &buf_vk->sem.write.sync.sem,
            .pValues = &buf_vk->sem.write.sync.value,
        }, UINT64_MAX));

        // process callbacks
        vk_poll_commands(vk, 0);
    }

    uintptr_t addr = (uintptr_t) buf_vk->mem.data + (size_t) offset;
    memcpy(dest, (void *) addr, size);
    return true;

error:
    return false;
}

void vk_buf_copy(pl_gpu gpu, pl_buf dst, size_t dst_offset,
                 pl_buf src, size_t src_offset, size_t size)
{
    struct pl_vk *p = PL_PRIV(gpu);
    struct vk_ctx *vk = p->vk;
    struct pl_buf_vk *dst_vk = PL_PRIV(dst);
    struct pl_buf_vk *src_vk = PL_PRIV(src);

    struct vk_cmd *cmd = CMD_BEGIN(dst_vk->update_queue);
    if (!cmd) {
        PL_ERR(gpu, "Failed copying buffer!");
        return;
    }

    vk_buf_barrier(gpu, cmd, dst, VK_PIPELINE_STAGE_2_COPY_BIT,
                   VK_ACCESS_2_TRANSFER_WRITE_BIT, dst_offset, size, false);
    vk_buf_barrier(gpu, cmd, src, VK_PIPELINE_STAGE_2_COPY_BIT,
                   VK_ACCESS_2_TRANSFER_READ_BIT, src_offset, size, false);

    VkBufferCopy region = {
        .srcOffset = src_vk->mem.offset + src_offset,
        .dstOffset = dst_vk->mem.offset + dst_offset,
        .size = size,
    };

    vk->CmdCopyBuffer(cmd->buf, src_vk->mem.buf, dst_vk->mem.buf, 1, &region);
    vk_buf_flush(gpu, cmd, dst, dst_offset, size);
    CMD_FINISH(&cmd);
}

bool vk_buf_export(pl_gpu gpu, pl_buf buf)
{
    struct pl_buf_vk *buf_vk = PL_PRIV(buf);
    if (buf_vk->exported)
        return true;

    struct vk_cmd *cmd = CMD_BEGIN(ANY);
    if (!cmd) {
        PL_ERR(gpu, "Failed exporting buffer!");
        return false;
    }

    // For the queue family ownership transfer, we can ignore all pipeline
    // stages since the synchronization via fences/semaphores is required
    vk_buf_barrier(gpu, cmd, buf, VK_PIPELINE_STAGE_2_NONE, 0, 0,
                   buf->params.size, true);

    return CMD_SUBMIT(&cmd);
}
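
/*
 * Illustrative sketch (hypothetical caller, not part of this file): exporting
 * a buffer before handing its memory to an external API:
 *
 *     if (!vk_buf_export(gpu, buf))
 *         return false;
 *     // the buffer is now owned by VK_QUEUE_FAMILY_EXTERNAL_KHR
 *
 * The release barrier recorded in vk_buf_export() transfers queue family
 * ownership to VK_QUEUE_FAMILY_EXTERNAL_KHR; performing the matching acquire
 * operation is the responsibility of the external consumer.
 */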