author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-15 20:38:23 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-15 20:38:23 +0000
commit     ff6e3c025658a5fa1affd094f220b623e7e1b24b (patch)
tree       9faab72d69c92d24e349d184f5869b9796f17e0c /src/gpu
parent     Initial commit. (diff)
Adding upstream version 6.338.2.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat
-rw-r--r--  src/gpu.c        1338
-rw-r--r--  src/gpu.h         207
-rw-r--r--  src/gpu/utils.c  1288

3 files changed, 2833 insertions, 0 deletions
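
For context only (not part of the commit below): a minimal usage sketch of the texture API that this diff introduces, based solely on the function signatures visible in src/gpu.c. It assumes a valid `pl_gpu gpu` handle from some backend and a tightly packed 8-bit RGBA image in `pixels` of size `width` x `height`; those names are placeholders.

    // Pick a non-opaque, sampleable RGBA8 format
    pl_fmt fmt = pl_find_fmt(gpu, PL_FMT_UNORM, 4, 8, 8, PL_FMT_CAP_SAMPLEABLE);
    if (!fmt)
        return false;

    pl_tex tex = pl_tex_create(gpu, &(struct pl_tex_params) {
        .w             = width,
        .h             = height,
        .format        = fmt,
        .sampleable    = true,
        .host_writable = true,
    });
    if (!tex)
        return false;

    // Upload tightly packed host pixels; the transfer rect and pitches
    // are inferred when left at zero.
    bool ok = pl_tex_upload(gpu, &(struct pl_tex_transfer_params) {
        .tex = tex,
        .ptr = pixels,
    });

    pl_tex_destroy(gpu, &tex);
    return ok;
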
diff --git a/src/gpu.c b/src/gpu.c new file mode 100644 index 0000000..b639ec2 --- /dev/null +++ b/src/gpu.c @@ -0,0 +1,1338 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "common.h" +#include "gpu.h" + +#define require(expr) pl_require(gpu, expr) + +void pl_gpu_destroy(pl_gpu gpu) +{ + if (!gpu) + return; + + struct pl_gpu_fns *impl = PL_PRIV(gpu); + pl_dispatch_destroy(&impl->dp); + impl->destroy(gpu); +} + +pl_dispatch pl_gpu_dispatch(pl_gpu gpu) +{ + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + return impl->dp; +} + +pl_cache pl_gpu_cache(pl_gpu gpu) +{ + if (!gpu) + return NULL; + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + return atomic_load(&impl->cache); +} + +void pl_gpu_set_cache(pl_gpu gpu, pl_cache cache) +{ + struct pl_gpu_fns *impl = PL_PRIV(gpu); + atomic_store(&impl->cache, cache); +} + +bool pl_fmt_is_ordered(pl_fmt fmt) +{ + bool ret = !fmt->opaque; + for (int i = 0; i < fmt->num_components; i++) + ret &= fmt->sample_order[i] == i; + return ret; +} + +bool pl_fmt_is_float(pl_fmt fmt) +{ + switch (fmt->type) { + case PL_FMT_UNKNOWN: // more likely than not + case PL_FMT_FLOAT: + case PL_FMT_UNORM: + case PL_FMT_SNORM: + return true; + + case PL_FMT_UINT: + case PL_FMT_SINT: + return false; + + case PL_FMT_TYPE_COUNT: + break; + } + + pl_unreachable(); +} + +bool pl_fmt_has_modifier(pl_fmt fmt, uint64_t modifier) +{ + if (!fmt) + return false; + + for (int i = 0; i < fmt->num_modifiers; i++) { + if (fmt->modifiers[i] == modifier) + return true; + } + + return false; +} + +pl_fmt pl_find_fmt(pl_gpu gpu, enum pl_fmt_type type, int num_components, + int min_depth, int host_bits, enum pl_fmt_caps caps) +{ + for (int n = 0; n < gpu->num_formats; n++) { + pl_fmt fmt = gpu->formats[n]; + if (fmt->type != type || fmt->num_components != num_components) + continue; + if ((fmt->caps & caps) != caps) + continue; + + // When specifying some particular host representation, ensure the + // format is non-opaque, ordered and unpadded + if (host_bits && fmt->opaque) + continue; + if (host_bits && fmt->texel_size * 8 != host_bits * num_components) + continue; + if (host_bits && !pl_fmt_is_ordered(fmt)) + continue; + + for (int i = 0; i < fmt->num_components; i++) { + if (fmt->component_depth[i] < min_depth) + goto next_fmt; + if (host_bits && fmt->host_bits[i] != host_bits) + goto next_fmt; + } + + return fmt; + +next_fmt: ; // equivalent to `continue` + } + + // ran out of formats + PL_TRACE(gpu, "No matching format found"); + return NULL; +} + +pl_fmt pl_find_vertex_fmt(pl_gpu gpu, enum pl_fmt_type type, int comps) +{ + static const size_t sizes[] = { + [PL_FMT_FLOAT] = sizeof(float), + [PL_FMT_UNORM] = sizeof(unsigned), + [PL_FMT_UINT] = sizeof(unsigned), + [PL_FMT_SNORM] = sizeof(int), + [PL_FMT_SINT] = sizeof(int), + }; + + return pl_find_fmt(gpu, type, comps, 0, 8 * sizes[type], PL_FMT_CAP_VERTEX); +} 
+ +pl_fmt pl_find_named_fmt(pl_gpu gpu, const char *name) +{ + if (!name) + return NULL; + + for (int i = 0; i < gpu->num_formats; i++) { + pl_fmt fmt = gpu->formats[i]; + if (strcmp(name, fmt->name) == 0) + return fmt; + } + + // ran out of formats + return NULL; +} + +pl_fmt pl_find_fourcc(pl_gpu gpu, uint32_t fourcc) +{ + if (!fourcc) + return NULL; + + for (int i = 0; i < gpu->num_formats; i++) { + pl_fmt fmt = gpu->formats[i]; + if (fourcc == fmt->fourcc) + return fmt; + } + + // ran out of formats + return NULL; +} + +static inline bool check_mod(pl_gpu gpu, pl_fmt fmt, uint64_t mod) +{ + for (int i = 0; i < fmt->num_modifiers; i++) { + if (fmt->modifiers[i] == mod) + return true; + } + + + PL_ERR(gpu, "DRM modifier %s not available for format %s. Available modifiers:", + PRINT_DRM_MOD(mod), fmt->name); + for (int i = 0; i < fmt->num_modifiers; i++) + PL_ERR(gpu, " %s", PRINT_DRM_MOD(fmt->modifiers[i])); + + return false; +} + +pl_tex pl_tex_create(pl_gpu gpu, const struct pl_tex_params *params) +{ + require(params->format); + require(!params->import_handle || !params->export_handle); + require(!params->import_handle || !params->initial_data); + if (params->export_handle) { + require(params->export_handle & gpu->export_caps.tex); + require(PL_ISPOT(params->export_handle)); + } + if (params->import_handle) { + require(params->import_handle & gpu->import_caps.tex); + require(PL_ISPOT(params->import_handle)); + if (params->import_handle == PL_HANDLE_DMA_BUF) { + if (!check_mod(gpu, params->format, params->shared_mem.drm_format_mod)) + goto error; + if (params->shared_mem.stride_w) + require(params->w && params->shared_mem.stride_w >= params->w); + if (params->shared_mem.stride_h) + require(params->h && params->shared_mem.stride_h >= params->h); + } else if (params->import_handle == PL_HANDLE_MTL_TEX) { + require(params->shared_mem.plane <= 2); + } + } + + switch (pl_tex_params_dimension(*params)) { + case 1: + require(params->w > 0); + require(params->w <= gpu->limits.max_tex_1d_dim); + require(!params->renderable); + require(!params->blit_src || gpu->limits.blittable_1d_3d); + require(!params->blit_dst || gpu->limits.blittable_1d_3d); + require(!params->format->num_planes); + break; + case 2: + require(params->w > 0 && params->h > 0); + require(params->w <= gpu->limits.max_tex_2d_dim); + require(params->h <= gpu->limits.max_tex_2d_dim); + break; + case 3: + require(params->w > 0 && params->h > 0 && params->d > 0); + require(params->w <= gpu->limits.max_tex_3d_dim); + require(params->h <= gpu->limits.max_tex_3d_dim); + require(params->d <= gpu->limits.max_tex_3d_dim); + require(!params->renderable); + require(!params->blit_src || gpu->limits.blittable_1d_3d); + require(!params->blit_dst || gpu->limits.blittable_1d_3d); + require(!params->format->num_planes); + break; + } + + enum pl_fmt_caps fmt_caps = params->format->caps; + bool fmt_opaque = params->format->opaque; + for (int i = 0; i < params->format->num_planes; i++) { + pl_fmt pfmt = params->format->planes[i].format; + fmt_caps |= pfmt->caps; + fmt_opaque &= pfmt->opaque; + } + + require(!params->host_readable || fmt_caps & PL_FMT_CAP_HOST_READABLE); + require(!params->host_writable || !fmt_opaque); + require(!params->sampleable || fmt_caps & PL_FMT_CAP_SAMPLEABLE); + require(!params->renderable || fmt_caps & PL_FMT_CAP_RENDERABLE); + require(!params->storable || fmt_caps & PL_FMT_CAP_STORABLE); + require(!params->blit_src || fmt_caps & PL_FMT_CAP_BLITTABLE); + require(!params->blit_dst || fmt_caps & PL_FMT_CAP_BLITTABLE); + + 
const struct pl_gpu_fns *impl = PL_PRIV(gpu); + return impl->tex_create(gpu, params); + +error: + if (params->debug_tag) + PL_ERR(gpu, " for texture: %s", params->debug_tag); + return NULL; +} + +void pl_tex_destroy(pl_gpu gpu, pl_tex *tex) +{ + if (!*tex) + return; + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + impl->tex_destroy(gpu, *tex); + *tex = NULL; +} + +static bool pl_tex_params_superset(struct pl_tex_params a, struct pl_tex_params b) +{ + return a.w == b.w && a.h == b.h && a.d == b.d && + a.format == b.format && + (a.sampleable || !b.sampleable) && + (a.renderable || !b.renderable) && + (a.storable || !b.storable) && + (a.blit_src || !b.blit_src) && + (a.blit_dst || !b.blit_dst) && + (a.host_writable || !b.host_writable) && + (a.host_readable || !b.host_readable); +} + +bool pl_tex_recreate(pl_gpu gpu, pl_tex *tex, const struct pl_tex_params *params) +{ + if (params->initial_data) { + PL_ERR(gpu, "pl_tex_recreate may not be used with `initial_data`!"); + return false; + } + + if (params->import_handle) { + PL_ERR(gpu, "pl_tex_recreate may not be used with `import_handle`!"); + return false; + } + + if (*tex && pl_tex_params_superset((*tex)->params, *params)) { + pl_tex_invalidate(gpu, *tex); + return true; + } + + PL_DEBUG(gpu, "(Re)creating %dx%dx%d texture with format %s: %s", + params->w, params->h, params->d, params->format->name, + PL_DEF(params->debug_tag, "unknown")); + + pl_tex_destroy(gpu, tex); + *tex = pl_tex_create(gpu, params); + + return !!*tex; +} + +void pl_tex_clear_ex(pl_gpu gpu, pl_tex dst, const union pl_clear_color color) +{ + require(dst->params.blit_dst); + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + if (impl->tex_invalidate) + impl->tex_invalidate(gpu, dst); + impl->tex_clear_ex(gpu, dst, color); + return; + +error: + if (dst->params.debug_tag) + PL_ERR(gpu, " for texture: %s", dst->params.debug_tag); +} + +void pl_tex_clear(pl_gpu gpu, pl_tex dst, const float color[4]) +{ + if (!pl_fmt_is_float(dst->params.format)) { + PL_ERR(gpu, "Cannot call `pl_tex_clear` on integer textures, please " + "use `pl_tex_clear_ex` instead."); + return; + } + + const union pl_clear_color col = { + .f = { color[0], color[1], color[2], color[3] }, + }; + + pl_tex_clear_ex(gpu, dst, col); +} + +void pl_tex_invalidate(pl_gpu gpu, pl_tex tex) +{ + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + if (impl->tex_invalidate) + impl->tex_invalidate(gpu, tex); +} + +static void strip_coords(pl_tex tex, pl_rect3d *rc) +{ + if (!tex->params.d) { + rc->z0 = 0; + rc->z1 = 1; + } + + if (!tex->params.h) { + rc->y0 = 0; + rc->y1 = 1; + } +} + +static void infer_rc(pl_tex tex, pl_rect3d *rc) +{ + if (!rc->x0 && !rc->x1) + rc->x1 = tex->params.w; + if (!rc->y0 && !rc->y1) + rc->y1 = tex->params.h; + if (!rc->z0 && !rc->z1) + rc->z1 = tex->params.d; +} + +void pl_tex_blit(pl_gpu gpu, const struct pl_tex_blit_params *params) +{ + pl_tex src = params->src, dst = params->dst; + require(src && dst); + pl_fmt src_fmt = src->params.format; + pl_fmt dst_fmt = dst->params.format; + require(src_fmt->internal_size == dst_fmt->internal_size); + require((src_fmt->type == PL_FMT_UINT) == (dst_fmt->type == PL_FMT_UINT)); + require((src_fmt->type == PL_FMT_SINT) == (dst_fmt->type == PL_FMT_SINT)); + require(src->params.blit_src); + require(dst->params.blit_dst); + require(params->sample_mode != PL_TEX_SAMPLE_LINEAR || (src_fmt->caps & PL_FMT_CAP_LINEAR)); + + struct pl_tex_blit_params fixed = *params; + infer_rc(src, &fixed.src_rc); + infer_rc(dst, &fixed.dst_rc); + strip_coords(src, &fixed.src_rc); 
+ strip_coords(dst, &fixed.dst_rc); + + require(fixed.src_rc.x0 >= 0 && fixed.src_rc.x0 < src->params.w); + require(fixed.src_rc.x1 > 0 && fixed.src_rc.x1 <= src->params.w); + require(fixed.dst_rc.x0 >= 0 && fixed.dst_rc.x0 < dst->params.w); + require(fixed.dst_rc.x1 > 0 && fixed.dst_rc.x1 <= dst->params.w); + + if (src->params.h) { + require(fixed.src_rc.y0 >= 0 && fixed.src_rc.y0 < src->params.h); + require(fixed.src_rc.y1 > 0 && fixed.src_rc.y1 <= src->params.h); + } + + if (dst->params.h) { + require(fixed.dst_rc.y0 >= 0 && fixed.dst_rc.y0 < dst->params.h); + require(fixed.dst_rc.y1 > 0 && fixed.dst_rc.y1 <= dst->params.h); + } + + if (src->params.d) { + require(fixed.src_rc.z0 >= 0 && fixed.src_rc.z0 < src->params.d); + require(fixed.src_rc.z1 > 0 && fixed.src_rc.z1 <= src->params.d); + } + + if (dst->params.d) { + require(fixed.dst_rc.z0 >= 0 && fixed.dst_rc.z0 < dst->params.d); + require(fixed.dst_rc.z1 > 0 && fixed.dst_rc.z1 <= dst->params.d); + } + + pl_rect3d full = {0, 0, 0, dst->params.w, dst->params.h, dst->params.d}; + strip_coords(dst, &full); + + pl_rect3d rcnorm = fixed.dst_rc; + pl_rect3d_normalize(&rcnorm); + if (pl_rect3d_eq(rcnorm, full)) + pl_tex_invalidate(gpu, dst); + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + impl->tex_blit(gpu, &fixed); + return; + +error: + if (src->params.debug_tag || dst->params.debug_tag) { + PL_ERR(gpu, " for textures: src %s, dst %s", + PL_DEF(src->params.debug_tag, "(unknown)"), + PL_DEF(dst->params.debug_tag, "(unknown)")); + } +} + +static bool fix_tex_transfer(pl_gpu gpu, struct pl_tex_transfer_params *params) +{ + pl_tex tex = params->tex; + pl_fmt fmt = tex->params.format; + pl_rect3d rc = params->rc; + + // Infer the default values + infer_rc(tex, &rc); + strip_coords(tex, &rc); + + if (!params->row_pitch || !tex->params.w) + params->row_pitch = pl_rect_w(rc) * fmt->texel_size; + if (!params->depth_pitch || !tex->params.d) + params->depth_pitch = pl_rect_h(rc) * params->row_pitch; + + require(params->row_pitch); + require(params->depth_pitch); + params->rc = rc; + + // Check the parameters for sanity + switch (pl_tex_params_dimension(tex->params)) + { + case 3: + require(rc.z1 > rc.z0); + require(rc.z0 >= 0 && rc.z0 < tex->params.d); + require(rc.z1 > 0 && rc.z1 <= tex->params.d); + require(params->depth_pitch >= pl_rect_h(rc) * params->row_pitch); + require(params->depth_pitch % params->row_pitch == 0); + // fall through + case 2: + require(rc.y1 > rc.y0); + require(rc.y0 >= 0 && rc.y0 < tex->params.h); + require(rc.y1 > 0 && rc.y1 <= tex->params.h); + require(params->row_pitch >= pl_rect_w(rc) * fmt->texel_size); + require(params->row_pitch % fmt->texel_align == 0); + // fall through + case 1: + require(rc.x1 > rc.x0); + require(rc.x0 >= 0 && rc.x0 < tex->params.w); + require(rc.x1 > 0 && rc.x1 <= tex->params.w); + break; + } + + require(!params->buf ^ !params->ptr); // exactly one + if (params->buf) { + pl_buf buf = params->buf; + size_t size = pl_tex_transfer_size(params); + require(params->buf_offset + size >= params->buf_offset); // overflow check + require(params->buf_offset + size <= buf->params.size); + require(gpu->limits.buf_transfer); + } + + require(!params->callback || gpu->limits.callbacks); + return true; + +error: + if (tex->params.debug_tag) + PL_ERR(gpu, " for texture: %s", tex->params.debug_tag); + return false; +} + +bool pl_tex_upload(pl_gpu gpu, const struct pl_tex_transfer_params *params) +{ + pl_tex tex = params->tex; + require(tex->params.host_writable); + + struct pl_tex_transfer_params fixed = 
*params; + if (!fix_tex_transfer(gpu, &fixed)) + goto error; + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + return impl->tex_upload(gpu, &fixed); + +error: + if (tex->params.debug_tag) + PL_ERR(gpu, " for texture: %s", tex->params.debug_tag); + return false; +} + +bool pl_tex_download(pl_gpu gpu, const struct pl_tex_transfer_params *params) +{ + pl_tex tex = params->tex; + require(tex->params.host_readable); + + struct pl_tex_transfer_params fixed = *params; + if (!fix_tex_transfer(gpu, &fixed)) + goto error; + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + return impl->tex_download(gpu, &fixed); + +error: + if (tex->params.debug_tag) + PL_ERR(gpu, " for texture: %s", tex->params.debug_tag); + return false; +} + +bool pl_tex_poll(pl_gpu gpu, pl_tex tex, uint64_t t) +{ + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + return impl->tex_poll ? impl->tex_poll(gpu, tex, t) : false; +} + +pl_buf pl_buf_create(pl_gpu gpu, const struct pl_buf_params *params) +{ + struct pl_buf_params params_rounded; + + require(!params->import_handle || !params->export_handle); + if (params->export_handle) { + require(PL_ISPOT(params->export_handle)); + require(params->export_handle & gpu->export_caps.buf); + } + if (params->import_handle) { + require(PL_ISPOT(params->import_handle)); + require(params->import_handle & gpu->import_caps.buf); + const struct pl_shared_mem *shmem = ¶ms->shared_mem; + require(shmem->offset + params->size <= shmem->size); + require(params->import_handle != PL_HANDLE_DMA_BUF || !shmem->drm_format_mod); + + // Fix misalignment on host pointer imports + if (params->import_handle == PL_HANDLE_HOST_PTR) { + uintptr_t page_mask = ~(gpu->limits.align_host_ptr - 1); + uintptr_t ptr_base = (uintptr_t) shmem->handle.ptr & page_mask; + size_t ptr_offset = (uintptr_t) shmem->handle.ptr - ptr_base; + size_t buf_offset = ptr_offset + shmem->offset; + size_t ptr_size = PL_ALIGN2(ptr_offset + shmem->size, + gpu->limits.align_host_ptr); + + if (ptr_base != (uintptr_t) shmem->handle.ptr || ptr_size > shmem->size) { + static bool warned_rounding = false; + if (!warned_rounding) { + warned_rounding = true; + PL_WARN(gpu, "Imported host pointer is not page-aligned. 
" + "This should normally be fine on most platforms, " + "but may cause issues in some rare circumstances."); + } + + PL_TRACE(gpu, "Rounding imported host pointer %p + %zu -> %zu to " + "nearest page boundaries: %p + %zu -> %zu", + shmem->handle.ptr, shmem->offset, shmem->size, + (void *) ptr_base, buf_offset, ptr_size); + } + + params_rounded = *params; + params_rounded.shared_mem.handle.ptr = (void *) ptr_base; + params_rounded.shared_mem.offset = buf_offset; + params_rounded.shared_mem.size = ptr_size; + params = ¶ms_rounded; + } + } + + require(params->size > 0 && params->size <= gpu->limits.max_buf_size); + require(!params->uniform || params->size <= gpu->limits.max_ubo_size); + require(!params->storable || params->size <= gpu->limits.max_ssbo_size); + require(!params->drawable || params->size <= gpu->limits.max_vbo_size); + require(!params->host_mapped || params->size <= gpu->limits.max_mapped_size); + + if (params->format) { + pl_fmt fmt = params->format; + require(params->size <= gpu->limits.max_buffer_texels * fmt->texel_size); + require(!params->uniform || (fmt->caps & PL_FMT_CAP_TEXEL_UNIFORM)); + require(!params->storable || (fmt->caps & PL_FMT_CAP_TEXEL_STORAGE)); + } + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + pl_buf buf = impl->buf_create(gpu, params); + if (buf) + require(!params->host_mapped || buf->data); + + return buf; + +error: + if (params->debug_tag) + PL_ERR(gpu, " for buffer: %s", params->debug_tag); + return NULL; +} + +void pl_buf_destroy(pl_gpu gpu, pl_buf *buf) +{ + if (!*buf) + return; + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + impl->buf_destroy(gpu, *buf); + *buf = NULL; +} + +static bool pl_buf_params_superset(struct pl_buf_params a, struct pl_buf_params b) +{ + return a.size >= b.size && + a.memory_type == b.memory_type && + a.format == b.format && + (a.host_writable || !b.host_writable) && + (a.host_readable || !b.host_readable) && + (a.host_mapped || !b.host_mapped) && + (a.uniform || !b.uniform) && + (a.storable || !b.storable) && + (a.drawable || !b.drawable); +} + +bool pl_buf_recreate(pl_gpu gpu, pl_buf *buf, const struct pl_buf_params *params) +{ + + if (params->initial_data) { + PL_ERR(gpu, "pl_buf_recreate may not be used with `initial_data`!"); + return false; + } + + if (*buf && pl_buf_params_superset((*buf)->params, *params)) + return true; + + PL_INFO(gpu, "(Re)creating %zu buffer", params->size); + pl_buf_destroy(gpu, buf); + *buf = pl_buf_create(gpu, params); + + return !!*buf; +} + +void pl_buf_write(pl_gpu gpu, pl_buf buf, size_t buf_offset, + const void *data, size_t size) +{ + require(buf->params.host_writable); + require(buf_offset + size <= buf->params.size); + require(buf_offset == PL_ALIGN2(buf_offset, 4)); + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + impl->buf_write(gpu, buf, buf_offset, data, size); + return; + +error: + if (buf->params.debug_tag) + PL_ERR(gpu, " for buffer: %s", buf->params.debug_tag); +} + +bool pl_buf_read(pl_gpu gpu, pl_buf buf, size_t buf_offset, + void *dest, size_t size) +{ + require(buf->params.host_readable); + require(buf_offset + size <= buf->params.size); + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + return impl->buf_read(gpu, buf, buf_offset, dest, size); + +error: + if (buf->params.debug_tag) + PL_ERR(gpu, " for buffer: %s", buf->params.debug_tag); + return false; +} + +void pl_buf_copy(pl_gpu gpu, pl_buf dst, size_t dst_offset, + pl_buf src, size_t src_offset, size_t size) +{ + require(src_offset + size <= src->params.size); + require(dst_offset + size <= 
dst->params.size); + require(src != dst); + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + impl->buf_copy(gpu, dst, dst_offset, src, src_offset, size); + return; + +error: + if (src->params.debug_tag || dst->params.debug_tag) { + PL_ERR(gpu, " for buffers: src %s, dst %s", + src->params.debug_tag, dst->params.debug_tag); + } +} + +bool pl_buf_export(pl_gpu gpu, pl_buf buf) +{ + require(buf->params.export_handle || buf->params.import_handle); + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + return impl->buf_export(gpu, buf); + +error: + if (buf->params.debug_tag) + PL_ERR(gpu, " for buffer: %s", buf->params.debug_tag); + return false; +} + +bool pl_buf_poll(pl_gpu gpu, pl_buf buf, uint64_t t) +{ + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + return impl->buf_poll ? impl->buf_poll(gpu, buf, t) : false; +} + +size_t pl_var_type_size(enum pl_var_type type) +{ + switch (type) { + case PL_VAR_SINT: return sizeof(int); + case PL_VAR_UINT: return sizeof(unsigned int); + case PL_VAR_FLOAT: return sizeof(float); + case PL_VAR_INVALID: // fall through + case PL_VAR_TYPE_COUNT: break; + } + + pl_unreachable(); +} + +#define PL_VAR(TYPE, NAME, M, V) \ + struct pl_var pl_var_##NAME(const char *name) { \ + return (struct pl_var) { \ + .name = name, \ + .type = PL_VAR_##TYPE, \ + .dim_m = M, \ + .dim_v = V, \ + .dim_a = 1, \ + }; \ + } + +PL_VAR(FLOAT, float, 1, 1) +PL_VAR(FLOAT, vec2, 1, 2) +PL_VAR(FLOAT, vec3, 1, 3) +PL_VAR(FLOAT, vec4, 1, 4) +PL_VAR(FLOAT, mat2, 2, 2) +PL_VAR(FLOAT, mat2x3, 2, 3) +PL_VAR(FLOAT, mat2x4, 2, 4) +PL_VAR(FLOAT, mat3, 3, 3) +PL_VAR(FLOAT, mat3x4, 3, 4) +PL_VAR(FLOAT, mat4x2, 4, 2) +PL_VAR(FLOAT, mat4x3, 4, 3) +PL_VAR(FLOAT, mat4, 4, 4) +PL_VAR(SINT, int, 1, 1) +PL_VAR(SINT, ivec2, 1, 2) +PL_VAR(SINT, ivec3, 1, 3) +PL_VAR(SINT, ivec4, 1, 4) +PL_VAR(UINT, uint, 1, 1) +PL_VAR(UINT, uvec2, 1, 2) +PL_VAR(UINT, uvec3, 1, 3) +PL_VAR(UINT, uvec4, 1, 4) + +#undef PL_VAR + +const struct pl_named_var pl_var_glsl_types[] = { + // float vectors + { "float", { .type = PL_VAR_FLOAT, .dim_m = 1, .dim_v = 1, .dim_a = 1, }}, + { "vec2", { .type = PL_VAR_FLOAT, .dim_m = 1, .dim_v = 2, .dim_a = 1, }}, + { "vec3", { .type = PL_VAR_FLOAT, .dim_m = 1, .dim_v = 3, .dim_a = 1, }}, + { "vec4", { .type = PL_VAR_FLOAT, .dim_m = 1, .dim_v = 4, .dim_a = 1, }}, + // float matrices + { "mat2", { .type = PL_VAR_FLOAT, .dim_m = 2, .dim_v = 2, .dim_a = 1, }}, + { "mat2x3", { .type = PL_VAR_FLOAT, .dim_m = 2, .dim_v = 3, .dim_a = 1, }}, + { "mat2x4", { .type = PL_VAR_FLOAT, .dim_m = 2, .dim_v = 4, .dim_a = 1, }}, + { "mat3", { .type = PL_VAR_FLOAT, .dim_m = 3, .dim_v = 3, .dim_a = 1, }}, + { "mat3x4", { .type = PL_VAR_FLOAT, .dim_m = 3, .dim_v = 4, .dim_a = 1, }}, + { "mat4x2", { .type = PL_VAR_FLOAT, .dim_m = 4, .dim_v = 2, .dim_a = 1, }}, + { "mat4x3", { .type = PL_VAR_FLOAT, .dim_m = 4, .dim_v = 3, .dim_a = 1, }}, + { "mat4", { .type = PL_VAR_FLOAT, .dim_m = 4, .dim_v = 4, .dim_a = 1, }}, + // integer vectors + { "int", { .type = PL_VAR_SINT, .dim_m = 1, .dim_v = 1, .dim_a = 1, }}, + { "ivec2", { .type = PL_VAR_SINT, .dim_m = 1, .dim_v = 2, .dim_a = 1, }}, + { "ivec3", { .type = PL_VAR_SINT, .dim_m = 1, .dim_v = 3, .dim_a = 1, }}, + { "ivec4", { .type = PL_VAR_SINT, .dim_m = 1, .dim_v = 4, .dim_a = 1, }}, + // unsigned integer vectors + { "uint", { .type = PL_VAR_UINT, .dim_m = 1, .dim_v = 1, .dim_a = 1, }}, + { "uvec2", { .type = PL_VAR_UINT, .dim_m = 1, .dim_v = 2, .dim_a = 1, }}, + { "uvec3", { .type = PL_VAR_UINT, .dim_m = 1, .dim_v = 3, .dim_a = 1, }}, + { "uvec4", { .type = PL_VAR_UINT, 
.dim_m = 1, .dim_v = 4, .dim_a = 1, }}, + + {0}, +}; + +#define MAX_DIM 4 + +const char *pl_var_glsl_type_name(struct pl_var var) +{ + static const char *types[PL_VAR_TYPE_COUNT][MAX_DIM+1][MAX_DIM+1] = { + // float vectors + [PL_VAR_FLOAT][1][1] = "float", + [PL_VAR_FLOAT][1][2] = "vec2", + [PL_VAR_FLOAT][1][3] = "vec3", + [PL_VAR_FLOAT][1][4] = "vec4", + // float matrices + [PL_VAR_FLOAT][2][2] = "mat2", + [PL_VAR_FLOAT][2][3] = "mat2x3", + [PL_VAR_FLOAT][2][4] = "mat2x4", + [PL_VAR_FLOAT][3][2] = "mat3x2", + [PL_VAR_FLOAT][3][3] = "mat3", + [PL_VAR_FLOAT][3][4] = "mat3x4", + [PL_VAR_FLOAT][4][2] = "mat4x2", + [PL_VAR_FLOAT][4][3] = "mat4x3", + [PL_VAR_FLOAT][4][4] = "mat4", + // integer vectors + [PL_VAR_SINT][1][1] = "int", + [PL_VAR_SINT][1][2] = "ivec2", + [PL_VAR_SINT][1][3] = "ivec3", + [PL_VAR_SINT][1][4] = "ivec4", + // unsigned integer vectors + [PL_VAR_UINT][1][1] = "uint", + [PL_VAR_UINT][1][2] = "uvec2", + [PL_VAR_UINT][1][3] = "uvec3", + [PL_VAR_UINT][1][4] = "uvec4", + }; + + if (var.dim_v > MAX_DIM || var.dim_m > MAX_DIM) + return NULL; + + return types[var.type][var.dim_m][var.dim_v]; +} + +struct pl_var pl_var_from_fmt(pl_fmt fmt, const char *name) +{ + static const enum pl_var_type vartypes[] = { + [PL_FMT_FLOAT] = PL_VAR_FLOAT, + [PL_FMT_UNORM] = PL_VAR_FLOAT, + [PL_FMT_SNORM] = PL_VAR_FLOAT, + [PL_FMT_UINT] = PL_VAR_UINT, + [PL_FMT_SINT] = PL_VAR_SINT, + }; + + pl_assert(fmt->type < PL_ARRAY_SIZE(vartypes)); + return (struct pl_var) { + .type = vartypes[fmt->type], + .name = name, + .dim_v = fmt->num_components, + .dim_m = 1, + .dim_a = 1, + }; +} + +struct pl_var_layout pl_var_host_layout(size_t offset, const struct pl_var *var) +{ + size_t col_size = pl_var_type_size(var->type) * var->dim_v; + return (struct pl_var_layout) { + .offset = offset, + .stride = col_size, + .size = col_size * var->dim_m * var->dim_a, + }; +} + +struct pl_var_layout pl_std140_layout(size_t offset, const struct pl_var *var) +{ + size_t el_size = pl_var_type_size(var->type); + + // std140 packing rules: + // 1. The size of generic values is their size in bytes + // 2. The size of vectors is the vector length * the base count + // 3. Matrices are treated like arrays of column vectors + // 4. The size of array rows is that of the element size rounded up to + // the nearest multiple of vec4 + // 5. 
All values are aligned to a multiple of their size (stride for arrays), + // with the exception of vec3 which is aligned like vec4 + size_t stride = el_size * var->dim_v; + size_t align = stride; + if (var->dim_v == 3) + align += el_size; + if (var->dim_m * var->dim_a > 1) + stride = align = PL_ALIGN2(align, sizeof(float[4])); + + return (struct pl_var_layout) { + .offset = PL_ALIGN2(offset, align), + .stride = stride, + .size = stride * var->dim_m * var->dim_a, + }; +} + +struct pl_var_layout pl_std430_layout(size_t offset, const struct pl_var *var) +{ + size_t el_size = pl_var_type_size(var->type); + + // std430 packing rules: like std140, except arrays/matrices are always + // "tightly" packed, even arrays/matrices of vec3s + size_t stride = el_size * var->dim_v; + size_t align = stride; + if (var->dim_v == 3) + align += el_size; + if (var->dim_m * var->dim_a > 1) + stride = align; + + return (struct pl_var_layout) { + .offset = PL_ALIGN2(offset, align), + .stride = stride, + .size = stride * var->dim_m * var->dim_a, + }; +} + +void memcpy_layout(void *dst_p, struct pl_var_layout dst_layout, + const void *src_p, struct pl_var_layout src_layout) +{ + uintptr_t src = (uintptr_t) src_p + src_layout.offset; + uintptr_t dst = (uintptr_t) dst_p + dst_layout.offset; + + if (src_layout.stride == dst_layout.stride) { + pl_assert(dst_layout.size == src_layout.size); + memcpy((void *) dst, (const void *) src, src_layout.size); + return; + } + + size_t stride = PL_MIN(src_layout.stride, dst_layout.stride); + uintptr_t end = src + src_layout.size; + while (src < end) { + pl_assert(dst < dst + dst_layout.size); + memcpy((void *) dst, (const void *) src, stride); + src += src_layout.stride; + dst += dst_layout.stride; + } +} + +int pl_desc_namespace(pl_gpu gpu, enum pl_desc_type type) +{ + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + int ret = impl->desc_namespace(gpu, type); + pl_assert(ret >= 0 && ret < PL_DESC_TYPE_COUNT); + return ret; +} + +const char *pl_desc_access_glsl_name(enum pl_desc_access mode) +{ + switch (mode) { + case PL_DESC_ACCESS_READWRITE: return ""; + case PL_DESC_ACCESS_READONLY: return "readonly"; + case PL_DESC_ACCESS_WRITEONLY: return "writeonly"; + case PL_DESC_ACCESS_COUNT: break; + } + + pl_unreachable(); +} + +const struct pl_blend_params pl_alpha_overlay = { + .src_rgb = PL_BLEND_SRC_ALPHA, + .dst_rgb = PL_BLEND_ONE_MINUS_SRC_ALPHA, + .src_alpha = PL_BLEND_ONE, + .dst_alpha = PL_BLEND_ONE_MINUS_SRC_ALPHA, +}; + +static inline void log_shader_sources(pl_log log, enum pl_log_level level, + const struct pl_pass_params *params) +{ + if (!pl_msg_test(log, level) || !params->glsl_shader) + return; + + switch (params->type) { + case PL_PASS_RASTER: + if (!params->vertex_shader) + return; + pl_msg(log, level, "vertex shader source:"); + pl_msg_source(log, level, params->vertex_shader); + pl_msg(log, level, "fragment shader source:"); + pl_msg_source(log, level, params->glsl_shader); + return; + + case PL_PASS_COMPUTE: + pl_msg(log, level, "compute shader source:"); + pl_msg_source(log, level, params->glsl_shader); + return; + + case PL_PASS_INVALID: + case PL_PASS_TYPE_COUNT: + break; + } + + pl_unreachable(); +} + +static void log_spec_constants(pl_log log, enum pl_log_level lev, + const struct pl_pass_params *params, + const void *constant_data) +{ + if (!constant_data || !params->num_constants || !pl_msg_test(log, lev)) + return; + + pl_msg(log, lev, "Specialization constant values:"); + + uintptr_t data_base = (uintptr_t) constant_data; + for (int i = 0; i < 
params->num_constants; i++) { + union { + int i; + unsigned u; + float f; + } *data = (void *) (data_base + params->constants[i].offset); + int id = params->constants[i].id; + + switch (params->constants[i].type) { + case PL_VAR_SINT: pl_msg(log, lev, " constant_id=%d: %d", id, data->i); break; + case PL_VAR_UINT: pl_msg(log, lev, " constant_id=%d: %u", id, data->u); break; + case PL_VAR_FLOAT: pl_msg(log, lev, " constant_id=%d: %f", id, data->f); break; + default: pl_unreachable(); + } + } +} + +pl_pass pl_pass_create(pl_gpu gpu, const struct pl_pass_params *params) +{ + require(params->glsl_shader); + switch(params->type) { + case PL_PASS_RASTER: + require(params->vertex_shader); + require(params->vertex_stride % gpu->limits.align_vertex_stride == 0); + for (int i = 0; i < params->num_vertex_attribs; i++) { + struct pl_vertex_attrib va = params->vertex_attribs[i]; + require(va.name); + require(va.fmt); + require(va.fmt->caps & PL_FMT_CAP_VERTEX); + require(va.offset + va.fmt->texel_size <= params->vertex_stride); + } + + require(params->target_format); + require(params->target_format->caps & PL_FMT_CAP_RENDERABLE); + require(!params->blend_params || params->target_format->caps & PL_FMT_CAP_BLENDABLE); + require(!params->blend_params || params->load_target); + break; + case PL_PASS_COMPUTE: + require(gpu->glsl.compute); + break; + case PL_PASS_INVALID: + case PL_PASS_TYPE_COUNT: + pl_unreachable(); + } + + size_t num_var_comps = 0; + for (int i = 0; i < params->num_variables; i++) { + struct pl_var var = params->variables[i]; + num_var_comps += var.dim_v * var.dim_m * var.dim_a; + require(var.name); + require(pl_var_glsl_type_name(var)); + } + require(num_var_comps <= gpu->limits.max_variable_comps); + + require(params->num_constants <= gpu->limits.max_constants); + for (int i = 0; i < params->num_constants; i++) + require(params->constants[i].type); + + for (int i = 0; i < params->num_descriptors; i++) { + struct pl_desc desc = params->descriptors[i]; + require(desc.name); + + // enforce disjoint descriptor bindings for each namespace + int namespace = pl_desc_namespace(gpu, desc.type); + for (int j = i+1; j < params->num_descriptors; j++) { + struct pl_desc other = params->descriptors[j]; + require(desc.binding != other.binding || + namespace != pl_desc_namespace(gpu, other.type)); + } + } + + require(params->push_constants_size <= gpu->limits.max_pushc_size); + require(params->push_constants_size == PL_ALIGN2(params->push_constants_size, 4)); + + log_shader_sources(gpu->log, PL_LOG_DEBUG, params); + log_spec_constants(gpu->log, PL_LOG_DEBUG, params, params->constant_data); + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + pl_pass pass = impl->pass_create(gpu, params); + if (!pass) + goto error; + + return pass; + +error: + log_shader_sources(gpu->log, PL_LOG_ERR, params); + pl_log_stack_trace(gpu->log, PL_LOG_ERR); + pl_debug_abort(); + return NULL; +} + +void pl_pass_destroy(pl_gpu gpu, pl_pass *pass) +{ + if (!*pass) + return; + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + impl->pass_destroy(gpu, *pass); + *pass = NULL; +} + +void pl_pass_run(pl_gpu gpu, const struct pl_pass_run_params *params) +{ + pl_pass pass = params->pass; + struct pl_pass_run_params new = *params; + + for (int i = 0; i < pass->params.num_descriptors; i++) { + struct pl_desc desc = pass->params.descriptors[i]; + struct pl_desc_binding db = params->desc_bindings[i]; + require(db.object); + switch (desc.type) { + case PL_DESC_SAMPLED_TEX: { + pl_tex tex = db.object; + pl_fmt fmt = tex->params.format; + 
require(tex->params.sampleable); + require(db.sample_mode != PL_TEX_SAMPLE_LINEAR || (fmt->caps & PL_FMT_CAP_LINEAR)); + break; + } + case PL_DESC_STORAGE_IMG: { + pl_tex tex = db.object; + pl_fmt fmt = tex->params.format; + require(tex->params.storable); + require(desc.access != PL_DESC_ACCESS_READWRITE || (fmt->caps & PL_FMT_CAP_READWRITE)); + break; + } + case PL_DESC_BUF_UNIFORM: { + pl_buf buf = db.object; + require(buf->params.uniform); + break; + } + case PL_DESC_BUF_STORAGE: { + pl_buf buf = db.object; + require(buf->params.storable); + break; + } + case PL_DESC_BUF_TEXEL_UNIFORM: { + pl_buf buf = db.object; + require(buf->params.uniform && buf->params.format); + break; + } + case PL_DESC_BUF_TEXEL_STORAGE: { + pl_buf buf = db.object; + pl_fmt fmt = buf->params.format; + require(buf->params.storable && buf->params.format); + require(desc.access != PL_DESC_ACCESS_READWRITE || (fmt->caps & PL_FMT_CAP_READWRITE)); + break; + } + case PL_DESC_INVALID: + case PL_DESC_TYPE_COUNT: + pl_unreachable(); + } + } + + for (int i = 0; i < params->num_var_updates; i++) { + struct pl_var_update vu = params->var_updates[i]; + require(vu.index >= 0 && vu.index < pass->params.num_variables); + require(vu.data); + } + + require(params->push_constants || !pass->params.push_constants_size); + + switch (pass->params.type) { + case PL_PASS_RASTER: { + switch (pass->params.vertex_type) { + case PL_PRIM_TRIANGLE_LIST: + require(params->vertex_count % 3 == 0); + // fall through + case PL_PRIM_TRIANGLE_STRIP: + require(params->vertex_count >= 3); + break; + case PL_PRIM_TYPE_COUNT: + pl_unreachable(); + } + + require(!params->vertex_data ^ !params->vertex_buf); + if (params->vertex_buf) { + pl_buf vertex_buf = params->vertex_buf; + require(vertex_buf->params.drawable); + if (!params->index_data && !params->index_buf) { + // Cannot bounds check indexed draws + size_t vert_size = params->vertex_count * pass->params.vertex_stride; + require(params->buf_offset + vert_size <= vertex_buf->params.size); + } + } + + require(!params->index_data || !params->index_buf); + if (params->index_buf) { + pl_buf index_buf = params->index_buf; + require(!params->vertex_data); + require(index_buf->params.drawable); + size_t index_size = pl_index_buf_size(params); + require(params->index_offset + index_size <= index_buf->params.size); + } + + pl_tex target = params->target; + require(target); + require(pl_tex_params_dimension(target->params) == 2); + require(target->params.format->signature == pass->params.target_format->signature); + require(target->params.renderable); + pl_rect2d *vp = &new.viewport; + pl_rect2d *sc = &new.scissors; + + // Sanitize viewport/scissors + if (!vp->x0 && !vp->x1) + vp->x1 = target->params.w; + if (!vp->y0 && !vp->y1) + vp->y1 = target->params.h; + + if (!sc->x0 && !sc->x1) + sc->x1 = target->params.w; + if (!sc->y0 && !sc->y1) + sc->y1 = target->params.h; + + // Constrain the scissors to the target dimension (to sanitize the + // underlying graphics API calls) + sc->x0 = PL_CLAMP(sc->x0, 0, target->params.w); + sc->y0 = PL_CLAMP(sc->y0, 0, target->params.h); + sc->x1 = PL_CLAMP(sc->x1, 0, target->params.w); + sc->y1 = PL_CLAMP(sc->y1, 0, target->params.h); + + // Scissors wholly outside target -> silently drop pass (also needed + // to ensure we don't cause UB by specifying invalid scissors) + if (!pl_rect_w(*sc) || !pl_rect_h(*sc)) + return; + + require(pl_rect_w(*vp) > 0); + require(pl_rect_h(*vp) > 0); + require(pl_rect_w(*sc) > 0); + require(pl_rect_h(*sc) > 0); + + if 
(!pass->params.load_target) + pl_tex_invalidate(gpu, target); + break; + } + case PL_PASS_COMPUTE: + for (int i = 0; i < PL_ARRAY_SIZE(params->compute_groups); i++) { + require(params->compute_groups[i] >= 0); + require(params->compute_groups[i] <= gpu->limits.max_dispatch[i]); + } + break; + case PL_PASS_INVALID: + case PL_PASS_TYPE_COUNT: + pl_unreachable(); + } + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + impl->pass_run(gpu, &new); + +error: + return; +} + +void pl_gpu_flush(pl_gpu gpu) +{ + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + if (impl->gpu_flush) + impl->gpu_flush(gpu); +} + +void pl_gpu_finish(pl_gpu gpu) +{ + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + impl->gpu_finish(gpu); +} + +bool pl_gpu_is_failed(pl_gpu gpu) +{ + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + if (!impl->gpu_is_failed) + return false; + + return impl->gpu_is_failed(gpu); +} + +pl_sync pl_sync_create(pl_gpu gpu, enum pl_handle_type handle_type) +{ + require(handle_type); + require(handle_type & gpu->export_caps.sync); + require(PL_ISPOT(handle_type)); + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + return impl->sync_create(gpu, handle_type); + +error: + return NULL; +} + +void pl_sync_destroy(pl_gpu gpu, pl_sync *sync) +{ + if (!*sync) + return; + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + impl->sync_destroy(gpu, *sync); + *sync = NULL; +} + +bool pl_tex_export(pl_gpu gpu, pl_tex tex, pl_sync sync) +{ + require(tex->params.import_handle || tex->params.export_handle); + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + return impl->tex_export(gpu, tex, sync); + +error: + if (tex->params.debug_tag) + PL_ERR(gpu, " for texture: %s", tex->params.debug_tag); + return false; +} + +pl_timer pl_timer_create(pl_gpu gpu) +{ + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + if (!impl->timer_create) + return NULL; + + return impl->timer_create(gpu); +} + +void pl_timer_destroy(pl_gpu gpu, pl_timer *timer) +{ + if (!*timer) + return; + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + impl->timer_destroy(gpu, *timer); + *timer = NULL; +} + +uint64_t pl_timer_query(pl_gpu gpu, pl_timer timer) +{ + if (!timer) + return 0; + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + return impl->timer_query(gpu, timer); +} diff --git a/src/gpu.h b/src/gpu.h new file mode 100644 index 0000000..e915a50 --- /dev/null +++ b/src/gpu.h @@ -0,0 +1,207 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include "common.h" +#include "log.h" + +#include <libplacebo/gpu.h> +#include <libplacebo/dispatch.h> + +// To avoid having to include drm_fourcc.h +#ifndef DRM_FORMAT_MOD_LINEAR +#define DRM_FORMAT_MOD_LINEAR UINT64_C(0x0) +#define DRM_FORMAT_MOD_INVALID ((UINT64_C(1) << 56) - 1) +#endif + +// This struct must be the first member of the gpu's priv struct. The `pl_gpu` +// helpers will cast the priv struct to this struct! 
+ +#define GPU_PFN(name) __typeof__(pl_##name) *name +struct pl_gpu_fns { + // This is a pl_dispatch used (on the pl_gpu itself!) for the purposes of + // dispatching compute shaders for performing various emulation tasks (e.g. + // partial clears, blits or emulated texture transfers, see below). + // + // Warning: Care must be taken to avoid recursive calls. + pl_dispatch dp; + + // Internal cache, or NULL. Set by the user (via pl_gpu_set_cache). + _Atomic(pl_cache) cache; + + // Destructors: These also free the corresponding objects, but they + // must not be called on NULL. (The NULL checks are done by the pl_*_destroy + // wrappers) + void (*destroy)(pl_gpu gpu); + void (*tex_destroy)(pl_gpu, pl_tex); + void (*buf_destroy)(pl_gpu, pl_buf); + void (*pass_destroy)(pl_gpu, pl_pass); + void (*sync_destroy)(pl_gpu, pl_sync); + void (*timer_destroy)(pl_gpu, pl_timer); + + GPU_PFN(tex_create); + GPU_PFN(tex_invalidate); // optional + GPU_PFN(tex_clear_ex); // optional if no blittable formats + GPU_PFN(tex_blit); // optional if no blittable formats + GPU_PFN(tex_upload); + GPU_PFN(tex_download); + GPU_PFN(tex_poll); // optional: if NULL, textures are always free to use + GPU_PFN(buf_create); + GPU_PFN(buf_write); + GPU_PFN(buf_read); + GPU_PFN(buf_copy); + GPU_PFN(buf_export); // optional if !gpu->export_caps.buf + GPU_PFN(buf_poll); // optional: if NULL, buffers are always free to use + GPU_PFN(desc_namespace); + GPU_PFN(pass_create); + GPU_PFN(pass_run); + GPU_PFN(sync_create); // optional if !gpu->export_caps.sync + GPU_PFN(tex_export); // optional if !gpu->export_caps.sync + GPU_PFN(timer_create); // optional + GPU_PFN(timer_query); // optional + GPU_PFN(gpu_flush); // optional + GPU_PFN(gpu_finish); + GPU_PFN(gpu_is_failed); // optional +}; +#undef GPU_PFN + +// All resources such as textures and buffers allocated from the GPU must be +// destroyed before calling pl_destroy. +void pl_gpu_destroy(pl_gpu gpu); + +// Returns true if the device supports interop. This is considered to be +// the case if at least one of `gpu->export/import_caps` is nonzero. +static inline bool pl_gpu_supports_interop(pl_gpu gpu) +{ + return gpu->export_caps.tex || + gpu->import_caps.tex || + gpu->export_caps.buf || + gpu->import_caps.buf || + gpu->export_caps.sync || + gpu->import_caps.sync; +} + +// Returns the GPU-internal `pl_dispatch` and `pl_cache` objects. +pl_dispatch pl_gpu_dispatch(pl_gpu gpu); +pl_cache pl_gpu_cache(pl_gpu gpu); + +// GPU-internal helpers: these should not be used outside of GPU implementations + +// This performs several tasks. It sorts the format list, logs GPU metadata, +// performs verification and fixes up backwards compatibility fields. This +// should be returned as the last step when creating a `pl_gpu`. +pl_gpu pl_gpu_finalize(struct pl_gpu_t *gpu); + +// Look up the right GLSL image format qualifier from a partially filled-in +// pl_fmt, or NULL if the format does not have a legal matching GLSL name. +// +// `components` may differ from fmt->num_components (for emulated formats) +const char *pl_fmt_glsl_format(pl_fmt fmt, int components); + +// Look up the right fourcc from a partially filled-in pl_fmt, or 0 if the +// format does not have a legal matching fourcc format. +uint32_t pl_fmt_fourcc(pl_fmt fmt); + +// Compute the total size (in bytes) of a texture transfer operation +size_t pl_tex_transfer_size(const struct pl_tex_transfer_params *par); + +// Split a tex transfer into slices. For emulated formats, `texel_fmt` gives +// the format of the underlying texel buffer. 
+// +// Returns the number of slices, or 0 on error (e.g. no SSBOs available). +// `out_slices` must be freed by caller (on success). +int pl_tex_transfer_slices(pl_gpu gpu, pl_fmt texel_fmt, + const struct pl_tex_transfer_params *params, + struct pl_tex_transfer_params **out_slices); + +// Helper that wraps pl_tex_upload/download using texture upload buffers to +// ensure that params->buf is always set. +bool pl_tex_upload_pbo(pl_gpu gpu, const struct pl_tex_transfer_params *params); +bool pl_tex_download_pbo(pl_gpu gpu, const struct pl_tex_transfer_params *params); + +// This requires that params.buf has been set and is of type PL_BUF_TEXEL_* +bool pl_tex_upload_texel(pl_gpu gpu, const struct pl_tex_transfer_params *params); +bool pl_tex_download_texel(pl_gpu gpu, const struct pl_tex_transfer_params *params); + +// Both `src` and `dst must be storable. `src` must also be sampleable, if the +// blit requires linear sampling. Returns false if these conditions are unmet. +bool pl_tex_blit_compute(pl_gpu gpu, const struct pl_tex_blit_params *params); + +// Helper to do a 2D blit with stretch and scale using a raster pass +void pl_tex_blit_raster(pl_gpu gpu, const struct pl_tex_blit_params *params); + +// Helper for GPU-accelerated endian swapping +// +// Note: `src` and `dst` can be the same buffer, for an in-place operation. In +// this case, `src_offset` and `dst_offset` must be the same. +struct pl_buf_copy_swap_params { + // Source of the copy operation. Must be `storable`. + pl_buf src; + size_t src_offset; + + // Destination of the copy operation. Must be `storable`. + pl_buf dst; + size_t dst_offset; + + // Number of bytes to copy. Must be a multiple of 4. + size_t size; + + // Underlying word size. Must be 2 (for 16-bit swap) or 4 (for 32-bit swap) + int wordsize; +}; + +bool pl_buf_copy_swap(pl_gpu gpu, const struct pl_buf_copy_swap_params *params); + +void pl_pass_run_vbo(pl_gpu gpu, const struct pl_pass_run_params *params); + +// Make a deep-copy of the pass params. Note: cached_program etc. are not +// copied, but cleared explicitly. +struct pl_pass_params pl_pass_params_copy(void *alloc, const struct pl_pass_params *params); + +// Helper to compute the size of an index buffer +static inline size_t pl_index_buf_size(const struct pl_pass_run_params *params) +{ + switch (params->index_fmt) { + case PL_INDEX_UINT16: return params->vertex_count * sizeof(uint16_t); + case PL_INDEX_UINT32: return params->vertex_count * sizeof(uint32_t); + case PL_INDEX_FORMAT_COUNT: break; + } + + pl_unreachable(); +} + +// Helper to compute the size of a vertex buffer required to fit all indices +size_t pl_vertex_buf_size(const struct pl_pass_run_params *params); + +// Utility function for pretty-printing UUIDs +#define UUID_SIZE 16 +#define PRINT_UUID(uuid) (print_uuid((char[3 * UUID_SIZE]){0}, (uuid))) +const char *print_uuid(char buf[3 * UUID_SIZE], const uint8_t uuid[UUID_SIZE]); + +// Helper to pretty-print fourcc codes +#define PRINT_FOURCC(fcc) \ + (!(fcc) ? "" : (char[5]) { \ + (fcc) & 0xFF, \ + ((fcc) >> 8) & 0xFF, \ + ((fcc) >> 16) & 0xFF, \ + ((fcc) >> 24) & 0xFF \ + }) + +#define DRM_MOD_SIZE 26 +#define PRINT_DRM_MOD(mod) (print_drm_mod((char[DRM_MOD_SIZE]){0}, (mod))) +const char *print_drm_mod(char buf[DRM_MOD_SIZE], uint64_t mod); diff --git a/src/gpu/utils.c b/src/gpu/utils.c new file mode 100644 index 0000000..40ca84d --- /dev/null +++ b/src/gpu/utils.c @@ -0,0 +1,1288 @@ +/* + * This file is part of libplacebo. 
+ * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <math.h> + +#include "common.h" +#include "shaders.h" +#include "gpu.h" + +// GPU-internal helpers + +static int cmp_fmt(const void *pa, const void *pb) +{ + pl_fmt a = *(pl_fmt *)pa; + pl_fmt b = *(pl_fmt *)pb; + + // Always prefer non-opaque formats + if (a->opaque != b->opaque) + return PL_CMP(a->opaque, b->opaque); + + // Always prefer non-emulated formats + if (a->emulated != b->emulated) + return PL_CMP(a->emulated, b->emulated); + + int ca = __builtin_popcount(a->caps), + cb = __builtin_popcount(b->caps); + if (ca != cb) + return -PL_CMP(ca, cb); // invert to sort higher values first + + // If the population count is the same but the caps are different, prefer + // the caps with a "lower" value (which tend to be more fundamental caps) + if (a->caps != b->caps) + return PL_CMP(a->caps, b->caps); + + // If the capabilities are equal, sort based on the component attributes + for (int i = 0; i < PL_ARRAY_SIZE(a->component_depth); i++) { + int da = a->component_depth[i], + db = b->component_depth[i]; + if (da != db) + return PL_CMP(da, db); + + int ha = a->host_bits[i], + hb = b->host_bits[i]; + if (ha != hb) + return PL_CMP(ha, hb); + + int oa = a->sample_order[i], + ob = b->sample_order[i]; + if (oa != ob) + return PL_CMP(oa, ob); + } + + // Fall back to sorting by the name (for stability) + return strcmp(a->name, b->name); +} + +#define FMT_BOOL(letter, cap) ((cap) ? 
(letter) : '-') +#define FMT_IDX4(f) (f)[0], (f)[1], (f)[2], (f)[3] + +static void print_formats(pl_gpu gpu) +{ + if (!pl_msg_test(gpu->log, PL_LOG_DEBUG)) + return; + +#define CAP_HEADER "%-12s" +#define CAP_FIELDS "%c%c%c%c%c%c%c%c%c%c%c%c" +#define CAP_VALUES \ + FMT_BOOL('S', fmt->caps & PL_FMT_CAP_SAMPLEABLE), \ + FMT_BOOL('s', fmt->caps & PL_FMT_CAP_STORABLE), \ + FMT_BOOL('L', fmt->caps & PL_FMT_CAP_LINEAR), \ + FMT_BOOL('R', fmt->caps & PL_FMT_CAP_RENDERABLE), \ + FMT_BOOL('b', fmt->caps & PL_FMT_CAP_BLENDABLE), \ + FMT_BOOL('B', fmt->caps & PL_FMT_CAP_BLITTABLE), \ + FMT_BOOL('V', fmt->caps & PL_FMT_CAP_VERTEX), \ + FMT_BOOL('u', fmt->caps & PL_FMT_CAP_TEXEL_UNIFORM), \ + FMT_BOOL('t', fmt->caps & PL_FMT_CAP_TEXEL_STORAGE), \ + FMT_BOOL('H', fmt->caps & PL_FMT_CAP_HOST_READABLE), \ + FMT_BOOL('W', fmt->caps & PL_FMT_CAP_READWRITE), \ + FMT_BOOL('G', fmt->gatherable) + + PL_DEBUG(gpu, "GPU texture formats:"); + PL_DEBUG(gpu, " %-20s %-6s %-4s %-4s " CAP_HEADER " %-3s %-13s %-13s %-10s %-10s %-6s", + "NAME", "TYPE", "SIZE", "COMP", "CAPS", "EMU", "DEPTH", "HOST_BITS", + "GLSL_TYPE", "GLSL_FMT", "FOURCC"); + for (int n = 0; n < gpu->num_formats; n++) { + pl_fmt fmt = gpu->formats[n]; + + static const char *types[] = { + [PL_FMT_UNKNOWN] = "UNKNOWN", + [PL_FMT_UNORM] = "UNORM", + [PL_FMT_SNORM] = "SNORM", + [PL_FMT_UINT] = "UINT", + [PL_FMT_SINT] = "SINT", + [PL_FMT_FLOAT] = "FLOAT", + }; + + static const char idx_map[4] = {'R', 'G', 'B', 'A'}; + char indices[4] = {' ', ' ', ' ', ' '}; + if (!fmt->opaque) { + for (int i = 0; i < fmt->num_components; i++) + indices[i] = idx_map[fmt->sample_order[i]]; + } + + + PL_DEBUG(gpu, " %-20s %-6s %-4zu %c%c%c%c " CAP_FIELDS " %-3s " + "{%-2d %-2d %-2d %-2d} {%-2d %-2d %-2d %-2d} %-10s %-10s %-6s", + fmt->name, types[fmt->type], fmt->texel_size, + FMT_IDX4(indices), CAP_VALUES, fmt->emulated ? "y" : "n", + FMT_IDX4(fmt->component_depth), FMT_IDX4(fmt->host_bits), + PL_DEF(fmt->glsl_type, ""), PL_DEF(fmt->glsl_format, ""), + PRINT_FOURCC(fmt->fourcc)); + +#undef CAP_HEADER +#undef CAP_FIELDS +#undef CAP_VALUES + + for (int i = 0; i < fmt->num_modifiers; i++) { + PL_TRACE(gpu, " modifiers[%d]: %s", + i, PRINT_DRM_MOD(fmt->modifiers[i])); + } + } +} + +pl_gpu pl_gpu_finalize(struct pl_gpu_t *gpu) +{ + // Sort formats + qsort(gpu->formats, gpu->num_formats, sizeof(pl_fmt), cmp_fmt); + + // Verification + pl_assert(gpu->limits.max_tex_2d_dim); + pl_assert(gpu->limits.max_variable_comps || gpu->limits.max_ubo_size); + pl_assert(gpu->limits.max_ubo_size <= gpu->limits.max_buf_size); + pl_assert(gpu->limits.max_ssbo_size <= gpu->limits.max_buf_size); + pl_assert(gpu->limits.max_vbo_size <= gpu->limits.max_buf_size); + pl_assert(gpu->limits.max_mapped_size <= gpu->limits.max_buf_size); + + for (int n = 0; n < gpu->num_formats; n++) { + pl_fmt fmt = gpu->formats[n]; + pl_assert(fmt->name); + pl_assert(fmt->type); + pl_assert(fmt->num_components); + pl_assert(fmt->internal_size); + pl_assert(fmt->opaque ? !fmt->texel_size : fmt->texel_size); + pl_assert(!fmt->gatherable || (fmt->caps & PL_FMT_CAP_SAMPLEABLE)); + for (int i = 0; i < fmt->num_components; i++) { + pl_assert(fmt->component_depth[i]); + pl_assert(fmt->opaque ? 
!fmt->host_bits[i] : fmt->host_bits[i]); + } + for (int i = 0; i < fmt->num_planes; i++) + pl_assert(fmt->planes[i].format); + + enum pl_fmt_caps texel_caps = PL_FMT_CAP_VERTEX | + PL_FMT_CAP_TEXEL_UNIFORM | + PL_FMT_CAP_TEXEL_STORAGE; + + if (fmt->caps & texel_caps) { + pl_assert(fmt->glsl_type); + pl_assert(!fmt->opaque); + } + if (!fmt->opaque) { + pl_assert(fmt->texel_size && fmt->texel_align); + pl_assert((fmt->texel_size % fmt->texel_align) == 0); + pl_assert(fmt->internal_size == fmt->texel_size || fmt->emulated); + } else { + pl_assert(!fmt->texel_size && !fmt->texel_align); + pl_assert(!(fmt->caps & PL_FMT_CAP_HOST_READABLE)); + } + + // Assert uniqueness of name + for (int o = n + 1; o < gpu->num_formats; o++) + pl_assert(strcmp(fmt->name, gpu->formats[o]->name) != 0); + } + + // Print info + PL_INFO(gpu, "GPU information:"); + +#define LOG(fmt, field) \ + PL_INFO(gpu, " %-26s %" fmt, #field ":", gpu->LOG_STRUCT.field) + +#define LOG_STRUCT glsl + PL_INFO(gpu, " GLSL version: %d%s", gpu->glsl.version, + gpu->glsl.vulkan ? " (vulkan)" : gpu->glsl.gles ? " es" : ""); + if (gpu->glsl.compute) { + LOG("zu", max_shmem_size); + LOG(PRIu32, max_group_threads); + LOG(PRIu32, max_group_size[0]); + LOG(PRIu32, max_group_size[1]); + LOG(PRIu32, max_group_size[2]); + } + LOG(PRIu32, subgroup_size); + LOG(PRIi16, min_gather_offset); + LOG(PRIi16, max_gather_offset); +#undef LOG_STRUCT + +#define LOG_STRUCT limits + PL_INFO(gpu, " Limits:"); + // pl_gpu + LOG("d", thread_safe); + LOG("d", callbacks); + // pl_buf + LOG("zu", max_buf_size); + LOG("zu", max_ubo_size); + LOG("zu", max_ssbo_size); + LOG("zu", max_vbo_size); + LOG("zu", max_mapped_size); + LOG(PRIu64, max_buffer_texels); + LOG("zu", align_host_ptr); + LOG("d", host_cached); + // pl_tex + LOG(PRIu32, max_tex_1d_dim); + LOG(PRIu32, max_tex_2d_dim); + LOG(PRIu32, max_tex_3d_dim); + LOG("d", blittable_1d_3d); + LOG("d", buf_transfer); + LOG("zu", align_tex_xfer_pitch); + LOG("zu", align_tex_xfer_offset); + // pl_pass + LOG("zu", max_variable_comps); + LOG("zu", max_constants); + LOG("zu", max_pushc_size); + LOG("zu", align_vertex_stride); + if (gpu->glsl.compute) { + LOG(PRIu32, max_dispatch[0]); + LOG(PRIu32, max_dispatch[1]); + LOG(PRIu32, max_dispatch[2]); + } + LOG(PRIu32, fragment_queues); + LOG(PRIu32, compute_queues); +#undef LOG_STRUCT +#undef LOG + + if (pl_gpu_supports_interop(gpu)) { + PL_INFO(gpu, " External API interop:"); + + PL_INFO(gpu, " UUID: %s", PRINT_UUID(gpu->uuid)); + PL_INFO(gpu, " PCI: %04x:%02x:%02x:%x", + gpu->pci.domain, gpu->pci.bus, gpu->pci.device, gpu->pci.function); + PL_INFO(gpu, " buf export caps: 0x%x", + (unsigned int) gpu->export_caps.buf); + PL_INFO(gpu, " buf import caps: 0x%x", + (unsigned int) gpu->import_caps.buf); + PL_INFO(gpu, " tex export caps: 0x%x", + (unsigned int) gpu->export_caps.tex); + PL_INFO(gpu, " tex import caps: 0x%x", + (unsigned int) gpu->import_caps.tex); + PL_INFO(gpu, " sync export caps: 0x%x", + (unsigned int) gpu->export_caps.sync); + PL_INFO(gpu, " sync import caps: 0x%x", + (unsigned int) gpu->import_caps.sync); + } + + print_formats(gpu); + + // Finally, create a `pl_dispatch` object for internal operations + struct pl_gpu_fns *impl = PL_PRIV(gpu); + atomic_init(&impl->cache, NULL); + impl->dp = pl_dispatch_create(gpu->log, gpu); + return gpu; +} + +struct glsl_fmt { + enum pl_fmt_type type; + int num_components; + int depth[4]; + const char *glsl_format; +}; + +// List taken from the GLSL specification. 
(Yes, GLSL supports only exactly +// these formats with exactly these names) +static const struct glsl_fmt pl_glsl_fmts[] = { + {PL_FMT_FLOAT, 1, {16}, "r16f"}, + {PL_FMT_FLOAT, 1, {32}, "r32f"}, + {PL_FMT_FLOAT, 2, {16, 16}, "rg16f"}, + {PL_FMT_FLOAT, 2, {32, 32}, "rg32f"}, + {PL_FMT_FLOAT, 4, {16, 16, 16, 16}, "rgba16f"}, + {PL_FMT_FLOAT, 4, {32, 32, 32, 32}, "rgba32f"}, + {PL_FMT_FLOAT, 3, {11, 11, 10}, "r11f_g11f_b10f"}, + + {PL_FMT_UNORM, 1, {8}, "r8"}, + {PL_FMT_UNORM, 1, {16}, "r16"}, + {PL_FMT_UNORM, 2, {8, 8}, "rg8"}, + {PL_FMT_UNORM, 2, {16, 16}, "rg16"}, + {PL_FMT_UNORM, 4, {8, 8, 8, 8}, "rgba8"}, + {PL_FMT_UNORM, 4, {16, 16, 16, 16}, "rgba16"}, + {PL_FMT_UNORM, 4, {10, 10, 10, 2}, "rgb10_a2"}, + + {PL_FMT_SNORM, 1, {8}, "r8_snorm"}, + {PL_FMT_SNORM, 1, {16}, "r16_snorm"}, + {PL_FMT_SNORM, 2, {8, 8}, "rg8_snorm"}, + {PL_FMT_SNORM, 2, {16, 16}, "rg16_snorm"}, + {PL_FMT_SNORM, 4, {8, 8, 8, 8}, "rgba8_snorm"}, + {PL_FMT_SNORM, 4, {16, 16, 16, 16}, "rgba16_snorm"}, + + {PL_FMT_UINT, 1, {8}, "r8ui"}, + {PL_FMT_UINT, 1, {16}, "r16ui"}, + {PL_FMT_UINT, 1, {32}, "r32ui"}, + {PL_FMT_UINT, 2, {8, 8}, "rg8ui"}, + {PL_FMT_UINT, 2, {16, 16}, "rg16ui"}, + {PL_FMT_UINT, 2, {32, 32}, "rg32ui"}, + {PL_FMT_UINT, 4, {8, 8, 8, 8}, "rgba8ui"}, + {PL_FMT_UINT, 4, {16, 16, 16, 16}, "rgba16ui"}, + {PL_FMT_UINT, 4, {32, 32, 32, 32}, "rgba32ui"}, + {PL_FMT_UINT, 4, {10, 10, 10, 2}, "rgb10_a2ui"}, + + {PL_FMT_SINT, 1, {8}, "r8i"}, + {PL_FMT_SINT, 1, {16}, "r16i"}, + {PL_FMT_SINT, 1, {32}, "r32i"}, + {PL_FMT_SINT, 2, {8, 8}, "rg8i"}, + {PL_FMT_SINT, 2, {16, 16}, "rg16i"}, + {PL_FMT_SINT, 2, {32, 32}, "rg32i"}, + {PL_FMT_SINT, 4, {8, 8, 8, 8}, "rgba8i"}, + {PL_FMT_SINT, 4, {16, 16, 16, 16}, "rgba16i"}, + {PL_FMT_SINT, 4, {32, 32, 32, 32}, "rgba32i"}, +}; + +const char *pl_fmt_glsl_format(pl_fmt fmt, int components) +{ + if (fmt->opaque) + return NULL; + + for (int n = 0; n < PL_ARRAY_SIZE(pl_glsl_fmts); n++) { + const struct glsl_fmt *gfmt = &pl_glsl_fmts[n]; + + if (fmt->type != gfmt->type) + continue; + if (components != gfmt->num_components) + continue; + + // The component order is irrelevant, so we need to sort the depth + // based on the component's index + int depth[4] = {0}; + for (int i = 0; i < fmt->num_components; i++) + depth[fmt->sample_order[i]] = fmt->component_depth[i]; + + // Copy over any emulated components + for (int i = fmt->num_components; i < components; i++) + depth[i] = gfmt->depth[i]; + + for (int i = 0; i < PL_ARRAY_SIZE(depth); i++) { + if (depth[i] != gfmt->depth[i]) + goto next_fmt; + } + + return gfmt->glsl_format; + +next_fmt: ; // equivalent to `continue` + } + + return NULL; +} + +#define FOURCC(a,b,c,d) ((uint32_t)(a) | ((uint32_t)(b) << 8) | \ + ((uint32_t)(c) << 16) | ((uint32_t)(d) << 24)) + +struct pl_fmt_fourcc { + const char *name; + uint32_t fourcc; +}; + +static const struct pl_fmt_fourcc pl_fmt_fourccs[] = { + // 8 bpp red + {"r8", FOURCC('R','8',' ',' ')}, + // 16 bpp red + {"r16", FOURCC('R','1','6',' ')}, + // 16 bpp rg + {"rg8", FOURCC('G','R','8','8')}, + {"gr8", FOURCC('R','G','8','8')}, + // 32 bpp rg + {"rg16", FOURCC('G','R','3','2')}, + {"gr16", FOURCC('R','G','3','2')}, + // 8 bpp rgb: N/A + // 16 bpp rgb + {"argb4", FOURCC('B','A','1','2')}, + {"abgr4", FOURCC('R','A','1','2')}, + {"rgba4", FOURCC('A','B','1','2')}, + {"bgra4", FOURCC('A','R','1','2')}, + + {"a1rgb5", FOURCC('B','A','1','5')}, + {"a1bgr5", FOURCC('R','A','1','5')}, + {"rgb5a1", FOURCC('A','B','1','5')}, + {"bgr5a1", FOURCC('A','R','1','5')}, + + {"rgb565", FOURCC('B','G','1','6')}, + 
{"bgr565", FOURCC('R','G','1','6')}, + // 24 bpp rgb + {"rgb8", FOURCC('B','G','2','4')}, + {"bgr8", FOURCC('R','G','2','4')}, + // 32 bpp rgb + {"argb8", FOURCC('B','A','2','4')}, + {"abgr8", FOURCC('R','A','2','4')}, + {"rgba8", FOURCC('A','B','2','4')}, + {"bgra8", FOURCC('A','R','2','4')}, + + {"a2rgb10", FOURCC('B','A','3','0')}, + {"a2bgr10", FOURCC('R','A','3','0')}, + {"rgb10a2", FOURCC('A','B','3','0')}, + {"bgr10a2", FOURCC('A','R','3','0')}, + // 64bpp rgb + {"rgba16hf", FOURCC('A','B','4','H')}, + {"bgra16hf", FOURCC('A','R','4','H')}, + + // packed 16-bit formats + // rx10: N/A + // rxgx10: N/A + {"rxgxbxax10", FOURCC('A','B','1','0')}, + // rx12: N/A + // rxgx12: N/A + // rxgxbxax12: N/A + + // planar formats + {"g8_b8_r8_420", FOURCC('Y','U','1','2')}, + {"g8_b8_r8_422", FOURCC('Y','U','1','6')}, + {"g8_b8_r8_444", FOURCC('Y','U','2','4')}, + // g16_b18_r8_*: N/A + // gx10_bx10_rx10_42*: N/A + {"gx10_bx10_rx10_444", FOURCC('Q','4','1','0')}, + // gx12_bx12_rx12_*:N/A + {"g8_br8_420", FOURCC('N','V','1','2')}, + {"g8_br8_422", FOURCC('N','V','1','6')}, + {"g8_br8_444", FOURCC('N','V','2','4')}, + {"g16_br16_420", FOURCC('P','0','1','6')}, + // g16_br16_422: N/A + // g16_br16_444: N/A + {"gx10_bxrx10_420", FOURCC('P','0','1','0')}, + {"gx10_bxrx10_422", FOURCC('P','2','1','0')}, + // gx10_bxrx10_444: N/A + {"gx12_bxrx12_420", FOURCC('P','0','1','2')}, + // gx12_bxrx12_422: N/A + // gx12_bxrx12_444: N/A +}; + +uint32_t pl_fmt_fourcc(pl_fmt fmt) +{ + for (int n = 0; n < PL_ARRAY_SIZE(pl_fmt_fourccs); n++) { + const struct pl_fmt_fourcc *fourcc = &pl_fmt_fourccs[n]; + if (strcmp(fmt->name, fourcc->name) == 0) + return fourcc->fourcc; + } + + return 0; // no matching format +} + +size_t pl_tex_transfer_size(const struct pl_tex_transfer_params *par) +{ + int w = pl_rect_w(par->rc), h = pl_rect_h(par->rc), d = pl_rect_d(par->rc); + size_t pixel_pitch = par->tex->params.format->texel_size; + + // This generates the absolute bare minimum size of a buffer required to + // hold the data of a texture upload/download, by including stride padding + // only where strictly necessary. + return (d - 1) * par->depth_pitch + (h - 1) * par->row_pitch + w * pixel_pitch; +} + +int pl_tex_transfer_slices(pl_gpu gpu, pl_fmt texel_fmt, + const struct pl_tex_transfer_params *params, + struct pl_tex_transfer_params **out_slices) +{ + PL_ARRAY(struct pl_tex_transfer_params) slices = {0}; + size_t max_size = params->buf ? 
gpu->limits.max_buf_size : SIZE_MAX; + + pl_fmt fmt = params->tex->params.format; + if (fmt->emulated && texel_fmt) { + size_t max_texel = gpu->limits.max_buffer_texels * texel_fmt->texel_size; + max_size = PL_MIN(gpu->limits.max_ssbo_size, max_texel); + } + + int slice_w = pl_rect_w(params->rc); + int slice_h = pl_rect_h(params->rc); + int slice_d = pl_rect_d(params->rc); + + slice_d = PL_MIN(slice_d, max_size / params->depth_pitch); + if (!slice_d) { + slice_d = 1; + slice_h = PL_MIN(slice_h, max_size / params->row_pitch); + if (!slice_h) { + slice_h = 1; + slice_w = PL_MIN(slice_w, max_size / fmt->texel_size); + pl_assert(slice_w); + } + } + + for (int z = 0; z < pl_rect_d(params->rc); z += slice_d) { + for (int y = 0; y < pl_rect_h(params->rc); y += slice_h) { + for (int x = 0; x < pl_rect_w(params->rc); x += slice_w) { + struct pl_tex_transfer_params slice = *params; + slice.callback = NULL; + slice.rc.x0 = params->rc.x0 + x; + slice.rc.y0 = params->rc.y0 + y; + slice.rc.z0 = params->rc.z0 + z; + slice.rc.x1 = PL_MIN(slice.rc.x0 + slice_w, params->rc.x1); + slice.rc.y1 = PL_MIN(slice.rc.y0 + slice_h, params->rc.y1); + slice.rc.z1 = PL_MIN(slice.rc.z0 + slice_d, params->rc.z1); + + const size_t offset = z * params->depth_pitch + + y * params->row_pitch + + x * fmt->texel_size; + if (slice.ptr) { + slice.ptr = (uint8_t *) slice.ptr + offset; + } else { + slice.buf_offset += offset; + } + + PL_ARRAY_APPEND(NULL, slices, slice); + } + } + } + + *out_slices = slices.elem; + return slices.num; +} + +bool pl_tex_upload_pbo(pl_gpu gpu, const struct pl_tex_transfer_params *params) +{ + if (params->buf) + return pl_tex_upload(gpu, params); + + struct pl_buf_params bufparams = { + .size = pl_tex_transfer_size(params), + .debug_tag = PL_DEBUG_TAG, + }; + + struct pl_tex_transfer_params fixed = *params; + fixed.ptr = NULL; + + // If we can import host pointers directly, and the function is being used + // asynchronously, then we can use host pointer import to skip a memcpy. In + // the synchronous case, we still force a host memcpy to avoid stalling the + // host until the GPU memcpy completes. + bool can_import = gpu->import_caps.buf & PL_HANDLE_HOST_PTR; + can_import &= !params->no_import; + can_import &= params->callback != NULL; + can_import &= bufparams.size > (32 << 10); // 32 KiB + if (can_import) { + bufparams.import_handle = PL_HANDLE_HOST_PTR; + bufparams.shared_mem = (struct pl_shared_mem) { + .handle.ptr = params->ptr, + .size = bufparams.size, + .offset = 0, + }; + + // Suppress errors for this test because it may fail, in which case we + // want to silently fall back. 
+ pl_log_level_cap(gpu->log, PL_LOG_DEBUG); + fixed.buf = pl_buf_create(gpu, &bufparams); + pl_log_level_cap(gpu->log, PL_LOG_NONE); + } + + if (!fixed.buf) { + bufparams.import_handle = 0; + bufparams.host_writable = true; + fixed.buf = pl_buf_create(gpu, &bufparams); + if (!fixed.buf) + return false; + pl_buf_write(gpu, fixed.buf, 0, params->ptr, bufparams.size); + if (params->callback) + params->callback(params->priv); + fixed.callback = NULL; + } + + bool ok = pl_tex_upload(gpu, &fixed); + pl_buf_destroy(gpu, &fixed.buf); + return ok; +} + +struct pbo_cb_ctx { + pl_gpu gpu; + pl_buf buf; + void *ptr; + void (*callback)(void *priv); + void *priv; +}; + +static void pbo_download_cb(void *priv) +{ + struct pbo_cb_ctx *p = priv; + pl_buf_read(p->gpu, p->buf, 0, p->ptr, p->buf->params.size); + pl_buf_destroy(p->gpu, &p->buf); + + // Run the original callback + p->callback(p->priv); + pl_free(priv); +}; + +bool pl_tex_download_pbo(pl_gpu gpu, const struct pl_tex_transfer_params *params) +{ + if (params->buf) + return pl_tex_download(gpu, params); + + pl_buf buf = NULL; + struct pl_buf_params bufparams = { + .size = pl_tex_transfer_size(params), + .debug_tag = PL_DEBUG_TAG, + }; + + // If we can import host pointers directly, we can avoid an extra memcpy + // (sometimes). In the cases where it isn't avoidable, the extra memcpy + // will happen inside VRAM, which is typically faster anyway. + bool can_import = gpu->import_caps.buf & PL_HANDLE_HOST_PTR; + can_import &= !params->no_import; + can_import &= bufparams.size > (32 << 10); // 32 KiB + if (can_import) { + bufparams.import_handle = PL_HANDLE_HOST_PTR; + bufparams.shared_mem = (struct pl_shared_mem) { + .handle.ptr = params->ptr, + .size = bufparams.size, + .offset = 0, + }; + + // Suppress errors for this test because it may fail, in which case we + // want to silently fall back. + pl_log_level_cap(gpu->log, PL_LOG_DEBUG); + buf = pl_buf_create(gpu, &bufparams); + pl_log_level_cap(gpu->log, PL_LOG_NONE); + } + + if (!buf) { + // Fallback when host pointer import is not supported + bufparams.import_handle = 0; + bufparams.host_readable = true; + buf = pl_buf_create(gpu, &bufparams); + } + + if (!buf) + return false; + + struct pl_tex_transfer_params newparams = *params; + newparams.ptr = NULL; + newparams.buf = buf; + + // If the transfer is asynchronous, propagate our host read asynchronously + if (params->callback && !bufparams.import_handle) { + newparams.callback = pbo_download_cb; + newparams.priv = pl_alloc_struct(NULL, struct pbo_cb_ctx, { + .gpu = gpu, + .buf = buf, + .ptr = params->ptr, + .callback = params->callback, + .priv = params->priv, + }); + } + + if (!pl_tex_download(gpu, &newparams)) { + pl_buf_destroy(gpu, &buf); + return false; + } + + if (!params->callback) { + while (pl_buf_poll(gpu, buf, 10000000)) // 10 ms + PL_TRACE(gpu, "pl_tex_download: synchronous/blocking (slow path)"); + } + + bool ok; + if (bufparams.import_handle) { + // Buffer download completion already means the host pointer contains + // the valid data, no more need to copy. 
(Note: this applies even for + // asynchronous downloads) + ok = true; + pl_buf_destroy(gpu, &buf); + } else if (!params->callback) { + // Synchronous read back to the host pointer + ok = pl_buf_read(gpu, buf, 0, params->ptr, bufparams.size); + pl_buf_destroy(gpu, &buf); + } else { + // Nothing left to do here, the rest will be done by pbo_download_cb + ok = true; + } + + return ok; +} + +bool pl_tex_upload_texel(pl_gpu gpu, const struct pl_tex_transfer_params *params) +{ + const int threads = PL_MIN(256, pl_rect_w(params->rc)); + pl_tex tex = params->tex; + pl_fmt fmt = tex->params.format; + pl_require(gpu, params->buf); + + pl_dispatch dp = pl_gpu_dispatch(gpu); + pl_shader sh = pl_dispatch_begin(dp); + if (!sh_try_compute(sh, threads, 1, false, 0)) { + PL_ERR(gpu, "Failed emulating texture transfer!"); + pl_dispatch_abort(dp, &sh); + return false; + } + + ident_t buf = sh_desc(sh, (struct pl_shader_desc) { + .binding.object = params->buf, + .desc = { + .name = "data", + .type = PL_DESC_BUF_TEXEL_STORAGE, + }, + }); + + ident_t img = sh_desc(sh, (struct pl_shader_desc) { + .binding.object = params->tex, + .desc = { + .name = "image", + .type = PL_DESC_STORAGE_IMG, + .access = PL_DESC_ACCESS_WRITEONLY, + }, + }); + + // If the transfer width is a natural multiple of the thread size, we + // can skip the bounds check. Otherwise, make sure we aren't blitting out + // of the range since this would read out of bounds. + int groups_x = PL_DIV_UP(pl_rect_w(params->rc), threads); + if (groups_x * threads != pl_rect_w(params->rc)) { + GLSL("if (gl_GlobalInvocationID.x >= %d) \n" + " return; \n", + pl_rect_w(params->rc)); + } + + // fmt->texel_align contains the size of an individual color value + assert(fmt->texel_size == fmt->num_components * fmt->texel_align); + GLSL("vec4 color = vec4(0.0, 0.0, 0.0, 1.0); \n" + "ivec3 pos = ivec3(gl_GlobalInvocationID); \n" + "ivec3 tex_pos = pos + ivec3("$", "$", "$"); \n" + "int base = "$" + pos.z * "$" + pos.y * "$" + pos.x * "$"; \n", + SH_INT_DYN(params->rc.x0), SH_INT_DYN(params->rc.y0), SH_INT_DYN(params->rc.z0), + SH_INT_DYN(params->buf_offset), + SH_INT(params->depth_pitch / fmt->texel_align), + SH_INT(params->row_pitch / fmt->texel_align), + SH_INT(fmt->texel_size / fmt->texel_align)); + + for (int i = 0; i < fmt->num_components; i++) + GLSL("color[%d] = imageLoad("$", base + %d).r; \n", i, buf, i); + + int dims = pl_tex_params_dimension(tex->params); + static const char *coord_types[] = { + [1] = "int", + [2] = "ivec2", + [3] = "ivec3", + }; + + GLSL("imageStore("$", %s(tex_pos), color);\n", img, coord_types[dims]); + return pl_dispatch_compute(dp, pl_dispatch_compute_params( + .shader = &sh, + .dispatch_size = { + groups_x, + pl_rect_h(params->rc), + pl_rect_d(params->rc), + }, + )); + +error: + return false; +} + +bool pl_tex_download_texel(pl_gpu gpu, const struct pl_tex_transfer_params *params) +{ + const int threads = PL_MIN(256, pl_rect_w(params->rc)); + pl_tex tex = params->tex; + pl_fmt fmt = tex->params.format; + pl_require(gpu, params->buf); + + pl_dispatch dp = pl_gpu_dispatch(gpu); + pl_shader sh = pl_dispatch_begin(dp); + if (!sh_try_compute(sh, threads, 1, false, 0)) { + PL_ERR(gpu, "Failed emulating texture transfer!"); + pl_dispatch_abort(dp, &sh); + return false; + } + + ident_t buf = sh_desc(sh, (struct pl_shader_desc) { + .binding.object = params->buf, + .desc = { + .name = "data", + .type = PL_DESC_BUF_TEXEL_STORAGE, + }, + }); + + ident_t img = sh_desc(sh, (struct pl_shader_desc) { + .binding.object = params->tex, + .desc = { + 
.name = "image", + .type = PL_DESC_STORAGE_IMG, + .access = PL_DESC_ACCESS_READONLY, + }, + }); + + int groups_x = PL_DIV_UP(pl_rect_w(params->rc), threads); + if (groups_x * threads != pl_rect_w(params->rc)) { + GLSL("if (gl_GlobalInvocationID.x >= %d) \n" + " return; \n", + pl_rect_w(params->rc)); + } + + int dims = pl_tex_params_dimension(tex->params); + static const char *coord_types[] = { + [1] = "int", + [2] = "ivec2", + [3] = "ivec3", + }; + + assert(fmt->texel_size == fmt->num_components * fmt->texel_align); + GLSL("ivec3 pos = ivec3(gl_GlobalInvocationID); \n" + "ivec3 tex_pos = pos + ivec3("$", "$", "$"); \n" + "int base = "$" + pos.z * "$" + pos.y * "$" + pos.x * "$"; \n" + "vec4 color = imageLoad("$", %s(tex_pos)); \n", + SH_INT_DYN(params->rc.x0), SH_INT_DYN(params->rc.y0), SH_INT_DYN(params->rc.z0), + SH_INT_DYN(params->buf_offset), + SH_INT(params->depth_pitch / fmt->texel_align), + SH_INT(params->row_pitch / fmt->texel_align), + SH_INT(fmt->texel_size / fmt->texel_align), + img, coord_types[dims]); + + for (int i = 0; i < fmt->num_components; i++) + GLSL("imageStore("$", base + %d, vec4(color[%d])); \n", buf, i, i); + + return pl_dispatch_compute(dp, pl_dispatch_compute_params( + .shader = &sh, + .dispatch_size = { + groups_x, + pl_rect_h(params->rc), + pl_rect_d(params->rc), + }, + )); + +error: + return false; +} + +bool pl_tex_blit_compute(pl_gpu gpu, const struct pl_tex_blit_params *params) +{ + if (!params->dst->params.storable) + return false; + + // Normalize `dst_rc`, moving all flipping to `src_rc` instead. + pl_rect3d src_rc = params->src_rc; + pl_rect3d dst_rc = params->dst_rc; + if (pl_rect_w(dst_rc) < 0) { + PL_SWAP(src_rc.x0, src_rc.x1); + PL_SWAP(dst_rc.x0, dst_rc.x1); + } + if (pl_rect_h(dst_rc) < 0) { + PL_SWAP(src_rc.y0, src_rc.y1); + PL_SWAP(dst_rc.y0, dst_rc.y1); + } + if (pl_rect_d(dst_rc) < 0) { + PL_SWAP(src_rc.z0, src_rc.z1); + PL_SWAP(dst_rc.z0, dst_rc.z1); + } + + bool needs_scaling = false; + needs_scaling |= pl_rect_w(dst_rc) != abs(pl_rect_w(src_rc)); + needs_scaling |= pl_rect_h(dst_rc) != abs(pl_rect_h(src_rc)); + needs_scaling |= pl_rect_d(dst_rc) != abs(pl_rect_d(src_rc)); + + // Exception: fast path for 1-pixel blits, which don't require scaling + bool is_1pixel = abs(pl_rect_w(src_rc)) == 1 && abs(pl_rect_h(src_rc)) == 1; + needs_scaling &= !is_1pixel; + + // Manual trilinear interpolation would be too slow to justify + bool needs_sampling = needs_scaling && params->sample_mode != PL_TEX_SAMPLE_NEAREST; + needs_sampling |= !params->src->params.storable; + if (needs_sampling && !params->src->params.sampleable) + return false; + + const int threads = 256; + int bw = PL_MIN(32, pl_rect_w(dst_rc)); + int bh = PL_MIN(threads / bw, pl_rect_h(dst_rc)); + pl_dispatch dp = pl_gpu_dispatch(gpu); + pl_shader sh = pl_dispatch_begin(dp); + if (!sh_try_compute(sh, bw, bh, false, 0)) { + pl_dispatch_abort(dp, &sh); + return false; + } + + // Avoid over-writing into `dst` + int groups_x = PL_DIV_UP(pl_rect_w(dst_rc), bw); + if (groups_x * bw != pl_rect_w(dst_rc)) { + GLSL("if (gl_GlobalInvocationID.x >= %d) \n" + " return; \n", + pl_rect_w(dst_rc)); + } + + int groups_y = PL_DIV_UP(pl_rect_h(dst_rc), bh); + if (groups_y * bh != pl_rect_h(dst_rc)) { + GLSL("if (gl_GlobalInvocationID.y >= %d) \n" + " return; \n", + pl_rect_h(dst_rc)); + } + + ident_t dst = sh_desc(sh, (struct pl_shader_desc) { + .binding.object = params->dst, + .desc = { + .name = "dst", + .type = PL_DESC_STORAGE_IMG, + .access = PL_DESC_ACCESS_WRITEONLY, + }, + }); + + static const char 
*vecs[] = { + [1] = "float", + [2] = "vec2", + [3] = "vec3", + [4] = "vec4", + }; + + static const char *ivecs[] = { + [1] = "int", + [2] = "ivec2", + [3] = "ivec3", + [4] = "ivec4", + }; + + int src_dims = pl_tex_params_dimension(params->src->params); + int dst_dims = pl_tex_params_dimension(params->dst->params); + GLSL("ivec3 pos = ivec3(gl_GlobalInvocationID); \n" + "%s dst_pos = %s(pos + ivec3(%d, %d, %d)); \n", + ivecs[dst_dims], ivecs[dst_dims], + params->dst_rc.x0, params->dst_rc.y0, params->dst_rc.z0); + + if (needs_sampling || (needs_scaling && params->src->params.sampleable)) { + + ident_t src = sh_desc(sh, (struct pl_shader_desc) { + .desc = { + .name = "src", + .type = PL_DESC_SAMPLED_TEX, + }, + .binding = { + .object = params->src, + .address_mode = PL_TEX_ADDRESS_CLAMP, + .sample_mode = params->sample_mode, + } + }); + + if (is_1pixel) { + GLSL("%s fpos = %s(0.5); \n", vecs[src_dims], vecs[src_dims]); + } else { + GLSL("vec3 fpos = (vec3(pos) + vec3(0.5)) / vec3(%d.0, %d.0, %d.0); \n", + pl_rect_w(dst_rc), pl_rect_h(dst_rc), pl_rect_d(dst_rc)); + } + + GLSL("%s src_pos = %s(0.5); \n" + "src_pos.x = mix(%f, %f, fpos.x); \n", + vecs[src_dims], vecs[src_dims], + (float) src_rc.x0 / params->src->params.w, + (float) src_rc.x1 / params->src->params.w); + + if (params->src->params.h) { + GLSL("src_pos.y = mix(%f, %f, fpos.y); \n", + (float) src_rc.y0 / params->src->params.h, + (float) src_rc.y1 / params->src->params.h); + } + + if (params->src->params.d) { + GLSL("src_pos.z = mix(%f, %f, fpos.z); \n", + (float) src_rc.z0 / params->src->params.d, + (float) src_rc.z1 / params->src->params.d); + } + + GLSL("imageStore("$", dst_pos, textureLod("$", src_pos, 0.0)); \n", + dst, src); + + } else { + + ident_t src = sh_desc(sh, (struct pl_shader_desc) { + .binding.object = params->src, + .desc = { + .name = "src", + .type = PL_DESC_STORAGE_IMG, + .access = PL_DESC_ACCESS_READONLY, + }, + }); + + if (is_1pixel) { + GLSL("ivec3 src_pos = ivec3(0); \n"); + } else if (needs_scaling) { + GLSL("ivec3 src_pos = ivec3(vec3(%f, %f, %f) * vec3(pos)); \n", + fabs((float) pl_rect_w(src_rc) / pl_rect_w(dst_rc)), + fabs((float) pl_rect_h(src_rc) / pl_rect_h(dst_rc)), + fabs((float) pl_rect_d(src_rc) / pl_rect_d(dst_rc))); + } else { + GLSL("ivec3 src_pos = pos; \n"); + } + + GLSL("src_pos = ivec3(%d, %d, %d) * src_pos + ivec3(%d, %d, %d); \n" + "imageStore("$", dst_pos, imageLoad("$", %s(src_pos))); \n", + src_rc.x1 < src_rc.x0 ? -1 : 1, + src_rc.y1 < src_rc.y0 ? -1 : 1, + src_rc.z1 < src_rc.z0 ? 
-1 : 1, + src_rc.x0, src_rc.y0, src_rc.z0, + dst, src, ivecs[src_dims]); + + } + + return pl_dispatch_compute(dp, pl_dispatch_compute_params( + .shader = &sh, + .dispatch_size = { + groups_x, + groups_y, + pl_rect_d(dst_rc), + }, + )); +} + +void pl_tex_blit_raster(pl_gpu gpu, const struct pl_tex_blit_params *params) +{ + enum pl_fmt_type src_type = params->src->params.format->type; + enum pl_fmt_type dst_type = params->dst->params.format->type; + + // Only for 2D textures + pl_assert(params->src->params.h && !params->src->params.d); + pl_assert(params->dst->params.h && !params->dst->params.d); + + // Integer textures are not supported + pl_assert(src_type != PL_FMT_UINT && src_type != PL_FMT_SINT); + pl_assert(dst_type != PL_FMT_UINT && dst_type != PL_FMT_SINT); + + pl_rect2df src_rc = { + .x0 = params->src_rc.x0, .x1 = params->src_rc.x1, + .y0 = params->src_rc.y0, .y1 = params->src_rc.y1, + }; + pl_rect2d dst_rc = { + .x0 = params->dst_rc.x0, .x1 = params->dst_rc.x1, + .y0 = params->dst_rc.y0, .y1 = params->dst_rc.y1, + }; + + pl_dispatch dp = pl_gpu_dispatch(gpu); + pl_shader sh = pl_dispatch_begin(dp); + sh->output = PL_SHADER_SIG_COLOR; + + ident_t pos, src = sh_bind(sh, params->src, PL_TEX_ADDRESS_CLAMP, + params->sample_mode, "src_tex", &src_rc, &pos, NULL); + + GLSL("vec4 color = textureLod("$", "$", 0.0); \n", src, pos); + + pl_dispatch_finish(dp, pl_dispatch_params( + .shader = &sh, + .target = params->dst, + .rect = dst_rc, + )); +} + +bool pl_buf_copy_swap(pl_gpu gpu, const struct pl_buf_copy_swap_params *params) +{ + pl_buf src = params->src, dst = params->dst; + pl_require(gpu, src->params.storable && dst->params.storable); + pl_require(gpu, params->src_offset % sizeof(unsigned) == 0); + pl_require(gpu, params->dst_offset % sizeof(unsigned) == 0); + pl_require(gpu, params->src_offset + params->size <= src->params.size); + pl_require(gpu, params->dst_offset + params->size <= dst->params.size); + pl_require(gpu, src != dst || params->src_offset == params->dst_offset); + pl_require(gpu, params->size % sizeof(unsigned) == 0); + pl_require(gpu, params->wordsize == sizeof(uint16_t) || + params->wordsize == sizeof(uint32_t)); + + const size_t words = params->size / sizeof(unsigned); + const size_t src_off = params->src_offset / sizeof(unsigned); + const size_t dst_off = params->dst_offset / sizeof(unsigned); + + const int threads = PL_MIN(256, words); + pl_dispatch dp = pl_gpu_dispatch(gpu); + pl_shader sh = pl_dispatch_begin(dp); + if (!sh_try_compute(sh, threads, 1, false, 0)) { + pl_dispatch_abort(dp, &sh); + return false; + } + + const size_t groups = PL_DIV_UP(words, threads); + if (groups * threads > words) { + GLSL("if (gl_GlobalInvocationID.x >= %zu) \n" + " return; \n", + words); + } + + sh_desc(sh, (struct pl_shader_desc) { + .binding.object = src, + .desc = { + .name = "SrcBuf", + .type = PL_DESC_BUF_STORAGE, + .access = src == dst ? 
PL_DESC_ACCESS_READWRITE : PL_DESC_ACCESS_READONLY, + }, + .num_buffer_vars = 1, + .buffer_vars = &(struct pl_buffer_var) { + .var = { + .name = "src", + .type = PL_VAR_UINT, + .dim_v = 1, + .dim_m = 1, + .dim_a = src_off + words, + }, + }, + }); + + if (src != dst) { + sh_desc(sh, (struct pl_shader_desc) { + .binding.object = dst, + .desc = { + .name = "DstBuf", + .type = PL_DESC_BUF_STORAGE, + .access = PL_DESC_ACCESS_WRITEONLY, + }, + .num_buffer_vars = 1, + .buffer_vars = &(struct pl_buffer_var) { + .var = { + .name = "dst", + .type = PL_VAR_UINT, + .dim_v = 1, + .dim_m = 1, + .dim_a = dst_off + words, + }, + }, + }); + } else { + GLSL("#define dst src \n"); + } + + GLSL("// pl_buf_copy_swap \n" + "{ \n" + "uint word = src["$" + gl_GlobalInvocationID.x]; \n" + "word = (word & 0xFF00FF00u) >> 8 | \n" + " (word & 0x00FF00FFu) << 8; \n", + SH_UINT(src_off)); + if (params->wordsize > 2) { + GLSL("word = (word & 0xFFFF0000u) >> 16 | \n" + " (word & 0x0000FFFFu) << 16; \n"); + } + GLSL("dst["$" + gl_GlobalInvocationID.x] = word; \n" + "} \n", + SH_UINT(dst_off)); + + return pl_dispatch_compute(dp, pl_dispatch_compute_params( + .shader = &sh, + .dispatch_size = {groups, 1, 1}, + )); + +error: + if (src->params.debug_tag || dst->params.debug_tag) { + PL_ERR(gpu, " for buffers: src %s, dst %s", + src->params.debug_tag, dst->params.debug_tag); + } + return false; +} + +void pl_pass_run_vbo(pl_gpu gpu, const struct pl_pass_run_params *params) +{ + if (!params->vertex_data && !params->index_data) + return pl_pass_run(gpu, params); + + struct pl_pass_run_params newparams = *params; + pl_buf vert = NULL, index = NULL; + + if (params->vertex_data) { + vert = pl_buf_create(gpu, pl_buf_params( + .size = pl_vertex_buf_size(params), + .initial_data = params->vertex_data, + .drawable = true, + )); + + if (!vert) { + PL_ERR(gpu, "Failed allocating vertex buffer!"); + return; + } + + newparams.vertex_buf = vert; + newparams.vertex_data = NULL; + } + + if (params->index_data) { + index = pl_buf_create(gpu, pl_buf_params( + .size = pl_index_buf_size(params), + .initial_data = params->index_data, + .drawable = true, + )); + + if (!index) { + PL_ERR(gpu, "Failed allocating index buffer!"); + return; + } + + newparams.index_buf = index; + newparams.index_data = NULL; + } + + pl_pass_run(gpu, &newparams); + pl_buf_destroy(gpu, &vert); + pl_buf_destroy(gpu, &index); +} + +struct pl_pass_params pl_pass_params_copy(void *alloc, const struct pl_pass_params *params) +{ + struct pl_pass_params new = *params; + + new.glsl_shader = pl_str0dup0(alloc, new.glsl_shader); + new.vertex_shader = pl_str0dup0(alloc, new.vertex_shader); + if (new.blend_params) + new.blend_params = pl_memdup_ptr(alloc, new.blend_params); + +#define DUPNAMES(field) \ + do { \ + size_t _size = new.num_##field * sizeof(new.field[0]); \ + new.field = pl_memdup(alloc, new.field, _size); \ + for (int j = 0; j < new.num_##field; j++) \ + new.field[j].name = pl_str0dup0(alloc, new.field[j].name); \ + } while (0) + + DUPNAMES(variables); + DUPNAMES(descriptors); + DUPNAMES(vertex_attribs); + +#undef DUPNAMES + + new.constant_data = NULL; + new.constants = pl_memdup(alloc, new.constants, + new.num_constants * sizeof(new.constants[0])); + + return new; +} + +size_t pl_vertex_buf_size(const struct pl_pass_run_params *params) +{ + if (!params->index_data) + return params->vertex_count * params->pass->params.vertex_stride; + + int num_vertices = 0; + const void *idx = params->index_data; + switch (params->index_fmt) { + case PL_INDEX_UINT16: + for (int i = 0; i 
< params->vertex_count; i++) + num_vertices = PL_MAX(num_vertices, ((const uint16_t *) idx)[i]); + break; + case PL_INDEX_UINT32: + for (int i = 0; i < params->vertex_count; i++) + num_vertices = PL_MAX(num_vertices, ((const uint32_t *) idx)[i]); + break; + case PL_INDEX_FORMAT_COUNT: pl_unreachable(); + } + + return (num_vertices + 1) * params->pass->params.vertex_stride; +} + +const char *print_uuid(char buf[3 * UUID_SIZE], const uint8_t uuid[UUID_SIZE]) +{ + static const char *hexdigits = "0123456789ABCDEF"; + for (int i = 0; i < UUID_SIZE; i++) { + uint8_t x = uuid[i]; + buf[3 * i + 0] = hexdigits[x >> 4]; + buf[3 * i + 1] = hexdigits[x & 0xF]; + buf[3 * i + 2] = i == UUID_SIZE - 1 ? '\0' : ':'; + } + + return buf; +} + +const char *print_drm_mod(char buf[DRM_MOD_SIZE], uint64_t mod) +{ + switch (mod) { + case DRM_FORMAT_MOD_LINEAR: return "LINEAR"; + case DRM_FORMAT_MOD_INVALID: return "INVALID"; + } + + uint8_t vendor = mod >> 56; + uint64_t val = mod & ((1ULL << 56) - 1); + + const char *name = NULL; + switch (vendor) { + case 0x00: name = "NONE"; break; + case 0x01: name = "INTEL"; break; + case 0x02: name = "AMD"; break; + case 0x03: name = "NVIDIA"; break; + case 0x04: name = "SAMSUNG"; break; + case 0x08: name = "ARM"; break; + } + + if (name) { + snprintf(buf, DRM_MOD_SIZE, "%s 0x%"PRIx64, name, val); + } else { + snprintf(buf, DRM_MOD_SIZE, "0x%02x 0x%"PRIx64, vendor, val); + } + + return buf; +} |
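The FOURCC macro used by the pl_fmt_fourccs table packs four ASCII characters into a 32-bit code in little-endian byte order, which is how pl_fmt_fourcc maps libplacebo format names onto DRM-style fourcc values. A minimal standalone check of that packing (the expected 0x3231564E value for NV12 is quoted from the common DRM fourcc convention, not from this file):

#include <assert.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define FOURCC(a,b,c,d) ((uint32_t)(a) | ((uint32_t)(b) << 8) | \
                         ((uint32_t)(c) << 16) | ((uint32_t)(d) << 24))

int main(void)
{
    // 'N' = 0x4E, 'V' = 0x56, '1' = 0x31, '2' = 0x32, packed little-endian
    uint32_t nv12 = FOURCC('N','V','1','2');
    assert(nv12 == 0x3231564Eu);
    printf("NV12 fourcc: 0x%08" PRIX32 "\n", nv12);
    return 0;
}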
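pl_tex_transfer_size sizes the staging data as tightly as possible: only rows and layers that are followed by further data need their full pitch, and the final row only needs w texels. A minimal sketch of the same arithmetic with the rectangle and pitches passed in directly (the helper name and all numbers below are illustrative only):

#include <stddef.h>
#include <stdio.h>

// Same arithmetic as pl_tex_transfer_size(), but taking the extents and
// pitches as plain arguments. All values used below are hypothetical.
static size_t transfer_size(int w, int h, int d,
                            size_t pixel_pitch, size_t row_pitch,
                            size_t depth_pitch)
{
    // Only rows/layers followed by more data need their full pitch;
    // the final row needs just `w` texels worth of bytes.
    return (d - 1) * depth_pitch + (h - 1) * row_pitch + w * pixel_pitch;
}

int main(void)
{
    // 100x64 region of an rgba8 texture (4 bytes per texel),
    // rows padded to a 512-byte pitch, single depth layer
    printf("%zu bytes\n", transfer_size(100, 64, 1, 4, 512, 0));
    // prints "32656 bytes": 63 * 512 + 100 * 4
    return 0;
}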
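The compute shader emitted by pl_buf_copy_swap always swaps the bytes within each 16-bit lane of a 32-bit word, and for a word size of 4 additionally swaps the two lanes, which amounts to a full 32-bit byteswap. A host-side sketch of that per-word transform (the function name is mine):

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

// Per-word transform matching the shader in pl_buf_copy_swap(): always swap
// bytes within each 16-bit lane; for 4-byte words also swap the two lanes,
// i.e. a full 32-bit byteswap.
static uint32_t swap_word(uint32_t word, size_t wordsize)
{
    word = (word & 0xFF00FF00u) >> 8 |
           (word & 0x00FF00FFu) << 8;
    if (wordsize > 2) {
        word = (word & 0xFFFF0000u) >> 16 |
               (word & 0x0000FFFFu) << 16;
    }
    return word;
}

int main(void)
{
    assert(swap_word(0x11223344u, sizeof(uint16_t)) == 0x22114433u);
    assert(swap_word(0x11223344u, sizeof(uint32_t)) == 0x44332211u);
    return 0;
}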
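For indexed draws, pl_vertex_buf_size has to size the vertex buffer after the highest vertex the index list can address, not after the number of indices drawn. A small self-contained illustration of the 16-bit index path (the stride and index values are made up):

#include <stdint.h>
#include <stdio.h>

// Sketch of the PL_INDEX_UINT16 path of pl_vertex_buf_size(): the vertex
// buffer must cover every vertex the index list can reference.
static size_t vertex_buf_size(const uint16_t *idx, int num_indices, size_t stride)
{
    int num_vertices = 0;
    for (int i = 0; i < num_indices; i++)
        num_vertices = idx[i] > num_vertices ? idx[i] : num_vertices;
    return (num_vertices + 1) * stride;
}

int main(void)
{
    const uint16_t idx[] = {0, 2, 5};
    // Only 3 indices are drawn, but vertex 5 is referenced, so with a 20-byte
    // stride the buffer must hold (5 + 1) * 20 = 120 bytes.
    printf("%zu bytes\n", vertex_buf_size(idx, 3, 20));
    return 0;
}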
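print_drm_mod relies on the DRM modifier layout in which the top 8 bits carry the vendor ID and the low 56 bits a vendor-specific code. A standalone decode of a hypothetical modifier built with vendor 0x02 (AMD in the switch above); the 0x1234 payload is arbitrary:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    // Hypothetical modifier: vendor 0x02 with an arbitrary 56-bit code.
    uint64_t mod = (UINT64_C(0x02) << 56) | UINT64_C(0x1234);

    uint8_t vendor = mod >> 56;              // top 8 bits: vendor ID
    uint64_t val = mod & ((1ULL << 56) - 1); // low 56 bits: vendor-specific

    printf("vendor 0x%02x, value 0x%" PRIx64 "\n", (unsigned) vendor, val);
    return 0;
}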