/*
 * This file is part of libplacebo.
 *
 * libplacebo is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * libplacebo is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
 */

#include <math.h>

#include "common.h"
#include "shaders.h"
#include "gpu.h"

// GPU-internal helpers

static int cmp_fmt(const void *pa, const void *pb)
{
    pl_fmt a = *(pl_fmt *)pa;
    pl_fmt b = *(pl_fmt *)pb;

    // Always prefer non-opaque formats
    if (a->opaque != b->opaque)
        return PL_CMP(a->opaque, b->opaque);

    // Always prefer non-emulated formats
    if (a->emulated != b->emulated)
        return PL_CMP(a->emulated, b->emulated);

    int ca = __builtin_popcount(a->caps),
        cb = __builtin_popcount(b->caps);
    if (ca != cb)
        return -PL_CMP(ca, cb); // invert to sort higher values first

    // If the population count is the same but the caps are different, prefer
    // the caps with a "lower" value (which tend to be more fundamental caps)
    if (a->caps != b->caps)
        return PL_CMP(a->caps, b->caps);

    // If the capabilities are equal, sort based on the component attributes
    for (int i = 0; i < PL_ARRAY_SIZE(a->component_depth); i++) {
        int da = a->component_depth[i],
            db = b->component_depth[i];
        if (da != db)
            return PL_CMP(da, db);

        int ha = a->host_bits[i],
            hb = b->host_bits[i];
        if (ha != hb)
            return PL_CMP(ha, hb);

        int oa = a->sample_order[i],
            ob = b->sample_order[i];
        if (oa != ob)
            return PL_CMP(oa, ob);
    }

    // Fall back to sorting by the name (for stability)
    return strcmp(a->name, b->name);
}
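// Legend for the single-letter CAPS column printed by print_formats() below
// (derived from the FMT_BOOL flags used in CAP_VALUES):
//   S = SAMPLEABLE, s = STORABLE, L = LINEAR, R = RENDERABLE, b = BLENDABLE,
//   B = BLITTABLE, V = VERTEX, u = TEXEL_UNIFORM, t = TEXEL_STORAGE,
//   H = HOST_READABLE, W = READWRITE, G = gatherable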
#define FMT_BOOL(letter, cap) ((cap) ? (letter) : '-')
#define FMT_IDX4(f) (f)[0], (f)[1], (f)[2], (f)[3]

static void print_formats(pl_gpu gpu)
{
    if (!pl_msg_test(gpu->log, PL_LOG_DEBUG))
        return;

#define CAP_HEADER "%-12s"
#define CAP_FIELDS "%c%c%c%c%c%c%c%c%c%c%c%c"
#define CAP_VALUES \
    FMT_BOOL('S', fmt->caps & PL_FMT_CAP_SAMPLEABLE), \
    FMT_BOOL('s', fmt->caps & PL_FMT_CAP_STORABLE), \
    FMT_BOOL('L', fmt->caps & PL_FMT_CAP_LINEAR), \
    FMT_BOOL('R', fmt->caps & PL_FMT_CAP_RENDERABLE), \
    FMT_BOOL('b', fmt->caps & PL_FMT_CAP_BLENDABLE), \
    FMT_BOOL('B', fmt->caps & PL_FMT_CAP_BLITTABLE), \
    FMT_BOOL('V', fmt->caps & PL_FMT_CAP_VERTEX), \
    FMT_BOOL('u', fmt->caps & PL_FMT_CAP_TEXEL_UNIFORM), \
    FMT_BOOL('t', fmt->caps & PL_FMT_CAP_TEXEL_STORAGE), \
    FMT_BOOL('H', fmt->caps & PL_FMT_CAP_HOST_READABLE), \
    FMT_BOOL('W', fmt->caps & PL_FMT_CAP_READWRITE), \
    FMT_BOOL('G', fmt->gatherable)

    PL_DEBUG(gpu, "GPU texture formats:");
    PL_DEBUG(gpu, " %-20s %-6s %-4s %-4s " CAP_HEADER " %-3s %-13s %-13s %-10s %-10s %-6s",
             "NAME", "TYPE", "SIZE", "COMP", "CAPS", "EMU", "DEPTH",
             "HOST_BITS", "GLSL_TYPE", "GLSL_FMT", "FOURCC");

    for (int n = 0; n < gpu->num_formats; n++) {
        pl_fmt fmt = gpu->formats[n];

        static const char *types[] = {
            [PL_FMT_UNKNOWN] = "UNKNOWN",
            [PL_FMT_UNORM]   = "UNORM",
            [PL_FMT_SNORM]   = "SNORM",
            [PL_FMT_UINT]    = "UINT",
            [PL_FMT_SINT]    = "SINT",
            [PL_FMT_FLOAT]   = "FLOAT",
        };

        static const char idx_map[4] = {'R', 'G', 'B', 'A'};
        char indices[4] = {' ', ' ', ' ', ' '};
        if (!fmt->opaque) {
            for (int i = 0; i < fmt->num_components; i++)
                indices[i] = idx_map[fmt->sample_order[i]];
        }

        PL_DEBUG(gpu, " %-20s %-6s %-4zu %c%c%c%c " CAP_FIELDS " %-3s "
                 "{%-2d %-2d %-2d %-2d} {%-2d %-2d %-2d %-2d} %-10s %-10s %-6s",
                 fmt->name, types[fmt->type], fmt->texel_size,
                 FMT_IDX4(indices), CAP_VALUES, fmt->emulated ? "y" : "n",
                 FMT_IDX4(fmt->component_depth), FMT_IDX4(fmt->host_bits),
                 PL_DEF(fmt->glsl_type, ""), PL_DEF(fmt->glsl_format, ""),
                 PRINT_FOURCC(fmt->fourcc));

#undef CAP_HEADER
#undef CAP_FIELDS
#undef CAP_VALUES

        for (int i = 0; i < fmt->num_modifiers; i++) {
            PL_TRACE(gpu, " modifiers[%d]: %s",
                     i, PRINT_DRM_MOD(fmt->modifiers[i]));
        }
    }
}

pl_gpu pl_gpu_finalize(struct pl_gpu_t *gpu)
{
    // Sort formats
    qsort(gpu->formats, gpu->num_formats, sizeof(pl_fmt), cmp_fmt);

    // Verification
    pl_assert(gpu->limits.max_tex_2d_dim);
    pl_assert(gpu->limits.max_variable_comps || gpu->limits.max_ubo_size);
    pl_assert(gpu->limits.max_ubo_size <= gpu->limits.max_buf_size);
    pl_assert(gpu->limits.max_ssbo_size <= gpu->limits.max_buf_size);
    pl_assert(gpu->limits.max_vbo_size <= gpu->limits.max_buf_size);
    pl_assert(gpu->limits.max_mapped_size <= gpu->limits.max_buf_size);

    for (int n = 0; n < gpu->num_formats; n++) {
        pl_fmt fmt = gpu->formats[n];
        pl_assert(fmt->name);
        pl_assert(fmt->type);
        pl_assert(fmt->num_components);
        pl_assert(fmt->internal_size);
        pl_assert(fmt->opaque ? !fmt->texel_size : fmt->texel_size);
        pl_assert(!fmt->gatherable || (fmt->caps & PL_FMT_CAP_SAMPLEABLE));
        for (int i = 0; i < fmt->num_components; i++) {
            pl_assert(fmt->component_depth[i]);
            pl_assert(fmt->opaque ? !fmt->host_bits[i] : fmt->host_bits[i]);
        }
        for (int i = 0; i < fmt->num_planes; i++)
            pl_assert(fmt->planes[i].format);

        enum pl_fmt_caps texel_caps = PL_FMT_CAP_VERTEX |
                                      PL_FMT_CAP_TEXEL_UNIFORM |
                                      PL_FMT_CAP_TEXEL_STORAGE;

        if (fmt->caps & texel_caps) {
            pl_assert(fmt->glsl_type);
            pl_assert(!fmt->opaque);
        }

        if (!fmt->opaque) {
            pl_assert(fmt->texel_size && fmt->texel_align);
            pl_assert((fmt->texel_size % fmt->texel_align) == 0);
            pl_assert(fmt->internal_size == fmt->texel_size || fmt->emulated);
        } else {
            pl_assert(!fmt->texel_size && !fmt->texel_align);
            pl_assert(!(fmt->caps & PL_FMT_CAP_HOST_READABLE));
        }

        // Assert uniqueness of name
        for (int o = n + 1; o < gpu->num_formats; o++)
            pl_assert(strcmp(fmt->name, gpu->formats[o]->name) != 0);
    }

    // Print info
    PL_INFO(gpu, "GPU information:");

#define LOG(fmt, field) \
    PL_INFO(gpu, " %-26s %" fmt, #field ":", gpu->LOG_STRUCT.field)

#define LOG_STRUCT glsl
    PL_INFO(gpu, " GLSL version: %d%s", gpu->glsl.version,
            gpu->glsl.vulkan ? " (vulkan)" : gpu->glsl.gles ? " es" : "");
    if (gpu->glsl.compute) {
        LOG("zu", max_shmem_size);
        LOG(PRIu32, max_group_threads);
        LOG(PRIu32, max_group_size[0]);
        LOG(PRIu32, max_group_size[1]);
        LOG(PRIu32, max_group_size[2]);
    }
    LOG(PRIu32, subgroup_size);
    LOG(PRIi16, min_gather_offset);
    LOG(PRIi16, max_gather_offset);
#undef LOG_STRUCT

#define LOG_STRUCT limits
    PL_INFO(gpu, " Limits:");
    // pl_gpu
    LOG("d", thread_safe);
    LOG("d", callbacks);
    // pl_buf
    LOG("zu", max_buf_size);
    LOG("zu", max_ubo_size);
    LOG("zu", max_ssbo_size);
    LOG("zu", max_vbo_size);
    LOG("zu", max_mapped_size);
    LOG(PRIu64, max_buffer_texels);
    LOG("zu", align_host_ptr);
    LOG("d", host_cached);
    // pl_tex
    LOG(PRIu32, max_tex_1d_dim);
    LOG(PRIu32, max_tex_2d_dim);
    LOG(PRIu32, max_tex_3d_dim);
    LOG("d", blittable_1d_3d);
    LOG("d", buf_transfer);
    LOG("zu", align_tex_xfer_pitch);
    LOG("zu", align_tex_xfer_offset);
    // pl_pass
    LOG("zu", max_variable_comps);
    LOG("zu", max_constants);
    LOG("zu", max_pushc_size);
    LOG("zu", align_vertex_stride);
    if (gpu->glsl.compute) {
        LOG(PRIu32, max_dispatch[0]);
        LOG(PRIu32, max_dispatch[1]);
        LOG(PRIu32, max_dispatch[2]);
    }
    LOG(PRIu32, fragment_queues);
    LOG(PRIu32, compute_queues);
#undef LOG_STRUCT
#undef LOG

    if (pl_gpu_supports_interop(gpu)) {
        PL_INFO(gpu, " External API interop:");
        PL_INFO(gpu, " UUID: %s", PRINT_UUID(gpu->uuid));
        PL_INFO(gpu, " PCI: %04x:%02x:%02x:%x",
                gpu->pci.domain, gpu->pci.bus, gpu->pci.device, gpu->pci.function);
        PL_INFO(gpu, " buf export caps: 0x%x",
                (unsigned int) gpu->export_caps.buf);
        PL_INFO(gpu, " buf import caps: 0x%x",
                (unsigned int) gpu->import_caps.buf);
        PL_INFO(gpu, " tex export caps: 0x%x",
                (unsigned int) gpu->export_caps.tex);
        PL_INFO(gpu, " tex import caps: 0x%x",
                (unsigned int) gpu->import_caps.tex);
        PL_INFO(gpu, " sync export caps: 0x%x",
                (unsigned int) gpu->export_caps.sync);
        PL_INFO(gpu, " sync import caps: 0x%x",
                (unsigned int) gpu->import_caps.sync);
    }

    print_formats(gpu);

    // Finally, create a `pl_dispatch` object for internal operations
    struct pl_gpu_fns *impl = PL_PRIV(gpu);
    atomic_init(&impl->cache, NULL);
    impl->dp = pl_dispatch_create(gpu->log, gpu);
    return gpu;
}
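/*
 * Illustrative sketch only (not code from this file): a GPU backend is
 * expected to fill in its `glsl` capabilities, `limits` and format list on
 * its own pl_gpu_t allocation, then call pl_gpu_finalize() as the last step
 * of device creation, e.g.
 *
 *     struct pl_gpu_t *gpu = ...;  // backend-allocated, with pl_gpu_fns priv
 *     // ... fill in gpu->glsl, gpu->limits, gpu->formats ...
 *     return pl_gpu_finalize(gpu); // sorts formats, verifies, logs info
 */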
struct glsl_fmt {
    enum pl_fmt_type type;
    int num_components;
    int depth[4];
    const char *glsl_format;
};

// List taken from the GLSL specification. (Yes, GLSL supports only exactly
// these formats with exactly these names)
static const struct glsl_fmt pl_glsl_fmts[] = {
    {PL_FMT_FLOAT, 1, {16},             "r16f"},
    {PL_FMT_FLOAT, 1, {32},             "r32f"},
    {PL_FMT_FLOAT, 2, {16, 16},         "rg16f"},
    {PL_FMT_FLOAT, 2, {32, 32},         "rg32f"},
    {PL_FMT_FLOAT, 4, {16, 16, 16, 16}, "rgba16f"},
    {PL_FMT_FLOAT, 4, {32, 32, 32, 32}, "rgba32f"},
    {PL_FMT_FLOAT, 3, {11, 11, 10},     "r11f_g11f_b10f"},

    {PL_FMT_UNORM, 1, {8},              "r8"},
    {PL_FMT_UNORM, 1, {16},             "r16"},
    {PL_FMT_UNORM, 2, {8, 8},           "rg8"},
    {PL_FMT_UNORM, 2, {16, 16},         "rg16"},
    {PL_FMT_UNORM, 4, {8, 8, 8, 8},     "rgba8"},
    {PL_FMT_UNORM, 4, {16, 16, 16, 16}, "rgba16"},
    {PL_FMT_UNORM, 4, {10, 10, 10, 2},  "rgb10_a2"},

    {PL_FMT_SNORM, 1, {8},              "r8_snorm"},
    {PL_FMT_SNORM, 1, {16},             "r16_snorm"},
    {PL_FMT_SNORM, 2, {8, 8},           "rg8_snorm"},
    {PL_FMT_SNORM, 2, {16, 16},         "rg16_snorm"},
    {PL_FMT_SNORM, 4, {8, 8, 8, 8},     "rgba8_snorm"},
    {PL_FMT_SNORM, 4, {16, 16, 16, 16}, "rgba16_snorm"},

    {PL_FMT_UINT,  1, {8},              "r8ui"},
    {PL_FMT_UINT,  1, {16},             "r16ui"},
    {PL_FMT_UINT,  1, {32},             "r32ui"},
    {PL_FMT_UINT,  2, {8, 8},           "rg8ui"},
    {PL_FMT_UINT,  2, {16, 16},         "rg16ui"},
    {PL_FMT_UINT,  2, {32, 32},         "rg32ui"},
    {PL_FMT_UINT,  4, {8, 8, 8, 8},     "rgba8ui"},
    {PL_FMT_UINT,  4, {16, 16, 16, 16}, "rgba16ui"},
    {PL_FMT_UINT,  4, {32, 32, 32, 32}, "rgba32ui"},
    {PL_FMT_UINT,  4, {10, 10, 10, 2},  "rgb10_a2ui"},

    {PL_FMT_SINT,  1, {8},              "r8i"},
    {PL_FMT_SINT,  1, {16},             "r16i"},
    {PL_FMT_SINT,  1, {32},             "r32i"},
    {PL_FMT_SINT,  2, {8, 8},           "rg8i"},
    {PL_FMT_SINT,  2, {16, 16},         "rg16i"},
    {PL_FMT_SINT,  2, {32, 32},         "rg32i"},
    {PL_FMT_SINT,  4, {8, 8, 8, 8},     "rgba8i"},
    {PL_FMT_SINT,  4, {16, 16, 16, 16}, "rgba16i"},
    {PL_FMT_SINT,  4, {32, 32, 32, 32}, "rgba32i"},
};

const char *pl_fmt_glsl_format(pl_fmt fmt, int components)
{
    if (fmt->opaque)
        return NULL;

    for (int n = 0; n < PL_ARRAY_SIZE(pl_glsl_fmts); n++) {
        const struct glsl_fmt *gfmt = &pl_glsl_fmts[n];

        if (fmt->type != gfmt->type)
            continue;
        if (components != gfmt->num_components)
            continue;

        // The component order is irrelevant, so we need to sort the depth
        // based on the component's index
        int depth[4] = {0};
        for (int i = 0; i < fmt->num_components; i++)
            depth[fmt->sample_order[i]] = fmt->component_depth[i];

        // Copy over any emulated components
        for (int i = fmt->num_components; i < components; i++)
            depth[i] = gfmt->depth[i];

        for (int i = 0; i < PL_ARRAY_SIZE(depth); i++) {
            if (depth[i] != gfmt->depth[i])
                goto next_fmt;
        }

        return gfmt->glsl_format;

next_fmt: ; // equivalent to `continue`
    }

    return NULL;
}

#define FOURCC(a,b,c,d) ((uint32_t)(a) | ((uint32_t)(b) << 8) | \
                         ((uint32_t)(c) << 16) | ((uint32_t)(d) << 24))
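// For reference (illustrative, derived from the macro above): the four
// characters are packed little-endian, so e.g. FOURCC('R','8',' ',' ')
// evaluates to 0x20203852, which reads back as "R8  " in a byte-wise dump.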
struct pl_fmt_fourcc {
    const char *name;
    uint32_t fourcc;
};

static const struct pl_fmt_fourcc pl_fmt_fourccs[] = {
    // 8 bpp red
    {"r8",          FOURCC('R','8',' ',' ')},
    // 16 bpp red
    {"r16",         FOURCC('R','1','6',' ')},
    // 16 bpp rg
    {"rg8",         FOURCC('G','R','8','8')},
    {"gr8",         FOURCC('R','G','8','8')},
    // 32 bpp rg
    {"rg16",        FOURCC('G','R','3','2')},
    {"gr16",        FOURCC('R','G','3','2')},
    // 8 bpp rgb: N/A
    // 16 bpp rgb
    {"argb4",       FOURCC('B','A','1','2')},
    {"abgr4",       FOURCC('R','A','1','2')},
    {"rgba4",       FOURCC('A','B','1','2')},
    {"bgra4",       FOURCC('A','R','1','2')},
    {"a1rgb5",      FOURCC('B','A','1','5')},
    {"a1bgr5",      FOURCC('R','A','1','5')},
    {"rgb5a1",      FOURCC('A','B','1','5')},
    {"bgr5a1",      FOURCC('A','R','1','5')},
    {"rgb565",      FOURCC('B','G','1','6')},
    {"bgr565",      FOURCC('R','G','1','6')},
    // 24 bpp rgb
    {"rgb8",        FOURCC('B','G','2','4')},
    {"bgr8",        FOURCC('R','G','2','4')},
    // 32 bpp rgb
    {"argb8",       FOURCC('B','A','2','4')},
    {"abgr8",       FOURCC('R','A','2','4')},
    {"rgba8",       FOURCC('A','B','2','4')},
    {"bgra8",       FOURCC('A','R','2','4')},
    {"a2rgb10",     FOURCC('B','A','3','0')},
    {"a2bgr10",     FOURCC('R','A','3','0')},
    {"rgb10a2",     FOURCC('A','B','3','0')},
    {"bgr10a2",     FOURCC('A','R','3','0')},
    // 64 bpp rgb
    {"rgba16hf",    FOURCC('A','B','4','H')},
    {"bgra16hf",    FOURCC('A','R','4','H')},
    // packed 16-bit formats
    // rx10: N/A
    // rxgx10: N/A
    {"rxgxbxax10",  FOURCC('A','B','1','0')},
    // rx12: N/A
    // rxgx12: N/A
    // rxgxbxax12: N/A
    // planar formats
    {"g8_b8_r8_420",        FOURCC('Y','U','1','2')},
    {"g8_b8_r8_422",        FOURCC('Y','U','1','6')},
    {"g8_b8_r8_444",        FOURCC('Y','U','2','4')},
    // g16_b16_r16_*: N/A
    // gx10_bx10_rx10_42*: N/A
    {"gx10_bx10_rx10_444",  FOURCC('Q','4','1','0')},
    // gx12_bx12_rx12_*: N/A
    {"g8_br8_420",          FOURCC('N','V','1','2')},
    {"g8_br8_422",          FOURCC('N','V','1','6')},
    {"g8_br8_444",          FOURCC('N','V','2','4')},
    {"g16_br16_420",        FOURCC('P','0','1','6')},
    // g16_br16_422: N/A
    // g16_br16_444: N/A
    {"gx10_bxrx10_420",     FOURCC('P','0','1','0')},
    {"gx10_bxrx10_422",     FOURCC('P','2','1','0')},
    // gx10_bxrx10_444: N/A
    {"gx12_bxrx12_420",     FOURCC('P','0','1','2')},
    // gx12_bxrx12_422: N/A
    // gx12_bxrx12_444: N/A
};

uint32_t pl_fmt_fourcc(pl_fmt fmt)
{
    for (int n = 0; n < PL_ARRAY_SIZE(pl_fmt_fourccs); n++) {
        const struct pl_fmt_fourcc *fourcc = &pl_fmt_fourccs[n];
        if (strcmp(fmt->name, fourcc->name) == 0)
            return fourcc->fourcc;
    }

    return 0; // no matching format
}

size_t pl_tex_transfer_size(const struct pl_tex_transfer_params *par)
{
    int w = pl_rect_w(par->rc), h = pl_rect_h(par->rc), d = pl_rect_d(par->rc);
    size_t pixel_pitch = par->tex->params.format->texel_size;

    // This generates the absolute bare minimum size of a buffer required to
    // hold the data of a texture upload/download, by including stride padding
    // only where strictly necessary.
    return (d - 1) * par->depth_pitch + (h - 1) * par->row_pitch + w * pixel_pitch;
}
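/*
 * Worked example for pl_tex_transfer_size() above (illustrative values only):
 * transferring a 256x128 2D region of an rgba8 texture (texel_size = 4) with
 * row_pitch = 1024 needs
 *
 *     (128 - 1) * 1024 + 256 * 4 = 131072 bytes
 *
 * i.e. the final row only contributes its used pixels, not the full pitch.
 */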
int pl_tex_transfer_slices(pl_gpu gpu, pl_fmt texel_fmt,
                           const struct pl_tex_transfer_params *params,
                           struct pl_tex_transfer_params **out_slices)
{
    PL_ARRAY(struct pl_tex_transfer_params) slices = {0};
    size_t max_size = params->buf ? gpu->limits.max_buf_size : SIZE_MAX;
    pl_fmt fmt = params->tex->params.format;
    if (fmt->emulated && texel_fmt) {
        size_t max_texel = gpu->limits.max_buffer_texels * texel_fmt->texel_size;
        max_size = PL_MIN(gpu->limits.max_ssbo_size, max_texel);
    }

    int slice_w = pl_rect_w(params->rc);
    int slice_h = pl_rect_h(params->rc);
    int slice_d = pl_rect_d(params->rc);

    slice_d = PL_MIN(slice_d, max_size / params->depth_pitch);
    if (!slice_d) {
        slice_d = 1;
        slice_h = PL_MIN(slice_h, max_size / params->row_pitch);
        if (!slice_h) {
            slice_h = 1;
            slice_w = PL_MIN(slice_w, max_size / fmt->texel_size);
            pl_assert(slice_w);
        }
    }

    for (int z = 0; z < pl_rect_d(params->rc); z += slice_d) {
        for (int y = 0; y < pl_rect_h(params->rc); y += slice_h) {
            for (int x = 0; x < pl_rect_w(params->rc); x += slice_w) {
                struct pl_tex_transfer_params slice = *params;
                slice.callback = NULL;
                slice.rc.x0 = params->rc.x0 + x;
                slice.rc.y0 = params->rc.y0 + y;
                slice.rc.z0 = params->rc.z0 + z;
                slice.rc.x1 = PL_MIN(slice.rc.x0 + slice_w, params->rc.x1);
                slice.rc.y1 = PL_MIN(slice.rc.y0 + slice_h, params->rc.y1);
                slice.rc.z1 = PL_MIN(slice.rc.z0 + slice_d, params->rc.z1);

                const size_t offset = z * params->depth_pitch +
                                      y * params->row_pitch +
                                      x * fmt->texel_size;
                if (slice.ptr) {
                    slice.ptr = (uint8_t *) slice.ptr + offset;
                } else {
                    slice.buf_offset += offset;
                }

                PL_ARRAY_APPEND(NULL, slices, slice);
            }
        }
    }

    *out_slices = slices.elem;
    return slices.num;
}
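/*
 * Worked example for the slicing logic above (hypothetical limits): a
 * 1024x1024 rgba8 transfer with row_pitch = 4096 and depth_pitch = 4 MiB
 * against a 1 MiB buffer cap clamps slice_d to 1 (4 MiB > 1 MiB), then
 * slice_h to 1 MiB / 4096 = 256, so the transfer is split into four
 * 1024x256 slices, each offset by y * row_pitch into the source data.
 */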
bool pl_tex_upload_pbo(pl_gpu gpu, const struct pl_tex_transfer_params *params)
{
    if (params->buf)
        return pl_tex_upload(gpu, params);

    struct pl_buf_params bufparams = {
        .size = pl_tex_transfer_size(params),
        .debug_tag = PL_DEBUG_TAG,
    };

    struct pl_tex_transfer_params fixed = *params;
    fixed.ptr = NULL;

    // If we can import host pointers directly, and the function is being used
    // asynchronously, then we can use host pointer import to skip a memcpy. In
    // the synchronous case, we still force a host memcpy to avoid stalling the
    // host until the GPU memcpy completes.
    bool can_import = gpu->import_caps.buf & PL_HANDLE_HOST_PTR;
    can_import &= !params->no_import;
    can_import &= params->callback != NULL;
    can_import &= bufparams.size > (32 << 10); // 32 KiB

    if (can_import) {
        bufparams.import_handle = PL_HANDLE_HOST_PTR;
        bufparams.shared_mem = (struct pl_shared_mem) {
            .handle.ptr = params->ptr,
            .size = bufparams.size,
            .offset = 0,
        };

        // Suppress errors for this test because it may fail, in which case we
        // want to silently fall back.
        pl_log_level_cap(gpu->log, PL_LOG_DEBUG);
        fixed.buf = pl_buf_create(gpu, &bufparams);
        pl_log_level_cap(gpu->log, PL_LOG_NONE);
    }

    if (!fixed.buf) {
        bufparams.import_handle = 0;
        bufparams.host_writable = true;
        fixed.buf = pl_buf_create(gpu, &bufparams);
        if (!fixed.buf)
            return false;

        pl_buf_write(gpu, fixed.buf, 0, params->ptr, bufparams.size);
        if (params->callback)
            params->callback(params->priv);
        fixed.callback = NULL;
    }

    bool ok = pl_tex_upload(gpu, &fixed);
    pl_buf_destroy(gpu, &fixed.buf);
    return ok;
}
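/*
 * Illustrative sketch only (not code from this file): a backend would
 * typically forward host-pointer transfers to pl_tex_upload_pbo() above and
 * pl_tex_download_pbo() below from its own tex_upload / tex_download
 * implementations, e.g.
 *
 *     if (params->ptr)
 *         return pl_tex_upload_pbo(gpu, params);
 *
 * Note that only asynchronous uploads (params->callback set) larger than
 * 32 KiB can take the zero-copy host pointer import path above.
 */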
struct pbo_cb_ctx {
    pl_gpu gpu;
    pl_buf buf;
    void *ptr;
    void (*callback)(void *priv);
    void *priv;
};

static void pbo_download_cb(void *priv)
{
    struct pbo_cb_ctx *p = priv;
    pl_buf_read(p->gpu, p->buf, 0, p->ptr, p->buf->params.size);
    pl_buf_destroy(p->gpu, &p->buf);

    // Run the original callback
    p->callback(p->priv);
    pl_free(priv);
}

bool pl_tex_download_pbo(pl_gpu gpu, const struct pl_tex_transfer_params *params)
{
    if (params->buf)
        return pl_tex_download(gpu, params);

    pl_buf buf = NULL;
    struct pl_buf_params bufparams = {
        .size = pl_tex_transfer_size(params),
        .debug_tag = PL_DEBUG_TAG,
    };

    // If we can import host pointers directly, we can avoid an extra memcpy
    // (sometimes). In the cases where it isn't avoidable, the extra memcpy
    // will happen inside VRAM, which is typically faster anyway.
    bool can_import = gpu->import_caps.buf & PL_HANDLE_HOST_PTR;
    can_import &= !params->no_import;
    can_import &= bufparams.size > (32 << 10); // 32 KiB

    if (can_import) {
        bufparams.import_handle = PL_HANDLE_HOST_PTR;
        bufparams.shared_mem = (struct pl_shared_mem) {
            .handle.ptr = params->ptr,
            .size = bufparams.size,
            .offset = 0,
        };

        // Suppress errors for this test because it may fail, in which case we
        // want to silently fall back.
        pl_log_level_cap(gpu->log, PL_LOG_DEBUG);
        buf = pl_buf_create(gpu, &bufparams);
        pl_log_level_cap(gpu->log, PL_LOG_NONE);
    }

    if (!buf) {
        // Fallback when host pointer import is not supported
        bufparams.import_handle = 0;
        bufparams.host_readable = true;
        buf = pl_buf_create(gpu, &bufparams);
    }

    if (!buf)
        return false;

    struct pl_tex_transfer_params newparams = *params;
    newparams.ptr = NULL;
    newparams.buf = buf;

    // If the transfer is asynchronous, propagate our host read asynchronously
    if (params->callback && !bufparams.import_handle) {
        newparams.callback = pbo_download_cb;
        newparams.priv = pl_alloc_struct(NULL, struct pbo_cb_ctx, {
            .gpu = gpu,
            .buf = buf,
            .ptr = params->ptr,
            .callback = params->callback,
            .priv = params->priv,
        });
    }

    if (!pl_tex_download(gpu, &newparams)) {
        pl_buf_destroy(gpu, &buf);
        return false;
    }

    if (!params->callback) {
        while (pl_buf_poll(gpu, buf, 10000000)) // 10 ms
            PL_TRACE(gpu, "pl_tex_download: synchronous/blocking (slow path)");
    }

    bool ok;
    if (bufparams.import_handle) {
        // Buffer download completion already means the host pointer contains
        // the valid data, no more need to copy. (Note: this applies even for
        // asynchronous downloads)
        ok = true;
        pl_buf_destroy(gpu, &buf);
    } else if (!params->callback) {
        // Synchronous read back to the host pointer
        ok = pl_buf_read(gpu, buf, 0, params->ptr, bufparams.size);
        pl_buf_destroy(gpu, &buf);
    } else {
        // Nothing left to do here, the rest will be done by pbo_download_cb
        ok = true;
    }

    return ok;
}

bool pl_tex_upload_texel(pl_gpu gpu, const struct pl_tex_transfer_params *params)
{
    const int threads = PL_MIN(256, pl_rect_w(params->rc));
    pl_tex tex = params->tex;
    pl_fmt fmt = tex->params.format;
    pl_require(gpu, params->buf);

    pl_dispatch dp = pl_gpu_dispatch(gpu);
    pl_shader sh = pl_dispatch_begin(dp);
    if (!sh_try_compute(sh, threads, 1, false, 0)) {
        PL_ERR(gpu, "Failed emulating texture transfer!");
        pl_dispatch_abort(dp, &sh);
        return false;
    }

    ident_t buf = sh_desc(sh, (struct pl_shader_desc) {
        .binding.object = params->buf,
        .desc = {
            .name = "data",
            .type = PL_DESC_BUF_TEXEL_STORAGE,
        },
    });

    ident_t img = sh_desc(sh, (struct pl_shader_desc) {
        .binding.object = params->tex,
        .desc = {
            .name = "image",
            .type = PL_DESC_STORAGE_IMG,
            .access = PL_DESC_ACCESS_WRITEONLY,
        },
    });

    // If the transfer width is a natural multiple of the thread size, we
    // can skip the bounds check. Otherwise, make sure we aren't blitting out
    // of the range since this would read out of bounds.
    int groups_x = PL_DIV_UP(pl_rect_w(params->rc), threads);
    if (groups_x * threads != pl_rect_w(params->rc)) {
        GLSL("if (gl_GlobalInvocationID.x >= %d) \n"
             " return; \n",
             pl_rect_w(params->rc));
    }

    // fmt->texel_align contains the size of an individual color value
    assert(fmt->texel_size == fmt->num_components * fmt->texel_align);
    GLSL("vec4 color = vec4(0.0, 0.0, 0.0, 1.0); \n"
         "ivec3 pos = ivec3(gl_GlobalInvocationID); \n"
         "ivec3 tex_pos = pos + ivec3("$", "$", "$"); \n"
         "int base = "$" + pos.z * "$" + pos.y * "$" + pos.x * "$"; \n",
         SH_INT_DYN(params->rc.x0),
         SH_INT_DYN(params->rc.y0),
         SH_INT_DYN(params->rc.z0),
         SH_INT_DYN(params->buf_offset),
         SH_INT(params->depth_pitch / fmt->texel_align),
         SH_INT(params->row_pitch / fmt->texel_align),
         SH_INT(fmt->texel_size / fmt->texel_align));

    for (int i = 0; i < fmt->num_components; i++)
        GLSL("color[%d] = imageLoad("$", base + %d).r; \n", i, buf, i);

    int dims = pl_tex_params_dimension(tex->params);
    static const char *coord_types[] = {
        [1] = "int",
        [2] = "ivec2",
        [3] = "ivec3",
    };

    GLSL("imageStore("$", %s(tex_pos), color);\n", img, coord_types[dims]);

    return pl_dispatch_compute(dp, pl_dispatch_compute_params(
        .shader = &sh,
        .dispatch_size = {
            groups_x,
            pl_rect_h(params->rc),
            pl_rect_d(params->rc),
        },
    ));

error:
    return false;
}
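/*
 * Indexing note for the emulated texel transfers above and below
 * (illustrative): `base` counts buffer texels of `texel_align` bytes each,
 * so e.g. a 4-component 16-bit format (texel_align = 2, texel_size = 8)
 * advances `base` by texel_size / texel_align = 4 texels per pixel in x,
 * with component i stored at `base + i`.
 */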
bool pl_tex_download_texel(pl_gpu gpu, const struct pl_tex_transfer_params *params)
{
    const int threads = PL_MIN(256, pl_rect_w(params->rc));
    pl_tex tex = params->tex;
    pl_fmt fmt = tex->params.format;
    pl_require(gpu, params->buf);

    pl_dispatch dp = pl_gpu_dispatch(gpu);
    pl_shader sh = pl_dispatch_begin(dp);
    if (!sh_try_compute(sh, threads, 1, false, 0)) {
        PL_ERR(gpu, "Failed emulating texture transfer!");
        pl_dispatch_abort(dp, &sh);
        return false;
    }

    ident_t buf = sh_desc(sh, (struct pl_shader_desc) {
        .binding.object = params->buf,
        .desc = {
            .name = "data",
            .type = PL_DESC_BUF_TEXEL_STORAGE,
        },
    });

    ident_t img = sh_desc(sh, (struct pl_shader_desc) {
        .binding.object = params->tex,
        .desc = {
            .name = "image",
            .type = PL_DESC_STORAGE_IMG,
            .access = PL_DESC_ACCESS_READONLY,
        },
    });

    int groups_x = PL_DIV_UP(pl_rect_w(params->rc), threads);
    if (groups_x * threads != pl_rect_w(params->rc)) {
        GLSL("if (gl_GlobalInvocationID.x >= %d) \n"
             " return; \n",
             pl_rect_w(params->rc));
    }

    int dims = pl_tex_params_dimension(tex->params);
    static const char *coord_types[] = {
        [1] = "int",
        [2] = "ivec2",
        [3] = "ivec3",
    };

    assert(fmt->texel_size == fmt->num_components * fmt->texel_align);
    GLSL("ivec3 pos = ivec3(gl_GlobalInvocationID); \n"
         "ivec3 tex_pos = pos + ivec3("$", "$", "$"); \n"
         "int base = "$" + pos.z * "$" + pos.y * "$" + pos.x * "$"; \n"
         "vec4 color = imageLoad("$", %s(tex_pos)); \n",
         SH_INT_DYN(params->rc.x0),
         SH_INT_DYN(params->rc.y0),
         SH_INT_DYN(params->rc.z0),
         SH_INT_DYN(params->buf_offset),
         SH_INT(params->depth_pitch / fmt->texel_align),
         SH_INT(params->row_pitch / fmt->texel_align),
         SH_INT(fmt->texel_size / fmt->texel_align),
         img, coord_types[dims]);

    for (int i = 0; i < fmt->num_components; i++)
        GLSL("imageStore("$", base + %d, vec4(color[%d])); \n", buf, i, i);

    return pl_dispatch_compute(dp, pl_dispatch_compute_params(
        .shader = &sh,
        .dispatch_size = {
            groups_x,
            pl_rect_h(params->rc),
            pl_rect_d(params->rc),
        },
    ));

error:
    return false;
}
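/*
 * Illustrative note on the rectangle normalization below: a flipped blit such
 * as dst_rc = {.x0 = 64, .x1 = 0} with src_rc = {.x0 = 0, .x1 = 64} is first
 * rewritten as dst_rc = {0, 64} and src_rc = {64, 0}, so that only src_rc
 * carries the flip; the non-sampling path then reads with a -1 step in x.
 */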
bool pl_tex_blit_compute(pl_gpu gpu, const struct pl_tex_blit_params *params)
{
    if (!params->dst->params.storable)
        return false;

    // Normalize `dst_rc`, moving all flipping to `src_rc` instead.
    pl_rect3d src_rc = params->src_rc;
    pl_rect3d dst_rc = params->dst_rc;
    if (pl_rect_w(dst_rc) < 0) {
        PL_SWAP(src_rc.x0, src_rc.x1);
        PL_SWAP(dst_rc.x0, dst_rc.x1);
    }
    if (pl_rect_h(dst_rc) < 0) {
        PL_SWAP(src_rc.y0, src_rc.y1);
        PL_SWAP(dst_rc.y0, dst_rc.y1);
    }
    if (pl_rect_d(dst_rc) < 0) {
        PL_SWAP(src_rc.z0, src_rc.z1);
        PL_SWAP(dst_rc.z0, dst_rc.z1);
    }

    bool needs_scaling = false;
    needs_scaling |= pl_rect_w(dst_rc) != abs(pl_rect_w(src_rc));
    needs_scaling |= pl_rect_h(dst_rc) != abs(pl_rect_h(src_rc));
    needs_scaling |= pl_rect_d(dst_rc) != abs(pl_rect_d(src_rc));

    // Exception: fast path for 1-pixel blits, which don't require scaling
    bool is_1pixel = abs(pl_rect_w(src_rc)) == 1 && abs(pl_rect_h(src_rc)) == 1;
    needs_scaling &= !is_1pixel;

    // Manual trilinear interpolation would be too slow to justify
    bool needs_sampling = needs_scaling && params->sample_mode != PL_TEX_SAMPLE_NEAREST;
    needs_sampling |= !params->src->params.storable;
    if (needs_sampling && !params->src->params.sampleable)
        return false;

    const int threads = 256;
    int bw = PL_MIN(32, pl_rect_w(dst_rc));
    int bh = PL_MIN(threads / bw, pl_rect_h(dst_rc));
    pl_dispatch dp = pl_gpu_dispatch(gpu);
    pl_shader sh = pl_dispatch_begin(dp);
    if (!sh_try_compute(sh, bw, bh, false, 0)) {
        pl_dispatch_abort(dp, &sh);
        return false;
    }

    // Avoid over-writing into `dst`
    int groups_x = PL_DIV_UP(pl_rect_w(dst_rc), bw);
    if (groups_x * bw != pl_rect_w(dst_rc)) {
        GLSL("if (gl_GlobalInvocationID.x >= %d) \n"
             " return; \n",
             pl_rect_w(dst_rc));
    }

    int groups_y = PL_DIV_UP(pl_rect_h(dst_rc), bh);
    if (groups_y * bh != pl_rect_h(dst_rc)) {
        GLSL("if (gl_GlobalInvocationID.y >= %d) \n"
             " return; \n",
             pl_rect_h(dst_rc));
    }

    ident_t dst = sh_desc(sh, (struct pl_shader_desc) {
        .binding.object = params->dst,
        .desc = {
            .name = "dst",
            .type = PL_DESC_STORAGE_IMG,
            .access = PL_DESC_ACCESS_WRITEONLY,
        },
    });

    static const char *vecs[] = {
        [1] = "float",
        [2] = "vec2",
        [3] = "vec3",
        [4] = "vec4",
    };

    static const char *ivecs[] = {
        [1] = "int",
        [2] = "ivec2",
        [3] = "ivec3",
        [4] = "ivec4",
    };

    int src_dims = pl_tex_params_dimension(params->src->params);
    int dst_dims = pl_tex_params_dimension(params->dst->params);
    GLSL("ivec3 pos = ivec3(gl_GlobalInvocationID); \n"
         "%s dst_pos = %s(pos + ivec3(%d, %d, %d)); \n",
         ivecs[dst_dims], ivecs[dst_dims],
         params->dst_rc.x0, params->dst_rc.y0, params->dst_rc.z0);

    if (needs_sampling || (needs_scaling && params->src->params.sampleable)) {
        ident_t src = sh_desc(sh, (struct pl_shader_desc) {
            .desc = {
                .name = "src",
                .type = PL_DESC_SAMPLED_TEX,
            },
            .binding = {
                .object = params->src,
                .address_mode = PL_TEX_ADDRESS_CLAMP,
                .sample_mode = params->sample_mode,
            }
        });

        if (is_1pixel) {
            GLSL("%s fpos = %s(0.5); \n", vecs[src_dims], vecs[src_dims]);
        } else {
            GLSL("vec3 fpos = (vec3(pos) + vec3(0.5)) / vec3(%d.0, %d.0, %d.0); \n",
                 pl_rect_w(dst_rc), pl_rect_h(dst_rc), pl_rect_d(dst_rc));
        }

        GLSL("%s src_pos = %s(0.5); \n"
             "src_pos.x = mix(%f, %f, fpos.x); \n",
             vecs[src_dims], vecs[src_dims],
             (float) src_rc.x0 / params->src->params.w,
             (float) src_rc.x1 / params->src->params.w);

        if (params->src->params.h) {
            GLSL("src_pos.y = mix(%f, %f, fpos.y); \n",
                 (float) src_rc.y0 / params->src->params.h,
                 (float) src_rc.y1 / params->src->params.h);
        }

        if (params->src->params.d) {
            GLSL("src_pos.z = mix(%f, %f, fpos.z); \n",
                 (float) src_rc.z0 / params->src->params.d,
                 (float) src_rc.z1 / params->src->params.d);
        }

        GLSL("imageStore("$", dst_pos, textureLod("$", src_pos, 0.0)); \n",
             dst, src);
    } else {
        ident_t src = sh_desc(sh, (struct pl_shader_desc) {
            .binding.object = params->src,
            .desc = {
                .name = "src",
                .type = PL_DESC_STORAGE_IMG,
                .access = PL_DESC_ACCESS_READONLY,
            },
        });

        if (is_1pixel) {
            GLSL("ivec3 src_pos = ivec3(0); \n");
        } else if (needs_scaling) {
            GLSL("ivec3 src_pos = ivec3(vec3(%f, %f, %f) * vec3(pos)); \n",
                 fabs((float) pl_rect_w(src_rc) / pl_rect_w(dst_rc)),
                 fabs((float) pl_rect_h(src_rc) / pl_rect_h(dst_rc)),
                 fabs((float) pl_rect_d(src_rc) / pl_rect_d(dst_rc)));
        } else {
            GLSL("ivec3 src_pos = pos; \n");
        }
        GLSL("src_pos = ivec3(%d, %d, %d) * src_pos + ivec3(%d, %d, %d); \n"
             "imageStore("$", dst_pos, imageLoad("$", %s(src_pos))); \n",
             src_rc.x1 < src_rc.x0 ? -1 : 1,
             src_rc.y1 < src_rc.y0 ? -1 : 1,
             src_rc.z1 < src_rc.z0 ? -1 : 1,
             src_rc.x0, src_rc.y0, src_rc.z0,
             dst, src, ivecs[src_dims]);
    }

    return pl_dispatch_compute(dp, pl_dispatch_compute_params(
        .shader = &sh,
        .dispatch_size = {
            groups_x,
            groups_y,
            pl_rect_d(dst_rc),
        },
    ));
}

void pl_tex_blit_raster(pl_gpu gpu, const struct pl_tex_blit_params *params)
{
    enum pl_fmt_type src_type = params->src->params.format->type;
    enum pl_fmt_type dst_type = params->dst->params.format->type;

    // Only for 2D textures
    pl_assert(params->src->params.h && !params->src->params.d);
    pl_assert(params->dst->params.h && !params->dst->params.d);

    // Integer textures are not supported
    pl_assert(src_type != PL_FMT_UINT && src_type != PL_FMT_SINT);
    pl_assert(dst_type != PL_FMT_UINT && dst_type != PL_FMT_SINT);

    pl_rect2df src_rc = {
        .x0 = params->src_rc.x0, .x1 = params->src_rc.x1,
        .y0 = params->src_rc.y0, .y1 = params->src_rc.y1,
    };
    pl_rect2d dst_rc = {
        .x0 = params->dst_rc.x0, .x1 = params->dst_rc.x1,
        .y0 = params->dst_rc.y0, .y1 = params->dst_rc.y1,
    };

    pl_dispatch dp = pl_gpu_dispatch(gpu);
    pl_shader sh = pl_dispatch_begin(dp);
    sh->output = PL_SHADER_SIG_COLOR;

    ident_t pos, src = sh_bind(sh, params->src, PL_TEX_ADDRESS_CLAMP,
                               params->sample_mode, "src_tex", &src_rc, &pos, NULL);

    GLSL("vec4 color = textureLod("$", "$", 0.0); \n", src, pos);

    pl_dispatch_finish(dp, pl_dispatch_params(
        .shader = &sh,
        .target = params->dst,
        .rect = dst_rc,
    ));
}
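/*
 * Worked example for the swapping logic below (illustrative): with
 * wordsize == 2, the 32-bit word 0xAABBCCDD becomes 0xBBAADDCC (bytes
 * swapped within each 16-bit half); with wordsize == 4, the additional
 * 16-bit rotation turns that into 0xDDCCBBAA, a full 32-bit byte reversal.
 */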
bool pl_buf_copy_swap(pl_gpu gpu, const struct pl_buf_copy_swap_params *params)
{
    pl_buf src = params->src, dst = params->dst;
    pl_require(gpu, src->params.storable && dst->params.storable);
    pl_require(gpu, params->src_offset % sizeof(unsigned) == 0);
    pl_require(gpu, params->dst_offset % sizeof(unsigned) == 0);
    pl_require(gpu, params->src_offset + params->size <= src->params.size);
    pl_require(gpu, params->dst_offset + params->size <= dst->params.size);
    pl_require(gpu, src != dst || params->src_offset == params->dst_offset);
    pl_require(gpu, params->size % sizeof(unsigned) == 0);
    pl_require(gpu, params->wordsize == sizeof(uint16_t) ||
                    params->wordsize == sizeof(uint32_t));

    const size_t words = params->size / sizeof(unsigned);
    const size_t src_off = params->src_offset / sizeof(unsigned);
    const size_t dst_off = params->dst_offset / sizeof(unsigned);

    const int threads = PL_MIN(256, words);
    pl_dispatch dp = pl_gpu_dispatch(gpu);
    pl_shader sh = pl_dispatch_begin(dp);
    if (!sh_try_compute(sh, threads, 1, false, 0)) {
        pl_dispatch_abort(dp, &sh);
        return false;
    }

    const size_t groups = PL_DIV_UP(words, threads);
    if (groups * threads > words) {
        GLSL("if (gl_GlobalInvocationID.x >= %zu) \n"
             " return; \n",
             words);
    }

    sh_desc(sh, (struct pl_shader_desc) {
        .binding.object = src,
        .desc = {
            .name = "SrcBuf",
            .type = PL_DESC_BUF_STORAGE,
            .access = src == dst ? PL_DESC_ACCESS_READWRITE : PL_DESC_ACCESS_READONLY,
        },
        .num_buffer_vars = 1,
        .buffer_vars = &(struct pl_buffer_var) {
            .var = {
                .name = "src",
                .type = PL_VAR_UINT,
                .dim_v = 1,
                .dim_m = 1,
                .dim_a = src_off + words,
            },
        },
    });

    if (src != dst) {
        sh_desc(sh, (struct pl_shader_desc) {
            .binding.object = dst,
            .desc = {
                .name = "DstBuf",
                .type = PL_DESC_BUF_STORAGE,
                .access = PL_DESC_ACCESS_WRITEONLY,
            },
            .num_buffer_vars = 1,
            .buffer_vars = &(struct pl_buffer_var) {
                .var = {
                    .name = "dst",
                    .type = PL_VAR_UINT,
                    .dim_v = 1,
                    .dim_m = 1,
                    .dim_a = dst_off + words,
                },
            },
        });
    } else {
        GLSL("#define dst src \n");
    }

    GLSL("// pl_buf_copy_swap \n"
         "{ \n"
         "uint word = src["$" + gl_GlobalInvocationID.x]; \n"
         "word = (word & 0xFF00FF00u) >> 8 | \n"
         " (word & 0x00FF00FFu) << 8; \n",
         SH_UINT(src_off));

    if (params->wordsize > 2) {
        GLSL("word = (word & 0xFFFF0000u) >> 16 | \n"
             " (word & 0x0000FFFFu) << 16; \n");
    }

    GLSL("dst["$" + gl_GlobalInvocationID.x] = word; \n"
         "} \n",
         SH_UINT(dst_off));

    return pl_dispatch_compute(dp, pl_dispatch_compute_params(
        .shader = &sh,
        .dispatch_size = {groups, 1, 1},
    ));

error:
    if (src->params.debug_tag || dst->params.debug_tag) {
        PL_ERR(gpu, " for buffers: src %s, dst %s",
               src->params.debug_tag, dst->params.debug_tag);
    }
    return false;
}

void pl_pass_run_vbo(pl_gpu gpu, const struct pl_pass_run_params *params)
{
    if (!params->vertex_data && !params->index_data)
        return pl_pass_run(gpu, params);

    struct pl_pass_run_params newparams = *params;
    pl_buf vert = NULL, index = NULL;

    if (params->vertex_data) {
        vert = pl_buf_create(gpu, pl_buf_params(
            .size = pl_vertex_buf_size(params),
            .initial_data = params->vertex_data,
            .drawable = true,
        ));

        if (!vert) {
            PL_ERR(gpu, "Failed allocating vertex buffer!");
            return;
        }

        newparams.vertex_buf = vert;
        newparams.vertex_data = NULL;
    }

    if (params->index_data) {
        index = pl_buf_create(gpu, pl_buf_params(
            .size = pl_index_buf_size(params),
            .initial_data = params->index_data,
            .drawable = true,
        ));

        if (!index) {
            PL_ERR(gpu, "Failed allocating index buffer!");
            return;
        }

        newparams.index_buf = index;
        newparams.index_data = NULL;
    }

    pl_pass_run(gpu, &newparams);
    pl_buf_destroy(gpu, &vert);
    pl_buf_destroy(gpu, &index);
}

struct pl_pass_params pl_pass_params_copy(void *alloc, const struct pl_pass_params *params)
{
    struct pl_pass_params new = *params;
    new.glsl_shader = pl_str0dup0(alloc, new.glsl_shader);
    new.vertex_shader = pl_str0dup0(alloc, new.vertex_shader);
    if (new.blend_params)
        new.blend_params = pl_memdup_ptr(alloc, new.blend_params);

#define DUPNAMES(field)                                                 \
    do {                                                                \
        size_t _size = new.num_##field * sizeof(new.field[0]);          \
        new.field = pl_memdup(alloc, new.field, _size);                 \
        for (int j = 0; j < new.num_##field; j++)                       \
            new.field[j].name = pl_str0dup0(alloc, new.field[j].name);  \
    } while (0)

    DUPNAMES(variables);
    DUPNAMES(descriptors);
    DUPNAMES(vertex_attribs);

#undef DUPNAMES

    new.constant_data = NULL;
    new.constants = pl_memdup(alloc, new.constants,
                              new.num_constants * sizeof(new.constants[0]));

    return new;
}
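/*
 * Worked example for the helper below (illustrative): an indexed draw with
 * uint16 index_data = {0, 2, 5, 5, 2, 7} references vertex 7 at most, so the
 * vertex buffer must hold (7 + 1) * vertex_stride bytes, regardless of
 * vertex_count (which counts indices for indexed draws).
 */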
size_t pl_vertex_buf_size(const struct pl_pass_run_params *params)
{
    if (!params->index_data)
        return params->vertex_count * params->pass->params.vertex_stride;

    int num_vertices = 0;
    const void *idx = params->index_data;
    switch (params->index_fmt) {
    case PL_INDEX_UINT16:
        for (int i = 0; i < params->vertex_count; i++)
            num_vertices = PL_MAX(num_vertices, ((const uint16_t *) idx)[i]);
        break;
    case PL_INDEX_UINT32:
        for (int i = 0; i < params->vertex_count; i++)
            num_vertices = PL_MAX(num_vertices, ((const uint32_t *) idx)[i]);
        break;
    case PL_INDEX_FORMAT_COUNT:
        pl_unreachable();
    }

    return (num_vertices + 1) * params->pass->params.vertex_stride;
}

const char *print_uuid(char buf[3 * UUID_SIZE], const uint8_t uuid[UUID_SIZE])
{
    static const char *hexdigits = "0123456789ABCDEF";
    for (int i = 0; i < UUID_SIZE; i++) {
        uint8_t x = uuid[i];
        buf[3 * i + 0] = hexdigits[x >> 4];
        buf[3 * i + 1] = hexdigits[x & 0xF];
        buf[3 * i + 2] = i == UUID_SIZE - 1 ? '\0' : ':';
    }

    return buf;
}

const char *print_drm_mod(char buf[DRM_MOD_SIZE], uint64_t mod)
{
    switch (mod) {
    case DRM_FORMAT_MOD_LINEAR: return "LINEAR";
    case DRM_FORMAT_MOD_INVALID: return "INVALID";
    }

    uint8_t vendor = mod >> 56;
    uint64_t val = mod & ((1ULL << 56) - 1);

    const char *name = NULL;
    switch (vendor) {
    case 0x00: name = "NONE";    break;
    case 0x01: name = "INTEL";   break;
    case 0x02: name = "AMD";     break;
    case 0x03: name = "NVIDIA";  break;
    case 0x04: name = "SAMSUNG"; break;
    case 0x08: name = "ARM";     break;
    }

    if (name) {
        snprintf(buf, DRM_MOD_SIZE, "%s 0x%"PRIx64, name, val);
    } else {
        snprintf(buf, DRM_MOD_SIZE, "0x%02x 0x%"PRIx64, vendor, val);
    }

    return buf;
}
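/*
 * Example outputs of the helpers above (illustrative): print_uuid() renders
 * each byte as two uppercase hex digits separated by colons, e.g.
 * "00:11:22:...:FF"; print_drm_mod() splits the modifier into vendor and
 * value, so 0x0100000000000001 prints as "INTEL 0x1".
 */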