summaryrefslogtreecommitdiffstats
path: root/src/d3d11/gpu.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/d3d11/gpu.c')
-rw-r--r--src/d3d11/gpu.c685
1 files changed, 685 insertions, 0 deletions
diff --git a/src/d3d11/gpu.c b/src/d3d11/gpu.c
new file mode 100644
index 0000000..05a08a3
--- /dev/null
+++ b/src/d3d11/gpu.c
@@ -0,0 +1,685 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <initguid.h>
+#include <windows.h>
+#include <versionhelpers.h>
+
+#include "common.h"
+#include "gpu.h"
+#include "formats.h"
+#include "glsl/spirv.h"
+
+#define DXGI_ADAPTER_FLAG3_SUPPORT_MONITORED_FENCES (0x8)
+
+struct timer_query { // One duration measurement; its three queries are created together in pl_d3d11_timer_start()
+ ID3D11Query *ts_start; // Timestamp query ended by pl_d3d11_timer_start()
+ ID3D11Query *ts_end; // Timestamp query ended by pl_d3d11_timer_end()
+ ID3D11Query *disjoint; // Begun in timer_start, ended in timer_end; yields the Disjoint flag and Frequency
+};
+
+struct pl_timer_t {
+ // Ring buffer of timer queries, filled by timer_start/timer_end and drained by d3d11_timer_query()
+ int current; // Index of the slot used by the next timer_start/timer_end pair
+ int pending; // Number of finished slots behind `current` that have not been reported yet
+ struct timer_query queries[16];
+};
+
+// Begin a timed section: opens the disjoint query of the timer's current ring
+// buffer slot and records the start timestamp. Does nothing if `timer` is
+// NULL. If the slot's query objects cannot be created, the slot is left empty
+// (all three pointers released) so that pl_d3d11_timer_end() can detect it.
+void pl_d3d11_timer_start(pl_gpu gpu, pl_timer timer)
+{
+    if (!timer)
+        return;
+
+    struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+    struct d3d11_ctx *ctx = p->ctx;
+    struct timer_query *query = &timer->queries[timer->current];
+
+    // Query objects are created lazily, the first time a slot is used
+    if (!query->ts_start) {
+        D3D(ID3D11Device_CreateQuery(p->dev,
+            &(D3D11_QUERY_DESC) { D3D11_QUERY_TIMESTAMP }, &query->ts_start));
+        D3D(ID3D11Device_CreateQuery(p->dev,
+            &(D3D11_QUERY_DESC) { D3D11_QUERY_TIMESTAMP }, &query->ts_end));
+
+        // A duration measurement in D3D11 needs three queries: the start and
+        // end timestamps, plus a disjoint query. The disjoint query reports
+        // whether a discontinuity (e.g. a change in power state or clock
+        // speed) made the timestamps unusable, and it also carries the timer
+        // frequency, without which the timestamps are meaningless.
+        D3D(ID3D11Device_CreateQuery(p->dev,
+            &(D3D11_QUERY_DESC) { D3D11_QUERY_TIMESTAMP_DISJOINT }, &query->disjoint));
+    }
+
+    // Open the disjoint bracket first, then record the start timestamp
+    ID3D11DeviceContext_Begin(p->imm, (ID3D11Asynchronous *) query->disjoint);
+    ID3D11DeviceContext_End(p->imm, (ID3D11Asynchronous *) query->ts_start);
+    return;
+
+error:
+    // Drop whatever was created so the slot reads as "not initialized"
+    SAFE_RELEASE(query->ts_start);
+    SAFE_RELEASE(query->ts_end);
+    SAFE_RELEASE(query->disjoint);
+}
+
+// End a timed section: records the end timestamp, closes the disjoint query
+// and moves the ring buffer on to its next slot. Does nothing if `timer` is
+// NULL or if the current slot has no query objects.
+void pl_d3d11_timer_end(pl_gpu gpu, pl_timer timer)
+{
+    if (!timer)
+        return;
+
+    // timer_start may have failed to create the query objects, even when
+    // timer_start and timer_end are called in the right order
+    struct timer_query *slot = &timer->queries[timer->current];
+    if (!slot->ts_start)
+        return;
+
+    struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+    const int num_slots = PL_ARRAY_SIZE(timer->queries);
+
+    // Record the end timestamp and close the disjoint bracket
+    ID3D11DeviceContext_End(p->imm, (ID3D11Asynchronous *) slot->ts_end);
+    ID3D11DeviceContext_End(p->imm, (ID3D11Asynchronous *) slot->disjoint);
+
+    // Step to the slot the next timer_start will use, wrapping at the end
+    timer->current = (timer->current + 1) % num_slots;
+
+    // One more result is pending, saturating at the ring size: once the ring
+    // is full, timer->current points at the oldest entry, which is dropped
+    // and reused
+    timer->pending = PL_MIN(timer->pending + 1, num_slots);
+}
+
+// Convert a tick count to nanoseconds, given the tick frequency in Hz. Whole
+// seconds and the sub-second remainder are scaled separately so the
+// multiplication by 1e9 is applied to smaller values. `freq` must be non-zero.
+static uint64_t timestamp_to_ns(uint64_t timestamp, uint64_t freq)
+{
+    const uint64_t ns_per_s = 1000000000llu;
+    const uint64_t whole_seconds = timestamp / freq;
+    const uint64_t leftover_ticks = timestamp % freq;
+
+    return whole_seconds * ns_per_s + leftover_ticks * ns_per_s / freq;
+}
+
+// Return the oldest available timer result in nanoseconds (at least 1), or 0
+// if the oldest pending result is not ready yet or nothing is pending.
+// Results that hit a discontinuity or whose readback failed are dropped
+// silently and the next pending slot is tried.
+static uint64_t d3d11_timer_query(pl_gpu gpu, pl_timer timer)
+{
+    struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+    struct d3d11_ctx *ctx = p->ctx;
+    const int num_slots = PL_ARRAY_SIZE(timer->queries);
+    HRESULT hr;
+
+    while (timer->pending > 0) {
+        // The oldest pending slot sits `pending` entries behind `current`
+        const int index =
+            (timer->current - timer->pending + num_slots) % num_slots;
+        struct timer_query *query = &timer->queries[index];
+
+        D3D11_QUERY_DATA_TIMESTAMP_DISJOINT dj;
+        UINT64 start, end;
+
+        // Read back all three queries. S_FALSE means the data is not
+        // available yet, so report 0 and leave the slot pending.
+        D3D(hr = ID3D11DeviceContext_GetData(p->imm,
+            (ID3D11Asynchronous *) query->disjoint, &dj, sizeof(dj),
+            D3D11_ASYNC_GETDATA_DONOTFLUSH));
+        if (hr == S_FALSE)
+            return 0;
+        D3D(hr = ID3D11DeviceContext_GetData(p->imm,
+            (ID3D11Asynchronous *) query->ts_end, &end, sizeof(end),
+            D3D11_ASYNC_GETDATA_DONOTFLUSH));
+        if (hr == S_FALSE)
+            return 0;
+        D3D(hr = ID3D11DeviceContext_GetData(p->imm,
+            (ID3D11Asynchronous *) query->ts_start, &start, sizeof(start),
+            D3D11_ASYNC_GETDATA_DONOTFLUSH));
+        if (hr == S_FALSE)
+            return 0;
+
+        if (!dj.Disjoint && dj.Frequency) {
+            // Usable result: consume the slot and hand it to the caller
+            timer->pending--;
+            pl_d3d11_flush_message_queue(ctx, "After timer query");
+
+            uint64_t ns = timestamp_to_ns(end - start, dj.Frequency);
+            return PL_MAX(ns, 1);
+        }
+
+        // A discontinuity occurred during the measurement, so no duration
+        // can be produced from it; fall through and drop the slot
+
+    error:
+        // Also reached when fetching a result failed: skip this slot and
+        // try the next one
+        timer->pending--;
+    }
+
+    // No more unprocessed results
+    return 0;
+}
+
+// Release every query object owned by `timer`, then free the timer itself.
+static void d3d11_timer_destroy(pl_gpu gpu, pl_timer timer)
+{
+    struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+    struct d3d11_ctx *ctx = p->ctx;
+
+    struct timer_query *slot = timer->queries;
+    struct timer_query *const last = slot + PL_ARRAY_SIZE(timer->queries);
+    for (; slot < last; slot++) {
+        SAFE_RELEASE(slot->ts_start);
+        SAFE_RELEASE(slot->ts_end);
+        SAFE_RELEASE(slot->disjoint);
+    }
+
+    pl_d3d11_flush_message_queue(ctx, "After timer destroy");
+
+    pl_free(timer);
+}
+
+// Allocate a zero-initialized timer, or return NULL when the device was found
+// not to support timestamp queries. Query objects are not created here; see
+// pl_d3d11_timer_start().
+static pl_timer d3d11_timer_create(pl_gpu gpu)
+{
+    struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+    struct pl_timer_t *timer = NULL;
+
+    if (p->has_timestamp_queries) {
+        timer = pl_alloc_ptr(NULL, timer);
+        *timer = (struct pl_timer_t) {0};
+    }
+
+    return timer;
+}
+
+// Report the binding namespace for a descriptor type. Every type maps to
+// namespace 0: SPIRV-Cross' HLSL resource mapping API targets resources by
+// binding number, which requires Vulkan-style binding where all descriptors
+// share one namespace.
+static int d3d11_desc_namespace(pl_gpu gpu, enum pl_desc_type type)
+{
+    const int shared_namespace = 0;
+    return shared_namespace;
+}
+
+// Flush the immediate context, then drain the D3D11 message queue.
+static void d3d11_gpu_flush(pl_gpu gpu)
+{
+    struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+
+    ID3D11DeviceContext_Flush(p->imm);
+    pl_d3d11_flush_message_queue(p->ctx, "After gpu flush");
+}
+
+// Block until all previously submitted GPU work has completed.
+//
+// With a D3D11.4 fence available, this signals the fence from the immediate
+// context and waits on an event tied to the new fence value. Otherwise it
+// falls back to an event query, preceded by a dummy buffer copy/readback that
+// hopefully blocks, and polls the query until it reports completion. Any D3D
+// failure aborts the wait early.
+static void d3d11_gpu_finish(pl_gpu gpu)
+{
+    struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+    struct d3d11_ctx *ctx = p->ctx;
+    HRESULT hr;
+
+    if (p->finish_fence) {
+        p->finish_value++;
+        D3D(ID3D11Fence_SetEventOnCompletion(p->finish_fence, p->finish_value,
+                                             p->finish_event));
+        // The signal must be checked: if it fails, the fence never reaches
+        // finish_value and the infinite wait below would never return
+        D3D(ID3D11DeviceContext4_Signal(p->imm4, p->finish_fence, p->finish_value));
+        ID3D11DeviceContext_Flush(p->imm);
+        WaitForSingleObject(p->finish_event, INFINITE);
+    } else {
+        ID3D11DeviceContext_End(p->imm, (ID3D11Asynchronous *) p->finish_query);
+
+        // D3D11 doesn't have blocking queries, but it does have blocking
+        // readback. As a performance hack to try to avoid polling, do a dummy
+        // copy/readback between two buffers. Hopefully this will block until
+        // all prior commands are finished. If it does, the first GetData call
+        // will return a result and we won't have to poll.
+        pl_buf_copy(gpu, p->finish_buf_dst, 0, p->finish_buf_src, 0, sizeof(uint32_t));
+        pl_buf_read(gpu, p->finish_buf_dst, 0, &(uint32_t) {0}, sizeof(uint32_t));
+
+        // Poll the event query until it completes
+        for (;;) {
+            BOOL idle;
+            D3D(hr = ID3D11DeviceContext_GetData(p->imm,
+                (ID3D11Asynchronous *) p->finish_query, &idle, sizeof(idle), 0));
+            if (hr == S_OK && idle)
+                break;
+            Sleep(1);
+        }
+    }
+
+    pl_d3d11_flush_message_queue(ctx, "After gpu finish");
+
+error:
+    return;
+}
+
+// Report whether the device is in a failed state. A failure already recorded
+// on the context is returned directly; otherwise the device is asked for its
+// removal reason, and a failing HRESULT is recorded on the context and passed
+// to pl_d3d11_after_error().
+static bool d3d11_gpu_is_failed(pl_gpu gpu)
+{
+    struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+    struct d3d11_ctx *ctx = p->ctx;
+
+    if (!ctx->is_failed) {
+        // GetDeviceRemovedReason returns S_OK if the device isn't removed
+        HRESULT reason = ID3D11Device_GetDeviceRemovedReason(p->dev);
+        if (FAILED(reason)) {
+            ctx->is_failed = true;
+            pl_d3d11_after_error(ctx, reason);
+        }
+    }
+
+    return ctx->is_failed;
+}
+
+// Tear down a GPU object: destroys the helper buffers, releases every D3D11
+// object held in the private state, closes the finish event, destroys the
+// SPIR-V compiler and frees the GPU allocation. Also used to clean up a
+// partially constructed GPU, so members may still be NULL.
+static void d3d11_gpu_destroy(pl_gpu gpu)
+{
+    struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+
+    pl_buf_destroy(gpu, &p->finish_buf_src);
+    pl_buf_destroy(gpu, &p->finish_buf_dst);
+
+    // Everything but the immediate context is released first
+    SAFE_RELEASE(p->dev);
+    SAFE_RELEASE(p->dev1);
+    SAFE_RELEASE(p->dev5);
+    SAFE_RELEASE(p->imm1);
+    SAFE_RELEASE(p->imm4);
+    SAFE_RELEASE(p->vbuf.buf);
+    SAFE_RELEASE(p->ibuf.buf);
+    SAFE_RELEASE(p->rstate);
+    SAFE_RELEASE(p->dsstate);
+    for (int sample_mode = 0; sample_mode < PL_TEX_SAMPLE_MODE_COUNT; sample_mode++) {
+        for (int address_mode = 0; address_mode < PL_TEX_ADDRESS_MODE_COUNT; address_mode++)
+            SAFE_RELEASE(p->samplers[sample_mode][address_mode]);
+    }
+    SAFE_RELEASE(p->finish_fence);
+    if (p->finish_event)
+        CloseHandle(p->finish_event);
+    SAFE_RELEASE(p->finish_query);
+
+    // The immediate context goes last and is torn down synchronously, so
+    // objects it still references don't show up in the leak check
+    if (p->imm) {
+        ID3D11DeviceContext_ClearState(p->imm);
+        ID3D11DeviceContext_Flush(p->imm);
+        SAFE_RELEASE(p->imm);
+    }
+
+    pl_spirv_destroy(&p->spirv);
+    pl_free((void *) gpu);
+}
+
+// Return the pl_d3d11 that owns `gpu`, or NULL if `gpu` was not created by
+// this backend. The backend is identified by comparing the `destroy` entry of
+// the function table at the start of the GPU's private data.
+pl_d3d11 pl_d3d11_get(pl_gpu gpu)
+{
+    const struct pl_gpu_fns *impl = PL_PRIV(gpu);
+    if (impl->destroy != d3d11_gpu_destroy)
+        return NULL;
+
+    struct pl_gpu_d3d11 *p = (struct pl_gpu_d3d11 *) impl;
+    return p->ctx->d3d11;
+}
+
+// Locate a D3DCompiler DLL and resolve its D3DCompile entry point.
+//
+// Candidates are tried in order of preference. On success, p->D3DCompile and
+// p->d3d_compiler_ver are filled in, the DLL stays loaded, and true is
+// returned. A DLL that loads but does not export D3DCompile is unloaded again
+// and the next candidate is tried. Returns false if no candidate is usable.
+static bool load_d3d_compiler(pl_gpu gpu)
+{
+    struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+
+    static const struct {
+        const wchar_t *name;
+        bool inbox;
+    } compiler_dlls[] = {
+        // Try the inbox D3DCompiler first (Windows 8.1 and up)
+        { .name = L"d3dcompiler_47.dll", .inbox = true },
+        // Check for a packaged version of d3dcompiler_47.dll
+        { .name = L"d3dcompiler_47.dll" },
+        // Try d3dcompiler_46.dll from the Windows 8 SDK
+        { .name = L"d3dcompiler_46.dll" },
+        // Try d3dcompiler_43.dll from the June 2010 DirectX SDK
+        { .name = L"d3dcompiler_43.dll" },
+    };
+
+    for (int i = 0; i < PL_ARRAY_SIZE(compiler_dlls); i++) {
+        HMODULE d3dcompiler = NULL;
+
+        if (compiler_dlls[i].inbox) {
+            // The inbox copy is only loaded from the system directory
+            if (!IsWindows8Point1OrGreater())
+                continue;
+            d3dcompiler = LoadLibraryExW(compiler_dlls[i].name, NULL,
+                                         LOAD_LIBRARY_SEARCH_SYSTEM32);
+        } else {
+            d3dcompiler = LoadLibraryW(compiler_dlls[i].name);
+        }
+        if (!d3dcompiler)
+            continue;
+
+        p->D3DCompile = (void *) GetProcAddress(d3dcompiler, "D3DCompile");
+        if (!p->D3DCompile) {
+            // Unusable DLL: don't leak the module handle, and give the
+            // remaining candidates a chance
+            FreeLibrary(d3dcompiler);
+            continue;
+        }
+
+        p->d3d_compiler_ver = pl_get_dll_version(compiler_dlls[i].name);
+        return true;
+    }
+
+    return false;
+}
+
+// Dispatch table for the D3D11 backend. It is only ever copied into the
+// private state of each GPU (see pl_gpu_create_d3d11()), so the template
+// itself is immutable.
+static const struct pl_gpu_fns pl_fns_d3d11 = {
+    .tex_create = pl_d3d11_tex_create,
+    .tex_destroy = pl_d3d11_tex_destroy,
+    .tex_invalidate = pl_d3d11_tex_invalidate,
+    .tex_clear_ex = pl_d3d11_tex_clear_ex,
+    .tex_blit = pl_d3d11_tex_blit,
+    .tex_upload = pl_d3d11_tex_upload,
+    .tex_download = pl_d3d11_tex_download,
+    .buf_create = pl_d3d11_buf_create,
+    .buf_destroy = pl_d3d11_buf_destroy,
+    .buf_write = pl_d3d11_buf_write,
+    .buf_read = pl_d3d11_buf_read,
+    .buf_copy = pl_d3d11_buf_copy,
+    .desc_namespace = d3d11_desc_namespace,
+    .pass_create = pl_d3d11_pass_create,
+    .pass_destroy = pl_d3d11_pass_destroy,
+    .pass_run = pl_d3d11_pass_run,
+    .timer_create = d3d11_timer_create,
+    .timer_destroy = d3d11_timer_destroy,
+    .timer_query = d3d11_timer_query,
+    .gpu_flush = d3d11_gpu_flush,
+    .gpu_finish = d3d11_gpu_finish,
+    .gpu_is_failed = d3d11_gpu_is_failed,
+    .destroy = d3d11_gpu_destroy,
+};
+
+// Create a pl_gpu on top of the D3D11 device in `ctx->dev`.
+//
+// The GPU takes its own reference on the device and fetches the immediate
+// context; both are released again by d3d11_gpu_destroy(). Limits and GLSL
+// capabilities are derived from the device's feature level, a D3DCompiler DLL
+// is loaded, and the fixed pipeline state objects, samplers and the resources
+// used by d3d11_gpu_finish() are created.
+//
+// Returns the finalized GPU, or NULL on failure, in which case everything
+// created so far has been destroyed.
+pl_gpu pl_gpu_create_d3d11(struct d3d11_ctx *ctx)
+{
+    pl_assert(ctx->dev);
+    IDXGIDevice1 *dxgi_dev = NULL;
+    IDXGIAdapter1 *adapter = NULL;
+    IDXGIAdapter4 *adapter4 = NULL;
+    bool success = false;
+    HRESULT hr;
+
+    struct pl_gpu_t *gpu = pl_zalloc_obj(NULL, gpu, struct pl_gpu_d3d11);
+    gpu->log = ctx->log;
+
+    struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+    uint32_t spirv_ver = PL_MIN(SPV_VERSION, PL_MAX_SPIRV_VER);
+    *p = (struct pl_gpu_d3d11) {
+        .ctx = ctx,
+        .impl = pl_fns_d3d11,
+        .dev = ctx->dev,
+        .spirv = pl_spirv_create(ctx->log, (struct pl_spirv_version) {
+            .env_version = pl_spirv_version_to_vulkan(spirv_ver),
+            .spv_version = spirv_ver,
+        }),
+        .vbuf.bind_flags = D3D11_BIND_VERTEX_BUFFER,
+        .ibuf.bind_flags = D3D11_BIND_INDEX_BUFFER,
+    };
+
+    // Take our reference on the device before anything can jump to the error
+    // path: d3d11_gpu_destroy() releases p->dev unconditionally, so bailing
+    // out earlier would drop a reference that belongs to the caller
+    ID3D11Device_AddRef(p->dev);
+
+    if (!p->spirv)
+        goto error;
+
+    ID3D11Device_GetImmediateContext(p->dev, &p->imm);
+
+    // Check D3D11.1 interfaces
+    hr = ID3D11Device_QueryInterface(p->dev, &IID_ID3D11Device1,
+                                     (void **) &p->dev1);
+    if (SUCCEEDED(hr)) {
+        p->minor = 1;
+        ID3D11Device1_GetImmediateContext1(p->dev1, &p->imm1);
+    }
+
+    // Check D3D11.4 interfaces
+    hr = ID3D11Device_QueryInterface(p->dev, &IID_ID3D11Device5,
+                                     (void **) &p->dev5);
+    if (SUCCEEDED(hr)) {
+        // There is no GetImmediateContext4 method
+        hr = ID3D11DeviceContext_QueryInterface(p->imm, &IID_ID3D11DeviceContext4,
+                                                (void **) &p->imm4);
+        if (SUCCEEDED(hr))
+            p->minor = 4;
+    }
+
+    PL_INFO(gpu, "Using Direct3D 11.%d runtime", p->minor);
+
+    D3D(ID3D11Device_QueryInterface(p->dev, &IID_IDXGIDevice1, (void **) &dxgi_dev));
+    D3D(IDXGIDevice1_GetParent(dxgi_dev, &IID_IDXGIAdapter1, (void **) &adapter));
+
+    DXGI_ADAPTER_DESC1 adapter_desc = {0};
+    IDXGIAdapter1_GetDesc1(adapter, &adapter_desc);
+
+    // No resource can be larger than max_res_size in bytes
+    unsigned int max_res_size = PL_CLAMP(
+        D3D11_REQ_RESOURCE_SIZE_IN_MEGABYTES_EXPRESSION_B_TERM * adapter_desc.DedicatedVideoMemory,
+        D3D11_REQ_RESOURCE_SIZE_IN_MEGABYTES_EXPRESSION_A_TERM * 1024u * 1024u,
+        D3D11_REQ_RESOURCE_SIZE_IN_MEGABYTES_EXPRESSION_C_TERM * 1024u * 1024u);
+
+    gpu->glsl = (struct pl_glsl_version) {
+        .version = 450,
+        .vulkan = true,
+    };
+
+    gpu->limits = (struct pl_gpu_limits) {
+        .max_buf_size = max_res_size,
+        .max_ssbo_size = max_res_size,
+        .max_vbo_size = max_res_size,
+        .align_vertex_stride = 1,
+
+        // Make up some values
+        .align_tex_xfer_offset = 32,
+        .align_tex_xfer_pitch = 1,
+        .fragment_queues = 1,
+    };
+
+    p->fl = ID3D11Device_GetFeatureLevel(p->dev);
+
+    // If we're not using FL9_x, we can use the same suballocated buffer as a
+    // vertex buffer and index buffer
+    if (p->fl >= D3D_FEATURE_LEVEL_10_0)
+        p->vbuf.bind_flags |= D3D11_BIND_INDEX_BUFFER;
+
+    if (p->fl >= D3D_FEATURE_LEVEL_10_0) {
+        gpu->limits.max_ubo_size = D3D11_REQ_CONSTANT_BUFFER_ELEMENT_COUNT * CBUF_ELEM;
+    } else {
+        // 10level9 restriction:
+        // https://docs.microsoft.com/en-us/windows/win32/direct3d11/d3d11-graphics-reference-10level9-context
+        gpu->limits.max_ubo_size = 255 * CBUF_ELEM;
+    }
+
+    if (p->fl >= D3D_FEATURE_LEVEL_11_0) {
+        gpu->limits.max_tex_1d_dim = D3D11_REQ_TEXTURE1D_U_DIMENSION;
+        gpu->limits.max_tex_2d_dim = D3D11_REQ_TEXTURE2D_U_OR_V_DIMENSION;
+        gpu->limits.max_tex_3d_dim = D3D11_REQ_TEXTURE3D_U_V_OR_W_DIMENSION;
+    } else if (p->fl >= D3D_FEATURE_LEVEL_10_0) {
+        gpu->limits.max_tex_1d_dim = D3D10_REQ_TEXTURE1D_U_DIMENSION;
+        gpu->limits.max_tex_2d_dim = D3D10_REQ_TEXTURE2D_U_OR_V_DIMENSION;
+        gpu->limits.max_tex_3d_dim = D3D10_REQ_TEXTURE3D_U_V_OR_W_DIMENSION;
+    } else if (p->fl >= D3D_FEATURE_LEVEL_9_3) {
+        gpu->limits.max_tex_2d_dim = D3D_FL9_3_REQ_TEXTURE2D_U_OR_V_DIMENSION;
+        // Same limit as FL9_1
+        gpu->limits.max_tex_3d_dim = D3D_FL9_1_REQ_TEXTURE3D_U_V_OR_W_DIMENSION;
+    } else {
+        gpu->limits.max_tex_2d_dim = D3D_FL9_1_REQ_TEXTURE2D_U_OR_V_DIMENSION;
+        gpu->limits.max_tex_3d_dim = D3D_FL9_1_REQ_TEXTURE3D_U_V_OR_W_DIMENSION;
+    }
+
+    if (p->fl >= D3D_FEATURE_LEVEL_10_0) {
+        gpu->limits.max_buffer_texels =
+            1 << D3D11_REQ_BUFFER_RESOURCE_TEXEL_COUNT_2_TO_EXP;
+    }
+
+    if (p->fl >= D3D_FEATURE_LEVEL_11_0) {
+        gpu->glsl.compute = true;
+        gpu->limits.compute_queues = 1;
+        // Set `gpu->limits.blittable_1d_3d`, since `pl_tex_blit_compute`, which
+        // is used to emulate blits on 11_0 and up, supports 1D and 3D textures
+        gpu->limits.blittable_1d_3d = true;
+
+        gpu->glsl.max_shmem_size = D3D11_CS_TGSM_REGISTER_COUNT * sizeof(float);
+        gpu->glsl.max_group_threads = D3D11_CS_THREAD_GROUP_MAX_THREADS_PER_GROUP;
+        gpu->glsl.max_group_size[0] = D3D11_CS_THREAD_GROUP_MAX_X;
+        gpu->glsl.max_group_size[1] = D3D11_CS_THREAD_GROUP_MAX_Y;
+        gpu->glsl.max_group_size[2] = D3D11_CS_THREAD_GROUP_MAX_Z;
+        gpu->limits.max_dispatch[0] = gpu->limits.max_dispatch[1] =
+            gpu->limits.max_dispatch[2] =
+            D3D11_CS_DISPATCH_MAX_THREAD_GROUPS_PER_DIMENSION;
+    }
+
+    if (p->fl >= D3D_FEATURE_LEVEL_11_0) {
+        // The offset limits are defined by HLSL:
+        // https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/gather4-po--sm5---asm-
+        gpu->glsl.min_gather_offset = -32;
+        gpu->glsl.max_gather_offset = 31;
+    } else if (p->fl >= D3D_FEATURE_LEVEL_10_1) {
+        // SM4.1 has no gather4_po, so the offset must be specified by an
+        // immediate with a range of [-8, 7]
+        // https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/gather4--sm4-1---asm-
+        // https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/sample--sm4---asm-#address-offset
+        gpu->glsl.min_gather_offset = -8;
+        gpu->glsl.max_gather_offset = 7;
+    }
+
+    if (p->fl >= D3D_FEATURE_LEVEL_10_0) {
+        p->max_srvs = D3D11_COMMONSHADER_INPUT_RESOURCE_SLOT_COUNT;
+    } else {
+        // 10level9 restriction:
+        // https://docs.microsoft.com/en-us/windows/win32/direct3d11/d3d11-graphics-reference-10level9-context
+        p->max_srvs = 8;
+    }
+
+    if (p->fl >= D3D_FEATURE_LEVEL_11_1) {
+        p->max_uavs = D3D11_1_UAV_SLOT_COUNT;
+    } else {
+        p->max_uavs = D3D11_PS_CS_UAV_REGISTER_COUNT;
+    }
+
+    if (!load_d3d_compiler(gpu)) {
+        PL_FATAL(gpu, "Could not find D3DCompiler DLL");
+        goto error;
+    }
+    PL_INFO(gpu, "D3DCompiler version: %u.%u.%u.%u",
+            p->d3d_compiler_ver.major, p->d3d_compiler_ver.minor,
+            p->d3d_compiler_ver.build, p->d3d_compiler_ver.revision);
+
+    // Detect support for timestamp queries. Some FL9_x devices don't support them.
+    hr = ID3D11Device_CreateQuery(p->dev,
+        &(D3D11_QUERY_DESC) { D3D11_QUERY_TIMESTAMP }, NULL);
+    p->has_timestamp_queries = SUCCEEDED(hr);
+
+    pl_d3d11_setup_formats(gpu);
+
+    // The rasterizer state never changes, so create it here
+    D3D11_RASTERIZER_DESC rdesc = {
+        .FillMode = D3D11_FILL_SOLID,
+        .CullMode = D3D11_CULL_NONE,
+        .FrontCounterClockwise = FALSE,
+        .DepthClipEnable = TRUE, // Required for 10level9
+        .ScissorEnable = TRUE,
+    };
+    D3D(ID3D11Device_CreateRasterizerState(p->dev, &rdesc, &p->rstate));
+
+    // The depth stencil state never changes either, and we only set it to turn
+    // depth testing off so the debug layer doesn't complain about an unbound
+    // depth buffer
+    D3D11_DEPTH_STENCIL_DESC dsdesc = {
+        .DepthEnable = FALSE,
+        .DepthWriteMask = D3D11_DEPTH_WRITE_MASK_ALL,
+        .DepthFunc = D3D11_COMPARISON_LESS,
+        .StencilReadMask = D3D11_DEFAULT_STENCIL_READ_MASK,
+        .StencilWriteMask = D3D11_DEFAULT_STENCIL_WRITE_MASK,
+        .FrontFace = {
+            .StencilFailOp = D3D11_STENCIL_OP_KEEP,
+            .StencilDepthFailOp = D3D11_STENCIL_OP_KEEP,
+            .StencilPassOp = D3D11_STENCIL_OP_KEEP,
+            .StencilFunc = D3D11_COMPARISON_ALWAYS,
+        },
+        .BackFace = {
+            .StencilFailOp = D3D11_STENCIL_OP_KEEP,
+            .StencilDepthFailOp = D3D11_STENCIL_OP_KEEP,
+            .StencilPassOp = D3D11_STENCIL_OP_KEEP,
+            .StencilFunc = D3D11_COMPARISON_ALWAYS,
+        },
+    };
+    D3D(ID3D11Device_CreateDepthStencilState(p->dev, &dsdesc, &p->dsstate));
+
+    // Initialize the samplers
+    for (int sample_mode = 0; sample_mode < PL_TEX_SAMPLE_MODE_COUNT; sample_mode++) {
+        for (int address_mode = 0; address_mode < PL_TEX_ADDRESS_MODE_COUNT; address_mode++) {
+            static const D3D11_TEXTURE_ADDRESS_MODE d3d_address_mode[] = {
+                [PL_TEX_ADDRESS_CLAMP] = D3D11_TEXTURE_ADDRESS_CLAMP,
+                [PL_TEX_ADDRESS_REPEAT] = D3D11_TEXTURE_ADDRESS_WRAP,
+                [PL_TEX_ADDRESS_MIRROR] = D3D11_TEXTURE_ADDRESS_MIRROR,
+            };
+            static const D3D11_FILTER d3d_filter[] = {
+                [PL_TEX_SAMPLE_NEAREST] = D3D11_FILTER_MIN_MAG_MIP_POINT,
+                [PL_TEX_SAMPLE_LINEAR] = D3D11_FILTER_MIN_MAG_MIP_LINEAR,
+            };
+
+            D3D11_SAMPLER_DESC sdesc = {
+                .AddressU = d3d_address_mode[address_mode],
+                .AddressV = d3d_address_mode[address_mode],
+                .AddressW = d3d_address_mode[address_mode],
+                .ComparisonFunc = D3D11_COMPARISON_NEVER,
+                .MinLOD = 0,
+                .MaxLOD = D3D11_FLOAT32_MAX,
+                .MaxAnisotropy = 1,
+                .Filter = d3d_filter[sample_mode],
+            };
+            D3D(ID3D11Device_CreateSamplerState(p->dev, &sdesc,
+                &p->samplers[sample_mode][address_mode]));
+        }
+    }
+
+    hr = IDXGIAdapter1_QueryInterface(adapter, &IID_IDXGIAdapter4,
+                                      (void **) &adapter4);
+    if (SUCCEEDED(hr)) {
+        DXGI_ADAPTER_DESC3 adapter_desc3 = {0};
+        IDXGIAdapter4_GetDesc3(adapter4, &adapter_desc3);
+
+        p->has_monitored_fences =
+            adapter_desc3.Flags & DXGI_ADAPTER_FLAG3_SUPPORT_MONITORED_FENCES;
+    }
+
+    // Try to create a D3D11.4 fence object to wait on in pl_gpu_finish(). The
+    // fence is signalled through p->imm4, so that interface must have been
+    // obtained as well; otherwise fall back to the event query below.
+    if (p->dev5 && p->imm4 && p->has_monitored_fences) {
+        hr = ID3D11Device5_CreateFence(p->dev5, 0, D3D11_FENCE_FLAG_NONE,
+                                       &IID_ID3D11Fence,
+                                       (void **) &p->finish_fence);
+        if (SUCCEEDED(hr)) {
+            p->finish_event = CreateEventW(NULL, FALSE, FALSE, NULL);
+            if (!p->finish_event) {
+                PL_ERR(gpu, "Failed to create finish() event");
+                goto error;
+            }
+        }
+    }
+
+    // If fences are not available, we will have to poll an event query instead
+    if (!p->finish_fence) {
+        // Buffers for dummy copy/readback (see d3d11_gpu_finish())
+        p->finish_buf_src = pl_buf_create(gpu, pl_buf_params(
+            .size = sizeof(uint32_t),
+            .drawable = true, // Make these vertex buffers for 10level9
+            .initial_data = &(uint32_t) {0x11223344},
+        ));
+        p->finish_buf_dst = pl_buf_create(gpu, pl_buf_params(
+            .size = sizeof(uint32_t),
+            .host_readable = true,
+            .drawable = true,
+        ));
+
+        // d3d11_gpu_finish() passes both buffers to pl_buf_copy/pl_buf_read
+        // unconditionally, so they must exist
+        if (!p->finish_buf_src || !p->finish_buf_dst) {
+            PL_ERR(gpu, "Failed to create finish() buffers");
+            goto error;
+        }
+
+        D3D(ID3D11Device_CreateQuery(p->dev,
+            &(D3D11_QUERY_DESC) { D3D11_QUERY_EVENT }, &p->finish_query));
+    }
+
+    pl_d3d11_flush_message_queue(ctx, "After gpu create");
+
+    success = true;
+error:
+    SAFE_RELEASE(dxgi_dev);
+    SAFE_RELEASE(adapter);
+    SAFE_RELEASE(adapter4);
+    if (success) {
+        return pl_gpu_finalize(gpu);
+    } else {
+        d3d11_gpu_destroy(gpu);
+        return NULL;
+    }
+}