From ff6e3c025658a5fa1affd094f220b623e7e1b24b Mon Sep 17 00:00:00 2001
From: Daniel Baumann
Date: Mon, 15 Apr 2024 22:38:23 +0200
Subject: Adding upstream version 6.338.2.

Signed-off-by: Daniel Baumann
---
 src/d3d11/gpu.c | 685 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 685 insertions(+)
 create mode 100644 src/d3d11/gpu.c

diff --git a/src/d3d11/gpu.c b/src/d3d11/gpu.c
new file mode 100644
index 0000000..05a08a3
--- /dev/null
+++ b/src/d3d11/gpu.c
@@ -0,0 +1,685 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include
+#include
+#include
+
+#include "common.h"
+#include "gpu.h"
+#include "formats.h"
+#include "glsl/spirv.h"
+
+#define DXGI_ADAPTER_FLAG3_SUPPORT_MONITORED_FENCES (0x8)
+
+struct timer_query {
+    ID3D11Query *ts_start;
+    ID3D11Query *ts_end;
+    ID3D11Query *disjoint;
+};
+
+struct pl_timer_t {
+    // Ring buffer of timer queries to use
+    int current;
+    int pending;
+    struct timer_query queries[16];
+};
+
+void pl_d3d11_timer_start(pl_gpu gpu, pl_timer timer)
+{
+    struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+    struct d3d11_ctx *ctx = p->ctx;
+
+    if (!timer)
+        return;
+    struct timer_query *query = &timer->queries[timer->current];
+
+    // Create the query objects lazily
+    if (!query->ts_start) {
+        D3D(ID3D11Device_CreateQuery(p->dev,
+            &(D3D11_QUERY_DESC) { D3D11_QUERY_TIMESTAMP }, &query->ts_start));
+        D3D(ID3D11Device_CreateQuery(p->dev,
+            &(D3D11_QUERY_DESC) { D3D11_QUERY_TIMESTAMP }, &query->ts_end));
+
+        // Measuring duration in D3D11 requires three queries: start and end
+        // timestamp queries, and a disjoint query containing a flag which says
+        // whether the timestamps are usable or if a discontinuity occurred
+        // between them, like a change in power state or clock speed. The
+        // disjoint query also contains the timer frequency, so the timestamps
+        // are useless without it.
+        D3D(ID3D11Device_CreateQuery(p->dev,
+            &(D3D11_QUERY_DESC) { D3D11_QUERY_TIMESTAMP_DISJOINT }, &query->disjoint));
+    }
+
+    // Query the start timestamp
+    ID3D11DeviceContext_Begin(p->imm, (ID3D11Asynchronous *) query->disjoint);
+    ID3D11DeviceContext_End(p->imm, (ID3D11Asynchronous *) query->ts_start);
+    return;
+
+error:
+    SAFE_RELEASE(query->ts_start);
+    SAFE_RELEASE(query->ts_end);
+    SAFE_RELEASE(query->disjoint);
+}
+
+void pl_d3d11_timer_end(pl_gpu gpu, pl_timer timer)
+{
+    struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+
+    if (!timer)
+        return;
+    struct timer_query *query = &timer->queries[timer->current];
+
+    // Even if timer_start and timer_end are called in-order, timer_start might
+    // have failed to create the timer objects
+    if (!query->ts_start)
+        return;
+
+    // Query the end timestamp
+    ID3D11DeviceContext_End(p->imm, (ID3D11Asynchronous *) query->ts_end);
+    ID3D11DeviceContext_End(p->imm, (ID3D11Asynchronous *) query->disjoint);
+
+    // Advance to the next set of queries, for the next call to timer_start
+    timer->current++;
+    if (timer->current >= PL_ARRAY_SIZE(timer->queries))
+        timer->current = 0; // Wrap around
+
+    // Increment the number of pending queries, unless the ring buffer is full,
+    // in which case, timer->current now points to the oldest one, which will
+    // be dropped and reused
+    if (timer->pending < PL_ARRAY_SIZE(timer->queries))
+        timer->pending++;
+}
+
+static uint64_t timestamp_to_ns(uint64_t timestamp, uint64_t freq)
+{
+    static const uint64_t ns_per_s = 1000000000llu;
+    return timestamp / freq * ns_per_s + timestamp % freq * ns_per_s / freq;
+}
+
+static uint64_t d3d11_timer_query(pl_gpu gpu, pl_timer timer)
+{
+    struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+    struct d3d11_ctx *ctx = p->ctx;
+    HRESULT hr;
+
+    for (; timer->pending > 0; timer->pending--) {
+        int index = timer->current - timer->pending;
+        if (index < 0)
+            index += PL_ARRAY_SIZE(timer->queries);
+        struct timer_query *query = &timer->queries[index];
+
+        UINT64 start, end;
+        D3D11_QUERY_DATA_TIMESTAMP_DISJOINT dj;
+
+        // Fetch the results of each query, or on S_FALSE, return 0 to
+        // indicate the queries are still pending
+        D3D(hr = ID3D11DeviceContext_GetData(p->imm,
+            (ID3D11Asynchronous *) query->disjoint, &dj, sizeof(dj),
+            D3D11_ASYNC_GETDATA_DONOTFLUSH));
+        if (hr == S_FALSE)
+            return 0;
+        D3D(hr = ID3D11DeviceContext_GetData(p->imm,
+            (ID3D11Asynchronous *) query->ts_end, &end, sizeof(end),
+            D3D11_ASYNC_GETDATA_DONOTFLUSH));
+        if (hr == S_FALSE)
+            return 0;
+        D3D(hr = ID3D11DeviceContext_GetData(p->imm,
+            (ID3D11Asynchronous *) query->ts_start, &start, sizeof(start),
+            D3D11_ASYNC_GETDATA_DONOTFLUSH));
+        if (hr == S_FALSE)
+            return 0;
+
+        // There was a discontinuity during the queries, so a timestamp can't
+        // be produced. Skip it and try the next one.
+        if (dj.Disjoint || !dj.Frequency)
+            continue;
+
+        // We got a result. Return it to the caller.
+        timer->pending--;
+        pl_d3d11_flush_message_queue(ctx, "After timer query");
+
+        uint64_t ns = timestamp_to_ns(end - start, dj.Frequency);
+        return PL_MAX(ns, 1);
+
+    error:
+        // There was an error fetching the timer result, so skip it and try
+        // the next one
+        continue;
+    }
+
+    // No more unprocessed results
+    return 0;
+}
+
+static void d3d11_timer_destroy(pl_gpu gpu, pl_timer timer)
+{
+    struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+    struct d3d11_ctx *ctx = p->ctx;
+
+    for (int i = 0; i < PL_ARRAY_SIZE(timer->queries); i++) {
+        SAFE_RELEASE(timer->queries[i].ts_start);
+        SAFE_RELEASE(timer->queries[i].ts_end);
+        SAFE_RELEASE(timer->queries[i].disjoint);
+    }
+
+    pl_d3d11_flush_message_queue(ctx, "After timer destroy");
+
+    pl_free(timer);
+}
+
+static pl_timer d3d11_timer_create(pl_gpu gpu)
+{
+    struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+    if (!p->has_timestamp_queries)
+        return NULL;
+
+    struct pl_timer_t *timer = pl_alloc_ptr(NULL, timer);
+    *timer = (struct pl_timer_t) {0};
+    return timer;
+}
+
+static int d3d11_desc_namespace(pl_gpu gpu, enum pl_desc_type type)
+{
+    // Vulkan-style binding, where all descriptors are in the same namespace,
+    // is required to use SPIRV-Cross' HLSL resource mapping API, which
+    // targets resources by binding number
+    return 0;
+}
+
+static void d3d11_gpu_flush(pl_gpu gpu)
+{
+    struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+    struct d3d11_ctx *ctx = p->ctx;
+    ID3D11DeviceContext_Flush(p->imm);
+
+    pl_d3d11_flush_message_queue(ctx, "After gpu flush");
+}
+
+static void d3d11_gpu_finish(pl_gpu gpu)
+{
+    struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+    struct d3d11_ctx *ctx = p->ctx;
+    HRESULT hr;
+
+    if (p->finish_fence) {
+        p->finish_value++;
+        D3D(ID3D11Fence_SetEventOnCompletion(p->finish_fence, p->finish_value,
+                                             p->finish_event));
+        ID3D11DeviceContext4_Signal(p->imm4, p->finish_fence, p->finish_value);
+        ID3D11DeviceContext_Flush(p->imm);
+        WaitForSingleObject(p->finish_event, INFINITE);
+    } else {
+        ID3D11DeviceContext_End(p->imm, (ID3D11Asynchronous *) p->finish_query);
+
+        // D3D11 doesn't have blocking queries, but it does have blocking
+        // readback. As a performance hack to try to avoid polling, do a dummy
+        // copy/readback between two buffers. Hopefully this will block until
+        // all prior commands are finished. If it does, the first GetData call
+        // will return a result and we won't have to poll.
+        pl_buf_copy(gpu, p->finish_buf_dst, 0, p->finish_buf_src, 0,
+                    sizeof(uint32_t));
+        pl_buf_read(gpu, p->finish_buf_dst, 0, &(uint32_t) {0}, sizeof(uint32_t));
+
+        // Poll the event query until it completes
+        for (;;) {
+            BOOL idle;
+            D3D(hr = ID3D11DeviceContext_GetData(p->imm,
+                (ID3D11Asynchronous *) p->finish_query, &idle, sizeof(idle), 0));
+            if (hr == S_OK && idle)
+                break;
+            Sleep(1);
+        }
+    }
+
+    pl_d3d11_flush_message_queue(ctx, "After gpu finish");
+
+error:
+    return;
+}
+
+static bool d3d11_gpu_is_failed(pl_gpu gpu)
+{
+    struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+    struct d3d11_ctx *ctx = p->ctx;
+
+    if (ctx->is_failed)
+        return true;
+
+    // GetDeviceRemovedReason returns S_OK if the device isn't removed
+    HRESULT hr = ID3D11Device_GetDeviceRemovedReason(p->dev);
+    if (FAILED(hr)) {
+        ctx->is_failed = true;
+        pl_d3d11_after_error(ctx, hr);
+    }
+
+    return ctx->is_failed;
+}
+
+static void d3d11_gpu_destroy(pl_gpu gpu)
+{
+    struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+
+    pl_buf_destroy(gpu, &p->finish_buf_src);
+    pl_buf_destroy(gpu, &p->finish_buf_dst);
+
+    // Release everything except the immediate context
+    SAFE_RELEASE(p->dev);
+    SAFE_RELEASE(p->dev1);
+    SAFE_RELEASE(p->dev5);
+    SAFE_RELEASE(p->imm1);
+    SAFE_RELEASE(p->imm4);
+    SAFE_RELEASE(p->vbuf.buf);
+    SAFE_RELEASE(p->ibuf.buf);
+    SAFE_RELEASE(p->rstate);
+    SAFE_RELEASE(p->dsstate);
+    for (int i = 0; i < PL_TEX_SAMPLE_MODE_COUNT; i++) {
+        for (int j = 0; j < PL_TEX_ADDRESS_MODE_COUNT; j++) {
+            SAFE_RELEASE(p->samplers[i][j]);
+        }
+    }
+    SAFE_RELEASE(p->finish_fence);
+    if (p->finish_event)
+        CloseHandle(p->finish_event);
+    SAFE_RELEASE(p->finish_query);
+
+    // Destroy the immediate context synchronously so referenced objects don't
+    // show up in the leak check
+    if (p->imm) {
+        ID3D11DeviceContext_ClearState(p->imm);
+        ID3D11DeviceContext_Flush(p->imm);
+        SAFE_RELEASE(p->imm);
+    }
+
+    pl_spirv_destroy(&p->spirv);
+    pl_free((void *) gpu);
+}
+
+pl_d3d11 pl_d3d11_get(pl_gpu gpu)
+{
+    const struct pl_gpu_fns *impl = PL_PRIV(gpu);
+    if (impl->destroy == d3d11_gpu_destroy) {
+        struct pl_gpu_d3d11 *p = (struct pl_gpu_d3d11 *) impl;
+        return p->ctx->d3d11;
+    }
+
+    return NULL;
+}
+
+static bool load_d3d_compiler(pl_gpu gpu)
+{
+    struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+    HMODULE d3dcompiler = NULL;
+
+    static const struct {
+        const wchar_t *name;
+        bool inbox;
+    } compiler_dlls[] = {
+        // Try the inbox D3DCompiler first (Windows 8.1 and up)
+        { .name = L"d3dcompiler_47.dll", .inbox = true },
+        // Check for a packaged version of d3dcompiler_47.dll
+        { .name = L"d3dcompiler_47.dll" },
+        // Try d3dcompiler_46.dll from the Windows 8 SDK
+        { .name = L"d3dcompiler_46.dll" },
+        // Try d3dcompiler_43.dll from the June 2010 DirectX SDK
+        { .name = L"d3dcompiler_43.dll" },
+    };
+
+    for (int i = 0; i < PL_ARRAY_SIZE(compiler_dlls); i++) {
+        if (compiler_dlls[i].inbox) {
+            if (!IsWindows8Point1OrGreater())
+                continue;
+            d3dcompiler = LoadLibraryExW(compiler_dlls[i].name, NULL,
+                                         LOAD_LIBRARY_SEARCH_SYSTEM32);
+        } else {
+            d3dcompiler = LoadLibraryW(compiler_dlls[i].name);
+        }
+        if (!d3dcompiler)
+            continue;
+
+        p->D3DCompile = (void *) GetProcAddress(d3dcompiler, "D3DCompile");
+        if (!p->D3DCompile)
+            return false;
+        p->d3d_compiler_ver = pl_get_dll_version(compiler_dlls[i].name);
+
+        return true;
+    }
+
+    return false;
+}
+
+static struct pl_gpu_fns pl_fns_d3d11 = {
+    .tex_create = pl_d3d11_tex_create,
+    .tex_destroy = pl_d3d11_tex_destroy,
+    .tex_invalidate = pl_d3d11_tex_invalidate,
+    .tex_clear_ex = pl_d3d11_tex_clear_ex,
+    .tex_blit = pl_d3d11_tex_blit,
+    .tex_upload = pl_d3d11_tex_upload,
+    .tex_download = pl_d3d11_tex_download,
+    .buf_create = pl_d3d11_buf_create,
+    .buf_destroy = pl_d3d11_buf_destroy,
+    .buf_write = pl_d3d11_buf_write,
+    .buf_read = pl_d3d11_buf_read,
+    .buf_copy = pl_d3d11_buf_copy,
+    .desc_namespace = d3d11_desc_namespace,
+    .pass_create = pl_d3d11_pass_create,
+    .pass_destroy = pl_d3d11_pass_destroy,
+    .pass_run = pl_d3d11_pass_run,
+    .timer_create = d3d11_timer_create,
+    .timer_destroy = d3d11_timer_destroy,
+    .timer_query = d3d11_timer_query,
+    .gpu_flush = d3d11_gpu_flush,
+    .gpu_finish = d3d11_gpu_finish,
+    .gpu_is_failed = d3d11_gpu_is_failed,
+    .destroy = d3d11_gpu_destroy,
+};
+
+pl_gpu pl_gpu_create_d3d11(struct d3d11_ctx *ctx)
+{
+    pl_assert(ctx->dev);
+    IDXGIDevice1 *dxgi_dev = NULL;
+    IDXGIAdapter1 *adapter = NULL;
+    IDXGIAdapter4 *adapter4 = NULL;
+    bool success = false;
+    HRESULT hr;
+
+    struct pl_gpu_t *gpu = pl_zalloc_obj(NULL, gpu, struct pl_gpu_d3d11);
+    gpu->log = ctx->log;
+
+    struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+    uint32_t spirv_ver = PL_MIN(SPV_VERSION, PL_MAX_SPIRV_VER);
+    *p = (struct pl_gpu_d3d11) {
+        .ctx = ctx,
+        .impl = pl_fns_d3d11,
+        .dev = ctx->dev,
+        .spirv = pl_spirv_create(ctx->log, (struct pl_spirv_version) {
+            .env_version = pl_spirv_version_to_vulkan(spirv_ver),
+            .spv_version = spirv_ver,
+        }),
+        .vbuf.bind_flags = D3D11_BIND_VERTEX_BUFFER,
+        .ibuf.bind_flags = D3D11_BIND_INDEX_BUFFER,
+    };
+    if (!p->spirv)
+        goto error;
+
+    ID3D11Device_AddRef(p->dev);
+    ID3D11Device_GetImmediateContext(p->dev, &p->imm);
+
+    // Check D3D11.1 interfaces
+    hr = ID3D11Device_QueryInterface(p->dev, &IID_ID3D11Device1,
+                                     (void **) &p->dev1);
+    if (SUCCEEDED(hr)) {
+        p->minor = 1;
+        ID3D11Device1_GetImmediateContext1(p->dev1, &p->imm1);
+    }
+
+    // Check D3D11.4 interfaces
+    hr = ID3D11Device_QueryInterface(p->dev, &IID_ID3D11Device5,
+                                     (void **) &p->dev5);
+    if (SUCCEEDED(hr)) {
+        // There is no GetImmediateContext4 method
+        hr = ID3D11DeviceContext_QueryInterface(p->imm, &IID_ID3D11DeviceContext4,
+                                                (void **) &p->imm4);
+        if (SUCCEEDED(hr))
+            p->minor = 4;
+    }
+
+    PL_INFO(gpu, "Using Direct3D 11.%d runtime", p->minor);
+
+    D3D(ID3D11Device_QueryInterface(p->dev, &IID_IDXGIDevice1, (void **) &dxgi_dev));
+    D3D(IDXGIDevice1_GetParent(dxgi_dev, &IID_IDXGIAdapter1, (void **) &adapter));
+
+    DXGI_ADAPTER_DESC1 adapter_desc = {0};
+    IDXGIAdapter1_GetDesc1(adapter, &adapter_desc);
+
+    // No resource can be larger than max_res_size in bytes
+    unsigned int max_res_size = PL_CLAMP(
+        D3D11_REQ_RESOURCE_SIZE_IN_MEGABYTES_EXPRESSION_B_TERM * adapter_desc.DedicatedVideoMemory,
+        D3D11_REQ_RESOURCE_SIZE_IN_MEGABYTES_EXPRESSION_A_TERM * 1024u * 1024u,
+        D3D11_REQ_RESOURCE_SIZE_IN_MEGABYTES_EXPRESSION_C_TERM * 1024u * 1024u);
+
+    gpu->glsl = (struct pl_glsl_version) {
+        .version = 450,
+        .vulkan = true,
+    };
+
+    gpu->limits = (struct pl_gpu_limits) {
+        .max_buf_size = max_res_size,
+        .max_ssbo_size = max_res_size,
+        .max_vbo_size = max_res_size,
+        .align_vertex_stride = 1,
+
+        // Make up some values
+        .align_tex_xfer_offset = 32,
+        .align_tex_xfer_pitch = 1,
+        .fragment_queues = 1,
+    };
+
+    p->fl = ID3D11Device_GetFeatureLevel(p->dev);
+
+    // If we're not using FL9_x, we can use the same suballocated buffer as a
+    // vertex buffer and index buffer
+    if (p->fl >= D3D_FEATURE_LEVEL_10_0)
+        p->vbuf.bind_flags |= D3D11_BIND_INDEX_BUFFER;
+
+    if (p->fl >= D3D_FEATURE_LEVEL_10_0) {
+        gpu->limits.max_ubo_size = D3D11_REQ_CONSTANT_BUFFER_ELEMENT_COUNT * CBUF_ELEM;
+    } else {
+        // 10level9 restriction:
+        // https://docs.microsoft.com/en-us/windows/win32/direct3d11/d3d11-graphics-reference-10level9-context
+        gpu->limits.max_ubo_size = 255 * CBUF_ELEM;
+    }
+
+    if (p->fl >= D3D_FEATURE_LEVEL_11_0) {
+        gpu->limits.max_tex_1d_dim = D3D11_REQ_TEXTURE1D_U_DIMENSION;
+        gpu->limits.max_tex_2d_dim = D3D11_REQ_TEXTURE2D_U_OR_V_DIMENSION;
+        gpu->limits.max_tex_3d_dim = D3D11_REQ_TEXTURE3D_U_V_OR_W_DIMENSION;
+    } else if (p->fl >= D3D_FEATURE_LEVEL_10_0) {
+        gpu->limits.max_tex_1d_dim = D3D10_REQ_TEXTURE1D_U_DIMENSION;
+        gpu->limits.max_tex_2d_dim = D3D10_REQ_TEXTURE2D_U_OR_V_DIMENSION;
+        gpu->limits.max_tex_3d_dim = D3D10_REQ_TEXTURE3D_U_V_OR_W_DIMENSION;
+    } else if (p->fl >= D3D_FEATURE_LEVEL_9_3) {
+        gpu->limits.max_tex_2d_dim = D3D_FL9_3_REQ_TEXTURE2D_U_OR_V_DIMENSION;
+        // Same limit as FL9_1
+        gpu->limits.max_tex_3d_dim = D3D_FL9_1_REQ_TEXTURE3D_U_V_OR_W_DIMENSION;
+    } else {
+        gpu->limits.max_tex_2d_dim = D3D_FL9_1_REQ_TEXTURE2D_U_OR_V_DIMENSION;
+        gpu->limits.max_tex_3d_dim = D3D_FL9_1_REQ_TEXTURE3D_U_V_OR_W_DIMENSION;
+    }
+
+    if (p->fl >= D3D_FEATURE_LEVEL_10_0) {
+        gpu->limits.max_buffer_texels =
+            1 << D3D11_REQ_BUFFER_RESOURCE_TEXEL_COUNT_2_TO_EXP;
+    }
+
+    if (p->fl >= D3D_FEATURE_LEVEL_11_0) {
+        gpu->glsl.compute = true;
+        gpu->limits.compute_queues = 1;
+        // Set `gpu->limits.blittable_1d_3d`, since `pl_tex_blit_compute`, which
+        // is used to emulate blits on 11_0 and up, supports 1D and 3D textures
+        gpu->limits.blittable_1d_3d = true;
+
+        gpu->glsl.max_shmem_size = D3D11_CS_TGSM_REGISTER_COUNT * sizeof(float);
+        gpu->glsl.max_group_threads = D3D11_CS_THREAD_GROUP_MAX_THREADS_PER_GROUP;
+        gpu->glsl.max_group_size[0] = D3D11_CS_THREAD_GROUP_MAX_X;
+        gpu->glsl.max_group_size[1] = D3D11_CS_THREAD_GROUP_MAX_Y;
+        gpu->glsl.max_group_size[2] = D3D11_CS_THREAD_GROUP_MAX_Z;
+        gpu->limits.max_dispatch[0] = gpu->limits.max_dispatch[1] =
+            gpu->limits.max_dispatch[2] =
+            D3D11_CS_DISPATCH_MAX_THREAD_GROUPS_PER_DIMENSION;
+    }
+
+    if (p->fl >= D3D_FEATURE_LEVEL_11_0) {
+        // The offset limits are defined by HLSL:
+        // https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/gather4-po--sm5---asm-
+        gpu->glsl.min_gather_offset = -32;
+        gpu->glsl.max_gather_offset = 31;
+    } else if (p->fl >= D3D_FEATURE_LEVEL_10_1) {
+        // SM4.1 has no gather4_po, so the offset must be specified by an
+        // immediate with a range of [-8, 7]
+        // https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/gather4--sm4-1---asm-
+        // https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/sample--sm4---asm-#address-offset
+        gpu->glsl.min_gather_offset = -8;
+        gpu->glsl.max_gather_offset = 7;
+    }
+
+    if (p->fl >= D3D_FEATURE_LEVEL_10_0) {
+        p->max_srvs = D3D11_COMMONSHADER_INPUT_RESOURCE_SLOT_COUNT;
+    } else {
+        // 10level9 restriction:
+        // https://docs.microsoft.com/en-us/windows/win32/direct3d11/d3d11-graphics-reference-10level9-context
+        p->max_srvs = 8;
+    }
+
+    if (p->fl >= D3D_FEATURE_LEVEL_11_1) {
+        p->max_uavs = D3D11_1_UAV_SLOT_COUNT;
+    } else {
+        p->max_uavs = D3D11_PS_CS_UAV_REGISTER_COUNT;
+    }
+
+    if (!load_d3d_compiler(gpu)) {
+        PL_FATAL(gpu, "Could not find D3DCompiler DLL");
+        goto error;
+    }
+    PL_INFO(gpu, "D3DCompiler version: %u.%u.%u.%u",
+            p->d3d_compiler_ver.major, p->d3d_compiler_ver.minor,
+            p->d3d_compiler_ver.build, p->d3d_compiler_ver.revision);
+
+    // Detect support for timestamp queries. Some FL9_x devices don't support
+    // them.
+    hr = ID3D11Device_CreateQuery(p->dev,
+        &(D3D11_QUERY_DESC) { D3D11_QUERY_TIMESTAMP }, NULL);
+    p->has_timestamp_queries = SUCCEEDED(hr);
+
+    pl_d3d11_setup_formats(gpu);
+
+    // The rasterizer state never changes, so create it here
+    D3D11_RASTERIZER_DESC rdesc = {
+        .FillMode = D3D11_FILL_SOLID,
+        .CullMode = D3D11_CULL_NONE,
+        .FrontCounterClockwise = FALSE,
+        .DepthClipEnable = TRUE, // Required for 10level9
+        .ScissorEnable = TRUE,
+    };
+    D3D(ID3D11Device_CreateRasterizerState(p->dev, &rdesc, &p->rstate));
+
+    // The depth stencil state never changes either, and we only set it to
+    // turn depth testing off so the debug layer doesn't complain about an
+    // unbound depth buffer
+    D3D11_DEPTH_STENCIL_DESC dsdesc = {
+        .DepthEnable = FALSE,
+        .DepthWriteMask = D3D11_DEPTH_WRITE_MASK_ALL,
+        .DepthFunc = D3D11_COMPARISON_LESS,
+        .StencilReadMask = D3D11_DEFAULT_STENCIL_READ_MASK,
+        .StencilWriteMask = D3D11_DEFAULT_STENCIL_WRITE_MASK,
+        .FrontFace = {
+            .StencilFailOp = D3D11_STENCIL_OP_KEEP,
+            .StencilDepthFailOp = D3D11_STENCIL_OP_KEEP,
+            .StencilPassOp = D3D11_STENCIL_OP_KEEP,
+            .StencilFunc = D3D11_COMPARISON_ALWAYS,
+        },
+        .BackFace = {
+            .StencilFailOp = D3D11_STENCIL_OP_KEEP,
+            .StencilDepthFailOp = D3D11_STENCIL_OP_KEEP,
+            .StencilPassOp = D3D11_STENCIL_OP_KEEP,
+            .StencilFunc = D3D11_COMPARISON_ALWAYS,
+        },
+    };
+    D3D(ID3D11Device_CreateDepthStencilState(p->dev, &dsdesc, &p->dsstate));
+
+    // Initialize the samplers
+    for (int sample_mode = 0; sample_mode < PL_TEX_SAMPLE_MODE_COUNT; sample_mode++) {
+        for (int address_mode = 0; address_mode < PL_TEX_ADDRESS_MODE_COUNT; address_mode++) {
+            static const D3D11_TEXTURE_ADDRESS_MODE d3d_address_mode[] = {
+                [PL_TEX_ADDRESS_CLAMP] = D3D11_TEXTURE_ADDRESS_CLAMP,
+                [PL_TEX_ADDRESS_REPEAT] = D3D11_TEXTURE_ADDRESS_WRAP,
+                [PL_TEX_ADDRESS_MIRROR] = D3D11_TEXTURE_ADDRESS_MIRROR,
+            };
+            static const D3D11_FILTER d3d_filter[] = {
+                [PL_TEX_SAMPLE_NEAREST] = D3D11_FILTER_MIN_MAG_MIP_POINT,
+                [PL_TEX_SAMPLE_LINEAR] = D3D11_FILTER_MIN_MAG_MIP_LINEAR,
+            };
+
+            D3D11_SAMPLER_DESC sdesc = {
+                .AddressU = d3d_address_mode[address_mode],
+                .AddressV = d3d_address_mode[address_mode],
+                .AddressW = d3d_address_mode[address_mode],
+                .ComparisonFunc = D3D11_COMPARISON_NEVER,
+                .MinLOD = 0,
+                .MaxLOD = D3D11_FLOAT32_MAX,
+                .MaxAnisotropy = 1,
+                .Filter = d3d_filter[sample_mode],
+            };
+            D3D(ID3D11Device_CreateSamplerState(p->dev, &sdesc,
+                &p->samplers[sample_mode][address_mode]));
+        }
+    }
+
+    hr = IDXGIAdapter1_QueryInterface(adapter, &IID_IDXGIAdapter4,
+                                      (void **) &adapter4);
+    if (SUCCEEDED(hr)) {
+        DXGI_ADAPTER_DESC3 adapter_desc3 = {0};
+        IDXGIAdapter4_GetDesc3(adapter4, &adapter_desc3);
+
+        p->has_monitored_fences =
+            adapter_desc3.Flags & DXGI_ADAPTER_FLAG3_SUPPORT_MONITORED_FENCES;
+    }
+
+    // Try to create a D3D11.4 fence object to wait on in pl_gpu_finish()
+    if (p->dev5 && p->has_monitored_fences) {
+        hr = ID3D11Device5_CreateFence(p->dev5, 0, D3D11_FENCE_FLAG_NONE,
+                                       &IID_ID3D11Fence,
+                                       (void **) &p->finish_fence);
+        if (SUCCEEDED(hr)) {
+            p->finish_event = CreateEventW(NULL, FALSE, FALSE, NULL);
+            if (!p->finish_event) {
+                PL_ERR(gpu, "Failed to create finish() event");
+                goto error;
+            }
+        }
+    }
+
+    // If fences are not available, we will have to poll an event query instead
+    if (!p->finish_fence) {
+        // Buffers for dummy copy/readback (see d3d11_gpu_finish())
+        p->finish_buf_src = pl_buf_create(gpu, pl_buf_params(
+            .size = sizeof(uint32_t),
+            .drawable = true, // Make these vertex buffers for 10level9
+            .initial_data = &(uint32_t) {0x11223344},
+        ));
+        p->finish_buf_dst = pl_buf_create(gpu, pl_buf_params(
+            .size = sizeof(uint32_t),
+            .host_readable = true,
+            .drawable = true,
+        ));
+
+        D3D(ID3D11Device_CreateQuery(p->dev,
+            &(D3D11_QUERY_DESC) { D3D11_QUERY_EVENT }, &p->finish_query));
+    }
+
+    pl_d3d11_flush_message_queue(ctx, "After gpu create");
+
+    success = true;
+error:
+    SAFE_RELEASE(dxgi_dev);
+    SAFE_RELEASE(adapter);
+    SAFE_RELEASE(adapter4);
+    if (success) {
+        return pl_gpu_finalize(gpu);
+    } else {
+        d3d11_gpu_destroy(gpu);
+        return NULL;
+    }
+}
--
cgit v1.2.3