Diffstat (limited to 'third_party/rust/wgpu-hal/src')
23 files changed, 889 insertions, 163 deletions
diff --git a/third_party/rust/wgpu-hal/src/auxil/dxgi/conv.rs b/third_party/rust/wgpu-hal/src/auxil/dxgi/conv.rs
index 6af4b77bb3..e5162362f7 100644
--- a/third_party/rust/wgpu-hal/src/auxil/dxgi/conv.rs
+++ b/third_party/rust/wgpu-hal/src/auxil/dxgi/conv.rs
@@ -261,6 +261,7 @@ pub fn map_vertex_format(format: wgt::VertexFormat) -> dxgiformat::DXGI_FORMAT {
         Vf::Uint32x4 => DXGI_FORMAT_R32G32B32A32_UINT,
         Vf::Sint32x4 => DXGI_FORMAT_R32G32B32A32_SINT,
         Vf::Float32x4 => DXGI_FORMAT_R32G32B32A32_FLOAT,
+        Vf::Unorm10_10_10_2 => DXGI_FORMAT_R10G10B10A2_UNORM,
         Vf::Float64 | Vf::Float64x2 | Vf::Float64x3 | Vf::Float64x4 => unimplemented!(),
     }
 }
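This update adds `Unorm10_10_10_2` to each backend's vertex-format table (DXGI here; GL, Metal, and Vulkan below). As a rough standalone illustration of the layout that format names — not code from the patch — four floats can be packed into one 32-bit value with 10 bits per color channel and 2 bits of alpha, assuming the DXGI convention that R occupies the lowest bits:

    // Illustrative sketch: CPU-side packing of a Unorm10_10_10_2 vertex value.
    fn pack_unorm_10_10_10_2(r: f32, g: f32, b: f32, a: f32) -> u32 {
        // Clamp to [0, 1] and quantize each channel to its bit range.
        let q = |v: f32, max: f32| (v.clamp(0.0, 1.0) * max).round() as u32;
        q(r, 1023.0) | (q(g, 1023.0) << 10) | (q(b, 1023.0) << 20) | (q(a, 3.0) << 30)
    }

    fn main() {
        // Opaque mid-gray: alpha (the top 2 bits) is at its maximum of 3.
        assert_eq!(pack_unorm_10_10_10_2(0.5, 0.5, 0.5, 1.0) >> 30, 3);
    }
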
diff --git a/third_party/rust/wgpu-hal/src/dx12/adapter.rs b/third_party/rust/wgpu-hal/src/dx12/adapter.rs
index b417a88a6f..faf25cc852 100644
--- a/third_party/rust/wgpu-hal/src/dx12/adapter.rs
+++ b/third_party/rust/wgpu-hal/src/dx12/adapter.rs
@@ -115,18 +115,6 @@ impl super::Adapter {
             )
         });
 
-        let mut shader_model_support: d3d12_ty::D3D12_FEATURE_DATA_SHADER_MODEL =
-            d3d12_ty::D3D12_FEATURE_DATA_SHADER_MODEL {
-                HighestShaderModel: d3d12_ty::D3D_SHADER_MODEL_6_0,
-            };
-        assert_eq!(0, unsafe {
-            device.CheckFeatureSupport(
-                d3d12_ty::D3D12_FEATURE_SHADER_MODEL,
-                &mut shader_model_support as *mut _ as *mut _,
-                mem::size_of::<d3d12_ty::D3D12_FEATURE_DATA_SHADER_MODEL>() as _,
-            )
-        });
-
         let mut workarounds = super::Workarounds::default();
 
         let info = wgt::AdapterInfo {
@@ -181,6 +169,53 @@ impl super::Adapter {
             hr == 0 && features3.CastingFullyTypedFormatSupported != 0
         };
 
+        let shader_model = if dxc_container.is_none() {
+            naga::back::hlsl::ShaderModel::V5_1
+        } else {
+            let mut versions = [
+                crate::dx12::types::D3D_SHADER_MODEL_6_7,
+                crate::dx12::types::D3D_SHADER_MODEL_6_6,
+                crate::dx12::types::D3D_SHADER_MODEL_6_5,
+                crate::dx12::types::D3D_SHADER_MODEL_6_4,
+                crate::dx12::types::D3D_SHADER_MODEL_6_3,
+                crate::dx12::types::D3D_SHADER_MODEL_6_2,
+                crate::dx12::types::D3D_SHADER_MODEL_6_1,
+                crate::dx12::types::D3D_SHADER_MODEL_6_0,
+                crate::dx12::types::D3D_SHADER_MODEL_5_1,
+            ]
+            .iter();
+            match loop {
+                if let Some(&sm) = versions.next() {
+                    let mut sm = crate::dx12::types::D3D12_FEATURE_DATA_SHADER_MODEL {
+                        HighestShaderModel: sm,
+                    };
+                    if 0 == unsafe {
+                        device.CheckFeatureSupport(
+                            7, // D3D12_FEATURE_SHADER_MODEL
+                            &mut sm as *mut _ as *mut _,
+                            mem::size_of::<crate::dx12::types::D3D12_FEATURE_DATA_SHADER_MODEL>()
+                                as _,
+                        )
+                    } {
+                        break sm.HighestShaderModel;
+                    }
+                } else {
+                    break crate::dx12::types::D3D_SHADER_MODEL_5_1;
+                }
+            } {
+                crate::dx12::types::D3D_SHADER_MODEL_5_1 => naga::back::hlsl::ShaderModel::V5_1,
+                crate::dx12::types::D3D_SHADER_MODEL_6_0 => naga::back::hlsl::ShaderModel::V6_0,
+                crate::dx12::types::D3D_SHADER_MODEL_6_1 => naga::back::hlsl::ShaderModel::V6_1,
+                crate::dx12::types::D3D_SHADER_MODEL_6_2 => naga::back::hlsl::ShaderModel::V6_2,
+                crate::dx12::types::D3D_SHADER_MODEL_6_3 => naga::back::hlsl::ShaderModel::V6_3,
+                crate::dx12::types::D3D_SHADER_MODEL_6_4 => naga::back::hlsl::ShaderModel::V6_4,
+                crate::dx12::types::D3D_SHADER_MODEL_6_5 => naga::back::hlsl::ShaderModel::V6_5,
+                crate::dx12::types::D3D_SHADER_MODEL_6_6 => naga::back::hlsl::ShaderModel::V6_6,
+                crate::dx12::types::D3D_SHADER_MODEL_6_7 => naga::back::hlsl::ShaderModel::V6_7,
+                _ => unreachable!(),
+            }
+        };
+
         let private_caps = super::PrivateCapabilities {
             instance_flags,
             heterogeneous_resource_heaps: options.ResourceHeapTier
@@ -196,6 +231,7 @@ impl super::Adapter {
             casting_fully_typed_format_supported,
             // See https://github.com/gfx-rs/wgpu/issues/3552
             suballocation_supported: !info.name.contains("Iris(R) Xe"),
+            shader_model,
         };
 
         // Theoretically vram limited, but in practice 2^20 is the limit
@@ -273,7 +309,7 @@ impl super::Adapter {
             wgt::Features::TEXTURE_BINDING_ARRAY |
                 wgt::Features::UNIFORM_BUFFER_AND_STORAGE_TEXTURE_ARRAY_NON_UNIFORM_INDEXING |
                 wgt::Features::SAMPLED_TEXTURE_AND_STORAGE_BUFFER_ARRAY_NON_UNIFORM_INDEXING,
-            shader_model_support.HighestShaderModel >= d3d12_ty::D3D_SHADER_MODEL_5_1,
+            shader_model >= naga::back::hlsl::ShaderModel::V5_1,
         );
 
         let bgra8unorm_storage_supported = {
@@ -295,21 +331,28 @@ impl super::Adapter {
             bgra8unorm_storage_supported,
         );
 
-        // we must be using DXC because uint64_t was added with Shader Model 6
-        // and FXC only supports up to 5.1
-        let int64_shader_ops_supported = dxc_container.is_some() && {
-            let mut features1: d3d12_ty::D3D12_FEATURE_DATA_D3D12_OPTIONS1 =
-                unsafe { mem::zeroed() };
-            let hr = unsafe {
-                device.CheckFeatureSupport(
-                    d3d12_ty::D3D12_FEATURE_D3D12_OPTIONS1,
-                    &mut features1 as *mut _ as *mut _,
-                    mem::size_of::<d3d12_ty::D3D12_FEATURE_DATA_D3D12_OPTIONS1>() as _,
-                )
-            };
-            hr == 0 && features1.Int64ShaderOps != 0
+        let mut features1: d3d12_ty::D3D12_FEATURE_DATA_D3D12_OPTIONS1 = unsafe { mem::zeroed() };
+        let hr = unsafe {
+            device.CheckFeatureSupport(
+                d3d12_ty::D3D12_FEATURE_D3D12_OPTIONS1,
+                &mut features1 as *mut _ as *mut _,
+                mem::size_of::<d3d12_ty::D3D12_FEATURE_DATA_D3D12_OPTIONS1>() as _,
+            )
         };
-        features.set(wgt::Features::SHADER_INT64, int64_shader_ops_supported);
+
+        features.set(
+            wgt::Features::SHADER_INT64,
+            shader_model >= naga::back::hlsl::ShaderModel::V6_0
+                && hr == 0
+                && features1.Int64ShaderOps != 0,
+        );
+
+        features.set(
+            wgt::Features::SUBGROUP,
+            shader_model >= naga::back::hlsl::ShaderModel::V6_0
+                && hr == 0
+                && features1.WaveOps != 0,
+        );
 
         // float32-filterable should always be available on d3d12
         features.set(wgt::Features::FLOAT32_FILTERABLE, true);
@@ -377,6 +420,8 @@ impl super::Adapter {
                 .min(crate::MAX_VERTEX_BUFFERS as u32),
             max_vertex_attributes: d3d12_ty::D3D12_IA_VERTEX_INPUT_RESOURCE_SLOT_COUNT,
             max_vertex_buffer_array_stride: d3d12_ty::D3D12_SO_BUFFER_MAX_STRIDE_IN_BYTES,
+            min_subgroup_size: 4, // Not using `features1.WaveLaneCountMin` as it is unreliable
+            max_subgroup_size: 128,
             // The push constants are part of the root signature which
             // has a limit of 64 DWORDS (256 bytes), but other resources
             // also share the root signature:
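The removed `assert_eq!` probed only Shader Model 6.0 and failed hard; the new code walks candidate models from 6.7 down and keeps the first one `CheckFeatureSupport` accepts. The same descending-probe idea, reduced to a self-contained sketch where a hypothetical `check` closure stands in for the driver call (the real code uses an explicit `loop` because `CheckFeatureSupport` communicates through an in/out struct):

    // Return the first candidate the driver accepts, falling back to the
    // lowest version if every probe fails. Names here are illustrative.
    fn highest_supported(candidates: &[u32], check: impl Fn(u32) -> bool) -> u32 {
        candidates
            .iter()
            .copied()
            .find(|&v| check(v))
            .unwrap_or(*candidates.last().expect("candidate list must be non-empty"))
    }

    fn main() {
        // Pretend the driver accepts anything at or below shader model 6.2 (0x62).
        let driver_max = 0x62;
        let versions = [0x67, 0x66, 0x65, 0x64, 0x63, 0x62, 0x61, 0x60, 0x51];
        assert_eq!(highest_supported(&versions, |v| v <= driver_max), 0x62);
    }
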
diff --git a/third_party/rust/wgpu-hal/src/dx12/conv.rs b/third_party/rust/wgpu-hal/src/dx12/conv.rs
index 2b6c1d959e..b09ea76080 100644
--- a/third_party/rust/wgpu-hal/src/dx12/conv.rs
+++ b/third_party/rust/wgpu-hal/src/dx12/conv.rs
@@ -224,7 +224,7 @@ pub fn map_polygon_mode(mode: wgt::PolygonMode) -> d3d12_ty::D3D12_FILL_MODE {
 }
 
 /// D3D12 doesn't support passing factors ending in `_COLOR` for alpha blending
-/// (see https://learn.microsoft.com/en-us/windows/win32/api/d3d12/ns-d3d12-d3d12_render_target_blend_desc).
+/// (see <https://learn.microsoft.com/en-us/windows/win32/api/d3d12/ns-d3d12-d3d12_render_target_blend_desc>).
 /// Therefore this function takes an additional `is_alpha` argument
 /// which if set will return an equivalent `_ALPHA` factor.
 fn map_blend_factor(factor: wgt::BlendFactor, is_alpha: bool) -> d3d12_ty::D3D12_BLEND {
diff --git a/third_party/rust/wgpu-hal/src/dx12/device.rs b/third_party/rust/wgpu-hal/src/dx12/device.rs
index 23bd409dc4..82075294ee 100644
--- a/third_party/rust/wgpu-hal/src/dx12/device.rs
+++ b/third_party/rust/wgpu-hal/src/dx12/device.rs
@@ -218,21 +218,39 @@ impl super::Device {
         use naga::back::hlsl;
         let stage_bit = crate::auxil::map_naga_stage(naga_stage);
-        let module = &stage.module.naga.module;
+
+        let (module, info) = naga::back::pipeline_constants::process_overrides(
+            &stage.module.naga.module,
+            &stage.module.naga.info,
+            stage.constants,
+        )
+        .map_err(|e| crate::PipelineError::Linkage(stage_bit, format!("HLSL: {e:?}")))?;
+
+        let needs_temp_options = stage.zero_initialize_workgroup_memory
+            != layout.naga_options.zero_initialize_workgroup_memory;
+        let mut temp_options;
+        let naga_options = if needs_temp_options {
+            temp_options = layout.naga_options.clone();
+            temp_options.zero_initialize_workgroup_memory = stage.zero_initialize_workgroup_memory;
+            &temp_options
+        } else {
+            &layout.naga_options
+        };
+
         //TODO: reuse the writer
         let mut source = String::new();
-        let mut writer = hlsl::Writer::new(&mut source, &layout.naga_options);
+        let mut writer = hlsl::Writer::new(&mut source, naga_options);
         let reflection_info = {
             profiling::scope!("naga::back::hlsl::write");
             writer
-                .write(module, &stage.module.naga.info)
+                .write(&module, &info)
                 .map_err(|e| crate::PipelineError::Linkage(stage_bit, format!("HLSL: {e:?}")))?
         };
 
         let full_stage = format!(
             "{}_{}\0",
             naga_stage.to_hlsl_str(),
-            layout.naga_options.shader_model.to_str()
+            naga_options.shader_model.to_str()
         );
 
         let ep_index = module
@@ -1062,12 +1080,7 @@ impl crate::Device for super::Device {
             },
             bind_group_infos,
             naga_options: hlsl::Options {
-                shader_model: match self.dxc_container {
-                    // DXC
-                    Some(_) => hlsl::ShaderModel::V6_0,
-                    // FXC doesn't support SM 6.0
-                    None => hlsl::ShaderModel::V5_1,
-                },
+                shader_model: self.private_caps.shader_model,
                 binding_map,
                 fake_missing_bindings: false,
                 special_constants_binding,
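`process_overrides` is the other recurring change in this update: every backend now folds WGSL `override` values into the module before handing it to its shader writer (HLSL here; GLSL, MSL, and SPIR-V below). A minimal caller-side sketch, assuming a naga build with the `wgsl-in` front end and validator enabled, as vendored alongside this crate:

    use std::collections::HashMap;

    fn main() {
        let src = "
            override scale: f32 = 1.0;
            @compute @workgroup_size(1)
            fn main() { _ = scale; }
        ";
        let module = naga::front::wgsl::parse_str(src).unwrap();
        let info = naga::valid::Validator::new(
            naga::valid::ValidationFlags::all(),
            naga::valid::Capabilities::all(),
        )
        .validate(&module)
        .unwrap();

        // `naga::back::PipelineConstants` is a map from override name (or
        // numeric ID rendered as a string) to the value chosen at pipeline
        // creation time.
        let constants: HashMap<String, f64> = HashMap::from([("scale".to_string(), 2.5)]);

        // Yields a module/info pair in which `scale` is a plain constant,
        // ready to hand to any backend writer.
        let (_module, _info) =
            naga::back::pipeline_constants::process_overrides(&module, &info, &constants)
                .unwrap();
    }
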
diff --git a/third_party/rust/wgpu-hal/src/dx12/mod.rs b/third_party/rust/wgpu-hal/src/dx12/mod.rs
index 4f958943ca..9f021bc241 100644
--- a/third_party/rust/wgpu-hal/src/dx12/mod.rs
+++ b/third_party/rust/wgpu-hal/src/dx12/mod.rs
@@ -195,6 +195,7 @@ struct PrivateCapabilities {
     heap_create_not_zeroed: bool,
     casting_fully_typed_format_supported: bool,
     suballocation_supported: bool,
+    shader_model: naga::back::hlsl::ShaderModel,
 }
 
 #[derive(Default)]
@@ -439,7 +440,7 @@ impl Texture {
         }
     }
 
-    /// see https://learn.microsoft.com/en-us/windows/win32/direct3d12/subresources#plane-slice
+    /// see <https://learn.microsoft.com/en-us/windows/win32/direct3d12/subresources#plane-slice>
     fn calc_subresource(&self, mip_level: u32, array_layer: u32, plane: u32) -> u32 {
         mip_level + (array_layer + plane * self.array_layer_count()) * self.mip_level_count
     }
diff --git a/third_party/rust/wgpu-hal/src/dx12/types.rs b/third_party/rust/wgpu-hal/src/dx12/types.rs
index b4ad38324a..17b608b840 100644
--- a/third_party/rust/wgpu-hal/src/dx12/types.rs
+++ b/third_party/rust/wgpu-hal/src/dx12/types.rs
@@ -41,3 +41,25 @@ winapi::STRUCT! {
         BarycentricsSupported: winapi::shared::minwindef::BOOL,
     }
 }
+
+winapi::ENUM! {
+    enum D3D_SHADER_MODEL {
+        D3D_SHADER_MODEL_NONE = 0,
+        D3D_SHADER_MODEL_5_1 = 0x51,
+        D3D_SHADER_MODEL_6_0 = 0x60,
+        D3D_SHADER_MODEL_6_1 = 0x61,
+        D3D_SHADER_MODEL_6_2 = 0x62,
+        D3D_SHADER_MODEL_6_3 = 0x63,
+        D3D_SHADER_MODEL_6_4 = 0x64,
+        D3D_SHADER_MODEL_6_5 = 0x65,
+        D3D_SHADER_MODEL_6_6 = 0x66,
+        D3D_SHADER_MODEL_6_7 = 0x67,
+        D3D_HIGHEST_SHADER_MODEL = 0x67,
+    }
+}
+
+winapi::STRUCT! {
+    struct D3D12_FEATURE_DATA_SHADER_MODEL {
+        HighestShaderModel: D3D_SHADER_MODEL,
+    }
+}
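The `calc_subresource` formula touched above is small enough to restate as a free function with worked examples (a hypothetical standalone version of the D3D12 subresource-numbering rule linked in its doc comment: mips vary fastest, then array layers, then planes):

    fn calc_subresource(
        mip_level: u32,
        array_layer: u32,
        plane: u32,
        mip_count: u32,
        layer_count: u32,
    ) -> u32 {
        mip_level + (array_layer + plane * layer_count) * mip_count
    }

    fn main() {
        // A 2-layer texture with 3 mips: layer 1 / mip 2 on plane 0 is index 5.
        assert_eq!(calc_subresource(2, 1, 0, 3, 2), 5);
        // The same mip on plane 1 (e.g. a stencil plane) starts after all
        // plane-0 subresources: 2 + (1 + 1 * 2) * 3 = 11.
        assert_eq!(calc_subresource(2, 1, 1, 3, 2), 11);
    }
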
diff --git a/third_party/rust/wgpu-hal/src/gles/adapter.rs b/third_party/rust/wgpu-hal/src/gles/adapter.rs
index b9d044337c..052c77006b 100644
--- a/third_party/rust/wgpu-hal/src/gles/adapter.rs
+++ b/third_party/rust/wgpu-hal/src/gles/adapter.rs
@@ -104,7 +104,7 @@ impl super::Adapter {
         }
     }
 
-    fn make_info(vendor_orig: String, renderer_orig: String) -> wgt::AdapterInfo {
+    fn make_info(vendor_orig: String, renderer_orig: String, version: String) -> wgt::AdapterInfo {
         let vendor = vendor_orig.to_lowercase();
         let renderer = renderer_orig.to_lowercase();
 
@@ -179,13 +179,33 @@ impl super::Adapter {
             0
         };
 
+        let driver;
+        let driver_info;
+        if version.starts_with("WebGL ") || version.starts_with("OpenGL ") {
+            let es_sig = " ES";
+            match version.find(es_sig) {
+                Some(pos) => {
+                    driver = version[..pos + es_sig.len()].to_owned();
+                    driver_info = version[pos + es_sig.len() + 1..].to_owned();
+                }
+                None => {
+                    let pos = version.find(' ').unwrap();
+                    driver = version[..pos].to_owned();
+                    driver_info = version[pos + 1..].to_owned();
+                }
+            }
+        } else {
+            driver = "OpenGL".to_owned();
+            driver_info = version;
+        }
+
         wgt::AdapterInfo {
             name: renderer_orig,
             vendor: vendor_id,
             device: 0,
             device_type: inferred_device_type,
-            driver: String::new(),
-            driver_info: String::new(),
+            driver,
+            driver_info,
             backend: wgt::Backend::Gl,
         }
     }
@@ -507,8 +527,7 @@ impl super::Adapter {
         let has_etc = if cfg!(any(webgl, Emscripten)) {
             extensions.contains("WEBGL_compressed_texture_etc")
         } else {
-            // This is a required part of GLES3, but not part of Desktop GL at all.
-            es_ver.is_some()
+            es_ver.is_some() || extensions.contains("GL_ARB_ES3_compatibility")
         };
         features.set(wgt::Features::TEXTURE_COMPRESSION_ETC2, has_etc);
@@ -728,6 +747,8 @@ impl super::Adapter {
             } else {
                 !0
             },
+            min_subgroup_size: 0,
+            max_subgroup_size: 0,
             max_push_constant_size: super::MAX_PUSH_CONSTANTS as u32 * 4,
             min_uniform_buffer_offset_alignment,
             min_storage_buffer_offset_alignment,
@@ -825,7 +846,7 @@ impl super::Adapter {
                     max_msaa_samples: max_samples,
                 }),
             },
-            info: Self::make_info(vendor, renderer),
+            info: Self::make_info(vendor, renderer, version),
             features,
             capabilities: crate::Capabilities {
                 limits,
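The new `driver`/`driver_info` split in `make_info` is driven entirely by the shape of the GL_VERSION string. Here it is as a self-contained function with a few illustrative inputs (the Mesa strings are made up, but follow the format drivers report):

    fn split_driver(version: &str) -> (String, String) {
        if version.starts_with("WebGL ") || version.starts_with("OpenGL ") {
            let es_sig = " ES";
            match version.find(es_sig) {
                // "OpenGL ES 3.2 Mesa 24.0.1" -> ("OpenGL ES", "3.2 Mesa 24.0.1")
                Some(pos) => (
                    version[..pos + es_sig.len()].to_owned(),
                    version[pos + es_sig.len() + 1..].to_owned(),
                ),
                // "WebGL 2.0" -> ("WebGL", "2.0")
                None => {
                    let pos = version.find(' ').unwrap();
                    (version[..pos].to_owned(), version[pos + 1..].to_owned())
                }
            }
        } else {
            // Desktop GL version strings carry no API prefix.
            ("OpenGL".to_owned(), version.to_owned())
        }
    }

    fn main() {
        assert_eq!(
            split_driver("OpenGL ES 3.2 Mesa 24.0.1"),
            ("OpenGL ES".to_owned(), "3.2 Mesa 24.0.1".to_owned())
        );
        assert_eq!(split_driver("WebGL 2.0"), ("WebGL".to_owned(), "2.0".to_owned()));
        assert_eq!(
            split_driver("4.6 (Core Profile) Mesa 24.0.1"),
            ("OpenGL".to_owned(), "4.6 (Core Profile) Mesa 24.0.1".to_owned())
        );
    }
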
diff --git a/third_party/rust/wgpu-hal/src/gles/conv.rs b/third_party/rust/wgpu-hal/src/gles/conv.rs
index bde69b8629..a6c924f162 100644
--- a/third_party/rust/wgpu-hal/src/gles/conv.rs
+++ b/third_party/rust/wgpu-hal/src/gles/conv.rs
@@ -212,6 +212,7 @@ pub(super) fn describe_vertex_format(vertex_format: wgt::VertexFormat) -> super:
         Vf::Uint32x4 => (4, glow::UNSIGNED_INT, Vak::Integer),
         Vf::Sint32x4 => (4, glow::INT, Vak::Integer),
         Vf::Float32x4 => (4, glow::FLOAT, Vak::Float),
+        Vf::Unorm10_10_10_2 => (4, glow::UNSIGNED_INT_10_10_10_2, Vak::Float),
         Vf::Float64 | Vf::Float64x2 | Vf::Float64x3 | Vf::Float64x4 => unimplemented!(),
     };
diff --git a/third_party/rust/wgpu-hal/src/gles/device.rs b/third_party/rust/wgpu-hal/src/gles/device.rs
index 50c07f3ff0..a1e2736aa6 100644
--- a/third_party/rust/wgpu-hal/src/gles/device.rs
+++ b/third_party/rust/wgpu-hal/src/gles/device.rs
@@ -220,9 +220,17 @@ impl super::Device {
             multiview: context.multiview,
         };
 
-        let shader = &stage.module.naga;
-        let entry_point_index = shader
-            .module
+        let (module, info) = naga::back::pipeline_constants::process_overrides(
+            &stage.module.naga.module,
+            &stage.module.naga.info,
+            stage.constants,
+        )
+        .map_err(|e| {
+            let msg = format!("{e}");
+            crate::PipelineError::Linkage(map_naga_stage(naga_stage), msg)
+        })?;
+
+        let entry_point_index = module
             .entry_points
             .iter()
             .position(|ep| ep.name.as_str() == stage.entry_point)
@@ -247,11 +255,23 @@ impl super::Device {
         };
 
         let mut output = String::new();
+        let needs_temp_options = stage.zero_initialize_workgroup_memory
+            != context.layout.naga_options.zero_initialize_workgroup_memory;
+        let mut temp_options;
+        let naga_options = if needs_temp_options {
+            // We use a conditional here, as cloning the naga_options could be expensive
+            // That is, we want to avoid doing that unless we cannot avoid it
+            temp_options = context.layout.naga_options.clone();
+            temp_options.zero_initialize_workgroup_memory = stage.zero_initialize_workgroup_memory;
+            &temp_options
+        } else {
+            &context.layout.naga_options
+        };
         let mut writer = glsl::Writer::new(
             &mut output,
-            &shader.module,
-            &shader.info,
-            &context.layout.naga_options,
+            &module,
+            &info,
+            naga_options,
             &pipeline_options,
             policies,
         )
@@ -269,8 +289,8 @@ impl super::Device {
 
         context.consume_reflection(
             gl,
-            &shader.module,
-            shader.info.get_entry_point(entry_point_index),
+            &module,
+            info.get_entry_point(entry_point_index),
             reflection_info,
             naga_stage,
             program,
@@ -297,6 +317,7 @@ impl super::Device {
                 naga_stage: naga_stage.to_owned(),
                 shader_id: stage.module.id,
                 entry_point: stage.entry_point.to_owned(),
+                zero_initialize_workgroup_memory: stage.zero_initialize_workgroup_memory,
             });
         }
         let mut guard = self
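This hunk and its DX12/Vulkan counterparts share the trick called out in the comment above: keep borrowing the shared options, and clone-and-patch only for a stage whose `zero_initialize_workgroup_memory` flag diverges. The pattern in isolation, with a stand-in `NagaOptions` type:

    #[derive(Clone, Debug)]
    struct NagaOptions {
        zero_initialize_workgroup_memory: bool,
    }

    fn compile_stage(layout_options: &NagaOptions, stage_zero_init: bool) {
        // Deferred initialization: `temp_options` is only written (and thus
        // a clone only happens) on the divergent path, yet the borrow can
        // outlive the `if`.
        let temp_options;
        let options = if stage_zero_init != layout_options.zero_initialize_workgroup_memory {
            let mut patched = layout_options.clone();
            patched.zero_initialize_workgroup_memory = stage_zero_init;
            temp_options = patched;
            &temp_options
        } else {
            layout_options
        };
        println!("{options:?}");
    }

    fn main() {
        let shared = NagaOptions { zero_initialize_workgroup_memory: true };
        compile_stage(&shared, true); // borrows, no clone
        compile_stage(&shared, false); // clones and patches
    }
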
diff --git a/third_party/rust/wgpu-hal/src/gles/egl.rs b/third_party/rust/wgpu-hal/src/gles/egl.rs
index b166f4f102..00ef70ba88 100644
--- a/third_party/rust/wgpu-hal/src/gles/egl.rs
+++ b/third_party/rust/wgpu-hal/src/gles/egl.rs
@@ -526,28 +526,51 @@ impl Inner {
         }
 
         let (config, supports_native_window) = choose_config(&egl, display, srgb_kind)?;
-        egl.bind_api(khronos_egl::OPENGL_ES_API).unwrap();
+
+        let supports_opengl = if version >= (1, 4) {
+            let client_apis = egl
+                .query_string(Some(display), khronos_egl::CLIENT_APIS)
+                .unwrap()
+                .to_string_lossy();
+            client_apis
+                .split(' ')
+                .any(|client_api| client_api == "OpenGL")
+        } else {
+            false
+        };
+        egl.bind_api(if supports_opengl {
+            khronos_egl::OPENGL_API
+        } else {
+            khronos_egl::OPENGL_ES_API
+        })
+        .unwrap();
 
         let needs_robustness = true;
         let mut khr_context_flags = 0;
         let supports_khr_context = display_extensions.contains("EGL_KHR_create_context");
 
-        //TODO: make it so `Device` == EGL Context
-        let mut context_attributes = vec![
-            khronos_egl::CONTEXT_MAJOR_VERSION,
-            3, // Request GLES 3.0 or higher
-        ];
-
-        if force_gles_minor_version != wgt::Gles3MinorVersion::Automatic {
+        let mut context_attributes = vec![];
+        if supports_opengl {
+            context_attributes.push(khronos_egl::CONTEXT_MAJOR_VERSION);
+            context_attributes.push(3);
             context_attributes.push(khronos_egl::CONTEXT_MINOR_VERSION);
-            context_attributes.push(match force_gles_minor_version {
-                wgt::Gles3MinorVersion::Version0 => 0,
-                wgt::Gles3MinorVersion::Version1 => 1,
-                wgt::Gles3MinorVersion::Version2 => 2,
-                _ => unreachable!(),
-            });
+            context_attributes.push(3);
+            if force_gles_minor_version != wgt::Gles3MinorVersion::Automatic {
+                log::warn!("Ignoring specified GLES minor version as OpenGL is used");
+            }
+        } else {
+            context_attributes.push(khronos_egl::CONTEXT_MAJOR_VERSION);
+            context_attributes.push(3); // Request GLES 3.0 or higher
+            if force_gles_minor_version != wgt::Gles3MinorVersion::Automatic {
+                context_attributes.push(khronos_egl::CONTEXT_MINOR_VERSION);
+                context_attributes.push(match force_gles_minor_version {
+                    wgt::Gles3MinorVersion::Automatic => unreachable!(),
+                    wgt::Gles3MinorVersion::Version0 => 0,
+                    wgt::Gles3MinorVersion::Version1 => 1,
+                    wgt::Gles3MinorVersion::Version2 => 2,
+                });
+            }
         }
-
         if flags.contains(wgt::InstanceFlags::DEBUG) {
             if version >= (1, 5) {
                 log::debug!("\tEGL context: +debug");
@@ -577,8 +600,6 @@ impl Inner {
                 // because it's for desktop GL only, not GLES.
                 log::warn!("\tEGL context: -robust access");
             }
-
-            //TODO do we need `khronos_egl::CONTEXT_OPENGL_NOTIFICATION_STRATEGY_EXT`?
         }
         if khr_context_flags != 0 {
             context_attributes.push(EGL_CONTEXT_FLAGS_KHR);
@@ -977,6 +998,7 @@ impl crate::Instance for Instance {
             srgb_kind: inner.srgb_kind,
         })
     }
+
     unsafe fn destroy_surface(&self, _surface: Surface) {}
 
     unsafe fn enumerate_adapters(&self) -> Vec<crate::ExposedAdapter<super::Api>> {
@@ -993,6 +1015,12 @@ impl crate::Instance for Instance {
             })
         };
 
+        // In contrast to OpenGL ES, OpenGL requires explicitly enabling sRGB conversions,
+        // as otherwise the user has to do the sRGB conversion.
+        if !matches!(inner.srgb_kind, SrgbFrameBufferKind::None) {
+            unsafe { gl.enable(glow::FRAMEBUFFER_SRGB) };
+        }
+
         if self.flags.contains(wgt::InstanceFlags::DEBUG) && gl.supports_debug() {
             log::debug!("Max label length: {}", unsafe {
                 gl.get_parameter_i32(glow::MAX_LABEL_LENGTH)
@@ -1106,6 +1134,13 @@ impl Surface {
 
         unsafe { gl.bind_framebuffer(glow::DRAW_FRAMEBUFFER, None) };
         unsafe { gl.bind_framebuffer(glow::READ_FRAMEBUFFER, Some(sc.framebuffer)) };
+
+        if !matches!(self.srgb_kind, SrgbFrameBufferKind::None) {
+            // Disable sRGB conversions for `glBlitFramebuffer` as behavior does diverge between
+            // drivers and formats otherwise and we want to ensure no sRGB conversions happen.
+            unsafe { gl.disable(glow::FRAMEBUFFER_SRGB) };
+        }
+
         // Note the Y-flipping here. GL's presentation is not flipped,
         // but main rendering is. Therefore, we Y-flip the output positions
         // in the shader, and also this blit.
@@ -1123,6 +1158,11 @@ impl Surface {
                 glow::NEAREST,
             )
         };
+
+        if !matches!(self.srgb_kind, SrgbFrameBufferKind::None) {
+            unsafe { gl.enable(glow::FRAMEBUFFER_SRGB) };
+        }
+
         unsafe { gl.bind_framebuffer(glow::READ_FRAMEBUFFER, None) };
 
         self.egl
diff --git a/third_party/rust/wgpu-hal/src/gles/mod.rs b/third_party/rust/wgpu-hal/src/gles/mod.rs
index 6f41f7c000..0fcb09be46 100644
--- a/third_party/rust/wgpu-hal/src/gles/mod.rs
+++ b/third_party/rust/wgpu-hal/src/gles/mod.rs
@@ -602,6 +602,7 @@ struct ProgramStage {
     naga_stage: naga::ShaderStage,
     shader_id: ShaderId,
     entry_point: String,
+    zero_initialize_workgroup_memory: bool,
 }
 
 #[derive(PartialEq, Eq, Hash)]
diff --git a/third_party/rust/wgpu-hal/src/gles/queue.rs b/third_party/rust/wgpu-hal/src/gles/queue.rs
index 29dfb79d04..7c728d3978 100644
--- a/third_party/rust/wgpu-hal/src/gles/queue.rs
+++ b/third_party/rust/wgpu-hal/src/gles/queue.rs
@@ -213,12 +213,27 @@ impl super::Queue {
                 instance_count,
                 ref first_instance_location,
             } => {
-                match base_vertex {
-                    0 => {
-                        unsafe {
-                            gl.uniform_1_u32(first_instance_location.as_ref(), first_instance)
-                        };
+                let supports_full_instancing = self
+                    .shared
+                    .private_caps
+                    .contains(PrivateCapabilities::FULLY_FEATURED_INSTANCING);
+
+                if supports_full_instancing {
+                    unsafe {
+                        gl.draw_elements_instanced_base_vertex_base_instance(
+                            topology,
+                            index_count as i32,
+                            index_type,
+                            index_offset as i32,
+                            instance_count as i32,
+                            base_vertex,
+                            first_instance,
+                        )
+                    }
+                } else {
+                    unsafe { gl.uniform_1_u32(first_instance_location.as_ref(), first_instance) };
+
+                    if base_vertex == 0 {
                         unsafe {
                             // Don't use `gl.draw_elements`/`gl.draw_elements_base_vertex` for `instance_count == 1`.
                             // Angle has a bug where it doesn't consider the instance divisor when `DYNAMIC_DRAW` is used in `gl.draw_elements`/`gl.draw_elements_base_vertex`.
@@ -231,41 +246,17 @@ impl super::Queue {
                                 instance_count as i32,
                             )
                         }
-                    }
-                    _ => {
-                        let supports_full_instancing = self
-                            .shared
-                            .private_caps
-                            .contains(PrivateCapabilities::FULLY_FEATURED_INSTANCING);
-
-                        if supports_full_instancing {
-                            unsafe {
-                                gl.draw_elements_instanced_base_vertex_base_instance(
-                                    topology,
-                                    index_count as i32,
-                                    index_type,
-                                    index_offset as i32,
-                                    instance_count as i32,
-                                    base_vertex,
-                                    first_instance,
-                                )
-                            }
-                        } else {
-                            unsafe {
-                                gl.uniform_1_u32(first_instance_location.as_ref(), first_instance)
-                            };
-
-                            // If we've gotten here, wgpu-core has already validated that this function exists via the DownlevelFlags::BASE_VERTEX feature.
-                            unsafe {
-                                gl.draw_elements_instanced_base_vertex(
-                                    topology,
-                                    index_count as _,
-                                    index_type,
-                                    index_offset as i32,
-                                    instance_count as i32,
-                                    base_vertex,
-                                )
-                            }
+                    } else {
+                        // If we've gotten here, wgpu-core has already validated that this function exists via the DownlevelFlags::BASE_VERTEX feature.
+                        unsafe {
+                            gl.draw_elements_instanced_base_vertex(
+                                topology,
+                                index_count as _,
+                                index_type,
+                                index_offset as i32,
+                                instance_count as i32,
+                                base_vertex,
+                            )
                         }
                     }
                 }
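The restructured indexed-draw path above now picks one of three GL entry points. Factored into a pure decision function (names hypothetical) for clarity:

    #[derive(Debug, PartialEq)]
    enum DrawCall {
        /// glDrawElementsInstancedBaseVertexBaseInstance
        FullInstancing,
        /// base-instance uniform + glDrawElementsInstanced
        UniformNoBaseVertex,
        /// base-instance uniform + glDrawElementsInstancedBaseVertex
        UniformBaseVertex,
    }

    fn pick_draw_call(supports_full_instancing: bool, base_vertex: i32) -> DrawCall {
        if supports_full_instancing {
            // One native call handles base_vertex and first_instance directly.
            DrawCall::FullInstancing
        } else if base_vertex == 0 {
            // first_instance is passed through a shader uniform instead.
            DrawCall::UniformNoBaseVertex
        } else {
            // Reaching this arm requires DownlevelFlags::BASE_VERTEX support.
            DrawCall::UniformBaseVertex
        }
    }

    fn main() {
        assert_eq!(pick_draw_call(true, 5), DrawCall::FullInstancing);
        assert_eq!(pick_draw_call(false, 0), DrawCall::UniformNoBaseVertex);
        assert_eq!(pick_draw_call(false, 5), DrawCall::UniformBaseVertex);
    }
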
diff --git a/third_party/rust/wgpu-hal/src/gles/wgl.rs b/third_party/rust/wgpu-hal/src/gles/wgl.rs
index 2564892969..aae70478b4 100644
--- a/third_party/rust/wgpu-hal/src/gles/wgl.rs
+++ b/third_party/rust/wgpu-hal/src/gles/wgl.rs
@@ -507,6 +507,8 @@ impl crate::Instance for Instance {
             .supported_extensions()
             .contains("GL_ARB_framebuffer_sRGB");
 
+        // In contrast to OpenGL ES, OpenGL requires explicitly enabling sRGB conversions,
+        // as otherwise the user has to do the sRGB conversion.
         if srgb_capable {
             unsafe { gl.enable(glow::FRAMEBUFFER_SRGB) };
         }
diff --git a/third_party/rust/wgpu-hal/src/lib.rs b/third_party/rust/wgpu-hal/src/lib.rs
index 79bd54e66e..d300ca30cc 100644
--- a/third_party/rust/wgpu-hal/src/lib.rs
+++ b/third_party/rust/wgpu-hal/src/lib.rs
@@ -1,17 +1,208 @@
-/*! This library describes the internal unsafe graphics abstraction API.
- * It follows WebGPU for the most part, re-using wgpu-types,
- * with the following deviations:
- * - Fully unsafe: zero overhead, zero validation.
- * - Compile-time backend selection via traits.
- * - Objects are passed by references and returned by value. No IDs.
- * - Mapping is persistent, with explicit synchronization.
- * - Resource transitions are explicit.
- * - All layouts are explicit. Binding model has compatibility.
+/*! A cross-platform unsafe graphics abstraction.
  *
- * General design direction is to follow the majority by the following weights:
- * - wgpu-core: 1.5
- * - primary backends (Vulkan/Metal/DX12): 1.0 each
- * - secondary backend (GLES): 0.5
+ * This crate defines a set of traits abstracting over modern graphics APIs,
+ * with implementations ("backends") for Vulkan, Metal, Direct3D, and GL.
+ *
+ * `wgpu-hal` is a spiritual successor to
+ * [gfx-hal](https://github.com/gfx-rs/gfx), but with reduced scope, and
+ * oriented towards WebGPU implementation goals. It has no overhead for
+ * validation or tracking, and the API translation overhead is kept to the bare
+ * minimum by the design of WebGPU. This API can be used for resource-demanding
+ * applications and engines.
+ *
+ * The `wgpu-hal` crate's main design choices:
+ *
+ * - Our traits are meant to be *portable*: proper use
+ *   should get equivalent results regardless of the backend.
+ *
+ * - Our traits' contracts are *unsafe*: implementations perform minimal
+ *   validation, if any, and incorrect use will often cause undefined behavior.
+ *   This allows us to minimize the overhead we impose over the underlying
+ *   graphics system. If you need safety, the [`wgpu-core`] crate provides a
+ *   safe API for driving `wgpu-hal`, implementing all necessary validation,
+ *   resource state tracking, and so on. (Note that `wgpu-core` is designed for
+ *   use via FFI; the [`wgpu`] crate provides more idiomatic Rust bindings for
+ *   `wgpu-core`.) Or, you can do your own validation.
+ *
+ * - In the same vein, returned errors *only cover cases the user can't
+ *   anticipate*, like running out of memory or losing the device. Any errors
+ *   that the user could reasonably anticipate are their responsibility to
+ *   avoid. For example, `wgpu-hal` returns no error for mapping a buffer that's
+ *   not mappable: as the buffer creator, the user should already know if they
+ *   can map it.
+ *
+ * - We use *static dispatch*. The traits are not
+ *   generally object-safe. You must select a specific backend type
+ *   like [`vulkan::Api`] or [`metal::Api`], and then use that
+ *   according to the main traits, or call backend-specific methods.
+ *
+ * - We use *idiomatic Rust parameter passing*,
+ *   taking objects by reference, returning them by value, and so on,
+ *   unlike `wgpu-core`, which refers to objects by ID.
+ *
+ * - We map buffer contents *persistently*. This means that the buffer
+ *   can remain mapped on the CPU while the GPU reads or writes to it.
+ *   You must explicitly indicate when data might need to be
+ *   transferred between CPU and GPU, if `wgpu-hal` indicates that the
+ *   mapping is not coherent (that is, automatically synchronized
+ *   between the two devices).
+ *
+ * - You must record *explicit barriers* between different usages of a
+ *   resource. For example, if a buffer is written to by a compute
+ *   shader, and then used as an index buffer to a draw call, you
+ *   must use [`CommandEncoder::transition_buffers`] between those two
+ *   operations.
+ *
+ * - Pipeline layouts are *explicitly specified* when setting bind
+ *   groups. Incompatible layouts disturb groups bound at higher indices.
+ *
+ * - The API *accepts collections as iterators*, to avoid forcing the user to
+ *   store data in particular containers. The implementation doesn't guarantee
+ *   that any of the iterators are drained, unless stated otherwise by the
+ *   function documentation. For this reason, we recommend that iterators don't
+ *   do any mutating work.
+ *
+ * Unfortunately, `wgpu-hal`'s safety requirements are not fully documented.
+ * Ideally, all trait methods would have doc comments setting out the
+ * requirements users must meet to ensure correct and portable behavior. If you
+ * are aware of a specific requirement that a backend imposes that is not
+ * ensured by the traits' documented rules, please file an issue. Or, if you are
+ * a capable technical writer, please file a pull request!
+ *
+ * [`wgpu-core`]: https://crates.io/crates/wgpu-core
+ * [`wgpu`]: https://crates.io/crates/wgpu
+ * [`vulkan::Api`]: vulkan/struct.Api.html
+ * [`metal::Api`]: metal/struct.Api.html
+ *
+ * ## Primary backends
+ *
+ * The `wgpu-hal` crate has full-featured backends implemented on the following
+ * platform graphics APIs:
+ *
+ * - Vulkan, available on Linux, Android, and Windows, using the [`ash`] crate's
+ *   Vulkan bindings. It's also available on macOS, if you install [MoltenVK].
+ *
+ * - Metal on macOS, using the [`metal`] crate's bindings.
+ *
+ * - Direct3D 12 on Windows, using the [`d3d12`] crate's bindings.
+ *
+ * [`ash`]: https://crates.io/crates/ash
+ * [MoltenVK]: https://github.com/KhronosGroup/MoltenVK
+ * [`metal`]: https://crates.io/crates/metal
+ * [`d3d12`]: https://crates.io/crates/d3d12
+ *
+ * ## Secondary backends
+ *
+ * The `wgpu-hal` crate has a partial implementation based on the following
+ * platform graphics API:
+ *
+ * - The GL backend is available anywhere OpenGL, OpenGL ES, or WebGL are
+ *   available. See the [`gles`] module documentation for details.
+ *
+ * [`gles`]: gles/index.html
+ *
+ * You can see what capabilities an adapter is missing by checking the
+ * [`DownlevelCapabilities`][tdc] in [`ExposedAdapter::capabilities`], available
+ * from [`Instance::enumerate_adapters`].
+ *
+ * The API is generally designed to fit the primary backends better than the
+ * secondary backends, so the latter may impose more overhead.
+ *
+ * [tdc]: wgt::DownlevelCapabilities
+ *
+ * ## Traits
+ *
+ * The `wgpu-hal` crate defines a handful of traits that together
+ * represent a cross-platform abstraction for modern GPU APIs.
+ *
+ * - The [`Api`] trait represents a `wgpu-hal` backend. It has no methods of its
+ *   own, only a collection of associated types.
+ *
+ * - [`Api::Instance`] implements the [`Instance`] trait. [`Instance::init`]
+ *   creates an instance value, which you can use to enumerate the adapters
+ *   available on the system. For example, [`vulkan::Api::Instance::init`][Ii]
+ *   returns an instance that can enumerate the Vulkan physical devices on your
+ *   system.
+ *
+ * - [`Api::Adapter`] implements the [`Adapter`] trait, representing a
+ *   particular device from a particular backend. For example, a Vulkan instance
+ *   might have a Lavapipe software adapter and a GPU-based adapter.
+ *
+ * - [`Api::Device`] implements the [`Device`] trait, representing an active
+ *   link to a device. You get a device value by calling [`Adapter::open`], and
+ *   then use it to create buffers, textures, shader modules, and so on.
+ *
+ * - [`Api::Queue`] implements the [`Queue`] trait, which you use to submit
+ *   command buffers to a given device.
+ *
+ * - [`Api::CommandEncoder`] implements the [`CommandEncoder`] trait, which you
+ *   use to build buffers of commands to submit to a queue. This has all the
+ *   methods for drawing and running compute shaders, which is presumably what
+ *   you're here for.
+ *
+ * - [`Api::Surface`] implements the [`Surface`] trait, which represents a
+ *   swapchain for presenting images on the screen, via interaction with the
+ *   system's window manager.
+ *
+ * The [`Api`] trait has various other associated types like [`Api::Buffer`] and
+ * [`Api::Texture`] that represent resources the rest of the interface can
+ * operate on, but these generally do not have their own traits.
+ *
+ * [Ii]: Instance::init
+ *
+ * ## Validation is the calling code's responsibility, not `wgpu-hal`'s
+ *
+ * As much as possible, `wgpu-hal` traits place the burden of validation,
+ * resource tracking, and state tracking on the caller, not on the trait
+ * implementations themselves. Anything which can reasonably be handled in
+ * backend-independent code should be. A `wgpu_hal` backend's sole obligation is
+ * to provide portable behavior, and report conditions that the calling code
+ * can't reasonably anticipate, like device loss or running out of memory.
+ *
+ * The `wgpu` crate collection is intended for use in security-sensitive
+ * applications, like web browsers, where the API is available to untrusted
+ * code. This means that `wgpu-core`'s validation is not simply a service to
+ * developers, to be provided opportunistically when the performance costs are
+ * acceptable and the necessary data is ready at hand. Rather, `wgpu-core`'s
+ * validation must be exhaustive, to ensure that even malicious content cannot
+ * provoke and exploit undefined behavior in the platform's graphics API.
+ *
+ * Because graphics APIs' requirements are complex, the only practical way for
+ * `wgpu` to provide exhaustive validation is to comprehensively track the
+ * lifetime and state of all the resources in the system. Implementing this
+ * separately for each backend is infeasible; effort would be better spent
+ * making the cross-platform validation in `wgpu-core` legible and trustworthy.
+ * Fortunately, the requirements are largely similar across the various
+ * platforms, so cross-platform validation is practical.
+ *
+ * Some backends have specific requirements that aren't practical to foist off
+ * on the `wgpu-hal` user. For example, properly managing macOS Objective-C or
+ * Microsoft COM reference counts is best handled by using appropriate pointer
+ * types within the backend.
+ *
+ * A desire for "defense in depth" may suggest performing additional validation
+ * in `wgpu-hal` when the opportunity arises, but this must be done with
+ * caution. Even experienced contributors infer the expectations their changes
+ * must meet by considering not just requirements made explicit in types, tests,
+ * assertions, and comments, but also those implicit in the surrounding code.
+ * When one sees validation or state-tracking code in `wgpu-hal`, it is tempting
+ * to conclude, "Oh, `wgpu-hal` checks for this, so `wgpu-core` needn't worry
+ * about it - that would be redundant!" The responsibility for exhaustive
+ * validation always rests with `wgpu-core`, regardless of what may or may not
+ * be checked in `wgpu-hal`.
+ *
+ * To this end, any "defense in depth" validation that does appear in `wgpu-hal`
+ * for requirements that `wgpu-core` should have enforced should report failure
+ * via the `unreachable!` macro, because problems detected at this stage always
+ * indicate a bug in `wgpu-core`.
+ *
+ * ## Debugging
+ *
+ * Most of the information on the wiki [Debugging wgpu Applications][wiki-debug]
+ * page still applies to this API, with the exception of API tracing/replay
+ * functionality, which is only available in `wgpu-core`.
+ *
+ * [wiki-debug]: https://github.com/gfx-rs/wgpu/wiki/Debugging-wgpu-Applications
  */
 
 #![cfg_attr(docsrs, feature(doc_cfg, doc_auto_cfg))]
@@ -198,6 +389,15 @@ pub trait Api: Clone + fmt::Debug + Sized {
     type Queue: Queue<A = Self>;
     type CommandEncoder: CommandEncoder<A = Self>;
+
+    /// This API's command buffer type.
+    ///
+    /// The only thing you can do with `CommandBuffer`s is build them
+    /// with a [`CommandEncoder`] and then pass them to
+    /// [`Queue::submit`] for execution, or destroy them by passing
+    /// them to [`CommandEncoder::reset_all`].
+    ///
+    /// [`CommandEncoder`]: Api::CommandEncoder
     type CommandBuffer: WasmNotSendSync + fmt::Debug;
 
     type Buffer: fmt::Debug + WasmNotSendSync + 'static;
@@ -206,6 +406,24 @@ pub trait Api: Clone + fmt::Debug + Sized {
     type TextureView: fmt::Debug + WasmNotSendSync;
     type Sampler: fmt::Debug + WasmNotSendSync;
     type QuerySet: fmt::Debug + WasmNotSendSync;
+
+    /// A value you can block on to wait for something to finish.
+    ///
+    /// A `Fence` holds a monotonically increasing [`FenceValue`]. You can call
+    /// [`Device::wait`] to block until a fence reaches or passes a value you
+    /// choose. [`Queue::submit`] can take a `Fence` and a [`FenceValue`] to
+    /// store in it when the submitted work is complete.
+    ///
+    /// Attempting to set a fence to a value less than its current value has no
+    /// effect.
+    ///
+    /// Waiting on a fence returns as soon as the fence reaches *or passes* the
+    /// requested value. This implies that, in order to reliably determine when
+    /// an operation has completed, operations must finish in order of
+    /// increasing fence values: if a higher-valued operation were to finish
+    /// before a lower-valued operation, then waiting for the fence to reach the
+    /// lower value could return before the lower-valued operation has actually
+    /// finished.
     type Fence: fmt::Debug + WasmNotSendSync;
 
     type BindGroupLayout: fmt::Debug + WasmNotSendSync;
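A toy, std-only model of the `Fence` contract documented above — not the wgpu-hal API itself — showing the monotonic value, the ignored lower signal, and a `wait` that returns once the value is reached or passed:

    use std::sync::{Condvar, Mutex};

    struct Fence {
        value: Mutex<u64>,
        signaled: Condvar,
    }

    impl Fence {
        fn new() -> Self {
            Fence { value: Mutex::new(0), signaled: Condvar::new() }
        }

        /// Like the `signal_fence` side of `Queue::submit`: setting a value
        /// lower than the current one has no effect.
        fn signal(&self, v: u64) {
            let mut cur = self.value.lock().unwrap();
            if v > *cur {
                *cur = v;
                self.signaled.notify_all();
            }
        }

        /// Like `Device::wait`: blocks until the fence reaches or passes `v`.
        fn wait(&self, v: u64) {
            let mut cur = self.value.lock().unwrap();
            while *cur < v {
                cur = self.signaled.wait(cur).unwrap();
            }
        }
    }

    fn main() {
        let fence = Fence::new();
        fence.signal(2);
        fence.signal(1); // ignored: fences never move backwards
        fence.wait(2); // returns immediately; the value already passed 1 and reached 2
    }
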
@@ -405,7 +623,25 @@ pub trait Device: WasmNotSendSync {
         &self,
         fence: &<Self::A as Api>::Fence,
     ) -> Result<FenceValue, DeviceError>;
-    /// Calling wait with a lower value than the current fence value will immediately return.
+
+    /// Wait for `fence` to reach `value`.
+    ///
+    /// Operations like [`Queue::submit`] can accept a [`Fence`] and a
+    /// [`FenceValue`] to store in it, so you can use this `wait` function
+    /// to wait for a given queue submission to finish execution.
+    ///
+    /// The `value` argument must be a value that some actual operation you have
+    /// already presented to the device is going to store in `fence`. You cannot
+    /// wait for values yet to be submitted. (This restriction accommodates
+    /// implementations like the `vulkan` backend's [`FencePool`] that must
+    /// allocate a distinct synchronization object for each fence value one is
+    /// able to wait for.)
+    ///
+    /// Calling `wait` with a lower [`FenceValue`] than `fence`'s current value
+    /// returns immediately.
+    ///
+    /// [`Fence`]: Api::Fence
+    /// [`FencePool`]: vulkan/enum.Fence.html#variant.FencePool
     unsafe fn wait(
         &self,
         fence: &<Self::A as Api>::Fence,
@@ -437,14 +673,48 @@ pub trait Queue: WasmNotSendSync {
     type A: Api;
 
-    /// Submits the command buffers for execution on GPU.
+    /// Submit `command_buffers` for execution on GPU.
+    ///
+    /// If `signal_fence` is `Some(fence, value)`, update `fence` to `value`
+    /// when the operation is complete. See [`Fence`] for details.
+    ///
+    /// If two calls to `submit` on a single `Queue` occur in a particular order
+    /// (that is, they happen on the same thread, or on two threads that have
+    /// synchronized to establish an ordering), then the first submission's
+    /// commands all complete execution before any of the second submission's
+    /// commands begin. All results produced by one submission are visible to
+    /// the next.
+    ///
+    /// Within a submission, command buffers execute in the order in which they
+    /// appear in `command_buffers`. All results produced by one buffer are
+    /// visible to the next.
+    ///
+    /// If two calls to `submit` on a single `Queue` from different threads are
+    /// not synchronized to occur in a particular order, they must pass distinct
+    /// [`Fence`]s. As explained in the [`Fence`] documentation, waiting for
+    /// operations to complete is only trustworthy when operations finish in
+    /// order of increasing fence value, but submissions from different threads
+    /// cannot determine how to order the fence values if the submissions
+    /// themselves are unordered. If each thread uses a separate [`Fence`], this
+    /// problem does not arise.
     ///
     /// Valid usage:
-    /// - all of the command buffers were created from command pools
-    ///   that are associated with this queue.
-    /// - all of the command buffers had `CommandBuffer::finish()` called.
-    /// - all surface textures that the command buffers write to must be
-    ///   passed to the surface_textures argument.
+    ///
+    /// - All of the [`CommandBuffer`][cb]s were created from
+    ///   [`CommandEncoder`][ce]s that are associated with this queue.
+    ///
+    /// - All of those [`CommandBuffer`][cb]s must remain alive until
+    ///   the submitted commands have finished execution. (Since
+    ///   command buffers must not outlive their encoders, this
+    ///   implies that the encoders must remain alive as well.)
+    ///
+    /// - All of the [`SurfaceTexture`][st]s that the command buffers
+    ///   write to appear in the `surface_textures` argument.
+    ///
+    /// [`Fence`]: Api::Fence
+    /// [cb]: Api::CommandBuffer
+    /// [ce]: Api::CommandEncoder
+    /// [st]: Api::SurfaceTexture
     unsafe fn submit(
         &self,
         command_buffers: &[&<Self::A as Api>::CommandBuffer],
@@ -459,7 +729,12 @@ pub trait Queue: WasmNotSendSync {
     unsafe fn get_timestamp_period(&self) -> f32;
 }
 
-/// Encoder and allocation pool for `CommandBuffer`.
+/// Encoder and allocation pool for `CommandBuffer`s.
+///
+/// A `CommandEncoder` not only constructs `CommandBuffer`s but also
+/// acts as the allocation pool that owns the buffers' underlying
+/// storage. Thus, `CommandBuffer`s must not outlive the
+/// `CommandEncoder` that created them.
 ///
 /// The life cycle of a `CommandBuffer` is as follows:
 ///
@@ -472,14 +747,17 @@ pub trait Queue: WasmNotSendSync {
 ///
 /// - Call methods like `copy_buffer_to_buffer`, `begin_render_pass`,
 ///   etc. on a "recording" `CommandEncoder` to add commands to the
-///   list.
+///   list. (If an error occurs, you must call `discard_encoding`; see
+///   below.)
 ///
 /// - Call `end_encoding` on a recording `CommandEncoder` to close the
 ///   encoder and construct a fresh `CommandBuffer` consisting of the
 ///   list of commands recorded up to that point.
 ///
 /// - Call `discard_encoding` on a recording `CommandEncoder` to drop
-///   the commands recorded thus far and close the encoder.
+///   the commands recorded thus far and close the encoder. This is
+///   the only safe thing to do on a `CommandEncoder` if an error has
+///   occurred while recording commands.
 ///
 /// - Call `reset_all` on a closed `CommandEncoder`, passing all the
 ///   live `CommandBuffers` built from it. All the `CommandBuffer`s
@@ -497,6 +775,10 @@ pub trait Queue: WasmNotSendSync {
 ///   built it.
 ///
 /// - A `CommandEncoder` must not outlive its `Device`.
+///
+/// It is the user's responsibility to meet these requirements. This
+/// allows `CommandEncoder` implementations to keep their state
+/// tracking to a minimum.
 pub trait CommandEncoder: WasmNotSendSync + fmt::Debug {
     type A: Api;
 
@@ -509,13 +791,20 @@ pub trait CommandEncoder: WasmNotSendSync + fmt::Debug {
     /// This `CommandEncoder` must be in the "closed" state.
     unsafe fn begin_encoding(&mut self, label: Label) -> Result<(), DeviceError>;
 
-    /// Discard the command list under construction, if any.
+    /// Discard the command list under construction.
+    ///
+    /// If an error has occurred while recording commands, this
+    /// is the only safe thing to do with the encoder.
     ///
     /// This puts this `CommandEncoder` in the "closed" state.
     ///
     /// # Safety
     ///
     /// This `CommandEncoder` must be in the "recording" state.
+    ///
+    /// Callers must not assume that implementations of this
+    /// function are idempotent, and thus should not call it
+    /// multiple times in a row.
    unsafe fn discard_encoding(&mut self);
 
     /// Return a fresh [`CommandBuffer`] holding the recorded commands.
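A minimal state-machine sketch of the `CommandEncoder` life cycle described above, using stand-in types rather than the real trait:

    #[derive(Debug)]
    struct CommandBuffer(Vec<&'static str>);

    #[derive(Default)]
    struct Encoder {
        recording: Option<Vec<&'static str>>, // Some(_) == the "recording" state
    }

    impl Encoder {
        fn begin_encoding(&mut self) {
            assert!(self.recording.is_none(), "must be closed");
            self.recording = Some(Vec::new());
        }
        fn push_command(&mut self, cmd: &'static str) {
            self.recording.as_mut().expect("must be recording").push(cmd);
        }
        /// Close the encoder, yielding the recorded commands.
        fn end_encoding(&mut self) -> CommandBuffer {
            CommandBuffer(self.recording.take().expect("must be recording"))
        }
        /// The only safe exit if recording hit an error; per the docs above it
        /// is not guaranteed idempotent, so call it at most once per recording.
        fn discard_encoding(&mut self) {
            self.recording.take().expect("must be recording");
        }
    }

    fn main() {
        let mut enc = Encoder::default();
        enc.begin_encoding();
        enc.push_command("copy_buffer_to_buffer");
        let cb = enc.end_encoding();
        assert_eq!(cb.0.len(), 1);
    }
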
There must be a function with this name /// in the shader. pub entry_point: &'a str, + /// Pipeline constants + pub constants: &'a naga::back::PipelineConstants, + /// Whether workgroup scoped memory will be initialized with zero values for this stage. + /// + /// This is required by the WebGPU spec, but may have overhead which can be avoided + /// for cross-platform applications + pub zero_initialize_workgroup_memory: bool, } // Rust gets confused about the impl requirements for `A` @@ -1326,6 +1622,8 @@ impl<A: Api> Clone for ProgrammableStage<'_, A> { Self { module: self.module, entry_point: self.entry_point, + constants: self.constants, + zero_initialize_workgroup_memory: self.zero_initialize_workgroup_memory, } } } diff --git a/third_party/rust/wgpu-hal/src/metal/adapter.rs b/third_party/rust/wgpu-hal/src/metal/adapter.rs index 6211896838..cddba472bd 100644 --- a/third_party/rust/wgpu-hal/src/metal/adapter.rs +++ b/third_party/rust/wgpu-hal/src/metal/adapter.rs @@ -562,7 +562,11 @@ impl super::PrivateCapabilities { Self { family_check, - msl_version: if os_is_xr || version.at_least((12, 0), (15, 0), os_is_mac) { + msl_version: if os_is_xr || version.at_least((14, 0), (17, 0), os_is_mac) { + MTLLanguageVersion::V3_1 + } else if version.at_least((13, 0), (16, 0), os_is_mac) { + MTLLanguageVersion::V3_0 + } else if version.at_least((12, 0), (15, 0), os_is_mac) { MTLLanguageVersion::V2_4 } else if version.at_least((11, 0), (14, 0), os_is_mac) { MTLLanguageVersion::V2_3 @@ -809,6 +813,14 @@ impl super::PrivateCapabilities { None }, timestamp_query_support, + supports_simd_scoped_operations: family_check + && (device.supports_family(MTLGPUFamily::Metal3) + || device.supports_family(MTLGPUFamily::Mac2) + || device.supports_family(MTLGPUFamily::Apple7)), + // https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf#page=5 + int64: family_check + && (device.supports_family(MTLGPUFamily::Apple3) + || device.supports_family(MTLGPUFamily::Metal3)), } } @@ -882,7 +894,7 @@ impl super::PrivateCapabilities { } features.set( F::SHADER_INT64, - self.msl_version >= MTLLanguageVersion::V2_3, + self.int64 && self.msl_version >= MTLLanguageVersion::V2_3, ); features.set( @@ -894,6 +906,10 @@ impl super::PrivateCapabilities { features.set(F::RG11B10UFLOAT_RENDERABLE, self.format_rg11b10_all); features.set(F::SHADER_UNUSED_VERTEX_OUTPUT, true); + if self.supports_simd_scoped_operations { + features.insert(F::SUBGROUP | F::SUBGROUP_BARRIER); + } + features } @@ -948,6 +964,8 @@ impl super::PrivateCapabilities { max_vertex_buffers: self.max_vertex_buffers, max_vertex_attributes: 31, max_vertex_buffer_array_stride: base.max_vertex_buffer_array_stride, + min_subgroup_size: 4, + max_subgroup_size: 64, max_push_constant_size: 0x1000, min_uniform_buffer_offset_alignment: self.buffer_alignment as u32, min_storage_buffer_offset_alignment: self.buffer_alignment as u32, diff --git a/third_party/rust/wgpu-hal/src/metal/conv.rs b/third_party/rust/wgpu-hal/src/metal/conv.rs index 8f6439b50b..6ebabee1a6 100644 --- a/third_party/rust/wgpu-hal/src/metal/conv.rs +++ b/third_party/rust/wgpu-hal/src/metal/conv.rs @@ -222,6 +222,7 @@ pub fn map_vertex_format(format: wgt::VertexFormat) -> metal::MTLVertexFormat { Vf::Uint32x4 => UInt4, Vf::Sint32x4 => Int4, Vf::Float32x4 => Float4, + Vf::Unorm10_10_10_2 => UInt1010102Normalized, Vf::Float64 | Vf::Float64x2 | Vf::Float64x3 | Vf::Float64x4 => unimplemented!(), } } diff --git a/third_party/rust/wgpu-hal/src/metal/device.rs b/third_party/rust/wgpu-hal/src/metal/device.rs index 
diff --git a/third_party/rust/wgpu-hal/src/metal/device.rs b/third_party/rust/wgpu-hal/src/metal/device.rs
index 179429f5d7..2c8f5a2bfb 100644
--- a/third_party/rust/wgpu-hal/src/metal/device.rs
+++ b/third_party/rust/wgpu-hal/src/metal/device.rs
@@ -69,7 +69,13 @@ impl super::Device {
     ) -> Result<CompiledShader, crate::PipelineError> {
         let stage_bit = map_naga_stage(naga_stage);
 
-        let module = &stage.module.naga.module;
+        let (module, module_info) = naga::back::pipeline_constants::process_overrides(
+            &stage.module.naga.module,
+            &stage.module.naga.info,
+            stage.constants,
+        )
+        .map_err(|e| crate::PipelineError::Linkage(stage_bit, format!("MSL: {:?}", e)))?;
+
         let ep_resources = &layout.per_stage_map[naga_stage];
 
         let bounds_check_policy = if stage.module.runtime_checks {
@@ -88,6 +94,8 @@ impl super::Device {
                 metal::MTLLanguageVersion::V2_2 => (2, 2),
                 metal::MTLLanguageVersion::V2_3 => (2, 3),
                 metal::MTLLanguageVersion::V2_4 => (2, 4),
+                metal::MTLLanguageVersion::V3_0 => (3, 0),
+                metal::MTLLanguageVersion::V3_1 => (3, 1),
             },
             inline_samplers: Default::default(),
             spirv_cross_compatibility: false,
@@ -104,7 +112,7 @@ impl super::Device {
                 // TODO: support bounds checks on binding arrays
                 binding_array: naga::proc::BoundsCheckPolicy::Unchecked,
             },
-            zero_initialize_workgroup_memory: true,
+            zero_initialize_workgroup_memory: stage.zero_initialize_workgroup_memory,
         };
 
         let pipeline_options = naga::back::msl::PipelineOptions {
@@ -114,13 +122,9 @@ impl super::Device {
             },
         };
 
-        let (source, info) = naga::back::msl::write_string(
-            module,
-            &stage.module.naga.info,
-            &options,
-            &pipeline_options,
-        )
-        .map_err(|e| crate::PipelineError::Linkage(stage_bit, format!("MSL: {:?}", e)))?;
+        let (source, info) =
+            naga::back::msl::write_string(&module, &module_info, &options, &pipeline_options)
+                .map_err(|e| crate::PipelineError::Linkage(stage_bit, format!("MSL: {:?}", e)))?;
 
         log::debug!(
             "Naga generated shader for entry point '{}' and stage {:?}\n{}",
@@ -168,7 +172,7 @@ impl super::Device {
         })?;
 
         // collect sizes indices, immutable buffers, and work group memory sizes
-        let ep_info = &stage.module.naga.info.get_entry_point(ep_index);
+        let ep_info = &module_info.get_entry_point(ep_index);
         let mut wg_memory_sizes = Vec::new();
         let mut sized_bindings = Vec::new();
         let mut immutable_buffer_mask = 0;
diff --git a/third_party/rust/wgpu-hal/src/metal/mod.rs b/third_party/rust/wgpu-hal/src/metal/mod.rs
index 6aeafb0f86..7d547cfe3c 100644
--- a/third_party/rust/wgpu-hal/src/metal/mod.rs
+++ b/third_party/rust/wgpu-hal/src/metal/mod.rs
@@ -269,6 +269,8 @@ struct PrivateCapabilities {
     supports_shader_primitive_index: bool,
     has_unified_memory: Option<bool>,
     timestamp_query_support: TimestampQuerySupport,
+    supports_simd_scoped_operations: bool,
+    int64: bool,
 }
 
 #[derive(Clone, Debug)]
@@ -649,7 +651,7 @@ struct BufferResource {
     /// Buffers with the [`wgt::BufferBindingType::Storage`] binding type can
     /// hold WGSL runtime-sized arrays. When one does, we must pass its size to
     /// shader entry points to implement bounds checks and WGSL's `arrayLength`
-    /// function. See [`device::CompiledShader::sized_bindings`] for details.
+    /// function. See `device::CompiledShader::sized_bindings` for details.
     ///
     /// [`Storage`]: wgt::BufferBindingType::Storage
     binding_size: Option<wgt::BufferSize>,
@@ -680,12 +682,12 @@ struct PipelineStageInfo {
     /// The buffer argument table index at which we pass runtime-sized arrays' buffer sizes.
     ///
-    /// See [`device::CompiledShader::sized_bindings`] for more details.
+    /// See `device::CompiledShader::sized_bindings` for more details.
     sizes_slot: Option<naga::back::msl::Slot>,
 
     /// Bindings of all WGSL `storage` globals that contain runtime-sized arrays.
     ///
-    /// See [`device::CompiledShader::sized_bindings`] for more details.
+    /// See `device::CompiledShader::sized_bindings` for more details.
     sized_bindings: Vec<naga::ResourceBinding>,
 }
 
@@ -801,7 +803,7 @@ struct CommandState {
     ///
     /// Specifically:
     ///
-    /// - The keys are ['ResourceBinding`] values (that is, the WGSL `@group`
+    /// - The keys are [`ResourceBinding`] values (that is, the WGSL `@group`
     ///   and `@binding` attributes) for `var<storage>` global variables in the
     ///   current module that contain runtime-sized arrays.
     ///
@@ -813,7 +815,7 @@ struct CommandState {
     ///   of the buffers listed in [`stage_infos.S.sized_bindings`], which we must
     ///   pass to the entry point.
     ///
-    /// See [`device::CompiledShader::sized_bindings`] for more details.
+    /// See `device::CompiledShader::sized_bindings` for more details.
     ///
     /// [`ResourceBinding`]: naga::ResourceBinding
     storage_buffer_length_map: rustc_hash::FxHashMap<naga::ResourceBinding, wgt::BufferSize>,
diff --git a/third_party/rust/wgpu-hal/src/vulkan/adapter.rs b/third_party/rust/wgpu-hal/src/vulkan/adapter.rs
index 2665463792..21219361f4 100644
--- a/third_party/rust/wgpu-hal/src/vulkan/adapter.rs
+++ b/third_party/rust/wgpu-hal/src/vulkan/adapter.rs
@@ -35,6 +35,8 @@ fn indexing_features() -> wgt::Features {
 /// [`PhysicalDeviceFeatures::from_extensions_and_requested_features`]
 /// constructs a value of this type indicating which Vulkan features to
 /// enable, based on the `wgpu_types::Features` requested.
+///
+/// [`Instance::expose_adapter`]: super::Instance::expose_adapter
 #[derive(Debug, Default)]
 pub struct PhysicalDeviceFeatures {
     /// Basic Vulkan 1.0 features.
@@ -86,6 +88,9 @@ pub struct PhysicalDeviceFeatures {
     ///
     /// However, we do populate this when creating a device if
     /// [`Features::RAY_TRACING_ACCELERATION_STRUCTURE`] is requested.
+    ///
+    /// [`Instance::expose_adapter`]: super::Instance::expose_adapter
+    /// [`Features::RAY_TRACING_ACCELERATION_STRUCTURE`]: wgt::Features::RAY_TRACING_ACCELERATION_STRUCTURE
     buffer_device_address: Option<vk::PhysicalDeviceBufferDeviceAddressFeaturesKHR>,
 
     /// Features provided by `VK_KHR_ray_query`,
@@ -95,12 +100,17 @@ pub struct PhysicalDeviceFeatures {
     /// this from `vkGetPhysicalDeviceFeatures2`.
     ///
     /// However, we do populate this when creating a device if ray tracing is requested.
+    ///
+    /// [`Instance::expose_adapter`]: super::Instance::expose_adapter
     ray_query: Option<vk::PhysicalDeviceRayQueryFeaturesKHR>,
 
     /// Features provided by `VK_KHR_zero_initialize_workgroup_memory`, promoted
     /// to Vulkan 1.3.
     zero_initialize_workgroup_memory:
         Option<vk::PhysicalDeviceZeroInitializeWorkgroupMemoryFeatures>,
+
+    /// Features provided by `VK_EXT_subgroup_size_control`, promoted to Vulkan 1.3.
+    subgroup_size_control: Option<vk::PhysicalDeviceSubgroupSizeControlFeatures>,
 }
 
 // This is safe because the structs have `p_next: *mut c_void`, which we null out/never read.
@@ -148,6 +158,9 @@ impl PhysicalDeviceFeatures {
         if let Some(ref mut feature) = self.ray_query {
             info = info.push_next(feature);
         }
+        if let Some(ref mut feature) = self.subgroup_size_control {
+            info = info.push_next(feature);
+        }
         info
     }
 
@@ -175,6 +188,7 @@ impl PhysicalDeviceFeatures {
     /// [`Features`]: wgt::Features
     /// [`DownlevelFlags`]: wgt::DownlevelFlags
     /// [`PrivateCapabilities`]: super::PrivateCapabilities
+    /// [`add_to_device_create_builder`]: PhysicalDeviceFeatures::add_to_device_create_builder
     /// [`DeviceCreateInfoBuilder`]: vk::DeviceCreateInfoBuilder
     /// [`Adapter::required_device_extensions`]: super::Adapter::required_device_extensions
     fn from_extensions_and_requested_features(
@@ -434,6 +448,17 @@ impl PhysicalDeviceFeatures {
             } else {
                 None
             },
+            subgroup_size_control: if device_api_version >= vk::API_VERSION_1_3
+                || enabled_extensions.contains(&vk::ExtSubgroupSizeControlFn::name())
+            {
+                Some(
+                    vk::PhysicalDeviceSubgroupSizeControlFeatures::builder()
+                        .subgroup_size_control(true)
+                        .build(),
+                )
+            } else {
+                None
+            },
         }
     }
 
@@ -442,6 +467,9 @@ impl PhysicalDeviceFeatures {
     /// Given `self`, together with the instance and physical device it was
     /// built from, and a `caps` also built from those, determine which wgpu
     /// features and downlevel flags the device can support.
+    ///
+    /// [`Features`]: wgt::Features
+    /// [`DownlevelFlags`]: wgt::DownlevelFlags
     fn to_wgpu(
         &self,
         instance: &ash::Instance,
@@ -638,6 +666,34 @@ impl PhysicalDeviceFeatures {
             );
         }
 
+        if let Some(ref subgroup) = caps.subgroup {
+            if (caps.device_api_version >= vk::API_VERSION_1_3
+                || caps.supports_extension(vk::ExtSubgroupSizeControlFn::name()))
+                && subgroup.supported_operations.contains(
+                    vk::SubgroupFeatureFlags::BASIC
+                        | vk::SubgroupFeatureFlags::VOTE
+                        | vk::SubgroupFeatureFlags::ARITHMETIC
+                        | vk::SubgroupFeatureFlags::BALLOT
+                        | vk::SubgroupFeatureFlags::SHUFFLE
+                        | vk::SubgroupFeatureFlags::SHUFFLE_RELATIVE,
+                )
+            {
+                features.set(
+                    F::SUBGROUP,
+                    subgroup
+                        .supported_stages
+                        .contains(vk::ShaderStageFlags::COMPUTE | vk::ShaderStageFlags::FRAGMENT),
+                );
+                features.set(
+                    F::SUBGROUP_VERTEX,
+                    subgroup
+                        .supported_stages
+                        .contains(vk::ShaderStageFlags::VERTEX),
+                );
+                features.insert(F::SUBGROUP_BARRIER);
+            }
+        }
+
         let supports_depth_format = |format| {
             supports_format(
                 instance,
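The Vulkan gate above demands a full set of subgroup operation classes plus compute and fragment stage support; vertex-stage support only unlocks the separate `SUBGROUP_VERTEX` feature. Restated with plain bit masks — the constants mirror `vk::SubgroupFeatureFlags`/`vk::ShaderStageFlags` values but should be read as illustrative, not as bindings:

    const BASIC: u32 = 1 << 0;
    const VOTE: u32 = 1 << 1;
    const ARITHMETIC: u32 = 1 << 2;
    const BALLOT: u32 = 1 << 3;
    const SHUFFLE: u32 = 1 << 4;
    const SHUFFLE_RELATIVE: u32 = 1 << 5;

    const STAGE_VERTEX: u32 = 1 << 0;
    const STAGE_FRAGMENT: u32 = 1 << 4;
    const STAGE_COMPUTE: u32 = 1 << 5;

    /// Returns (SUBGROUP, SUBGROUP_VERTEX) given the device's reported
    /// subgroup operations and stages.
    fn subgroup_features(supported_ops: u32, supported_stages: u32) -> (bool, bool) {
        let required_ops = BASIC | VOTE | ARITHMETIC | BALLOT | SHUFFLE | SHUFFLE_RELATIVE;
        if supported_ops & required_ops != required_ops {
            return (false, false); // a required operation class is missing
        }
        let both = STAGE_COMPUTE | STAGE_FRAGMENT;
        (
            supported_stages & both == both,
            supported_stages & STAGE_VERTEX == STAGE_VERTEX,
        )
    }

    fn main() {
        let ops = BASIC | VOTE | ARITHMETIC | BALLOT | SHUFFLE | SHUFFLE_RELATIVE;
        assert_eq!(subgroup_features(ops, STAGE_COMPUTE | STAGE_FRAGMENT), (true, false));
    }
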
@@ -773,6 +829,13 @@ pub struct PhysicalDeviceProperties {
     /// `VK_KHR_driver_properties` extension, promoted to Vulkan 1.2.
     driver: Option<vk::PhysicalDeviceDriverPropertiesKHR>,
 
+    /// Additional `vk::PhysicalDevice` properties from Vulkan 1.1.
+    subgroup: Option<vk::PhysicalDeviceSubgroupProperties>,
+
+    /// Additional `vk::PhysicalDevice` properties from the
+    /// `VK_EXT_subgroup_size_control` extension, promoted to Vulkan 1.3.
+    subgroup_size_control: Option<vk::PhysicalDeviceSubgroupSizeControlProperties>,
+
     /// The device API version.
     ///
     /// Which is the version of Vulkan supported for device-level functionality.
@@ -888,6 +951,11 @@ impl PhysicalDeviceProperties {
             if self.supports_extension(vk::ExtImageRobustnessFn::name()) {
                 extensions.push(vk::ExtImageRobustnessFn::name());
             }
+
+            // Require `VK_EXT_subgroup_size_control` if the associated feature was requested
+            if requested_features.contains(wgt::Features::SUBGROUP) {
+                extensions.push(vk::ExtSubgroupSizeControlFn::name());
+            }
         }
 
         // Optional `VK_KHR_swapchain_mutable_format`
@@ -987,6 +1055,14 @@ impl PhysicalDeviceProperties {
                 .min(crate::MAX_VERTEX_BUFFERS as u32),
             max_vertex_attributes: limits.max_vertex_input_attributes,
             max_vertex_buffer_array_stride: limits.max_vertex_input_binding_stride,
+            min_subgroup_size: self
+                .subgroup_size_control
+                .map(|subgroup_size| subgroup_size.min_subgroup_size)
+                .unwrap_or(0),
+            max_subgroup_size: self
+                .subgroup_size_control
+                .map(|subgroup_size| subgroup_size.max_subgroup_size)
+                .unwrap_or(0),
             max_push_constant_size: limits.max_push_constants_size,
             min_uniform_buffer_offset_alignment: limits.min_uniform_buffer_offset_alignment as u32,
             min_storage_buffer_offset_alignment: limits.min_storage_buffer_offset_alignment as u32,
@@ -1042,6 +1118,9 @@ impl super::InstanceShared {
                 let supports_driver_properties = capabilities.device_api_version
                     >= vk::API_VERSION_1_2
                     || capabilities.supports_extension(vk::KhrDriverPropertiesFn::name());
+                let supports_subgroup_size_control = capabilities.device_api_version
+                    >= vk::API_VERSION_1_3
+                    || capabilities.supports_extension(vk::ExtSubgroupSizeControlFn::name());
 
                 let supports_acceleration_structure =
                     capabilities.supports_extension(vk::KhrAccelerationStructureFn::name());
@@ -1075,6 +1154,20 @@ impl super::InstanceShared {
                     builder = builder.push_next(next);
                 }
 
+                if capabilities.device_api_version >= vk::API_VERSION_1_1 {
+                    let next = capabilities
+                        .subgroup
+                        .insert(vk::PhysicalDeviceSubgroupProperties::default());
+                    builder = builder.push_next(next);
+                }
+
+                if supports_subgroup_size_control {
+                    let next = capabilities
+                        .subgroup_size_control
+                        .insert(vk::PhysicalDeviceSubgroupSizeControlProperties::default());
+                    builder = builder.push_next(next);
+                }
+
                 let mut properties2 = builder.build();
                 unsafe {
                     get_device_properties.get_physical_device_properties2(phd, &mut properties2);
@@ -1190,6 +1283,16 @@ impl super::InstanceShared {
                     builder = builder.push_next(next);
                 }
 
+                // `VK_EXT_subgroup_size_control` is promoted to 1.3
+                if capabilities.device_api_version >= vk::API_VERSION_1_3
+                    || capabilities.supports_extension(vk::ExtSubgroupSizeControlFn::name())
+                {
+                    let next = features
+                        .subgroup_size_control
+                        .insert(vk::PhysicalDeviceSubgroupSizeControlFeatures::default());
+                    builder = builder.push_next(next);
+                }
+
                 let mut features2 = builder.build();
                 unsafe {
                     get_device_properties.get_physical_device_features2(phd, &mut features2);
@@ -1382,6 +1485,9 @@ impl super::Instance {
             }),
             image_format_list: phd_capabilities.device_api_version >= vk::API_VERSION_1_2
                 || phd_capabilities.supports_extension(vk::KhrImageFormatListFn::name()),
+            subgroup_size_control: phd_features
+                .subgroup_size_control
+                .map_or(false, |ext| ext.subgroup_size_control == vk::TRUE),
         };
         let capabilities = crate::Capabilities {
             limits: phd_capabilities.to_wgpu_limits(),
@@ -1581,6 +1687,15 @@ impl super::Adapter {
             capabilities.push(spv::Capability::Geometry);
         }
 
+        if features.intersects(wgt::Features::SUBGROUP | wgt::Features::SUBGROUP_VERTEX) {
+            capabilities.push(spv::Capability::GroupNonUniform);
+            capabilities.push(spv::Capability::GroupNonUniformVote);
capabilities.push(spv::Capability::GroupNonUniformArithmetic); + capabilities.push(spv::Capability::GroupNonUniformBallot); + capabilities.push(spv::Capability::GroupNonUniformShuffle); + capabilities.push(spv::Capability::GroupNonUniformShuffleRelative); + } + if features.intersects( wgt::Features::SAMPLED_TEXTURE_AND_STORAGE_BUFFER_ARRAY_NON_UNIFORM_INDEXING | wgt::Features::UNIFORM_BUFFER_AND_STORAGE_TEXTURE_ARRAY_NON_UNIFORM_INDEXING, @@ -1616,7 +1731,13 @@ impl super::Adapter { true, // could check `super::Workarounds::SEPARATE_ENTRY_POINTS` ); spv::Options { - lang_version: (1, 0), + lang_version: if features + .intersects(wgt::Features::SUBGROUP | wgt::Features::SUBGROUP_VERTEX) + { + (1, 3) + } else { + (1, 0) + }, flags, capabilities: Some(capabilities.iter().cloned().collect()), bounds_check_policies: naga::proc::BoundsCheckPolicies { diff --git a/third_party/rust/wgpu-hal/src/vulkan/command.rs b/third_party/rust/wgpu-hal/src/vulkan/command.rs index 43a2471954..ceb44dfbe6 100644 --- a/third_party/rust/wgpu-hal/src/vulkan/command.rs +++ b/third_party/rust/wgpu-hal/src/vulkan/command.rs @@ -104,6 +104,11 @@ impl crate::CommandEncoder for super::CommandEncoder { } unsafe fn discard_encoding(&mut self) { + // Safe use requires this is not called in the "closed" state, so the buffer + // shouldn't be null. Assert this to make sure we're not pushing null + // buffers to the discard pile. + assert_ne!(self.active, vk::CommandBuffer::null()); + self.discarded.push(self.active); self.active = vk::CommandBuffer::null(); } diff --git a/third_party/rust/wgpu-hal/src/vulkan/conv.rs b/third_party/rust/wgpu-hal/src/vulkan/conv.rs index 8202c93aa3..fe284f32a9 100644 --- a/third_party/rust/wgpu-hal/src/vulkan/conv.rs +++ b/third_party/rust/wgpu-hal/src/vulkan/conv.rs @@ -399,6 +399,7 @@ pub fn map_vertex_format(vertex_format: wgt::VertexFormat) -> vk::Format { Vf::Float64x2 => vk::Format::R64G64_SFLOAT, Vf::Float64x3 => vk::Format::R64G64B64_SFLOAT, Vf::Float64x4 => vk::Format::R64G64B64A64_SFLOAT, + Vf::Unorm10_10_10_2 => vk::Format::A2B10G10R10_UNORM_PACK32, } } diff --git a/third_party/rust/wgpu-hal/src/vulkan/device.rs b/third_party/rust/wgpu-hal/src/vulkan/device.rs index 70028cc700..ec392533a0 100644 --- a/third_party/rust/wgpu-hal/src/vulkan/device.rs +++ b/third_party/rust/wgpu-hal/src/vulkan/device.rs @@ -2,6 +2,7 @@ use super::conv; use arrayvec::ArrayVec; use ash::{extensions::khr, vk}; +use naga::back::spv::ZeroInitializeWorkgroupMemoryMode; use parking_lot::Mutex; use std::{ @@ -737,7 +738,8 @@ impl super::Device { }; let needs_temp_options = !runtime_checks || !binding_map.is_empty() - || naga_shader.debug_source.is_some(); + || naga_shader.debug_source.is_some() + || !stage.zero_initialize_workgroup_memory; let mut temp_options; let options = if needs_temp_options { temp_options = self.naga_options.clone(); @@ -760,27 +762,40 @@ impl super::Device { file_name: debug.file_name.as_ref().as_ref(), }) } + if !stage.zero_initialize_workgroup_memory { + temp_options.zero_initialize_workgroup_memory = + ZeroInitializeWorkgroupMemoryMode::None; + } &temp_options } else { &self.naga_options }; + + let (module, info) = naga::back::pipeline_constants::process_overrides( + &naga_shader.module, + &naga_shader.info, + stage.constants, + ) + .map_err(|e| crate::PipelineError::Linkage(stage_flags, format!("{e}")))?; + let spv = { profiling::scope!("naga::spv::write_vec"); - naga::back::spv::write_vec( - &naga_shader.module, - &naga_shader.info, - options, - Some(&pipeline_options), - ) + 
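Before moving on to the remaining files, it may help to see the net effect of the two adapter hunks above in isolation: when `SUBGROUP` or `SUBGROUP_VERTEX` is enabled, the naga SPIR-V backend is asked for SPIR-V 1.3 plus the `GroupNonUniform*` capability set. A minimal standalone sketch, assuming a validated `module`/`info` pair is on hand and that `spv::Options` implements `Default` (illustrative only, not code from this patch):

```rust
use naga::back::spv;

// A standalone sketch, not part of this patch: emit SPIR-V 1.3 with the
// subgroup capabilities the adapter now advertises.
fn write_subgroup_spirv(
    module: &naga::Module,
    info: &naga::valid::ModuleInfo,
) -> Result<Vec<u32>, spv::Error> {
    let capabilities = [
        spv::Capability::GroupNonUniform,
        spv::Capability::GroupNonUniformVote,
        spv::Capability::GroupNonUniformArithmetic,
        spv::Capability::GroupNonUniformBallot,
        spv::Capability::GroupNonUniformShuffle,
        spv::Capability::GroupNonUniformShuffleRelative,
    ];
    let options = spv::Options {
        // Subgroup operations require SPIR-V >= 1.3, hence the
        // `lang_version` bump in the hunk above.
        lang_version: (1, 3),
        capabilities: Some(capabilities.iter().cloned().collect()),
        ..Default::default()
    };
    // No per-pipeline options are needed for this sketch.
    spv::write_vec(module, info, &options, None)
}
```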
diff --git a/third_party/rust/wgpu-hal/src/vulkan/command.rs b/third_party/rust/wgpu-hal/src/vulkan/command.rs
index 43a2471954..ceb44dfbe6 100644
--- a/third_party/rust/wgpu-hal/src/vulkan/command.rs
+++ b/third_party/rust/wgpu-hal/src/vulkan/command.rs
@@ -104,6 +104,11 @@ impl crate::CommandEncoder for super::CommandEncoder {
     }
 
     unsafe fn discard_encoding(&mut self) {
+        // Safe use requires this is not called in the "closed" state, so the buffer
+        // shouldn't be null. Assert this to make sure we're not pushing null
+        // buffers to the discard pile.
+        assert_ne!(self.active, vk::CommandBuffer::null());
+
         self.discarded.push(self.active);
         self.active = vk::CommandBuffer::null();
     }
diff --git a/third_party/rust/wgpu-hal/src/vulkan/conv.rs b/third_party/rust/wgpu-hal/src/vulkan/conv.rs
index 8202c93aa3..fe284f32a9 100644
--- a/third_party/rust/wgpu-hal/src/vulkan/conv.rs
+++ b/third_party/rust/wgpu-hal/src/vulkan/conv.rs
@@ -399,6 +399,7 @@ pub fn map_vertex_format(vertex_format: wgt::VertexFormat) -> vk::Format {
         Vf::Float64x2 => vk::Format::R64G64_SFLOAT,
         Vf::Float64x3 => vk::Format::R64G64B64_SFLOAT,
         Vf::Float64x4 => vk::Format::R64G64B64A64_SFLOAT,
+        Vf::Unorm10_10_10_2 => vk::Format::A2B10G10R10_UNORM_PACK32,
     }
 }
diff --git a/third_party/rust/wgpu-hal/src/vulkan/device.rs b/third_party/rust/wgpu-hal/src/vulkan/device.rs
index 70028cc700..ec392533a0 100644
--- a/third_party/rust/wgpu-hal/src/vulkan/device.rs
+++ b/third_party/rust/wgpu-hal/src/vulkan/device.rs
@@ -2,6 +2,7 @@ use super::conv;
 
 use arrayvec::ArrayVec;
 use ash::{extensions::khr, vk};
+use naga::back::spv::ZeroInitializeWorkgroupMemoryMode;
 use parking_lot::Mutex;
 
 use std::{
@@ -737,7 +738,8 @@ impl super::Device {
                 };
                 let needs_temp_options = !runtime_checks
                     || !binding_map.is_empty()
-                    || naga_shader.debug_source.is_some();
+                    || naga_shader.debug_source.is_some()
+                    || !stage.zero_initialize_workgroup_memory;
                 let mut temp_options;
                 let options = if needs_temp_options {
                     temp_options = self.naga_options.clone();
@@ -760,27 +762,40 @@ impl super::Device {
                             file_name: debug.file_name.as_ref().as_ref(),
                         })
                     }
+                    if !stage.zero_initialize_workgroup_memory {
+                        temp_options.zero_initialize_workgroup_memory =
+                            ZeroInitializeWorkgroupMemoryMode::None;
+                    }
                     &temp_options
                 } else {
                     &self.naga_options
                 };
+
+                let (module, info) = naga::back::pipeline_constants::process_overrides(
+                    &naga_shader.module,
+                    &naga_shader.info,
+                    stage.constants,
+                )
+                .map_err(|e| crate::PipelineError::Linkage(stage_flags, format!("{e}")))?;
+
                 let spv = {
                     profiling::scope!("naga::spv::write_vec");
-                    naga::back::spv::write_vec(
-                        &naga_shader.module,
-                        &naga_shader.info,
-                        options,
-                        Some(&pipeline_options),
-                    )
+                    naga::back::spv::write_vec(&module, &info, options, Some(&pipeline_options))
                 }
                 .map_err(|e| crate::PipelineError::Linkage(stage_flags, format!("{e}")))?;
                 self.create_shader_module_impl(&spv)?
             }
         };
 
+        let mut flags = vk::PipelineShaderStageCreateFlags::empty();
+        if self.shared.private_caps.subgroup_size_control {
+            flags |= vk::PipelineShaderStageCreateFlags::ALLOW_VARYING_SUBGROUP_SIZE
+        }
+
         let entry_point = CString::new(stage.entry_point).unwrap();
         let create_info = vk::PipelineShaderStageCreateInfo::builder()
+            .flags(flags)
             .stage(conv::map_shader_stage(stage_flags))
             .module(vk_module)
             .name(&entry_point)
@@ -1587,6 +1602,7 @@ impl crate::Device for super::Device {
             .shared
             .workarounds
             .contains(super::Workarounds::SEPARATE_ENTRY_POINTS)
+            || !naga_shader.module.overrides.is_empty()
         {
             return Ok(super::ShaderModule::Intermediate {
                 naga_shader,
diff --git a/third_party/rust/wgpu-hal/src/vulkan/mod.rs b/third_party/rust/wgpu-hal/src/vulkan/mod.rs
index 0cd385045c..d1ea82772e 100644
--- a/third_party/rust/wgpu-hal/src/vulkan/mod.rs
+++ b/third_party/rust/wgpu-hal/src/vulkan/mod.rs
@@ -238,6 +238,7 @@ struct PrivateCapabilities {
     robust_image_access2: bool,
     zero_initialize_workgroup_memory: bool,
     image_format_list: bool,
+    subgroup_size_control: bool,
 }
 
 bitflags::bitflags!(
@@ -413,6 +414,15 @@ pub struct TextureView {
     attachment: FramebufferAttachment,
 }
 
+impl TextureView {
+    /// # Safety
+    ///
+    /// - The image view handle must not be manually destroyed
+    pub unsafe fn raw_handle(&self) -> vk::ImageView {
+        self.raw
+    }
+}
+
 #[derive(Debug)]
 pub struct Sampler {
     raw: vk::Sampler,
@@ -438,6 +448,7 @@ pub struct BindGroup {
     set: gpu_descriptor::DescriptorSet<vk::DescriptorSet>,
 }
 
+/// Miscellaneous allocation recycling pool for `CommandAllocator`.
 #[derive(Default)]
 struct Temp {
     marker: Vec<u8>,
@@ -467,11 +478,31 @@ impl Temp {
 pub struct CommandEncoder {
     raw: vk::CommandPool,
     device: Arc<DeviceShared>,
+
+    /// The current command buffer, if `self` is in the ["recording"]
+    /// state.
+    ///
+    /// ["recording"]: crate::CommandEncoder
+    ///
+    /// If non-`null`, the buffer is in the Vulkan "recording" state.
     active: vk::CommandBuffer,
+
+    /// What kind of pass we are currently within: compute or render.
     bind_point: vk::PipelineBindPoint,
+
+    /// Allocation recycling pool for this encoder.
     temp: Temp,
+
+    /// A pool of available command buffers.
+    ///
+    /// These are all in the Vulkan "initial" state.
     free: Vec<vk::CommandBuffer>,
+
+    /// A pool of discarded command buffers.
+    ///
+    /// These could be in any Vulkan state except "pending".
     discarded: Vec<vk::CommandBuffer>,
+
     /// If this is true, the active renderpass enabled a debug span,
     /// and needs to be disabled on renderpass close.
     rpass_debug_marker_active: bool,
@@ -481,6 +512,15 @@ pub struct CommandEncoder {
     end_of_pass_timer_query: Option<(vk::QueryPool, u32)>,
 }
 
+impl CommandEncoder {
+    /// # Safety
+    ///
+    /// - The command buffer handle must not be manually destroyed
+    pub unsafe fn raw_handle(&self) -> vk::CommandBuffer {
+        self.active
+    }
+}
+
 impl fmt::Debug for CommandEncoder {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         f.debug_struct("CommandEncoder")
@@ -519,9 +559,47 @@ pub struct QuerySet {
     raw: vk::QueryPool,
 }
 
+/// The [`Api::Fence`] type for [`vulkan::Api`].
+///
+/// This is an `enum` because there are two possible implementations of
+/// `wgpu-hal` fences on Vulkan: Vulkan fences, which work on any version of
+/// Vulkan, and Vulkan timeline semaphores, which are easier and cheaper but
+/// require non-1.0 features.
+///
+/// [`Device::create_fence`] returns a [`TimelineSemaphore`] if
+/// [`VK_KHR_timeline_semaphore`] is available and enabled, and a [`FencePool`]
+/// otherwise.
+///
+/// [`Api::Fence`]: crate::Api::Fence
+/// [`vulkan::Api`]: Api
+/// [`Device::create_fence`]: crate::Device::create_fence
+/// [`TimelineSemaphore`]: Fence::TimelineSemaphore
+/// [`VK_KHR_timeline_semaphore`]: https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VK_KHR_timeline_semaphore
+/// [`FencePool`]: Fence::FencePool
 #[derive(Debug)]
 pub enum Fence {
+    /// A Vulkan [timeline semaphore].
+    ///
+    /// These are simpler to use than Vulkan fences, since timeline semaphores
+    /// work exactly the way [`wgpu_hal::Api::Fence`] is specified to work.
+    ///
+    /// [timeline semaphore]: https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#synchronization-semaphores
+    /// [`wgpu_hal::Api::Fence`]: crate::Api::Fence
     TimelineSemaphore(vk::Semaphore),
+
+    /// A collection of Vulkan [fence]s, each associated with a [`FenceValue`].
+    ///
+    /// The effective [`FenceValue`] of this variant is the greater of
+    /// `last_completed` and the maximum value associated with a signaled fence
+    /// in `active`.
+    ///
+    /// Fences are available in all versions of Vulkan, but since they only have
+    /// two states, "signaled" and "unsignaled", we need to use a separate fence
+    /// for each queue submission we might want to wait for, and remember which
+    /// [`FenceValue`] each one represents.
+    ///
+    /// [fence]: https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#synchronization-fences
+    /// [`FenceValue`]: crate::FenceValue
     FencePool {
         last_completed: crate::FenceValue,
         /// The pending fence values have to be ascending.
@@ -531,21 +609,32 @@ pub enum Fence
     },
 }
 
 impl Fence {
+    /// Return the highest [`FenceValue`] among the signaled fences in `active`.
+    ///
+    /// As an optimization, assume that we already know that the fence has
+    /// reached `last_completed`, and don't bother checking fences whose values
+    /// are less than that: those fences remain in the `active` array only
+    /// because we haven't called `maintain` yet to clean them up.
+    ///
+    /// [`FenceValue`]: crate::FenceValue
     fn check_active(
         device: &ash::Device,
-        mut max_value: crate::FenceValue,
+        mut last_completed: crate::FenceValue,
         active: &[(crate::FenceValue, vk::Fence)],
     ) -> Result<crate::FenceValue, crate::DeviceError> {
        for &(value, raw) in active.iter() {
            unsafe {
-                if value > max_value && device.get_fence_status(raw)? {
-                    max_value = value;
+                if value > last_completed && device.get_fence_status(raw)? {
+                    last_completed = value;
                 }
             }
         }
-        Ok(max_value)
+        Ok(last_completed)
     }
 
+    /// Return the highest signaled [`FenceValue`] for `self`.
+    ///
+    /// [`FenceValue`]: crate::FenceValue
     fn get_latest(
         &self,
         device: &ash::Device,
@@ -566,6 +655,18 @@ impl Fence {
         }
     }
 
+    /// Trim the internal state of this [`Fence`].
+    ///
+    /// This function has no externally visible effect, but you should call it
+    /// periodically to keep this fence's resource consumption under control.
+    ///
+    /// For fences using the [`FencePool`] implementation, this function
+    /// recycles fences that have been signaled. If you don't call this,
+    /// [`Queue::submit`] will just keep allocating a new Vulkan fence every
+    /// time it's called.
+    ///
+    /// [`FencePool`]: Fence::FencePool
+    /// [`Queue::submit`]: crate::Queue::submit
     fn maintain(&mut self, device: &ash::Device) -> Result<(), crate::DeviceError> {
         match *self {
             Self::TimelineSemaphore(_) => {}
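The `FencePool` documentation added above states the invariant that `check_active` and `get_latest` implement. Reduced to plain data, the rule looks like this (a standalone sketch, not this module's code; a `bool` stands in for a `vkGetFenceStatus` query on a real `vk::Fence`):

```rust
// The effective fence value is the greater of `last_completed` and the
// largest value whose Vulkan fence has signaled.
fn effective_fence_value(last_completed: u64, active: &[(u64, bool)]) -> u64 {
    active.iter().fold(last_completed, |best, &(value, signaled)| {
        // Mirrors `check_active`: skip fences at or below the value we
        // already know has completed.
        if signaled && value > best {
            value
        } else {
            best
        }
    })
}

fn main() {
    // last_completed = 2; submissions 3 and 4 are in flight, 3 has signaled.
    assert_eq!(effective_fence_value(2, &[(3, true), (4, false)]), 3);
}
```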