5 files changed, 256 insertions, 12 deletions
diff --git a/third_party/rust/wgpu-hal/src/vulkan/adapter.rs b/third_party/rust/wgpu-hal/src/vulkan/adapter.rs
index 2665463792..21219361f4 100644
--- a/third_party/rust/wgpu-hal/src/vulkan/adapter.rs
+++ b/third_party/rust/wgpu-hal/src/vulkan/adapter.rs
@@ -35,6 +35,8 @@ fn indexing_features() -> wgt::Features {
 ///   [`PhysicalDeviceFeatures::from_extensions_and_requested_features`]
 ///   constructs an value of this type indicating which Vulkan features to
 ///   enable, based on the `wgpu_types::Features` requested.
+///
+/// [`Instance::expose_adapter`]: super::Instance::expose_adapter
 #[derive(Debug, Default)]
 pub struct PhysicalDeviceFeatures {
     /// Basic Vulkan 1.0 features.
@@ -86,6 +88,9 @@ pub struct PhysicalDeviceFeatures {
     ///
     /// However, we do populate this when creating a device if
     /// [`Features::RAY_TRACING_ACCELERATION_STRUCTURE`] is requested.
+    ///
+    /// [`Instance::expose_adapter`]: super::Instance::expose_adapter
+    /// [`Features::RAY_TRACING_ACCELERATION_STRUCTURE`]: wgt::Features::RAY_TRACING_ACCELERATION_STRUCTURE
     buffer_device_address: Option<vk::PhysicalDeviceBufferDeviceAddressFeaturesKHR>,
 
     /// Features provided by `VK_KHR_ray_query`,
@@ -95,12 +100,17 @@ pub struct PhysicalDeviceFeatures {
     /// this from `vkGetPhysicalDeviceFeatures2`.
     ///
     /// However, we do populate this when creating a device if ray tracing is requested.
+    ///
+    /// [`Instance::expose_adapter`]: super::Instance::expose_adapter
     ray_query: Option<vk::PhysicalDeviceRayQueryFeaturesKHR>,
 
     /// Features provided by `VK_KHR_zero_initialize_workgroup_memory`, promoted
     /// to Vulkan 1.3.
     zero_initialize_workgroup_memory:
         Option<vk::PhysicalDeviceZeroInitializeWorkgroupMemoryFeatures>,
+
+    /// Features provided by `VK_EXT_subgroup_size_control`, promoted to Vulkan 1.3.
+    subgroup_size_control: Option<vk::PhysicalDeviceSubgroupSizeControlFeatures>,
 }
 
 // This is safe because the structs have `p_next: *mut c_void`, which we null out/never read.
@@ -148,6 +158,9 @@ impl PhysicalDeviceFeatures {
         if let Some(ref mut feature) = self.ray_query {
             info = info.push_next(feature);
         }
+        if let Some(ref mut feature) = self.subgroup_size_control {
+            info = info.push_next(feature);
+        }
         info
     }
 
@@ -175,6 +188,7 @@ impl PhysicalDeviceFeatures {
     /// [`Features`]: wgt::Features
     /// [`DownlevelFlags`]: wgt::DownlevelFlags
     /// [`PrivateCapabilities`]: super::PrivateCapabilities
+    /// [`add_to_device_create_builder`]: PhysicalDeviceFeatures::add_to_device_create_builder
     /// [`DeviceCreateInfoBuilder`]: vk::DeviceCreateInfoBuilder
     /// [`Adapter::required_device_extensions`]: super::Adapter::required_device_extensions
     fn from_extensions_and_requested_features(
@@ -434,6 +448,17 @@ impl PhysicalDeviceFeatures {
             } else {
                 None
             },
+            subgroup_size_control: if device_api_version >= vk::API_VERSION_1_3
+                || enabled_extensions.contains(&vk::ExtSubgroupSizeControlFn::name())
+            {
+                Some(
+                    vk::PhysicalDeviceSubgroupSizeControlFeatures::builder()
+                        .subgroup_size_control(true)
+                        .build(),
+                )
+            } else {
+                None
+            },
         }
     }
 
@@ -442,6 +467,9 @@ impl PhysicalDeviceFeatures {
     /// Given `self`, together with the instance and physical device it was
     /// built from, and a `caps` also built from those, determine which wgpu
     /// features and downlevel flags the device can support.
+    ///
+    /// [`Features`]: wgt::Features
+    /// [`DownlevelFlags`]: wgt::DownlevelFlags
     fn to_wgpu(
         &self,
         instance: &ash::Instance,
@@ -638,6 +666,34 @@ impl PhysicalDeviceFeatures {
             );
         }
 
+        if let Some(ref subgroup) = caps.subgroup {
+            if (caps.device_api_version >= vk::API_VERSION_1_3
+                || caps.supports_extension(vk::ExtSubgroupSizeControlFn::name()))
+                && subgroup.supported_operations.contains(
+                    vk::SubgroupFeatureFlags::BASIC
+                        | vk::SubgroupFeatureFlags::VOTE
+                        | vk::SubgroupFeatureFlags::ARITHMETIC
+                        | vk::SubgroupFeatureFlags::BALLOT
+                        | vk::SubgroupFeatureFlags::SHUFFLE
+                        | vk::SubgroupFeatureFlags::SHUFFLE_RELATIVE,
+                )
+            {
+                features.set(
+                    F::SUBGROUP,
+                    subgroup
+                        .supported_stages
+                        .contains(vk::ShaderStageFlags::COMPUTE | vk::ShaderStageFlags::FRAGMENT),
+                );
+                features.set(
+                    F::SUBGROUP_VERTEX,
+                    subgroup
+                        .supported_stages
+                        .contains(vk::ShaderStageFlags::VERTEX),
+                );
+                features.insert(F::SUBGROUP_BARRIER);
+            }
+        }
+
         let supports_depth_format = |format| {
             supports_format(
                 instance,
@@ -773,6 +829,13 @@ pub struct PhysicalDeviceProperties {
     /// `VK_KHR_driver_properties` extension, promoted to Vulkan 1.2.
     driver: Option<vk::PhysicalDeviceDriverPropertiesKHR>,
 
+    /// Additional `vk::PhysicalDevice` properties from Vulkan 1.1.
+    subgroup: Option<vk::PhysicalDeviceSubgroupProperties>,
+
+    /// Additional `vk::PhysicalDevice` properties from the
+    /// `VK_EXT_subgroup_size_control` extension, promoted to Vulkan 1.3.
+    subgroup_size_control: Option<vk::PhysicalDeviceSubgroupSizeControlProperties>,
+
     /// The device API version.
     ///
     /// Which is the version of Vulkan supported for device-level functionality.
@@ -888,6 +951,11 @@ impl PhysicalDeviceProperties {
             if self.supports_extension(vk::ExtImageRobustnessFn::name()) {
                 extensions.push(vk::ExtImageRobustnessFn::name());
             }
+
+            // Require `VK_EXT_subgroup_size_control` if the associated feature was requested
+            if requested_features.contains(wgt::Features::SUBGROUP) {
+                extensions.push(vk::ExtSubgroupSizeControlFn::name());
+            }
         }
 
         // Optional `VK_KHR_swapchain_mutable_format`
@@ -987,6 +1055,14 @@ impl PhysicalDeviceProperties {
                 .min(crate::MAX_VERTEX_BUFFERS as u32),
             max_vertex_attributes: limits.max_vertex_input_attributes,
             max_vertex_buffer_array_stride: limits.max_vertex_input_binding_stride,
+            min_subgroup_size: self
+                .subgroup_size_control
+                .map(|subgroup_size| subgroup_size.min_subgroup_size)
+                .unwrap_or(0),
+            max_subgroup_size: self
+                .subgroup_size_control
+                .map(|subgroup_size| subgroup_size.max_subgroup_size)
+                .unwrap_or(0),
             max_push_constant_size: limits.max_push_constants_size,
             min_uniform_buffer_offset_alignment: limits.min_uniform_buffer_offset_alignment as u32,
             min_storage_buffer_offset_alignment: limits.min_storage_buffer_offset_alignment as u32,
@@ -1042,6 +1118,9 @@ impl super::InstanceShared {
                 let supports_driver_properties = capabilities.device_api_version
                     >= vk::API_VERSION_1_2
                     || capabilities.supports_extension(vk::KhrDriverPropertiesFn::name());
+                let supports_subgroup_size_control = capabilities.device_api_version
+                    >= vk::API_VERSION_1_3
+                    || capabilities.supports_extension(vk::ExtSubgroupSizeControlFn::name());
 
                 let supports_acceleration_structure =
                     capabilities.supports_extension(vk::KhrAccelerationStructureFn::name());
@@ -1075,6 +1154,20 @@ impl super::InstanceShared {
                     builder = builder.push_next(next);
                 }
 
+                if capabilities.device_api_version >= vk::API_VERSION_1_1 {
+                    let next = capabilities
+                        .subgroup
+                        .insert(vk::PhysicalDeviceSubgroupProperties::default());
+                    builder = builder.push_next(next);
+                }
+
+                if supports_subgroup_size_control {
+                    let next = capabilities
+                        .subgroup_size_control
+                        .insert(vk::PhysicalDeviceSubgroupSizeControlProperties::default());
+                    builder = builder.push_next(next);
+                }
+
                 let mut properties2 = builder.build();
                 unsafe {
                     get_device_properties.get_physical_device_properties2(phd, &mut properties2);
@@ -1190,6 +1283,16 @@ impl super::InstanceShared {
                 builder = builder.push_next(next);
             }
 
+            // `VK_EXT_subgroup_size_control` is promoted to 1.3
+            if capabilities.device_api_version >= vk::API_VERSION_1_3
+                || capabilities.supports_extension(vk::ExtSubgroupSizeControlFn::name())
+            {
+                let next = features
+                    .subgroup_size_control
+                    .insert(vk::PhysicalDeviceSubgroupSizeControlFeatures::default());
+                builder = builder.push_next(next);
+            }
+
             let mut features2 = builder.build();
             unsafe {
                 get_device_properties.get_physical_device_features2(phd, &mut features2);
@@ -1382,6 +1485,9 @@ impl super::Instance {
                 }),
             image_format_list: phd_capabilities.device_api_version >= vk::API_VERSION_1_2
                 || phd_capabilities.supports_extension(vk::KhrImageFormatListFn::name()),
+            subgroup_size_control: phd_features
+                .subgroup_size_control
+                .map_or(false, |ext| ext.subgroup_size_control == vk::TRUE),
         };
         let capabilities = crate::Capabilities {
             limits: phd_capabilities.to_wgpu_limits(),
@@ -1581,6 +1687,15 @@ impl super::Adapter {
                 capabilities.push(spv::Capability::Geometry);
             }
 
+            if features.intersects(wgt::Features::SUBGROUP | wgt::Features::SUBGROUP_VERTEX) {
+                capabilities.push(spv::Capability::GroupNonUniform);
+                capabilities.push(spv::Capability::GroupNonUniformVote);
+                capabilities.push(spv::Capability::GroupNonUniformArithmetic);
+                capabilities.push(spv::Capability::GroupNonUniformBallot);
+                capabilities.push(spv::Capability::GroupNonUniformShuffle);
+                capabilities.push(spv::Capability::GroupNonUniformShuffleRelative);
+            }
+
             if features.intersects(
                 wgt::Features::SAMPLED_TEXTURE_AND_STORAGE_BUFFER_ARRAY_NON_UNIFORM_INDEXING
                     | wgt::Features::UNIFORM_BUFFER_AND_STORAGE_TEXTURE_ARRAY_NON_UNIFORM_INDEXING,
@@ -1616,7 +1731,13 @@ impl super::Adapter {
                 true, // could check `super::Workarounds::SEPARATE_ENTRY_POINTS`
             );
             spv::Options {
-                lang_version: (1, 0),
+                lang_version: if features
+                    .intersects(wgt::Features::SUBGROUP | wgt::Features::SUBGROUP_VERTEX)
+                {
+                    (1, 3)
+                } else {
+                    (1, 0)
+                },
                 flags,
                 capabilities: Some(capabilities.iter().cloned().collect()),
                 bounds_check_policies: naga::proc::BoundsCheckPolicies {
diff --git a/third_party/rust/wgpu-hal/src/vulkan/command.rs b/third_party/rust/wgpu-hal/src/vulkan/command.rs
index 43a2471954..ceb44dfbe6 100644
--- a/third_party/rust/wgpu-hal/src/vulkan/command.rs
+++ b/third_party/rust/wgpu-hal/src/vulkan/command.rs
@@ -104,6 +104,11 @@ impl crate::CommandEncoder for super::CommandEncoder {
     }
 
     unsafe fn discard_encoding(&mut self) {
+        // Safe use requires that this not be called in the "closed" state, so the
+        // buffer shouldn't be null. Assert this to make sure we're not pushing null
+        // buffers to the discard pile.
+        assert_ne!(self.active, vk::CommandBuffer::null());
+
         self.discarded.push(self.active);
         self.active = vk::CommandBuffer::null();
     }
diff --git a/third_party/rust/wgpu-hal/src/vulkan/conv.rs b/third_party/rust/wgpu-hal/src/vulkan/conv.rs
index 8202c93aa3..fe284f32a9 100644
--- a/third_party/rust/wgpu-hal/src/vulkan/conv.rs
+++ b/third_party/rust/wgpu-hal/src/vulkan/conv.rs
@@ -399,6 +399,7 @@ pub fn map_vertex_format(vertex_format: wgt::VertexFormat) -> vk::Format {
         Vf::Float64x2 => vk::Format::R64G64_SFLOAT,
         Vf::Float64x3 => vk::Format::R64G64B64_SFLOAT,
         Vf::Float64x4 => vk::Format::R64G64B64A64_SFLOAT,
+        Vf::Unorm10_10_10_2 => vk::Format::A2B10G10R10_UNORM_PACK32,
     }
 }
 
diff --git a/third_party/rust/wgpu-hal/src/vulkan/device.rs b/third_party/rust/wgpu-hal/src/vulkan/device.rs
index 70028cc700..ec392533a0 100644
--- a/third_party/rust/wgpu-hal/src/vulkan/device.rs
+++ b/third_party/rust/wgpu-hal/src/vulkan/device.rs
@@ -2,6 +2,7 @@ use super::conv;
 
 use arrayvec::ArrayVec;
 use ash::{extensions::khr, vk};
+use naga::back::spv::ZeroInitializeWorkgroupMemoryMode;
 use parking_lot::Mutex;
 
 use std::{
@@ -737,7 +738,8 @@ impl super::Device {
                 };
                 let needs_temp_options = !runtime_checks
                     || !binding_map.is_empty()
-                    || naga_shader.debug_source.is_some();
+                    || naga_shader.debug_source.is_some()
+                    || !stage.zero_initialize_workgroup_memory;
                 let mut temp_options;
                 let options = if needs_temp_options {
                     temp_options = self.naga_options.clone();
@@ -760,27 +762,40 @@ impl super::Device {
                             file_name: debug.file_name.as_ref().as_ref(),
                         })
                     }
+                    if !stage.zero_initialize_workgroup_memory {
+                        temp_options.zero_initialize_workgroup_memory =
+                            ZeroInitializeWorkgroupMemoryMode::None;
+                    }
 
                     &temp_options
                 } else {
                     &self.naga_options
                 };
+
+                let (module, info) = naga::back::pipeline_constants::process_overrides(
+                    &naga_shader.module,
+                    &naga_shader.info,
+                    stage.constants,
+                )
+                .map_err(|e| crate::PipelineError::Linkage(stage_flags, format!("{e}")))?;
+
                 let spv = {
                     profiling::scope!("naga::spv::write_vec");
-                    naga::back::spv::write_vec(
-                        &naga_shader.module,
-                        &naga_shader.info,
-                        options,
-                        Some(&pipeline_options),
-                    )
+                    naga::back::spv::write_vec(&module, &info, options, Some(&pipeline_options))
                 }
                 .map_err(|e| crate::PipelineError::Linkage(stage_flags, format!("{e}")))?;
                 self.create_shader_module_impl(&spv)?
             }
         };
 
+        let mut flags = vk::PipelineShaderStageCreateFlags::empty();
+        if self.shared.private_caps.subgroup_size_control {
+            flags |= vk::PipelineShaderStageCreateFlags::ALLOW_VARYING_SUBGROUP_SIZE
+        }
+
         let entry_point = CString::new(stage.entry_point).unwrap();
         let create_info = vk::PipelineShaderStageCreateInfo::builder()
+            .flags(flags)
             .stage(conv::map_shader_stage(stage_flags))
             .module(vk_module)
             .name(&entry_point)
@@ -1587,6 +1602,7 @@ impl crate::Device for super::Device {
                     .shared
                     .workarounds
                     .contains(super::Workarounds::SEPARATE_ENTRY_POINTS)
+                    || !naga_shader.module.overrides.is_empty()
                 {
                     return Ok(super::ShaderModule::Intermediate {
                         naga_shader,
diff --git a/third_party/rust/wgpu-hal/src/vulkan/mod.rs b/third_party/rust/wgpu-hal/src/vulkan/mod.rs
index 0cd385045c..d1ea82772e 100644
--- a/third_party/rust/wgpu-hal/src/vulkan/mod.rs
+++ b/third_party/rust/wgpu-hal/src/vulkan/mod.rs
@@ -238,6 +238,7 @@ struct PrivateCapabilities {
     robust_image_access2: bool,
     zero_initialize_workgroup_memory: bool,
     image_format_list: bool,
+    subgroup_size_control: bool,
 }
 
 bitflags::bitflags!(
@@ -413,6 +414,15 @@ pub struct TextureView {
     attachment: FramebufferAttachment,
 }
 
+impl TextureView {
+    /// # Safety
+    ///
+    /// - The image view handle must not be manually destroyed
+    pub unsafe fn raw_handle(&self) -> vk::ImageView {
+        self.raw
+    }
+}
+
 #[derive(Debug)]
 pub struct Sampler {
     raw: vk::Sampler,
@@ -438,6 +448,7 @@ pub struct BindGroup {
     set: gpu_descriptor::DescriptorSet<vk::DescriptorSet>,
 }
 
+/// Miscellaneous allocation recycling pool for `CommandEncoder`.
 #[derive(Default)]
 struct Temp {
     marker: Vec<u8>,
@@ -467,11 +478,31 @@ impl Temp {
 pub struct CommandEncoder {
     raw: vk::CommandPool,
     device: Arc<DeviceShared>,
+
+    /// The current command buffer, if `self` is in the ["recording"]
+    /// state.
+    ///
+    /// ["recording"]: crate::CommandEncoder
+    ///
+    /// If non-`null`, the buffer is in the Vulkan "recording" state.
     active: vk::CommandBuffer,
+
+    /// What kind of pass we are currently within: compute or render.
     bind_point: vk::PipelineBindPoint,
+
+    /// Allocation recycling pool for this encoder.
     temp: Temp,
+
+    /// A pool of available command buffers.
+    ///
+    /// These are all in the Vulkan "initial" state.
     free: Vec<vk::CommandBuffer>,
+
+    /// A pool of discarded command buffers.
+    ///
+    /// These could be in any Vulkan state except "pending".
     discarded: Vec<vk::CommandBuffer>,
+
     /// If this is true, the active renderpass enabled a debug span,
     /// and needs to be disabled on renderpass close.
     rpass_debug_marker_active: bool,
@@ -481,6 +512,15 @@ pub struct CommandEncoder {
     end_of_pass_timer_query: Option<(vk::QueryPool, u32)>,
 }
 
+impl CommandEncoder {
+    /// # Safety
+    ///
+    /// - The command buffer handle must not be manually destroyed
+    pub unsafe fn raw_handle(&self) -> vk::CommandBuffer {
+        self.active
+    }
+}
+
 impl fmt::Debug for CommandEncoder {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         f.debug_struct("CommandEncoder")
@@ -519,9 +559,47 @@ pub struct QuerySet {
     raw: vk::QueryPool,
 }
 
+/// The [`Api::Fence`] type for [`vulkan::Api`].
+///
+/// This is an `enum` because there are two possible implementations of
+/// `wgpu-hal` fences on Vulkan: Vulkan fences, which work on any version of
+/// Vulkan, and Vulkan timeline semaphores, which are easier and cheaper but
+/// require non-1.0 features.
+///
+/// [`Device::create_fence`] returns a [`TimelineSemaphore`] if
+/// [`VK_KHR_timeline_semaphore`] is available and enabled, and a [`FencePool`]
+/// otherwise.
+///
+/// [`Api::Fence`]: crate::Api::Fence
+/// [`vulkan::Api`]: Api
+/// [`Device::create_fence`]: crate::Device::create_fence
+/// [`TimelineSemaphore`]: Fence::TimelineSemaphore
+/// [`VK_KHR_timeline_semaphore`]: https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VK_KHR_timeline_semaphore
+/// [`FencePool`]: Fence::FencePool
 #[derive(Debug)]
 pub enum Fence {
+    /// A Vulkan [timeline semaphore].
+    ///
+    /// These are simpler to use than Vulkan fences, since timeline semaphores
+    /// work exactly the way [`wgpu_hal::Api::Fence`] is specified to work.
+    ///
+    /// [timeline semaphore]: https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#synchronization-semaphores
+    /// [`wgpu_hal::Api::Fence`]: crate::Api::Fence
     TimelineSemaphore(vk::Semaphore),
+
+    /// A collection of Vulkan [fence]s, each associated with a [`FenceValue`].
+    ///
+    /// The effective [`FenceValue`] of this variant is the greater of
+    /// `last_completed` and the maximum value associated with a signaled fence
+    /// in `active`.
+    ///
+    /// Fences are available in all versions of Vulkan, but since they only have
+    /// two states, "signaled" and "unsignaled", we need to use a separate fence
+    /// for each queue submission we might want to wait for, and remember which
+    /// [`FenceValue`] each one represents.
+    ///
+    /// [fence]: https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#synchronization-fences
+    /// [`FenceValue`]: crate::FenceValue
     FencePool {
         last_completed: crate::FenceValue,
         /// The pending fence values have to be ascending.
@@ -531,21 +609,32 @@ pub enum Fence {
 }
 
 impl Fence {
+    /// Return the highest [`FenceValue`] among the signaled fences in `active`.
+    ///
+    /// As an optimization, assume that we already know that the fence has
+    /// reached `last_completed`, and don't bother checking fences whose values
+    /// are less than or equal to that: those fences remain in the `active` array
+    /// only because we haven't called `maintain` yet to clean them up.
+    ///
+    /// [`FenceValue`]: crate::FenceValue
     fn check_active(
         device: &ash::Device,
-        mut max_value: crate::FenceValue,
+        mut last_completed: crate::FenceValue,
         active: &[(crate::FenceValue, vk::Fence)],
     ) -> Result<crate::FenceValue, crate::DeviceError> {
         for &(value, raw) in active.iter() {
             unsafe {
-                if value > max_value && device.get_fence_status(raw)? {
-                    max_value = value;
+                if value > last_completed && device.get_fence_status(raw)? {
+                    last_completed = value;
                 }
             }
         }
-        Ok(max_value)
+        Ok(last_completed)
     }
 
+    /// Return the highest signaled [`FenceValue`] for `self`.
+    ///
+    /// [`FenceValue`]: crate::FenceValue
     fn get_latest(
         &self,
         device: &ash::Device,
@@ -566,6 +655,18 @@ impl Fence {
         }
     }
 
+    /// Trim the internal state of this [`Fence`].
+    ///
+    /// This function has no externally visible effect, but you should call it
+    /// periodically to keep this fence's resource consumption under control.
+    ///
+    /// For fences using the [`FencePool`] implementation, this function
+    /// recycles fences that have been signaled. If you don't call this,
+    /// [`Queue::submit`] will just keep allocating a new Vulkan fence every
+    /// time it's called.
+    ///
+    /// [`FencePool`]: Fence::FencePool
+    /// [`Queue::submit`]: crate::Queue::submit
     fn maintain(&mut self, device: &ash::Device) -> Result<(), crate::DeviceError> {
         match *self {
             Self::TimelineSemaphore(_) => {}