14 files changed, 201 insertions, 204 deletions
diff --git a/library/stdarch/crates/core_arch/src/x86/avx.rs b/library/stdarch/crates/core_arch/src/x86/avx.rs
index ad9e68db6..f8e83a35b 100644
--- a/library/stdarch/crates/core_arch/src/x86/avx.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx.rs
@@ -2450,7 +2450,7 @@ pub unsafe fn _mm256_set1_epi8(a: i8) -> __m256i {
     )
 }
 
-/// Broadcasts 16-bit integer `a` to all all elements of returned vector.
+/// Broadcasts 16-bit integer `a` to all elements of returned vector.
 /// This intrinsic may generate the `vpbroadcastw`.
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set1_epi16)
diff --git a/library/stdarch/crates/core_arch/src/x86/avx2.rs b/library/stdarch/crates/core_arch/src/x86/avx2.rs
index 16add3dbb..8638b3136 100644
--- a/library/stdarch/crates/core_arch/src/x86/avx2.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx2.rs
@@ -1857,7 +1857,9 @@ pub unsafe fn _mm256_maskstore_epi64(mem_addr: *mut i64, mask: __m256i, a: __m25
 #[cfg_attr(test, assert_instr(vpmaxsw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_max_epi16(a: __m256i, b: __m256i) -> __m256i {
-    transmute(pmaxsw(a.as_i16x16(), b.as_i16x16()))
+    let a = a.as_i16x16();
+    let b = b.as_i16x16();
+    transmute(simd_select::<i16x16, _>(simd_gt(a, b), a, b))
 }
 
 /// Compares packed 32-bit integers in `a` and `b`, and returns the packed
@@ -1869,7 +1871,9 @@ pub unsafe fn _mm256_max_epi16(a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpmaxsd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_max_epi32(a: __m256i, b: __m256i) -> __m256i {
-    transmute(pmaxsd(a.as_i32x8(), b.as_i32x8()))
+    let a = a.as_i32x8();
+    let b = b.as_i32x8();
+    transmute(simd_select::<i32x8, _>(simd_gt(a, b), a, b))
 }
 
 /// Compares packed 8-bit integers in `a` and `b`, and returns the packed
@@ -1881,7 +1885,9 @@ pub unsafe fn _mm256_max_epi32(a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpmaxsb))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_max_epi8(a: __m256i, b: __m256i) -> __m256i {
-    transmute(pmaxsb(a.as_i8x32(), b.as_i8x32()))
+    let a = a.as_i8x32();
+    let b = b.as_i8x32();
+    transmute(simd_select::<i8x32, _>(simd_gt(a, b), a, b))
 }
 
 /// Compares packed unsigned 16-bit integers in `a` and `b`, and returns
@@ -1893,7 +1899,9 @@ pub unsafe fn _mm256_max_epi8(a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpmaxuw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_max_epu16(a: __m256i, b: __m256i) -> __m256i {
-    transmute(pmaxuw(a.as_u16x16(), b.as_u16x16()))
+    let a = a.as_u16x16();
+    let b = b.as_u16x16();
+    transmute(simd_select::<i16x16, _>(simd_gt(a, b), a, b))
 }
 
 /// Compares packed unsigned 32-bit integers in `a` and `b`, and returns
@@ -1905,7 +1913,9 @@ pub unsafe fn _mm256_max_epu16(a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpmaxud))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_max_epu32(a: __m256i, b: __m256i) -> __m256i {
-    transmute(pmaxud(a.as_u32x8(), b.as_u32x8()))
+    let a = a.as_u32x8();
+    let b = b.as_u32x8();
+    transmute(simd_select::<i32x8, _>(simd_gt(a, b), a, b))
 }
 
 /// Compares packed unsigned 8-bit integers in `a` and `b`, and returns
@@ -1917,7 +1927,9 @@ pub unsafe fn _mm256_max_epu32(a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpmaxub))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_max_epu8(a: __m256i, b: __m256i) -> __m256i {
-    transmute(pmaxub(a.as_u8x32(), b.as_u8x32()))
+    let a = a.as_u8x32();
+    let b = b.as_u8x32();
+    transmute(simd_select::<i8x32, _>(simd_gt(a, b), a, b))
 }
 
 /// Compares packed 16-bit integers in `a` and `b`, and returns the packed
@@ -1929,7 +1941,9 @@ pub unsafe fn _mm256_max_epu8(a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpminsw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_min_epi16(a: __m256i, b: __m256i) -> __m256i {
-    transmute(pminsw(a.as_i16x16(), b.as_i16x16()))
+    let a = a.as_i16x16();
+    let b = b.as_i16x16();
+    transmute(simd_select::<i16x16, _>(simd_lt(a, b), a, b))
 }
 
 /// Compares packed 32-bit integers in `a` and `b`, and returns the packed
@@ -1941,7 +1955,9 @@ pub unsafe fn _mm256_min_epi16(a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpminsd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_min_epi32(a: __m256i, b: __m256i) -> __m256i {
-    transmute(pminsd(a.as_i32x8(), b.as_i32x8()))
+    let a = a.as_i32x8();
+    let b = b.as_i32x8();
+    transmute(simd_select::<i32x8, _>(simd_lt(a, b), a, b))
 }
 
 /// Compares packed 8-bit integers in `a` and `b`, and returns the packed
@@ -1953,7 +1969,9 @@ pub unsafe fn _mm256_min_epi32(a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpminsb))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_min_epi8(a: __m256i, b: __m256i) -> __m256i {
-    transmute(pminsb(a.as_i8x32(), b.as_i8x32()))
+    let a = a.as_i8x32();
+    let b = b.as_i8x32();
+    transmute(simd_select::<i8x32, _>(simd_lt(a, b), a, b))
 }
 
 /// Compares packed unsigned 16-bit integers in `a` and `b`, and returns
@@ -1965,7 +1983,9 @@ pub unsafe fn _mm256_min_epi8(a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpminuw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_min_epu16(a: __m256i, b: __m256i) -> __m256i {
-    transmute(pminuw(a.as_u16x16(), b.as_u16x16()))
+    let a = a.as_u16x16();
+    let b = b.as_u16x16();
+    transmute(simd_select::<i16x16, _>(simd_lt(a, b), a, b))
 }
 
 /// Compares packed unsigned 32-bit integers in `a` and `b`, and returns
@@ -1977,7 +1997,9 @@ pub unsafe fn _mm256_min_epu16(a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpminud))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_min_epu32(a: __m256i, b: __m256i) -> __m256i {
-    transmute(pminud(a.as_u32x8(), b.as_u32x8()))
+    let a = a.as_u32x8();
+    let b = b.as_u32x8();
+    transmute(simd_select::<i32x8, _>(simd_lt(a, b), a, b))
 }
 
 /// Compares packed unsigned 8-bit integers in `a` and `b`, and returns
@@ -1989,7 +2011,9 @@ pub unsafe fn _mm256_min_epu32(a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpminub))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_min_epu8(a: __m256i, b: __m256i) -> __m256i {
-    transmute(pminub(a.as_u8x32(), b.as_u8x32()))
+    let a = a.as_u8x32();
+    let b = b.as_u8x32();
+    transmute(simd_select::<i8x32, _>(simd_lt(a, b), a, b))
 }
 
 /// Creates mask from the most significant bit of each 8-bit element in `a`,
@@ -2001,7 +2025,9 @@ pub unsafe fn _mm256_min_epu8(a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpmovmskb))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_movemask_epi8(a: __m256i) -> i32 {
-    simd_bitmask::<_, u32>(a.as_i8x32()) as i32
+    let z = i8x32::splat(0);
+    let m: i8x32 = simd_lt(a.as_i8x32(), z);
+    simd_bitmask::<_, u32>(m) as i32
 }
 
 /// Computes the sum of absolute differences (SADs) of quadruplets of unsigned
@@ -3618,30 +3644,6 @@ extern "C" {
     fn maskstoreq(mem_addr: *mut i8, mask: i64x2, a: i64x2);
     #[link_name = "llvm.x86.avx2.maskstore.q.256"]
     fn maskstoreq256(mem_addr: *mut i8, mask: i64x4, a: i64x4);
-    #[link_name = "llvm.x86.avx2.pmaxs.w"]
-    fn pmaxsw(a: i16x16, b: i16x16) -> i16x16;
-    #[link_name = "llvm.x86.avx2.pmaxs.d"]
-    fn pmaxsd(a: i32x8, b: i32x8) -> i32x8;
-    #[link_name = "llvm.x86.avx2.pmaxs.b"]
-    fn pmaxsb(a: i8x32, b: i8x32) -> i8x32;
-    #[link_name = "llvm.x86.avx2.pmaxu.w"]
-    fn pmaxuw(a: u16x16, b: u16x16) -> u16x16;
-    #[link_name = "llvm.x86.avx2.pmaxu.d"]
-    fn pmaxud(a: u32x8, b: u32x8) -> u32x8;
-    #[link_name = "llvm.x86.avx2.pmaxu.b"]
-    fn pmaxub(a: u8x32, b: u8x32) -> u8x32;
-    #[link_name = "llvm.x86.avx2.pmins.w"]
-    fn pminsw(a: i16x16, b: i16x16) -> i16x16;
-    #[link_name = "llvm.x86.avx2.pmins.d"]
-    fn pminsd(a: i32x8, b: i32x8) -> i32x8;
-    #[link_name = "llvm.x86.avx2.pmins.b"]
-    fn pminsb(a: i8x32, b: i8x32) -> i8x32;
-    #[link_name = "llvm.x86.avx2.pminu.w"]
-    fn pminuw(a: u16x16, b: u16x16) -> u16x16;
-    #[link_name = "llvm.x86.avx2.pminu.d"]
-    fn pminud(a: u32x8, b: u32x8) -> u32x8;
-    #[link_name = "llvm.x86.avx2.pminu.b"]
-    fn pminub(a: u8x32, b: u8x32) -> u8x32;
     #[link_name = "llvm.x86.avx2.mpsadbw"]
     fn mpsadbw(a: u8x32, b: u8x32, imm8: i32) -> u16x16;
     #[link_name = "llvm.x86.avx2.pmulhu.w"]
diff --git a/library/stdarch/crates/core_arch/src/x86/avx512bf16.rs b/library/stdarch/crates/core_arch/src/x86/avx512bf16.rs
index e9977e018..b21ededab 100644
--- a/library/stdarch/crates/core_arch/src/x86/avx512bf16.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx512bf16.rs
@@ -80,7 +80,7 @@ pub unsafe fn _mm256_cvtne2ps_pbh(a: __m256, b: __m256) -> __m256bh {
 }
 
 /// Convert packed single-precision (32-bit) floating-point elements in two vectors a and b
-/// to packed BF16 (16-bit) floating-point elements and and store the results in single vector
+/// to packed BF16 (16-bit) floating-point elements and store the results in single vector
 /// dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654&avx512techs=AVX512_BF16&text=_mm256_mask_cvtne2ps_pbh)
 #[inline]
diff --git a/library/stdarch/crates/core_arch/src/x86/avx512bitalg.rs b/library/stdarch/crates/core_arch/src/x86/avx512bitalg.rs
index 3c9df3912..1099ee2cb 100644
--- a/library/stdarch/crates/core_arch/src/x86/avx512bitalg.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx512bitalg.rs
@@ -303,7 +303,7 @@ pub unsafe fn _mm_mask_popcnt_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __
 }
 
 /// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers.
-/// Then groups 8 8-bit values from `c`as indices into the the bits of the corresponding 64-bit integer.
+/// Then groups 8 8-bit values from `c` as indices into the bits of the corresponding 64-bit integer.
 /// It then selects these bits and packs them into the output.
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_bitshuffle_epi64_mask)
@@ -315,7 +315,7 @@ pub unsafe fn _mm512_bitshuffle_epi64_mask(b: __m512i, c: __m512i) -> __mmask64
 }
 
 /// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers.
-/// Then groups 8 8-bit values from `c`as indices into the the bits of the corresponding 64-bit integer.
+/// Then groups 8 8-bit values from `c` as indices into the bits of the corresponding 64-bit integer.
 /// It then selects these bits and packs them into the output.
 ///
 /// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
@@ -330,7 +330,7 @@ pub unsafe fn _mm512_mask_bitshuffle_epi64_mask(k: __mmask64, b: __m512i, c: __m
 }
 
 /// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers.
-/// Then groups 8 8-bit values from `c`as indices into the the bits of the corresponding 64-bit integer.
+/// Then groups 8 8-bit values from `c` as indices into the bits of the corresponding 64-bit integer.
 /// It then selects these bits and packs them into the output.
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bitshuffle_epi64_mask)
@@ -342,7 +342,7 @@ pub unsafe fn _mm256_bitshuffle_epi64_mask(b: __m256i, c: __m256i) -> __mmask32
 }
 
 /// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers.
-/// Then groups 8 8-bit values from `c`as indices into the the bits of the corresponding 64-bit integer.
+/// Then groups 8 8-bit values from `c` as indices into the bits of the corresponding 64-bit integer.
 /// It then selects these bits and packs them into the output.
 ///
 /// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
@@ -357,7 +357,7 @@ pub unsafe fn _mm256_mask_bitshuffle_epi64_mask(k: __mmask32, b: __m256i, c: __m
 }
 
 /// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers.
-/// Then groups 8 8-bit values from `c`as indices into the the bits of the corresponding 64-bit integer.
+/// Then groups 8 8-bit values from `c` as indices into the bits of the corresponding 64-bit integer.
 /// It then selects these bits and packs them into the output.
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bitshuffle_epi64_mask)
@@ -369,7 +369,7 @@ pub unsafe fn _mm_bitshuffle_epi64_mask(b: __m128i, c: __m128i) -> __mmask16 {
 }
 
 /// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers.
-/// Then groups 8 8-bit values from `c`as indices into the the bits of the corresponding 64-bit integer.
+/// Then groups 8 8-bit values from `c` as indices into the bits of the corresponding 64-bit integer.
 /// It then selects these bits and packs them into the output.
 ///
 /// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
diff --git a/library/stdarch/crates/core_arch/src/x86/avx512bw.rs b/library/stdarch/crates/core_arch/src/x86/avx512bw.rs
index 49d78ed60..fbf71dfc4 100644
--- a/library/stdarch/crates/core_arch/src/x86/avx512bw.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx512bw.rs
@@ -7450,7 +7450,7 @@ pub unsafe fn _mm_maskz_set1_epi8(k: __mmask16, a: i8) -> __m128i {
     transmute(simd_select_bitmask(k, r, zero))
 }
 
-/// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from from a to dst.
+/// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from a to dst.
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shufflelo_epi16&expand=5221)
 #[inline]
@@ -7501,7 +7501,7 @@ pub unsafe fn _mm512_shufflelo_epi16<const IMM8: i32>(a: __m512i) -> __m512i {
     transmute(r)
 }
 
-/// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set).
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shufflelo_epi16&expand=5219)
 #[inline]
@@ -7518,7 +7518,7 @@ pub unsafe fn _mm512_mask_shufflelo_epi16<const IMM8: i32>(
     transmute(simd_select_bitmask(k, r.as_i16x32(), src.as_i16x32()))
 }
 
-/// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from from a to dst, using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+/// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from a to dst, using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shufflelo_epi16&expand=5220)
 #[inline]
@@ -7532,7 +7532,7 @@ pub unsafe fn _mm512_maskz_shufflelo_epi16<const IMM8: i32>(k: __mmask32, a: __m
     transmute(simd_select_bitmask(k, r.as_i16x32(), zero))
 }
 
-/// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set).
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shufflelo_epi16&expand=5216)
 #[inline]
@@ -7549,7 +7549,7 @@ pub unsafe fn _mm256_mask_shufflelo_epi16<const IMM8: i32>(
     transmute(simd_select_bitmask(k, shuffle.as_i16x16(), src.as_i16x16()))
 }
 
-/// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from a to dst, using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shufflelo_epi16&expand=5217)
 #[inline]
@@ -7563,7 +7563,7 @@ pub unsafe fn _mm256_maskz_shufflelo_epi16<const IMM8: i32>(k: __mmask16, a: __m
     transmute(simd_select_bitmask(k, shuffle.as_i16x16(), zero))
 }
 
-/// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set).
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shufflelo_epi16&expand=5213)
 #[inline]
@@ -7580,7 +7580,7 @@ pub unsafe fn _mm_mask_shufflelo_epi16<const IMM8: i32>(
     transmute(simd_select_bitmask(k, shuffle.as_i16x8(), src.as_i16x8()))
 }
 
-/// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from a to dst, using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shufflelo_epi16&expand=5214)
 #[inline]
@@ -7594,7 +7594,7 @@ pub unsafe fn _mm_maskz_shufflelo_epi16<const IMM8: i32>(k: __mmask8, a: __m128i
     transmute(simd_select_bitmask(k, shuffle.as_i16x8(), zero))
 }
 
-/// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from from a to dst.
+/// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from a to dst.
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shufflehi_epi16&expand=5212)
 #[inline]
@@ -7645,7 +7645,7 @@ pub unsafe fn _mm512_shufflehi_epi16<const IMM8: i32>(a: __m512i) -> __m512i {
     transmute(r)
 }
 
-/// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set).
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shufflehi_epi16&expand=5210)
 #[inline]
@@ -7662,7 +7662,7 @@ pub unsafe fn _mm512_mask_shufflehi_epi16<const IMM8: i32>(
     transmute(simd_select_bitmask(k, r.as_i16x32(), src.as_i16x32()))
 }
 
-/// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from from a to dst, using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+/// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from a to dst, using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shufflehi_epi16&expand=5211)
 #[inline]
@@ -7676,7 +7676,7 @@ pub unsafe fn _mm512_maskz_shufflehi_epi16<const IMM8: i32>(k: __mmask32, a: __m
     transmute(simd_select_bitmask(k, r.as_i16x32(), zero))
 }
 
-/// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set).
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shufflehi_epi16&expand=5207)
 #[inline]
@@ -7693,7 +7693,7 @@ pub unsafe fn _mm256_mask_shufflehi_epi16<const IMM8: i32>(
     transmute(simd_select_bitmask(k, shuffle.as_i16x16(), src.as_i16x16()))
 }
 
-/// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from from a to dst, using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+/// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from a to dst, using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shufflehi_epi16&expand=5208)
 #[inline]
@@ -7707,7 +7707,7 @@ pub unsafe fn _mm256_maskz_shufflehi_epi16<const IMM8: i32>(k: __mmask16, a: __m
     transmute(simd_select_bitmask(k, shuffle.as_i16x16(), zero))
 }
 
-/// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set).
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shufflehi_epi16&expand=5204)
 #[inline]
@@ -7724,7 +7724,7 @@ pub unsafe fn _mm_mask_shufflehi_epi16<const IMM8: i32>(
     transmute(simd_select_bitmask(k, shuffle.as_i16x8(), src.as_i16x8()))
 }
 
-/// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from from a to dst, using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+/// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from a to dst, using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shufflehi_epi16&expand=5205)
 #[inline]
diff --git a/library/stdarch/crates/core_arch/src/x86/avx512f.rs b/library/stdarch/crates/core_arch/src/x86/avx512f.rs
index f70a28466..0ddb51283 100644
--- a/library/stdarch/crates/core_arch/src/x86/avx512f.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx512f.rs
@@ -26268,7 +26268,7 @@ pub unsafe fn _mm512_set1_epi8(a: i8) -> __m512i {
     transmute(i8x64::splat(a))
 }
 
-/// Broadcast the low packed 16-bit integer from a to all all elements of dst.
+/// Broadcast the low packed 16-bit integer from a to all elements of dst.
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set1_epi16&expand=4944)
 #[inline]
diff --git a/library/stdarch/crates/core_arch/src/x86/avx512gfni.rs b/library/stdarch/crates/core_arch/src/x86/gfni.rs
index 66fd1c2e1..679b2548a 100644
--- a/library/stdarch/crates/core_arch/src/x86/avx512gfni.rs
+++ b/library/stdarch/crates/core_arch/src/x86/gfni.rs
@@ -65,7 +65,7 @@ extern "C" {
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_gf2p8mul_epi8)
 #[inline]
-#[target_feature(enable = "avx512gfni,avx512bw,avx512f")]
+#[target_feature(enable = "gfni,avx512bw,avx512f")]
 #[cfg_attr(test, assert_instr(vgf2p8mulb))]
 pub unsafe fn _mm512_gf2p8mul_epi8(a: __m512i, b: __m512i) -> __m512i {
     transmute(vgf2p8mulb_512(a.as_i8x64(), b.as_i8x64()))
@@ -80,7 +80,7 @@ pub unsafe fn _mm512_gf2p8mul_epi8(a: __m512i, b: __m512i) -> __m512i {
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_gf2p8mul_epi8)
 #[inline]
-#[target_feature(enable = "avx512gfni,avx512bw,avx512f")]
+#[target_feature(enable = "gfni,avx512bw,avx512f")]
 #[cfg_attr(test, assert_instr(vgf2p8mulb))]
 pub unsafe fn _mm512_mask_gf2p8mul_epi8(
     src: __m512i,
@@ -104,7 +104,7 @@ pub unsafe fn _mm512_mask_gf2p8mul_epi8(
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_gf2p8mul_epi8)
 #[inline]
-#[target_feature(enable = "avx512gfni,avx512bw,avx512f")]
+#[target_feature(enable = "gfni,avx512bw,avx512f")]
 #[cfg_attr(test, assert_instr(vgf2p8mulb))]
 pub unsafe fn _mm512_maskz_gf2p8mul_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
     let zero = _mm512_setzero_si512().as_i8x64();
@@ -121,7 +121,7 @@ pub unsafe fn _mm512_maskz_gf2p8mul_epi8(k: __mmask64, a: __m512i, b: __m512i) -
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_gf2p8mul_epi8)
 #[inline]
-#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
+#[target_feature(enable = "gfni,avx")]
 #[cfg_attr(test, assert_instr(vgf2p8mulb))]
 pub unsafe fn _mm256_gf2p8mul_epi8(a: __m256i, b: __m256i) -> __m256i {
     transmute(vgf2p8mulb_256(a.as_i8x32(), b.as_i8x32()))
@@ -136,7 +136,7 @@ pub unsafe fn _mm256_gf2p8mul_epi8(a: __m256i, b: __m256i) -> __m256i {
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_gf2p8mul_epi8)
 #[inline]
-#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
+#[target_feature(enable = "gfni,avx512bw,avx512vl")]
 #[cfg_attr(test, assert_instr(vgf2p8mulb))]
 pub unsafe fn _mm256_mask_gf2p8mul_epi8(
     src: __m256i,
@@ -160,7 +160,7 @@ pub unsafe fn _mm256_mask_gf2p8mul_epi8(
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_gf2p8mul_epi8)
 #[inline]
-#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
+#[target_feature(enable = "gfni,avx512bw,avx512vl")]
 #[cfg_attr(test, assert_instr(vgf2p8mulb))]
 pub unsafe fn _mm256_maskz_gf2p8mul_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
     let zero = _mm256_setzero_si256().as_i8x32();
@@ -177,8 +177,8 @@ pub unsafe fn _mm256_maskz_gf2p8mul_epi8(k: __mmask32, a: __m256i, b: __m256i) -
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_gf2p8mul_epi8)
 #[inline]
-#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
-#[cfg_attr(test, assert_instr(vgf2p8mulb))]
+#[target_feature(enable = "gfni")]
+#[cfg_attr(test, assert_instr(gf2p8mulb))]
 pub unsafe fn _mm_gf2p8mul_epi8(a: __m128i, b: __m128i) -> __m128i {
     transmute(vgf2p8mulb_128(a.as_i8x16(), b.as_i8x16()))
 }
@@ -192,7 +192,7 @@ pub unsafe fn _mm_gf2p8mul_epi8(a: __m128i, b: __m128i) -> __m128i {
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_gf2p8mul_epi8)
 #[inline]
-#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
+#[target_feature(enable = "gfni,avx512bw,avx512vl")]
 #[cfg_attr(test, assert_instr(vgf2p8mulb))]
 pub unsafe fn _mm_mask_gf2p8mul_epi8(
     src: __m128i,
@@ -216,7 +216,7 @@ pub unsafe fn _mm_mask_gf2p8mul_epi8(
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_gf2p8mul_epi8)
 #[inline]
-#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
+#[target_feature(enable = "gfni,avx512bw,avx512vl")]
 #[cfg_attr(test, assert_instr(vgf2p8mulb))]
 pub unsafe fn _mm_maskz_gf2p8mul_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
     let zero = _mm_setzero_si128().as_i8x16();
@@ -234,7 +234,7 @@ pub unsafe fn _mm_maskz_gf2p8mul_epi8(k: __mmask16, a: __m128i, b: __m128i) -> _
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_gf2p8affine_epi8)
 #[inline]
-#[target_feature(enable = "avx512gfni,avx512bw,avx512f")]
+#[target_feature(enable = "gfni,avx512bw,avx512f")]
 #[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn _mm512_gf2p8affine_epi64_epi8<const B: i32>(x: __m512i, a: __m512i) -> __m512i {
@@ -256,7 +256,7 @@ pub unsafe fn _mm512_gf2p8affine_epi64_epi8<const B: i32>(x: __m512i, a: __m512i
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_gf2p8affine_epi8)
 #[inline]
-#[target_feature(enable = "avx512gfni,avx512bw,avx512f")]
+#[target_feature(enable = "gfni,avx512bw,avx512f")]
 #[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn _mm512_maskz_gf2p8affine_epi64_epi8<const B: i32>(
@@ -283,7 +283,7 @@ pub unsafe fn _mm512_maskz_gf2p8affine_epi64_epi8<const B: i32>(
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_gf2p8affine_epi8)
 #[inline]
-#[target_feature(enable = "avx512gfni,avx512bw,avx512f")]
+#[target_feature(enable = "gfni,avx512bw,avx512f")]
 #[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
 #[rustc_legacy_const_generics(4)]
 pub unsafe fn _mm512_mask_gf2p8affine_epi64_epi8<const B: i32>(
@@ -307,7 +307,7 @@ pub unsafe fn _mm512_mask_gf2p8affine_epi64_epi8<const B: i32>(
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_gf2p8affine_epi8)
 #[inline]
-#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
+#[target_feature(enable = "gfni,avx")]
 #[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn _mm256_gf2p8affine_epi64_epi8<const B: i32>(x: __m256i, a: __m256i) -> __m256i {
@@ -329,7 +329,7 @@ pub unsafe fn _mm256_gf2p8affine_epi64_epi8<const B: i32>(x: __m256i, a: __m256i
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_gf2p8affine_epi8)
 #[inline]
-#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
+#[target_feature(enable = "gfni,avx512bw,avx512vl")]
 #[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn _mm256_maskz_gf2p8affine_epi64_epi8<const B: i32>(
@@ -356,7 +356,7 @@ pub unsafe fn _mm256_maskz_gf2p8affine_epi64_epi8<const B: i32>(
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_gf2p8affine_epi8)
 #[inline]
-#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
+#[target_feature(enable = "gfni,avx512bw,avx512vl")]
 #[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
 #[rustc_legacy_const_generics(4)]
 pub unsafe fn _mm256_mask_gf2p8affine_epi64_epi8<const B: i32>(
@@ -380,8 +380,8 @@ pub unsafe fn _mm256_mask_gf2p8affine_epi64_epi8<const B: i32>(
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_gf2p8affine_epi8)
 #[inline]
-#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
-#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
+#[target_feature(enable = "gfni")]
+#[cfg_attr(test, assert_instr(gf2p8affineqb, B = 0))]
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn _mm_gf2p8affine_epi64_epi8<const B: i32>(x: __m128i, a: __m128i) -> __m128i {
     static_assert_imm8!(B);
@@ -402,7 +402,7 @@ pub unsafe fn _mm_gf2p8affine_epi64_epi8<const B: i32>(x: __m128i, a: __m128i) -
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_gf2p8affine_epi8)
 #[inline]
-#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
+#[target_feature(enable = "gfni,avx512bw,avx512vl")]
 #[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn _mm_maskz_gf2p8affine_epi64_epi8<const B: i32>(
@@ -429,7 +429,7 @@ pub unsafe fn _mm_maskz_gf2p8affine_epi64_epi8<const B: i32>(
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_gf2p8affine_epi8)
 #[inline]
-#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
+#[target_feature(enable = "gfni,avx512bw,avx512vl")]
 #[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
 #[rustc_legacy_const_generics(4)]
 pub unsafe fn _mm_mask_gf2p8affine_epi64_epi8<const B: i32>(
@@ -455,7 +455,7 @@ pub unsafe fn _mm_mask_gf2p8affine_epi64_epi8<const B: i32>(
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_gf2p8affineinv_epi64_epi8)
 #[inline]
-#[target_feature(enable = "avx512gfni,avx512bw,avx512f")]
+#[target_feature(enable = "gfni,avx512bw,avx512f")]
 #[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn _mm512_gf2p8affineinv_epi64_epi8<const B: i32>(x: __m512i, a: __m512i) -> __m512i {
@@ -479,7 +479,7 @@ pub unsafe fn _mm512_gf2p8affineinv_epi64_epi8<const B: i32>(x: __m512i, a: __m5
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_gf2p8affineinv_epi64_epi8)
 #[inline]
-#[target_feature(enable = "avx512gfni,avx512bw,avx512f")]
+#[target_feature(enable = "gfni,avx512bw,avx512f")]
 #[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn _mm512_maskz_gf2p8affineinv_epi64_epi8<const B: i32>(
@@ -508,7 +508,7 @@ pub unsafe fn _mm512_maskz_gf2p8affineinv_epi64_epi8<const B: i32>(
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_gf2p8affineinv_epi64_epi8)
 #[inline]
-#[target_feature(enable = "avx512gfni,avx512bw,avx512f")]
+#[target_feature(enable = "gfni,avx512bw,avx512f")]
 #[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
 #[rustc_legacy_const_generics(4)]
 pub unsafe fn _mm512_mask_gf2p8affineinv_epi64_epi8<const B: i32>(
@@ -534,7 +534,7 @@ pub unsafe fn _mm512_mask_gf2p8affineinv_epi64_epi8<const B: i32>(
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_gf2p8affineinv_epi64_epi8)
 #[inline]
-#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
+#[target_feature(enable = "gfni,avx")]
 #[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn _mm256_gf2p8affineinv_epi64_epi8<const B: i32>(x: __m256i, a: __m256i) -> __m256i {
@@ -558,7 +558,7 @@ pub unsafe fn _mm256_gf2p8affineinv_epi64_epi8<const B: i32>(x: __m256i, a: __m2
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_gf2p8affineinv_epi64_epi8)
 #[inline]
-#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
+#[target_feature(enable = "gfni,avx512bw,avx512vl")]
 #[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn _mm256_maskz_gf2p8affineinv_epi64_epi8<const B: i32>(
@@ -587,7 +587,7 @@ pub unsafe fn _mm256_maskz_gf2p8affineinv_epi64_epi8<const B: i32>(
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_gf2p8affineinv_epi64_epi8)
 #[inline]
-#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
+#[target_feature(enable = "gfni,avx512bw,avx512vl")]
 #[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
 #[rustc_legacy_const_generics(4)]
 pub unsafe fn _mm256_mask_gf2p8affineinv_epi64_epi8<const B: i32>(
@@ -613,8 +613,8 @@ pub unsafe fn _mm256_mask_gf2p8affineinv_epi64_epi8<const B: i32>(
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_gf2p8affineinv_epi64_epi8)
 #[inline]
-#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
-#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
+#[target_feature(enable = "gfni")]
+#[cfg_attr(test, assert_instr(gf2p8affineinvqb, B = 0))]
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn _mm_gf2p8affineinv_epi64_epi8<const B: i32>(x: __m128i, a: __m128i) -> __m128i {
     static_assert_imm8!(B);
@@ -637,7 +637,7 @@ pub unsafe fn _mm_gf2p8affineinv_epi64_epi8<const B: i32>(x: __m128i, a: __m128i
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_gf2p8affineinv_epi64_epi8)
 #[inline]
-#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
+#[target_feature(enable = "gfni,avx512bw,avx512vl")]
 #[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn _mm_maskz_gf2p8affineinv_epi64_epi8<const B: i32>(
@@ -666,7 +666,7 @@ pub unsafe fn _mm_maskz_gf2p8affineinv_epi64_epi8<const B: i32>(
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_gf2p8affineinv_epi64_epi8)
 #[inline]
-#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
+#[target_feature(enable = "gfni,avx512bw,avx512vl")]
 #[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
 #[rustc_legacy_const_generics(4)]
 pub unsafe fn _mm_mask_gf2p8affineinv_epi64_epi8<const B: i32>(
@@ -847,7 +847,7 @@ mod tests {
         _mm512_loadu_si512(black_box(pointer))
     }
 
-    #[simd_test(enable = "avx512gfni,avx512bw")]
+    #[simd_test(enable = "gfni,avx512bw")]
     unsafe fn test_mm512_gf2p8mul_epi8() {
         let (left, right, expected) = generate_byte_mul_test_data();
 
@@ -860,7 +860,7 @@ mod tests {
         }
     }
 
-    #[simd_test(enable = "avx512gfni,avx512bw")]
+    #[simd_test(enable = "gfni,avx512bw")]
     unsafe fn test_mm512_maskz_gf2p8mul_epi8() {
         let (left, right, _expected) = generate_byte_mul_test_data();
 
@@ -879,7 +879,7 @@ mod tests {
         }
     }
 
-    #[simd_test(enable = "avx512gfni,avx512bw")]
+    #[simd_test(enable = "gfni,avx512bw")]
     unsafe fn test_mm512_mask_gf2p8mul_epi8() {
         let (left, right, _expected) = generate_byte_mul_test_data();
 
@@ -897,7 +897,7 @@ mod tests {
         }
     }
 
-    #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
+    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
     unsafe fn test_mm256_gf2p8mul_epi8() {
         let (left, right, expected) = generate_byte_mul_test_data();
 
@@ -910,7 +910,7 @@ mod tests {
         }
     }
 
-    #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
+    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
     unsafe fn test_mm256_maskz_gf2p8mul_epi8() {
         let (left, right, _expected) = generate_byte_mul_test_data();
 
@@ -929,7 +929,7 @@ mod tests {
         }
     }
 
-    #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
+    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
     unsafe fn test_mm256_mask_gf2p8mul_epi8() {
         let (left, right, _expected) = generate_byte_mul_test_data();
 
@@ -947,7 +947,7 @@ mod tests {
         }
     }
 
-    #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
+    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
     unsafe fn test_mm_gf2p8mul_epi8() {
         let (left, right, expected) = generate_byte_mul_test_data();
 
@@ -960,7 +960,7 @@ mod tests {
         }
     }
 
-    #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
+    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
     unsafe fn test_mm_maskz_gf2p8mul_epi8() {
         let (left, right, _expected) = generate_byte_mul_test_data();
 
@@ -979,7 +979,7 @@ mod tests {
         }
     }
 
-    #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
+    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
     unsafe fn test_mm_mask_gf2p8mul_epi8() {
         let (left, right, _expected) = generate_byte_mul_test_data();
 
@@ -997,7 +997,7 @@ mod tests {
         }
     }
 
-    #[simd_test(enable = "avx512gfni,avx512bw")]
+    #[simd_test(enable = "gfni,avx512bw")]
     unsafe fn test_mm512_gf2p8affine_epi64_epi8() {
         let identity: i64 = 0x01_02_04_08_10_20_40_80;
         const IDENTITY_BYTE: i32 = 0;
@@ -1031,7 +1031,7 @@ mod tests {
         }
     }
 
-    #[simd_test(enable = "avx512gfni,avx512bw")]
+    #[simd_test(enable = "gfni,avx512bw")]
     unsafe fn test_mm512_maskz_gf2p8affine_epi64_epi8() {
         const CONSTANT_BYTE: i32 = 0x63;
         let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
@@ -1053,7 +1053,7 @@ mod tests {
         }
     }
 
-    #[simd_test(enable = "avx512gfni,avx512bw")]
+    #[simd_test(enable = "gfni,avx512bw")]
     unsafe fn test_mm512_mask_gf2p8affine_epi64_epi8() {
         const CONSTANT_BYTE: i32 = 0x63;
         let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
@@ -1074,7 +1074,7 @@ mod tests {
         }
     }
 
-    #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
+    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
     unsafe fn test_mm256_gf2p8affine_epi64_epi8() {
         let identity: i64 = 0x01_02_04_08_10_20_40_80;
         const IDENTITY_BYTE: i32 = 0;
@@ -1108,7 +1108,7 @@ mod tests {
         }
     }
 
-    #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
+    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
     unsafe fn test_mm256_maskz_gf2p8affine_epi64_epi8() {
         const CONSTANT_BYTE: i32 = 0x63;
         let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
@@ -1130,7 +1130,7 @@ mod tests {
         }
     }
 
-    #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
+    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
     unsafe fn test_mm256_mask_gf2p8affine_epi64_epi8() {
         const CONSTANT_BYTE: i32 = 0x63;
         let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
@@ -1151,7 +1151,7 @@ mod tests {
         }
     }
 
-    #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
+    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
     unsafe fn test_mm_gf2p8affine_epi64_epi8() {
         let identity: i64 = 0x01_02_04_08_10_20_40_80;
         const IDENTITY_BYTE: i32 = 0;
@@ -1185,7 +1185,7 @@ mod tests {
         }
     }
 
-    #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
+    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
     unsafe fn test_mm_maskz_gf2p8affine_epi64_epi8() {
         const CONSTANT_BYTE: i32 = 0x63;
         let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
@@ -1206,7 +1206,7 @@ mod tests {
         }
     }
 
-    #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
+    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
     unsafe fn test_mm_mask_gf2p8affine_epi64_epi8() {
         const CONSTANT_BYTE: i32 = 0x63;
         let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
@@ -1227,7 +1227,7 @@ mod tests {
         }
     }
 
-    #[simd_test(enable = "avx512gfni,avx512bw")]
+    #[simd_test(enable = "gfni,avx512bw")]
     unsafe fn test_mm512_gf2p8affineinv_epi64_epi8() {
         let identity: i64 = 0x01_02_04_08_10_20_40_80;
         const IDENTITY_BYTE: i32 = 0;
@@ -1271,7 +1271,7 @@ mod tests {
         }
     }
 
-    #[simd_test(enable = "avx512gfni,avx512bw")]
+    #[simd_test(enable = "gfni,avx512bw")]
     unsafe fn test_mm512_maskz_gf2p8affineinv_epi64_epi8() {
         const CONSTANT_BYTE: i32 = 0x63;
         let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
@@ -1293,7 +1293,7 @@ mod tests {
         }
     }
 
-    #[simd_test(enable = "avx512gfni,avx512bw")]
+    #[simd_test(enable = "gfni,avx512bw")]
     unsafe fn test_mm512_mask_gf2p8affineinv_epi64_epi8() {
         const CONSTANT_BYTE: i32 = 0x63;
         let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
@@ -1315,7 +1315,7 @@ mod tests {
         }
     }
 
-    #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
+    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
     unsafe fn test_mm256_gf2p8affineinv_epi64_epi8() {
         let identity: i64 = 0x01_02_04_08_10_20_40_80;
         const IDENTITY_BYTE: i32 = 0;
@@ -1359,7 +1359,7 @@ mod tests {
         }
     }
 
-    #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
+    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
     unsafe fn test_mm256_maskz_gf2p8affineinv_epi64_epi8() {
         const CONSTANT_BYTE: i32 = 0x63;
         let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
@@ -1381,7 +1381,7 @@ mod tests {
         }
     }
 
-    #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
+    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
     unsafe fn test_mm256_mask_gf2p8affineinv_epi64_epi8() {
         const CONSTANT_BYTE: i32 = 0x63;
         let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
@@ -1403,7 +1403,7 @@ mod tests {
         }
     }
 
-    #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
+    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
     unsafe fn test_mm_gf2p8affineinv_epi64_epi8() {
         let identity: i64 = 0x01_02_04_08_10_20_40_80;
         const IDENTITY_BYTE: i32 = 0;
@@ -1447,7 +1447,7 @@ mod tests {
         }
     }
 
-    #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
+    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
     unsafe fn test_mm_maskz_gf2p8affineinv_epi64_epi8() {
         const CONSTANT_BYTE: i32 = 0x63;
         let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
@@ -1469,7 +1469,7 @@ mod tests {
         }
     }
 
-    #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
+    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
     unsafe fn test_mm_mask_gf2p8affineinv_epi64_epi8() {
         const CONSTANT_BYTE: i32 = 0x63;
         let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
diff --git a/library/stdarch/crates/core_arch/src/x86/mod.rs b/library/stdarch/crates/core_arch/src/x86/mod.rs
index 6b50e95b2..37045e40e 100644
--- a/library/stdarch/crates/core_arch/src/x86/mod.rs
+++ b/library/stdarch/crates/core_arch/src/x86/mod.rs
@@ -835,17 +835,17 @@ pub use self::avx512vnni::*;
 mod avx512bitalg;
 pub use self::avx512bitalg::*;
 
-mod avx512gfni;
-pub use self::avx512gfni::*;
+mod gfni;
+pub use self::gfni::*;
 
 mod avx512vpopcntdq;
 pub use self::avx512vpopcntdq::*;
 
-mod avx512vaes;
-pub use self::avx512vaes::*;
+mod vaes;
+pub use self::vaes::*;
 
-mod avx512vpclmulqdq;
-pub use self::avx512vpclmulqdq::*;
+mod vpclmulqdq;
+pub use self::vpclmulqdq::*;
 
 mod bt;
 pub use self::bt::*;
diff --git a/library/stdarch/crates/core_arch/src/x86/sse.rs b/library/stdarch/crates/core_arch/src/x86/sse.rs
index 03c3a14a5..f21288970 100644
--- a/library/stdarch/crates/core_arch/src/x86/sse.rs
+++ b/library/stdarch/crates/core_arch/src/x86/sse.rs
@@ -1080,10 +1080,7 @@ pub unsafe fn _mm_movelh_ps(a: __m128, b: __m128) -> __m128 {
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_ps)
 #[inline]
 #[target_feature(enable = "sse")]
-// FIXME: LLVM9 trunk has the following bug:
-// https://github.com/rust-lang/stdarch/issues/794
-// so we only temporarily test this on i686 and x86_64 but not on i586:
-#[cfg_attr(all(test, target_feature = "sse2"), assert_instr(movmskps))]
+#[cfg_attr(test, assert_instr(movmskps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_movemask_ps(a: __m128) -> i32 {
     movmskps(a)
diff --git a/library/stdarch/crates/core_arch/src/x86/sse2.rs b/library/stdarch/crates/core_arch/src/x86/sse2.rs
index 3e79b3539..cde4bc316 100644
--- a/library/stdarch/crates/core_arch/src/x86/sse2.rs
+++ b/library/stdarch/crates/core_arch/src/x86/sse2.rs
@@ -203,7 +203,9 @@ pub unsafe fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(pmaxsw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_max_epi16(a: __m128i, b: __m128i) -> __m128i {
-    transmute(pmaxsw(a.as_i16x8(), b.as_i16x8()))
+    let a = a.as_i16x8();
+    let b = b.as_i16x8();
+    transmute(simd_select::<i16x8, _>(simd_gt(a, b), a, b))
 }
 
 /// Compares packed unsigned 8-bit integers in `a` and `b`, and returns the
@@ -215,7 +217,9 @@ pub unsafe fn _mm_max_epi16(a: __m128i, b: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(pmaxub))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_max_epu8(a: __m128i, b: __m128i) -> __m128i {
-    transmute(pmaxub(a.as_u8x16(), b.as_u8x16()))
+    let a = a.as_u8x16();
+    let b = b.as_u8x16();
+    transmute(simd_select::<i8x16, _>(simd_gt(a, b), a, b))
 }
 
 /// Compares packed 16-bit integers in `a` and `b`, and returns the packed
@@ -227,7 +231,9 @@ pub unsafe fn _mm_max_epu8(a: __m128i, b: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(pminsw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_min_epi16(a: __m128i, b: __m128i) -> __m128i {
-    transmute(pminsw(a.as_i16x8(), b.as_i16x8()))
+    let a = a.as_i16x8();
+    let b = b.as_i16x8();
+    transmute(simd_select::<i16x8, _>(simd_lt(a, b), a, b))
 }
 
 /// Compares packed unsigned 8-bit integers in `a` and `b`, and returns the
@@ -239,7 +245,9 @@ pub unsafe fn _mm_min_epi16(a: __m128i, b: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(pminub))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_min_epu8(a: __m128i, b: __m128i) -> __m128i {
-    transmute(pminub(a.as_u8x16(), b.as_u8x16()))
+    let a = a.as_u8x16();
+    let b = b.as_u8x16();
+    transmute(simd_select::<i8x16, _>(simd_lt(a, b), a, b))
 }
 
 /// Multiplies the packed 16-bit integers in `a` and `b`.
@@ -1378,7 +1386,9 @@ pub unsafe fn _mm_insert_epi16<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
 #[cfg_attr(test, assert_instr(pmovmskb))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_movemask_epi8(a: __m128i) -> i32 {
-    simd_bitmask::<_, u16>(a.as_i8x16()) as u32 as i32
+    let z = i8x16::splat(0);
+    let m: i8x16 = simd_lt(a.as_i8x16(), z);
+    simd_bitmask::<_, u16>(m) as u32 as i32
 }
 
 /// Shuffles 32-bit integers in `a` using the control in `IMM8`.
@@ -1409,7 +1419,7 @@ pub unsafe fn _mm_shuffle_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
 /// `IMM8`.
 ///
 /// Put the results in the high 64 bits of the returned vector, with the low 64
-/// bits being copied from from `a`.
+/// bits being copied from `a`.
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shufflehi_epi16)
 #[inline]
@@ -1441,7 +1451,7 @@ pub unsafe fn _mm_shufflehi_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
 /// `IMM8`.
 ///
 /// Put the results in the low 64 bits of the returned vector, with the high 64
-/// bits being copied from from `a`.
+/// bits being copied from `a`.
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shufflelo_epi16)
 #[inline]
@@ -2796,14 +2806,6 @@ extern "C" {
     fn pavgw(a: u16x8, b: u16x8) -> u16x8;
     #[link_name = "llvm.x86.sse2.pmadd.wd"]
     fn pmaddwd(a: i16x8, b: i16x8) -> i32x4;
-    #[link_name = "llvm.x86.sse2.pmaxs.w"]
-    fn pmaxsw(a: i16x8, b: i16x8) -> i16x8;
-    #[link_name = "llvm.x86.sse2.pmaxu.b"]
-    fn pmaxub(a: u8x16, b: u8x16) -> u8x16;
-    #[link_name = "llvm.x86.sse2.pmins.w"]
-    fn pminsw(a: i16x8, b: i16x8) -> i16x8;
-    #[link_name = "llvm.x86.sse2.pminu.b"]
-    fn pminub(a: u8x16, b: u8x16) -> u8x16;
     #[link_name = "llvm.x86.sse2.pmulh.w"]
     fn pmulhw(a: i16x8, b: i16x8) -> i16x8;
     #[link_name = "llvm.x86.sse2.pmulhu.w"]
diff --git a/library/stdarch/crates/core_arch/src/x86/sse41.rs b/library/stdarch/crates/core_arch/src/x86/sse41.rs
index 7c59f2702..3162ad7d9 100644
--- a/library/stdarch/crates/core_arch/src/x86/sse41.rs
+++ b/library/stdarch/crates/core_arch/src/x86/sse41.rs
@@ -281,7 +281,9 @@ pub unsafe fn _mm_insert_epi32<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
 #[cfg_attr(test, assert_instr(pmaxsb))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_max_epi8(a: __m128i, b: __m128i) -> __m128i {
-    transmute(pmaxsb(a.as_i8x16(), b.as_i8x16()))
+    let a = a.as_i8x16();
+    let b = b.as_i8x16();
+    transmute(simd_select::<i8x16, _>(simd_gt(a, b), a, b))
 }
 
 /// Compares packed unsigned 16-bit integers in `a` and `b`, and returns packed
@@ -293,7 +295,9 @@ pub unsafe fn _mm_max_epi8(a: __m128i, b: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(pmaxuw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_max_epu16(a: __m128i, b: __m128i) -> __m128i {
-    transmute(pmaxuw(a.as_u16x8(), b.as_u16x8()))
+    let a = a.as_u16x8();
+    let b = b.as_u16x8();
+    transmute(simd_select::<i16x8, _>(simd_gt(a, b), a, b))
 }
 
 /// Compares packed 32-bit integers in `a` and `b`, and returns packed maximum
@@ -305,7 +309,9 @@ pub unsafe fn _mm_max_epu16(a: __m128i, b: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(pmaxsd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_max_epi32(a: __m128i, b: __m128i) -> __m128i {
-    transmute(pmaxsd(a.as_i32x4(), b.as_i32x4()))
+    let a = a.as_i32x4();
+    let b = b.as_i32x4();
+    transmute(simd_select::<i32x4, _>(simd_gt(a, b), a, b))
 }
 
 /// Compares packed unsigned 32-bit integers in `a` and `b`, and returns packed
@@ -317,7 +323,9 @@ pub unsafe fn _mm_max_epi32(a: __m128i, b: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(pmaxud))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_max_epu32(a: __m128i, b: __m128i) -> __m128i {
-    transmute(pmaxud(a.as_u32x4(), b.as_u32x4()))
+    let a = a.as_u32x4();
+    let b = b.as_u32x4();
+    transmute(simd_select::<i32x4, _>(simd_gt(a, b), a, b))
 }
 
 /// Compares packed 8-bit integers in `a` and `b` and returns packed minimum
@@ -329,7 +337,9 @@ pub unsafe fn _mm_max_epu32(a: __m128i, b: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(pminsb))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_min_epi8(a: __m128i, b: __m128i) -> __m128i {
-    transmute(pminsb(a.as_i8x16(), b.as_i8x16()))
+    let a = a.as_i8x16();
+    let b = b.as_i8x16();
+    transmute(simd_select::<i8x16, _>(simd_lt(a, b), a, b))
 }
 
 /// Compares packed unsigned 16-bit integers in `a` and `b`, and returns packed
@@ -341,7 +351,9 @@ pub unsafe fn _mm_min_epi8(a: __m128i, b: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(pminuw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_min_epu16(a: __m128i, b: __m128i) -> __m128i {
-    transmute(pminuw(a.as_u16x8(), b.as_u16x8()))
+    let a = a.as_u16x8();
+    let b = b.as_u16x8();
+    transmute(simd_select::<i16x8, _>(simd_lt(a, b), a, b))
 }
 
 /// Compares packed 32-bit integers in `a` and `b`, and returns packed minimum
@@ -353,7 +365,9 @@ pub unsafe fn _mm_min_epu16(a: __m128i, b: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(pminsd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_min_epi32(a: __m128i, b: __m128i) -> __m128i {
-    transmute(pminsd(a.as_i32x4(), b.as_i32x4()))
+    let a = a.as_i32x4();
+    let b = b.as_i32x4();
+    transmute(simd_select::<i32x4, _>(simd_lt(a, b), a, b))
 }
 
 /// Compares packed unsigned 32-bit integers in `a` and `b`, and returns packed
@@ -365,7 +379,9 @@ pub unsafe fn _mm_min_epi32(a: __m128i, b: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(pminud))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_min_epu32(a: __m128i, b: __m128i) -> __m128i {
-    transmute(pminud(a.as_u32x4(), b.as_u32x4()))
+    let a = a.as_u32x4();
+    let b = b.as_u32x4();
+    transmute(simd_select::<i32x4, _>(simd_lt(a, b), a, b))
 }
 
 /// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
@@ -1122,22 +1138,6 @@ extern "C" {
     fn pblendw(a: i16x8, b: i16x8, imm8: u8) -> i16x8;
     #[link_name = "llvm.x86.sse41.insertps"]
     fn insertps(a: __m128, b: __m128, imm8: u8) -> __m128;
-    #[link_name = "llvm.x86.sse41.pmaxsb"]
-    fn pmaxsb(a: i8x16, b: i8x16) -> i8x16;
-    #[link_name = "llvm.x86.sse41.pmaxuw"]
-    fn pmaxuw(a: u16x8, b: u16x8) -> u16x8;
-    #[link_name = "llvm.x86.sse41.pmaxsd"]
-    fn pmaxsd(a: i32x4, b: i32x4) -> i32x4;
-    #[link_name = "llvm.x86.sse41.pmaxud"]
-    fn pmaxud(a: u32x4, b: u32x4) -> u32x4;
-    #[link_name = "llvm.x86.sse41.pminsb"]
-    fn pminsb(a: i8x16, b: i8x16) -> i8x16;
-    #[link_name = "llvm.x86.sse41.pminuw"]
-    fn pminuw(a: u16x8, b: u16x8) -> u16x8;
-    #[link_name = "llvm.x86.sse41.pminsd"]
-    fn pminsd(a: i32x4, b: i32x4) -> i32x4;
-    #[link_name = "llvm.x86.sse41.pminud"]
-    fn pminud(a: u32x4, b: u32x4) -> u32x4;
     #[link_name = "llvm.x86.sse41.packusdw"]
     fn packusdw(a: i32x4, b: i32x4) -> u16x8;
     #[link_name = "llvm.x86.sse41.dppd"]
diff --git a/library/stdarch/crates/core_arch/src/x86/sse42.rs b/library/stdarch/crates/core_arch/src/x86/sse42.rs
index f474b0671..4eb12480b 100644
--- a/library/stdarch/crates/core_arch/src/x86/sse42.rs
+++ b/library/stdarch/crates/core_arch/src/x86/sse42.rs
@@ -614,7 +614,7 @@ mod tests {
     use crate::core_arch::x86::*;
     use std::ptr;
 
-    // Currently one cannot `load` a &[u8] that is is less than 16
+    // Currently one cannot `load` a &[u8] that is less than 16
     // in length. This makes loading strings less than 16 in length
     // a bit difficult. Rather than `load` and mutate the __m128i,
     // it is easier to memcpy the given string to a local slice with
@@ -623,11 +623,7 @@ mod tests {
     unsafe fn str_to_m128i(s: &[u8]) -> __m128i {
         assert!(s.len() <= 16);
         let slice = &mut [0u8; 16];
-        ptr::copy_nonoverlapping(
-            s.get_unchecked(0) as *const u8 as *const u8,
-            slice.get_unchecked_mut(0) as *mut u8 as *mut u8,
-            s.len(),
-        );
+        ptr::copy_nonoverlapping(s.as_ptr(), slice.as_mut_ptr(), s.len());
         _mm_loadu_si128(slice.as_ptr() as *const _)
     }
 
diff --git a/library/stdarch/crates/core_arch/src/x86/avx512vaes.rs b/library/stdarch/crates/core_arch/src/x86/vaes.rs
index 676de312b..e09f8a113 100644
--- a/library/stdarch/crates/core_arch/src/x86/avx512vaes.rs
+++ b/library/stdarch/crates/core_arch/src/x86/vaes.rs
@@ -38,7 +38,7 @@ extern "C" {
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_aesenc_epi128)
 #[inline]
-#[target_feature(enable = "avx512vaes,avx512vl")]
+#[target_feature(enable = "vaes")]
 #[cfg_attr(test, assert_instr(vaesenc))]
 pub unsafe fn _mm256_aesenc_epi128(a: __m256i, round_key: __m256i) -> __m256i {
     aesenc_256(a, round_key)
@@ -49,7 +49,7 @@ pub unsafe fn _mm256_aesenc_epi128(a: __m256i, round_key: __m256i) -> __m256i {
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_aesenclast_epi128)
 #[inline]
-#[target_feature(enable = "avx512vaes,avx512vl")]
+#[target_feature(enable = "vaes")]
 #[cfg_attr(test, assert_instr(vaesenclast))]
 pub unsafe fn _mm256_aesenclast_epi128(a: __m256i, round_key: __m256i) -> __m256i {
     aesenclast_256(a, round_key)
@@ -60,7 +60,7 @@ pub unsafe fn _mm256_aesenclast_epi128(a: __m256i, round_key: __m256i) -> __m256
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_aesdec_epi128)
 #[inline]
-#[target_feature(enable = "avx512vaes,avx512vl")]
+#[target_feature(enable = "vaes")]
 #[cfg_attr(test, assert_instr(vaesdec))]
 pub unsafe fn _mm256_aesdec_epi128(a: __m256i, round_key: __m256i) -> __m256i {
     aesdec_256(a, round_key)
@@ -71,7 +71,7 @@ pub unsafe fn _mm256_aesdec_epi128(a: __m256i, round_key: __m256i) -> __m256i {
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_aesdeclast_epi128)
 #[inline]
-#[target_feature(enable = "avx512vaes,avx512vl")]
+#[target_feature(enable = "vaes")]
 #[cfg_attr(test, assert_instr(vaesdeclast))]
 pub unsafe fn _mm256_aesdeclast_epi128(a: __m256i, round_key: __m256i) -> __m256i {
     aesdeclast_256(a, round_key)
@@ -82,7 +82,7 @@ pub unsafe fn _mm256_aesdeclast_epi128(a: __m256i, round_key: __m256i) -> __m256
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_aesenc_epi128)
 #[inline]
-#[target_feature(enable = "avx512vaes,avx512f")]
+#[target_feature(enable = "vaes,avx512f")]
 #[cfg_attr(test, assert_instr(vaesenc))]
 pub unsafe fn _mm512_aesenc_epi128(a: __m512i, round_key: __m512i) -> __m512i {
     aesenc_512(a, round_key)
@@ -93,7 +93,7 @@ pub unsafe fn _mm512_aesenc_epi128(a: __m512i, round_key: __m512i) -> __m512i {
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_aesenclast_epi128)
 #[inline]
-#[target_feature(enable = "avx512vaes,avx512f")]
+#[target_feature(enable = "vaes,avx512f")]
 #[cfg_attr(test, assert_instr(vaesenclast))]
 pub unsafe fn _mm512_aesenclast_epi128(a: __m512i, round_key: __m512i) -> __m512i {
     aesenclast_512(a, round_key)
@@ -104,7 +104,7 @@ pub unsafe fn _mm512_aesenclast_epi128(a: __m512i, round_key: __m512i) -> __m512
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_aesdec_epi128)
 #[inline]
-#[target_feature(enable = "avx512vaes,avx512f")]
+#[target_feature(enable = "vaes,avx512f")]
 #[cfg_attr(test, assert_instr(vaesdec))]
 pub unsafe fn _mm512_aesdec_epi128(a: __m512i, round_key: __m512i) -> __m512i {
     aesdec_512(a, round_key)
@@ -115,7 +115,7 @@ pub unsafe fn _mm512_aesdec_epi128(a: __m512i, round_key: __m512i) -> __m512i {
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_aesdeclast_epi128)
 #[inline]
-#[target_feature(enable = "avx512vaes,avx512f")]
+#[target_feature(enable = "vaes,avx512f")]
 #[cfg_attr(test, assert_instr(vaesdeclast))]
 pub unsafe fn _mm512_aesdeclast_epi128(a: __m512i, round_key: __m512i) -> __m512i {
     aesdeclast_512(a, round_key)
@@ -138,7 +138,7 @@ mod tests {
     // ideally we'd be using quickcheck here instead
 
     #[target_feature(enable = "avx2")]
-    unsafe fn helper_for_256_avx512vaes(
+    unsafe fn helper_for_256_vaes(
         linear: unsafe fn(__m128i, __m128i) -> __m128i,
         vectorized: unsafe fn(__m256i, __m256i) -> __m256i,
     ) {
@@ -187,7 +187,7 @@ mod tests {
         setup_state_key(_mm512_broadcast_i32x4)
     }
 
-    #[simd_test(enable = "avx512vaes,avx512vl")]
+    #[simd_test(enable = "vaes,avx512vl")]
     unsafe fn test_mm256_aesdec_epi128() {
         // Constants taken from https://msdn.microsoft.com/en-us/library/cc664949.aspx.
         let (a, k) = setup_state_key_256();
@@ -196,10 +196,10 @@ mod tests {
         let r = _mm256_aesdec_epi128(a, k);
         assert_eq_m256i(r, e);
 
-        helper_for_256_avx512vaes(_mm_aesdec_si128, _mm256_aesdec_epi128);
+        helper_for_256_vaes(_mm_aesdec_si128, _mm256_aesdec_epi128);
     }
 
-    #[simd_test(enable = "avx512vaes,avx512vl")]
+    #[simd_test(enable = "vaes,avx512vl")]
     unsafe fn test_mm256_aesdeclast_epi128() {
         // Constants taken from https://msdn.microsoft.com/en-us/library/cc714178.aspx.
         let (a, k) = setup_state_key_256();
@@ -208,10 +208,10 @@ mod tests {
         let r = _mm256_aesdeclast_epi128(a, k);
         assert_eq_m256i(r, e);
 
-        helper_for_256_avx512vaes(_mm_aesdeclast_si128, _mm256_aesdeclast_epi128);
+        helper_for_256_vaes(_mm_aesdeclast_si128, _mm256_aesdeclast_epi128);
     }
 
-    #[simd_test(enable = "avx512vaes,avx512vl")]
+    #[simd_test(enable = "vaes,avx512vl")]
     unsafe fn test_mm256_aesenc_epi128() {
         // Constants taken from https://msdn.microsoft.com/en-us/library/cc664810.aspx.
         // they are repeated appropriately
@@ -221,10 +221,10 @@ mod tests {
         let r = _mm256_aesenc_epi128(a, k);
         assert_eq_m256i(r, e);
 
-        helper_for_256_avx512vaes(_mm_aesenc_si128, _mm256_aesenc_epi128);
+        helper_for_256_vaes(_mm_aesenc_si128, _mm256_aesenc_epi128);
     }
 
-    #[simd_test(enable = "avx512vaes,avx512vl")]
+    #[simd_test(enable = "vaes,avx512vl")]
     unsafe fn test_mm256_aesenclast_epi128() {
         // Constants taken from https://msdn.microsoft.com/en-us/library/cc714136.aspx.
         let (a, k) = setup_state_key_256();
@@ -233,11 +233,11 @@ mod tests {
         let r = _mm256_aesenclast_epi128(a, k);
         assert_eq_m256i(r, e);
 
-        helper_for_256_avx512vaes(_mm_aesenclast_si128, _mm256_aesenclast_epi128);
+        helper_for_256_vaes(_mm_aesenclast_si128, _mm256_aesenclast_epi128);
     }
 
     #[target_feature(enable = "avx512f")]
-    unsafe fn helper_for_512_avx512vaes(
+    unsafe fn helper_for_512_vaes(
         linear: unsafe fn(__m128i, __m128i) -> __m128i,
         vectorized: unsafe fn(__m512i, __m512i) -> __m512i,
     ) {
@@ -282,7 +282,7 @@ mod tests {
         assert_eq_m128i(_mm512_extracti32x4_epi32::<3>(r), e_decomp[3]);
     }
 
-    #[simd_test(enable = "avx512vaes,avx512f")]
+    #[simd_test(enable = "vaes,avx512f")]
     unsafe fn test_mm512_aesdec_epi128() {
         // Constants taken from https://msdn.microsoft.com/en-us/library/cc664949.aspx.
         let (a, k) = setup_state_key_512();
@@ -291,10 +291,10 @@ mod tests {
         let r = _mm512_aesdec_epi128(a, k);
         assert_eq_m512i(r, e);
 
-        helper_for_512_avx512vaes(_mm_aesdec_si128, _mm512_aesdec_epi128);
+        helper_for_512_vaes(_mm_aesdec_si128, _mm512_aesdec_epi128);
     }
 
-    #[simd_test(enable = "avx512vaes,avx512f")]
+    #[simd_test(enable = "vaes,avx512f")]
     unsafe fn test_mm512_aesdeclast_epi128() {
         // Constants taken from https://msdn.microsoft.com/en-us/library/cc714178.aspx.
         let (a, k) = setup_state_key_512();
@@ -303,10 +303,10 @@ mod tests {
         let r = _mm512_aesdeclast_epi128(a, k);
         assert_eq_m512i(r, e);
 
-        helper_for_512_avx512vaes(_mm_aesdeclast_si128, _mm512_aesdeclast_epi128);
+        helper_for_512_vaes(_mm_aesdeclast_si128, _mm512_aesdeclast_epi128);
     }
 
-    #[simd_test(enable = "avx512vaes,avx512f")]
+    #[simd_test(enable = "vaes,avx512f")]
     unsafe fn test_mm512_aesenc_epi128() {
         // Constants taken from https://msdn.microsoft.com/en-us/library/cc664810.aspx.
         let (a, k) = setup_state_key_512();
@@ -315,10 +315,10 @@ mod tests {
         let r = _mm512_aesenc_epi128(a, k);
         assert_eq_m512i(r, e);
 
-        helper_for_512_avx512vaes(_mm_aesenc_si128, _mm512_aesenc_epi128);
+        helper_for_512_vaes(_mm_aesenc_si128, _mm512_aesenc_epi128);
     }
 
-    #[simd_test(enable = "avx512vaes,avx512f")]
+    #[simd_test(enable = "vaes,avx512f")]
     unsafe fn test_mm512_aesenclast_epi128() {
         // Constants taken from https://msdn.microsoft.com/en-us/library/cc714136.aspx.
         let (a, k) = setup_state_key_512();
@@ -327,6 +327,6 @@ mod tests {
         let r = _mm512_aesenclast_epi128(a, k);
         assert_eq_m512i(r, e);
 
-        helper_for_512_avx512vaes(_mm_aesenclast_si128, _mm512_aesenclast_epi128);
+        helper_for_512_vaes(_mm_aesenclast_si128, _mm512_aesenclast_epi128);
     }
 }
diff --git a/library/stdarch/crates/core_arch/src/x86/avx512vpclmulqdq.rs b/library/stdarch/crates/core_arch/src/x86/vpclmulqdq.rs
index 9bfeb903a..ea76708b8 100644
--- a/library/stdarch/crates/core_arch/src/x86/avx512vpclmulqdq.rs
+++ b/library/stdarch/crates/core_arch/src/x86/vpclmulqdq.rs
@@ -32,7 +32,7 @@ extern "C" {
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_clmulepi64_epi128)
 #[inline]
-#[target_feature(enable = "avx512vpclmulqdq,avx512f")]
+#[target_feature(enable = "vpclmulqdq,avx512f")]
 // technically according to Intel's documentation we don't need avx512f here, however LLVM gets confused otherwise
 #[cfg_attr(test, assert_instr(vpclmul, IMM8 = 0))]
 #[rustc_legacy_const_generics(2)]
@@ -50,7 +50,7 @@ pub unsafe fn _mm512_clmulepi64_epi128<const IMM8: i32>(a: __m512i, b: __m512i)
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_clmulepi64_epi128)
 #[inline]
-#[target_feature(enable = "avx512vpclmulqdq,avx512vl")]
+#[target_feature(enable = "vpclmulqdq")]
 #[cfg_attr(test, assert_instr(vpclmul, IMM8 = 0))]
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn _mm256_clmulepi64_epi128<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
@@ -121,7 +121,7 @@ mod tests {
 
     // this function tests one of the possible 4 instances
     // with different inputs across lanes
-    #[target_feature(enable = "avx512vpclmulqdq,avx512f")]
+    #[target_feature(enable = "vpclmulqdq,avx512f")]
     unsafe fn verify_512_helper(
         linear: unsafe fn(__m128i, __m128i) -> __m128i,
         vectorized: unsafe fn(__m512i, __m512i) -> __m512i,
@@ -162,7 +162,7 @@ mod tests {
 
     // this function tests one of the possible 4 instances
     // with different inputs across lanes for the VL version
-    #[target_feature(enable = "avx512vpclmulqdq,avx512vl")]
+    #[target_feature(enable = "vpclmulqdq,avx512vl")]
     unsafe fn verify_256_helper(
         linear: unsafe fn(__m128i, __m128i) -> __m128i,
         vectorized: unsafe fn(__m256i, __m256i) -> __m256i,
@@ -204,7 +204,7 @@ mod tests {
         unroll! {assert_eq_m128i(_mm256_extracti128_si256::<2>(r),e_decomp[2]);}
     }
 
-    #[simd_test(enable = "avx512vpclmulqdq,avx512f")]
+    #[simd_test(enable = "vpclmulqdq,avx512f")]
     unsafe fn test_mm512_clmulepi64_epi128() {
         verify_kat_pclmul!(
             _mm512_broadcast_i32x4,
@@ -230,7 +230,7 @@ mod tests {
         );
     }
 
-    #[simd_test(enable = "avx512vpclmulqdq,avx512vl")]
+    #[simd_test(enable = "vpclmulqdq,avx512vl")]
     unsafe fn test_mm256_clmulepi64_epi128() {
         verify_kat_pclmul!(
             _mm256_broadcastsi128_si256,