2 files changed, 396 insertions, 303 deletions
diff --git a/library/stdarch/crates/stdarch-gen/neon.spec b/library/stdarch/crates/stdarch-gen/neon.spec
index 95fbc354c..f2c1e200d 100644
--- a/library/stdarch/crates/stdarch-gen/neon.spec
+++ b/library/stdarch/crates/stdarch-gen/neon.spec
@@ -14,7 +14,7 @@
 // Sections start with EXACTLY three slashes followed
 // by AT LEAST one space. Sections are used for two things:
 //
-// 1) they serve as the doc comment for the given intrinics.
+// 1) they serve as the doc comment for the given intrinsics.
 // 2) they reset all variables (name, fn, etc.)
 //
 // # Variables
@@ -29,16 +29,16 @@
 //           the function will exclusively be generated for
 //           aarch64.
 //           This is used to generate both aarch64 specific and
-//           shared intrinics by first only specifying th aarch64
+//           shared intrinsics by first only specifying the aarch64
 //           variant then the arm variant.
 //
-// arm     - The arm v7 intrinics used to checked for arm code
+// arm     - The arm v7 intrinsics used to check for arm code
 //           generation. All neon functions available in arm are
-//           also available in aarch64. If no aarch64 intrinic was
+//           also available in aarch64. If no aarch64 intrinsic was
 //           set they are assumed to be the same.
-//           Intrinics ending with a `.` will have a size suffixes
+//           Intrinsics ending with a `.` will have a size suffix
 //           added (such as `i8` or `i64`) that is not sign specific
-//           Intrinics ending with a `.s` will have a size suffixes
+//           Intrinsics ending with a `.s` will have a size suffix
 //           added (such as `s8` or `u64`) that is sign specific
 //
 // a       - First input for tests, it gets scaled to the size of
@@ -218,8 +218,8 @@ generate int32x2_t:int32x2_t:int64x2_t
 /// Unsigned Absolute difference Long
 name = vabdl_high
 no-q
-multi_fn = simd_shuffle8!, c:uint8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15]
-multi_fn = simd_shuffle8!, d:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
+multi_fn = simd_shuffle!, c:uint8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15]
+multi_fn = simd_shuffle!, d:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
 multi_fn = simd_cast, {vabd_u8, c, d}
 a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
 b = 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10
@@ -231,8 +231,8 @@ generate uint8x16_t:uint8x16_t:uint16x8_t
 /// Unsigned Absolute difference Long
 name = vabdl_high
 no-q
-multi_fn = simd_shuffle4!, c:uint16x4_t, a, a, [4, 5, 6, 7]
-multi_fn = simd_shuffle4!, d:uint16x4_t, b, b, [4, 5, 6, 7]
+multi_fn = simd_shuffle!, c:uint16x4_t, a, a, [4, 5, 6, 7]
+multi_fn = simd_shuffle!, d:uint16x4_t, b, b, [4, 5, 6, 7]
 multi_fn = simd_cast, {vabd_u16, c, d}
 a = 1, 2, 3, 4, 8, 9, 11, 12
 b = 10, 10, 10, 10, 10, 10, 10, 10
@@ -244,8 +244,8 @@ generate uint16x8_t:uint16x8_t:uint32x4_t
 /// Unsigned Absolute difference Long
 name = vabdl_high
 no-q
-multi_fn = simd_shuffle2!, c:uint32x2_t, a, a, [2, 3]
-multi_fn = simd_shuffle2!, d:uint32x2_t, b, b, [2, 3]
+multi_fn = simd_shuffle!, c:uint32x2_t, a, a, [2, 3]
+multi_fn = simd_shuffle!, d:uint32x2_t, b, b, [2, 3]
 multi_fn = simd_cast, {vabd_u32, c, d}
 a = 1, 2, 3, 4
 b = 10, 10, 10, 10
@@ -257,8 +257,8 @@ generate uint32x4_t:uint32x4_t:uint64x2_t
 /// Signed Absolute difference Long
 name = vabdl_high
 no-q
-multi_fn = simd_shuffle8!, c:int8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15]
-multi_fn = simd_shuffle8!, d:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
+multi_fn = simd_shuffle!, c:int8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15]
+multi_fn = simd_shuffle!, d:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
 multi_fn = simd_cast, e:uint8x8_t, {vabd_s8, c, d}
 multi_fn = simd_cast, e
 a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
@@ -271,8 +271,8 @@ generate int8x16_t:int8x16_t:int16x8_t
 /// Signed Absolute difference Long
 name = vabdl_high
 no-q
-multi_fn = simd_shuffle4!, c:int16x4_t, a, a, [4, 5, 6, 7]
-multi_fn = simd_shuffle4!, d:int16x4_t, b, b, [4, 5, 6, 7]
+multi_fn = simd_shuffle!, c:int16x4_t, a, a, [4, 5, 6, 7]
+multi_fn = simd_shuffle!, d:int16x4_t, b, b, [4, 5, 6, 7]
 multi_fn = simd_cast, e:uint16x4_t, {vabd_s16, c, d}
 multi_fn = simd_cast, e
 a = 1, 2, 3, 4, 9, 10, 11, 12
@@ -285,8 +285,8 @@ generate int16x8_t:int16x8_t:int32x4_t
 /// Signed Absolute difference Long
 name = vabdl_high
 no-q
-multi_fn = simd_shuffle2!, c:int32x2_t, a, a, [2, 3]
-multi_fn = simd_shuffle2!, d:int32x2_t, b, b, [2, 3]
+multi_fn = simd_shuffle!, c:int32x2_t, a, a, [2, 3]
+multi_fn = simd_shuffle!, d:int32x2_t, b, b, [2, 3]
 multi_fn = simd_cast, e:uint32x2_t, {vabd_s32, c, d}
 multi_fn = simd_cast, e
 a = 1, 2, 3, 4
@@ -978,7 +978,7 @@ lane-suffixes
 constn = LANE1:LANE2
 multi_fn = static_assert_imm-in0_exp_len-LANE1
 multi_fn = static_assert_imm-in_exp_len-LANE2
-multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in0_len-LANE2}
+multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle!, a, b, {ins-in0_len-in0_len-LANE2}
 a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
 b = 0, MAX, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 n = 0:1
@@ -995,7 +995,7 @@ lane-suffixes
 constn = LANE1:LANE2
 multi_fn = static_assert_imm-in0_exp_len-LANE1
 multi_fn = static_assert_imm-in_exp_len-LANE2
-multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in0_len-LANE2}
+multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle!, a, b, {ins-in0_len-in0_len-LANE2}
 a = 1., 2., 3., 4.
 b = 0., 0.5, 0., 0.
 n = 0:1
@@ -1010,8 +1010,8 @@ lane-suffixes
 constn = LANE1:LANE2
 multi_fn = static_assert_imm-in0_exp_len-LANE1
 multi_fn = static_assert_imm-in_exp_len-LANE2
-multi_fn = simd_shuffle-in_len-!, a:in_t, a, a, {asc-0-in_len}
-multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in_len-LANE2}
+multi_fn = simd_shuffle!, a:in_t, a, a, {asc-0-in_len}
+multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle!, a, b, {ins-in0_len-in_len-LANE2}
 a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
 b = 0, MAX, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 n = 0:1
@@ -1028,8 +1028,8 @@ lane-suffixes
 constn = LANE1:LANE2
 multi_fn = static_assert_imm-in0_exp_len-LANE1
 multi_fn = static_assert_imm-in_exp_len-LANE2
-multi_fn = simd_shuffle-in_len-!, a:in_t, a, a, {asc-0-in_len}
-multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in_len-LANE2}
+multi_fn = simd_shuffle!, a:in_t, a, a, {asc-0-in_len}
+multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle!, a, b, {ins-in0_len-in_len-LANE2}
 a = 1., 2., 3., 4.
 b = 0., 0.5, 0., 0.
 n = 0:1
@@ -1044,8 +1044,8 @@ lane-suffixes
 constn = LANE1:LANE2
 multi_fn = static_assert_imm-in0_exp_len-LANE1
 multi_fn = static_assert_imm-in_exp_len-LANE2
-multi_fn = simd_shuffle-in0_len-!, b:in_t0, b, b, {asc-0-in0_len}
-multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in0_len-LANE2}
+multi_fn = simd_shuffle!, b:in_t0, b, b, {asc-0-in0_len}
+multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle!, a, b, {ins-in0_len-in0_len-LANE2}
 a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
 b = 0, MAX, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 n = 0:1
@@ -1062,8 +1062,8 @@ lane-suffixes
 constn = LANE1:LANE2
 multi_fn = static_assert_imm-in0_exp_len-LANE1
 multi_fn = static_assert_imm-in_exp_len-LANE2
-multi_fn = simd_shuffle-in0_len-!, b:in_t0, b, b, {asc-0-in0_len}
-multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in0_len-LANE2}
+multi_fn = simd_shuffle!, b:in_t0, b, b, {asc-0-in0_len}
+multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle!, a, b, {ins-in0_len-in0_len-LANE2}
 a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
 b = MAX, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 n = 1:0
@@ -1078,8 +1078,8 @@ lane-suffixes
 constn = LANE1:LANE2
 multi_fn = static_assert_imm-in0_exp_len-LANE1
 multi_fn = static_assert_imm-in_exp_len-LANE2
-multi_fn = simd_shuffle-in0_len-!, b:in_t0, b, b, {asc-0-in0_len}
-multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in0_len-LANE2}
+multi_fn = simd_shuffle!, b:in_t0, b, b, {asc-0-in0_len}
+multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle!, a, b, {ins-in0_len-in0_len-LANE2}
 a = 1., 2., 3., 4.
 b = 0.5, 0., 0., 0.
 n = 1:0
@@ -1148,7 +1148,7 @@ generate float32x2_t:float64x2_t
 /// Floating-point convert to higher precision long
 name = vcvt_high
 noq-double-suffixes
-multi_fn = simd_shuffle2!, b:float32x2_t, a, a, [2, 3]
+multi_fn = simd_shuffle!, b:float32x2_t, a, a, [2, 3]
 multi_fn = simd_cast, b
 a = -1.2, 1.2, 2.3, 3.4
 validate 2.3f32 as f64, 3.4f32 as f64
@@ -1169,7 +1169,7 @@ generate float64x2_t:float32x2_t
 /// Floating-point convert to lower precision narrow
 name = vcvt_high
 noq-double-suffixes
-multi_fn = simd_shuffle4!, a, {simd_cast, b}, [0, 1, 2, 3]
+multi_fn = simd_shuffle!, a, {simd_cast, b}, [0, 1, 2, 3]
 a = -1.2, 1.2
 b = -2.3, 3.4
 validate -1.2, 1.2, -2.3f64 as f32, 3.4f64 as f32
@@ -1200,7 +1200,7 @@ generate f64:f32
 /// Floating-point convert to lower precision narrow, rounding to odd
 name = vcvtx_high
 noq-double-suffixes
-multi_fn = simd_shuffle4!, a, {vcvtx-noq_doubleself-noext, b}, [0, 1, 2, 3]
+multi_fn = simd_shuffle!, a, {vcvtx-noq_doubleself-noext, b}, [0, 1, 2, 3]
 a = -1.0, 2.0
 b = -3.0, 4.0
 validate -1.0, 2.0, -3.0, 4.0
@@ -1428,7 +1428,7 @@ name = vdup
 lane-suffixes
 constn = N
 multi_fn = static_assert_imm-in_exp_len-N
-multi_fn = simd_shuffle-out_len-!, a, a, {dup-out_len-N as u32}
+multi_fn = simd_shuffle!, a, a, {dup-out_len-N as u32}
 a = 1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16
 n = HFLEN
 validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
@@ -1454,7 +1454,7 @@ name = vdup
 lane-suffixes
 constn = N
 multi_fn = static_assert_imm-in_exp_len-N
-multi_fn = simd_shuffle-out_len-!, a, a, {dup-out_len-N as u32}
+multi_fn = simd_shuffle!, a, a, {dup-out_len-N as u32}
 a = 1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16
 n = HFLEN
 validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
@@ -1468,7 +1468,7 @@ name = vdup
 lane-suffixes
 constn = N
 multi_fn = static_assert_imm-in_exp_len-N
-multi_fn = simd_shuffle-out_len-!, a, a, {dup-out_len-N as u32}
+multi_fn = simd_shuffle!, a, a, {dup-out_len-N as u32}
 a = 1., 1., 1., 4.
 n = HFLEN
 validate 1., 1., 1., 1.
@@ -1569,7 +1569,7 @@ generate float32x2_t:f32, float32x4_t:f32, float64x1_t:f64, float64x2_t:f64
 name = vext
 constn = N
 multi_fn = static_assert_imm-out_exp_len-N
-multi_fn = matchn-out_exp_len-N, simd_shuffle-out_len-!, a, b, {asc-n-out_len}
+multi_fn = matchn-out_exp_len-N, simd_shuffle!, a, b, {asc-n-out_len}
 a = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
 n = LEN_M1
@@ -1583,7 +1583,7 @@ generate int*_t, uint*_t, poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t
 name = vext
 constn = N
 multi_fn = static_assert_imm-out_exp_len-N
-multi_fn = matchn-out_exp_len-N, simd_shuffle-out_len-!, a, b, {asc-n-out_len}
+multi_fn = matchn-out_exp_len-N, simd_shuffle!, a, b, {asc-n-out_len}
 a = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
 n = LEN_M1
@@ -1599,7 +1599,7 @@ generate int64x2_t, uint64x2_t
 name = vext
 constn = N
 multi_fn = static_assert_imm-out_exp_len-N
-multi_fn = matchn-out_exp_len-N, simd_shuffle-out_len-!, a, b, {asc-n-out_len}
+multi_fn = matchn-out_exp_len-N, simd_shuffle!, a, b, {asc-n-out_len}
 a = 1., 1., 1., 1.
 b = 2., 2., 2., 2.,
 n = LEN_M1
@@ -1669,7 +1669,7 @@ name = vmla
 in2-lane-suffixes
 constn = LANE
 multi_fn = static_assert_imm-in2_exp_len-LANE
-multi_fn = vmla-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}}
+multi_fn = vmla-self-noext, a, b, {simd_shuffle!, c, c, {dup-in_len-LANE as u32}}
 a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
 c = 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
@@ -1688,7 +1688,7 @@ name = vmla
 in2-lane-suffixes
 constn = LANE
 multi_fn = static_assert_imm-in2_exp_len-LANE
-multi_fn = vmla-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}}
+multi_fn = vmla-self-noext, a, b, {simd_shuffle!, c, c, {dup-in_len-LANE as u32}}
 a = 0., 1., 2., 3.
 b = 2., 2., 2., 2.
 c = 0., 3., 0., 0.
@@ -1743,7 +1743,7 @@ name = vmlal_lane
 in2-suffix
 constn = LANE
 multi_fn = static_assert_imm-in2_exp_len-LANE
-multi_fn = vmlal-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}}
+multi_fn = vmlal-self-noext, a, b, {simd_shuffle!, c, c, {dup-in_len-LANE as u32}}
 a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
 c = 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
@@ -1761,8 +1761,8 @@ generate uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t, uint64x2_t:uint32x2_t:uint
 /// Signed multiply-add long
 name = vmlal_high
 no-q
-multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right}
-multi_fn = simd_shuffle-out_len-!, c:half, c, c, {fixed-half-right}
+multi_fn = simd_shuffle!, b:half, b, b, {fixed-half-right}
+multi_fn = simd_shuffle!, c:half, c, c, {fixed-half-right}
 multi_fn = vmlal-noqself-noext, a, b, c
 a = 8, 7, 6, 5, 4, 3, 2, 1
 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
@@ -1776,8 +1776,8 @@ generate int16x8_t:int8x16_t:int8x16_t:int16x8_t, int32x4_t:int16x8_t:int16x8_t:
 /// Unsigned multiply-add long
 name = vmlal_high
 no-q
-multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right}
-multi_fn = simd_shuffle-out_len-!, c:half, c, c, {fixed-half-right}
+multi_fn = simd_shuffle!, b:half, b, b, {fixed-half-right}
+multi_fn = simd_shuffle!, c:half, c, c, {fixed-half-right}
 multi_fn = vmlal-noqself-noext, a, b, c
 a = 8, 7, 6, 5, 4, 3, 2, 1
 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
@@ -1807,7 +1807,7 @@ name = vmlal_high_lane
 in2-suffix
 constn = LANE
 multi_fn = static_assert_imm-in2_exp_len-LANE
-multi_fn = vmlal_high-noqself-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}}
+multi_fn = vmlal_high-noqself-noext, a, b, {simd_shuffle!, c, c, {dup-in_len-LANE as u32}}
 a = 8, 7, 6, 5, 4, 3, 2, 1
 b = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7
 c = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
@@ -1879,7 +1879,7 @@ name = vmls
 in2-lane-suffixes
 constn = LANE
 multi_fn = static_assert_imm-in2_exp_len-LANE
-multi_fn = vmls-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}}
+multi_fn = vmls-self-noext, a, b, {simd_shuffle!, c, c, {dup-in_len-LANE as u32}}
 a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
 c = 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
@@ -1898,7 +1898,7 @@ name = vmls
 in2-lane-suffixes
 constn = LANE
 multi_fn = static_assert_imm-in2_exp_len-LANE
-multi_fn = vmls-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}}
+multi_fn = vmls-self-noext, a, b, {simd_shuffle!, c, c, {dup-in_len-LANE as u32}}
 a = 6., 7., 8., 9.
 b = 2., 2., 2., 2.
 c = 0., 3., 0., 0.
@@ -1953,7 +1953,7 @@ name = vmlsl_lane
 in2-suffix
 constn = LANE
 multi_fn = static_assert_imm-in2_exp_len-LANE
-multi_fn = vmlsl-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}}
+multi_fn = vmlsl-self-noext, a, b, {simd_shuffle!, c, c, {dup-in_len-LANE as u32}}
 a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
 c = 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
@@ -1971,8 +1971,8 @@ generate uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t, uint64x2_t:uint32x2_t:uint
 /// Signed multiply-subtract long
 name = vmlsl_high
 no-q
-multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right}
-multi_fn = simd_shuffle-out_len-!, c:half, c, c, {fixed-half-right}
+multi_fn = simd_shuffle!, b:half, b, b, {fixed-half-right}
+multi_fn = simd_shuffle!, c:half, c, c, {fixed-half-right}
 multi_fn = vmlsl-noqself-noext, a, b, c
 a = 14, 15, 16, 17, 18, 19, 20, 21
 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
@@ -1986,8 +1986,8 @@ generate int16x8_t:int8x16_t:int8x16_t:int16x8_t, int32x4_t:int16x8_t:int16x8_t:
 /// Unsigned multiply-subtract long
 name = vmlsl_high
 no-q
-multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right}
-multi_fn = simd_shuffle-out_len-!, c:half, c, c, {fixed-half-right}
+multi_fn = simd_shuffle!, b:half, b, b, {fixed-half-right}
+multi_fn = simd_shuffle!, c:half, c, c, {fixed-half-right}
 multi_fn = vmlsl-noqself-noext, a, b, c
 a = 14, 15, 16, 17, 18, 19, 20, 21
 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
@@ -2017,7 +2017,7 @@ name = vmlsl_high_lane
 in2-suffix
 constn = LANE
 multi_fn = static_assert_imm-in2_exp_len-LANE
-multi_fn = vmlsl_high-noqself-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}}
+multi_fn = vmlsl_high-noqself-noext, a, b, {simd_shuffle!, c, c, {dup-in_len-LANE as u32}}
 a = 14, 15, 16, 17, 18, 19, 20, 21
 b = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7
 c = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
@@ -2035,7 +2035,7 @@ generate uint64x2_t:uint32x4_t:uint32x2_t:uint64x2_t, uint64x2_t:uint32x4_t:uint
 name = vmovn_high
 no-q
 multi_fn = simd_cast, c:in_t0, b
-multi_fn = simd_shuffle-out_len-!, a, c, {asc-0-out_len}
+multi_fn = simd_shuffle!, a, c, {asc-0-out_len}
 a = 0, 1, 2, 3, 2, 3, 4, 5
 b = 2, 3, 4, 5, 12, 13, 14, 15
 validate 0, 1, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 12, 13, 14, 15
@@ -3483,7 +3483,7 @@ name = vsudot
 out-lane-suffixes
 constn = LANE
 multi_fn = static_assert_imm-in2_dot-LANE
-multi_fn = simd_shuffle-in_len-!, c:unsigned, c, c, {base-4-LANE}
+multi_fn = simd_shuffle!, c:unsigned, c, c, {base-4-LANE}
 multi_fn = vsudot-outlane-_, a, b, c
 a = 1, 2, 1, 2
 b = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8
@@ -3567,7 +3567,7 @@ name = vmul
 lane-suffixes
 constn = LANE
 multi_fn = static_assert_imm-in_exp_len-LANE
-multi_fn = simd_mul, a, {simd_shuffle-out_len-!, b, b, {dup-out_len-LANE as u32}}
+multi_fn = simd_mul, a, {simd_shuffle!, b, b, {dup-out_len-LANE as u32}}
 a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
 b = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 n = 1
@@ -3599,7 +3599,7 @@ name = vmul
 lane-suffixes
 constn = LANE
 multi_fn = static_assert_imm-in_exp_len-LANE
-multi_fn = simd_mul, a, {simd_shuffle-out_len-!, b, b, {dup-out_len-LANE as u32}}
+multi_fn = simd_mul, a, {simd_shuffle!, b, b, {dup-out_len-LANE as u32}}
 a = 1., 2., 3., 4.
 b = 2., 0., 0., 0.
 n = 0
@@ -3652,8 +3652,8 @@ generate int8x8_t:int8x8_t:int16x8_t, int16x4_t:int16x4_t:int32x4_t, int32x2_t:i
 /// Signed multiply long
 name = vmull_high
 no-q
-multi_fn = simd_shuffle-out_len-!, a:half, a, a, {fixed-half-right}
-multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right}
+multi_fn = simd_shuffle!, a:half, a, a, {fixed-half-right}
+multi_fn = simd_shuffle!, b:half, b, b, {fixed-half-right}
 multi_fn = vmull-noqself-noext, a, b
 a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16
 b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2
@@ -3678,8 +3678,8 @@ generate uint8x8_t:uint8x8_t:uint16x8_t, uint16x4_t:uint16x4_t:uint32x4_t, uint3
 /// Unsigned multiply long
 name = vmull_high
 no-q
-multi_fn = simd_shuffle-out_len-!, a:half, a, a, {fixed-half-right}
-multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right}
+multi_fn = simd_shuffle!, a:half, a, a, {fixed-half-right}
+multi_fn = simd_shuffle!, b:half, b, b, {fixed-half-right}
 multi_fn = vmull-noqself-noext, a, b
 a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16
 b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2
@@ -3720,8 +3720,8 @@ generate p64:p64:p128
 /// Polynomial multiply long
 name = vmull_high
 no-q
-multi_fn = simd_shuffle-out_len-!, a:half, a, a, {fixed-half-right}
-multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right}
+multi_fn = simd_shuffle!, a:half, a, a, {fixed-half-right}
+multi_fn = simd_shuffle!, b:half, b, b, {fixed-half-right}
 multi_fn = vmull-noqself-noext, a, b
 a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16
 b = 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3
@@ -3761,7 +3761,7 @@ generate uint16x4_t:u16:uint32x4_t, uint32x2_t:u32:uint64x2_t
 name = vmull_lane
 constn = LANE
 multi_fn = static_assert_imm-in_exp_len-LANE
-multi_fn = vmull-in0-noext, a, {simd_shuffle-in0_len-!, b, b, {dup-in0_len-LANE as u32}}
+multi_fn = vmull-in0-noext, a, {simd_shuffle!, b, b, {dup-in0_len-LANE as u32}}
 a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
 b = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 n = 1
@@ -3792,7 +3792,7 @@ generate uint16x8_t:u16:uint32x4_t, uint32x4_t:u32:uint64x2_t
 name = vmull_high_lane
 constn = LANE
 multi_fn = static_assert_imm-in_exp_len-LANE
-multi_fn = vmull_high-noqself-noext, a, {simd_shuffle-in0_len-!, b, b, {dup-in0_len-LANE as u32}}
+multi_fn = vmull_high-noqself-noext, a, {simd_shuffle!, b, b, {dup-in0_len-LANE as u32}}
 a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16
 b = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 n = 1
@@ -3834,7 +3834,7 @@ name = vmulx
 lane-suffixes
 constn = LANE
 multi_fn = static_assert_imm-in_exp_len-LANE
-multi_fn = vmulx-in0-noext, a, {simd_shuffle-in0_len-!, b, b, {dup-in0_len-LANE as u32}}
+multi_fn = vmulx-in0-noext, a, {simd_shuffle!, b, b, {dup-in0_len-LANE as u32}}
 a = 1., 2., 3., 4.
 b = 2., 0., 0., 0.
 n = 0
@@ -4196,7 +4196,7 @@ generate uint16x8_t:uint8x8_t, uint32x4_t:uint16x4_t, uint64x2_t:uint32x2_t
 name = vsubhn_high
 no-q
 multi_fn = vsubhn-noqself-noext, d:in_t0, b, c
-multi_fn = simd_shuffle-out_len-!, a, d, {asc-0-out_len}
+multi_fn = simd_shuffle!, a, d, {asc-0-out_len}
 a = MAX, 0, MAX, 0, MAX, 0, MAX, 0
 b = MAX, 1, MAX, 1, MAX, 1, MAX, 1
 c = 1, 0, 1, 0, 1, 0, 1, 0
@@ -4252,7 +4252,7 @@ generate uint16x8_t:uint8x8_t:uint16x8_t, uint32x4_t:uint16x4_t:uint32x4_t, uint
 /// Signed Subtract Wide
 name = vsubw_high
 no-q
-multi_fn = simd_shuffle8!, c:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
+multi_fn = simd_shuffle!, c:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
 multi_fn = simd_sub, a, {simd_cast, c}
 a = 8, 9, 10, 12, 13, 14, 15, 16
 b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16
@@ -4264,7 +4264,7 @@ generate int16x8_t:int8x16_t:int16x8_t
 /// Signed Subtract Wide
 name = vsubw_high
 no-q
-multi_fn = simd_shuffle4!, c:int16x4_t, b, b, [4, 5, 6, 7]
+multi_fn = simd_shuffle!, c:int16x4_t, b, b, [4, 5, 6, 7]
 multi_fn = simd_sub, a, {simd_cast, c}
 a = 8, 9, 10, 11
 b = 0, 1, 2, 3, 8, 9, 10, 11
@@ -4276,7 +4276,7 @@ generate int32x4_t:int16x8_t:int32x4_t
 /// Signed Subtract Wide
 name = vsubw_high
 no-q
-multi_fn = simd_shuffle2!, c:int32x2_t, b, b, [2, 3]
+multi_fn = simd_shuffle!, c:int32x2_t, b, b, [2, 3]
 multi_fn = simd_sub, a, {simd_cast, c}
 a = 8, 9
 b = 6, 7, 8, 9
@@ -4288,7 +4288,7 @@ generate int64x2_t:int32x4_t:int64x2_t
 /// Unsigned Subtract Wide
 name = vsubw_high
 no-q
-multi_fn = simd_shuffle8!, c:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
+multi_fn = simd_shuffle!, c:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
 multi_fn = simd_sub, a, {simd_cast, c}
 a = 8, 9, 10, 11, 12, 13, 14, 15
 b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
@@ -4300,7 +4300,7 @@ generate uint16x8_t:uint8x16_t:uint16x8_t
 /// Unsigned Subtract Wide
 name = vsubw_high
 no-q
-multi_fn = simd_shuffle4!, c:uint16x4_t, b, b, [4, 5, 6, 7]
+multi_fn = simd_shuffle!, c:uint16x4_t, b, b, [4, 5, 6, 7]
 multi_fn = simd_sub, a, {simd_cast, c}
 a = 8, 9, 10, 11
 b = 0, 1, 2, 3, 8, 9, 10, 11
@@ -4312,7 +4312,7 @@ generate uint32x4_t:uint16x8_t:uint32x4_t
 /// Unsigned Subtract Wide
 name = vsubw_high
 no-q
-multi_fn = simd_shuffle2!, c:uint32x2_t, b, b, [2, 3]
+multi_fn = simd_shuffle!, c:uint32x2_t, b, b, [2, 3]
 multi_fn = simd_sub, a, {simd_cast, c}
 a = 8, 9
 b = 6, 7, 8, 9
@@ -4354,9 +4354,9 @@ generate uint8x8_t:uint8x8_t:uint16x8_t, uint16x4_t:uint16x4_t:uint32x4_t, uint3
 /// Signed Subtract Long
 name = vsubl_high
 no-q
-multi_fn = simd_shuffle8!, c:int8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15]
+multi_fn = simd_shuffle!, c:int8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15]
 multi_fn = simd_cast, d:out_t, c
-multi_fn = simd_shuffle8!, e:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
+multi_fn = simd_shuffle!, e:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
 multi_fn = simd_cast, f:out_t, e
 multi_fn = simd_sub, d, f
 
@@ -4370,9 +4370,9 @@ generate int8x16_t:int8x16_t:int16x8_t
 /// Signed Subtract Long
 name = vsubl_high
 no-q
-multi_fn = simd_shuffle4!, c:int16x4_t, a, a, [4, 5, 6, 7]
+multi_fn = simd_shuffle!, c:int16x4_t, a, a, [4, 5, 6, 7]
 multi_fn = simd_cast, d:out_t, c
-multi_fn = simd_shuffle4!, e:int16x4_t, b, b, [4, 5, 6, 7]
+multi_fn = simd_shuffle!, e:int16x4_t, b, b, [4, 5, 6, 7]
 multi_fn = simd_cast, f:out_t, e
 multi_fn = simd_sub, d, f
 
@@ -4386,9 +4386,9 @@ generate int16x8_t:int16x8_t:int32x4_t
 /// Signed Subtract Long
 name = vsubl_high
 no-q
-multi_fn = simd_shuffle2!, c:int32x2_t, a, a, [2, 3]
+multi_fn = simd_shuffle!, c:int32x2_t, a, a, [2, 3]
 multi_fn = simd_cast, d:out_t, c
-multi_fn = simd_shuffle2!, e:int32x2_t, b, b, [2, 3]
+multi_fn = simd_shuffle!, e:int32x2_t, b, b, [2, 3]
 multi_fn = simd_cast, f:out_t, e
 multi_fn = simd_sub, d, f
 
@@ -4402,9 +4402,9 @@ generate int32x4_t:int32x4_t:int64x2_t
 /// Unsigned Subtract Long
 name = vsubl_high
 no-q
-multi_fn = simd_shuffle8!, c:uint8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15]
+multi_fn = simd_shuffle!, c:uint8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15]
 multi_fn = simd_cast, d:out_t, c
-multi_fn = simd_shuffle8!, e:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
+multi_fn = simd_shuffle!, e:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
 multi_fn = simd_cast, f:out_t, e
 multi_fn = simd_sub, d, f
 
@@ -4418,9 +4418,9 @@ generate uint8x16_t:uint8x16_t:uint16x8_t
 /// Unsigned Subtract Long
 name = vsubl_high
 no-q
-multi_fn = simd_shuffle4!, c:uint16x4_t, a, a, [4, 5, 6, 7]
+multi_fn = simd_shuffle!, c:uint16x4_t, a, a, [4, 5, 6, 7]
 multi_fn = simd_cast, d:out_t, c
-multi_fn = simd_shuffle4!, e:uint16x4_t, b, b, [4, 5, 6, 7]
+multi_fn = simd_shuffle!, e:uint16x4_t, b, b, [4, 5, 6, 7]
 multi_fn = simd_cast, f:out_t, e
 multi_fn = simd_sub, d, f
 
@@ -4434,9 +4434,9 @@ generate uint16x8_t:uint16x8_t:uint32x4_t
 /// Unsigned Subtract Long
 name = vsubl_high
 no-q
-multi_fn = simd_shuffle2!, c:uint32x2_t, a, a, [2, 3]
+multi_fn = simd_shuffle!, c:uint32x2_t, a, a, [2, 3]
 multi_fn = simd_cast, d:out_t, c
-multi_fn = simd_shuffle2!, e:uint32x2_t, b, b, [2, 3]
+multi_fn = simd_shuffle!, e:uint32x2_t, b, b, [2, 3]
 multi_fn = simd_cast, f:out_t, e
 multi_fn = simd_sub, d, f
 
@@ -4545,7 +4545,7 @@ name = vcmla
 in2-lane-suffixes
 constn = LANE
 multi_fn = static_assert_imm-in2_rot-LANE
-multi_fn = simd_shuffle-out_len-!, c:out_t, c, c, {base-2-LANE}
+multi_fn = simd_shuffle!, c:out_t, c, c, {base-2-LANE}
 multi_fn = vcmla-self-noext, a, b, c
 a = 1., -1., 1., -1.
 b = -1., 1., -1., 1.
@@ -4563,7 +4563,7 @@ name = vcmla_rot90
 rot-lane-suffixes
 constn = LANE
 multi_fn = static_assert_imm-in2_rot-LANE
-multi_fn = simd_shuffle-out_len-!, c:out_t, c, c, {base-2-LANE}
+multi_fn = simd_shuffle!, c:out_t, c, c, {base-2-LANE}
 multi_fn = vcmla_rot90-rot-noext, a, b, c
 a = 1., -1., 1., -1.
 b = -1., 1., -1., 1.
@@ -4581,7 +4581,7 @@ name = vcmla_rot180
 rot-lane-suffixes
 constn = LANE
 multi_fn = static_assert_imm-in2_rot-LANE
-multi_fn = simd_shuffle-out_len-!, c:out_t, c, c, {base-2-LANE}
+multi_fn = simd_shuffle!, c:out_t, c, c, {base-2-LANE}
 multi_fn = vcmla_rot180-rot-noext, a, b, c
 a = 1., -1., 1., -1.
 b = -1., 1., -1., 1.
@@ -4599,7 +4599,7 @@ name = vcmla_rot270
 rot-lane-suffixes
 constn = LANE
 multi_fn = static_assert_imm-in2_rot-LANE
-multi_fn = simd_shuffle-out_len-!, c:out_t, c, c, {base-2-LANE}
+multi_fn = simd_shuffle!, c:out_t, c, c, {base-2-LANE}
 multi_fn = vcmla_rot270-rot-noext, a, b, c
 a = 1., -1., 1., -1.
 b = -1., 1., -1., 1.
@@ -4634,7 +4634,7 @@ name = vdot
 out-lane-suffixes
 constn = LANE
 multi_fn = static_assert_imm-in2_dot-LANE
-multi_fn = simd_shuffle-in_len-!, c:in_t, c, c, {base-4-LANE}
+multi_fn = simd_shuffle!, c:in_t, c, c, {base-4-LANE}
 multi_fn = vdot-out-noext, a, b, c
 a = 1, 2, 1, 2
 b = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8
@@ -4820,7 +4820,7 @@ generate float32x4_t:f32
 /// Vector move
 name = vmovl_high
 no-q
-multi_fn = simd_shuffle-out_len-!, a:half, a, a, {asc-halflen-halflen}
+multi_fn = simd_shuffle!, a:half, a, a, {asc-halflen-halflen}
 multi_fn = vmovl-noqself-noext, a
 a = 1, 2, 3, 4, 3, 4, 5, 6, 3, 4, 5, 6, 7, 8, 9, 10
 validate 3, 4, 5, 6, 7, 8, 9, 10
@@ -4949,8 +4949,8 @@ generate int16x4_t:i16:int32x4_t, int32x2_t:i32:int64x2_t
 /// Signed saturating doubling multiply long
 name = vqdmull_high
 no-q
-multi_fn = simd_shuffle-out_len-!, a:half, a, a, {asc-halflen-halflen}
-multi_fn = simd_shuffle-out_len-!, b:half, b, b, {asc-halflen-halflen}
+multi_fn = simd_shuffle!, a:half, a, a, {asc-halflen-halflen}
+multi_fn = simd_shuffle!, b:half, b, b, {asc-halflen-halflen}
 multi_fn = vqdmull-noqself-noext, a, b
 a = 0, 1, 4, 5, 4, 5, 6, 7
 b = 1, 2, 5, 6, 5, 6, 7, 8
@@ -4962,7 +4962,7 @@ generate int16x8_t:int16x8_t:int32x4_t, int32x4_t:int32x4_t:int64x2_t
 /// Signed saturating doubling multiply long
 name = vqdmull_high_n
 no-q
-multi_fn = simd_shuffle-out_len-!, a:in_ntt, a, a, {asc-out_len-out_len}
+multi_fn = simd_shuffle!, a:in_ntt, a, a, {asc-out_len-out_len}
 multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b
 multi_fn = vqdmull-in_ntt-noext, a, b
 a = 0, 2, 8, 10, 8, 10, 12, 14
@@ -4976,7 +4976,7 @@ generate int16x8_t:i16:int32x4_t, int32x4_t:i32:int64x2_t
 name = vqdmull_lane
 constn = N
 multi_fn = static_assert_imm-in_exp_len-N
-multi_fn = simd_shuffle-out_len-!, b:in_t0, b, b, {dup-out_len-N as u32}
+multi_fn = simd_shuffle!, b:in_t0, b, b, {dup-out_len-N as u32}
 multi_fn = vqdmull-noqself-noext, a, b
 a = 1, 2, 3, 4
 b = 0, 2, 2, 0, 2, 0, 0, 0
@@ -5021,8 +5021,8 @@ generate i32:int32x2_t:i64, i32:int32x4_t:i64
 name = vqdmull_high_lane
 constn = N
 multi_fn = static_assert_imm-in_exp_len-N
-multi_fn = simd_shuffle-out_len-!, a:in_t, a, a, {asc-out_len-out_len}
-multi_fn = simd_shuffle-out_len-!, b:in_t, b, b, {dup-out_len-N as u32}
+multi_fn = simd_shuffle!, a:in_t, a, a, {asc-out_len-out_len}
+multi_fn = simd_shuffle!, b:in_t, b, b, {dup-out_len-N as u32}
 multi_fn = vqdmull-self-noext, a, b
 a = 0, 1, 4, 5, 4, 5, 6, 7
 b = 0, 2, 2, 0, 2, 0, 0, 0
@@ -5036,8 +5036,8 @@ generate int16x8_t:int16x4_t:int32x4_t, int32x4_t:int32x2_t:int64x2_t
 name = vqdmull_high_lane
 constn = N
 multi_fn = static_assert_imm-in_exp_len-N
-multi_fn = simd_shuffle-out_len-!, a:half, a, a, {asc-out_len-out_len}
-multi_fn = simd_shuffle-out_len-!, b:half, b, b, {dup-out_len-N as u32}
+multi_fn = simd_shuffle!, a:half, a, a, {asc-out_len-out_len}
+multi_fn = simd_shuffle!, b:half, b, b, {dup-out_len-N as u32}
 multi_fn = vqdmull-noqself-noext, a, b
 a = 0, 1, 4, 5, 4, 5, 6, 7
 b = 0, 2, 2, 0, 2, 0, 0, 0
@@ -5418,7 +5418,7 @@ generate u64:u32
 /// Signed saturating extract narrow
 name = vqmovn_high
 no-q
-multi_fn = simd_shuffle-out_len-!, a, {vqmovn-noqself-noext, b}, {asc-0-out_len}
+multi_fn = simd_shuffle!, a, {vqmovn-noqself-noext, b}, {asc-0-out_len}
 a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX
 b = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX
 validate MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX
@@ -5452,7 +5452,7 @@ generate i16:u8, i32:u16, i64:u32
 /// Signed saturating extract unsigned narrow
 name = vqmovun_high
 no-q
-multi_fn = simd_shuffle-out_len-!, a, {vqmovun-noqself-noext, b}, {asc-0-out_len}
+multi_fn = simd_shuffle!, a, {vqmovun-noqself-noext, b}, {asc-0-out_len}
 a = 0, 0, 0, 0, 0, 0, 0, 0
 b = -1, -1, -1, -1, -1, -1, -1, -1
 validate 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
@@ -5499,7 +5499,7 @@ name = vqrdmulh
 lane-suffixes
 constn = LANE
 multi_fn = static_assert_imm-in_exp_len-LANE
-multi_fn = simd_shuffle-out_len-!, b:out_t, b, b, {dup-out_len-LANE as u32}
+multi_fn = simd_shuffle!, b:out_t, b, b, {dup-out_len-LANE as u32}
 multi_fn = vqrdmulh-out-noext, a, b
 a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX
 b = 0, 2, 0, 0, 0, 0, 0, 0,
@@ -5557,7 +5557,7 @@ name = vqrdmlah
 in2-lane-suffixes
 constn = LANE
 multi_fn = static_assert_imm-in2_exp_len-LANE
-multi_fn = simd_shuffle-out_len-!, c:out_t, c, c, {dup-out_len-LANE as u32}
+multi_fn = simd_shuffle!, c:out_t, c, c, {dup-out_len-LANE as u32}
 multi_fn = vqrdmlah-out-noext, a, b, c
 a = 1, 1, 1, 1, 1, 1, 1, 1
 b = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX
@@ -5618,7 +5618,7 @@ name = vqrdmlsh
 in2-lane-suffixes
 constn = LANE
 multi_fn = static_assert_imm-in2_exp_len-LANE
-multi_fn = simd_shuffle-out_len-!, c:out_t, c, c, {dup-out_len-LANE as u32}
+multi_fn = simd_shuffle!, c:out_t, c, c, {dup-out_len-LANE as u32}
 multi_fn = vqrdmlsh-out-noext, a, b, c
 a = 1, 1, 1, 1, 1, 1, 1, 1
 b = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX
@@ -5740,7 +5740,7 @@ name = vqrshrn_high
 noq-n-suffix
 constn = N
 multi_fn = static_assert-N-1-halfbits
-multi_fn = simd_shuffle-out_len-!, a, {vqrshrn_n-noqself-::<N>, b}, {asc-0-out_len}
+multi_fn = simd_shuffle!, a, {vqrshrn_n-noqself-::<N>, b}, {asc-0-out_len}
 a = 0, 1, 2, 3, 2, 3, 6, 7
 b = 8, 12, 24, 28, 48, 52, 56, 60
 n = 2
@@ -5787,7 +5787,7 @@ name = vqrshrn_high
 noq-n-suffix
 constn = N
 multi_fn = static_assert-N-1-halfbits
-multi_fn = simd_shuffle-out_len-!, a, {vqrshrn_n-noqself-::<N>, b}, {asc-0-out_len}
+multi_fn = simd_shuffle!, a, {vqrshrn_n-noqself-::<N>, b}, {asc-0-out_len}
 a = 0, 1, 2, 3, 2, 3, 6, 7
 b = 8, 12, 24, 28, 48, 52, 56, 60
 n = 2
@@ -5834,7 +5834,7 @@ name = vqrshrun_high
 noq-n-suffix
 constn = N
 multi_fn = static_assert-N-1-halfbits
-multi_fn = simd_shuffle-out_len-!, a, {vqrshrun_n-noqself-::<N>, b}, {asc-0-out_len}
+multi_fn = simd_shuffle!, a, {vqrshrun_n-noqself-::<N>, b}, {asc-0-out_len}
 a = 0, 1, 2, 3, 2, 3, 6, 7
 b = 8, 12, 24, 28, 48, 52, 56, 60
 n = 2
@@ -6020,7 +6020,7 @@ name = vqshrn_high
 noq-n-suffix
 constn = N
 multi_fn = static_assert-N-1-halfbits
-multi_fn = simd_shuffle-out_len-!, a, {vqshrn_n-noqself-::<N>, b}, {asc-0-out_len}
+multi_fn = simd_shuffle!, a, {vqshrn_n-noqself-::<N>, b}, {asc-0-out_len}
 a = 0, 1, 8, 9, 8, 9, 10, 11
 b = 32, 36, 40, 44, 48, 52, 56, 60
 n = 2
@@ -6067,7 +6067,7 @@ name = vqshrn_high
 noq-n-suffix
 constn = N
 multi_fn = static_assert-N-1-halfbits
-multi_fn = simd_shuffle-out_len-!, a, {vqshrn_n-noqself-::<N>, b}, {asc-0-out_len}
+multi_fn = simd_shuffle!, a, {vqshrn_n-noqself-::<N>, b}, {asc-0-out_len}
 a = 0, 1, 8, 9, 8, 9, 10, 11
 b = 32, 36, 40, 44, 48, 52, 56, 60
 n = 2
@@ -6113,7 +6113,7 @@ name = vqshrun_high
 noq-n-suffix
 constn = N
 multi_fn = static_assert-N-1-halfbits
-multi_fn = simd_shuffle-out_len-!, a, {vqshrun_n-noqself-::<N>, b}, {asc-0-out_len}
+multi_fn = simd_shuffle!, a, {vqshrun_n-noqself-::<N>, b}, {asc-0-out_len}
 a = 0, 1, 8, 9, 8, 9, 10, 11
 b = 32, 36, 40, 44, 48, 52, 56, 60
 n = 2
@@ -6574,7 +6574,7 @@ name = vrshrn_high
 noq-n-suffix
 constn = N
 multi_fn = static_assert-N-1-halfbits
-multi_fn = simd_shuffle-out_len-!, a, {vrshrn_n-noqself-::<N>, b}, {asc-0-out_len}
+multi_fn = simd_shuffle!, a, {vrshrn_n-noqself-::<N>, b}, {asc-0-out_len}
 a = 0, 1, 8, 9, 8, 9, 10, 11
 b = 32, 36, 40, 44, 48, 52, 56, 60
 n = 2
@@ -6673,7 +6673,7 @@ generate uint16x8_t:uint16x8_t:uint8x8_t, uint32x4_t:uint32x4_t:uint16x4_t, uint
 name = vrsubhn_high
 no-q
 multi_fn = vrsubhn-noqself-noext, x:in_t0, b, c
-multi_fn = simd_shuffle-out_len-!, a, x, {asc-0-out_len}
+multi_fn = simd_shuffle!, a, x, {asc-0-out_len}
 a = 1, 2, 0, 0, 0, 0, 0, 0
 b = 1, 2, 3, 4, 5, 6, 7, 8
 c = 1, 2, 3, 4, 5, 6, 7, 8
@@ -6841,7 +6841,7 @@ name = vshll_high_n
 no-q
 constn = N
 multi_fn = static_assert-N-0-bits
-multi_fn = simd_shuffle-out_len-!, b:half, a, a, {asc-halflen-halflen}
+multi_fn = simd_shuffle!, b:half, a, a, {asc-halflen-halflen}
 multi_fn = vshll_n-noqself-::<N>, b
 a = 0, 0, 1, 2, 1, 2, 3, 4, 1, 2, 3, 4, 5, 6, 7, 8
 n = 2
@@ -6889,7 +6889,7 @@ name = vshrn_high_n
 no-q
 constn = N
 multi_fn = static_assert-N-1-halfbits
-multi_fn = simd_shuffle-out_len-!, a, {vshrn_n-noqself-::<N>, b}, {asc-0-out_len}
+multi_fn = simd_shuffle!, a, {vshrn_n-noqself-::<N>, b}, {asc-0-out_len}
 a = 1, 2, 5, 6, 5, 6, 7, 8
 b = 20, 24, 28, 32, 52, 56, 60, 64
 n = 2
@@ -7087,8 +7087,8 @@ generate float32x2_t, float32x4_t
 
 /// Transpose elements
 name = vtrn
-multi_fn = simd_shuffle-in_len-!, a1:in_t, a, b, {transpose-1-in_len}
-multi_fn = simd_shuffle-in_len-!, b1:in_t, a, b, {transpose-2-in_len}
+multi_fn = simd_shuffle!, a1:in_t, a, b, {transpose-1-in_len}
+multi_fn = simd_shuffle!, b1:in_t, a, b, {transpose-2-in_len}
 multi_fn = transmute, (a1, b1)
 a = 0, 2, 2, 6, 2, 10, 6, 14, 2, 18, 6, 22, 10, 26, 14, 30
 b = 1, 3, 3, 7, 3, 1, 7, 15, 3, 19, 7, 23, 1, 27, 15, 31
@@ -7104,8 +7104,8 @@ generate int32x2_t:int32x2_t:int32x2x2_t, uint32x2_t:uint32x2_t:uint32x2x2_t
 
 /// Transpose elements
 name = vtrn
-multi_fn = simd_shuffle-in_len-!, a1:in_t, a, b, {transpose-1-in_len}
-multi_fn = simd_shuffle-in_len-!, b1:in_t, a, b, {transpose-2-in_len}
+multi_fn = simd_shuffle!, a1:in_t, a, b, {transpose-1-in_len}
+multi_fn = simd_shuffle!, b1:in_t, a, b, {transpose-2-in_len}
 multi_fn = transmute, (a1, b1)
 a = 0., 2., 2., 6.
 b = 1., 3., 3., 7.
@@ -7119,7 +7119,7 @@ generate float32x4_t:float32x4_t:float32x4x2_t
 
 /// Transpose vectors
 name = vtrn1
-multi_fn = simd_shuffle-in_len-!, a, b, {transpose-1-in_len}
+multi_fn = simd_shuffle!, a, b, {transpose-1-in_len}
 a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
 b = 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
 validate 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
@@ -7132,7 +7132,7 @@ generate int32x2_t, int64x2_t, uint32x2_t, uint64x2_t, poly64x2_t
 
 /// Transpose vectors
 name = vtrn1
-multi_fn = simd_shuffle-in_len-!, a, b, {transpose-1-in_len}
+multi_fn = simd_shuffle!, a, b, {transpose-1-in_len}
 a = 0., 2., 4., 6., 8., 10., 12., 14.
 b = 1., 3., 5., 7., 9., 11., 13., 15.
 validate 0., 1., 4., 5., 8., 9., 12., 13.
@@ -7145,7 +7145,7 @@ generate float32x2_t, float64x2_t
 
 /// Transpose vectors
 name = vtrn2
-multi_fn = simd_shuffle-in_len-!, a, b, {transpose-2-in_len}
+multi_fn = simd_shuffle!, a, b, {transpose-2-in_len}
 a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
 b = 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
 validate 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
@@ -7158,7 +7158,7 @@ generate int32x2_t, int64x2_t, uint32x2_t, uint64x2_t, poly64x2_t
 
 /// Transpose vectors
 name = vtrn2
-multi_fn = simd_shuffle-in_len-!, a, b, {transpose-2-in_len}
+multi_fn = simd_shuffle!, a, b, {transpose-2-in_len}
 a = 0., 2., 4., 6., 8., 10., 12., 14.
 b = 1., 3., 5., 7., 9., 11., 13., 15.
 validate 2., 3., 6., 7., 10., 11., 14., 15.
@@ -7171,8 +7171,8 @@ generate float32x2_t, float64x2_t
 
 /// Zip vectors
 name = vzip
-multi_fn = simd_shuffle-in_len-!, a0:in_t, a, b, {zip-1-in_len}
-multi_fn = simd_shuffle-in_len-!, b0:in_t, a, b, {zip-2-in_len}
+multi_fn = simd_shuffle!, a0:in_t, a, b, {zip-1-in_len}
+multi_fn = simd_shuffle!, b0:in_t, a, b, {zip-2-in_len}
 multi_fn = transmute, (a0, b0)
 a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
 b = 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
@@ -7193,8 +7193,8 @@ generate poly8x16_t:poly8x16_t:poly8x16x2_t, poly16x8_t:poly16x8_t:poly16x8x2_t
 
 /// Zip vectors
 name = vzip
-multi_fn = simd_shuffle-in_len-!, a0:in_t, a, b, {zip-1-in_len}
-multi_fn = simd_shuffle-in_len-!, b0:in_t, a, b, {zip-2-in_len}
+multi_fn = simd_shuffle!, a0:in_t, a, b, {zip-1-in_len}
+multi_fn = simd_shuffle!, b0:in_t, a, b, {zip-2-in_len}
 multi_fn = transmute, (a0, b0)
 a = 1., 2., 3., 4.
 b = 5., 6., 7., 8.
@@ -7209,7 +7209,7 @@ generate float32x4_t:float32x4_t:float32x4x2_t
 
 /// Zip vectors
 name = vzip1
-multi_fn = simd_shuffle-in_len-!, a, b, {zip-1-in_len}
+multi_fn = simd_shuffle!, a, b, {zip-1-in_len}
 a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
 b = 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
 validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
@@ -7219,7 +7219,7 @@ generate int*_t, int64x2_t, uint*_t, uint64x2_t, poly8x8_t, poly8x16_t, poly16x4
 
 /// Zip vectors
 name = vzip1
-multi_fn = simd_shuffle-in_len-!, a, b, {zip-1-in_len}
+multi_fn = simd_shuffle!, a, b, {zip-1-in_len}
 a = 0., 2., 4., 6., 8., 10., 12., 14.
 b = 1., 3., 5., 7., 9., 11., 13., 15.
 validate 0., 1., 2., 3., 4., 5., 6., 7.
@@ -7229,7 +7229,7 @@ generate float32x2_t, float32x4_t, float64x2_t
 
 /// Zip vectors
 name = vzip2
-multi_fn = simd_shuffle-in_len-!, a, b, {zip-2-in_len}
+multi_fn = simd_shuffle!, a, b, {zip-2-in_len}
 a = 0, 16, 16, 18, 16, 18, 20, 22, 16, 18, 20, 22, 24, 26, 28, 30
 b = 1, 17, 17, 19, 17, 19, 21, 23, 17, 19, 21, 23, 25, 27, 29, 31
 validate 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
@@ -7239,7 +7239,7 @@ generate int*_t, int64x2_t, uint*_t, uint64x2_t, poly8x8_t, poly8x16_t, poly16x4
 
 /// Zip vectors
 name = vzip2
-multi_fn = simd_shuffle-in_len-!, a, b, {zip-2-in_len}
+multi_fn = simd_shuffle!, a, b, {zip-2-in_len}
 a = 0., 8., 8., 10., 8., 10., 12., 14.
 b = 1., 9., 9., 11., 9., 11., 13., 15.
 validate 8., 9., 10., 11., 12., 13., 14., 15.
@@ -7249,8 +7249,8 @@ generate float32x2_t, float32x4_t, float64x2_t
 
 /// Unzip vectors
 name = vuzp
-multi_fn = simd_shuffle-in_len-!, a0:in_t, a, b, {unzip-1-in_len}
-multi_fn = simd_shuffle-in_len-!, b0:in_t, a, b, {unzip-2-in_len}
+multi_fn = simd_shuffle!, a0:in_t, a, b, {unzip-1-in_len}
+multi_fn = simd_shuffle!, b0:in_t, a, b, {unzip-2-in_len}
 multi_fn = transmute, (a0, b0)
 a = 1, 2, 2, 3, 2, 3, 3, 8, 2, 3, 3, 8, 3, 15, 8, 16
 b = 2, 3, 3, 8, 3, 15, 8, 16, 3, 29, 8, 30, 15, 31, 16, 32
@@ -7267,8 +7267,8 @@ generate int32x2_t:int32x2_t:int32x2x2_t, uint32x2_t:uint32x2_t:uint32x2x2_t
 
 /// Unzip vectors
 name = vuzp
-multi_fn = simd_shuffle-in_len-!, a0:in_t, a, b, {unzip-1-in_len}
-multi_fn = simd_shuffle-in_len-!, b0:in_t, a, b, {unzip-2-in_len}
+multi_fn = simd_shuffle!, a0:in_t, a, b, {unzip-1-in_len}
+multi_fn = simd_shuffle!, b0:in_t, a, b, {unzip-2-in_len}
 multi_fn = transmute, (a0, b0)
 a = 1., 2., 2., 4.
 b = 2., 6., 6., 8.
@@ -7283,7 +7283,7 @@ generate float32x4_t:float32x4_t:float32x4x2_t
 
 /// Unzip vectors
 name = vuzp1
-multi_fn = simd_shuffle-in_len-!, a, b, {unzip-1-in_len}
+multi_fn = simd_shuffle!, a, b, {unzip-1-in_len}
 a = 1, 0, 2, 0, 2, 0, 3, 0, 2, 0, 3, 0, 7, 0, 8, 0
 b = 2, 0, 3, 0, 7, 0, 8, 0, 13, 0, 14, 0, 15, 0, 16, 0
 validate 1, 2, 2, 3, 2, 3, 7, 8, 2, 3, 7, 8, 13, 14, 15, 16
@@ -7296,7 +7296,7 @@ generate int32x2_t, int64x2_t, uint32x2_t, uint64x2_t, poly64x2_t
 
 /// Unzip vectors
 name = vuzp1
-multi_fn = simd_shuffle-in_len-!, a, b, {unzip-1-in_len}
+multi_fn = simd_shuffle!, a, b, {unzip-1-in_len}
 a = 0., 8., 1., 9., 4., 12., 5., 13.
 b = 1., 10., 3., 11., 6., 14., 7., 15.
 validate 0., 1., 1., 3., 4., 5., 6., 7.
@@ -7309,7 +7309,7 @@ generate float32x2_t, float64x2_t
 
 /// Unzip vectors
 name = vuzp2
-multi_fn = simd_shuffle-in_len-!, a, b, {unzip-2-in_len}
+multi_fn = simd_shuffle!, a, b, {unzip-2-in_len}
 a = 0, 17, 0, 18, 0, 18, 0, 19, 0, 18, 0, 19, 0, 23, 0, 24
 b = 0, 18, 0, 19, 0, 23, 0, 24, 0, 29, 0, 30, 0, 31, 0, 32
 validate 17, 18, 18, 19, 18, 19, 23, 24, 18, 19, 23, 24, 29, 30, 31, 32
@@ -7322,7 +7322,7 @@ generate int32x2_t, int64x2_t, uint32x2_t, uint64x2_t, poly64x2_t
 
 /// Unzip vectors
 name = vuzp2
-multi_fn = simd_shuffle-in_len-!, a, b, {unzip-2-in_len}
+multi_fn = simd_shuffle!, a, b, {unzip-2-in_len}
 a = 0., 8., 1., 9., 4., 12., 5., 13.
 b = 2., 9., 3., 11., 6., 14., 7., 15.
 validate 8., 9., 9., 11., 12., 13., 14., 15.
@@ -7353,8 +7353,8 @@ generate uint16x8_t:uint8x8_t:uint8x8_t:uint16x8_t, uint32x4_t:uint16x4_t:uint16
 /// Unsigned Absolute difference and Accumulate Long
 name = vabal_high
 no-q
-multi_fn = simd_shuffle8!, d:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
-multi_fn = simd_shuffle8!, e:uint8x8_t, c, c, [8, 9, 10, 11, 12, 13, 14, 15]
+multi_fn = simd_shuffle!, d:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
+multi_fn = simd_shuffle!, e:uint8x8_t, c, c, [8, 9, 10, 11, 12, 13, 14, 15]
 multi_fn = vabd_u8, d, e, f:uint8x8_t
 multi_fn = simd_add, a, {simd_cast, f}
 a = 9, 10, 11, 12, 13, 14, 15, 16
@@ -7368,8 +7368,8 @@ generate uint16x8_t:uint8x16_t:uint8x16_t:uint16x8_t
 /// Unsigned Absolute difference and Accumulate Long
 name = vabal_high
 no-q
-multi_fn = simd_shuffle4!, d:uint16x4_t, b, b, [4, 5, 6, 7]
-multi_fn = simd_shuffle4!, e:uint16x4_t, c, c, [4, 5, 6, 7]
+multi_fn = simd_shuffle!, d:uint16x4_t, b, b, [4, 5, 6, 7]
+multi_fn = simd_shuffle!, e:uint16x4_t, c, c, [4, 5, 6, 7]
 multi_fn = vabd_u16, d, e, f:uint16x4_t
 multi_fn = simd_add, a, {simd_cast, f}
 a = 9, 10, 11, 12
@@ -7383,8 +7383,8 @@ generate uint32x4_t:uint16x8_t:uint16x8_t:uint32x4_t
 /// Unsigned Absolute difference and Accumulate Long
 name = vabal_high
 no-q
-multi_fn = simd_shuffle2!, d:uint32x2_t, b, b, [2, 3]
-multi_fn = simd_shuffle2!, e:uint32x2_t, c, c, [2, 3]
+multi_fn = simd_shuffle!, d:uint32x2_t, b, b, [2, 3]
+multi_fn = simd_shuffle!, e:uint32x2_t, c, c, [2, 3]
 multi_fn = vabd_u32, d, e, f:uint32x2_t
 multi_fn = simd_add, a, {simd_cast, f}
 a = 15, 16
@@ -7444,8 +7444,8 @@ generate int64x2_t:int32x2_t:int32x2_t:int64x2_t
 /// Signed Absolute difference and Accumulate Long
 name = vabal_high
 no-q
-multi_fn = simd_shuffle8!, d:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
-multi_fn = simd_shuffle8!, e:int8x8_t, c, c, [8, 9, 10, 11, 12, 13, 14, 15]
+multi_fn = simd_shuffle!, d:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
+multi_fn = simd_shuffle!, e:int8x8_t, c, c, [8, 9, 10, 11, 12, 13, 14, 15]
 multi_fn = vabd_s8, d, e, f:int8x8_t
 multi_fn = simd_cast, f:uint8x8_t, f
 multi_fn = simd_add, a, {simd_cast, f}
@@ -7460,8 +7460,8 @@ generate int16x8_t:int8x16_t:int8x16_t:int16x8_t
 /// Signed Absolute difference and Accumulate Long
 name = vabal_high
 no-q
-multi_fn = simd_shuffle4!, d:int16x4_t, b, b, [4, 5, 6, 7]
-multi_fn = simd_shuffle4!, e:int16x4_t, c, c, [4, 5, 6, 7]
+multi_fn = simd_shuffle!, d:int16x4_t, b, b, [4, 5, 6, 7]
+multi_fn = simd_shuffle!, e:int16x4_t, c, c, [4, 5, 6, 7]
 multi_fn = vabd_s16, d, e, f:int16x4_t
 multi_fn = simd_cast, f:uint16x4_t, f
 multi_fn = simd_add, a, {simd_cast, f}
@@ -7476,8 +7476,8 @@ generate int32x4_t:int16x8_t:int16x8_t:int32x4_t
 /// Signed Absolute difference and Accumulate Long
 name = vabal_high
 no-q
-multi_fn = simd_shuffle2!, d:int32x2_t, b, b, [2, 3]
-multi_fn = simd_shuffle2!, e:int32x2_t, c, c, [2, 3]
+multi_fn = simd_shuffle!, d:int32x2_t, b, b, [2, 3]
+multi_fn = simd_shuffle!, e:int32x2_t, c, c, [2, 3]
 multi_fn = vabd_s32, d, e, f:int32x2_t
 multi_fn = simd_cast, f:uint32x2_t, f
 multi_fn = simd_add, a, {simd_cast, f}
@@ -7490,10 +7490,10 @@ aarch64 = sabal
 generate int64x2_t:int32x4_t:int32x4_t:int64x2_t
 
 ////////////////////
-// Singned saturating Absolute value
+// Signed saturating Absolute value
 ////////////////////
 
-/// Singned saturating Absolute value
+/// Signed saturating Absolute value
 name = vqabs
 a = MIN, MAX, -6, -5, -4, -3, -2, -1, 0, -127, 127, 1, 2, 3, 4, 5
 validate MAX, MAX, 6, 5, 4, 3, 2, 1, 0, 127, 127, 1, 2, 3, 4, 5
@@ -7504,7 +7504,7 @@ link-arm = vqabs._EXT_
 link-aarch64 = sqabs._EXT_
 generate int*_t
 
-/// Singned saturating Absolute value
+/// Signed saturating Absolute value
 name = vqabs
 a = MIN, -7
 validate MAX, 7
diff --git a/library/stdarch/crates/stdarch-gen/src/main.rs b/library/stdarch/crates/stdarch-gen/src/main.rs
index 750e88091..652aee88c 100644
--- a/library/stdarch/crates/stdarch-gen/src/main.rs
+++ b/library/stdarch/crates/stdarch-gen/src/main.rs
@@ -1,6 +1,7 @@
 use self::Suffix::*;
 use self::TargetFeature::*;
 use std::env;
+use std::fmt;
 use std::fs::File;
 use std::io::prelude::*;
 use std::io::{self, BufReader};
@@ -470,6 +471,199 @@ enum TargetFeature {
     FTTS,
 }
 
+impl TargetFeature {
+    /// A string for use with `#[target_feature(...)]`.
+    fn as_target_feature_arg_aarch64(&self) -> &str {
+        match *self {
+            // Features included with AArch64 NEON.
+            Self::Default => "neon",
+            Self::ArmV7 => "neon",
+            Self::Vfp4 => "neon",
+            Self::FPArmV8 => "neon",
+            // Optional features.
+            Self::AES => "neon,aes",
+            Self::FCMA => "neon,fcma",
+            Self::Dotprod => "neon,dotprod",
+            Self::I8MM => "neon,i8mm",
+            Self::SHA3 => "neon,sha3",
+            Self::RDM => "rdm",
+            Self::SM4 => "neon,sm4",
+            Self::FTTS => "neon,frintts",
+        }
+    }
+
+    /// A string for use with `#[simd_test(...)]` (or `is_aarch64_feature_detected!(...)`).
+    fn as_simd_test_arg_aarch64(&self) -> &str {
+        self.as_target_feature_arg_aarch64()
+    }
+
+    /// A string for use with `#[target_feature(...)]`.
+    fn as_target_feature_arg_arm(&self) -> &str {
+        match *self {
+            Self::Default => "neon,v7",
+            Self::ArmV7 => "neon,v7",
+            Self::Vfp4 => "neon,vfp4",
+            Self::FPArmV8 => "neon,fp-armv8,v8",
+            Self::AES => "neon,v8,aes",
+            Self::FCMA => "neon,v8,fcma",
+            Self::Dotprod => "neon,v8,dotprod",
+            Self::I8MM => "neon,v8,i8mm",
+            // Features not supported on 32-bit "arm".
+            Self::SHA3 => unimplemented!(),
+            Self::RDM => unimplemented!(),
+            Self::SM4 => unimplemented!(),
+            Self::FTTS => unimplemented!(),
+        }
+    }
+
+    /// A string for use with `#[simd_test(...)]` (or `is_arm_feature_detected!(...)`).
+    fn as_simd_test_arg_arm(&self) -> &str {
+        // TODO: Ideally, these would match the target_feature strings (as for AArch64).
+        match *self {
+            // We typically specify the "v7" or "v8" target_features for codegen, but we can't test
+            // them at runtime. However, in many cases we can test a specific named feature, and
+            // this is sufficient. For example, Neon always requires at least Armv7.
+
+            // "v7" extensions.
+            Self::Default => "neon",
+            Self::ArmV7 => "neon",
+
+            // TODO: We can specify these features for code generation, but they have no runtime
+            // detection, so we can't provide an accurate string for simd_test. For now, we use a
+            // common Armv8 feature as a proxy, but we should improve std_detect support here and
+            // update these accordingly.
+            Self::Vfp4 => "neon,crc",
+            Self::FPArmV8 => "neon,crc",
+
+            // "v8" extensions.
+            Self::AES => "neon,aes",
+            Self::FCMA => "neon,fcma",
+            Self::Dotprod => "neon,dotprod",
+            Self::I8MM => "neon,i8mm",
+
+            // Features not supported on 32-bit "arm".
+            Self::SHA3 => unimplemented!(),
+            Self::RDM => unimplemented!(),
+            Self::SM4 => unimplemented!(),
+            Self::FTTS => unimplemented!(),
+        }
+    }
+
+    fn attr(name: &str, value: impl fmt::Display) -> String {
+        format!(r#"#[{name}(enable = "{value}")]"#)
+    }
+
+    fn attr_for_arch(arch: &str, name: &str, value: impl fmt::Display) -> String {
+        format!(r#"#[cfg_attr(target_arch = "{arch}", {name}(enable = "{value}"))]"#)
+    }
+
+    /// Generate target_feature attributes for a function compiled for both "arm" and "aarch64".
+    fn to_target_feature_attr_shared(&self) -> Lines {
+        let arm = self.as_target_feature_arg_arm().split(",");
+        let aarch64 = self.as_target_feature_arg_aarch64().split(",");
+
+        // Combine common features into an unconditional `target_feature` annotation, but guard
+        // others behind `cfg_attr`.
+        // TODO: It's much simpler to emit separate, guarded attributes for each architecture (as
+        // for `simd_test`). However, this has an unfortunate impact on documentation, since
+        // rustdoc can't currently look inside `cfg_attr` (stdarch/issues/1268).
+        let mut aarch64: Vec<_> = aarch64.collect();
+        let (both, arm): (Vec<_>, Vec<_>) = arm.partition(|v| aarch64.contains(v));
+        aarch64.retain(|v| !both.contains(v));
+        let mut lines = Vec::new();
+        if !both.is_empty() {
+            lines.push(Self::attr("target_feature", both.join(",")));
+        };
+        if !arm.is_empty() {
+            lines.push(Self::attr_for_arch("arm", "target_feature", arm.join(",")));
+        }
+        if !aarch64.is_empty() {
+            lines.push(Self::attr_for_arch(
+                "aarch64",
+                "target_feature",
+                aarch64.join(","),
+            ));
+        }
+        lines.into()
+    }
+
+    /// Generate a target_feature attribute for a function compiled only for "aarch64".
+    fn to_target_feature_attr_aarch64(&self) -> Lines {
+        Lines::single(Self::attr(
+            "target_feature",
+            self.as_target_feature_arg_aarch64(),
+        ))
+    }
+
+    /// Generate a target_feature attribute for a function compiled only for "arm".
+    fn to_target_feature_attr_arm(&self) -> Lines {
+        Lines::single(Self::attr(
+            "target_feature",
+            self.as_target_feature_arg_arm(),
+        ))
+    }
+
+    /// Generate simd_test attributes for a test that will compile for both "arm" and "aarch64".
+    fn to_simd_test_attr_shared(&self) -> Lines {
+        let arm = self.as_simd_test_arg_arm();
+        let aarch64 = self.as_simd_test_arg_aarch64();
+        if arm == aarch64 {
+            Lines::single(Self::attr("simd_test", arm))
+        } else {
+            vec![
+                Self::attr_for_arch("arm", "simd_test", arm),
+                Self::attr_for_arch("aarch64", "simd_test", aarch64),
+            ]
+            .into()
+        }
+    }
+
+    /// Generate a simd_test attribute for a test that will compile only for "aarch64".
+    fn to_simd_test_attr_aarch64(&self) -> Lines {
+        Lines::single(Self::attr("simd_test", self.as_simd_test_arg_aarch64()))
+    }
+}
+
+/// Complete lines of generated source.
+///
+/// This enables common generation tasks to be factored out without precluding basic
+/// context-specific formatting.
+///
+/// The convention in this generator is to prefix (not suffix) lines with a newline, so the
+/// implementation of `std::fmt::Display` behaves in the same way.
+struct Lines {
+    indent: usize,
+    lines: Vec<String>,
+}
+
+impl Lines {
+    fn indented(self, indent: usize) -> Self {
+        Self {
+            indent: indent + self.indent,
+            ..self
+        }
+    }
+
+    fn single(line: String) -> Self {
+        Self::from(vec![line])
+    }
+}
+
+impl From<Vec<String>> for Lines {
+    fn from(lines: Vec<String>) -> Self {
+        Self { indent: 0, lines }
+    }
+}
+
+impl std::fmt::Display for Lines {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> fmt::Result {
+        for line in self.lines.iter() {
+            write!(f, "\n{:width$}{line}", "", width = self.indent)?;
+        }
+        Ok(())
+    }
+}
+
 #[derive(Clone, Copy)]
 enum Fntype {
     Normal,
@@ -1106,20 +1300,6 @@ fn gen_aarch64(
         Rot => type_to_rot_suffix(current_name, type_to_suffix(out_t)),
         RotLane => type_to_rot_suffix(current_name, &type_to_lane_suffixes(out_t, in_t[2], false)),
     };
-    let current_target = match target {
-        Default => "neon",
-        ArmV7 => "neon",
-        Vfp4 => "neon",
-        FPArmV8 => "neon",
-        AES => "neon,aes",
-        FCMA => "neon,fcma",
-        Dotprod => "neon,dotprod",
-        I8MM => "neon,i8mm",
-        SHA3 => "neon,sha3",
-        RDM => "rdm",
-        SM4 => "neon,sm4",
-        FTTS => "neon,frintts",
-    };
     let current_fn = if let Some(current_fn) = current_fn.clone() {
         if link_aarch64.is_some() {
             panic!("[{name}] Can't specify link and fn at the same time.")
@@ -1267,7 +1447,6 @@ fn gen_aarch64(
             calls.push_str(&get_call(
                 &multi_fn[i],
                 current_name,
-                &const_declare,
                 in_t,
                 out_t,
                 fixed,
@@ -1415,33 +1594,18 @@ fn gen_aarch64(
         RDM => String::from("\n#[stable(feature = \"rdm_intrinsics\", since = \"1.62.0\")]"),
         _ => String::new(),
     };
-    let function_doc = create_doc_string(current_comment, &name);
     let function = format!(
         r#"
-{}
-#[inline]
-#[target_feature(enable = "{}")]
-#[cfg_attr(test, assert_instr({}{}))]{}{}
-{}{{
-    {}
+{function_doc}
+#[inline]{target_feature}
+#[cfg_attr(test, assert_instr({current_aarch64}{const_assert}))]{const_legacy}{stable}
+{fn_decl}{{
+    {call_params}
 }}
 "#,
-        function_doc,
-        current_target,
-        current_aarch64,
-        const_assert,
-        const_legacy,
-        stable,
-        fn_decl,
-        call_params
+        function_doc = create_doc_string(current_comment, &name),
+        target_feature = target.to_target_feature_attr_aarch64()
     );
-    let test_target = match target {
-        I8MM => "neon,i8mm",
-        SM4 => "neon,sm4",
-        SHA3 => "neon,sha3",
-        FTTS => "neon,frintts",
-        _ => "neon",
-    };
     let test = match fn_type {
         Fntype::Normal => gen_test(
             &name,
@@ -1451,7 +1615,7 @@ fn gen_aarch64(
             [type_len(in_t[0]), type_len(in_t[1]), type_len(in_t[2])],
             type_len(out_t),
             para_num,
-            test_target,
+            target.to_simd_test_attr_aarch64(),
         ),
         Fntype::Load => gen_load_test(&name, in_t, &out_t, current_tests, type_len(out_t)),
         Fntype::Store => gen_store_test(&name, in_t, &out_t, current_tests, type_len(in_t[1])),
@@ -1473,10 +1637,9 @@ fn gen_load_test(
     type_len: usize,
 ) -> String {
     let mut test = format!(
-        r#"
-    #[simd_test(enable = "neon")]
-    unsafe fn test_{}() {{"#,
-        name,
+        r#"{simd_test}
+    unsafe fn test_{name}() {{"#,
+        simd_test = Default.to_simd_test_attr_shared().indented(4)
     );
     for (a, b, _, n, e) in current_tests {
         let a: Vec<String> = a.iter().take(type_len + 1).cloned().collect();
@@ -1571,10 +1734,9 @@ fn gen_store_test(
     type_len: usize,
 ) -> String {
     let mut test = format!(
-        r#"
-    #[simd_test(enable = "neon")]
-    unsafe fn test_{}() {{"#,
-        name,
+        r#"{simd_test}
+    unsafe fn test_{name}() {{"#,
+        simd_test = Default.to_simd_test_attr_shared().indented(4)
     );
     for (a, _, _, constn, e) in current_tests {
         let a: Vec<String> = a.iter().take(type_len + 1).cloned().collect();
@@ -1639,14 +1801,10 @@ fn gen_test(
     len_in: [usize; 3],
     len_out: usize,
     para_num: i32,
-    target: &str,
+    attributes: Lines,
 ) -> String {
-    let mut test = format!(
-        r#"
-    #[simd_test(enable = "{}")]
-    unsafe fn test_{}() {{"#,
-        target, name,
-    );
+    let mut test = attributes.indented(4).to_string();
+    test.push_str(&format!("\n    unsafe fn test_{name}() {{"));
     for (a, b, c, n, e) in current_tests {
         let a: Vec<String> = a.iter().take(len_in[0]).cloned().collect();
         let b: Vec<String> = b.iter().take(len_in[1]).cloned().collect();
@@ -1833,34 +1991,6 @@ fn gen_arm(
     let current_aarch64 = current_aarch64
         .clone()
         .unwrap_or_else(|| current_arm.to_string());
-    let current_target_aarch64 = match target {
-        Default => "neon",
-        ArmV7 => "neon",
-        Vfp4 => "neon",
-        FPArmV8 => "neon",
-        AES => "neon,aes",
-        FCMA => "neon,fcma",
-        Dotprod => "neon,dotprod",
-        I8MM => "neon,i8mm",
-        SHA3 => "neon,sha3",
-        RDM => "rdm",
-        SM4 => "neon,sm4",
-        FTTS => "neon,frintts",
-    };
-    let current_target_arm = match target {
-        Default => "v7",
-        ArmV7 => "v7",
-        Vfp4 => "vfp4",
-        FPArmV8 => "fp-armv8,v8",
-        AES => "aes,v8",
-        FCMA => "v8",    // v8.3a
-        Dotprod => "v8", // v8.2a
-        I8MM => "v8,i8mm",
-        RDM => unreachable!(),
-        SM4 => unreachable!(),
-        SHA3 => unreachable!(),
-        FTTS => unreachable!(),
-    };
     let current_fn = if let Some(current_fn) = current_fn.clone() {
         if link_aarch64.is_some() || link_arm.is_some() {
             panic!(
@@ -2182,7 +2312,6 @@ fn gen_arm(
             calls.push_str(&get_call(
                 &multi_fn[i],
                 current_name,
-                &const_declare,
                 in_t,
                 out_t,
                 fixed,
@@ -2378,33 +2507,22 @@ fn gen_arm(
         let function_doc = create_doc_string(current_comment, &name);
         format!(
             r#"
-{}
+{function_doc}
 #[inline]
-#[cfg(target_arch = "arm")]
-#[target_feature(enable = "neon,{}")]
-#[cfg_attr(test, assert_instr({}{}))]{}
-{}
+#[cfg(target_arch = "arm")]{target_feature_arm}
+#[cfg_attr(test, assert_instr({assert_arm}{const_assert}))]{const_legacy}
+{call_arm}
 
-{}
+{function_doc}
 #[inline]
-#[cfg(target_arch = "aarch64")]
-#[target_feature(enable = "{}")]
-#[cfg_attr(test, assert_instr({}{}))]{}{}
-{}
+#[cfg(target_arch = "aarch64")]{target_feature_aarch64}
+#[cfg_attr(test, assert_instr({assert_aarch64}{const_assert}))]{const_legacy}{stable_aarch64}
+{call_aarch64}
 "#,
-            function_doc,
-            current_target_arm,
-            expand_intrinsic(&current_arm, in_t[1]),
-            const_assert,
-            const_legacy,
-            call_arm,
-            function_doc,
-            current_target_aarch64,
-            expand_intrinsic(&current_aarch64, in_t[1]),
-            const_assert,
-            const_legacy,
-            stable_aarch64,
-            call_aarch64,
+            target_feature_arm = target.to_target_feature_attr_arm(),
+            target_feature_aarch64 = target.to_target_feature_attr_aarch64(),
+            assert_arm = expand_intrinsic(&current_arm, in_t[1]),
+            assert_aarch64 = expand_intrinsic(&current_aarch64, in_t[1]),
         )
     } else {
         let call = {
@@ -2444,36 +2562,20 @@ fn gen_arm(
             RDM => String::from("\n#[cfg_attr(not(target_arch = \"arm\"), stable(feature = \"rdm_intrinsics\", since = \"1.62.0\"))]"),
             _ => String::new(),
         };
-        let function_doc = create_doc_string(current_comment, &name);
         format!(
             r#"
-{}
-#[inline]
-#[target_feature(enable = "{}")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "{}"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr({}{}))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr({}{}))]{}{}
-{}
+{function_doc}
+#[inline]{target_feature}
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr({assert_arm}{const_assert}))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr({assert_aarch64}{const_assert}))]{const_legacy}{stable_aarch64}
+{call}
 "#,
-            function_doc,
-            current_target_aarch64,
-            current_target_arm,
-            expand_intrinsic(&current_arm, in_t[1]),
-            const_assert,
-            expand_intrinsic(&current_aarch64, in_t[1]),
-            const_assert,
-            const_legacy,
-            stable_aarch64,
-            call,
+            function_doc = create_doc_string(current_comment, &name),
+            assert_arm = expand_intrinsic(&current_arm, in_t[1]),
+            assert_aarch64 = expand_intrinsic(&current_aarch64, in_t[1]),
+            target_feature = target.to_target_feature_attr_shared(),
         )
     };
-    let test_target = match target {
-        I8MM => "neon,i8mm",
-        SM4 => "neon,sm4",
-        SHA3 => "neon,sha3",
-        FTTS => "neon,frintts",
-        _ => "neon",
-    };
     let test = match fn_type {
         Fntype::Normal => gen_test(
             &name,
@@ -2483,7 +2585,7 @@ fn gen_arm(
             [type_len(in_t[0]), type_len(in_t[1]), type_len(in_t[2])],
             type_len(out_t),
             para_num,
-            test_target,
+            target.to_simd_test_attr_shared(),
         ),
         Fntype::Load => gen_load_test(&name, in_t, &out_t, current_tests, type_len(out_t)),
         Fntype::Store => gen_store_test(&name, in_t, &out_t, current_tests, type_len(in_t[1])),
@@ -2603,7 +2705,6 @@ fn expand_intrinsic(intr: &str, t: &str) -> String {
 fn get_call(
     in_str: &str,
     current_name: &str,
-    const_declare: &str,
     in_t: &[&str; 3],
     out_t: &str,
     fixed: &Vec<String>,
@@ -2643,7 +2744,7 @@ fn get_call(
             "halflen" => type_len(in_t[1]) / 2,
             _ => 0,
         };
-        let mut s = format!("{const_declare} [");
+        let mut s = format!("[");
         for i in 0..len {
             if i != 0 {
                 s.push_str(", ");
@@ -2674,7 +2775,7 @@ fn get_call(
     if fn_name.starts_with("base") {
         let fn_format: Vec<_> = fn_name.split('-').map(|v| v.to_string()).collect();
         assert_eq!(fn_format.len(), 3);
-        let mut s = format!("<const {}: i32> [", &fn_format[2]);
+        let mut s = format!("[");
         let base_len = fn_format[1].parse::<usize>().unwrap();
         for i in 0..type_len(in_t[1]) / base_len {
             for j in 0..base_len {
@@ -2714,7 +2815,7 @@ fn get_call(
             "in0_len" => type_len(in_t[0]),
             _ => 0,
         };
-        let mut s = format!("{const_declare} [");
+        let mut s = format!("[");
         for i in 0..len {
             if i != 0 {
                 s.push_str(", ");
@@ -2743,12 +2844,9 @@ fn get_call(
             _ => 0,
         };
         if len == 0 {
-            return format!(
-                r#"static_assert!({} : i32 where {} == 0);"#,
-                fn_format[2], fn_format[2]
-            );
+            return format!(r#"static_assert!({} == 0);"#, fn_format[2]);
         } else {
-            return format!(r#"static_assert_imm{len}!({});"#, fn_format[2]);
+            return format!(r#"static_assert_uimm_bits!({}, {len});"#, fn_format[2]);
         }
     }
     if fn_name.starts_with("static_assert") {
@@ -2768,14 +2866,11 @@ fn get_call(
             fn_format[3].clone()
         };
         if lim1 == lim2 {
-            return format!(
-                r#"static_assert!({} : i32 where {} == {lim1});"#,
-                fn_format[1], fn_format[1]
-            );
+            return format!(r#"static_assert!({} == {lim1});"#, fn_format[1]);
         } else {
             return format!(
-                r#"static_assert!({} : i32 where {} >= {lim1} && {} <= {lim2});"#,
-                fn_format[1], fn_format[1], fn_format[1]
+                r#"static_assert!({} >= {lim1} && {} <= {lim2});"#,
+                fn_format[1], fn_format[1]
             );
         }
     }
@@ -2824,7 +2919,6 @@ fn get_call(
                 get_call(
                     &sub_call,
                     current_name,
-                    const_declare,
                     in_t,
                     out_t,
                     fixed,
@@ -2873,7 +2967,6 @@ fn get_call(
             let sub_call = get_call(
                 &sub_fn[1..sub_fn.len() - 1],
                 current_name,
-                const_declare,
                 in_t,
                 out_t,
                 fixed,