path: root/library/stdarch/crates/stdarch-gen
Diffstat (limited to 'library/stdarch/crates/stdarch-gen'):
 library/stdarch/crates/stdarch-gen/neon.spec   | 343 ++++++++++++++++++++++---
 library/stdarch/crates/stdarch-gen/src/main.rs |  21 +-
 2 files changed, 325 insertions(+), 39 deletions(-)
diff --git a/library/stdarch/crates/stdarch-gen/neon.spec b/library/stdarch/crates/stdarch-gen/neon.spec
index 06090e669..760fa2204 100644
--- a/library/stdarch/crates/stdarch-gen/neon.spec
+++ b/library/stdarch/crates/stdarch-gen/neon.spec
@@ -3478,27 +3478,138 @@ link-arm = vst4lane._EXTpi8r_
const-arm = LANE
generate *mut f32:float32x2x4_t:void, *mut f32:float32x4x4_t:void
+/// Dot product vector form with unsigned and signed integers
+name = vusdot
+out-suffix
+a = 1000, -4200, -1000, 2000
+b = 100, 205, 110, 195, 120, 185, 130, 175, 140, 165, 150, 155, 160, 145, 170, 135
+c = 0, 1, 2, 3, -1, -2, -3, -4, 4, 5, 6, 7, -5, -6, -7, -8
+aarch64 = usdot
+arm = vusdot
+target = i8mm
+
+// 1000 + (100, 205, 110, 195) . ( 0, 1, 2, 3)
+// -4200 + (120, 185, 130, 175) . (-1, -2, -3, -4)
+// ...
+validate 2010, -5780, 2370, -1940
+
+link-arm = usdot._EXT2_._EXT4_:int32x2_t:uint8x8_t:int8x8_t:int32x2_t
+link-aarch64 = usdot._EXT2_._EXT4_:int32x2_t:uint8x8_t:int8x8_t:int32x2_t
+generate int32x2_t:uint8x8_t:int8x8_t:int32x2_t
+
+link-arm = usdot._EXT2_._EXT4_:int32x4_t:uint8x16_t:int8x16_t:int32x4_t
+link-aarch64 = usdot._EXT2_._EXT4_:int32x4_t:uint8x16_t:int8x16_t:int32x4_t
+generate int32x4_t:uint8x16_t:int8x16_t:int32x4_t
+
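[For reference, a minimal scalar model of what the generated vusdot_s32 should compute, using the int32x2_t signature from the first generate line above; a sketch for checking the validate values, not the real intrinsic:]

    // Scalar model of vusdot_s32: each 32-bit lane of `a` accumulates the
    // dot product of four unsigned bytes of `b` with four signed bytes of
    // `c`, all widened to i32 first.
    fn vusdot_s32_model(a: [i32; 2], b: [u8; 8], c: [i8; 8]) -> [i32; 2] {
        let mut out = a;
        for i in 0..2 {
            for j in 0..4 {
                out[i] += b[4 * i + j] as i32 * c[4 * i + j] as i32;
            }
        }
        out
    }

[With the test inputs above this gives 2010 and -5780 for the first two lanes, matching the validate line.]
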
+/// Dot product index form with unsigned and signed integers
+name = vusdot
+out-lane-suffixes
+constn = LANE
+aarch64 = usdot
+arm = vusdot
+target = i8mm
+multi_fn = static_assert_imm-in2_dot-LANE
+multi_fn = transmute, c:merge4_t2, c
+multi_fn = simd_shuffle!, c:out_signed, c, c, {dup-out_len-LANE as u32}
+multi_fn = vusdot-out-noext, a, b, {transmute, c}
+a = 1000, -4200, -1000, 2000
+b = 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250
+c = 4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11
+
+// 1000 + (100, 110, 120, 130) . (4, 3, 2, 1)
+// -4200 + (140, 150, 160, 170) . (4, 3, 2, 1)
+// ...
+n = 0
+validate 2100, -2700, 900, 4300
+
+// 1000 + (100, 110, 120, 130) . (0, -1, -2, -3)
+// -4200 + (140, 150, 160, 170) . (0, -1, -2, -3)
+// ...
+n = 1
+validate 260, -5180, -2220, 540
+
+generate int32x2_t:uint8x8_t:int8x8_t:int32x2_t
+generate int32x4_t:uint8x16_t:int8x8_t:int32x4_t
+
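[The lane form is built from the vector form: merge4_t2 reinterprets c as lanes of four bytes, the simd_shuffle! broadcasts lane LANE to every position, and the transmute turns the result back into bytes for vusdot. A self-contained sketch of the same selection for the non-q c (hypothetical helper, mirroring the multi_fn steps):]

    // Model of the multi_fn chain for vusdot_lane_s32: pick the 4-byte
    // group LANE of `c`, splat it across the vector, then dot as before.
    fn vusdot_lane_s32_model<const LANE: usize>(
        a: [i32; 2],
        b: [u8; 8],
        c: [i8; 8],
    ) -> [i32; 2] {
        // merge4_t2 + simd_shuffle!: select group LANE (LANE in 0..2 here).
        let lane: [i8; 4] = c[4 * LANE..4 * LANE + 4].try_into().unwrap();
        let mut out = a;
        for i in 0..2 {
            for j in 0..4 {
                // vusdot-out-noext: the vector-form accumulation.
                out[i] += b[4 * i + j] as i32 * lane[j] as i32;
            }
        }
        out
    }
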
+/// Dot product index form with unsigned and signed integers
+name = vusdot
+out-lane-suffixes
+constn = LANE
+// Only AArch64 has the laneq forms.
+aarch64 = usdot
+target = i8mm
+multi_fn = static_assert_imm-in2_dot-LANE
+multi_fn = transmute, c:merge4_t2, c
+multi_fn = simd_shuffle!, c:out_signed, c, c, {dup-out_len-LANE as u32}
+multi_fn = vusdot-out-noext, a, b, {transmute, c}
+a = 1000, -4200, -1000, 2000
+b = 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250
+c = 4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11
+
+// 1000 + (100, 110, 120, 130) . (-8, -9, -10, -11)
+// -4200 + (140, 150, 160, 170) . (-8, -9, -10, -11)
+// ...
+n = 3
+validate -3420, -10140, -8460, -6980
+
+generate int32x2_t:uint8x8_t:int8x16_t:int32x2_t
+generate int32x4_t:uint8x16_t:int8x16_t:int32x4_t
+
/// Dot product index form with signed and unsigned integers
name = vsudot
out-lane-suffixes
constn = LANE
+aarch64 = sudot
+arm = vsudot
+target = i8mm
+
multi_fn = static_assert_imm-in2_dot-LANE
-multi_fn = simd_shuffle!, c:unsigned, c, c, {base-4-LANE}
-multi_fn = vsudot-outlane-_, a, b, c
-a = 1, 2, 1, 2
-b = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8
-c = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8
+multi_fn = transmute, c:merge4_t2, c
+multi_fn = simd_shuffle!, c:out_unsigned, c, c, {dup-out_len-LANE as u32}
+multi_fn = vusdot-out-noext, a, {transmute, c}, b
+a = -2000, 4200, -1000, 2000
+b = 4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11
+c = 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250
+
+// -2000 + (4, 3, 2, 1) . (100, 110, 120, 130)
+// 4200 + (0, -1, -2, -3) . (100, 110, 120, 130)
+// ...
n = 0
-validate 31, 72, 31, 72
-target = dotprod
+validate -900, 3460, -3580, -2420
+
+// -2000 + (4, 3, 2, 1) . (140, 150, 160, 170)
+// 4200 + (0, -1, -2, -3) . (140, 150, 160, 170)
+// ...
+n = 1
+validate -500, 3220, -4460, -3940
+
+generate int32x2_t:int8x8_t:uint8x8_t:int32x2_t
+generate int32x4_t:int8x16_t:uint8x8_t:int32x4_t
+
+/// Dot product index form with signed and unsigned integers
+name = vsudot
+out-lane-suffixes
+constn = LANE
+// Only AArch64 has the laneq forms.
aarch64 = sudot
-link-aarch64 = usdot._EXT2_._EXT4_:int32x2_t:int8x8_t:uint8x8_t:int32x2_t
-// LLVM ERROR: Cannot select: intrinsic %llvm.aarch64.neon.usdot
-//generate int32x2_t:int8x8_t:uint8x8_t:int32x2_t, int32x2_t:int8x8_t:uint8x16_t:int32x2_t
-link-aarch64 = usdot._EXT2_._EXT4_:int32x4_t:int8x16_t:uint8x16_t:int32x4_t
-// LLVM ERROR: Cannot select: intrinsic %llvm.aarch64.neon.usdot
-//generate int32x4_t:int8x16_t:uint8x8_t:int32x4_t, int32x4_t:int8x16_t:uint8x16_t:int32x4_t
+target = i8mm
+
+multi_fn = static_assert_imm-in2_dot-LANE
+multi_fn = transmute, c:merge4_t2, c
+multi_fn = simd_shuffle!, c:out_unsigned, c, c, {dup-out_len-LANE as u32}
+multi_fn = vusdot-out-noext, a, {transmute, c}, b
+a = -2000, 4200, -1000, 2000
+b = 4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11
+c = 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250
+
+// -2000 + (4, 3, 2, 1) . (220, 230, 240, 250)
+// 4200 + (0, -1, -2, -3) . (220, 230, 240, 250)
+// ...
+n = 3
+validate 300, 2740, -6220, -6980
+
+generate int32x2_t:int8x8_t:uint8x16_t:int32x2_t
+generate int32x4_t:int8x16_t:uint8x16_t:int32x4_t
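[Note how both vsudot blocks lower to vusdot with the last two operands swapped (multi_fn = vusdot-out-noext, a, {transmute, c}, b): each u8-by-i8 product is commutative, so no separate sudot intrinsic link is needed. A sketch of the identity, as a hypothetical scalar model:]

    // sudot(a, b: i8, c: u8) == usdot(a, c: u8, b: i8) lane-by-lane, since
    // (b as i32) * (c as i32) does not care about operand order.
    fn vsudot_lane_s32_model<const LANE: usize>(
        a: [i32; 2],
        b: [i8; 8],
        c: [u8; 8],
    ) -> [i32; 2] {
        let lane: [u8; 4] = c[4 * LANE..4 * LANE + 4].try_into().unwrap();
        let mut out = a;
        for i in 0..2 {
            for j in 0..4 {
                out[i] += b[4 * i + j] as i32 * lane[j] as i32;
            }
        }
        out
    }
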
/// Multiply
name = vmul
@@ -4612,7 +4723,7 @@ aarch64 = fcmla
generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t
generate float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t
-/// Dot product arithmetic
+/// Dot product arithmetic (vector)
name = vdot
out-suffix
a = 1, 2, 1, 2
@@ -4621,35 +4732,65 @@ c = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8
validate 31, 176, 31, 176
target = dotprod
+arm = vsdot
aarch64 = sdot
+link-arm = sdot._EXT_._EXT3_
link-aarch64 = sdot._EXT_._EXT3_
generate int32x2_t:int8x8_t:int8x8_t:int32x2_t, int32x4_t:int8x16_t:int8x16_t:int32x4_t
+arm = vudot
aarch64 = udot
+link-arm = udot._EXT_._EXT3_
link-aarch64 = udot._EXT_._EXT3_
generate uint32x2_t:uint8x8_t:uint8x8_t:uint32x2_t, uint32x4_t:uint8x16_t:uint8x16_t:uint32x4_t
-/// Dot product arithmetic
+/// Dot product arithmetic (indexed)
name = vdot
out-lane-suffixes
constn = LANE
multi_fn = static_assert_imm-in2_dot-LANE
-multi_fn = simd_shuffle!, c:in_t, c, c, {base-4-LANE}
-multi_fn = vdot-out-noext, a, b, c
+multi_fn = transmute, c:merge4_t2, c
+multi_fn = simd_shuffle!, c:out_t, c, c, {dup-out_len-LANE as u32}
+multi_fn = vdot-out-noext, a, b, {transmute, c}
a = 1, 2, 1, 2
-b = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8
+b = -1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8
c = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8
n = 0
-validate 31, 72, 31, 72
+validate 29, 72, 31, 72
target = dotprod
+// Only AArch64 has the laneq forms.
aarch64 = sdot
-generate int32x2_t:int8x8_t:int8x8_t:int32x2_t, int32x2_t:int8x8_t:int8x16_t:int32x2_t
-generate int32x4_t:int8x16_t:int8x8_t:int32x4_t, int32x4_t:int8x16_t:int8x16_t:int32x4_t
+generate int32x2_t:int8x8_t:int8x16_t:int32x2_t
+generate int32x4_t:int8x16_t:int8x16_t:int32x4_t
+
+arm = vsdot
+generate int32x2_t:int8x8_t:int8x8_t:int32x2_t
+generate int32x4_t:int8x16_t:int8x8_t:int32x4_t
+
+/// Dot product arithmetic (indexed)
+name = vdot
+out-lane-suffixes
+constn = LANE
+multi_fn = static_assert_imm-in2_dot-LANE
+multi_fn = transmute, c:merge4_t2, c
+multi_fn = simd_shuffle!, c:out_t, c, c, {dup-out_len-LANE as u32}
+multi_fn = vdot-out-noext, a, b, {transmute, c}
+a = 1, 2, 1, 2
+b = 255, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8
+c = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8
+n = 0
+validate 285, 72, 31, 72
+target = dotprod
+// Only AArch64 has the laneq forms.
aarch64 = udot
-generate uint32x2_t:uint8x8_t:uint8x8_t:uint32x2_t, uint32x2_t:uint8x8_t:uint8x16_t:uint32x2_t
-generate uint32x4_t:uint8x16_t:uint8x8_t:uint32x4_t, uint32x4_t:uint8x16_t:uint8x16_t:uint32x4_t
+generate uint32x2_t:uint8x8_t:uint8x16_t:uint32x2_t
+generate uint32x4_t:uint8x16_t:uint8x16_t:uint32x4_t
+
+arm = vudot
+generate uint32x2_t:uint8x8_t:uint8x8_t:uint32x2_t
+generate uint32x4_t:uint8x16_t:uint8x8_t:uint32x4_t
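[The new test vectors make signedness observable: with b[0] = -1 the signed form must return 29 in lane 0, and with b[0] = 255 the unsigned form must return 285, so an accidental sign mix-up in the generated code now fails validation. A quick check of the lane-0 arithmetic:]

    // Lane 0 of the indexed dot product with n = 0 and c = (1, 2, 3, 4):
    //   signed:   1 + (-1)*1 + 2*2 + 3*3 + 4*4 = 29
    //   unsigned: 1 + 255*1 + 2*2 + 3*3 + 4*4  = 285
    fn dot4(acc: i32, b: [i32; 4], c: [i32; 4]) -> i32 {
        acc + b.iter().zip(c).map(|(x, y)| x * y).sum::<i32>()
    }

    fn main() {
        assert_eq!(dot4(1, [-1, 2, 3, 4], [1, 2, 3, 4]), 29);
        assert_eq!(dot4(1, [255, 2, 3, 4], [1, 2, 3, 4]), 285);
    }
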
/// Maximum (vector)
name = vmax
@@ -6511,7 +6652,7 @@ name = vrshr
n-suffix
constn = N
multi_fn = static_assert-N-1-bits
-multi_fn = vrshl-self-noext, a, {vdup-nself-noext, (-N) as _}
+multi_fn = vrshl-self-noext, a, {vdup-nself-noext, -N as _}
a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
n = 2
validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
@@ -6538,7 +6679,7 @@ name = vrshr
n-suffix
constn = N
multi_fn = static_assert-N-1-bits
-multi_fn = vrshl-self-noext, a, {vdup-nsigned-noext, (-N) as _}
+multi_fn = vrshl-self-noext, a, {vdup-nsigned-noext, -N as _}
a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
n = 2
validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
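[The parenthesis change in the two vrshr hunks above is cosmetic, but the precedence is worth spelling out: `as` binds tighter than unary minus, so `-N as _` parses as `-(N as _)`. Both spellings agree here because the inferred element type is signed (nself on the signed variant, nsigned on the unsigned one) and legal shift counts are small. A sketch:]

    // `as` has higher precedence than unary `-`, so `-N as i8` parses as
    // `-(N as i8)`. For any shift count N in 1..=bits this equals
    // `(-N) as i8`, so dropping the parentheses changes nothing.
    const N: i32 = 2;

    fn main() {
        assert_eq!(-N as i8, (-N) as i8);
    }
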
@@ -6650,10 +6791,10 @@ b = 4
n = 2
validate 2
-aarch64 = srsra
+aarch64 = srshr
generate i64
-/// Ungisned rounding shift right and accumulate.
+/// Unsigned rounding shift right and accumulate.
name = vrsra
n-suffix
constn = N
@@ -6665,7 +6806,7 @@ b = 4
n = 2
validate 2
-aarch64 = ursra
+aarch64 = urshr
generate u64
/// Rounding subtract returning high narrow
@@ -7071,44 +7212,170 @@ generate uint64x2_t
/// Floating-point round to 32-bit integer, using current rounding mode
name = vrnd32x
-a = 1.1, 1.9, -1.7, -2.3
-validate 1.0, 2.0, -2.0, -2.0
target = frintts
+// For validation, the rounding mode should be the default: round-to-nearest (ties-to-even).
+a = -1.5, 2.9, 1.5, -2.5
+validate -2.0, 3.0, 2.0, -2.0
+
aarch64 = frint32x
link-aarch64 = frint32x._EXT_
generate float32x2_t, float32x4_t
+// The float64x1_t form uses a different LLVM link and isn't supported by Clang
+// (and so has no intrinsic-test), so perform extra validation to make sure
+// that it matches the float64x2_t form.
+
+a = 1.5, -2.5
+validate 2.0, -2.0
+// - The biggest f64 that rounds to i32::MAX.
+// - The smallest positive f64 that rounds out of range.
+a = 2147483647.499999762, 2147483647.5
+validate 2147483647.0, -2147483648.0
+// - The smallest f64 that rounds to i32::MIN + 1.
+// - The largest negative f64 that rounds out of range.
+a = -2147483647.499999762, -2147483648.500000477
+validate -2147483647.0, -2147483648.0
+generate float64x2_t
+
+// Odd-numbered tests for float64x1_t coverage.
+a = 2.9
+validate 3.0
+a = -2.5
+validate -2.0
+a = 2147483647.5
+validate -2147483648.0
+a = -2147483648.500000477
+validate -2147483648.0
+
+multi_fn = transmute, {self-out-_, {simd_extract, a, 0}}
+link-aarch64 = llvm.aarch64.frint32x.f64:f64:::f64
+generate float64x1_t
+
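[The edge-case tests pin down two aspects of FRINT32X at once: ties round to even under the default rounding mode, and anything that would land outside i32's range becomes the "most negative" value -2^31, whatever its sign. A model of the expected behaviour (a sketch assuming f64::round_ties_even, stable since Rust 1.77; not the intrinsic itself):]

    // FRINT32X model: round to nearest (ties to even) in the default
    // rounding mode; out-of-range results collapse to i32::MIN as f64.
    fn frint32x_model(x: f64) -> f64 {
        let r = x.round_ties_even();
        if r < i32::MIN as f64 || r > i32::MAX as f64 {
            i32::MIN as f64
        } else {
            r
        }
    }

    fn main() {
        assert_eq!(frint32x_model(-1.5), -2.0); // tie, rounds to even
        assert_eq!(frint32x_model(2147483647.5), -2147483648.0); // overflow
    }
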
/// Floating-point round to 32-bit integer toward zero
name = vrnd32z
-a = 1.1, 1.9, -1.7, -2.3
-validate 1.0, 1.0, -1.0, -2.0
target = frintts
+a = -1.5, 2.9, 1.5, -2.5
+validate -1.0, 2.0, 1.0, -2.0
+
aarch64 = frint32z
link-aarch64 = frint32z._EXT_
generate float32x2_t, float32x4_t
+// The float64x1_t form uses a different LLVM link and isn't supported by Clang
+// (and so has no intrinsic-test), so perform extra validation to make sure
+// that it matches the float64x2_t form.
+
+a = 1.5, -2.5
+validate 1.0, -2.0
+// - The biggest f64 that rounds to i32::MAX.
+// - The smallest positive f64 that rounds out of range.
+a = 2147483647.999999762, 2147483648.0
+validate 2147483647.0, -2147483648.0
+// - The smallest f64 that rounds to i32::MIN + 1.
+// - The largest negative f64 that rounds out of range.
+a = -2147483647.999999762, -2147483649.0
+validate -2147483647.0, -2147483648.0
+generate float64x2_t
+
+// Odd-numbered tests for float64x1_t coverage.
+a = 2.9
+validate 2.0
+a = -2.5
+validate -2.0
+a = 2147483648.0
+validate -2147483648.0
+a = -2147483649.0
+validate -2147483648.0
+
+multi_fn = transmute, {self-out-_, {simd_extract, a, 0}}
+link-aarch64 = llvm.aarch64.frint32z.f64:f64:::f64
+generate float64x1_t
+
/// Floating-point round to 64-bit integer, using current rounding mode
name = vrnd64x
-a = 1.1, 1.9, -1.7, -2.3
-validate 1.0, 2.0, -2.0, -2.0
target = frintts
+// For validation, the rounding mode should be the default: round-to-nearest (ties-to-even).
+a = -1.5, 2.9, 1.5, -2.5
+validate -2.0, 3.0, 2.0, -2.0
+
aarch64 = frint64x
link-aarch64 = frint64x._EXT_
generate float32x2_t, float32x4_t
+// The float64x1_t form uses a different LLVM link and isn't supported by Clang
+// (and so has no intrinsic-test), so perform extra validation to make sure
+// that it matches the float64x2_t form.
+
+a = 1.5, -2.5
+validate 2.0, -2.0
+// - The biggest f64 representable as an i64 (0x7ffffffffffffc00).
+// - The smallest positive f64 that is out of range (2^63).
+a = 9223372036854774784.0, 9223372036854775808.0
+validate 9223372036854774784.0, -9223372036854775808.0
+// - The smallest f64 representable as an i64 (i64::MIN).
+// - The biggest negative f64 that is out of range.
+a = -9223372036854775808.0, -9223372036854777856.0
+validate -9223372036854775808.0, -9223372036854775808.0
+generate float64x2_t
+
+// Odd-numbered tests for float64x1_t coverage.
+a = 2.9
+validate 3.0
+a = -2.5
+validate -2.0
+a = 9223372036854775808.0
+validate -9223372036854775808.0
+a = -9223372036854777856.0
+validate -9223372036854775808.0
+
+multi_fn = transmute, {self-out-_, {simd_extract, a, 0}}
+link-aarch64 = llvm.aarch64.frint64x.f64:f64:::f64
+generate float64x1_t
+
/// Floating-point round to 64-bit integer toward zero
name = vrnd64z
-a = 1.1, 1.9, -1.7, -2.3
-validate 1.0, 1.0, -1.0, -2.0
target = frintts
+a = -1.5, 2.9, 1.5, -2.5
+validate -1.0, 2.0, 1.0, -2.0
+
aarch64 = frint64z
link-aarch64 = frint64z._EXT_
generate float32x2_t, float32x4_t
+// The float64x1_t form uses a different LLVM link and isn't supported by Clang
+// (and so has no intrinsic-test), so perform extra validation to make sure
+// that it matches the float64x2_t form.
+
+a = 1.5, -2.5
+validate 1.0, -2.0
+// - The biggest f64 representable as an i64 (0x7ffffffffffffc00).
+// - The smallest positive f64 that is out of range (2^63).
+a = 9223372036854774784.0, 9223372036854775808.0
+validate 9223372036854774784.0, -9223372036854775808.0
+// - The smallest f64 representable as an i64 (i64::MIN).
+// - The biggest negative f64 that is out of range.
+a = -9223372036854775808.0, -9223372036854777856.0
+validate -9223372036854775808.0, -9223372036854775808.0
+generate float64x2_t
+
+// Odd-numbered tests for float64x1_t coverage.
+a = 2.9
+validate 2.0
+a = -2.5
+validate -2.0
+a = 9223372036854775808.0
+validate -9223372036854775808.0
+a = -9223372036854777856.0
+validate -9223372036854775808.0
+
+multi_fn = transmute, {self-out-_, {simd_extract, a, 0}}
+link-aarch64 = llvm.aarch64.frint64z.f64:f64:::f64
+generate float64x1_t
+
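[The 64-bit test constants deserve a note: i64::MAX itself is not representable in f64, so the largest in-range input is 0x7ffffffffffffc00 = 2^63 - 1024. A quick check of the constants used above:]

    // f64 has a 53-bit significand, so consecutive values near 2^63 are
    // 2^10 = 1024 apart; 2^63 - 1024 is the last f64 below 2^63.
    fn main() {
        let below = 9223372036854774784.0f64; // 0x7ffffffffffffc00
        let pow63 = 9223372036854775808.0f64; // 2^63, first out-of-range input
        assert_eq!(below as u64, 0x7ffffffffffffc00);
        assert_eq!(pow63 - below, 1024.0);
    }
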
/// Transpose elements
name = vtrn
multi_fn = simd_shuffle!, a1:in_t, a, b, {transpose-1-in_len}
@@ -7209,7 +7476,7 @@ generate uint8x8_t:uint8x8_t:uint8x8x2_t, uint16x4_t:uint16x4_t:uint16x4x2_t
generate poly8x8_t:poly8x8_t:poly8x8x2_t, poly16x4_t:poly16x4_t:poly16x4x2_t
arm = vtrn
generate int32x2_t:int32x2_t:int32x2x2_t, uint32x2_t:uint32x2_t:uint32x2x2_t
-aarch64 = ext
+aarch64 = zip
arm = vorr
generate int8x16_t:int8x16_t:int8x16x2_t, int16x8_t:int16x8_t:int16x8x2_t, int32x4_t:int32x4_t:int32x4x2_t
generate uint8x16_t:uint8x16_t:uint8x16x2_t, uint16x8_t:uint16x8_t:uint16x8x2_t, uint32x4_t:uint32x4_t:uint32x4x2_t
@@ -7227,7 +7494,7 @@ validate 1., 5., 2., 6., 3., 7., 4., 8.
aarch64 = zip
arm = vtrn
generate float32x2_t:float32x2_t:float32x2x2_t
-aarch64 = ext
+aarch64 = zip
arm = vorr
generate float32x4_t:float32x4_t:float32x4x2_t
diff --git a/library/stdarch/crates/stdarch-gen/src/main.rs b/library/stdarch/crates/stdarch-gen/src/main.rs
index 652aee88c..8e2bea0e2 100644
--- a/library/stdarch/crates/stdarch-gen/src/main.rs
+++ b/library/stdarch/crates/stdarch-gen/src/main.rs
@@ -799,6 +799,19 @@ fn type_to_half(t: &str) -> &str {
}
}
+fn type_with_merged_lanes(t: &str, elements_per_lane: usize) -> String {
+    // Group `elements_per_lane` adjacent elements into one wider lane,
+    // e.g. int8x8_t with 4 elements per lane becomes int32x2_t.
+    assert_eq!(type_len(t) % elements_per_lane, 0);
+    let prefix_len = t.find(|c: char| c.is_ascii_digit()).unwrap_or(t.len());
+ format!(
+ "{prefix}{bits}x{len}_t",
+ prefix = &t[0..prefix_len],
+ bits = type_bits(t) * elements_per_lane,
+ len = type_len(t) / elements_per_lane
+ )
+}
+
fn asc(start: i32, len: usize) -> String {
let mut s = String::from("[");
for i in 0..len {
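[type_with_merged_lanes is what backs the merge4_t2 substitution added to get_call below, and these specs only ever call it with elements_per_lane = 4. The expected rewrites, written as a hypothetical test (not part of the commit):]

    // Four 8-bit elements merge into one 32-bit lane, so the element
    // width quadruples and the lane count divides by four.
    #[test]
    fn merge4_examples() {
        assert_eq!(type_with_merged_lanes("int8x8_t", 4), "int32x2_t");
        assert_eq!(type_with_merged_lanes("uint8x16_t", 4), "uint32x4_t");
    }
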
@@ -2515,7 +2528,7 @@ fn gen_arm(
{function_doc}
#[inline]
-#[cfg(target_arch = "aarch64")]{target_feature_aarch64}
+#[cfg(not(target_arch = "arm"))]{target_feature_aarch64}
#[cfg_attr(test, assert_instr({assert_aarch64}{const_assert}))]{const_legacy}{stable_aarch64}
{call_aarch64}
"#,
@@ -2993,6 +3006,12 @@ fn get_call(
re = Some((re_params[0].clone(), in_t[1].to_string()));
} else if re_params[1] == "out_t" {
re = Some((re_params[0].clone(), out_t.to_string()));
+ } else if re_params[1] == "out_unsigned" {
+ re = Some((re_params[0].clone(), type_to_unsigned(out_t).to_string()));
+ } else if re_params[1] == "out_signed" {
+ re = Some((re_params[0].clone(), type_to_signed(out_t).to_string()));
+ } else if re_params[1] == "merge4_t2" {
+ re = Some((re_params[0].clone(), type_with_merged_lanes(in_t[2], 4)));
} else if re_params[1] == "half" {
re = Some((re_params[0].clone(), type_to_half(in_t[1]).to_string()));
} else if re_params[1] == "in_ntt" {