Diffstat (limited to 'library/stdarch/crates/stdarch-gen')
-rw-r--r-- | library/stdarch/crates/stdarch-gen/neon.spec | 343 |
-rw-r--r-- | library/stdarch/crates/stdarch-gen/src/main.rs | 21 |
2 files changed, 325 insertions, 39 deletions
diff --git a/library/stdarch/crates/stdarch-gen/neon.spec b/library/stdarch/crates/stdarch-gen/neon.spec
index 06090e669..760fa2204 100644
--- a/library/stdarch/crates/stdarch-gen/neon.spec
+++ b/library/stdarch/crates/stdarch-gen/neon.spec
@@ -3478,27 +3478,138 @@ link-arm = vst4lane._EXTpi8r_
 const-arm = LANE
 generate *mut f32:float32x2x4_t:void, *mut f32:float32x4x4_t:void
 
+/// Dot product vector form with unsigned and signed integers
+name = vusdot
+out-suffix
+a = 1000, -4200, -1000, 2000
+b = 100, 205, 110, 195, 120, 185, 130, 175, 140, 165, 150, 155, 160, 145, 170, 135
+c = 0, 1, 2, 3, -1, -2, -3, -4, 4, 5, 6, 7, -5, -6, -7, -8
+aarch64 = usdot
+arm = vusdot
+target = i8mm
+
+// 1000 + (100, 205, 110, 195) . ( 0, 1, 2, 3)
+// -4200 + (120, 185, 130, 175) . (-1, -2, -3, -4)
+// ...
+validate 2010, -5780, 2370, -1940
+
+link-arm = usdot._EXT2_._EXT4_:int32x2_t:uint8x8_t:int8x8_t:int32x2_t
+link-aarch64 = usdot._EXT2_._EXT4_:int32x2_t:uint8x8_t:int8x8_t:int32x2_t
+generate int32x2_t:uint8x8_t:int8x8_t:int32x2_t
+
+link-arm = usdot._EXT2_._EXT4_:int32x4_t:uint8x16_t:int8x16_t:int32x4_t
+link-aarch64 = usdot._EXT2_._EXT4_:int32x4_t:uint8x16_t:int8x16_t:int32x4_t
+generate int32x4_t:uint8x16_t:int8x16_t:int32x4_t
+
+/// Dot product index form with unsigned and signed integers
+name = vusdot
+out-lane-suffixes
+constn = LANE
+aarch64 = usdot
+arm = vusdot
+target = i8mm
+multi_fn = static_assert_imm-in2_dot-LANE
+multi_fn = transmute, c:merge4_t2, c
+multi_fn = simd_shuffle!, c:out_signed, c, c, {dup-out_len-LANE as u32}
+multi_fn = vusdot-out-noext, a, b, {transmute, c}
+a = 1000, -4200, -1000, 2000
+b = 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250
+c = 4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11
+
+// 1000 + (100, 110, 120, 130) . (4, 3, 2, 1)
+// -4200 + (140, 150, 160, 170) . (4, 3, 2, 1)
+// ...
+n = 0
+validate 2100, -2700, 900, 4300
+
+// 1000 + (100, 110, 120, 130) . (0, -1, -2, -3)
+// -4200 + (140, 150, 160, 170) . (0, -1, -2, -3)
+// ...
+n = 1
+validate 260, -5180, -2220, 540
+
+generate int32x2_t:uint8x8_t:int8x8_t:int32x2_t
+generate int32x4_t:uint8x16_t:int8x8_t:int32x4_t
+
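[Note: the usdot lane entries above select one 32-bit group of `c` and dot it against every four-byte group of `b`, multiplying unsigned bytes by signed bytes. The following is a minimal scalar sketch cross-checking the n = 0 test vector; the function and array types are illustrative stand-ins, not the generated intrinsics.]

fn vusdot_lane_model<const LANE: usize>(a: [i32; 4], b: [u8; 16], c: [i8; 8]) -> [i32; 4] {
    // The selected 32-bit lane of c, viewed as four signed bytes.
    let lane = &c[LANE * 4..][..4];
    let mut out = a;
    for (i, acc) in out.iter_mut().enumerate() {
        for j in 0..4 {
            // Unsigned byte times signed byte, widened to i32 and accumulated.
            *acc += b[i * 4 + j] as i32 * lane[j] as i32;
        }
    }
    out
}

fn main() {
    let a = [1000, -4200, -1000, 2000];
    let b = [100, 110, 120, 130, 140, 150, 160, 170,
             180, 190, 200, 210, 220, 230, 240, 250];
    let c = [4, 3, 2, 1, 0, -1, -2, -3];
    // Matches the spec's `validate 2100, -2700, 900, 4300` for n = 0.
    assert_eq!(vusdot_lane_model::<0>(a, b, c), [2100, -2700, 900, 4300]);
}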
+/// Dot product index form with unsigned and signed integers
+name = vusdot
+out-lane-suffixes
+constn = LANE
+// Only AArch64 has the laneq forms.
+aarch64 = usdot
+target = i8mm
+multi_fn = static_assert_imm-in2_dot-LANE
+multi_fn = transmute, c:merge4_t2, c
+multi_fn = simd_shuffle!, c:out_signed, c, c, {dup-out_len-LANE as u32}
+multi_fn = vusdot-out-noext, a, b, {transmute, c}
+a = 1000, -4200, -1000, 2000
+b = 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250
+c = 4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11
+
+// 1000 + (100, 110, 120, 130) . (-8, -9, -10, -11)
+// -4200 + (140, 150, 160, 170) . (-8, -9, -10, -11)
+// ...
+n = 3
+validate -3420, -10140, -8460, -6980
+
+generate int32x2_t:uint8x8_t:int8x16_t:int32x2_t
+generate int32x4_t:uint8x16_t:int8x16_t:int32x4_t
+
 /// Dot product index form with signed and unsigned integers
 name = vsudot
 out-lane-suffixes
 constn = LANE
+aarch64 = sudot
+arm = vsudot
+target = i8mm
+
 multi_fn = static_assert_imm-in2_dot-LANE
-multi_fn = simd_shuffle!, c:unsigned, c, c, {base-4-LANE}
-multi_fn = vsudot-outlane-_, a, b, c
-a = 1, 2, 1, 2
-b = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8
-c = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8
+multi_fn = transmute, c:merge4_t2, c
+multi_fn = simd_shuffle!, c:out_unsigned, c, c, {dup-out_len-LANE as u32}
+multi_fn = vusdot-out-noext, a, {transmute, c}, b
+a = -2000, 4200, -1000, 2000
+b = 4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11
+c = 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250
+
+// -2000 + (4, 3, 2, 1) . (100, 110, 120, 130)
+// 4200 + (0, -1, -2, -3) . (100, 110, 120, 130)
+// ...
 n = 0
-validate 31, 72, 31, 72
-target = dotprod
+validate -900, 3460, -3580, -2420
+
+// -2000 + (4, 3, 2, 1) . (140, 150, 160, 170)
+// 4200 + (0, -1, -2, -3) . (140, 150, 160, 170)
+// ...
+n = 1
+validate -500, 3220, -4460, -3940
+generate int32x2_t:int8x8_t:uint8x8_t:int32x2_t
+generate int32x4_t:int8x16_t:uint8x8_t:int32x4_t
+
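[Note: there is no separate sudot primitive in this lowering. The line `vusdot-out-noext, a, {transmute, c}, b` reuses the usdot intrinsic with the operand roles swapped: the broadcast unsigned lane goes where usdot expects its unsigned input, and the signed vector goes second. A minimal scalar sketch of that identity follows; names are illustrative, not the generated API.]

// usdot primitive: unsigned bytes times signed bytes, accumulated into i32.
fn usdot(acc: i32, u: [u8; 4], s: [i8; 4]) -> i32 {
    (0..4).fold(acc, |t, i| t + u[i] as i32 * s[i] as i32)
}

// sudot_lane: broadcast the selected unsigned lane of c, then call usdot
// with the unsigned operand first, mirroring `a, {transmute, c}, b` above.
fn sudot_lane<const LANE: usize>(acc: i32, b: [i8; 4], c: [u8; 16]) -> i32 {
    let lane: [u8; 4] = c[LANE * 4..][..4].try_into().unwrap();
    usdot(acc, lane, b)
}

fn main() {
    let c = [100, 110, 120, 130, 140, 150, 160, 170,
             180, 190, 200, 210, 220, 230, 240, 250];
    // First output lane of the spec's n = 1 test: `validate -500, ...`.
    assert_eq!(sudot_lane::<1>(-2000, [4, 3, 2, 1], c), -500);
}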
+/// Dot product index form with signed and unsigned integers
+name = vsudot
+out-lane-suffixes
+constn = LANE
+// Only AArch64 has the laneq forms.
 aarch64 = sudot
-link-aarch64 = usdot._EXT2_._EXT4_:int32x2_t:int8x8_t:uint8x8_t:int32x2_t
-// LLVM ERROR: Cannot select: intrinsic %llvm.aarch64.neon.usdot
-//generate int32x2_t:int8x8_t:uint8x8_t:int32x2_t, int32x2_t:int8x8_t:uint8x16_t:int32x2_t
-link-aarch64 = usdot._EXT2_._EXT4_:int32x4_t:int8x16_t:uint8x16_t:int32x4_t
-// LLVM ERROR: Cannot select: intrinsic %llvm.aarch64.neon.usdot
-//generate int32x4_t:int8x16_t:uint8x8_t:int32x4_t, int32x4_t:int8x16_t:uint8x16_t:int32x4_t
+target = i8mm
+
+multi_fn = static_assert_imm-in2_dot-LANE
+multi_fn = transmute, c:merge4_t2, c
+multi_fn = simd_shuffle!, c:out_unsigned, c, c, {dup-out_len-LANE as u32}
+multi_fn = vusdot-out-noext, a, {transmute, c}, b
+a = -2000, 4200, -1000, 2000
+b = 4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11
+c = 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250
+
+// -2000 + (4, 3, 2, 1) . (220, 230, 240, 250)
+// 4200 + (0, -1, -2, -3) . (220, 230, 240, 250)
+// ...
+n = 3
+validate 300, 2740, -6220, -6980
+
+generate int32x2_t:int8x8_t:uint8x16_t:int32x2_t
+generate int32x4_t:int8x16_t:uint8x16_t:int32x4_t
 
 /// Multiply
 name = vmul
@@ -4612,7 +4723,7 @@ aarch64 = fcmla
 generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t
 generate float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t
 
-/// Dot product arithmetic
+/// Dot product arithmetic (vector)
 name = vdot
 out-suffix
 a = 1, 2, 1, 2
@@ -4621,35 +4732,65 @@ c = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8
 validate 31, 176, 31, 176
 target = dotprod
 
+arm = vsdot
 aarch64 = sdot
+link-arm = sdot._EXT_._EXT3_
 link-aarch64 = sdot._EXT_._EXT3_
 generate int32x2_t:int8x8_t:int8x8_t:int32x2_t, int32x4_t:int8x16_t:int8x16_t:int32x4_t
 
+arm = vudot
 aarch64 = udot
+link-arm = udot._EXT_._EXT3_
 link-aarch64 = udot._EXT_._EXT3_
 generate uint32x2_t:uint8x8_t:uint8x8_t:uint32x2_t, uint32x4_t:uint8x16_t:uint8x16_t:uint32x4_t
 
-/// Dot product arithmetic
+/// Dot product arithmetic (indexed)
 name = vdot
 out-lane-suffixes
 constn = LANE
 multi_fn = static_assert_imm-in2_dot-LANE
-multi_fn = simd_shuffle!, c:in_t, c, c, {base-4-LANE}
-multi_fn = vdot-out-noext, a, b, c
+multi_fn = transmute, c:merge4_t2, c
+multi_fn = simd_shuffle!, c:out_t, c, c, {dup-out_len-LANE as u32}
+multi_fn = vdot-out-noext, a, b, {transmute, c}
 a = 1, 2, 1, 2
-b = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8
+b = -1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8
 c = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8
 n = 0
-validate 31, 72, 31, 72
+validate 29, 72, 31, 72
 target = dotprod
 
+// Only AArch64 has the laneq forms.
 aarch64 = sdot
-generate int32x2_t:int8x8_t:int8x8_t:int32x2_t, int32x2_t:int8x8_t:int8x16_t:int32x2_t
-generate int32x4_t:int8x16_t:int8x8_t:int32x4_t, int32x4_t:int8x16_t:int8x16_t:int32x4_t
+generate int32x2_t:int8x8_t:int8x16_t:int32x2_t
+generate int32x4_t:int8x16_t:int8x16_t:int32x4_t
+
+arm = vsdot
+generate int32x2_t:int8x8_t:int8x8_t:int32x2_t
+generate int32x4_t:int8x16_t:int8x8_t:int32x4_t
+
+/// Dot product arithmetic (indexed)
+name = vdot
+out-lane-suffixes
+constn = LANE
+multi_fn = static_assert_imm-in2_dot-LANE
+multi_fn = transmute, c:merge4_t2, c
+multi_fn = simd_shuffle!, c:out_t, c, c, {dup-out_len-LANE as u32}
+multi_fn = vdot-out-noext, a, b, {transmute, c}
+a = 1, 2, 1, 2
+b = 255, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8
+c = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8
+n = 0
+validate 285, 72, 31, 72
+target = dotprod
+// Only AArch64 has the laneq forms.
 aarch64 = udot
-generate uint32x2_t:uint8x8_t:uint8x8_t:uint32x2_t, uint32x2_t:uint8x8_t:uint8x16_t:uint32x2_t
-generate uint32x4_t:uint8x16_t:uint8x8_t:uint32x4_t, uint32x4_t:uint8x16_t:uint8x16_t:uint32x4_t
+generate uint32x2_t:uint8x8_t:uint8x16_t:uint32x2_t
+generate uint32x4_t:uint8x16_t:uint8x16_t:uint32x4_t
+
+arm = vudot
+generate uint32x2_t:uint8x8_t:uint8x8_t:uint32x2_t
+generate uint32x4_t:uint8x16_t:uint8x8_t:uint32x4_t
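[Note: all of the indexed entries share the same lane-broadcast idiom: `merge4_t2` reinterprets the byte vector as a vector of 32-bit lanes, `simd_shuffle!` duplicates the selected lane, and a final `transmute` turns it back into bytes. A minimal sketch of that reinterpret/duplicate/reinterpret step, with plain arrays standing in for the SIMD types:]

fn dup_lane4<const LANE: usize>(c: [i8; 8]) -> [i8; 8] {
    // merge4_t2: view the eight bytes as two 32-bit lanes.
    let merged: [i32; 2] = unsafe { core::mem::transmute(c) };
    // simd_shuffle!: broadcast the selected lane to every position.
    let dup = [merged[LANE], merged[LANE]];
    // Final transmute back to the byte view.
    unsafe { core::mem::transmute(dup) }
}

fn main() {
    let c: [i8; 8] = [1, 2, 3, 4, 5, 6, 7, 8];
    // Lane 1 (bytes 4..8) duplicated across the whole vector.
    assert_eq!(dup_lane4::<1>(c), [5, 6, 7, 8, 5, 6, 7, 8]);
}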
 
 /// Maximum (vector)
 name = vmax
@@ -6511,7 +6652,7 @@ name = vrshr
 n-suffix
 constn = N
 multi_fn = static_assert-N-1-bits
-multi_fn = vrshl-self-noext, a, {vdup-nself-noext, (-N) as _}
+multi_fn = vrshl-self-noext, a, {vdup-nself-noext, -N as _}
 a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
 n = 2
 validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
@@ -6538,7 +6679,7 @@ name = vrshr
 n-suffix
 constn = N
 multi_fn = static_assert-N-1-bits
-multi_fn = vrshl-self-noext, a, {vdup-nsigned-noext, (-N) as _}
+multi_fn = vrshl-self-noext, a, {vdup-nsigned-noext, -N as _}
 a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
 n = 2
 validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
@@ -6650,10 +6791,10 @@ b = 4
 n = 2
 validate 2
 
-aarch64 = srsra
+aarch64 = srshr
 generate i64
 
-/// Ungisned rounding shift right and accumulate.
+/// Unsigned rounding shift right and accumulate.
 name = vrsra
 n-suffix
 constn = N
@@ -6665,7 +6806,7 @@ b = 4
 n = 2
 validate 2
 
-aarch64 = ursra
+aarch64 = urshr
 generate u64
 
 /// Rounding subtract returning high narrow
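[Note: the assert_instr changes above reflect that the scalar vrsrad_n forms compile to a rounding shift plus an ordinary add, so srshr/urshr is the instruction actually emitted. The rounding shift adds 1 << (N - 1) before shifting; for the signed element types used here, `(-N) as _` and `-N as _` evaluate to the same value. A scalar sketch checking the test vectors (the real vrshl path widens the intermediate so the bias cannot overflow):]

// Rounding shift right: add the rounding bias, then arithmetic shift by N.
fn rshr<const N: u32>(x: i64) -> i64 {
    (x + (1i64 << (N - 1))) >> N
}

fn main() {
    assert_eq!(rshr::<2>(4), 1);     // vrshr: a = 4, n = 2 -> validate 1
    assert_eq!(1 + rshr::<2>(4), 2); // vrsra: a = 1, b = 4, n = 2 -> validate 2
}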
@@ -7071,44 +7212,170 @@ generate uint64x2_t
 
 /// Floating-point round to 32-bit integer, using current rounding mode
 name = vrnd32x
-a = 1.1, 1.9, -1.7, -2.3
-validate 1.0, 2.0, -2.0, -2.0
 target = frintts
+// For validation, the rounding mode should be the default: round-to-nearest (ties-to-even).
+a = -1.5, 2.9, 1.5, -2.5
+validate -2.0, 3.0, 2.0, -2.0
+
 aarch64 = frint32x
 link-aarch64 = frint32x._EXT_
 generate float32x2_t, float32x4_t
 
+// The float64x1_t form uses a different LLVM link and isn't supported by Clang
+// (and so has no intrinsic-test), so perform extra validation to make sure
+// that it matches the float64x2_t form.
+
+a = 1.5, -2.5
+validate 2.0, -2.0
+// - The biggest f64 that rounds to i32::MAX.
+// - The smallest positive f64 that rounds out of range.
+a = 2147483647.499999762, 2147483647.5
+validate 2147483647.0, -2147483648.0
+// - The smallest f64 that rounds to i32::MIN + 1.
+// - The largest negative f64 that rounds out of range.
+a = -2147483647.499999762, -2147483648.500000477
+validate -2147483647.0, -2147483648.0
+generate float64x2_t
+
+// Odd-numbered tests for float64x1_t coverage.
+a = 2.9
+validate 3.0
+a = -2.5
+validate -2.0
+a = 2147483647.5
+validate -2147483648.0
+a = -2147483648.500000477
+validate -2147483648.0
+
+multi_fn = transmute, {self-out-_, {simd_extract, a, 0}}
+link-aarch64 = llvm.aarch64.frint32x.f64:f64:::f64
+generate float64x1_t
+
 /// Floating-point round to 32-bit integer toward zero
 name = vrnd32z
-a = 1.1, 1.9, -1.7, -2.3
-validate 1.0, 1.0, -1.0, -2.0
 target = frintts
+a = -1.5, 2.9, 1.5, -2.5
+validate -1.0, 2.0, 1.0, -2.0
+
 aarch64 = frint32z
 link-aarch64 = frint32z._EXT_
 generate float32x2_t, float32x4_t
 
+// The float64x1_t form uses a different LLVM link and isn't supported by Clang
+// (and so has no intrinsic-test), so perform extra validation to make sure
+// that it matches the float64x2_t form.
+
+a = 1.5, -2.5
+validate 1.0, -2.0
+// - The biggest f64 that rounds to i32::MAX.
+// - The smallest positive f64 that rounds out of range.
+a = 2147483647.999999762, 2147483648.0
+validate 2147483647.0, -2147483648.0
+// - The smallest f64 that rounds to i32::MIN + 1.
+// - The largest negative f64 that rounds out of range.
+a = -2147483647.999999762, -2147483649.0
+validate -2147483647.0, -2147483648.0
+generate float64x2_t
+
+// Odd-numbered tests for float64x1_t coverage.
+a = 2.9
+validate 2.0
+a = -2.5
+validate -2.0
+a = 2147483648.0
+validate -2147483648.0
+a = -2147483649.0
+validate -2147483648.0
+
+multi_fn = transmute, {self-out-_, {simd_extract, a, 0}}
+link-aarch64 = llvm.aarch64.frint32z.f64:f64:::f64
+generate float64x1_t
+
 /// Floating-point round to 64-bit integer, using current rounding mode
 name = vrnd64x
-a = 1.1, 1.9, -1.7, -2.3
-validate 1.0, 2.0, -2.0, -2.0
 target = frintts
+// For validation, the rounding mode should be the default: round-to-nearest (ties-to-even).
+a = -1.5, 2.9, 1.5, -2.5
+validate -2.0, 3.0, 2.0, -2.0
+
 aarch64 = frint64x
 link-aarch64 = frint64x._EXT_
 generate float32x2_t, float32x4_t
 
+// The float64x1_t form uses a different LLVM link and isn't supported by Clang
+// (and so has no intrinsic-test), so perform extra validation to make sure
+// that it matches the float64x2_t form.
+
+a = 1.5, -2.5
+validate 2.0, -2.0
+// - The biggest f64 representable as an i64 (0x7ffffffffffffc00).
+// - The smallest positive f64 that is out of range (2^63).
+a = 9223372036854774784.0, 9223372036854775808.0
+validate 9223372036854774784.0, -9223372036854775808.0
+// - The smallest f64 representable as an i64 (i64::MIN).
+// - The biggest negative f64 that is out of range.
+a = -9223372036854775808.0, -9223372036854777856.0
+validate -9223372036854775808.0, -9223372036854775808.0
+generate float64x2_t
+
+// Odd-numbered tests for float64x1_t coverage.
+a = 2.9
+validate 3.0
+a = -2.5
+validate -2.0
+a = 9223372036854775808.0
+validate -9223372036854775808.0
+a = -9223372036854777856.0
+validate -9223372036854775808.0
+
+multi_fn = transmute, {self-out-_, {simd_extract, a, 0}}
+link-aarch64 = llvm.aarch64.frint64x.f64:f64:::f64
+generate float64x1_t
+
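[Note: the new float64x2_t vectors probe exactly the edges of the i32/i64 ranges. A minimal plain-f64 model of frint32x under the default round-to-nearest-even mode, as a cross-check of those constants; it assumes the instruction's documented behaviour of forcing out-of-range results to i32::MIN, and `round_ties_even` needs Rust 1.77+.]

// Model of frint32x: round to nearest (ties to even), then map anything that
// does not fit in an i32 to i32::MIN, as the instruction defines.
fn frint32x_model(x: f64) -> f64 {
    let r = x.round_ties_even();
    if r > i32::MAX as f64 || r < i32::MIN as f64 {
        i32::MIN as f64
    } else {
        r
    }
}

fn main() {
    // The biggest f64 that rounds to i32::MAX stays in range...
    assert_eq!(frint32x_model(2147483647.499999762), 2147483647.0);
    // ...while the .5 tie rounds up to 2^31 and falls out of range.
    assert_eq!(frint32x_model(2147483647.5), -2147483648.0);
    // Just past the -2147483648.5 tie, the result rounds to -2^31 - 1.
    assert_eq!(frint32x_model(-2147483648.500000477), -2147483648.0);
}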
 /// Floating-point round to 64-bit integer toward zero
 name = vrnd64z
-a = 1.1, 1.9, -1.7, -2.3
-validate 1.0, 1.0, -1.0, -2.0
 target = frintts
+a = -1.5, 2.9, 1.5, -2.5
+validate -1.0, 2.0, 1.0, -2.0
+
 aarch64 = frint64z
 link-aarch64 = frint64z._EXT_
 generate float32x2_t, float32x4_t
 
+// The float64x1_t form uses a different LLVM link and isn't supported by Clang
+// (and so has no intrinsic-test), so perform extra validation to make sure
+// that it matches the float64x2_t form.
+
+a = 1.5, -2.5
+validate 1.0, -2.0
+// - The biggest f64 representable as an i64 (0x7ffffffffffffc00).
+// - The smallest positive f64 that is out of range (2^63).
+a = 9223372036854774784.0, 9223372036854775808.0
+validate 9223372036854774784.0, -9223372036854775808.0
+// - The smallest f64 representable as an i64 (i64::MIN).
+// - The biggest negative f64 that is out of range.
+a = -9223372036854775808.0, -9223372036854777856.0
+validate -9223372036854775808.0, -9223372036854775808.0
+generate float64x2_t
+
+// Odd-numbered tests for float64x1_t coverage.
+a = 2.9
+validate 2.0
+a = -2.5
+validate -2.0
+a = 9223372036854775808.0
+validate -9223372036854775808.0
+a = -9223372036854777856.0
+validate -9223372036854775808.0
+
+multi_fn = transmute, {self-out-_, {simd_extract, a, 0}}
+link-aarch64 = llvm.aarch64.frint64z.f64:f64:::f64
+generate float64x1_t
+
 /// Transpose elements
 name = vtrn
 multi_fn = simd_shuffle!, a1:in_t, a, b, {transpose-1-in_len}
@@ -7209,7 +7476,7 @@ generate uint8x8_t:uint8x8_t:uint8x8x2_t, uint16x4_t:uint16x4_t:uint16x4x2_t
 generate poly8x8_t:poly8x8_t:poly8x8x2_t, poly16x4_t:poly16x4_t:poly16x4x2_t
 arm = vtrn
 generate int32x2_t:int32x2_t:int32x2x2_t, uint32x2_t:uint32x2_t:uint32x2x2_t
-aarch64 = ext
+aarch64 = zip
 arm = vorr
 generate int8x16_t:int8x16_t:int8x16x2_t, int16x8_t:int16x8_t:int16x8x2_t, int32x4_t:int32x4_t:int32x4x2_t
 generate uint8x16_t:uint8x16_t:uint8x16x2_t, uint16x8_t:uint16x8_t:uint16x8x2_t, uint32x4_t:uint32x4_t:uint32x4x2_t
@@ -7227,7 +7494,7 @@ validate 1., 5., 2., 6., 3., 7., 4., 8.
 
 aarch64 = zip
 arm = vtrn
 generate float32x2_t:float32x2_t:float32x2x2_t
-aarch64 = ext
+aarch64 = zip
 arm = vorr
 generate float32x4_t:float32x4_t:float32x4x2_t
diff --git a/library/stdarch/crates/stdarch-gen/src/main.rs b/library/stdarch/crates/stdarch-gen/src/main.rs
index 652aee88c..8e2bea0e2 100644
--- a/library/stdarch/crates/stdarch-gen/src/main.rs
+++ b/library/stdarch/crates/stdarch-gen/src/main.rs
@@ -799,6 +799,19 @@ fn type_to_half(t: &str) -> &str {
     }
 }
 
+fn type_with_merged_lanes(t: &str, elements_per_lane: usize) -> String {
+    assert_eq!(type_len(t) % elements_per_lane, 0);
+    let prefix_len = t
+        .find(|c: char| c.is_ascii_digit())
+        .unwrap_or_else(|| t.len());
+    format!(
+        "{prefix}{bits}x{len}_t",
+        prefix = &t[0..prefix_len],
+        bits = type_bits(t) * elements_per_lane,
+        len = type_len(t) / elements_per_lane
+    )
+}
+
 fn asc(start: i32, len: usize) -> String {
     let mut s = String::from("[");
     for i in 0..len {
@@ -2515,7 +2528,7 @@ fn gen_arm(
 
 {function_doc}
 #[inline]
-#[cfg(target_arch = "aarch64")]{target_feature_aarch64}
+#[cfg(not(target_arch = "arm"))]{target_feature_aarch64}
 #[cfg_attr(test, assert_instr({assert_aarch64}{const_assert}))]{const_legacy}{stable_aarch64}
 {call_aarch64}
 "#,
@@ -2993,6 +3006,12 @@ fn get_call(
         re = Some((re_params[0].clone(), in_t[1].to_string()));
     } else if re_params[1] == "out_t" {
         re = Some((re_params[0].clone(), out_t.to_string()));
+    } else if re_params[1] == "out_unsigned" {
+        re = Some((re_params[0].clone(), type_to_unsigned(out_t).to_string()));
+    } else if re_params[1] == "out_signed" {
+        re = Some((re_params[0].clone(), type_to_signed(out_t).to_string()));
+    } else if re_params[1] == "merge4_t2" {
+        re = Some((re_params[0].clone(), type_with_merged_lanes(in_t[2], 4)));
     } else if re_params[1] == "half" {
         re = Some((re_params[0].clone(), type_to_half(in_t[1]).to_string()));
     } else if re_params[1] == "in_ntt" {
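[Note: on the generator side, the `merge4_t2` substitution resolves through the new `type_with_merged_lanes` helper. A trimmed, self-contained sketch follows; the `type_len`/`type_bits` stubs here cover only the types these spec entries use, whereas the real functions in main.rs handle the whole type table.]

fn type_len(t: &str) -> usize {
    match t {
        "int8x8_t" | "uint8x8_t" => 8,
        "int8x16_t" | "uint8x16_t" => 16,
        _ => unimplemented!("stub; see the full table in main.rs"),
    }
}

fn type_bits(t: &str) -> usize {
    match t {
        "int8x8_t" | "uint8x8_t" | "int8x16_t" | "uint8x16_t" => 8,
        _ => unimplemented!("stub; see the full table in main.rs"),
    }
}

fn type_with_merged_lanes(t: &str, elements_per_lane: usize) -> String {
    assert_eq!(type_len(t) % elements_per_lane, 0);
    // Split "int8x8_t" into the "int" prefix and the numeric tail, then
    // rebuild it with wider elements and proportionally fewer lanes.
    let prefix_len = t
        .find(|c: char| c.is_ascii_digit())
        .unwrap_or_else(|| t.len());
    format!(
        "{prefix}{bits}x{len}_t",
        prefix = &t[0..prefix_len],
        bits = type_bits(t) * elements_per_lane,
        len = type_len(t) / elements_per_lane
    )
}

fn main() {
    // merge4_t2 fuses four 8-bit elements into one 32-bit lane.
    assert_eq!(type_with_merged_lanes("int8x8_t", 4), "int32x2_t");
    assert_eq!(type_with_merged_lanes("uint8x16_t", 4), "uint32x4_t");
}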