| author    | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-17 12:02:58 +0000 |
|-----------|-----------------------------------------------------|---------------------------|
| committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-17 12:02:58 +0000 |
| commit    | 698f8c2f01ea549d77d7dc3338a12e04c11057b9 (patch) | |
| tree      | 173a775858bd501c378080a10dca74132f05bc50 /library/stdarch/crates/stdarch-gen | |
| parent    | Initial commit. (diff) | |
| download  | rustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.tar.xz, rustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.zip | |
Adding upstream version 1.64.0+dfsg1. (upstream/1.64.0+dfsg1)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'library/stdarch/crates/stdarch-gen')
| -rw-r--r-- | library/stdarch/crates/stdarch-gen/Cargo.toml  | 9    |
| -rw-r--r-- | library/stdarch/crates/stdarch-gen/README.md   | 11   |
| -rw-r--r-- | library/stdarch/crates/stdarch-gen/neon.spec   | 7560 |
| -rw-r--r-- | library/stdarch/crates/stdarch-gen/src/main.rs | 3391 |
4 files changed, 10971 insertions, 0 deletions
diff --git a/library/stdarch/crates/stdarch-gen/Cargo.toml b/library/stdarch/crates/stdarch-gen/Cargo.toml
new file mode 100644
index 000000000..b339672f4
--- /dev/null
+++ b/library/stdarch/crates/stdarch-gen/Cargo.toml
@@ -0,0 +1,9 @@
+[package]
+name = "stdarch-gen"
+version = "0.1.0"
+authors = ["Heinz Gies <heinz@licenser.net>"]
+edition = "2018"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
diff --git a/library/stdarch/crates/stdarch-gen/README.md b/library/stdarch/crates/stdarch-gen/README.md
new file mode 100644
index 000000000..54b602cdd
--- /dev/null
+++ b/library/stdarch/crates/stdarch-gen/README.md
@@ -0,0 +1,11 @@
+# Neon intrinsic code generator
+
+A small tool that allows you to quickly generate intrinsics for the NEON architecture.
+
+The specification for the intrinsics can be found in `neon.spec`.
+
+To run and re-generate the code, run the following from the root of the `stdarch` crate.
+
+```
+OUT_DIR=`pwd`/crates/core_arch cargo run -p stdarch-gen -- crates/stdarch-gen/neon.spec
+```
\ No newline at end of file
diff --git a/library/stdarch/crates/stdarch-gen/neon.spec b/library/stdarch/crates/stdarch-gen/neon.spec
new file mode 100644
index 000000000..68a50fbe9
--- /dev/null
+++ b/library/stdarch/crates/stdarch-gen/neon.spec
@@ -0,0 +1,7560 @@
+// ARM Neon intrinsic specification.
+//
+// This file contains the specification for a number of
+// intrinsics that allows us to generate them along with
+// their test cases.
+//
+// A note on the syntax of the file: it is not parsed very intelligently!
+//
+// # Comments
+// Comments start with two slashes, or with four or more, so // is a
+// comment and /////// is too (exactly three slashes starts a section).
+//
+// # Sections
+// Sections start with EXACTLY three slashes followed
+// by AT LEAST one space. Sections are used for two things:
+//
+// 1) they serve as the doc comment for the given intrinsics.
+// 2) they reset all variables (name, fn, etc.)
+//
+// # Variables
+//
+// name - The prefix of the function; suffixes are auto-generated
+//        from the type they get passed.
+//
+// fn - The function to call in rust-land.
+//
+// aarch64 - The intrinsic to check on the aarch64 architecture.
+//           If this is given but no arm intrinsic is provided,
+//           the function will exclusively be generated for
+//           aarch64.
+//           This is used to generate both aarch64-specific and
+//           shared intrinsics, by first specifying only the aarch64
+//           variant and then the arm variant.
+//
+// arm - The arm v7 intrinsic used to check for arm code
+//       generation. All neon functions available in arm are
+//       also available in aarch64. If no aarch64 intrinsic was
+//       set, they are assumed to be the same.
+//       Intrinsics ending with a `.` will have a size suffix
+//       added (such as `i8` or `i64`) that is not sign specific.
+//       Intrinsics ending with a `.s` will have a size suffix
+//       added (such as `s8` or `u64`) that is sign specific.
+//
+// a - First input for tests; it gets scaled to the size of
+//     the type.
+//
+// b - Second input for tests; it gets scaled to the size of
+//     the type.
+//
+// # Special values
+//
+// TRUE - 'true', all bits are set to 1
+// FALSE - 'false', all bits are set to 0
+// FF - same as 'true'
+// MIN - minimal value (either 0 or the lowest negative number)
+// MAX - maximal value, prone to overflow
+//
+// # validate <values>
+// Validates the result computed from a and b against the expected values.
+// The special values 'TRUE' and 'FALSE' can be used to
+// represent the correct NEON representation of true or
+// false values. They too get scaled to the type.
+//
+// Validate needs to be called before generate, as it sets
+// up the rules for validation that get generated for each
+// type.
+// # generate <types>
+// The generate command generates the intrinsics; it uses the
+// variables set above and can be called multiple times while
+// overwriting some of the variables.
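Read together, these pieces combine into entries like the `vand` definition that opens the spec; it is shown here in its intended line-by-line layout, with values taken verbatim from the first entry below. The `///` section supplies the doc comment, `name` and `fn` give the intrinsic prefix and the rust-land function it maps to, `arm`/`aarch64` name the instruction the generated code is checked against on each architecture, `a` and `b` are the test inputs, `validate` lists the expected results, and `generate` lists the type combinations to emit.

```
/// Vector bitwise and
name = vand
fn = simd_and
arm = vand
aarch64 = and
a = 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x00
b = 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F
validate 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x00
generate int*_t, uint*_t, int64x*_t, uint64x*_t
```

Further `b =`/`validate` pairs and `generate` lines may follow within the same section to add more test vectors and more type combinations, as the entries below do.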
+ +/// Vector bitwise and +name = vand +fn = simd_and +arm = vand +aarch64 = and +a = 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x00 +b = 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F +validate 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x00 +b = 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +validate 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +generate int*_t, uint*_t, int64x*_t, uint64x*_t + +/// Vector bitwise or (immediate, inclusive) +name = vorr +fn = simd_or +arm = vorr +aarch64 = orr +a = 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F +b = 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +validate 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F +generate int*_t, uint*_t, int64x*_t, uint64x*_t + + +/// Vector bitwise exclusive or (vector) +name = veor +fn = simd_xor +arm = veor +aarch64 = eor +a = 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F +b = 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +validate 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F +generate int*_t, uint*_t, int64x*_t, uint64x*_t + +/// Three-way exclusive OR +name = veor3 +a = 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F +b = 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +c = 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +validate 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F +target = sha3 + +aarch64 = eor3 +link-aarch64 = llvm.aarch64.crypto.eor3s._EXT_ +generate int8x16_t, int16x8_t, int32x4_t, int64x2_t +link-aarch64 = llvm.aarch64.crypto.eor3u._EXT_ +generate uint8x16_t, uint16x8_t, uint32x4_t, uint64x2_t + +//////////////////// +// Absolute difference between the arguments +//////////////////// + +/// Absolute difference between the arguments +name = vabd +a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +b = 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 +validate 15, 13, 11, 9, 7, 5, 3, 1, 1, 3, 5, 7, 9, 11, 13, 15 + +arm = vabd.s +aarch64 = sabd +link-arm = vabds._EXT_ +link-aarch64 = sabd._EXT_ +generate int*_t + +arm = vabd.s +aarch64 = uabd +link-arm = vabdu._EXT_ +link-aarch64 = uabd._EXT_ +generate uint*_t + +/// Absolute difference between the arguments of Floating +name = vabd +a = 1.0, 2.0, 5.0, -4.0 +b = 9.0, 3.0, 2.0, 8.0 +validate 8.0, 1.0, 3.0, 12.0 + +aarch64 = fabd +link-aarch64 = fabd._EXT_ +generate float64x*_t + +arm = vabd.s +aarch64 = fabd +link-arm = vabds._EXT_ +link-aarch64 = fabd._EXT_ +generate float*_t + +/// Floating-point absolute difference +name = vabd +multi_fn = simd_extract, {vabd-in_ntt-noext, {vdup_n-in_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0 +a = 1.0 +b = 9.0 +validate 8.0 + +aarch64 = fabd +generate f32, f64 + +//////////////////// +// Absolute difference Long +//////////////////// + +/// Unsigned Absolute difference Long +name = vabdl +multi_fn = simd_cast, {vabd-unsigned-noext, a, b} +a = 1, 2, 3, 4, 4, 3, 2, 1 +b = 10, 10, 10, 10, 
10, 10, 10, 10 +validate 9, 8, 7, 6, 6, 7, 8, 9 + +arm = vabdl.s +aarch64 = uabdl +generate uint8x8_t:uint8x8_t:uint16x8_t, uint16x4_t:uint16x4_t:uint32x4_t, uint32x2_t:uint32x2_t:uint64x2_t + +/// Signed Absolute difference Long +name = vabdl +multi_fn = simd_cast, c:uint8x8_t, {vabd-signed-noext, a, b} +multi_fn = simd_cast, c +a = 1, 2, 3, 4, 4, 3, 2, 1 +b = 10, 10, 10, 10, 10, 10, 10, 10 +validate 9, 8, 7, 6, 6, 7, 8, 9 + +arm = vabdl.s +aarch64 = sabdl +generate int8x8_t:int8x8_t:int16x8_t + +/// Signed Absolute difference Long +name = vabdl +multi_fn = simd_cast, c:uint16x4_t, {vabd-signed-noext, a, b} +multi_fn = simd_cast, c +a = 1, 2, 11, 12 +b = 10, 10, 10, 10 +validate 9, 8, 1, 2 + +arm = vabdl.s +aarch64 = sabdl +generate int16x4_t:int16x4_t:int32x4_t + +/// Signed Absolute difference Long +name = vabdl +multi_fn = simd_cast, c:uint32x2_t, {vabd-signed-noext, a, b} +multi_fn = simd_cast, c +a = 1, 11 +b = 10, 10 +validate 9, 1 + +arm = vabdl.s +aarch64 = sabdl +generate int32x2_t:int32x2_t:int64x2_t + +/// Unsigned Absolute difference Long +name = vabdl_high +no-q +multi_fn = simd_shuffle8!, c:uint8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15] +multi_fn = simd_shuffle8!, d:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15] +multi_fn = simd_cast, {vabd_u8, c, d} +a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +b = 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10 +validate 1, 0, 1, 2, 3, 4, 5, 6 + +aarch64 = uabdl +generate uint8x16_t:uint8x16_t:uint16x8_t + +/// Unsigned Absolute difference Long +name = vabdl_high +no-q +multi_fn = simd_shuffle4!, c:uint16x4_t, a, a, [4, 5, 6, 7] +multi_fn = simd_shuffle4!, d:uint16x4_t, b, b, [4, 5, 6, 7] +multi_fn = simd_cast, {vabd_u16, c, d} +a = 1, 2, 3, 4, 8, 9, 11, 12 +b = 10, 10, 10, 10, 10, 10, 10, 10 +validate 2, 1, 1, 2 + +aarch64 = uabdl +generate uint16x8_t:uint16x8_t:uint32x4_t + +/// Unsigned Absolute difference Long +name = vabdl_high +no-q +multi_fn = simd_shuffle2!, c:uint32x2_t, a, a, [2, 3] +multi_fn = simd_shuffle2!, d:uint32x2_t, b, b, [2, 3] +multi_fn = simd_cast, {vabd_u32, c, d} +a = 1, 2, 3, 4 +b = 10, 10, 10, 10 +validate 7, 6 + +aarch64 = uabdl +generate uint32x4_t:uint32x4_t:uint64x2_t + +/// Signed Absolute difference Long +name = vabdl_high +no-q +multi_fn = simd_shuffle8!, c:int8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15] +multi_fn = simd_shuffle8!, d:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15] +multi_fn = simd_cast, e:uint8x8_t, {vabd_s8, c, d} +multi_fn = simd_cast, e +a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +b = 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10 +validate 1, 0, 1, 2, 3, 4, 5, 6 + +aarch64 = sabdl +generate int8x16_t:int8x16_t:int16x8_t + +/// Signed Absolute difference Long +name = vabdl_high +no-q +multi_fn = simd_shuffle4!, c:int16x4_t, a, a, [4, 5, 6, 7] +multi_fn = simd_shuffle4!, d:int16x4_t, b, b, [4, 5, 6, 7] +multi_fn = simd_cast, e:uint16x4_t, {vabd_s16, c, d} +multi_fn = simd_cast, e +a = 1, 2, 3, 4, 9, 10, 11, 12 +b = 10, 10, 10, 10, 10, 10, 10, 10 +validate 1, 0, 1, 2 + +aarch64 = sabdl +generate int16x8_t:int16x8_t:int32x4_t + +/// Signed Absolute difference Long +name = vabdl_high +no-q +multi_fn = simd_shuffle2!, c:int32x2_t, a, a, [2, 3] +multi_fn = simd_shuffle2!, d:int32x2_t, b, b, [2, 3] +multi_fn = simd_cast, e:uint32x2_t, {vabd_s32, c, d} +multi_fn = simd_cast, e +a = 1, 2, 3, 4 +b = 10, 10, 10, 10 +validate 7, 6 + +aarch64 = sabdl +generate int32x4_t:int32x4_t:int64x2_t + +//////////////////// +// equality 
+//////////////////// + +/// Compare bitwise Equal (vector) +name = vceq +fn = simd_eq +a = MIN, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, MAX +b = MIN, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, MAX +validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE +a = MIN, MIN, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xCC, 0x0D, 0xEE, MAX +b = MIN, MAX, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08, 0x08, 0x00, 0x0A, 0x0A, 0xCC, 0xD0, 0xEE, MIN +validate TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE + +aarch64 = cmeq +generate uint64x*_t, int64x1_t:uint64x1_t, int64x2_t:uint64x2_t, poly64x1_t:uint64x1_t, poly64x2_t:uint64x2_t + +arm = vceq. +generate uint*_t, int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, poly8x8_t:uint8x8_t, poly8x16_t:uint8x16_t + +/// Floating-point compare equal +name = vceq +fn = simd_eq +a = 1.2, 3.4, 5.6, 7.8 +b = 1.2, 3.4, 5.6, 7.8 +validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE + +aarch64 = fcmeq +generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t + +arm = vceq. +generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t + +/// Compare bitwise equal +name = vceq +multi_fn = transmute, {vceq-in_ntt-noext, {transmute, a}, {transmute, b}} +a = 1 +b = 2 +validate 0 + +aarch64 = cmp +generate i64:u64, u64 + +/// Floating-point compare equal +name = vceq +multi_fn = simd_extract, {vceq-in_ntt-noext, {vdup_n-in_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0 +a = 1. +b = 2. +validate 0 + +aarch64 = fcmp +generate f32:u32, f64:u64 + +/// Signed compare bitwise equal to zero +name = vceqz +fn = simd_eq +a = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX +fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +validate FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE + +aarch64 = cmeq +generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, int64x1_t:uint64x1_t, int64x2_t:uint64x2_t, poly8x8_t:uint8x8_t, poly8x16_t:uint8x16_t, poly64x1_t:uint64x1_t, poly64x2_t:uint64x2_t + +/// Unsigned compare bitwise equal to zero +name = vceqz +fn = simd_eq +a = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX +fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +validate TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE + +aarch64 = cmeq +generate uint*_t, uint64x*_t + +/// Floating-point compare bitwise equal to zero +name = vceqz +fn = simd_eq +a = 0.0, 1.2, 3.4, 5.6 +fixed = 0.0, 0.0, 0.0, 0.0 +validate TRUE, FALSE, FALSE, FALSE + +aarch64 = fcmeq +generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t + +/// Compare bitwise equal to zero +name = vceqz +multi_fn = transmute, {vceqz-in_ntt-noext, {transmute, a}} +a = 1 +validate 0 + +aarch64 = cmp +generate i64:u64, u64 + +/// Floating-point compare bitwise equal to zero +name = vceqz +multi_fn = simd_extract, {vceqz-in_ntt-noext, {vdup_n-in_ntt-noext, a}}, 0 +a = 1. 
+validate 0 + +aarch64 = fcmp +generate f32:u32, f64:u64 + +/// Signed compare bitwise Test bits nonzero +name = vtst +multi_fn = simd_and, c:in_t, a, b +multi_fn = fixed, d:in_t +multi_fn = simd_ne, c, transmute(d) +a = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX +b = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX +fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +validate TRUE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE + +aarch64 = cmtst +generate int64x1_t:uint64x1_t, int64x2_t:uint64x2_t, poly64x1_t:uint64x1_t, poly64x2_t:uint64x2_t + +arm = vtst +generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, poly8x8_t:uint8x8_t, poly8x16_t:uint8x16_t, poly16x4_t:uint16x4_t, poly16x8_t:uint16x8_t + +/// Unsigned compare bitwise Test bits nonzero +name = vtst +multi_fn = simd_and, c:in_t, a, b +multi_fn = fixed, d:in_t +multi_fn = simd_ne, c, transmute(d) +a = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX +b = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX +fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +validate FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE + +aarch64 = cmtst +generate uint64x*_t + +arm = vtst +generate uint*_t + +/// Compare bitwise test bits nonzero +name = vtst +multi_fn = transmute, {vtst-in_ntt-noext, {transmute, a}, {transmute, b}} +a = 0 +b = 0 +validate 0 + +aarch64 = tst +generate i64:i64:u64, u64 + +/// Signed saturating accumulate of unsigned value +name = vuqadd +out-suffix +a = 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4 +b = 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4 +validate 2, 4, 6, 8, 2, 4, 6, 8, 2, 4, 6, 8, 2, 4, 6, 8 + +aarch64 = suqadd +link-aarch64 = suqadd._EXT_ +generate i32:u32:i32, i64:u64:i64 + +/// Signed saturating accumulate of unsigned value +name = vuqadd +out-suffix +multi_fn = simd_extract, {vuqadd-out_ntt-noext, {vdup_n-out_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0 +a = 1 +b = 2 +validate 3 + +aarch64 = suqadd +generate i8:u8:i8, i16:u16:i16 + +//////////////////// +// Floating-point absolute value +//////////////////// + +/// Floating-point absolute value +name = vabs +fn = simd_fabs +a = -0.1, -2.2, -3.3, -6.6 +validate 0.1, 2.2, 3.3, 6.6 +aarch64 = fabs +generate float64x1_t:float64x1_t, float64x2_t:float64x2_t + +arm = vabs +generate float32x2_t:float32x2_t, float32x4_t:float32x4_t + +//////////////////// +// greater then +//////////////////// + +/// Compare signed greater than +name = vcgt +fn = simd_gt +a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE +aarch64 = cmgt +generate int64x1_t:uint64x1_t, int64x2_t:uint64x2_t + +arm = vcgt.s +generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t + +/// Compare unsigned highe +name = vcgt +fn = simd_gt +a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE + +aarch64 = cmhi +generate 
uint64x*_t + +arm = vcgt.s +generate uint*_t + +/// Floating-point compare greater than +name = vcgt +fn = simd_gt +a = 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8, 8.9 +b = 0.1, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8 +validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE + +aarch64 = fcmgt +generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t + +arm = vcgt.s +generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t + +/// Compare greater than +name = vcgt +multi_fn = transmute, {vcgt-in_ntt-noext, {transmute, a}, {transmute, b}} +a = 1 +b = 2 +validate 0 + +aarch64 = cmp +generate i64:u64, u64 + +/// Floating-point compare greater than +name = vcgt +multi_fn = simd_extract, {vcgt-in_ntt-noext, {vdup_n-in_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0 +a = 1. +b = 2. +validate 0 + +aarch64 = fcmp +generate f32:u32, f64:u64 + +//////////////////// +// lesser then +//////////////////// + +/// Compare signed less than +name = vclt +fn = simd_lt +a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE +aarch64 = cmgt +generate int64x1_t:uint64x1_t, int64x2_t:uint64x2_t + +arm = vcgt.s +generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t + +/// Compare unsigned less than +name = vclt +fn = simd_lt +a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE + +aarch64 = cmhi +generate uint64x*_t + +arm = vcgt.s +generate uint*_t + +/// Floating-point compare less than +name = vclt +fn = simd_lt +a = 0.1, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8 +b = 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8, 8.9 +validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE + +aarch64 = fcmgt +generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t + +arm = vcgt.s +generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t + +/// Compare less than +name = vclt +multi_fn = transmute, {vclt-in_ntt-noext, {transmute, a}, {transmute, b}} +a = 2 +b = 1 +validate 0 + +aarch64 = cmp +generate i64:u64, u64 + +/// Floating-point compare less than +name = vclt +multi_fn = simd_extract, {vclt-in_ntt-noext, {vdup_n-in_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0 +a = 2. +b = 1. +validate 0 + +aarch64 = fcmp +generate f32:u32, f64:u64 + +//////////////////// +// lesser then equals +//////////////////// + +/// Compare signed less than or equal +name = vcle +fn = simd_le +a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE + +aarch64 = cmge +generate int64x1_t:uint64x1_t, int64x2_t:uint64x2_t + +arm = vcge.s +generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t + +/// Compare greater than or equal +name = vcge +multi_fn = transmute, {vcge-in_ntt-noext, {transmute, a}, {transmute, b}} +a = 1 +b = 2 +validate 0 + +aarch64 = cmp +generate i64:u64, u64 + +/// Floating-point compare greater than or equal +name = vcge +multi_fn = simd_extract, {vcge-in_ntt-noext, {vdup_n-in_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0 +a = 1. +b = 2. 
+validate 0 + +aarch64 = fcmp +generate f32:u32, f64:u64 + +/// Compare unsigned less than or equal +name = vcle +fn = simd_le +a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE + +aarch64 = cmhs +generate uint64x*_t + +arm = vcge.s +generate uint*_t + +/// Floating-point compare less than or equal +name = vcle +fn = simd_le +a = 0.1, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8 +b = 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8, 8.9 +validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE +aarch64 = fcmge +generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t + +arm = vcge.s +generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t + +/// Compare less than or equal +name = vcle +multi_fn = transmute, {vcle-in_ntt-noext, {transmute, a}, {transmute, b}} +a = 2 +b = 1 +validate 0 + +aarch64 = cmp +generate i64:u64, u64 + +/// Floating-point compare less than or equal +name = vcle +multi_fn = simd_extract, {vcle-in_ntt-noext, {vdup_n-in_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0 +a = 2. +b = 1. +validate 0 + +aarch64 = fcmp +generate f32:u32, f64:u64 + +//////////////////// +// greater then equals +//////////////////// + +/// Compare signed greater than or equal +name = vcge +fn = simd_ge +a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE + +aarch64 = cmge +generate int64x1_t:uint64x1_t, int64x2_t:uint64x2_t + +arm = vcge.s +generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t + +/// Compare unsigned greater than or equal +name = vcge +fn = simd_ge +a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE + +aarch64 = cmhs +generate uint64x*_t + +arm = vcge.s +generate uint*_t + +/// Floating-point compare greater than or equal +name = vcge +fn = simd_ge +a = 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8, 8.9 +b = 0.1, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8 +validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE + +aarch64 = fcmge +generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t + +arm = vcge.s +generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t + +/// Compare signed greater than or equal to zero +name = vcgez +fn = simd_ge +a = MIN, -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, MAX +fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +validate FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE + +aarch64 = cmge +generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, int64x1_t:uint64x1_t, int64x2_t:uint64x2_t + +/// Floating-point compare greater than or equal to zero +name = vcgez +fn = simd_ge +a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7 +fixed = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 +validate FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE + +aarch64 = fcmge +generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t + +/// Compare signed greater than or equal to zero +name = vcgez +multi_fn = transmute, {vcgez-in_ntt-noext, {transmute, a}} +a = 
-1 +validate 0 + +aarch64 = eor +generate i64:u64 + +/// Floating-point compare greater than or equal to zero +name = vcgez +multi_fn = simd_extract, {vcgez-in_ntt-noext, {vdup_n-in_ntt-noext, a}}, 0 +a = -1. +validate 0 + +aarch64 = fcmp +generate f32:u32, f64:u64 + +/// Compare signed greater than zero +name = vcgtz +fn = simd_gt +a = MIN, -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, MAX +fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +validate FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE + +aarch64 = cmgt +generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, int64x1_t:uint64x1_t, int64x2_t:uint64x2_t + +/// Floating-point compare greater than zero +name = vcgtz +fn = simd_gt +a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7 +fixed = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 +validate FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE + +aarch64 = fcmgt +generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t + +/// Compare signed greater than zero +name = vcgtz +multi_fn = transmute, {vcgtz-in_ntt-noext, {transmute, a}} +a = -1 +validate 0 + +aarch64 = cmp +generate i64:u64 + +/// Floating-point compare greater than zero +name = vcgtz +multi_fn = simd_extract, {vcgtz-in_ntt-noext, {vdup_n-in_ntt-noext, a}}, 0 +a = -1. +validate 0 + +aarch64 = fcmp +generate f32:u32, f64:u64 + +/// Compare signed less than or equal to zero +name = vclez +fn = simd_le +a = MIN, -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, MAX +fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +validate TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE + +aarch64 = cmgt +generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, int64x1_t:uint64x1_t, int64x2_t:uint64x2_t + +/// Floating-point compare less than or equal to zero +name = vclez +fn = simd_le +a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7 +fixed = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 +validate TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE + +aarch64 = fcmle +generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t + +/// Compare less than or equal to zero +name = vclez +multi_fn = transmute, {vclez-in_ntt-noext, {transmute, a}} +a = 2 +validate 0 + +aarch64 = cmp +generate i64:u64 + +/// Floating-point compare less than or equal to zero +name = vclez +multi_fn = simd_extract, {vclez-in_ntt-noext, {vdup_n-in_ntt-noext, a}}, 0 +a = 2. 
+validate 0 + +aarch64 = fcmp +generate f32:u32, f64:u64 + +/// Compare signed less than zero +name = vcltz +fn = simd_lt +a = MIN, -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, MAX +fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +validate TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE + +aarch64 = cmlt +generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, int64x1_t:uint64x1_t, int64x2_t:uint64x2_t + +/// Floating-point compare less than zero +name = vcltz +fn = simd_lt +a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7 +fixed = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 +validate TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE + +aarch64 = fcmlt +generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t + +/// Compare less than zero +name = vcltz +multi_fn = transmute, {vcltz-in_ntt-noext, {transmute, a}} +a = 2 +validate 0 + +aarch64 = asr +generate i64:u64 + +/// Floating-point compare less than zero +name = vcltz +multi_fn = simd_extract, {vcltz-in_ntt-noext, {vdup_n-in_ntt-noext, a}}, 0 +a = 2. +validate 0 + +aarch64 = fcmp +generate f32:u32, f64:u64 + +/// Count leading sign bits +name = vcls +a = MIN, -1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, MAX +validate 0, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, 0 + +arm = vcls.s +aarch64 = cls +link-arm = vcls._EXT_ +link-aarch64 = cls._EXT_ +generate int*_t + +/// Count leading sign bits +name = vcls +multi_fn = transmute, {vcls-signed-noext, {transmute, a}} +a = MIN, MAX, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, MAX +validate BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1 + +arm = vcls +aarch64 = cls +generate uint8x8_t:int8x8_t, uint8x16_t:int8x16_t, uint16x4_t:int16x4_t, uint16x8_t:int16x8_t, uint32x2_t:int32x2_t, uint32x4_t:int32x4_t + +/// Count leading zero bits +name = vclz +multi_fn = self-signed-ext, a +a = MIN, -1, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, MAX +validate 0, 0, BITS, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, 1 + +arm = vclz. +aarch64 = clz +generate int*_t + +/// Count leading zero bits +name = vclz +multi_fn = transmute, {self-signed-ext, transmute(a)} +a = MIN, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, MAX +validate BITS, BITS, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, 0 + +arm = vclz. 
+aarch64 = clz +generate uint*_t + +/// Floating-point absolute compare greater than +name = vcagt +a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7 +b = -1.1, 0.0, 1.1, 2.4, 3.3, 4.6, 5.5, 6.8 +validate !0, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE + +aarch64 = facgt +link-aarch64 = facgt._EXT2_._EXT_ +generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t, f32:u32, f64:u64 + +arm = vacgt.s +link-arm = vacgt._EXT2_._EXT_ +generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t + +/// Floating-point absolute compare greater than or equal +name = vcage +a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7 +b = -1.1, 0.0, 1.1, 2.4, 3.3, 4.6, 5.5, 6.8 +validate !0, TRUE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE + +aarch64 = facge +link-aarch64 = facge._EXT2_._EXT_ +generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t, f32:u32, f64:u64 + +arm = vacge.s +link-arm = vacge._EXT2_._EXT_ +generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t + +/// Floating-point absolute compare less than +name = vcalt +multi_fn = vcagt-self-noext, b, a +a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7 +b = -1.1, 0.0, 1.1, 2.4, 3.3, 4.6, 5.5, 6.8 +validate 0, FALSE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE + +aarch64 = facgt +generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t, f32:u32, f64:u64 + +arm = vacgt.s +generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t + +/// Floating-point absolute compare less than or equal +name = vcale +multi_fn = vcage-self-noext , b, a +a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7 +b = -1.1, 0.0, 1.1, 2.4, 3.3, 4.6, 5.5, 6.8 +validate 0, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE + +aarch64 = facge +generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t, f32:u32, f64:u64 + +arm = vacge.s +generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t + +/// Insert vector element from another vector element +name = vcopy +lane-suffixes +constn = LANE1:LANE2 +multi_fn = static_assert_imm-in0_exp_len-LANE1 +multi_fn = static_assert_imm-in_exp_len-LANE2 +multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in0_len-LANE2} +a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +b = 0, MAX, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +n = 0:1 +validate MAX, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + +aarch64 = mov +generate int8x8_t, int8x16_t, int16x4_t, int16x8_t, int32x2_t, int32x4_t, int64x2_t +generate uint8x8_t, uint8x16_t, uint16x4_t, uint16x8_t, uint32x2_t, uint32x4_t, uint64x2_t +generate poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t, poly64x2_t + +/// Insert vector element from another vector element +name = vcopy +lane-suffixes +constn = LANE1:LANE2 +multi_fn = static_assert_imm-in0_exp_len-LANE1 +multi_fn = static_assert_imm-in_exp_len-LANE2 +multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in0_len-LANE2} +a = 1., 2., 3., 4. +b = 0., 0.5, 0., 0. +n = 0:1 +validate 0.5, 2., 3., 4. 
+ +aarch64 = mov +generate float32x2_t, float32x4_t, float64x2_t + +/// Insert vector element from another vector element +name = vcopy +lane-suffixes +constn = LANE1:LANE2 +multi_fn = static_assert_imm-in0_exp_len-LANE1 +multi_fn = static_assert_imm-in_exp_len-LANE2 +multi_fn = simd_shuffle-in_len-!, a:in_t, a, a, {asc-0-in_len} +multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in_len-LANE2} +a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +b = 0, MAX, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +n = 0:1 +validate MAX, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + +aarch64 = mov +generate int8x8_t:int8x16_t:int8x8_t, int16x4_t:int16x8_t:int16x4_t, int32x2_t:int32x4_t:int32x2_t +generate uint8x8_t:uint8x16_t:uint8x8_t, uint16x4_t:uint16x8_t:uint16x4_t, uint32x2_t:uint32x4_t:uint32x2_t +generate poly8x8_t:poly8x16_t:poly8x8_t, poly16x4_t:poly16x8_t:poly16x4_t + +/// Insert vector element from another vector element +name = vcopy +lane-suffixes +constn = LANE1:LANE2 +multi_fn = static_assert_imm-in0_exp_len-LANE1 +multi_fn = static_assert_imm-in_exp_len-LANE2 +multi_fn = simd_shuffle-in_len-!, a:in_t, a, a, {asc-0-in_len} +multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in_len-LANE2} +a = 1., 2., 3., 4. +b = 0., 0.5, 0., 0. +n = 0:1 +validate 0.5, 2., 3., 4. + +aarch64 = mov +generate float32x2_t:float32x4_t:float32x2_t + +/// Insert vector element from another vector element +name = vcopy +lane-suffixes +constn = LANE1:LANE2 +multi_fn = static_assert_imm-in0_exp_len-LANE1 +multi_fn = static_assert_imm-in_exp_len-LANE2 +multi_fn = simd_shuffle-in0_len-!, b:in_t0, b, b, {asc-0-in0_len} +multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in0_len-LANE2} +a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +b = 0, MAX, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +n = 0:1 +validate MAX, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + +aarch64 = mov +generate int8x16_t:int8x8_t:int8x16_t, int16x8_t:int16x4_t:int16x8_t, int32x4_t:int32x2_t:int32x4_t +generate uint8x16_t:uint8x8_t:uint8x16_t, uint16x8_t:uint16x4_t:uint16x8_t, uint32x4_t:uint32x2_t:uint32x4_t +generate poly8x16_t:poly8x8_t:poly8x16_t, poly16x8_t:poly16x4_t:poly16x8_t + +/// Insert vector element from another vector element +name = vcopy +lane-suffixes +constn = LANE1:LANE2 +multi_fn = static_assert_imm-in0_exp_len-LANE1 +multi_fn = static_assert_imm-in_exp_len-LANE2 +multi_fn = simd_shuffle-in0_len-!, b:in_t0, b, b, {asc-0-in0_len} +multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in0_len-LANE2} +a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +b = MAX, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +n = 1:0 +validate 1, MAX + +aarch64 = mov +generate int64x2_t:int64x1_t:int64x2_t, uint64x2_t:uint64x1_t:uint64x2_t, poly64x2_t:poly64x1_t:poly64x2_t + +/// Insert vector element from another vector element +name = vcopy +lane-suffixes +constn = LANE1:LANE2 +multi_fn = static_assert_imm-in0_exp_len-LANE1 +multi_fn = static_assert_imm-in_exp_len-LANE2 +multi_fn = simd_shuffle-in0_len-!, b:in_t0, b, b, {asc-0-in0_len} +multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in0_len-LANE2} +a = 1., 2., 3., 4. +b = 0.5, 0., 0., 0. +n = 1:0 +validate 1., 0.5, 3., 4. 
+ +aarch64 = mov +generate float32x4_t:float32x2_t:float32x4_t +aarch64 = mov +generate float64x2_t:float64x1_t:float64x2_t + +/// Insert vector element from another vector element +name = vcreate +out-suffix +multi_fn = transmute, a +a = 1 +validate 1, 0, 0, 0, 0, 0, 0, 0 + +aarch64 = nop +arm = nop +generate u64:int8x8_t, u64:int16x4_t, u64:int32x2_t, u64:int64x1_t +generate u64:uint8x8_t, u64:uint16x4_t, u64:uint32x2_t, u64:uint64x1_t +generate u64:poly8x8_t, u64:poly16x4_t +target = aes +generate u64:poly64x1_t + +/// Insert vector element from another vector element +name = vcreate +out-suffix +multi_fn = transmute, a +a = 0 +validate 0., 0. + +aarch64 = nop +generate u64:float64x1_t +arm = nop +generate u64:float32x2_t + +/// Fixed-point convert to floating-point +name = vcvt +double-suffixes +fn = simd_cast +a = 1, 2, 3, 4 +validate 1., 2., 3., 4. + +aarch64 = scvtf +generate int64x1_t:float64x1_t, int64x2_t:float64x2_t +aarch64 = ucvtf +generate uint64x1_t:float64x1_t, uint64x2_t:float64x2_t + +arm = vcvt +aarch64 = scvtf +generate int32x2_t:float32x2_t, int32x4_t:float32x4_t +aarch64 = ucvtf +generate uint32x2_t:float32x2_t, uint32x4_t:float32x4_t + +/// Floating-point convert to higher precision long +name = vcvt +double-suffixes +fn = simd_cast +a = -1.2, 1.2 +validate -1.2f32 as f64, 1.2f32 as f64 + +aarch64 = fcvtl +generate float32x2_t:float64x2_t + +/// Floating-point convert to higher precision long +name = vcvt_high +noq-double-suffixes +multi_fn = simd_shuffle2!, b:float32x2_t, a, a, [2, 3] +multi_fn = simd_cast, b +a = -1.2, 1.2, 2.3, 3.4 +validate 2.3f32 as f64, 3.4f32 as f64 + +aarch64 = fcvtl +generate float32x4_t:float64x2_t + +/// Floating-point convert to lower precision narrow +name = vcvt +double-suffixes +fn = simd_cast +a = -1.2, 1.2 +validate -1.2f64 as f32, 1.2f64 as f32 + +aarch64 = fcvtn +generate float64x2_t:float32x2_t + +/// Floating-point convert to lower precision narrow +name = vcvt_high +noq-double-suffixes +multi_fn = simd_shuffle4!, a, {simd_cast, b}, [0, 1, 2, 3] +a = -1.2, 1.2 +b = -2.3, 3.4 +validate -1.2, 1.2, -2.3f64 as f32, 3.4f64 as f32 + +aarch64 = fcvtn +generate float32x2_t:float64x2_t:float32x4_t + +/// Floating-point convert to lower precision narrow, rounding to odd +name = vcvtx +double-suffixes +a = -1.0, 2.0 +validate -1.0, 2.0 + +aarch64 = fcvtxn +link-aarch64 = fcvtxn._EXT2_._EXT_ +generate float64x2_t:float32x2_t + +/// Floating-point convert to lower precision narrow, rounding to odd +name = vcvtx +double-suffixes +multi_fn = simd_extract, {vcvtx-_f32_f64-noext, {vdupq_n-in_ntt-noext, a}}, 0 +a = -1.0 +validate -1.0 + +aarch64 = fcvtxn +generate f64:f32 + +/// Floating-point convert to lower precision narrow, rounding to odd +name = vcvtx_high +noq-double-suffixes +multi_fn = simd_shuffle4!, a, {vcvtx-noq_doubleself-noext, b}, [0, 1, 2, 3] +a = -1.0, 2.0 +b = -3.0, 4.0 +validate -1.0, 2.0, -3.0, 4.0 + +aarch64 = fcvtxn +generate float32x2_t:float64x2_t:float32x4_t + +/// Fixed-point convert to floating-point +name = vcvt +double-n-suffixes +constn = N +multi_fn = static_assert-N-1-bits +a = 1, 2, 3, 4 +n = 2 +validate 0.25, 0.5, 0.75, 1. 
+arm-aarch64-separate + +aarch64 = scvtf +link-aarch64 = vcvtfxs2fp._EXT2_._EXT_ +const-aarch64 = N +generate int64x1_t:float64x1_t, int64x2_t:float64x2_t, i32:f32, i64:f64 + +aarch64 = ucvtf +link-aarch64 = vcvtfxu2fp._EXT2_._EXT_ +const-aarch64 = N +generate uint64x1_t:float64x1_t, uint64x2_t:float64x2_t, u32:f32, u64:f64 + +aarch64 = scvtf +link-aarch64 = vcvtfxs2fp._EXT2_._EXT_ +arm = vcvt +link-arm = vcvtfxs2fp._EXT2_._EXT_ +const-arm = N:i32 + +generate int32x2_t:float32x2_t, int32x4_t:float32x4_t + +aarch64 = ucvtf +link-aarch64 = vcvtfxu2fp._EXT2_._EXT_ +arm = vcvt +link-arm = vcvtfxu2fp._EXT2_._EXT_ +const-arm = N:i32 +generate uint32x2_t:float32x2_t, uint32x4_t:float32x4_t + +/// Floating-point convert to fixed-point, rounding toward zero +name = vcvt +double-n-suffixes +constn = N +multi_fn = static_assert-N-1-bits +a = 0.25, 0.5, 0.75, 1. +n = 2 +validate 1, 2, 3, 4 +arm-aarch64-separate + +aarch64 = fcvtzs +link-aarch64 = vcvtfp2fxs._EXT2_._EXT_ +const-aarch64 = N +generate float64x1_t:int64x1_t, float64x2_t:int64x2_t, f32:i32, f64:i64 + +aarch64 = fcvtzu +link-aarch64 = vcvtfp2fxu._EXT2_._EXT_ +const-aarch64 = N +generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t, f32:u32, f64:u64 + +aarch64 = fcvtzs +link-aarch64 = vcvtfp2fxs._EXT2_._EXT_ +arm = vcvt +link-arm = vcvtfp2fxs._EXT2_._EXT_ +const-arm = N:i32 +generate float32x2_t:int32x2_t, float32x4_t:int32x4_t + +aarch64 = fcvtzu +link-aarch64 = vcvtfp2fxu._EXT2_._EXT_ +arm = vcvt +link-arm = vcvtfp2fxu._EXT2_._EXT_ +const-arm = N:i32 +generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t + +/// Fixed-point convert to floating-point +name = vcvt +double-suffixes +multi_fn = a as out_t +a = 1 +validate 1. + +aarch64 = scvtf +generate i32:f32, i64:f64 +aarch64 = ucvtf +generate u32:f32, u64:f64 + +/// Fixed-point convert to floating-point +name = vcvt +double-suffixes +multi_fn = a as out_t +a = 1. 
+validate 1 + +aarch64 = fcvtzs +generate f32:i32, f64:i64 +aarch64 = fcvtzu +generate f32:u32, f64:u64 + +/// Floating-point convert to signed fixed-point, rounding toward zero +name = vcvt +double-suffixes +link-aarch64 = llvm.fptosi.sat._EXT2_._EXT_ +a = -1.1, 2.1, -2.9, 3.9 +validate -1, 2, -2, 3 + +aarch64 = fcvtzs +generate float64x1_t:int64x1_t, float64x2_t:int64x2_t + +link-arm = llvm.fptosi.sat._EXT2_._EXT_ +arm = vcvt +generate float32x2_t:int32x2_t, float32x4_t:int32x4_t + +/// Floating-point convert to unsigned fixed-point, rounding toward zero +name = vcvt +double-suffixes +link-aarch64 = llvm.fptoui.sat._EXT2_._EXT_ +a = 1.1, 2.1, 2.9, 3.9 +validate 1, 2, 2, 3 + +aarch64 = fcvtzu +generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t + +link-arm = llvm.fptoui.sat._EXT2_._EXT_ +arm = vcvt +generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t + +/// Floating-point convert to signed integer, rounding to nearest with ties to away +name = vcvta +double-suffixes +a = -1.1, 2.1, -2.9, 3.9 +validate -1, 2, -3, 4 + +aarch64 = fcvtas +link-aarch64 = fcvtas._EXT2_._EXT_ +generate float32x2_t:int32x2_t, float32x4_t:int32x4_t, float64x1_t:int64x1_t, float64x2_t:int64x2_t + +/// Floating-point convert to integer, rounding to nearest with ties to away +name = vcvta +double-suffixes +a = 2.9 +validate 3 + +aarch64 = fcvtas +link-aarch64 = fcvtas._EXT2_._EXT_ +generate f32:i32, f64:i64 + +aarch64 = fcvtau +link-aarch64 = fcvtau._EXT2_._EXT_ +generate f32:u32, f64:u64 + +/// Floating-point convert to signed integer, rounding to nearest with ties to even +name = vcvtn +double-suffixes +a = -1.5, 2.1, -2.9, 3.9 +validate -2, 2, -3, 4 + +aarch64 = fcvtns +link-aarch64 = fcvtns._EXT2_._EXT_ +generate float32x2_t:int32x2_t, float32x4_t:int32x4_t, float64x1_t:int64x1_t, float64x2_t:int64x2_t, f32:i32, f64:i64 + +/// Floating-point convert to signed integer, rounding toward minus infinity +name = vcvtm +double-suffixes +a = -1.1, 2.1, -2.9, 3.9 +validate -2, 2, -3, 3 + +aarch64 = fcvtms +link-aarch64 = fcvtms._EXT2_._EXT_ +generate float32x2_t:int32x2_t, float32x4_t:int32x4_t, float64x1_t:int64x1_t, float64x2_t:int64x2_t, f32:i32, f64:i64 + +/// Floating-point convert to signed integer, rounding toward plus infinity +name = vcvtp +double-suffixes +a = -1.1, 2.1, -2.9, 3.9 +validate -1, 3, -2, 4 + +aarch64 = fcvtps +link-aarch64 = fcvtps._EXT2_._EXT_ +generate float32x2_t:int32x2_t, float32x4_t:int32x4_t, float64x1_t:int64x1_t, float64x2_t:int64x2_t, f32:i32, f64:i64 + +/// Floating-point convert to unsigned integer, rounding to nearest with ties to away +name = vcvta +double-suffixes +a = 1.1, 2.1, 2.9, 3.9 +validate 1, 2, 3, 4 + +aarch64 = fcvtau +link-aarch64 = fcvtau._EXT2_._EXT_ +generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t + +/// Floating-point convert to unsigned integer, rounding to nearest with ties to even +name = vcvtn +double-suffixes +a = 1.5, 2.1, 2.9, 3.9 +validate 2, 2, 3, 4 + +aarch64 = fcvtnu +link-aarch64 = fcvtnu._EXT2_._EXT_ +generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t, f32:u32, f64:u64 + +/// Floating-point convert to unsigned integer, rounding toward minus infinity +name = vcvtm +double-suffixes +a = 1.1, 2.1, 2.9, 3.9 +validate 1, 2, 2, 3 + +aarch64 = fcvtmu +link-aarch64 = fcvtmu._EXT2_._EXT_ +generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t, f32:u32, f64:u64 + +/// Floating-point convert to unsigned 
integer, rounding toward plus infinity +name = vcvtp +double-suffixes +a = 1.1, 2.1, 2.9, 3.9 +validate 2, 3, 3, 4 + +aarch64 = fcvtpu +link-aarch64 = fcvtpu._EXT2_._EXT_ +generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t, f32:u32, f64:u64 + +/// Set all vector lanes to the same value +name = vdup +lane-suffixes +constn = N +multi_fn = static_assert_imm-in_exp_len-N +multi_fn = simd_shuffle-out_len-!, a, a, {dup-out_len-N as u32} +a = 1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16 +n = HFLEN +validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 + +aarch64 = dup +generate poly64x2_t, poly64x1_t:poly64x2_t + +arm = vdup.l +generate int*_t +generate int8x16_t:int8x8_t, int16x8_t:int16x4_t, int32x4_t:int32x2_t +generate int8x8_t:int8x16_t, int16x4_t:int16x8_t, int32x2_t:int32x4_t + +generate uint*_t +generate uint8x16_t:uint8x8_t, uint16x8_t:uint16x4_t, uint32x4_t:uint32x2_t +generate uint8x8_t:uint8x16_t, uint16x4_t:uint16x8_t, uint32x2_t:uint32x4_t + +generate poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t +generate poly8x16_t:poly8x8_t, poly16x8_t:poly16x4_t +generate poly8x8_t:poly8x16_t, poly16x4_t:poly16x8_t + +/// Set all vector lanes to the same value +name = vdup +lane-suffixes +constn = N +multi_fn = static_assert_imm-in_exp_len-N +multi_fn = simd_shuffle-out_len-!, a, a, {dup-out_len-N as u32} +a = 1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16 +n = HFLEN +validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 + +aarch64 = dup +arm = vmov +generate int64x2_t, int64x1_t:int64x2_t, uint64x2_t, uint64x1_t:uint64x2_t + +/// Set all vector lanes to the same value +name = vdup +lane-suffixes +constn = N +multi_fn = static_assert_imm-in_exp_len-N +multi_fn = simd_shuffle-out_len-!, a, a, {dup-out_len-N as u32} +a = 1., 1., 1., 4. +n = HFLEN +validate 1., 1., 1., 1. + +aarch64 = dup +generate float64x2_t, float64x1_t:float64x2_t + +arm = vdup.l +generate float*_t, float32x4_t:float32x2_t, float32x2_t:float32x4_t + +/// Set all vector lanes to the same value +name = vdup +lane-suffixes +constn = N +multi_fn = static_assert_imm-in_exp_len-N +multi_fn = a +a = 0 +n = HFLEN +validate 0 + +aarch64 = nop +generate poly64x1_t + +arm = nop +generate int64x1_t, uint64x1_t + +/// Set all vector lanes to the same value +name = vdup +lane-suffixes +constn = N +multi_fn = static_assert_imm-in_exp_len-N +multi_fn = a +a = 0. +n = HFLEN +validate 0. + +aarch64 = nop +generate float64x1_t + +/// Set all vector lanes to the same value +name = vdup +lane-suffixes +constn = N +multi_fn = static_assert_imm-in_exp_len-N +multi_fn = transmute--<element_t _>, {simd_extract, a, N as u32} +a = 0, 1 +n = HFLEN +validate 1 + +aarch64 = nop +generate poly64x2_t:poly64x1_t + +arm = vmov +generate int64x2_t:int64x1_t, uint64x2_t:uint64x1_t + +/// Set all vector lanes to the same value +name = vdup +lane-suffixes +constn = N +multi_fn = static_assert_imm-in_exp_len-N +multi_fn = transmute--<element_t _>, {simd_extract, a, N as u32} +a = 0., 1. +n = HFLEN +validate 1. 
+ +aarch64 = nop +generate float64x2_t:float64x1_t + +/// Set all vector lanes to the same value +name = vdup +lane-suffixes +constn = N +multi_fn = static_assert_imm-in_exp_len-N +multi_fn = simd_extract, a, N as u32 +a = 1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16 +n = HFLEN +validate 1 + +aarch64 = nop +generate int8x8_t:i8, int8x16_t:i8, int16x4_t:i16, int16x8_t:i16, int32x2_t:i32, int32x4_t:i32, int64x1_t:i64, int64x2_t:i64 +generate uint8x8_t:u8, uint8x16_t:u8, uint16x4_t:u16, uint16x8_t:u16, uint32x2_t:u32, uint32x4_t:u32, uint64x1_t:u64, uint64x2_t:u64 +generate poly8x8_t:p8, poly8x16_t:p8, poly16x4_t:p16, poly16x8_t:p16 + +/// Set all vector lanes to the same value +name = vdup +lane-suffixes +constn = N +multi_fn = static_assert_imm-in_exp_len-N +multi_fn = simd_extract, a, N as u32 +a = 1., 1., 1., 4. +n = HFLEN +validate 1. + +aarch64 = nop +generate float32x2_t:f32, float32x4_t:f32, float64x1_t:f64, float64x2_t:f64 + +/// Extract vector from pair of vectors +name = vext +constn = N +multi_fn = static_assert_imm-out_exp_len-N +multi_fn = matchn-out_exp_len-N, simd_shuffle-out_len-!, a, b, {asc-n-out_len} +a = 0, 8, 8, 9, 8, 9, 9, 11, 8, 9, 9, 11, 9, 11, 14, 15 +b = 9, 11, 14, 15, 16, 17, 18, 19, 0, 8, 8, 9, 8, 9, 9, 11 +n = HFLEN +validate 8, 9, 9, 11, 9, 11, 14, 15, 9, 11, 14, 15, 16, 17, 18, 19 + +arm = "vext.8" +aarch64 = ext +generate int*_t, uint*_t, poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t + +/// Extract vector from pair of vectors +name = vext +constn = N +multi_fn = static_assert_imm-out_exp_len-N +multi_fn = matchn-out_exp_len-N, simd_shuffle-out_len-!, a, b, {asc-n-out_len} +a = 0, 8, 8, 9, 8, 9, 9, 11, 8, 9, 9, 11, 9, 11, 14, 15 +b = 9, 11, 14, 15, 16, 17, 18, 19, 0, 8, 8, 9, 8, 9, 9, 11 +n = HFLEN +validate 8, 9, 9, 11, 9, 11, 14, 15, 9, 11, 14, 15, 16, 17, 18, 19 + +aarch64 = ext +generate poly64x2_t + +arm = vmov +generate int64x2_t, uint64x2_t + +/// Extract vector from pair of vectors +name = vext +constn = N +multi_fn = static_assert_imm-out_exp_len-N +multi_fn = matchn-out_exp_len-N, simd_shuffle-out_len-!, a, b, {asc-n-out_len} +a = 0., 2., 2., 3. +b = 3., 4., 5., 6., +n = HFLEN +validate 2., 3., 3., 4. + +aarch64 = ext +generate float64x2_t + +arm = "vext.8" +generate float*_t + +/// Multiply-add to accumulator +name = vmla +multi_fn = simd_add, a, {simd_mul, b, c} +a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 +validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 + +arm = vmla. +aarch64 = mla +generate int*_t, uint*_t + +/// Floating-point multiply-add to accumulator +name = vmla +multi_fn = simd_add, a, {simd_mul, b, c} +a = 0., 1., 2., 3. +b = 2., 2., 2., 2. +c = 3., 3., 3., 3. +validate 6., 7., 8., 9. + +aarch64 = fmul +generate float64x*_t + +arm = vmla. +generate float*_t + +/// Vector multiply accumulate with scalar +name = vmla +n-suffix +multi_fn = vmla-self-noext, a, b, {vdup-nself-noext, c} +a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +c = 3 +validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 + +aarch64 = mla +arm = vmla. 
+generate int16x4_t:int16x4_t:i16:int16x4_t, int16x8_t:int16x8_t:i16:int16x8_t, int32x2_t:int32x2_t:i32:int32x2_t, int32x4_t:int32x4_t:i32:int32x4_t +generate uint16x4_t:uint16x4_t:u16:uint16x4_t, uint16x8_t:uint16x8_t:u16:uint16x8_t, uint32x2_t:uint32x2_t:u32:uint32x2_t, uint32x4_t:uint32x4_t:u32:uint32x4_t + +/// Vector multiply accumulate with scalar +name = vmla +n-suffix +multi_fn = vmla-self-noext, a, b, {vdup-nself-noext, c} +a = 0., 1., 2., 3. +b = 2., 2., 2., 2. +c = 3. +validate 6., 7., 8., 9. + +aarch64 = fmul +arm = vmla. +generate float32x2_t:float32x2_t:f32:float32x2_t, float32x4_t:float32x4_t:f32:float32x4_t + +/// Vector multiply accumulate with scalar +name = vmla +in2-lane-suffixes +constn = LANE +multi_fn = static_assert_imm-in2_exp_len-LANE +multi_fn = vmla-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}} +a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +c = 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +n = 1 +validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 + +aarch64 = mla +arm = vmla. +generate int16x4_t, int16x4_t:int16x4_t:int16x8_t:int16x4_t, int16x8_t:int16x8_t:int16x4_t:int16x8_t, int16x8_t +generate int32x2_t, int32x2_t:int32x2_t:int32x4_t:int32x2_t, int32x4_t:int32x4_t:int32x2_t:int32x4_t, int32x4_t +generate uint16x4_t, uint16x4_t:uint16x4_t:uint16x8_t:uint16x4_t, uint16x8_t:uint16x8_t:uint16x4_t:uint16x8_t, uint16x8_t +generate uint32x2_t, uint32x2_t:uint32x2_t:uint32x4_t:uint32x2_t, uint32x4_t:uint32x4_t:uint32x2_t:uint32x4_t, uint32x4_t + +/// Vector multiply accumulate with scalar +name = vmla +in2-lane-suffixes +constn = LANE +multi_fn = static_assert_imm-in2_exp_len-LANE +multi_fn = vmla-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}} +a = 0., 1., 2., 3. +b = 2., 2., 2., 2. +c = 0., 3., 0., 0. +n = 1 +validate 6., 7., 8., 9. + +aarch64 = fmul +arm = vmla. 
+generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t + +/// Signed multiply-add long +name = vmlal +multi_fn = simd_add, a, {vmull-self-noext, b, c} +a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 +validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 + +arm = vmlal.s +aarch64 = smlal +generate int16x8_t:int8x8_t:int8x8_t:int16x8_t, int32x4_t:int16x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int32x2_t:int64x2_t + +/// Unsigned multiply-add long +name = vmlal +multi_fn = simd_add, a, {vmull-self-noext, b, c} +a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 +validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 + +arm = vmlal.s +aarch64 = umlal +generate uint16x8_t:uint8x8_t:uint8x8_t:uint16x8_t, uint32x4_t:uint16x4_t:uint16x4_t:uint32x4_t, uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t + +/// Vector widening multiply accumulate with scalar +name = vmlal +n-suffix +multi_fn = vmlal-self-noext, a, b, {vdup-nself-noext, c} +a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +c = 3 +validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 + +arm = vmlal.s +aarch64 = smlal +generate int32x4_t:int16x4_t:i16:int32x4_t, int64x2_t:int32x2_t:i32:int64x2_t +aarch64 = umlal +generate uint32x4_t:uint16x4_t:u16:uint32x4_t, uint64x2_t:uint32x2_t:u32:uint64x2_t + +/// Vector widening multiply accumulate with scalar +name = vmlal_lane +in2-suffix +constn = LANE +multi_fn = static_assert_imm-in2_exp_len-LANE +multi_fn = vmlal-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}} +a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +c = 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +n = 1 +validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 + +arm = vmlal.s +aarch64 = smlal +generate int32x4_t:int16x4_t:int16x4_t:int32x4_t, int32x4_t:int16x4_t:int16x8_t:int32x4_t +generate int64x2_t:int32x2_t:int32x2_t:int64x2_t, int64x2_t:int32x2_t:int32x4_t:int64x2_t +aarch64 = umlal +generate uint32x4_t:uint16x4_t:uint16x4_t:uint32x4_t, uint32x4_t:uint16x4_t:uint16x8_t:uint32x4_t +generate uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t, uint64x2_t:uint32x2_t:uint32x4_t:uint64x2_t + +/// Signed multiply-add long +name = vmlal_high +no-q +multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right} +multi_fn = simd_shuffle-out_len-!, c:half, c, c, {fixed-half-right} +multi_fn = vmlal-noqself-noext, a, b, c +a = 8, 7, 6, 5, 4, 3, 2, 1 +b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +c = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7 +fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +validate 8, 9, 10, 11, 12, 13, 14, 15 + +aarch64 = smlal2 +generate int16x8_t:int8x16_t:int8x16_t:int16x8_t, int32x4_t:int16x8_t:int16x8_t:int32x4_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t + +/// Unsigned multiply-add long +name = vmlal_high +no-q +multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right} +multi_fn = simd_shuffle-out_len-!, c:half, c, c, {fixed-half-right} +multi_fn = vmlal-noqself-noext, a, b, c +a = 8, 7, 6, 5, 4, 3, 2, 1 +b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +c = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 
3, 4, 5, 6, 7 +fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +validate 8, 9, 10, 11, 12, 13, 14, 15 + +aarch64 = umlal2 +generate uint16x8_t:uint8x16_t:uint8x16_t:uint16x8_t, uint32x4_t:uint16x8_t:uint16x8_t:uint32x4_t, uint64x2_t:uint32x4_t:uint32x4_t:uint64x2_t + +/// Multiply-add long +name = vmlal_high_n +no-q +multi_fn = vmlal_high-noqself-noext, a, b, {vdupq_n-noqself-noext, c} +a = 8, 7, 6, 5, 4, 3, 2, 1 +b = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7 +c = 2 +validate 8, 9, 10, 11, 12, 13, 14, 15 + +aarch64 = smlal2 +generate int32x4_t:int16x8_t:i16:int32x4_t, int64x2_t:int32x4_t:i32:int64x2_t +aarch64 = umlal2 +generate uint32x4_t:uint16x8_t:u16:uint32x4_t, uint64x2_t:uint32x4_t:u32:uint64x2_t + +/// Multiply-add long +name = vmlal_high_lane +in2-suffix +constn = LANE +multi_fn = static_assert_imm-in2_exp_len-LANE +multi_fn = vmlal_high-noqself-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}} +a = 8, 7, 6, 5, 4, 3, 2, 1 +b = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7 +c = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +n = 1 +validate 8, 9, 10, 11, 12, 13, 14, 15 + +aarch64 = smlal2 +generate int32x4_t:int16x8_t:int16x4_t:int32x4_t, int32x4_t:int16x8_t:int16x8_t:int32x4_t +generate int64x2_t:int32x4_t:int32x2_t:int64x2_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t +aarch64 = umlal2 +generate uint32x4_t:uint16x8_t:uint16x4_t:uint32x4_t, uint32x4_t:uint16x8_t:uint16x8_t:uint32x4_t +generate uint64x2_t:uint32x4_t:uint32x2_t:uint64x2_t, uint64x2_t:uint32x4_t:uint32x4_t:uint64x2_t + +/// Multiply-subtract from accumulator +name = vmls +multi_fn = simd_sub, a, {simd_mul, b, c} +a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 +b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 +validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + +arm = vmls. +aarch64 = mls +generate int*_t, uint*_t + +/// Floating-point multiply-subtract from accumulator +name = vmls +multi_fn = simd_sub, a, {simd_mul, b, c} +a = 6., 7., 8., 9. +b = 2., 2., 2., 2. +c = 3., 3., 3., 3. +validate 0., 1., 2., 3. + +aarch64 = fmul +generate float64x*_t + +arm = vmls. +generate float*_t + +/// Vector multiply subtract with scalar +name = vmls +n-suffix +multi_fn = vmls-self-noext, a, b, {vdup-nself-noext, c} +a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 +b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +c = 3 +validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + +aarch64 = mls +arm = vmls. +generate int16x4_t:int16x4_t:i16:int16x4_t, int16x8_t:int16x8_t:i16:int16x8_t, int32x2_t:int32x2_t:i32:int32x2_t, int32x4_t:int32x4_t:i32:int32x4_t +generate uint16x4_t:uint16x4_t:u16:uint16x4_t, uint16x8_t:uint16x8_t:u16:uint16x8_t, uint32x2_t:uint32x2_t:u32:uint32x2_t, uint32x4_t:uint32x4_t:u32:uint32x4_t + +/// Vector multiply subtract with scalar +name = vmls +n-suffix +multi_fn = vmls-self-noext, a, b, {vdup-nself-noext, c} +a = 6., 7., 8., 9. +b = 2., 2., 2., 2. +c = 3. +validate 0., 1., 2., 3. + +aarch64 = fmul +arm = vmls. 
+generate float32x2_t:float32x2_t:f32:float32x2_t, float32x4_t:float32x4_t:f32:float32x4_t + +/// Vector multiply subtract with scalar +name = vmls +in2-lane-suffixes +constn = LANE +multi_fn = static_assert_imm-in2_exp_len-LANE +multi_fn = vmls-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}} +a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 +b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +c = 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +n = 1 +validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + +aarch64 = mls +arm = vmls. +generate int16x4_t, int16x4_t:int16x4_t:int16x8_t:int16x4_t, int16x8_t:int16x8_t:int16x4_t:int16x8_t, int16x8_t +generate int32x2_t, int32x2_t:int32x2_t:int32x4_t:int32x2_t, int32x4_t:int32x4_t:int32x2_t:int32x4_t, int32x4_t +generate uint16x4_t, uint16x4_t:uint16x4_t:uint16x8_t:uint16x4_t, uint16x8_t:uint16x8_t:uint16x4_t:uint16x8_t, uint16x8_t +generate uint32x2_t, uint32x2_t:uint32x2_t:uint32x4_t:uint32x2_t, uint32x4_t:uint32x4_t:uint32x2_t:uint32x4_t, uint32x4_t + +/// Vector multiply subtract with scalar +name = vmls +in2-lane-suffixes +constn = LANE +multi_fn = static_assert_imm-in2_exp_len-LANE +multi_fn = vmls-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}} +a = 6., 7., 8., 9. +b = 2., 2., 2., 2. +c = 0., 3., 0., 0. +n = 1 +validate 0., 1., 2., 3. + +aarch64 = fmul +arm = vmls. +generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t + +/// Signed multiply-subtract long +name = vmlsl +multi_fn = simd_sub, a, {vmull-self-noext, b, c} +a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 +b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 +validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + +arm = vmlsl.s +aarch64 = smlsl +generate int16x8_t:int8x8_t:int8x8_t:int16x8_t, int32x4_t:int16x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int32x2_t:int64x2_t + +/// Unsigned multiply-subtract long +name = vmlsl +multi_fn = simd_sub, a, {vmull-self-noext, b, c} +a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 +b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 +validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + +arm = vmlsl.s +aarch64 = umlsl +generate uint16x8_t:uint8x8_t:uint8x8_t:uint16x8_t, uint32x4_t:uint16x4_t:uint16x4_t:uint32x4_t, uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t + +/// Vector widening multiply subtract with scalar +name = vmlsl +n-suffix +multi_fn = vmlsl-self-noext, a, b, {vdup-nself-noext, c} +a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 +b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +c = 3 +validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + +arm = vmlsl.s +aarch64 = smlsl +generate int32x4_t:int16x4_t:i16:int32x4_t, int64x2_t:int32x2_t:i32:int64x2_t +aarch64 = umlsl +generate uint32x4_t:uint16x4_t:u16:uint32x4_t, uint64x2_t:uint32x2_t:u32:uint64x2_t + +/// Vector widening multiply subtract with scalar +name = vmlsl_lane +in2-suffix +constn = LANE +multi_fn = static_assert_imm-in2_exp_len-LANE +multi_fn = vmlsl-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}} +a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 +b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +c = 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +n = 1 +validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 
13, 14, 15 + +arm = vmlsl.s +aarch64 = smlsl +generate int32x4_t:int16x4_t:int16x4_t:int32x4_t, int32x4_t:int16x4_t:int16x8_t:int32x4_t +generate int64x2_t:int32x2_t:int32x2_t:int64x2_t, int64x2_t:int32x2_t:int32x4_t:int64x2_t +aarch64 = umlsl +generate uint32x4_t:uint16x4_t:uint16x4_t:uint32x4_t, uint32x4_t:uint16x4_t:uint16x8_t:uint32x4_t +generate uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t, uint64x2_t:uint32x2_t:uint32x4_t:uint64x2_t + +/// Signed multiply-subtract long +name = vmlsl_high +no-q +multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right} +multi_fn = simd_shuffle-out_len-!, c:half, c, c, {fixed-half-right} +multi_fn = vmlsl-noqself-noext, a, b, c +a = 14, 15, 16, 17, 18, 19, 20, 21 +b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +c = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7 +fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +validate 14, 13, 12, 11, 10, 9, 8, 7 + +aarch64 = smlsl2 +generate int16x8_t:int8x16_t:int8x16_t:int16x8_t, int32x4_t:int16x8_t:int16x8_t:int32x4_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t + +/// Unsigned multiply-subtract long +name = vmlsl_high +no-q +multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right} +multi_fn = simd_shuffle-out_len-!, c:half, c, c, {fixed-half-right} +multi_fn = vmlsl-noqself-noext, a, b, c +a = 14, 15, 16, 17, 18, 19, 20, 21 +b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +c = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7 +fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +validate 14, 13, 12, 11, 10, 9, 8, 7 + +aarch64 = umlsl2 +generate uint16x8_t:uint8x16_t:uint8x16_t:uint16x8_t, uint32x4_t:uint16x8_t:uint16x8_t:uint32x4_t, uint64x2_t:uint32x4_t:uint32x4_t:uint64x2_t + +/// Multiply-subtract long +name = vmlsl_high_n +no-q +multi_fn = vmlsl_high-noqself-noext, a, b, {vdupq_n-noqself-noext, c} +a = 14, 15, 16, 17, 18, 19, 20, 21 +b = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7 +c = 2 +validate 14, 13, 12, 11, 10, 9, 8, 7 + +aarch64 = smlsl2 +generate int32x4_t:int16x8_t:i16:int32x4_t, int64x2_t:int32x4_t:i32:int64x2_t +aarch64 = umlsl2 +generate uint32x4_t:uint16x8_t:u16:uint32x4_t, uint64x2_t:uint32x4_t:u32:uint64x2_t + +/// Multiply-subtract long +name = vmlsl_high_lane +in2-suffix +constn = LANE +multi_fn = static_assert_imm-in2_exp_len-LANE +multi_fn = vmlsl_high-noqself-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}} +a = 14, 15, 16, 17, 18, 19, 20, 21 +b = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7 +c = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +n = 1 +validate 14, 13, 12, 11, 10, 9, 8, 7 + +aarch64 = smlsl2 +generate int32x4_t:int16x8_t:int16x4_t:int32x4_t, int32x4_t:int16x8_t:int16x8_t:int32x4_t +generate int64x2_t:int32x4_t:int32x2_t:int64x2_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t +aarch64 = umlsl2 +generate uint32x4_t:uint16x8_t:uint16x4_t:uint32x4_t, uint32x4_t:uint16x8_t:uint16x8_t:uint32x4_t +generate uint64x2_t:uint32x4_t:uint32x2_t:uint64x2_t, uint64x2_t:uint32x4_t:uint32x4_t:uint64x2_t + +/// Extract narrow +name = vmovn_high +no-q +multi_fn = simd_cast, c:in_t0, b +multi_fn = simd_shuffle-out_len-!, a, c, {asc-0-out_len} +a = 0, 1, 2, 3, 2, 3, 4, 5 +b = 2, 3, 4, 5, 12, 13, 14, 15 +validate 0, 1, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 12, 13, 14, 15 + +aarch64 = xtn2 +generate int8x8_t:int16x8_t:int8x16_t, int16x4_t:int32x4_t:int16x8_t, int32x2_t:int64x2_t:int32x4_t +generate uint8x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint16x8_t, uint32x2_t:uint64x2_t:uint32x4_t + +/// Negate +name = vneg +fn = simd_neg +a = 0, 
1, -1, 2, -2, 3, -3, 4, -4, 5, -5, 6, -6, 7, -7, 8 +validate 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7, -8 + +aarch64 = neg +generate int64x*_t + +arm = vneg.s +generate int*_t + +/// Negate +name = vneg +multi_fn = a.wrapping_neg() +a = 1 +validate -1 + +aarch64 = neg +generate i64 + +/// Negate +name = vneg +fn = simd_neg +a = 0., 1., -1., 2., -2., 3., -3., 4. +validate 0., -1., 1., -2., 2., -3., 3., -4. + +aarch64 = fneg +generate float64x*_t + +arm = vneg.s +generate float*_t + +/// Signed saturating negate +name = vqneg +a = MIN, 0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, 6, -6, 7, -7 +validate MAX, 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7 +link-arm = vqneg._EXT_ +link-aarch64 = sqneg._EXT_ + +aarch64 = sqneg +generate int64x*_t + +arm = vqneg.s +generate int*_t + +/// Signed saturating negate +name = vqneg +multi_fn = simd_extract, {vqneg-in_ntt-noext, {vdup_n-in_ntt-noext, a}}, 0 +a = 1 +validate -1 + +aarch64 = sqneg +generate i8, i16, i32, i64 + +/// Saturating subtract +name = vqsub +a = 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42 +b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +validate 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26 + +arm = vqsub.s +aarch64 = uqsub +link-arm = llvm.usub.sat._EXT_ +link-aarch64 = uqsub._EXT_ +generate uint*_t, uint64x*_t + +arm = vqsub.s +aarch64 = sqsub +link-arm = llvm.ssub.sat._EXT_ +link-aarch64 = sqsub._EXT_ +generate int*_t, int64x*_t + +/// Saturating subtract +name = vqsub +multi_fn = vdup_n-in_ntt-noext, a:in_ntt, a +multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b +multi_fn = simd_extract, {vqsub-in_ntt-noext, a, b}, 0 +a = 42 +b = 1 +validate 41 + +aarch64 = sqsub +generate i8, i16 +aarch64 = uqsub +generate u8, u16 + +/// Saturating subtract +name = vqsub +a = 42 +b = 1 +validate 41 + +aarch64 = uqsub +link-aarch64 = uqsub._EXT_ +generate u32, u64 + +aarch64 = sqsub +link-aarch64 = sqsub._EXT_ +generate i32, i64 + +/// Halving add +name = vhadd +a = 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42 +b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +validate 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29 + +arm = vhadd.s +aarch64 = uhadd +link-aarch64 = uhadd._EXT_ +link-arm = vhaddu._EXT_ +generate uint*_t + +arm = vhadd.s +aarch64 = shadd +link-aarch64 = shadd._EXT_ +link-arm = vhadds._EXT_ +generate int*_t + +/// Reverse bit order +name = vrbit +a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 +validate 0, 64, 32, 96, 16, 80, 48, 112, 8, 72, 40, 104, 24, 88, 56, 120 + +aarch64 = rbit +link-aarch64 = rbit._EXT_ + +generate int8x8_t, int8x16_t + +/// Reverse bit order +name = vrbit +multi_fn = transmute, {vrbit-signed-noext, transmute(a)} +a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 +validate 0, 64, 32, 96, 16, 80, 48, 112, 8, 72, 40, 104, 24, 88, 56, 120 + +aarch64 = rbit + +generate uint8x8_t, uint8x16_t, poly8x8_t, poly8x16_t + +/// Rounding halving add +name = vrhadd +a = 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42 +b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +validate 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29 + +arm = vrhadd.s +aarch64 = urhadd +link-arm = vrhaddu._EXT_ +link-aarch64 = urhadd._EXT_ +generate uint*_t + +arm = vrhadd.s +aarch64 = srhadd +link-arm = vrhadds._EXT_ +link-aarch64 = srhadd._EXT_ +generate int*_t + +/// Floating-point round to integral exact, using current rounding mode +name = vrndx +a = -1.5, 0.5, 1.5, 2.5 +validate -2.0, 0.0, 
2.0, 2.0 + +aarch64 = frintx +link-aarch64 = llvm.rint._EXT_ +generate float*_t, float64x*_t + +/// Floating-point round to integral, to nearest with ties to away +name = vrnda +a = -1.5, 0.5, 1.5, 2.5 +validate -2.0, 1.0, 2.0, 3.0 + +aarch64 = frinta +link-aarch64 = llvm.round._EXT_ +generate float*_t, float64x*_t + +/// Floating-point round to integral, to nearest with ties to even +name = vrndn +a = -1.5, 0.5, 1.5, 2.5 +validate -2.0, 0.0, 2.0, 2.0 + +link-aarch64 = frintn._EXT_ +aarch64 = frintn +generate float64x*_t + +target = fp-armv8 +arm = vrintn +link-arm = vrintn._EXT_ +generate float*_t + +/// Floating-point round to integral, to nearest with ties to even +name = vrndn +a = -1.5 +validate -2.0 + +aarch64 = frintn +link-aarch64 = llvm.roundeven._EXT_ +generate f32 + +/// Floating-point round to integral, toward minus infinity +name = vrndm +a = -1.5, 0.5, 1.5, 2.5 +validate -2.0, 0.0, 1.0, 2.0 + +aarch64 = frintm +link-aarch64 = llvm.floor._EXT_ +generate float*_t, float64x*_t + +/// Floating-point round to integral, toward plus infinity +name = vrndp +a = -1.5, 0.5, 1.5, 2.5 +validate -1.0, 1.0, 2.0, 3.0 + +aarch64 = frintp +link-aarch64 = llvm.ceil._EXT_ +generate float*_t, float64x*_t + +/// Floating-point round to integral, toward zero +name = vrnd +a = -1.5, 0.5, 1.5, 2.5 +validate -1.0, 0.0, 1.0, 2.0 + +aarch64 = frintz +link-aarch64 = llvm.trunc._EXT_ +generate float*_t, float64x*_t + +/// Floating-point round to integral, using current rounding mode +name = vrndi +a = -1.5, 0.5, 1.5, 2.5 +validate -2.0, 0.0, 2.0, 2.0 + +aarch64 = frinti +link-aarch64 = llvm.nearbyint._EXT_ +generate float*_t, float64x*_t + +/// Saturating add +name = vqadd +a = 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42 +b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +validate 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58 + +arm = vqadd.s +aarch64 = uqadd +link-arm = llvm.uadd.sat._EXT_ +link-aarch64 = uqadd._EXT_ +generate uint*_t, uint64x*_t + +arm = vqadd.s +aarch64 = sqadd +link-arm = llvm.sadd.sat._EXT_ +link-aarch64 = sqadd._EXT_ +generate int*_t, int64x*_t + +/// Saturating add +name = vqadd +multi_fn = vdup_n-in_ntt-noext, a:in_ntt, a +multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b +multi_fn = simd_extract, {vqadd-in_ntt-noext, a, b}, 0 +a = 42 +b = 1 +validate 43 + +aarch64 = sqadd +generate i8, i16 +aarch64 = uqadd +generate u8, u16 + +/// Saturating add +name = vqadd +a = 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42 +b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +validate 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58 + +aarch64 = uqadd +link-aarch64 = uqadd._EXT_ +generate u32, u64 + +aarch64 = sqadd +link-aarch64 = sqadd._EXT_ +generate i32, i64 + +/// Load multiple single-element structures to one, two, three, or four registers +name = vld1 +out-suffix +a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 +validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 +load_fn + +aarch64 = ld1 +link-aarch64 = ld1x2._EXT2_ +arm = vld1 +link-arm = vld1x2._EXT2_ +generate *const i8:int8x8x2_t, *const i16:int16x4x2_t, *const 
i32:int32x2x2_t, *const i64:int64x1x2_t +generate *const i8:int8x16x2_t, *const i16:int16x8x2_t, *const i32:int32x4x2_t, *const i64:int64x2x2_t + +link-aarch64 = ld1x3._EXT2_ +link-arm = vld1x3._EXT2_ +generate *const i8:int8x8x3_t, *const i16:int16x4x3_t, *const i32:int32x2x3_t, *const i64:int64x1x3_t +generate *const i8:int8x16x3_t, *const i16:int16x8x3_t, *const i32:int32x4x3_t, *const i64:int64x2x3_t + +link-aarch64 = ld1x4._EXT2_ +link-arm = vld1x4._EXT2_ +generate *const i8:int8x8x4_t, *const i16:int16x4x4_t, *const i32:int32x2x4_t, *const i64:int64x1x4_t +generate *const i8:int8x16x4_t, *const i16:int16x8x4_t, *const i32:int32x4x4_t, *const i64:int64x2x4_t + +/// Load multiple single-element structures to one, two, three, or four registers +name = vld1 +out-suffix +multi_fn = transmute, {vld1-outsigned-noext, transmute(a)} +a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 +validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 + +load_fn +aarch64 = ld1 +arm = vld1 +generate *const u8:uint8x8x2_t, *const u16:uint16x4x2_t, *const u32:uint32x2x2_t, *const u64:uint64x1x2_t +generate *const u8:uint8x16x2_t, *const u16:uint16x8x2_t, *const u32:uint32x4x2_t, *const u64:uint64x2x2_t +generate *const u8:uint8x8x3_t, *const u16:uint16x4x3_t, *const u32:uint32x2x3_t, *const u64:uint64x1x3_t +generate *const u8:uint8x16x3_t, *const u16:uint16x8x3_t, *const u32:uint32x4x3_t, *const u64:uint64x2x3_t +generate *const u8:uint8x8x4_t, *const u16:uint16x4x4_t, *const u32:uint32x2x4_t, *const u64:uint64x1x4_t +generate *const u8:uint8x16x4_t, *const u16:uint16x8x4_t, *const u32:uint32x4x4_t, *const u64:uint64x2x4_t +generate *const p8:poly8x8x2_t, *const p8:poly8x8x3_t, *const p8:poly8x8x4_t +generate *const p8:poly8x16x2_t, *const p8:poly8x16x3_t, *const p8:poly8x16x4_t +generate *const p16:poly16x4x2_t, *const p16:poly16x4x3_t, *const p16:poly16x4x4_t +generate *const p16:poly16x8x2_t, *const p16:poly16x8x3_t, *const p16:poly16x8x4_t +target = aes +generate *const p64:poly64x1x2_t +arm = nop +generate *const p64:poly64x1x3_t, *const p64:poly64x1x4_t +generate *const p64:poly64x2x2_t, *const p64:poly64x2x3_t, *const p64:poly64x2x4_t +/// Load multiple single-element structures to one, two, three, or four registers +name = vld1 +out-suffix +a = 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16. +validate 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16. 
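+// Illustrative note (not spec syntax): the vld1_*_x2/_x3/_x4 entries above load two, three
+// or four whole vectors from consecutive, non-interleaved memory, e.g. vld1q_s8_x4 is expected
+// to read 64 contiguous bytes into an int8x16x4_t. The unsigned and poly variants reuse the
+// signed implementation via transmute, and the p64 variants are additionally gated on the aes
+// target feature (the `target = aes` lines).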
+load_fn + +aarch64 = ld1 +link-aarch64 = ld1x2._EXT2_ +generate *const f64:float64x1x2_t, *const f64:float64x2x2_t + +link-aarch64 = ld1x3._EXT2_ +generate *const f64:float64x1x3_t, *const f64:float64x2x3_t + +link-aarch64 = ld1x4._EXT2_ +generate *const f64:float64x1x4_t, *const f64:float64x2x4_t + +arm = vld1 +link-aarch64 = ld1x2._EXT2_ +link-arm = vld1x2._EXT2_ +generate *const f32:float32x2x2_t, *const f32:float32x4x2_t + +link-aarch64 = ld1x3._EXT2_ +link-arm = vld1x3._EXT2_ +generate *const f32:float32x2x3_t, *const f32:float32x4x3_t + +link-aarch64 = ld1x4._EXT2_ +link-arm = vld1x4._EXT2_ +generate *const f32:float32x2x4_t, *const f32:float32x4x4_t + +/// Load multiple 2-element structures to two registers +name = vld2 +out-nox +a = 0, 1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17 +validate 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 +load_fn +arm-aarch64-separate + +aarch64 = ld2 +link-aarch64 = ld2._EXTv2_ +generate *const i64:int64x2x2_t + +arm = vld2 +link-arm = vld2._EXTpi82_ +generate *const i8:int8x8x2_t, *const i16:int16x4x2_t, *const i32:int32x2x2_t +generate *const i8:int8x16x2_t, *const i16:int16x8x2_t, *const i32:int32x4x2_t +arm = nop +aarch64 = nop +generate *const i64:int64x1x2_t + +/// Load multiple 2-element structures to two registers +name = vld2 +out-nox +multi_fn = transmute, {vld2-outsignednox-noext, transmute(a)} +a = 0, 1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17 +validate 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 +load_fn + +aarch64 = ld2 +generate *const u64:uint64x2x2_t +target = aes +generate *const p64:poly64x2x2_t + +target = default +arm = vld2 +generate *const u8:uint8x8x2_t, *const u16:uint16x4x2_t, *const u32:uint32x2x2_t +generate *const u8:uint8x16x2_t, *const u16:uint16x8x2_t, *const u32:uint32x4x2_t +generate *const p8:poly8x8x2_t, *const p16:poly16x4x2_t, *const p8:poly8x16x2_t, *const p16:poly16x8x2_t +arm = nop +aarch64 = nop +generate *const u64:uint64x1x2_t +target = aes +generate *const p64:poly64x1x2_t + + +/// Load multiple 2-element structures to two registers +name = vld2 +out-nox +a = 0., 1., 2., 2., 3., 2., 4., 3., 5., 2., 6., 3., 7., 4., 8., 5., 9. +validate 1., 2., 2., 3., 2., 3., 4., 5., 2., 3., 4., 5., 6., 7., 8., 9. 
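+// Illustrative note: vld2 de-interleaves 2-element structures, so memory laid out as
+// x0, y0, x1, y1, ... comes back as two vectors (x0, x1, ...) and (y0, y1, ...); in the test
+// data above the expected values are the two result vectors listed one after the other.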
+load_fn +arm-aarch64-separate + +aarch64 = nop +link-aarch64 = ld2._EXTv2_ +generate *const f64:float64x1x2_t +aarch64 = ld2 +generate *const f64:float64x2x2_t + +arm = vld2 +link-arm = vld2._EXTpi82_ +generate *const f32:float32x2x2_t, *const f32:float32x4x2_t + +/// Load single 2-element structure and replicate to all lanes of two registers +name = vld2 +out-dup-nox +a = 0, 1, 1, 2, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17 +validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 +load_fn +arm-aarch64-separate + +aarch64 = ld2r +link-aarch64 = ld2r._EXT2_ +generate *const i64:int64x2x2_t + +arm = vld2 +link-arm = vld2dup._EXTpi82_ +generate *const i8:int8x8x2_t, *const i16:int16x4x2_t, *const i32:int32x2x2_t +generate *const i8:int8x16x2_t, *const i16:int16x8x2_t, *const i32:int32x4x2_t +arm = nop +generate *const i64:int64x1x2_t + +/// Load single 2-element structure and replicate to all lanes of two registers +name = vld2 +out-dup-nox +multi_fn = transmute, {vld2-outsigneddupnox-noext, transmute(a)} +a = 0, 1, 1, 2, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17 +validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 +load_fn + +aarch64 = ld2r +generate *const u64:uint64x2x2_t +target = aes +generate *const p64:poly64x2x2_t + +target = default +arm = vld2 +generate *const u8:uint8x8x2_t, *const u16:uint16x4x2_t, *const u32:uint32x2x2_t +generate *const u8:uint8x16x2_t, *const u16:uint16x8x2_t, *const u32:uint32x4x2_t +generate *const p8:poly8x8x2_t, *const p16:poly16x4x2_t, *const p8:poly8x16x2_t, *const p16:poly16x8x2_t +arm = nop +generate *const u64:uint64x1x2_t +target = aes +generate *const p64:poly64x1x2_t + +/// Load single 2-element structure and replicate to all lanes of two registers +name = vld2 +out-dup-nox +a = 0., 1., 1., 2., 3., 1., 4., 3., 5. +validate 1., 1., 1., 1., 1., 1., 1., 1. 
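+// Illustrative note: the _dup (ld2r / vld2_dup) forms load a single 2-element structure and
+// broadcast it to every lane of both result vectors, which is why each validate line above
+// repeats the same value in every lane.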
+load_fn +arm-aarch64-separate + +aarch64 = ld2r +link-aarch64 = ld2r._EXT2_ +generate *const f64:float64x1x2_t, *const f64:float64x2x2_t + +arm = vld2 +link-arm = vld2dup._EXTpi82_ +generate *const f32:float32x2x2_t, *const f32:float32x4x2_t + +/// Load multiple 2-element structures to two registers +name = vld2 +out-lane-nox +multi_fn = static_assert_imm-in_exp_len-LANE +constn = LANE +a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 +b = 0, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26 +n = 0 +validate 1, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26, 2, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26 +load_fn +arm-aarch64-separate + +aarch64 = ld2 +const-aarch64 = LANE +link-aarch64 = ld2lane._EXTpi82_ +generate *const i8:int8x16x2_t:int8x16x2_t, *const i64:int64x1x2_t:int64x1x2_t, *const i64:int64x2x2_t:int64x2x2_t + +arm = vld2 +const-arm = LANE +link-arm = vld2lane._EXTpi82_ +generate *const i8:int8x8x2_t:int8x8x2_t, *const i16:int16x4x2_t:int16x4x2_t, *const i32:int32x2x2_t:int32x2x2_t +generate *const i16:int16x8x2_t:int16x8x2_t, *const i32:int32x4x2_t:int32x4x2_t + +/// Load multiple 2-element structures to two registers +name = vld2 +out-lane-nox +multi_fn = static_assert_imm-in_exp_len-LANE +multi_fn = transmute, {vld2-outsignedlanenox-::<LANE>, transmute(a), transmute(b)} +constn = LANE +a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 +b = 0, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26 +n = 0 +validate 1, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26, 2, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26 +load_fn + +aarch64 = ld2 +const-aarch64 = LANE + +target = aes +generate *const p64:poly64x1x2_t:poly64x1x2_t, *const p64:poly64x2x2_t:poly64x2x2_t + +target = default +generate *const u8:uint8x16x2_t:uint8x16x2_t, *const u64:uint64x1x2_t:uint64x1x2_t, *const u64:uint64x2x2_t:uint64x2x2_t +generate *const p8:poly8x16x2_t:poly8x16x2_t + +arm = vld2 +const-arm = LANE +generate *const u8:uint8x8x2_t:uint8x8x2_t, *const u16:uint16x4x2_t:uint16x4x2_t, *const u32:uint32x2x2_t:uint32x2x2_t +generate *const u16:uint16x8x2_t:uint16x8x2_t, *const u32:uint32x4x2_t:uint32x4x2_t +generate *const p8:poly8x8x2_t:poly8x8x2_t, *const p16:poly16x4x2_t:poly16x4x2_t +generate *const p16:poly16x8x2_t:poly16x8x2_t + +/// Load multiple 2-element structures to two registers +name = vld2 +out-lane-nox +multi_fn = static_assert_imm-in_exp_len-LANE +constn = LANE +a = 0., 1., 2., 3., 4., 5., 6., 7., 8. +b = 0., 2., 2., 14., 2., 16., 17., 18. +n = 0 +validate 1., 2., 2., 14., 2., 16., 17., 18. 
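+// Illustrative note: the _lane forms read one 2-element structure from memory into lane LANE
+// of the two vectors passed in `b`, leaving all other lanes untouched; LANE is a const generic
+// whose range is checked at compile time by the generated static_assert_imm* call.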
+load_fn +arm-aarch64-separate + +aarch64 = ld2 +const-aarch64 = LANE +link-aarch64 = ld2lane._EXTpi82_ +generate *const f64:float64x1x2_t:float64x1x2_t, *const f64:float64x2x2_t:float64x2x2_t + +arm = vld2 +const-arm = LANE +link-arm = vld2lane._EXTpi82_ +generate *const f32:float32x2x2_t:float32x2x2_t, *const f32:float32x4x2_t:float32x4x2_t + +/// Load multiple 3-element structures to three registers +name = vld3 +out-nox +a = 0, 1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16, 2, 25, 41, 4, 26, 42, 7, 27, 43, 8, 28, 44, 13, 29, 45, 14, 30, 46, 15, 31, 47, 16, 32, 48 +validate 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32, 2, 4, 7, 8, 13, 14, 15, 16, 41, 42, 43, 44, 45, 46, 47, 48 +load_fn +arm-aarch64-separate + +aarch64 = ld3 +link-aarch64 = ld3._EXTv2_ +generate *const i64:int64x2x3_t + +arm = vld3 +link-arm = vld3._EXTpi82_ +generate *const i8:int8x8x3_t, *const i16:int16x4x3_t, *const i32:int32x2x3_t +generate *const i8:int8x16x3_t, *const i16:int16x8x3_t, *const i32:int32x4x3_t +arm = nop +aarch64 = nop +generate *const i64:int64x1x3_t + +/// Load multiple 3-element structures to three registers +name = vld3 +out-nox +multi_fn = transmute, {vld3-outsignednox-noext, transmute(a)} +a = 0, 1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16, 2, 25, 41, 4, 26, 42, 7, 27, 43, 8, 28, 44, 13, 29, 45, 14, 30, 46, 15, 31, 47, 16, 32, 48 +validate 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32, 2, 4, 7, 8, 13, 14, 15, 16, 41, 42, 43, 44, 45, 46, 47, 48 +load_fn + +aarch64 = ld3 +generate *const u64:uint64x2x3_t +target = aes +generate *const p64:poly64x2x3_t + +target = default +arm = vld3 +generate *const u8:uint8x8x3_t, *const u16:uint16x4x3_t, *const u32:uint32x2x3_t +generate *const u8:uint8x16x3_t, *const u16:uint16x8x3_t, *const u32:uint32x4x3_t +generate *const p8:poly8x8x3_t, *const p16:poly16x4x3_t, *const p8:poly8x16x3_t, *const p16:poly16x8x3_t +arm = nop +aarch64 = nop +generate *const u64:uint64x1x3_t +target = aes +generate *const p64:poly64x1x3_t + +/// Load multiple 3-element structures to three registers +name = vld3 +out-nox +a = 0., 1., 2., 2., 2., 4., 4., 2., 7., 7., 4., 8., 8. +validate 1., 2., 2., 4., 2., 4., 7., 8., 2., 4., 7., 8. 
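+// Illustrative note: vld3 applies the same idea to 3-element structures, splitting
+// x0, y0, z0, x1, y1, z1, ... into three de-interleaved vectors; the _dup and _lane entries
+// that follow mirror the corresponding vld2 behaviour described above.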
+load_fn +arm-aarch64-separate + +aarch64 = nop +link-aarch64 = ld3._EXTv2_ +generate *const f64:float64x1x3_t +aarch64 = ld3 +generate *const f64:float64x2x3_t + +arm = vld3 +link-arm = vld3._EXTpi82_ +generate *const f32:float32x2x3_t, *const f32:float32x4x3_t + +/// Load single 3-element structure and replicate to all lanes of three registers +name = vld3 +out-dup-nox +a = 0, 1, 1, 1, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17, 6, 14, 7, 15, 8, 16, 9, 17, 6, 14, 7, 15, 8, 16, 9, 17 +validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 +load_fn +arm-aarch64-separate + +aarch64 = ld3r +link-aarch64 = ld3r._EXT2_ +generate *const i64:int64x2x3_t + +arm = vld3 +link-arm = vld3dup._EXTpi82_ +generate *const i8:int8x8x3_t, *const i16:int16x4x3_t, *const i32:int32x2x3_t +generate *const i8:int8x16x3_t, *const i16:int16x8x3_t, *const i32:int32x4x3_t +arm = nop +generate *const i64:int64x1x3_t + +/// Load single 3-element structure and replicate to all lanes of three registers +name = vld3 +out-dup-nox +multi_fn = transmute, {vld3-outsigneddupnox-noext, transmute(a)} +a = 0, 1, 1, 1, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17, 6, 14, 7, 15, 8, 16, 9, 17, 6, 14, 7, 15, 8, 16, 9, 17 +validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 +load_fn + +aarch64 = ld3r +generate *const u64:uint64x2x3_t +target = aes +generate *const p64:poly64x2x3_t + +target = default +arm = vld3 +generate *const u8:uint8x8x3_t, *const u16:uint16x4x3_t, *const u32:uint32x2x3_t +generate *const u8:uint8x16x3_t, *const u16:uint16x8x3_t, *const u32:uint32x4x3_t +generate *const p8:poly8x8x3_t, *const p16:poly16x4x3_t, *const p8:poly8x16x3_t, *const p16:poly16x8x3_t +arm = nop +generate *const u64:uint64x1x3_t +target = aes +generate *const p64:poly64x1x3_t + +/// Load single 3-element structure and replicate to all lanes of three registers +name = vld3 +out-dup-nox +a = 0., 1., 1., 1., 3., 1., 4., 3., 5., 1., 4., 3., 5. +validate 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1. 
+load_fn +arm-aarch64-separate + +aarch64 = ld3r +link-aarch64 = ld3r._EXT2_ +generate *const f64:float64x1x3_t, *const f64:float64x2x3_t + +arm = vld3 +link-arm = vld3dup._EXTpi82_ +generate *const f32:float32x2x3_t, *const f32:float32x4x3_t + +/// Load multiple 3-element structures to three registers +name = vld3 +out-lane-nox +multi_fn = static_assert_imm-in_exp_len-LANE +constn = LANE +a = 0, 1, 2, 2, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 +b = 0, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 +n = 0 +validate 1, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26, 2, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 2, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 +load_fn +arm-aarch64-separate + +aarch64 = ld3 +const-aarch64 = LANE +link-aarch64 = ld3lane._EXTpi82_ +generate *const i8:int8x16x3_t:int8x16x3_t, *const i64:int64x1x3_t:int64x1x3_t, *const i64:int64x2x3_t:int64x2x3_t + +arm = vld3 +const-arm = LANE +link-arm = vld3lane._EXTpi82_ +generate *const i8:int8x8x3_t:int8x8x3_t, *const i16:int16x4x3_t:int16x4x3_t, *const i32:int32x2x3_t:int32x2x3_t +generate *const i16:int16x8x3_t:int16x8x3_t, *const i32:int32x4x3_t:int32x4x3_t + +/// Load multiple 3-element structures to three registers +name = vld3 +out-lane-nox +multi_fn = static_assert_imm-in_exp_len-LANE +multi_fn = transmute, {vld3-outsignedlanenox-::<LANE>, transmute(a), transmute(b)} +constn = LANE +a = 0, 1, 2, 2, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 +b = 0, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 +n = 0 +validate 1, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26, 2, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 2, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 +load_fn + +aarch64 = ld3 +const-aarch64 = LANE +target = aes +generate *const p64:poly64x1x3_t:poly64x1x3_t, *const p64:poly64x2x3_t:poly64x2x3_t +target = default +generate *const p8:poly8x16x3_t:poly8x16x3_t, *const u8:uint8x16x3_t:uint8x16x3_t, *const u64:uint64x1x3_t:uint64x1x3_t, *const u64:uint64x2x3_t:uint64x2x3_t + +arm = vld3 +const-arm = LANE +generate *const u8:uint8x8x3_t:uint8x8x3_t, *const u16:uint16x4x3_t:uint16x4x3_t, *const u32:uint32x2x3_t:uint32x2x3_t +generate *const u16:uint16x8x3_t:uint16x8x3_t, *const u32:uint32x4x3_t:uint32x4x3_t +generate *const p8:poly8x8x3_t:poly8x8x3_t, *const p16:poly16x4x3_t:poly16x4x3_t +generate *const p16:poly16x8x3_t:poly16x8x3_t + +/// Load multiple 3-element structures to three registers +name = vld3 +out-lane-nox +multi_fn = static_assert_imm-in_exp_len-LANE +constn = LANE +a = 0., 1., 2., 2., 4., 5., 6., 7., 8., 5., 6., 7., 8. +b = 0., 2., 2., 14., 9., 16., 17., 18., 5., 6., 7., 8. +n = 0 +validate 1., 2., 2., 14., 2., 16., 17., 18., 2., 6., 7., 8.
+load_fn +arm-aarch64-separate + +aarch64 = ld3 +const-aarch64 = LANE +link-aarch64 = ld3lane._EXTpi82_ +generate *const f64:float64x1x3_t:float64x1x3_t, *const f64:float64x2x3_t:float64x2x3_t + +arm = vld3 +const-arm = LANE +link-arm = vld3lane._EXTpi82_ +generate *const f32:float32x2x3_t:float32x2x3_t, *const f32:float32x4x3_t:float32x4x3_t + +/// Load multiple 4-element structures to four registers +name = vld4 +out-nox +a = 0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64 +validate 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64 +load_fn +arm-aarch64-separate + +aarch64 = ld4 +link-aarch64 = ld4._EXTv2_ +generate *const i64:int64x2x4_t + +arm = vld4 +link-arm = vld4._EXTpi82_ +generate *const i8:int8x8x4_t, *const i16:int16x4x4_t, *const i32:int32x2x4_t +generate *const i8:int8x16x4_t, *const i16:int16x8x4_t, *const i32:int32x4x4_t +aarch64 = nop +arm = nop +generate *const i64:int64x1x4_t + +/// Load multiple 4-element structures to four registers +name = vld4 +out-nox +multi_fn = transmute, {vld4-outsignednox-noext, transmute(a)} +a = 0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64 +validate 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64 +load_fn + +aarch64 = ld4 +generate *const u64:uint64x2x4_t +target = aes +generate *const p64:poly64x2x4_t + +target = default +arm = vld4 +generate *const u8:uint8x8x4_t, *const u16:uint16x4x4_t, *const u32:uint32x2x4_t +generate *const u8:uint8x16x4_t, *const u16:uint16x8x4_t, *const u32:uint32x4x4_t +generate *const p8:poly8x8x4_t, *const p16:poly16x4x4_t, *const p8:poly8x16x4_t, *const p16:poly16x8x4_t +aarch64 = nop +arm = nop +generate *const u64:uint64x1x4_t +target = aes +generate *const p64:poly64x1x4_t + +/// Load multiple 4-element structures to four registers +name = vld4 +out-nox +a = 0., 1., 2., 2., 6., 2., 6., 6., 8., 2., 6., 6., 8., 6., 8., 15., 16. +validate 1., 2., 2., 6., 2., 6., 6., 8., 2., 6., 6., 15., 6., 8., 8., 16. 
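+// Illustrative note: vld4 extends the pattern to 4-element structures, e.g. vld4q_u8 is
+// expected to read 64 interleaved bytes and return four de-interleaved uint8x16_t vectors
+// as a uint8x16x4_t.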
+load_fn +arm-aarch64-separate + +aarch64 = nop +link-aarch64 = ld4._EXTv2_ +generate *const f64:float64x1x4_t +aarch64 = ld4 +generate *const f64:float64x2x4_t + +arm = vld4 +link-arm = vld4._EXTpi82_ +generate *const f32:float32x2x4_t, *const f32:float32x4x4_t + +/// Load single 4-element structure and replicate to all lanes of four registers +name = vld4 +out-dup-nox +a = 0, 1, 1, 1, 1, 2, 4, 3, 5, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9 +validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 +load_fn +arm-aarch64-separate + +aarch64 = ld4r +link-aarch64 = ld4r._EXT2_ +generate *const i64:int64x2x4_t + +arm = vld4 +link-arm = vld4dup._EXTpi82_ +generate *const i8:int8x8x4_t, *const i16:int16x4x4_t, *const i32:int32x2x4_t +generate *const i8:int8x16x4_t, *const i16:int16x8x4_t, *const i32:int32x4x4_t +arm = nop +generate *const i64:int64x1x4_t + +/// Load single 4-element structure and replicate to all lanes of four registers +name = vld4 +out-dup-nox +multi_fn = transmute, {vld4-outsigneddupnox-noext, transmute(a)} +a = 0, 1, 1, 1, 1, 2, 4, 3, 5, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9 +validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 +load_fn + +aarch64 = ld4r +generate *const u64:uint64x2x4_t +target = aes +generate *const p64:poly64x2x4_t + +target = default +arm = vld4 +generate *const u8:uint8x8x4_t, *const u16:uint16x4x4_t, *const u32:uint32x2x4_t +generate *const u8:uint8x16x4_t, *const u16:uint16x8x4_t, *const u32:uint32x4x4_t +generate *const p8:poly8x8x4_t, *const p16:poly16x4x4_t, *const p8:poly8x16x4_t, *const p16:poly16x8x4_t +arm = nop +generate *const u64:uint64x1x4_t +target = aes +generate *const p64:poly64x1x4_t + +/// Load single 4-element structure and replicate to all lanes of four registers +name = vld4 +out-dup-nox +a = 0., 1., 1., 1., 1., 6., 4., 3., 5., 7., 4., 3., 5., 8., 4., 3., 5., 9., 4., 3., 5. +validate 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1. 
+load_fn +arm-aarch64-separate + +aarch64 = ld4r +link-aarch64 = ld4r._EXT2_ +generate *const f64:float64x1x4_t, *const f64:float64x2x4_t + +arm = vld4 +link-arm = vld4dup._EXTpi82_ +generate *const f32:float32x2x4_t, *const f32:float32x4x4_t + +/// Load multiple 4-element structures to four registers +name = vld4 +out-lane-nox +multi_fn = static_assert_imm-in_exp_len-LANE +constn = LANE +a = 0, 1, 2, 2, 2, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16 +b = 0, 2, 2, 2, 2, 16, 2, 18, 2, 20, 21, 22, 2, 24, 25, 26, 11, 12, 13, 14, 15, 16, 2, 18, 2, 20, 21, 22, 23, 24, 25, 26, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16 +n = 0 +validate 1, 2, 2, 2, 2, 16, 2, 18, 2, 20, 21, 22, 2, 24, 25, 26, 2, 12, 13, 14, 15, 16, 2, 18, 2, 20, 21, 22, 23, 24, 25, 26, 2, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 2, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16 +load_fn +arm-aarch64-separate + +aarch64 = ld4 +const-aarch64 = LANE +link-aarch64 = ld4lane._EXTpi82_ +generate *const i8:int8x16x4_t:int8x16x4_t, *const i64:int64x1x4_t:int64x1x4_t, *const i64:int64x2x4_t:int64x2x4_t + +arm = vld4 +const-arm = LANE +link-arm = vld4lane._EXTpi82_ +generate *const i8:int8x8x4_t:int8x8x4_t, *const i16:int16x4x4_t:int16x4x4_t, *const i32:int32x2x4_t:int32x2x4_t +generate *const i16:int16x8x4_t:int16x8x4_t, *const i32:int32x4x4_t:int32x4x4_t + +/// Load multiple 4-element structures to four registers +name = vld4 +out-lane-nox +multi_fn = static_assert_imm-in_exp_len-LANE +multi_fn = transmute, {vld4-outsignedlanenox-::<LANE>, transmute(a), transmute(b)} +constn = LANE +a = 0, 1, 2, 2, 2, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16 +b = 0, 2, 2, 2, 2, 16, 2, 18, 2, 20, 21, 22, 2, 24, 25, 26, 11, 12, 13, 14, 15, 16, 2, 18, 2, 20, 21, 22, 23, 24, 25, 26, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16 +n = 0 +validate 1, 2, 2, 2, 2, 16, 2, 18, 2, 20, 21, 22, 2, 24, 25, 26, 2, 12, 13, 14, 15, 16, 2, 18, 2, 20, 21, 22, 23, 24, 25, 26, 2, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 2, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16 +load_fn + +aarch64 = ld4 +const-aarch64 = LANE +target = aes +generate *const p64:poly64x1x4_t:poly64x1x4_t, *const p64:poly64x2x4_t:poly64x2x4_t +target = default +generate *const p8:poly8x16x4_t:poly8x16x4_t, *const u8:uint8x16x4_t:uint8x16x4_t, *const u64:uint64x1x4_t:uint64x1x4_t, *const u64:uint64x2x4_t:uint64x2x4_t + +arm = vld4 +const-arm = LANE +generate *const u8:uint8x8x4_t:uint8x8x4_t, *const u16:uint16x4x4_t:uint16x4x4_t, *const u32:uint32x2x4_t:uint32x2x4_t +generate *const u16:uint16x8x4_t:uint16x8x4_t, *const u32:uint32x4x4_t:uint32x4x4_t +generate *const p8:poly8x8x4_t:poly8x8x4_t, *const p16:poly16x4x4_t:poly16x4x4_t +generate *const p16:poly16x8x4_t:poly16x8x4_t + +/// Load multiple 4-element structures to four registers +name = vld4 +out-lane-nox +multi_fn = static_assert_imm-in_exp_len-LANE +constn = LANE +a = 0., 1., 2., 2., 2., 5., 6., 7., 8., 5., 6., 7., 8., 1., 4., 3., 5. +b = 0., 2., 2., 2., 2., 16., 2., 18., 5., 6., 7., 8., 1., 4., 3., 5. +n = 0 +validate 1., 2., 2., 2., 2., 16., 2., 18., 2., 6., 7., 8., 2., 4., 3., 5. 
+load_fn +arm-aarch64-separate + +aarch64 = ld4 +const-aarch64 = LANE +link-aarch64 = ld4lane._EXTpi82_ +generate *const f64:float64x1x4_t:float64x1x4_t, *const f64:float64x2x4_t:float64x2x4_t + +arm = vld4 +const-arm = LANE +link-arm = vld4lane._EXTpi82_ +generate *const f32:float32x2x4_t:float32x2x4_t, *const f32:float32x4x4_t:float32x4x4_t + +/// Store multiple single-element structures from one, two, three, or four registers +name = vst1 +in1-lane-nox +multi_fn = static_assert_imm-in_exp_len-LANE +multi_fn = *a, {simd_extract, b, LANE as u32} +constn = LANE +a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +n = 0 +validate 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +store_fn + +aarch64 = nop +arm = nop +generate *mut i8:int8x8_t:void, *mut i16:int16x4_t:void, *mut i32:int32x2_t:void, *mut i64:int64x1_t:void +generate *mut i8:int8x16_t:void, *mut i16:int16x8_t:void, *mut i32:int32x4_t:void, *mut i64:int64x2_t:void +generate *mut u8:uint8x8_t:void, *mut u16:uint16x4_t:void, *mut u32:uint32x2_t:void, *mut u64:uint64x1_t:void +generate *mut u8:uint8x16_t:void, *mut u16:uint16x8_t:void, *mut u32:uint32x4_t:void, *mut u64:uint64x2_t:void +generate *mut p8:poly8x8_t:void, *mut p16:poly16x4_t:void, *mut p8:poly8x16_t:void, *mut p16:poly16x8_t:void +target = aes +generate *mut p64:poly64x1_t:void, *mut p64:poly64x2_t:void + +/// Store multiple single-element structures from one, two, three, or four registers +name = vst1 +in1-lane-nox +multi_fn = static_assert_imm-in_exp_len-LANE +multi_fn = *a, {simd_extract, b, LANE as u32} +constn = LANE +a = 0., 1., 2., 3., 4., 5., 6., 7., 8. +n = 0 +validate 1., 0., 0., 0., 0., 0., 0., 0. +store_fn + +aarch64 = nop +generate *mut f64:float64x1_t:void, *mut f64:float64x2_t:void + +arm = nop +generate *mut f32:float32x2_t:void, *mut f32:float32x4_t:void + +/// Store multiple single-element structures from one, two, three, or four registers +name = vst1 +a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 +validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 +store_fn +arm-aarch64-separate + +aarch64 = st1 +link-aarch64 = st1x2._EXT3_ +arm = vst1 +link-arm = vst1x2._EXTr3_ +generate *mut i8:int8x8x2_t:void, *mut i16:int16x4x2_t:void, *mut i32:int32x2x2_t:void, *mut i64:int64x1x2_t:void +generate *mut i8:int8x16x2_t:void, *mut i16:int16x8x2_t:void, *mut i32:int32x4x2_t:void, *mut i64:int64x2x2_t:void + +link-aarch64 = st1x3._EXT3_ +link-arm = vst1x3._EXTr3_ +generate *mut i8:int8x8x3_t:void, *mut i16:int16x4x3_t:void, *mut i32:int32x2x3_t:void, *mut i64:int64x1x3_t:void +generate *mut i8:int8x16x3_t:void, *mut i16:int16x8x3_t:void, *mut i32:int32x4x3_t:void, *mut i64:int64x2x3_t:void + +link-aarch64 = st1x4._EXT3_ +link-arm = vst1x4._EXTr3_ +generate *mut i8:int8x8x4_t:void, *mut i16:int16x4x4_t:void, *mut i32:int32x2x4_t:void, *mut i64:int64x1x4_t:void +generate *mut i8:int8x16x4_t:void, *mut i16:int16x8x4_t:void, *mut i32:int32x4x4_t:void, *mut i64:int64x2x4_t:void + +/// Store multiple single-element structures to one, two, three, or four registers +name = vst1 +multi_fn = vst1-signed-noext, transmute(a), transmute(b) +a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 
13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 +validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 + +store_fn +aarch64 = st1 +arm = vst1 +generate *mut u8:uint8x8x2_t:void, *mut u16:uint16x4x2_t:void, *mut u32:uint32x2x2_t:void, *mut u64:uint64x1x2_t:void +generate *mut u8:uint8x16x2_t:void, *mut u16:uint16x8x2_t:void, *mut u32:uint32x4x2_t:void, *mut u64:uint64x2x2_t:void +generate *mut u8:uint8x8x3_t:void, *mut u16:uint16x4x3_t:void, *mut u32:uint32x2x3_t:void, *mut u64:uint64x1x3_t:void +generate *mut u8:uint8x16x3_t:void, *mut u16:uint16x8x3_t:void, *mut u32:uint32x4x3_t:void, *mut u64:uint64x2x3_t:void +generate *mut u8:uint8x8x4_t:void, *mut u16:uint16x4x4_t:void, *mut u32:uint32x2x4_t:void, *mut u64:uint64x1x4_t:void +generate *mut u8:uint8x16x4_t:void, *mut u16:uint16x8x4_t:void, *mut u32:uint32x4x4_t:void, *mut u64:uint64x2x4_t:void +generate *mut p8:poly8x8x2_t:void, *mut p8:poly8x8x3_t:void, *mut p8:poly8x8x4_t:void +generate *mut p8:poly8x16x2_t:void, *mut p8:poly8x16x3_t:void, *mut p8:poly8x16x4_t:void +generate *mut p16:poly16x4x2_t:void, *mut p16:poly16x4x3_t:void, *mut p16:poly16x4x4_t:void +generate *mut p16:poly16x8x2_t:void, *mut p16:poly16x8x3_t:void, *mut p16:poly16x8x4_t:void +target = aes +generate *mut p64:poly64x1x2_t:void +arm = nop +generate *mut p64:poly64x1x3_t:void, *mut p64:poly64x1x4_t:void +generate *mut p64:poly64x2x2_t:void, *mut p64:poly64x2x3_t:void, *mut p64:poly64x2x4_t:void + +/// Store multiple single-element structures to one, two, three, or four registers +name = vst1 +a = 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16. +validate 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16. 
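+// Illustrative note: the vst1_*_x2/_x3/_x4 entries are the inverse of the vld1 multi-vector
+// loads above: they store two, three or four registers to consecutive memory without any
+// interleaving.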
+store_fn +arm-aarch64-separate + +aarch64 = st1 +link-aarch64 = st1x2._EXT3_ +generate *mut f64:float64x1x2_t:void, *mut f64:float64x2x2_t:void + +link-aarch64 = st1x3._EXT3_ +generate *mut f64:float64x1x3_t:void, *mut f64:float64x2x3_t:void + +link-aarch64 = st1x4._EXT3_ +generate *mut f64:float64x1x4_t:void, *mut f64:float64x2x4_t:void + +arm = vst1 +link-aarch64 = st1x2._EXT3_ +link-arm = vst1x2._EXTr3_ +generate *mut f32:float32x2x2_t:void, *mut f32:float32x4x2_t:void + +link-aarch64 = st1x3._EXT3_ +link-arm = vst1x3._EXTr3_ +generate *mut f32:float32x2x3_t:void, *mut f32:float32x4x3_t:void + +link-aarch64 = st1x4._EXT3_ +link-arm = vst1x4._EXTr3_ +generate *mut f32:float32x2x4_t:void, *mut f32:float32x4x4_t:void + +/// Store multiple 2-element structures from two registers +name = vst2 +in1-nox +a = 0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 +validate 1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17 +store_fn +arm-aarch64-separate + +aarch64 = st2 +link-aarch64 = st2._EXTpi8_ +generate *mut i64:int64x2x2_t:void + +arm = vst2 +link-arm = vst2._EXTpi8r_ +generate *mut i8:int8x8x2_t:void, *mut i16:int16x4x2_t:void, *mut i32:int32x2x2_t:void +generate *mut i8:int8x16x2_t:void, *mut i16:int16x8x2_t:void, *mut i32:int32x4x2_t:void +arm = nop +aarch64 = nop +generate *mut i64:int64x1x2_t:void + +/// Store multiple 2-element structures from two registers +name = vst2 +multi_fn = transmute, {vst2-in1signednox-noext, transmute(a), transmute(b)} +in1-nox +a = 0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 +validate 1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17 +store_fn + +aarch64 = st2 +generate *mut u64:uint64x2x2_t:void +target = aes +generate *mut p64:poly64x2x2_t:void + +target = default +arm = vst2 +generate *mut u8:uint8x8x2_t:void, *mut u16:uint16x4x2_t:void, *mut u32:uint32x2x2_t:void +generate *mut u8:uint8x16x2_t:void, *mut u16:uint16x8x2_t:void, *mut u32:uint32x4x2_t:void +generate *mut p8:poly8x8x2_t:void, *mut p16:poly16x4x2_t:void, *mut p8:poly8x16x2_t:void, *mut p16:poly16x8x2_t:void +arm = nop +aarch64 = nop +generate *mut u64:uint64x1x2_t:void +target = aes +generate *mut p64:poly64x1x2_t:void + +/// Store multiple 2-element structures from two registers +name = vst2 +in1-nox +a = 0., 1., 2., 2., 3., 2., 3., 4., 5., 2., 3., 4., 5., 6., 7., 8., 9. +validate 1., 2., 2., 3., 2., 4., 3., 5., 2., 6., 3., 7., 4., 8., 5., 9. 
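+// Illustrative note: vst2 interleaves its two input vectors back into memory as
+// x0, y0, x1, y1, ..., undoing the de-interleave performed by vld2; the validate lines above
+// give the resulting memory contents.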
+store_fn +arm-aarch64-separate + +aarch64 = st1 +link-aarch64 = st2._EXTpi8_ +generate *mut f64:float64x1x2_t:void +aarch64 = st2 +generate *mut f64:float64x2x2_t:void + +arm = vst2 +link-arm = vst2._EXTpi8r_ +generate *mut f32:float32x2x2_t:void, *mut f32:float32x4x2_t:void + +/// Store multiple 2-element structures from two registers +name = vst2 +in1-lane-nox +constn = LANE +multi_fn = static_assert_imm-in_exp_len-LANE +a = 0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 +n = 0 +validate 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +store_fn +arm-aarch64-separate + +aarch64 = st2 +link-aarch64 = st2lane._EXTpi8_ +const-aarch64 = LANE +generate *mut i8:int8x16x2_t:void, *mut i64:int64x1x2_t:void, *mut i64:int64x2x2_t:void + +arm = vst2 +link-arm = vst2lane._EXTpi8r_ +const-arm = LANE +generate *mut i8:int8x8x2_t:void, *mut i16:int16x4x2_t:void, *mut i32:int32x2x2_t:void +generate *mut i16:int16x8x2_t:void, *mut i32:int32x4x2_t:void + +/// Store multiple 2-element structures from two registers +name = vst2 +in1-lane-nox +constn = LANE +multi_fn = static_assert_imm-in_exp_len-LANE +multi_fn = transmute, {vst2-in1signedlanenox-::<LANE>, transmute(a), transmute(b)} +a = 0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 +n = 0 +validate 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +store_fn + +aarch64 = st2 +generate *mut u8:uint8x16x2_t:void, *mut u64:uint64x1x2_t:void, *mut u64:uint64x2x2_t:void, *mut p8:poly8x16x2_t:void +target = aes +generate *mut p64:poly64x1x2_t:void, *mut p64:poly64x2x2_t:void + +target = default +arm = vst2 +generate *mut u8:uint8x8x2_t:void, *mut u16:uint16x4x2_t:void, *mut u32:uint32x2x2_t:void +generate *mut u16:uint16x8x2_t:void, *mut u32:uint32x4x2_t:void +generate *mut p8:poly8x8x2_t:void, *mut p16:poly16x4x2_t:void, *mut p16:poly16x8x2_t:void + +/// Store multiple 2-element structures from two registers +name = vst2 +in1-lane-nox +constn = LANE +multi_fn = static_assert_imm-in_exp_len-LANE +a = 0., 1., 2., 2., 3., 2., 3., 4., 5., 2., 3., 4., 5., 6., 7., 8., 9. +n = 0 +validate 1., 2., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0. 
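+// Illustrative note: the _lane store forms write only the structure held in lane LANE (two
+// elements for vst2, three for vst3, four for vst4) and leave the rest of the destination
+// untouched, which is why the validate lines for these tests are zero past the first few
+// elements.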
+store_fn +arm-aarch64-separate + +aarch64 = st2 +link-aarch64 = st2lane._EXTpi8_ +const-aarch64 = LANE +generate *mut f64:float64x1x2_t:void, *mut f64:float64x2x2_t:void + +arm = vst2 +link-arm = vst2lane._EXTpi8r_ +const-arm = LANE +generate *mut f32:float32x2x2_t:void, *mut f32:float32x4x2_t:void + +/// Store multiple 3-element structures from three registers +name = vst3 +in1-nox +a = 0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32, 2, 4, 7, 8, 13, 14, 15, 16, 41, 42, 43, 44, 45, 46, 47, 48 +validate 1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16, 2, 25, 41, 4, 26, 42, 7, 27, 43, 8, 28, 44, 13, 29, 45, 14, 30, 46, 15, 31, 47, 16, 32, 48 +store_fn +arm-aarch64-separate + +aarch64 = st3 +link-aarch64 = st3._EXTpi8_ +generate *mut i64:int64x2x3_t:void + +arm = vst3 +link-arm = vst3._EXTpi8r_ +generate *mut i8:int8x8x3_t:void, *mut i16:int16x4x3_t:void, *mut i32:int32x2x3_t:void +generate *mut i8:int8x16x3_t:void, *mut i16:int16x8x3_t:void, *mut i32:int32x4x3_t:void +arm = nop +aarch64 = nop +generate *mut i64:int64x1x3_t:void + +/// Store multiple 3-element structures from three registers +name = vst3 +multi_fn = transmute, {vst3-in1signednox-noext, transmute(a), transmute(b)} +in1-nox +a = 0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32, 2, 4, 7, 8, 13, 14, 15, 16, 41, 42, 43, 44, 45, 46, 47, 48 +validate 1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16, 2, 25, 41, 4, 26, 42, 7, 27, 43, 8, 28, 44, 13, 29, 45, 14, 30, 46, 15, 31, 47, 16, 32, 48 +store_fn + +aarch64 = st3 +generate *mut u64:uint64x2x3_t:void +target = aes +generate *mut p64:poly64x2x3_t:void + +target = default +arm = vst3 +generate *mut u8:uint8x8x3_t:void, *mut u16:uint16x4x3_t:void, *mut u32:uint32x2x3_t:void +generate *mut u8:uint8x16x3_t:void, *mut u16:uint16x8x3_t:void, *mut u32:uint32x4x3_t:void +generate *mut p8:poly8x8x3_t:void, *mut p16:poly16x4x3_t:void, *mut p8:poly8x16x3_t:void, *mut p16:poly16x8x3_t:void +arm = nop +aarch64 = nop +generate *mut u64:uint64x1x3_t:void +target = aes +generate *mut p64:poly64x1x3_t:void + +/// Store multiple 3-element structures from three registers +name = vst3 +in1-nox +a = 0., 1., 2., 2., 4., 2., 4., 7., 8., 2., 4., 7., 8., 13., 14., 15., 16 +validate 1., 2., 2., 2., 4., 4., 2., 7., 7., 4., 8., 8., 2., 13., 13., 4. 
+store_fn +arm-aarch64-separate + +aarch64 = nop +link-aarch64 = st3._EXTpi8_ +generate *mut f64:float64x1x3_t:void +aarch64 = st3 +generate *mut f64:float64x2x3_t:void + +arm = vst3 +link-arm = vst3._EXTpi8r_ +generate *mut f32:float32x2x3_t:void, *mut f32:float32x4x3_t:void + +/// Store multiple 3-element structures from three registers +name = vst3 +in1-lane-nox +constn = LANE +multi_fn = static_assert_imm-in_exp_len-LANE +a = 0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32, 2, 4, 7, 8, 13, 14, 15, 16, 41, 42, 43, 44, 45, 46, 47, 48 +n = 0 +validate 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +store_fn +arm-aarch64-separate + +aarch64 = st3 +link-aarch64 = st3lane._EXTpi8_ +const-aarch64 = LANE +generate *mut i8:int8x16x3_t:void, *mut i64:int64x1x3_t:void, *mut i64:int64x2x3_t:void + +arm = vst3 +link-arm = vst3lane._EXTpi8r_ +const-arm = LANE +generate *mut i8:int8x8x3_t:void, *mut i16:int16x4x3_t:void, *mut i32:int32x2x3_t:void +generate *mut i16:int16x8x3_t:void, *mut i32:int32x4x3_t:void + +/// Store multiple 3-element structures from three registers +name = vst3 +in1-lane-nox +constn = LANE +multi_fn = static_assert_imm-in_exp_len-LANE +multi_fn = transmute, {vst3-in1signedlanenox-::<LANE>, transmute(a), transmute(b)} +a = 0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32, 2, 4, 7, 8, 13, 14, 15, 16, 41, 42, 43, 44, 45, 46, 47, 48 +n = 0 +validate 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +store_fn + +aarch64 = st3 +generate *mut u8:uint8x16x3_t:void, *mut u64:uint64x1x3_t:void, *mut u64:uint64x2x3_t:void, *mut p8:poly8x16x3_t:void +target = aes +generate *mut p64:poly64x1x3_t:void, *mut p64:poly64x2x3_t:void + +target = default +arm = vst3 +generate *mut u8:uint8x8x3_t:void, *mut u16:uint16x4x3_t:void, *mut u32:uint32x2x3_t:void +generate *mut u16:uint16x8x3_t:void, *mut u32:uint32x4x3_t:void +generate *mut p8:poly8x8x3_t:void, *mut p16:poly16x4x3_t:void, *mut p16:poly16x8x3_t:void + +/// Store multiple 3-element structures from three registers +name = vst3 +in1-lane-nox +constn = LANE +multi_fn = static_assert_imm-in_exp_len-LANE +a = 0., 1., 2., 2., 3., 2., 3., 4., 5., 2., 3., 4., 5., 6., 7., 8., 9. +n = 0 +validate 1., 2., 2., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0. 
+store_fn +arm-aarch64-separate + +aarch64 = st3 +link-aarch64 = st3lane._EXTpi8_ +const-aarch64 = LANE +generate *mut f64:float64x1x3_t:void, *mut f64:float64x2x3_t:void + +arm = vst3 +link-arm = vst3lane._EXTpi8r_ +const-arm = LANE +generate *mut f32:float32x2x3_t:void, *mut f32:float32x4x3_t:void + +/// Store multiple 4-element structures from four registers +name = vst4 +in1-nox +a = 0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64 +validate 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64 +store_fn +arm-aarch64-separate + +aarch64 = st4 +link-aarch64 = st4._EXTpi8_ +generate *mut i64:int64x2x4_t:void + +arm = vst4 +link-arm = vst4._EXTpi8r_ +generate *mut i8:int8x8x4_t:void, *mut i16:int16x4x4_t:void, *mut i32:int32x2x4_t:void +generate *mut i8:int8x16x4_t:void, *mut i16:int16x8x4_t:void, *mut i32:int32x4x4_t:void +arm = nop +aarch64 = nop +generate *mut i64:int64x1x4_t:void + +/// Store multiple 4-element structures from four registers +name = vst4 +multi_fn = transmute, {vst4-in1signednox-noext, transmute(a), transmute(b)} +in1-nox +a = 0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64 +validate 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64 +store_fn + +aarch64 = st4 +generate *mut u64:uint64x2x4_t:void +target = aes +generate *mut p64:poly64x2x4_t:void + +target = default +arm = vst4 +generate *mut u8:uint8x8x4_t:void, *mut u16:uint16x4x4_t:void, *mut u32:uint32x2x4_t:void +generate *mut u8:uint8x16x4_t:void, *mut u16:uint16x8x4_t:void, *mut u32:uint32x4x4_t:void +generate *mut p8:poly8x8x4_t:void, *mut p16:poly16x4x4_t:void, *mut p8:poly8x16x4_t:void, *mut p16:poly16x8x4_t:void +arm = nop +aarch64 = nop +generate *mut u64:uint64x1x4_t:void +target = aes +generate *mut p64:poly64x1x4_t:void + +/// Store multiple 4-element structures from four registers +name = vst4 +in1-nox +a = 0., 1., 2., 2., 6., 2., 6., 6., 8., 2., 6., 6., 8., 6., 8., 8., 16. +validate 1., 2., 2., 6., 2., 6., 6., 8., 2., 6., 6., 8., 6., 8., 8., 16. 
+store_fn +arm-aarch64-separate + +aarch64 = nop +link-aarch64 = st4._EXTpi8_ +generate *mut f64:float64x1x4_t:void +aarch64 = st4 +generate *mut f64:float64x2x4_t:void + +arm = vst4 +link-arm = vst4._EXTpi8r_ +generate *mut f32:float32x2x4_t:void, *mut f32:float32x4x4_t:void + +/// Store multiple 4-element structures from four registers +name = vst4 +in1-lane-nox +constn = LANE +multi_fn = static_assert_imm-in_exp_len-LANE +a = 0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64 +n = 0 +validate 1, 2, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +store_fn +arm-aarch64-separate + +aarch64 = st4 +link-aarch64 = st4lane._EXTpi8_ +const-aarch64 = LANE +generate *mut i8:int8x16x4_t:void, *mut i64:int64x1x4_t:void, *mut i64:int64x2x4_t:void + +arm = vst4 +link-arm = vst4lane._EXTpi8r_ +const-arm = LANE +generate *mut i8:int8x8x4_t:void, *mut i16:int16x4x4_t:void, *mut i32:int32x2x4_t:void +generate *mut i16:int16x8x4_t:void, *mut i32:int32x4x4_t:void + +/// Store multiple 4-element structures from four registers +name = vst4 +in1-lane-nox +constn = LANE +multi_fn = static_assert_imm-in_exp_len-LANE +multi_fn = transmute, {vst4-in1signedlanenox-::<LANE>, transmute(a), transmute(b)} +a = 0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64 +n = 0 +validate 1, 2, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +store_fn + +aarch64 = st4 +generate *mut u8:uint8x16x4_t:void, *mut u64:uint64x1x4_t:void, *mut u64:uint64x2x4_t:void, *mut p8:poly8x16x4_t:void +target = aes +generate *mut p64:poly64x1x4_t:void, *mut p64:poly64x2x4_t:void + +target = default +arm = vst4 +generate *mut u8:uint8x8x4_t:void, *mut u16:uint16x4x4_t:void, *mut u32:uint32x2x4_t:void +generate *mut u16:uint16x8x4_t:void, *mut u32:uint32x4x4_t:void +generate *mut p8:poly8x8x4_t:void, *mut p16:poly16x4x4_t:void, *mut p16:poly16x8x4_t:void + +/// Store multiple 4-element structures from four registers +name = vst4 +in1-lane-nox +constn = LANE +multi_fn = static_assert_imm-in_exp_len-LANE +a = 0., 1., 2., 2., 6., 2., 6., 6., 8., 2., 6., 6., 8., 6., 8., 8., 16. +n = 0 +validate 1., 2., 2., 6., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0. 
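For the `_lane` store forms above, only the selected lane of each register reaches memory, which is why the lane-form `validate` lists start with a handful of stored values and then trail off into zeros. A hedged sketch of that behaviour follows; `st4_lane_model` is an illustrative name, not the generated `vst4*_lane_*` intrinsic.

```
/// Model of a 4-register single-lane store (the st4/vst4 lane form):
/// only lane LANE of each input is written, as four consecutive elements.
fn st4_lane_model<const LANE: usize>(regs: [[i16; 4]; 4], out: &mut [i16; 4]) {
    for (slot, reg) in out.iter_mut().zip(regs.iter()) {
        *slot = reg[LANE];
    }
}

fn main() {
    let regs = [[1, 0, 0, 0], [2, 0, 0, 0], [2, 0, 0, 0], [6, 0, 0, 0]];
    let mut out = [0i16; 4];
    st4_lane_model::<0>(regs, &mut out);
    // Matches the leading values of the lane-form validate list: 1, 2, 2, 6, 0, ...
    assert_eq!(out, [1, 2, 2, 6]);
}
```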
+store_fn +arm-aarch64-separate + +aarch64 = st4 +link-aarch64 = st4lane._EXTpi8_ +const-aarch64 = LANE +generate *mut f64:float64x1x4_t:void, *mut f64:float64x2x4_t:void + +arm = vst4 +link-arm = vst4lane._EXTpi8r_ +const-arm = LANE +generate *mut f32:float32x2x4_t:void, *mut f32:float32x4x4_t:void + +/// Dot product index form with signed and unsigned integers +name = vsudot +out-lane-suffixes +constn = LANE +multi_fn = static_assert_imm-in2_dot-LANE +multi_fn = simd_shuffle-in_len-!, c:unsigned, c, c, {base-4-LANE} +multi_fn = vsudot-outlane-_, a, b, c +a = 1, 2, 1, 2 +b = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 +c = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 +n = 0 +validate 31, 72, 31, 72 +target = dotprod + +aarch64 = sudot +link-aarch64 = usdot._EXT2_._EXT4_:int32x2_t:int8x8_t:uint8x8_t:int32x2_t +// LLVM ERROR: Cannot select: intrinsic %llvm.aarch64.neon.usdot +//generate int32x2_t:int8x8_t:uint8x8_t:int32x2_t, int32x2_t:int8x8_t:uint8x16_t:int32x2_t +link-aarch64 = usdot._EXT2_._EXT4_:int32x4_t:int8x16_t:uint8x16_t:int32x4_t +// LLVM ERROR: Cannot select: intrinsic %llvm.aarch64.neon.usdot +//generate int32x4_t:int8x16_t:uint8x8_t:int32x4_t, int32x4_t:int8x16_t:uint8x16_t:int32x4_t + +/// Multiply +name = vmul +a = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2 +b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +validate 1, 4, 3, 8, 5, 12, 7, 16, 9, 20, 11, 24, 13, 28, 15, 32 +arm = vmul. +aarch64 = mul +fn = simd_mul +generate int*_t, uint*_t + +/// Polynomial multiply +name = vmul +a = 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3 +b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +validate 1, 6, 3, 12, 5, 10, 7, 24, 9, 30, 11, 20, 13, 18, 15, 48 + +aarch64 = pmul +link-aarch64 = pmul._EXT_ +arm = vmul +link-arm = vmulp._EXT_ +generate poly8x8_t, poly8x16_t + +/// Multiply +name = vmul +fn = simd_mul +a = 1.0, 2.0, 1.0, 2.0 +b = 2.0, 3.0, 4.0, 5.0 +validate 2.0, 6.0, 4.0, 10.0 + +aarch64 = fmul +generate float64x*_t + +arm = vmul. +generate float*_t + +/// Vector multiply by scalar +name = vmul +out-n-suffix +multi_fn = simd_mul, a, {vdup-nout-noext, b} +a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +b = 2 +validate 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32 + +arm = vmul +aarch64 = mul +generate int16x4_t:i16:int16x4_t, int16x8_t:i16:int16x8_t, int32x2_t:i32:int32x2_t, int32x4_t:i32:int32x4_t +generate uint16x4_t:u16:uint16x4_t, uint16x8_t:u16:uint16x8_t, uint32x2_t:u32:uint32x2_t, uint32x4_t:u32:uint32x4_t + +/// Vector multiply by scalar +name = vmul +out-n-suffix +multi_fn = simd_mul, a, {vdup-nout-noext, b} +a = 1., 2., 3., 4. +b = 2. +validate 2., 4., 6., 8. 
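The `vmul_n` blocks build the vector-by-scalar multiply out of existing pieces: `vdup` broadcasts the scalar and `simd_mul` multiplies lane-wise. A rough model of that composition (the function name is ours, for illustration only):

```
/// Model of vmul_n: broadcast the scalar, then multiply lane-wise.
fn vmul_n_model(a: [u16; 4], b: u16) -> [u16; 4] {
    let dup = [b; 4]; // stands in for vdup_n
    let mut out = [0u16; 4];
    for i in 0..4 {
        out[i] = a[i].wrapping_mul(dup[i]); // simd_mul is a plain lane-wise multiply
    }
    out
}

fn main() {
    // Matches the spec's test vector: every lane of a is doubled.
    assert_eq!(vmul_n_model([1, 2, 3, 4], 2), [2, 4, 6, 8]);
}
```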
+ +aarch64 = fmul +generate float64x1_t:f64:float64x1_t, float64x2_t:f64:float64x2_t + +arm = vmul +generate float32x2_t:f32:float32x2_t, float32x4_t:f32:float32x4_t + +/// Multiply +name = vmul +lane-suffixes +constn = LANE +multi_fn = static_assert_imm-in_exp_len-LANE +multi_fn = simd_mul, a, {simd_shuffle-out_len-!, b, b, {dup-out_len-LANE as u32}} +a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +b = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +n = 1 +validate 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32 + +aarch64 = mul +arm = vmul +generate int16x4_t, int16x4_t:int16x8_t:int16x4_t, int16x8_t:int16x4_t:int16x8_t, int16x8_t +generate int32x2_t, int32x2_t:int32x4_t:int32x2_t, int32x4_t:int32x2_t:int32x4_t, int32x4_t +generate uint16x4_t, uint16x4_t:uint16x8_t:uint16x4_t, uint16x8_t:uint16x4_t:uint16x8_t, uint16x8_t +generate uint32x2_t, uint32x2_t:uint32x4_t:uint32x2_t, uint32x4_t:uint32x2_t:uint32x4_t, uint32x4_t + +/// Floating-point multiply +name = vmul +lane-suffixes +constn = LANE +multi_fn = static_assert_imm-in_exp_len-LANE +multi_fn = simd_mul, a, {transmute--<element_t _>, {simd_extract, b, LANE as u32}} +a = 1., 2., 3., 4. +b = 2., 0., 0., 0. +n = 0 +validate 2., 4., 6., 8. + +aarch64 = fmul +generate float64x1_t, float64x1_t:float64x2_t:float64x1_t + +/// Floating-point multiply +name = vmul +lane-suffixes +constn = LANE +multi_fn = static_assert_imm-in_exp_len-LANE +multi_fn = simd_mul, a, {simd_shuffle-out_len-!, b, b, {dup-out_len-LANE as u32}} +a = 1., 2., 3., 4. +b = 2., 0., 0., 0. +n = 0 +validate 2., 4., 6., 8. + +aarch64 = fmul +generate float64x2_t:float64x1_t:float64x2_t, float64x2_t + +arm = vmul +generate float32x2_t, float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x2_t:float32x4_t, float32x4_t + +/// Floating-point multiply +name = vmuls_lane +constn = LANE +multi_fn = static_assert_imm-in_exp_len-LANE +multi_fn = simd_extract, b:f32, b, LANE as u32 +multi_fn = a * b +a = 1. +b = 2., 0., 0., 0. +n = 0 +validate 2. +aarch64 = fmul +generate f32:float32x2_t:f32, f32:float32x4_t:f32 + +/// Floating-point multiply +name = vmuld_lane +constn = LANE +multi_fn = static_assert_imm-in_exp_len-LANE +multi_fn = simd_extract, b:f64, b, LANE as u32 +multi_fn = a * b +a = 1. +b = 2., 0. +n = 0 +validate 2. 
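The `_lane` multiply variants first check the lane index at generation time (`static_assert_imm`) and then broadcast lane `LANE` of `b` before multiplying. A sketch using a const generic to stand in for the immediate; the runtime assert below is only a stand-in for the compile-time check the generator emits.

```
/// Model of vmul_lane: broadcast lane LANE of b, then multiply lane-wise.
fn vmul_lane_model<const LANE: usize>(a: [i16; 4], b: [i16; 4]) -> [i16; 4] {
    assert!(LANE < 4); // static_assert_imm in the generated code
    let dup = [b[LANE]; 4];
    let mut out = [0i16; 4];
    for i in 0..4 {
        out[i] = a[i].wrapping_mul(dup[i]);
    }
    out
}

fn main() {
    // b = 0, 2, 0, 0 with LANE = 1 doubles every lane of a, as in the spec's test.
    assert_eq!(vmul_lane_model::<1>([1, 2, 3, 4], [0, 2, 0, 0]), [2, 4, 6, 8]);
}
```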
+aarch64 = fmul +generate f64:float64x1_t:f64, f64:float64x2_t:f64 + +/// Signed multiply long +name = vmull +a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2 +validate 1, 4, 3, 8, 5, 12, 7, 16, 9, 20, 11, 24, 13, 28, 15, 32 + +arm = vmull.s +aarch64 = smull +link-arm = vmulls._EXT_ +link-aarch64 = smull._EXT_ +generate int8x8_t:int8x8_t:int16x8_t, int16x4_t:int16x4_t:int32x4_t, int32x2_t:int32x2_t:int64x2_t + +/// Signed multiply long +name = vmull_high +no-q +multi_fn = simd_shuffle-out_len-!, a:half, a, a, {fixed-half-right} +multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right} +multi_fn = vmull-noqself-noext, a, b +a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16 +b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2 +fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +validate 9, 20, 11, 24, 13, 28, 15, 32 + +aarch64 = smull2 +generate int8x16_t:int8x16_t:int16x8_t, int16x8_t:int16x8_t:int32x4_t, int32x4_t:int32x4_t:int64x2_t + +/// Unsigned multiply long +name = vmull +a = 1, 2, 3, 4, 5, 6, 7, 8 +b = 1, 2, 1, 2, 1, 2, 1, 2 +validate 1, 4, 3, 8, 5, 12, 7, 16 + +arm = vmull.s +aarch64 = umull +link-arm = vmullu._EXT_ +link-aarch64 = umull._EXT_ +generate uint8x8_t:uint8x8_t:uint16x8_t, uint16x4_t:uint16x4_t:uint32x4_t, uint32x2_t:uint32x2_t:uint64x2_t + +/// Unsigned multiply long +name = vmull_high +no-q +multi_fn = simd_shuffle-out_len-!, a:half, a, a, {fixed-half-right} +multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right} +multi_fn = vmull-noqself-noext, a, b +a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16 +b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2 +fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +validate 9, 20, 11, 24, 13, 28, 15, 32 + +aarch64 = umull2 +generate uint8x16_t:uint8x16_t:uint16x8_t, uint16x8_t:uint16x8_t:uint32x4_t, uint32x4_t:uint32x4_t:uint64x2_t + +/// Polynomial multiply long +name = vmull +a = 1, 2, 3, 4, 5, 6, 7, 8 +b = 1, 3, 1, 3, 1, 3, 1, 3 +validate 1, 6, 3, 12, 5, 10, 7, 24 + +arm = vmull.s +aarch64 = pmull +link-arm = vmullp._EXT_ +link-aarch64 = pmull._EXT_ +generate poly8x8_t:poly8x8_t:poly16x8_t + +/// Polynomial multiply long +name = vmull +no-q +a = 15 +b = 3 +validate 17 +target = aes + +aarch64 = pmull +link-aarch64 = pmull64:p64:p64:p64:int8x16_t +// Because of the support status of llvm, vmull_p64 is currently only available on arm +// arm = vmull +// link-arm = vmullp.v2i64:int64x1_t:int64x1_t:int64x1_t:int64x2_t +generate p64:p64:p128 + + +/// Polynomial multiply long +name = vmull_high +no-q +multi_fn = simd_shuffle-out_len-!, a:half, a, a, {fixed-half-right} +multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right} +multi_fn = vmull-noqself-noext, a, b +a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16 +b = 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3 +fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +validate 9, 30, 11, 20, 13, 18, 15, 48 + +aarch64 = pmull +generate poly8x16_t:poly8x16_t:poly16x8_t + +/// Polynomial multiply long +name = vmull_high +no-q +multi_fn = vmull-noqself-noext, {simd_extract, a, 1}, {simd_extract, b, 1} +a = 1, 15 +b = 1, 3 +validate 17 +target = aes + +aarch64 = pmull +generate poly64x2_t:poly64x2_t:p128 + +/// Vector long multiply with scalar +name = vmull_n +no-q +multi_fn = vmull-in0-noext, a, {vdup-nin0-noext, b} +a = 1, 2, 3, 4, 5, 6, 7, 8 +b = 2 +validate 2, 4, 6, 8, 10, 12, 14, 16 + +arm = vmull +aarch64 = smull +generate 
int16x4_t:i16:int32x4_t, int32x2_t:i32:int64x2_t +aarch64 = umull +generate uint16x4_t:u16:uint32x4_t, uint32x2_t:u32:uint64x2_t + +/// Vector long multiply by scalar +name = vmull_lane +constn = LANE +multi_fn = static_assert_imm-in_exp_len-LANE +multi_fn = vmull-in0-noext, a, {simd_shuffle-in0_len-!, b, b, {dup-in0_len-LANE as u32}} +a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +b = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +n = 1 +validate 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32 + +arm = vmull +aarch64 = smull +generate int16x4_t:int16x4_t:int32x4_t, int16x4_t:int16x8_t:int32x4_t +generate int32x2_t:int32x2_t:int64x2_t, int32x2_t:int32x4_t:int64x2_t +aarch64 = umull +generate uint16x4_t:uint16x4_t:uint32x4_t, uint16x4_t:uint16x8_t:uint32x4_t +generate uint32x2_t:uint32x2_t:uint64x2_t, uint32x2_t:uint32x4_t:uint64x2_t + +/// Multiply long +name = vmull_high_n +no-q +multi_fn = vmull_high-noqself-noext, a, {vdup-nin0-noext, b} +a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16 +b = 2 +validate 18, 20, 22, 24, 26, 28, 30, 32 + +aarch64 = smull2 +generate int16x8_t:i16:int32x4_t, int32x4_t:i32:int64x2_t +aarch64 = umull2 +generate uint16x8_t:u16:uint32x4_t, uint32x4_t:u32:uint64x2_t + +/// Multiply long +name = vmull_high_lane +constn = LANE +multi_fn = static_assert_imm-in_exp_len-LANE +multi_fn = vmull_high-noqself-noext, a, {simd_shuffle-in0_len-!, b, b, {dup-in0_len-LANE as u32}} +a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16 +b = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +n = 1 +validate 18, 20, 22, 24, 26, 28, 30, 32 + +aarch64 = smull2 +generate int16x8_t:int16x4_t:int32x4_t, int16x8_t:int16x8_t:int32x4_t +generate int32x4_t:int32x2_t:int64x2_t, int32x4_t:int32x4_t:int64x2_t +aarch64 = umull2 +generate uint16x8_t:uint16x4_t:uint32x4_t, uint16x8_t:uint16x8_t:uint32x4_t +generate uint32x4_t:uint32x2_t:uint64x2_t, uint32x4_t:uint32x4_t:uint64x2_t + +/// Floating-point multiply extended +name = vmulx +a = 1., 2., 3., 4. +b = 2., 2., 2., 2. +validate 2., 4., 6., 8. + +aarch64 = fmulx +link-aarch64 = fmulx._EXT_ +generate float*_t, float64x*_t + +/// Floating-point multiply extended +name = vmulx +lane-suffixes +constn = LANE +multi_fn = static_assert_imm-in_exp_len-LANE +multi_fn = vmulx-in0-noext, a, {transmute--<element_t _>, {simd_extract, b, LANE as u32}} +a = 1. +b = 2., 0. +n = 0 +validate 2. + +aarch64 = fmulx +generate float64x1_t, float64x1_t:float64x2_t:float64x1_t + +/// Floating-point multiply extended +name = vmulx +lane-suffixes +constn = LANE +multi_fn = static_assert_imm-in_exp_len-LANE +multi_fn = vmulx-in0-noext, a, {simd_shuffle-in0_len-!, b, b, {dup-in0_len-LANE as u32}} +a = 1., 2., 3., 4. +b = 2., 0., 0., 0. +n = 0 +validate 2., 4., 6., 8. + +aarch64 = fmulx +generate float32x2_t, float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x2_t:float32x4_t, float32x4_t +generate float64x2_t:float64x1_t:float64x2_t, float64x2_t + +/// Floating-point multiply extended +name = vmulx +a = 2. +b = 3. +validate 6. + +aarch64 = fmulx +link-aarch64 = fmulx._EXT_ +generate f32, f64 + +/// Floating-point multiply extended +name = vmulx +lane-suffixes +constn = LANE +multi_fn = static_assert_imm-in_exp_len-LANE +multi_fn = vmulx-out-noext, a, {simd_extract, b, LANE as u32} + +a = 2. +b = 3., 0., 0., 0. +n = 0 +validate 6. 
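The `_high` variants above all follow the same recipe: shuffle out the upper half of each 128-bit input, then reuse the ordinary widening multiply on those halves. A hedged model for the `int16x8_t` to `int32x4_t` case, checked against the 9, 20, 11, 24 test values; `vmull_high_model` is an illustrative name only.

```
/// Model of vmull_high: take the upper halves, widen, multiply.
fn vmull_high_model(a: [i16; 8], b: [i16; 8]) -> [i32; 4] {
    let mut out = [0i32; 4];
    for i in 0..4 {
        // Upper half = lanes 4..8 (the {fixed-half-right} shuffle in the spec).
        out[i] = a[4 + i] as i32 * b[4 + i] as i32;
    }
    out
}

fn main() {
    let a = [1, 2, 9, 10, 9, 10, 11, 12];
    let b = [1, 2, 1, 2, 1, 2, 1, 2];
    // Only lanes 4..8 contribute, widened so the product cannot overflow.
    assert_eq!(vmull_high_model(a, b), [9, 20, 11, 24]);
}
```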
+ +aarch64 = fmulx +generate f32:float32x2_t:f32, f32:float32x4_t:f32, f64:float64x1_t:f64, f64:float64x2_t:f64 + +/// Floating-point fused Multiply-Add to accumulator(vector) +name = vfma +multi_fn = vfma-self-_, b, c, a +a = 8.0, 18.0, 12.0, 10.0 +b = 6.0, 4.0, 7.0, 8.0 +c = 2.0, 3.0, 4.0, 5.0 +validate 20.0, 30.0, 40.0, 50.0 + +link-aarch64 = llvm.fma._EXT_ +aarch64 = fmadd +generate float64x1_t +aarch64 = fmla +generate float64x2_t + +target = vfp4 +arm = vfma +link-arm = llvm.fma._EXT_ +generate float*_t + +/// Floating-point fused Multiply-Add to accumulator(vector) +name = vfma +n-suffix +multi_fn = vfma-self-noext, a, b, {vdup-nselfvfp4-noext, c} +a = 2.0, 3.0, 4.0, 5.0 +b = 6.0, 4.0, 7.0, 8.0 +c = 8.0 +validate 50.0, 35.0, 60.0, 69.0 + +aarch64 = fmadd +generate float64x1_t:float64x1_t:f64:float64x1_t +aarch64 = fmla +generate float64x2_t:float64x2_t:f64:float64x2_t + +target = vfp4 +arm = vfma +generate float32x2_t:float32x2_t:f32:float32x2_t, float32x4_t:float32x4_t:f32:float32x4_t + +/// Floating-point fused multiply-add to accumulator +name = vfma +in2-lane-suffixes +constn = LANE +multi_fn = static_assert_imm-in2_exp_len-LANE +multi_fn = vfma-out-noext, a, b, {vdup-nout-noext, {simd_extract, c, LANE as u32}} +a = 2., 3., 4., 5. +b = 6., 4., 7., 8. +c = 2., 0., 0., 0. +n = 0 +validate 14., 11., 18., 21. + +aarch64 = fmla +generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t +aarch64 = fmadd +generate float64x1_t +aarch64 = fmla +generate float64x1_t:float64x1_t:float64x2_t:float64x1_t, float64x2_t:float64x2_t:float64x1_t:float64x2_t, float64x2_t + +/// Floating-point fused multiply-add to accumulator +name = vfma +in2-lane-suffixes +constn = LANE +multi_fn = static_assert_imm-in2_exp_len-LANE +multi_fn = simd_extract, c:out_t, c, LANE as u32 +multi_fn = vfma-in2lane-_, b, c, a +a = 2. +b = 6. +c = 3., 0., 0., 0. +n = 0 +validate 20. + +aarch64 = fmla +link-aarch64 = llvm.fma._EXT_:f32:f32:f32:f32 +generate f32:f32:float32x2_t:f32, f32:f32:float32x4_t:f32 +link-aarch64 = llvm.fma._EXT_:f64:f64:f64:f64 +aarch64 = fmadd +generate f64:f64:float64x1_t:f64 +aarch64 = fmla +generate f64:f64:float64x2_t:f64 + +/// Floating-point fused multiply-subtract from accumulator +name = vfms +multi_fn = simd_neg, b:in_t, b +multi_fn = vfma-self-noext, a, b, c +a = 20.0, 30.0, 40.0, 50.0 +b = 6.0, 4.0, 7.0, 8.0 +c = 2.0, 3.0, 4.0, 5.0 +validate 8.0, 18.0, 12.0, 10.0 + +aarch64 = fmsub +generate float64x1_t +aarch64 = fmls +generate float64x2_t + +target = vfp4 +arm = vfms +generate float*_t + +/// Floating-point fused Multiply-subtract to accumulator(vector) +name = vfms +n-suffix +multi_fn = vfms-self-noext, a, b, {vdup-nselfvfp4-noext, c} +a = 50.0, 35.0, 60.0, 69.0 +b = 6.0, 4.0, 7.0, 8.0 +c = 8.0 +validate 2.0, 3.0, 4.0, 5.0 + +aarch64 = fmsub +generate float64x1_t:float64x1_t:f64:float64x1_t +aarch64 = fmls +generate float64x2_t:float64x2_t:f64:float64x2_t + +target = vfp4 +arm = vfms +generate float32x2_t:float32x2_t:f32:float32x2_t, float32x4_t:float32x4_t:f32:float32x4_t + +/// Floating-point fused multiply-subtract to accumulator +name = vfms +in2-lane-suffixes +constn = LANE +multi_fn = static_assert_imm-in2_exp_len-LANE +multi_fn = vfms-out-noext, a, b, {vdup-nout-noext, {simd_extract, c, LANE as u32}} +a = 14., 11., 18., 21. +b = 6., 4., 7., 8. +c = 2., 0., 0., 0. +n = 0 +validate 2., 3., 4., 5. 
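The `vfma`/`vfms` blocks map to a single fused multiply-add: the result is `a + b * c` with one rounding, and `vfms` simply negates `b` before reusing `vfma`. A per-lane scalar model using Rust's fused `mul_add`:

```
/// Per-lane model of vfma (a + b*c, fused) and vfms (a - b*c, fused).
fn vfma_lane(a: f32, b: f32, c: f32) -> f32 {
    b.mul_add(c, a)
}

fn vfms_lane(a: f32, b: f32, c: f32) -> f32 {
    // The spec builds vfms by negating b and reusing vfma.
    (-b).mul_add(c, a)
}

fn main() {
    // Matches the test vectors: 8 + 6*2 = 20, and vfms takes it back again.
    assert_eq!(vfma_lane(8.0, 6.0, 2.0), 20.0);
    assert_eq!(vfms_lane(20.0, 6.0, 2.0), 8.0);
}
```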
+ +aarch64 = fmls +generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t +aarch64 = fmsub +generate float64x1_t +aarch64 = fmls +generate float64x1_t:float64x1_t:float64x2_t:float64x1_t, float64x2_t:float64x2_t:float64x1_t:float64x2_t, float64x2_t + +/// Floating-point fused multiply-subtract to accumulator +name = vfms +in2-lane-suffixes +constn = LANE +multi_fn = vfma-in2lane-::<LANE>, a, -b, c +a = 14. +b = 6. +c = 2., 0., 0., 0. +n = 0 +validate 2. + +aarch64 = fmls +generate f32:f32:float32x2_t:f32, f32:f32:float32x4_t:f32 +aarch64 = fmsub +generate f64:f64:float64x1_t:f64 +aarch64 = fmls +generate f64:f64:float64x2_t:f64 + +/// Divide +name = vdiv +fn = simd_div +a = 2.0, 6.0, 4.0, 10.0 +b = 1.0, 2.0, 1.0, 2.0 +validate 2.0, 3.0, 4.0, 5.0 + +aarch64 = fdiv +generate float*_t, float64x*_t + +/// Subtract +name = vsub +a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2 +validate 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14 +arm = vsub. +aarch64 = sub +fn = simd_sub +generate int*_t, uint*_t, int64x*_t, uint64x*_t + +/// Subtract +name = vsub +fn = simd_sub +a = 1.0, 4.0, 3.0, 8.0 +b = 1.0, 2.0, 3.0, 4.0 +validate 0.0, 2.0, 0.0, 4.0 + +aarch64 = fsub +generate float64x*_t + +arm = vsub. +generate float*_t + +/// Subtract +name = vsub +multi_fn = a.wrapping_sub(b) +a = 3 +b = 2 +validate 1 + +aarch64 = nop +generate i64, u64 + +/// Add +name = vadd +multi_fn = a.wrapping_add(b) +a = 1 +b = 2 +validate 3 + +aarch64 = nop +generate i64, u64 + +/// Bitwise exclusive OR +name = vadd +multi_fn = simd_xor, a, b +a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +b = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 +validate 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17 + +aarch64 = nop +arm = nop +generate poly8x8_t, poly16x4_t, poly8x16_t, poly16x8_t, poly64x1_t, poly64x2_t + +/// Bitwise exclusive OR +name = vaddq +no-q +multi_fn = a ^ b +a = 16 +b = 1 +validate 17 + +aarch64 = nop +arm = nop +generate p128 + +/// Floating-point add across vector +name = vaddv +a = 1., 2., 0., 0. +validate 3. 
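The blocks titled "Bitwise exclusive OR" but named `vadd` are not a mix-up: addition of polynomials over GF(2) is carry-less, so per bit it reduces to XOR. A one-line model (names ours, for illustration):

```
/// Polynomial (carry-less) addition over GF(2) is just XOR per lane.
fn vadd_p8_lane(a: u8, b: u8) -> u8 {
    a ^ b
}

fn main() {
    // Matches the spec's test vectors: 1 ^ 1 = 0, 2 ^ 1 = 3, and 16 ^ 1 = 17
    // for the p128 case.
    assert_eq!(vadd_p8_lane(1, 1), 0);
    assert_eq!(vadd_p8_lane(2, 1), 3);
    assert_eq!(vadd_p8_lane(16, 1), 17);
}
```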
+ +aarch64 = faddp +link-aarch64 = faddv._EXT2_._EXT_ +generate float32x2_t:f32, float32x4_t:f32, float64x2_t:f64 + +/// Signed Add Long across Vector +name = vaddlv +a = 1, 2, 3, 4 +validate 10 + +aarch64 = saddlv +link-aarch64 = llvm.aarch64.neon.saddlv.i32._EXT_ +generate int16x4_t:i32 + +/// Signed Add Long across Vector +name = vaddlv +a = 1, 2, 3, 4, 5, 6, 7, 8 +validate 36 + +aarch64 = saddlv +link-aarch64 = llvm.aarch64.neon.saddlv.i32._EXT_ +generate int16x8_t:i32 + +/// Signed Add Long across Vector +name = vaddlv +a = 1, 2 +validate 3 + +aarch64 = saddlp +link-aarch64 = llvm.aarch64.neon.saddlv.i64._EXT_ +generate int32x2_t:i64 + +/// Signed Add Long across Vector +name = vaddlv +a = 1, 2, 3, 4 +validate 10 + +aarch64 = saddlv +link-aarch64 = llvm.aarch64.neon.saddlv.i64._EXT_ +generate int32x4_t:i64 + +/// Unsigned Add Long across Vector +name = vaddlv +a = 1, 2, 3, 4 +validate 10 + +aarch64 = uaddlv +link-aarch64 = llvm.aarch64.neon.uaddlv.i32._EXT_ +generate uint16x4_t:u32 + +/// Unsigned Add Long across Vector +name = vaddlv +a = 1, 2, 3, 4, 5, 6, 7, 8 +validate 36 + +aarch64 = uaddlv +link-aarch64 = llvm.aarch64.neon.uaddlv.i32._EXT_ +generate uint16x8_t:u32 + +/// Unsigned Add Long across Vector +name = vaddlv +a = 1, 2 +validate 3 + +aarch64 = uaddlp +link-aarch64 = llvm.aarch64.neon.uaddlv.i64._EXT_ +generate uint32x2_t:u64 + +/// Unsigned Add Long across Vector +name = vaddlv +a = 1, 2, 3, 4 +validate 10 + +aarch64 = uaddlv +link-aarch64 = llvm.aarch64.neon.uaddlv.i64._EXT_ +generate uint32x4_t:u64 + +/// Subtract returning high narrow +name = vsubhn +no-q +multi_fn = fixed, c:in_t +multi_fn = simd_cast, {simd_shr, {simd_sub, a, b}, transmute(c)} +a = MAX, MIN, 1, 1, MAX, MIN, 1, 1 +b = 1, 0, 0, 0, 1, 0, 0, 0 +fixed = HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS +validate MAX, MIN, 0, 0, MAX, MIN, 0, 0 + +arm = vsubhn +aarch64 = subhn +generate int16x8_t:int8x8_t, int32x4_t:int16x4_t, int64x2_t:int32x2_t +generate uint16x8_t:uint8x8_t, uint32x4_t:uint16x4_t, uint64x2_t:uint32x2_t + +/// Subtract returning high narrow +name = vsubhn_high +no-q +multi_fn = vsubhn-noqself-noext, d:in_t0, b, c +multi_fn = simd_shuffle-out_len-!, a, d, {asc-0-out_len} +a = MAX, 0, MAX, 0, MAX, 0, MAX, 0 +b = MAX, 1, MAX, 1, MAX, 1, MAX, 1 +c = 1, 0, 1, 0, 1, 0, 1, 0 +validate MAX, 0, MAX, 0, MAX, 0, MAX, 0, MAX, 0, MAX, 0, MAX, 0, MAX, 0 + +arm = vsubhn +aarch64 = subhn2 +generate int8x8_t:int16x8_t:int16x8_t:int8x16_t, int16x4_t:int32x4_t:int32x4_t:int16x8_t, int32x2_t:int64x2_t:int64x2_t:int32x4_t +generate uint8x8_t:uint16x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint32x4_t:uint16x8_t, uint32x2_t:uint64x2_t:uint64x2_t:uint32x4_t + +/// Signed halving subtract +name = vhsub +a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2 +validate 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7 + +arm = vhsub.s +aarch64 = uhsub +link-arm = vhsubu._EXT_ +link-aarch64 = uhsub._EXT_ +generate uint*_t + +arm = vhsub.s +aarch64 = shsub +link-arm = vhsubs._EXT_ +link-aarch64 = shsub._EXT_ +generate int*_t + +/// Signed Subtract Wide +name = vsubw +no-q +multi_fn = simd_sub, a, {simd_cast, b} +a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16 +b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16 +validate 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + +arm = vsubw +aarch64 = ssubw +generate int16x8_t:int8x8_t:int16x8_t, int32x4_t:int16x4_t:int32x4_t, 
int64x2_t:int32x2_t:int64x2_t + +/// Unsigned Subtract Wide +name = vsubw +no-q +multi_fn = simd_sub, a, {simd_cast, b} +a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16 +b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16 +validate 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + +arm = vsubw +aarch64 = usubw +generate uint16x8_t:uint8x8_t:uint16x8_t, uint32x4_t:uint16x4_t:uint32x4_t, uint64x2_t:uint32x2_t:uint64x2_t + +/// Signed Subtract Wide +name = vsubw_high +no-q +multi_fn = simd_shuffle8!, c:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15] +multi_fn = simd_sub, a, {simd_cast, c} +a = 8, 9, 10, 12, 13, 14, 15, 16 +b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16 +validate 0, 0, 0, 0, 0, 0, 0, 0 + +aarch64 = ssubw +generate int16x8_t:int8x16_t:int16x8_t + +/// Signed Subtract Wide +name = vsubw_high +no-q +multi_fn = simd_shuffle4!, c:int16x4_t, b, b, [4, 5, 6, 7] +multi_fn = simd_sub, a, {simd_cast, c} +a = 8, 9, 10, 11 +b = 0, 1, 2, 3, 8, 9, 10, 11 +validate 0, 0, 0, 0 + +aarch64 = ssubw +generate int32x4_t:int16x8_t:int32x4_t + +/// Signed Subtract Wide +name = vsubw_high +no-q +multi_fn = simd_shuffle2!, c:int32x2_t, b, b, [2, 3] +multi_fn = simd_sub, a, {simd_cast, c} +a = 8, 9 +b = 6, 7, 8, 9 +validate 0, 0 + +aarch64 = ssubw +generate int64x2_t:int32x4_t:int64x2_t + +/// Unsigned Subtract Wide +name = vsubw_high +no-q +multi_fn = simd_shuffle8!, c:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15] +multi_fn = simd_sub, a, {simd_cast, c} +a = 8, 9, 10, 11, 12, 13, 14, 15 +b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +validate 0, 0, 0, 0, 0, 0, 0, 0 + +aarch64 = usubw +generate uint16x8_t:uint8x16_t:uint16x8_t + +/// Unsigned Subtract Wide +name = vsubw_high +no-q +multi_fn = simd_shuffle4!, c:uint16x4_t, b, b, [4, 5, 6, 7] +multi_fn = simd_sub, a, {simd_cast, c} +a = 8, 9, 10, 11 +b = 0, 1, 2, 3, 8, 9, 10, 11 +validate 0, 0, 0, 0 + +aarch64 = usubw +generate uint32x4_t:uint16x8_t:uint32x4_t + +/// Unsigned Subtract Wide +name = vsubw_high +no-q +multi_fn = simd_shuffle2!, c:uint32x2_t, b, b, [2, 3] +multi_fn = simd_sub, a, {simd_cast, c} +a = 8, 9 +b = 6, 7, 8, 9 +validate 0, 0 + +aarch64 = usubw +generate uint64x2_t:uint32x4_t:uint64x2_t + +/// Signed Subtract Long +name = vsubl +no-q +multi_fn = simd_cast, c:out_t, a +multi_fn = simd_cast, d:out_t, b +multi_fn = simd_sub, c, d + +a = MAX, MIN, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +b = MAX, MIN, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +validate 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + +arm = vsubl +aarch64 = ssubl +generate int8x8_t:int8x8_t:int16x8_t, int16x4_t:int16x4_t:int32x4_t, int32x2_t:int32x2_t:int64x2_t + +/// Unsigned Subtract Long +name = vsubl +no-q +multi_fn = simd_cast, c:out_t, a +multi_fn = simd_cast, d:out_t, b +multi_fn = simd_sub, c, d + +a = MAX, MIN, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +b = MAX, MIN, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +validate 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + +arm = vsubl +aarch64 = usubl +generate uint8x8_t:uint8x8_t:uint16x8_t, uint16x4_t:uint16x4_t:uint32x4_t, uint32x2_t:uint32x2_t:uint64x2_t + +/// Signed Subtract Long +name = vsubl_high +no-q +multi_fn = simd_shuffle8!, c:int8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15] +multi_fn = simd_cast, d:out_t, c +multi_fn = simd_shuffle8!, e:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15] +multi_fn = simd_cast, f:out_t, e +multi_fn = simd_sub, d, f + +a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +b = 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 
2, 2 +validate 6, 7, 8, 9, 10, 11, 12, 13 + +aarch64 = ssubl +generate int8x16_t:int8x16_t:int16x8_t + +/// Signed Subtract Long +name = vsubl_high +no-q +multi_fn = simd_shuffle4!, c:int16x4_t, a, a, [4, 5, 6, 7] +multi_fn = simd_cast, d:out_t, c +multi_fn = simd_shuffle4!, e:int16x4_t, b, b, [4, 5, 6, 7] +multi_fn = simd_cast, f:out_t, e +multi_fn = simd_sub, d, f + +a = 8, 9, 10, 11, 12, 13, 14, 15 +b = 6, 6, 6, 6, 8, 8, 8, 8 +validate 4, 5, 6, 7 + +aarch64 = ssubl +generate int16x8_t:int16x8_t:int32x4_t + +/// Signed Subtract Long +name = vsubl_high +no-q +multi_fn = simd_shuffle2!, c:int32x2_t, a, a, [2, 3] +multi_fn = simd_cast, d:out_t, c +multi_fn = simd_shuffle2!, e:int32x2_t, b, b, [2, 3] +multi_fn = simd_cast, f:out_t, e +multi_fn = simd_sub, d, f + +a = 12, 13, 14, 15 +b = 6, 6, 8, 8 +validate 6, 7 + +aarch64 = ssubl +generate int32x4_t:int32x4_t:int64x2_t + +/// Unsigned Subtract Long +name = vsubl_high +no-q +multi_fn = simd_shuffle8!, c:uint8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15] +multi_fn = simd_cast, d:out_t, c +multi_fn = simd_shuffle8!, e:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15] +multi_fn = simd_cast, f:out_t, e +multi_fn = simd_sub, d, f + +a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +b = 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2 +validate 6, 7, 8, 9, 10, 11, 12, 13 + +aarch64 = usubl +generate uint8x16_t:uint8x16_t:uint16x8_t + +/// Unsigned Subtract Long +name = vsubl_high +no-q +multi_fn = simd_shuffle4!, c:uint16x4_t, a, a, [4, 5, 6, 7] +multi_fn = simd_cast, d:out_t, c +multi_fn = simd_shuffle4!, e:uint16x4_t, b, b, [4, 5, 6, 7] +multi_fn = simd_cast, f:out_t, e +multi_fn = simd_sub, d, f + +a = 8, 9, 10, 11, 12, 13, 14, 15 +b = 6, 6, 6, 6, 8, 8, 8, 8 +validate 4, 5, 6, 7 + +aarch64 = usubl +generate uint16x8_t:uint16x8_t:uint32x4_t + +/// Unsigned Subtract Long +name = vsubl_high +no-q +multi_fn = simd_shuffle2!, c:uint32x2_t, a, a, [2, 3] +multi_fn = simd_cast, d:out_t, c +multi_fn = simd_shuffle2!, e:uint32x2_t, b, b, [2, 3] +multi_fn = simd_cast, f:out_t, e +multi_fn = simd_sub, d, f + +a = 12, 13, 14, 15 +b = 6, 6, 8, 8 +validate 6, 7 + +aarch64 = usubl +generate uint32x4_t:uint32x4_t:uint64x2_t + +/// Bit clear and exclusive OR +name = vbcax +a = 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0 +b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +c = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 +validate 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 +target = sha3 + +aarch64 = bcax +link-aarch64 = llvm.aarch64.crypto.bcaxs._EXT_ +generate int8x16_t, int16x8_t, int32x4_t, int64x2_t +link-aarch64 = llvm.aarch64.crypto.bcaxu._EXT_ +generate uint8x16_t, uint16x8_t, uint32x4_t, uint64x2_t + +/// Floating-point complex add +name = vcadd_rot270 +no-q +a = 1., -1., 1., -1. +b = -1., 1., -1., 1. +validate 2., 0., 2., 0. +target = fcma + +aarch64 = fcadd +link-aarch64 = vcadd.rot270._EXT_ +generate float32x2_t +name = vcaddq_rot270 +generate float32x4_t, float64x2_t + +/// Floating-point complex add +name = vcadd_rot90 +no-q +a = 1., -1., 1., -1. +b = -1., 1., -1., 1. +validate 0., -2., 0., -2. +target = fcma + +aarch64 = fcadd +link-aarch64 = vcadd.rot90._EXT_ +generate float32x2_t +name = vcaddq_rot90 +generate float32x4_t, float64x2_t + +/// Floating-point complex multiply accumulate +name = vcmla +a = 1., -1., 1., -1. +b = -1., 1., -1., 1. +c = 1., 1., -1., -1. +validate 0., -2., 2., 0. 
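The `vcadd_rot90`/`vcadd_rot270` blocks treat even lanes as real parts and odd lanes as imaginary parts, and add `b` rotated by ±90° in the complex plane. A per-complex-element sketch, checked against the test vectors in those blocks (the helper names are ours):

```
/// Model of one complex element (re, im) for vcadd:
/// rot90  adds  i * b  -> (a.re - b.im, a.im + b.re)
/// rot270 adds -i * b  -> (a.re + b.im, a.im - b.re)
fn vcadd_rot90(a: (f32, f32), b: (f32, f32)) -> (f32, f32) {
    (a.0 - b.1, a.1 + b.0)
}

fn vcadd_rot270(a: (f32, f32), b: (f32, f32)) -> (f32, f32) {
    (a.0 + b.1, a.1 - b.0)
}

fn main() {
    // a = 1 - i, b = -1 + i, as in the spec's test vectors.
    assert_eq!(vcadd_rot90((1.0, -1.0), (-1.0, 1.0)), (0.0, -2.0));
    assert_eq!(vcadd_rot270((1.0, -1.0), (-1.0, 1.0)), (2.0, 0.0));
}
```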
+target = fcma + +aarch64 = fcmla +link-aarch64 = vcmla.rot0._EXT_ +generate float32x2_t, float32x4_t, float64x2_t + +/// Floating-point complex multiply accumulate +name = vcmla_rot90 +rot-suffix +a = 1., 1., 1., 1. +b = 1., -1., 1., -1. +c = 1., 1., 1., 1. +validate 2., 0., 2., 0. +target = fcma + +aarch64 = fcmla +link-aarch64 = vcmla.rot90._EXT_ +generate float32x2_t, float32x4_t, float64x2_t + +/// Floating-point complex multiply accumulate +name = vcmla_rot180 +rot-suffix +a = 1., 1., 1., 1. +b = 1., -1., 1., -1. +c = 1., 1., 1., 1. +validate 0., 0., 0., 0. +target = fcma + +aarch64 = fcmla +link-aarch64 = vcmla.rot180._EXT_ +generate float32x2_t, float32x4_t, float64x2_t + +/// Floating-point complex multiply accumulate +name = vcmla_rot270 +rot-suffix +a = 1., 1., 1., 1. +b = 1., -1., 1., -1. +c = 1., 1., 1., 1. +validate 0., 2., 0., 2. +target = fcma + +aarch64 = fcmla +link-aarch64 = vcmla.rot270._EXT_ +generate float32x2_t, float32x4_t, float64x2_t + +/// Floating-point complex multiply accumulate +name = vcmla +in2-lane-suffixes +constn = LANE +multi_fn = static_assert_imm-in2_rot-LANE +multi_fn = simd_shuffle-out_len-!, c:out_t, c, c, {base-2-LANE} +multi_fn = vcmla-self-noext, a, b, c +a = 1., -1., 1., -1. +b = -1., 1., -1., 1. +c = 1., 1., -1., -1. +n = 0 +validate 0., -2., 0., -2. +target = fcma + +aarch64 = fcmla +generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t +generate float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t + +/// Floating-point complex multiply accumulate +name = vcmla_rot90 +rot-lane-suffixes +constn = LANE +multi_fn = static_assert_imm-in2_rot-LANE +multi_fn = simd_shuffle-out_len-!, c:out_t, c, c, {base-2-LANE} +multi_fn = vcmla_rot90-rot-noext, a, b, c +a = 1., -1., 1., -1. +b = -1., 1., -1., 1. +c = 1., 1., -1., -1. +n = 0 +validate 0., 0., 0., 0. +target = fcma + +aarch64 = fcmla +generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t +generate float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t + +/// Floating-point complex multiply accumulate +name = vcmla_rot180 +rot-lane-suffixes +constn = LANE +multi_fn = static_assert_imm-in2_rot-LANE +multi_fn = simd_shuffle-out_len-!, c:out_t, c, c, {base-2-LANE} +multi_fn = vcmla_rot180-rot-noext, a, b, c +a = 1., -1., 1., -1. +b = -1., 1., -1., 1. +c = 1., 1., -1., -1. +n = 0 +validate 2., 0., 2., 0. +target = fcma + +aarch64 = fcmla +generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t +generate float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t + +/// Floating-point complex multiply accumulate +name = vcmla_rot270 +rot-lane-suffixes +constn = LANE +multi_fn = static_assert_imm-in2_rot-LANE +multi_fn = simd_shuffle-out_len-!, c:out_t, c, c, {base-2-LANE} +multi_fn = vcmla_rot270-rot-noext, a, b, c +a = 1., -1., 1., -1. +b = -1., 1., -1., 1. +c = 1., 1., -1., -1. +n = 0 +validate 2., -2., 2., -2. 
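The four `vcmla` rotations each contribute one partial product of a complex multiply-accumulate; rot0 and rot90 together give the full `a + b*c`. A sketch of all four rotations for one complex element, checked against the test vectors in these blocks (pairs are (real, imaginary), names ours):

```
/// Model of vcmla with rotation `rot` for one complex element.
/// rot0 and rot180 use b.re; rot90 and rot270 use b.im.
fn vcmla(rot: u32, a: (f32, f32), b: (f32, f32), c: (f32, f32)) -> (f32, f32) {
    match rot {
        0 => (a.0 + b.0 * c.0, a.1 + b.0 * c.1),
        90 => (a.0 - b.1 * c.1, a.1 + b.1 * c.0),
        180 => (a.0 - b.0 * c.0, a.1 - b.0 * c.1),
        270 => (a.0 + b.1 * c.1, a.1 - b.1 * c.0),
        _ => unreachable!("rotation must be 0, 90, 180 or 270"),
    }
}

fn main() {
    // rot0 test vector: a = 1 - i, b = -1 + i, c = 1 + i  ->  -2i.
    assert_eq!(vcmla(0, (1.0, -1.0), (-1.0, 1.0), (1.0, 1.0)), (0.0, -2.0));
    // rot90 test vector: a = 1 + i, b = 1 - i, c = 1 + i  ->  2.
    assert_eq!(vcmla(90, (1.0, 1.0), (1.0, -1.0), (1.0, 1.0)), (2.0, 0.0));
}
```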
+target = fcma + +aarch64 = fcmla +generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t +generate float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t + +/// Dot product arithmetic +name = vdot +out-suffix +a = 1, 2, 1, 2 +b = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 +c = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 +validate 31, 176, 31, 176 +target = dotprod + +aarch64 = sdot +link-aarch64 = sdot._EXT_._EXT3_ +generate int32x2_t:int8x8_t:int8x8_t:int32x2_t, int32x4_t:int8x16_t:int8x16_t:int32x4_t + +aarch64 = udot +link-aarch64 = udot._EXT_._EXT3_ +generate uint32x2_t:uint8x8_t:uint8x8_t:uint32x2_t, uint32x4_t:uint8x16_t:uint8x16_t:uint32x4_t + +/// Dot product arithmetic +name = vdot +out-lane-suffixes +constn = LANE +multi_fn = static_assert_imm-in2_dot-LANE +multi_fn = simd_shuffle-in_len-!, c:in_t, c, c, {base-4-LANE} +multi_fn = vdot-out-noext, a, b, c +a = 1, 2, 1, 2 +b = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 +c = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 +n = 0 +validate 31, 72, 31, 72 +target = dotprod + +aarch64 = sdot +generate int32x2_t:int8x8_t:int8x8_t:int32x2_t, int32x2_t:int8x8_t:int8x16_t:int32x2_t +generate int32x4_t:int8x16_t:int8x8_t:int32x4_t, int32x4_t:int8x16_t:int8x16_t:int32x4_t + +aarch64 = udot +generate uint32x2_t:uint8x8_t:uint8x8_t:uint32x2_t, uint32x2_t:uint8x8_t:uint8x16_t:uint32x2_t +generate uint32x4_t:uint8x16_t:uint8x8_t:uint32x4_t, uint32x4_t:uint8x16_t:uint8x16_t:uint32x4_t + +/// Maximum (vector) +name = vmax +a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +b = 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 +validate 16, 15, 14, 13, 12, 11, 10, 9, 9, 10, 11, 12, 13, 14, 15, 16 + +arm = vmax +aarch64 = smax +link-arm = vmaxs._EXT_ +link-aarch64 = smax._EXT_ +generate int*_t + +arm = vmax +aarch64 = umax +link-arm = vmaxu._EXT_ +link-aarch64 = umax._EXT_ +generate uint*_t + +/// Maximum (vector) +name = vmax +a = 1.0, -2.0, 3.0, -4.0 +b = 0.0, 3.0, 2.0, 8.0 +validate 1.0, 3.0, 3.0, 8.0 + +aarch64 = fmax +link-aarch64 = fmax._EXT_ +generate float64x*_t + +arm = vmax +aarch64 = fmax +link-arm = vmaxs._EXT_ +link-aarch64 = fmax._EXT_ +generate float*_t + +/// Floating-point Maximum Number (vector) +name = vmaxnm +a = 1.0, 2.0, 3.0, -4.0 +b = 8.0, 16.0, -1.0, 6.0 +validate 8.0, 16.0, 3.0, 6.0 + +aarch64 = fmaxnm +link-aarch64 = fmaxnm._EXT_ +generate float64x*_t + +target = fp-armv8 +arm = vmaxnm +aarch64 = fmaxnm +link-arm = vmaxnm._EXT_ +link-aarch64 = fmaxnm._EXT_ +generate float*_t + +/// Floating-point maximum number across vector +name = vmaxnmv +a = 1., 2., 0., 1. +validate 2. + +aarch64 = fmaxnmp +link-aarch64 = fmaxnmv._EXT2_._EXT_ +generate float32x2_t:f32, float64x2_t:f64 +aarch64 = fmaxnmv +generate float32x4_t:f32 + +/// Floating-point Maximum Number Pairwise (vector). +name = vpmaxnm +a = 1.0, 2.0 +b = 6.0, -3.0 +validate 2.0, 6.0 +aarch64 = fmaxnmp +link-aarch64 = fmaxnmp._EXT_ +generate float32x2_t:float32x2_t:float32x2_t, float64x2_t:float64x2_t:float64x2_t + +/// Floating-point Maximum Number Pairwise (vector). +name = vpmaxnm +a = 1.0, 2.0, 3.0, -4.0 +b = 8.0, 16.0, -1.0, 6.0 +validate 2.0, 3.0, 16.0, 6.0 +aarch64 = fmaxnmp +link-aarch64 = fmaxnmp._EXT_ +generate float32x4_t:float32x4_t:float32x4_t + +/// Floating-point maximum number pairwise +name = vpmaxnm +out-suffix +a = 1., 2. +validate 2. 
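The `vdot` blocks earlier in this group accumulate, into each 32-bit lane, the dot product of the corresponding group of four 8-bit elements from `b` and `c`. A model for the `int32x2_t`/`int8x8_t` shape, matching the 31 and 176 test values (the function name is illustrative):

```
/// Model of vdot: each 32-bit accumulator lane gains the dot product
/// of its group of four 8-bit elements from b and c.
fn vdot_model(a: [i32; 2], b: [i8; 8], c: [i8; 8]) -> [i32; 2] {
    let mut out = a;
    for lane in 0..2 {
        for k in 0..4 {
            out[lane] += b[4 * lane + k] as i32 * c[4 * lane + k] as i32;
        }
    }
    out
}

fn main() {
    let b = [1, 2, 3, 4, 5, 6, 7, 8];
    // 1 + (1 + 4 + 9 + 16) = 31 and 2 + (25 + 36 + 49 + 64) = 176, as in the spec.
    assert_eq!(vdot_model([1, 2], b, b), [31, 176]);
}
```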
+ +aarch64 = fmaxnmp +link-aarch64 = fmaxnmv._EXT2_._EXT_ +generate float32x2_t:f32 +name = vpmaxnmq +generate float64x2_t:f64 + +/// Floating-point maximum pairwise +name = vpmax +out-suffix +a = 1., 2. +validate 2. + +aarch64 = fmaxp +link-aarch64 = fmaxv._EXT2_._EXT_ +generate float32x2_t:f32 +name = vpmaxq +generate float64x2_t:f64 + +/// Minimum (vector) +name = vmin +a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +b = 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 +validate 1, 2, 3, 4, 5, 6, 7, 8, 8, 7, 6, 5, 4, 3, 2, 1 + +arm = vmin +aarch64 = smin +link-arm = vmins._EXT_ +link-aarch64 = smin._EXT_ +generate int*_t + +arm = vmin +aarch64 = umin +link-arm = vminu._EXT_ +link-aarch64 = umin._EXT_ +generate uint*_t + +/// Minimum (vector) +name = vmin +a = 1.0, -2.0, 3.0, -4.0 +b = 0.0, 3.0, 2.0, 8.0 +validate 0.0, -2.0, 2.0, -4.0 + +aarch64 = fmin +link-aarch64 = fmin._EXT_ +generate float64x*_t + +arm = vmin +aarch64 = fmin +link-arm = vmins._EXT_ +link-aarch64 = fmin._EXT_ +generate float*_t + +/// Floating-point Minimum Number (vector) +name = vminnm +a = 1.0, 2.0, 3.0, -4.0 +b = 8.0, 16.0, -1.0, 6.0 +validate 1.0, 2.0, -1.0, -4.0 + +aarch64 = fminnm +link-aarch64 = fminnm._EXT_ +generate float64x*_t + +target = fp-armv8 +arm = vminnm +aarch64 = fminnm +link-arm = vminnm._EXT_ +link-aarch64 = fminnm._EXT_ +generate float*_t + +/// Floating-point minimum number across vector +name = vminnmv +a = 1., 0., 2., 3. +validate 0. + +aarch64 = fminnmp +link-aarch64 = fminnmv._EXT2_._EXT_ +generate float32x2_t:f32, float64x2_t:f64 +aarch64 = fminnmv +generate float32x4_t:f32 + +/// Vector move +name = vmovl_high +no-q +multi_fn = simd_shuffle-out_len-!, a:half, a, a, {asc-halflen-halflen} +multi_fn = vmovl-noqself-noext, a +a = 1, 2, 3, 4, 3, 4, 5, 6, 3, 4, 5, 6, 7, 8, 9, 10 +validate 3, 4, 5, 6, 7, 8, 9, 10 + +aarch64 = sxtl2 +generate int8x16_t:int16x8_t, int16x8_t:int32x4_t, int32x4_t:int64x2_t + +aarch64 = uxtl2 +generate uint8x16_t:uint16x8_t, uint16x8_t:uint32x4_t, uint32x4_t:uint64x2_t + +/// Floating-point add pairwise +name = vpadd +a = 1., 2., 3., 4. +b = 3., 4., 5., 6. +validate 3., 7., 7., 11. + +aarch64 = faddp +link-aarch64 = faddp._EXT_ +generate float32x4_t, float64x2_t + +arm = vpadd +link-arm = vpadd._EXT_ +generate float32x2_t + +/// Floating-point add pairwise +name = vpadd +out-suffix +multi_fn = simd_extract, a1:out_t, a, 0 +multi_fn = simd_extract, a2:out_t, a, 1 +multi_fn = a1 + a2 +a = 1., 2. +validate 3. + +aarch64 = nop +generate float32x2_t:f32, float64x2_t:f64 + +/// Floating-point Minimum Number Pairwise (vector). +name = vpminnm +a = 1.0, 2.0 +b = 6.0, -3.0 +validate 1.0, -3.0 + +aarch64 = fminnmp +link-aarch64 = fminnmp._EXT_ +generate float32x2_t:float32x2_t:float32x2_t, float64x2_t:float64x2_t:float64x2_t + +/// Floating-point Minimum Number Pairwise (vector). +name = vpminnm +a = 1.0, 2.0, 3.0, -4.0 +b = 8.0, 16.0, -1.0, 6.0 +validate 1.0, -4.0, 8.0, -1.0 +aarch64 = fminnmp +link-aarch64 = fminnmp._EXT_ +generate float32x4_t:float32x4_t:float32x4_t + +/// Floating-point minimum number pairwise +name = vpminnm +out-suffix +a = 1., 2. +validate 1. + +aarch64 = fminnmp +link-aarch64 = fminnmv._EXT2_._EXT_ +generate float32x2_t:f32 +name = vpminnmq +generate float64x2_t:f64 + +/// Floating-point minimum pairwise +name = vpmin +out-suffix +a = 1., 2. +validate 1. 
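The pairwise operations in this stretch (`vpadd`, `vpmax`, `vpmin`, `vpminnm`, ...) all reduce adjacent element pairs, first across `a` and then across `b`. A model of the floating-point pairwise add, matching the 3., 7., 7., 11. test values (name ours, for illustration):

```
/// Model of a pairwise add: adjacent pairs of a, then adjacent pairs of b.
fn vpadd_model(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
    [a[0] + a[1], a[2] + a[3], b[0] + b[1], b[2] + b[3]]
}

fn main() {
    assert_eq!(
        vpadd_model([1.0, 2.0, 3.0, 4.0], [3.0, 4.0, 5.0, 6.0]),
        [3.0, 7.0, 7.0, 11.0]
    );
}
```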
+ +aarch64 = fminp +link-aarch64 = fminv._EXT2_._EXT_ +generate float32x2_t:f32 +name = vpminq +generate float64x2_t:f64 + +/// Signed saturating doubling multiply long +name = vqdmull +a = 0, 1, 2, 3, 4, 5, 6, 7 +b = 1, 2, 3, 4, 5, 6, 7, 8 +validate 0, 4, 12, 24, 40, 60, 84, 108 + +aarch64 = sqdmull +link-aarch64 = sqdmull._EXT2_ +arm = vqdmull +link-arm = vqdmull._EXT2_ +generate int16x4_t:int16x4_t:int32x4_t, int32x2_t:int32x2_t:int64x2_t + +/// Signed saturating doubling multiply long +name = vqdmull +multi_fn = vdup_n-in_ntt-noext, a:in_ntt, a +multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b +multi_fn = simd_extract, {vqdmull-in_ntt-noext, a, b}, 0 +a = 2 +b = 3 +validate 12 + +aarch64 = sqdmull +generate i16:i16:i32 + +/// Signed saturating doubling multiply long +name = vqdmull +a = 2 +b = 3 +validate 12 + +aarch64 = sqdmull +link-aarch64 = sqdmulls.scalar +generate i32:i32:i64 + +/// Vector saturating doubling long multiply with scalar +name = vqdmull_n +no-q +multi_fn = vqdmull-in_ntt-noext, a, {vdup_n-in_ntt-noext, b} +a = 2, 4, 6, 8 +b = 2 +validate 8, 16, 24, 32 + +aarch64 = sqdmull +arm = vqdmull +generate int16x4_t:i16:int32x4_t, int32x2_t:i32:int64x2_t + +/// Signed saturating doubling multiply long +name = vqdmull_high +no-q +multi_fn = simd_shuffle-out_len-!, a:half, a, a, {asc-halflen-halflen} +multi_fn = simd_shuffle-out_len-!, b:half, b, b, {asc-halflen-halflen} +multi_fn = vqdmull-noqself-noext, a, b +a = 0, 1, 4, 5, 4, 5, 6, 7 +b = 1, 2, 5, 6, 5, 6, 7, 8 +validate 40, 60, 84, 112 + +aarch64 = sqdmull2 +generate int16x8_t:int16x8_t:int32x4_t, int32x4_t:int32x4_t:int64x2_t + +/// Signed saturating doubling multiply long +name = vqdmull_high_n +no-q +multi_fn = simd_shuffle-out_len-!, a:in_ntt, a, a, {asc-out_len-out_len} +multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b +multi_fn = vqdmull-in_ntt-noext, a, b +a = 0, 2, 8, 10, 8, 10, 12, 14 +b = 2 +validate 32, 40, 48, 56 + +aarch64 = sqdmull2 +generate int16x8_t:i16:int32x4_t, int32x4_t:i32:int64x2_t + +/// Vector saturating doubling long multiply by scalar +name = vqdmull_lane +constn = N +multi_fn = static_assert_imm-in_exp_len-N +multi_fn = simd_shuffle-out_len-!, b:in_t0, b, b, {dup-out_len-N as u32} +multi_fn = vqdmull-noqself-noext, a, b +a = 1, 2, 3, 4 +b = 0, 2, 2, 0, 2, 0, 0, 0 +n = HFLEN +validate 4, 8, 12, 16 + +aarch64 = sqdmull +generate int16x4_t:int16x8_t:int32x4_t, int32x2_t:int32x4_t:int64x2_t + +arm = vqdmull +generate int16x4_t:int16x4_t:int32x4_t, int32x2_t:int32x2_t:int64x2_t + +/// Signed saturating doubling multiply long +name = vqdmullh_lane +constn = N +multi_fn = static_assert_imm-in_exp_len-N +multi_fn = simd_extract, b:in_t0, b, N as u32 +multi_fn = vqdmullh-noqself-noext, a, b +a = 2 +b = 0, 2, 2, 0, 2, 0, 0, 0 +n = HFLEN +validate 8 + +aarch64 = sqdmull +generate i16:int16x4_t:i32, i16:int16x8_t:i32 + +/// Signed saturating doubling multiply long +name = vqdmulls_lane +constn = N +multi_fn = static_assert_imm-in_exp_len-N +multi_fn = simd_extract, b:in_t0, b, N as u32 +multi_fn = vqdmulls-noqself-noext, a, b +a = 2 +b = 0, 2, 2, 0, 2, 0, 0, 0 +n = HFLEN +validate 8 + +aarch64 = sqdmull +generate i32:int32x2_t:i64, i32:int32x4_t:i64 + +/// Signed saturating doubling multiply long +name = vqdmull_high_lane +constn = N +multi_fn = static_assert_imm-in_exp_len-N +multi_fn = simd_shuffle-out_len-!, a:in_t, a, a, {asc-out_len-out_len} +multi_fn = simd_shuffle-out_len-!, b:in_t, b, b, {dup-out_len-N as u32} +multi_fn = vqdmull-self-noext, a, b +a = 0, 1, 4, 5, 4, 5, 6, 7 +b = 0, 2, 2, 0, 2, 0, 0, 0 +n = 
HFLEN +validate 16, 20, 24, 28 + +aarch64 = sqdmull2 +generate int16x8_t:int16x4_t:int32x4_t, int32x4_t:int32x2_t:int64x2_t + +/// Signed saturating doubling multiply long +name = vqdmull_high_lane +constn = N +multi_fn = static_assert_imm-in_exp_len-N +multi_fn = simd_shuffle-out_len-!, a:half, a, a, {asc-out_len-out_len} +multi_fn = simd_shuffle-out_len-!, b:half, b, b, {dup-out_len-N as u32} +multi_fn = vqdmull-noqself-noext, a, b +a = 0, 1, 4, 5, 4, 5, 6, 7 +b = 0, 2, 2, 0, 2, 0, 0, 0 +n = HFLEN +validate 16, 20, 24, 28 + +aarch64 = sqdmull2 +generate int16x8_t:int16x8_t:int32x4_t, int32x4_t:int32x4_t:int64x2_t + +/// Signed saturating doubling multiply-add long +name = vqdmlal +multi_fn = vqadd-out-noext, a, {vqdmull-self-noext, b, c} +a = 1, 1, 1, 1 +b = 1, 2, 3, 4 +c = 2, 2, 2, 2 +validate 5, 9, 13, 17 + +aarch64 = sqdmlal +arm = vqdmlal +generate int32x4_t:int16x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int32x2_t:int64x2_t + +/// Vector widening saturating doubling multiply accumulate with scalar +name = vqdmlal +n-suffix +multi_fn = vqadd-out-noext, a, {vqdmull_n-self-noext, b, c} +a = 1, 1, 1, 1 +b = 1, 2, 3, 4 +c = 2 +validate 5, 9, 13, 17 + +aarch64 = sqdmlal +arm = vqdmlal +generate int32x4_t:int16x4_t:i16:int32x4_t, int64x2_t:int32x2_t:i32:int64x2_t + +/// Signed saturating doubling multiply-add long +name = vqdmlal_high +no-q +multi_fn = vqadd-out-noext, a, {vqdmull_high-noqself-noext, b, c} +a = 1, 2, 3, 4 +b = 0, 1, 4, 5, 4, 5, 6, 7 +c = 1, 2, 5, 6, 5, 6, 7, 8 +validate 41, 62, 87, 116 + +aarch64 = sqdmlal2 +generate int32x4_t:int16x8_t:int16x8_t:int32x4_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t + +/// Signed saturating doubling multiply-add long +name = vqdmlal_high_n +no-q +multi_fn = vqadd-out-noext, a, {vqdmull_high_n-noqself-noext, b, c} +a = 1, 2, 3, 4 +b = 0, 2, 8, 10, 8, 10, 12, 14 +c = 2 +validate 33, 42, 51, 60 + +aarch64 = sqdmlal2 +generate int32x4_t:int16x8_t:i16:int32x4_t, int64x2_t:int32x4_t:i32:int64x2_t + +/// Vector widening saturating doubling multiply accumulate with scalar +name = vqdmlal_lane +in2-suffix +constn = N +multi_fn = static_assert_imm-in2_exp_len-N +multi_fn = vqadd-out-noext, a, {vqdmull_lane-in2-::<N>, b, c} +a = 1, 2, 3, 4 +b = 1, 2, 3, 4 +c = 0, 2, 2, 0, 2, 0, 0, 0 +n = HFLEN +validate 5, 10, 15, 20 + +aarch64 = sqdmlal +generate int32x4_t:int16x4_t:int16x8_t:int32x4_t, int64x2_t:int32x2_t:int32x4_t:int64x2_t + +arm = vqdmlal +generate int32x4_t:int16x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int32x2_t:int64x2_t + +/// Signed saturating doubling multiply-add long +name = vqdmlal_high_lane +in2-suffix +constn = N +multi_fn = static_assert_imm-in2_exp_len-N +multi_fn = vqadd-out-noext, a, {vqdmull_high_lane-in2-::<N>, b, c} +a = 1, 2, 3, 4 +b = 0, 1, 4, 5, 4, 5, 6, 7 +c = 0, 2, 0, 0, 0, 0, 0, 0 +n = 1 +validate 17, 22, 27, 32 + +aarch64 = sqdmlal2 +generate int32x4_t:int16x8_t:int16x4_t:int32x4_t, int32x4_t:int16x8_t:int16x8_t:int32x4_t, int64x2_t: int32x4_t:int32x2_t:int64x2_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t + +/// Signed saturating doubling multiply-add long +name = vqdmlal +multi_fn = vqdmull-in_ntt-noext, x:out_long_ntt, {vdup_n-in_ntt-noext, b}, {vdup_n-in_ntt-noext, c} +multi_fn = vqadd-out-noext, a, {simd_extract, x, 0} +a = 1 +b = 1 +c = 2 +validate 5 + +aarch64 = sqdmull +generate i32:i16:i16:i32, i64:i32:i32:i64 + +/// Signed saturating doubling multiply-add long +name = vqdmlalh_lane +in2-suffix +constn = LANE +multi_fn = static_assert_imm-in2_exp_len-LANE +multi_fn = vqdmlal-self-noext, a, b, {simd_extract, c, 
LANE as u32} +a = 1 +b = 1 +c = 2, 1, 1, 1, 1, 1, 1, 1 +n = 0 +validate 5 + +aarch64 = sqdmlal +generate i32:i16:int16x4_t:i32, i32:i16:int16x8_t:i32 +name = vqdmlals_lane +aarch64 = sqdmull +generate i64:i32:int32x2_t:i64, i64:i32:int32x4_t:i64 + +/// Signed saturating doubling multiply-subtract long +name = vqdmlsl +multi_fn = vqsub-out-noext, a, {vqdmull-self-noext, b, c} +a = 3, 7, 11, 15 +b = 1, 2, 3, 4 +c = 2, 2, 2, 2 +validate -1, -1, -1, -1 + +aarch64 = sqdmlsl +arm = vqdmlsl +generate int32x4_t:int16x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int32x2_t:int64x2_t + +/// Vector widening saturating doubling multiply subtract with scalar +name = vqdmlsl +n-suffix +multi_fn = vqsub-out-noext, a, {vqdmull_n-self-noext, b, c} +a = 3, 7, 11, 15 +b = 1, 2, 3, 4 +c = 2 +validate -1, -1, -1, -1 + +aarch64 = sqdmlsl +arm = vqdmlsl +generate int32x4_t:int16x4_t:i16:int32x4_t, int64x2_t:int32x2_t:i32:int64x2_t + +/// Signed saturating doubling multiply-subtract long +name = vqdmlsl_high +no-q +multi_fn = vqsub-out-noext, a, {vqdmull_high-noqself-noext, b, c} +a = 39, 58, 81, 108 +b = 0, 1, 4, 5, 4, 5, 6, 7 +c = 1, 2, 5, 6, 5, 6, 7, 8 +validate -1, -2, -3, -4 + +aarch64 = sqdmlsl2 +generate int32x4_t:int16x8_t:int16x8_t:int32x4_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t + +/// Signed saturating doubling multiply-subtract long +name = vqdmlsl_high_n +no-q +multi_fn = vqsub-out-noext, a, {vqdmull_high_n-noqself-noext, b, c} +a = 31, 38, 45, 52 +b = 0, 2, 8, 10, 8, 10, 12, 14 +c = 2 +validate -1, -2, -3, -4 + +aarch64 = sqdmlsl2 +generate int32x4_t:int16x8_t:i16:int32x4_t, int64x2_t:int32x4_t:i32:int64x2_t + +/// Vector widening saturating doubling multiply subtract with scalar +name = vqdmlsl_lane +in2-suffix +constn = N +multi_fn = static_assert_imm-in2_exp_len-N +multi_fn = vqsub-out-noext, a, {vqdmull_lane-in2-::<N>, b, c} +a = 3, 6, 9, 12 +b = 1, 2, 3, 4 +c = 0, 2, 2, 0, 2, 0, 0, 0 +n = HFLEN +validate -1, -2, -3, -4 + +aarch64 = sqdmlsl +generate int32x4_t:int16x4_t:int16x8_t:int32x4_t, int64x2_t:int32x2_t:int32x4_t:int64x2_t + +arm = vqdmlsl +generate int32x4_t:int16x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int32x2_t:int64x2_t + +/// Signed saturating doubling multiply-subtract long +name = vqdmlsl_high_lane +in2-suffix +constn = N +multi_fn = static_assert_imm-in2_exp_len-N +multi_fn = vqsub-out-noext, a, {vqdmull_high_lane-in2-::<N>, b, c} +a = 15, 18, 21, 24 +b = 0, 1, 4, 5, 4, 5, 6, 7 +c = 0, 2, 0, 0, 0, 0, 0, 0 +n = 1 +validate -1, -2, -3, -4 + +aarch64 = sqdmlsl2 +generate int32x4_t:int16x8_t:int16x4_t:int32x4_t, int32x4_t:int16x8_t:int16x8_t:int32x4_t, int64x2_t: int32x4_t:int32x2_t:int64x2_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t + +/// Signed saturating doubling multiply-subtract long +name = vqdmlsl +multi_fn = vqdmull-in_ntt-noext, x:out_long_ntt, {vdup_n-in_ntt-noext, b}, {vdup_n-in_ntt-noext, c} +multi_fn = vqsub-out-noext, a, {simd_extract, x, 0} +a = 10 +b = 1 +c = 2 +validate 6 + +aarch64 = sqdmull +generate i32:i16:i16:i32, i64:i32:i32:i64 + +/// Signed saturating doubling multiply-subtract long +name = vqdmlslh_lane +in2-suffix +constn = LANE +multi_fn = static_assert_imm-in2_exp_len-LANE +multi_fn = vqdmlsl-self-noext, a, b, {simd_extract, c, LANE as u32} +a = 10 +b = 1 +c = 2, 1, 1, 1, 1, 1, 1, 1 +n = 0 +validate 6 + +aarch64 = sqdmlsl +generate i32:i16:int16x4_t:i32, i32:i16:int16x8_t:i32 +name = vqdmlsls_lane +aarch64 = sqdmull +generate i64:i32:int32x2_t:i64, i64:i32:int32x4_t:i64 + +/// Signed saturating doubling multiply returning high half +name = vqdmulh +a 
= MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX +b = 2, 2, 2, 2, 2, 2, 2, 2 +validate 1, 1, 1, 1, 1, 1, 1, 1 + +aarch64 = sqdmulh +link-aarch64 = sqdmulh._EXT_ +arm = vqdmulh +link-arm = vqdmulh._EXT_ +generate int16x4_t, int16x8_t, int32x2_t, int32x4_t + +/// Signed saturating doubling multiply returning high half +name = vqdmulh +multi_fn = vdup_n-in_ntt-noext, a:in_ntt, a +multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b +multi_fn = simd_extract, {vqdmulh-in_ntt-noext, a, b}, 0 +a = 1 +b = 2 +validate 0 + +aarch64 = sqdmulh +generate i16, i32 + +/// Vector saturating doubling multiply high with scalar +name = vqdmulh_n +out-suffix +multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b +multi_fn = vqdmulh-out-noext, a, b +a = MAX, MAX, MAX, MAX +b = 2 +validate 1, 1, 1, 1 + +aarch64 = sqdmulh +arm = vqdmulh +generate int16x4_t:i16:int16x4_t, int32x2_t:i32:int32x2_t + +/// Vector saturating doubling multiply high with scalar +name = vqdmulhq_n +no-q +multi_fn = vdupq_n-in_ntt-noext, b:out_t, b +multi_fn = vqdmulh-out-noext, a, b +a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX +b = 2 +validate 1, 1, 1, 1, 1, 1, 1, 1 + +aarch64 = sqdmulh +arm = vqdmulh +generate int16x8_t:i16:int16x8_t, int32x4_t:i32:int32x4_t + +/// Signed saturating doubling multiply returning high half +name = vqdmulhh_lane +constn = N +multi_fn = static_assert_imm-in_exp_len-N +multi_fn = simd_extract, b:in_t0, b, N as u32 +multi_fn = vqdmulhh-out_ntt-noext, a, b +a = 2 +b = 0, 0, MAX, 0, 0, 0, 0, 0 +n = 2 +validate 1 + +aarch64 = sqdmulh +generate i16:int16x4_t:i16, i16:int16x8_t:i16 + +/// Signed saturating doubling multiply returning high half +name = vqdmulhs_lane +constn = N +multi_fn = static_assert_imm-in_exp_len-N +multi_fn = simd_extract, b:in_t0, b, N as u32 +multi_fn = vqdmulhs-out_ntt-noext, a, b +a = 2 +b = 0, MAX, 0, 0 +n = 1 +validate 1 + +aarch64 = sqdmulh +generate i32:int32x2_t:i32, i32:int32x4_t:i32 + +/// Vector saturating doubling multiply high by scalar +name = vqdmulh +lane-suffixes +constn = LANE +multi_fn = static_assert_imm-in2_exp_len-LANE +multi_fn = vqdmulh-out-noext, a, {vdup-nout-noext, {simd_extract, b, LANE as u32}} +a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX +b = 2, 1, 1, 1, 1, 1, 1, 1 +n = 0 +validate 1, 1, 1, 1, 1, 1, 1, 1 + +aarch64 = sqdmulh +generate int16x4_t, int16x8_t:int16x4_t:int16x8_t +generate int32x2_t, int32x4_t:int32x2_t:int32x4_t +arm = vqdmulh +generate int16x8_t, int16x4_t:int16x8_t:int16x4_t +generate int32x4_t, int32x2_t:int32x4_t:int32x2_t + +/// Signed saturating extract narrow +name = vqmovn +no-q +a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX +validate MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX + +aarch64 = sqxtn +link-aarch64 = sqxtn._EXT2_ +arm = vqmovn +link-arm = vqmovns._EXT2_ +generate int16x8_t:int8x8_t, int32x4_t:int16x4_t, int64x2_t:int32x2_t + +/// Unsigned saturating extract narrow +name = vqmovn +no-q +a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX +validate MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX + +aarch64 = uqxtn +link-aarch64 = uqxtn._EXT2_ +arm = vqmovn +link-arm = vqmovnu._EXT2_ +generate uint16x8_t:uint8x8_t, uint32x4_t:uint16x4_t, uint64x2_t:uint32x2_t + +/// Saturating extract narrow +name = vqmovn +multi_fn = simd_extract, {vqmovn-in_ntt-noext, {vdupq_n-in_ntt-noext, a}}, 0 +a = 1 +validate 1 + +aarch64 = sqxtn +generate i16:i8, i32:i16 +aarch64 = uqxtn +generate u16:u8, u32:u16 + +/// Saturating extract narrow +name = vqmovn +a = 1 +validate 1 + +aarch64 = sqxtn +link-aarch64 = scalar.sqxtn._EXT2_._EXT_ +generate i64:i32 + +aarch64 = uqxtn +link-aarch64 = 
scalar.uqxtn._EXT2_._EXT_ +generate u64:u32 + +/// Signed saturating extract narrow +name = vqmovn_high +no-q +multi_fn = simd_shuffle-out_len-!, a, {vqmovn-noqself-noext, b}, {asc-0-out_len} +a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX +b = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX +validate MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX + +aarch64 = sqxtn2 +generate int8x8_t:int16x8_t:int8x16_t, int16x4_t:int32x4_t:int16x8_t, int32x2_t:int64x2_t:int32x4_t +aarch64 = uqxtn2 +generate uint8x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint16x8_t, uint32x2_t:uint64x2_t:uint32x4_t + +/// Signed saturating extract unsigned narrow +name = vqmovun +no-q +a = -1, -1, -1, -1, -1, -1, -1, -1 +validate 0, 0, 0, 0, 0, 0, 0, 0 + +aarch64 = sqxtun +link-aarch64 = sqxtun._EXT2_ +arm = vqmovun +link-arm = vqmovnsu._EXT2_ +generate int16x8_t:uint8x8_t, int32x4_t:uint16x4_t, int64x2_t:uint32x2_t + +/// Signed saturating extract unsigned narrow +name = vqmovun +multi_fn = simd_extract, {vqmovun-in_ntt-noext, {vdupq_n-in_ntt-noext, a}}, 0 +a = 1 +validate 1 + +aarch64 = sqxtun +generate i16:u8, i32:u16, i64:u32 + +/// Signed saturating extract unsigned narrow +name = vqmovun_high +no-q +multi_fn = simd_shuffle-out_len-!, a, {vqmovun-noqself-noext, b}, {asc-0-out_len} +a = 0, 0, 0, 0, 0, 0, 0, 0 +b = -1, -1, -1, -1, -1, -1, -1, -1 +validate 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + +aarch64 = sqxtun2 +generate uint8x8_t:int16x8_t:uint8x16_t, uint16x4_t:int32x4_t:uint16x8_t, uint32x2_t:int64x2_t:uint32x4_t + +/// Signed saturating rounding doubling multiply returning high half +name = vqrdmulh +a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX +b = 2, 2, 2, 2, 2, 2, 2, 2 +validate 2, 2, 2, 2, 2, 2, 2, 2 + +aarch64 = sqrdmulh +link-aarch64 = sqrdmulh._EXT_ +arm = vqrdmulh +link-arm = vqrdmulh._EXT_ +generate int16x4_t, int16x8_t, int32x2_t, int32x4_t + +/// Signed saturating rounding doubling multiply returning high half +name = vqrdmulh +multi_fn = simd_extract, {vqrdmulh-in_ntt-noext, {vdup_n-in_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0 +a = 1 +b = 2 +validate 0 + +aarch64 = sqrdmulh +generate i16, i32 + +/// Vector saturating rounding doubling multiply high with scalar +name = vqrdmulh +out-n-suffix +multi_fn = vqrdmulh-out-noext, a, {vdup-nout-noext, b} +a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX +b = 2 +validate 2, 2, 2, 2, 2, 2, 2, 2 + +aarch64 = sqrdmulh +arm = vqrdmulh +generate int16x4_t:i16:int16x4_t, int16x8_t:i16:int16x8_t, int32x2_t:i32:int32x2_t, int32x4_t:i32:int32x4_t + +/// Vector rounding saturating doubling multiply high by scalar +name = vqrdmulh +lane-suffixes +constn = LANE +multi_fn = static_assert_imm-in_exp_len-LANE +multi_fn = simd_shuffle-out_len-!, b:out_t, b, b, {dup-out_len-LANE as u32} +multi_fn = vqrdmulh-out-noext, a, b +a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX +b = 0, 2, 0, 0, 0, 0, 0, 0, +n = 1 +validate 2, 2, 2, 2, 2, 2, 2, 2 + +aarch64 = sqrdmulh +arm = vqrdmulh +generate int16x4_t, int16x4_t:int16x8_t:int16x4_t, int16x8_t:int16x4_t:int16x8_t, int16x8_t +generate int32x2_t, int32x2_t:int32x4_t:int32x2_t, int32x4_t:int32x2_t:int32x4_t, int32x4_t + +/// Signed saturating rounding doubling multiply returning high half +name = vqrdmulh +lane-suffixes +constn = LANE +multi_fn = static_assert_imm-in_exp_len-LANE +multi_fn = vqrdmulh-out-noext, a, {simd_extract, b, LANE as u32} +a = 1 +b = 0, 2, 0, 0, 0, 0, 0, 0, +n = 1 +validate 0 + +aarch64 = sqrdmulh +generate i16:int16x4_t:i16, i16:int16x8_t:i16, i32:int32x2_t:i32, i32:int32x4_t:i32 + +/// 
Signed saturating rounding doubling multiply accumulate returning high half +name = vqrdmlah +a = 1, 1, 1, 1, 1, 1, 1, 1 +b = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX +c = 2, 2, 2, 2, 2, 2, 2, 2 +validate 3, 3, 3, 3, 3, 3, 3, 3 + +aarch64 = sqrdmlah +link-aarch64 = sqrdmlah._EXT_ +target = rdm +generate int16x4_t, int16x8_t, int32x2_t, int32x4_t + +/// Signed saturating rounding doubling multiply accumulate returning high half +name = vqrdmlah +multi_fn = vdup_n-in_ntt-noext, a:in_ntt, a +multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b +multi_fn = vdup_n-in_ntt-noext, c:in_ntt, c +multi_fn = simd_extract, {vqrdmlah-in_ntt-noext, a, b, c}, 0 +a = 1 +b = 1 +c = 2 +validate 1 + +aarch64 = sqrdmlah +target = rdm +generate i16, i32 + +/// Signed saturating rounding doubling multiply accumulate returning high half +name = vqrdmlah +in2-lane-suffixes +constn = LANE +multi_fn = static_assert_imm-in2_exp_len-LANE +multi_fn = simd_shuffle-out_len-!, c:out_t, c, c, {dup-out_len-LANE as u32} +multi_fn = vqrdmlah-out-noext, a, b, c +a = 1, 1, 1, 1, 1, 1, 1, 1 +b = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX +c = 0, 2, 0, 0, 0, 0, 0, 0 +n = 1 +validate 3, 3, 3, 3, 3, 3, 3, 3 + +aarch64 = sqrdmlah +target = rdm +generate int16x4_t, int16x4_t:int16x4_t:int16x8_t:int16x4_t, int16x8_t:int16x8_t:int16x4_t:int16x8_t, int16x8_t +generate int32x2_t, int32x2_t:int32x2_t:int32x4_t:int32x2_t, int32x4_t:int32x4_t:int32x2_t:int32x4_t, int32x4_t + +/// Signed saturating rounding doubling multiply accumulate returning high half +name = vqrdmlah +in2-lane-suffixes +constn = LANE +multi_fn = static_assert_imm-in2_exp_len-LANE +multi_fn = vqrdmlah-self-noext, a, b, {simd_extract, c, LANE as u32} +a = 1 +b = 1 +c = 0, 2, 0, 0, 0, 0, 0, 0 +n = 1 +validate 1 + +aarch64 = sqrdmlah +target = rdm +generate i16:i16:int16x4_t:i16, i16:i16:int16x8_t:i16, i32:i32:int32x2_t:i32, i32:i32:int32x4_t:i32 + +/// Signed saturating rounding doubling multiply subtract returning high half +name = vqrdmlsh +link-aarch64 = sqrdmlsh._EXT_ +a = 1, 1, 1, 1, 1, 1, 1, 1 +b = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX +c = 2, 2, 2, 2, 2, 2, 2, 2 +validate -1, -1, -1, -1, -1, -1, -1, -1 + +aarch64 = sqrdmlsh +target = rdm +generate int16x4_t, int16x8_t, int32x2_t, int32x4_t + +/// Signed saturating rounding doubling multiply subtract returning high half +name = vqrdmlsh +multi_fn = vdup_n-in_ntt-noext, a:in_ntt, a +multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b +multi_fn = vdup_n-in_ntt-noext, c:in_ntt, c +multi_fn = simd_extract, {vqrdmlsh-in_ntt-noext, a, b, c}, 0 +a = 1 +b = 1 +c = 2 +validate 1 + +aarch64 = sqrdmlsh +target = rdm +generate i16, i32 + +/// Signed saturating rounding doubling multiply subtract returning high half +name = vqrdmlsh +in2-lane-suffixes +constn = LANE +multi_fn = static_assert_imm-in2_exp_len-LANE +multi_fn = simd_shuffle-out_len-!, c:out_t, c, c, {dup-out_len-LANE as u32} +multi_fn = vqrdmlsh-out-noext, a, b, c +a = 1, 1, 1, 1, 1, 1, 1, 1 +b = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX +c = 0, 2, 0, 0, 0, 0, 0, 0 +n = 1 +validate -1, -1, -1, -1, -1, -1, -1, -1 + +aarch64 = sqrdmlsh +target = rdm +generate int16x4_t, int16x4_t:int16x4_t:int16x8_t:int16x4_t, int16x8_t:int16x8_t:int16x4_t:int16x8_t, int16x8_t +generate int32x2_t, int32x2_t:int32x2_t:int32x4_t:int32x2_t, int32x4_t:int32x4_t:int32x2_t:int32x4_t, int32x4_t + +/// Signed saturating rounding doubling multiply subtract returning high half +name = vqrdmlsh +in2-lane-suffixes +constn = LANE +multi_fn = static_assert_imm-in2_exp_len-LANE +multi_fn = vqrdmlsh-self-noext, a, b, 
{simd_extract, c, LANE as u32} +a = 1 +b = 1 +c = 0, 2, 0, 0, 0, 0, 0, 0 +n = 1 +validate 1 + +aarch64 = sqrdmlsh +target = rdm +generate i16:i16:int16x4_t:i16, i16:i16:int16x8_t:i16, i32:i32:int32x2_t:i32, i32:i32:int32x4_t:i32 + +/// Signed saturating rounding shift left +name = vqrshl +a = 2, MIN, MAX, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +validate 8, MIN, MAX, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 + +aarch64 = sqrshl +link-aarch64 = sqrshl._EXT_ +generate i32, i64 + +arm = vqrshl +link-arm = vqrshifts._EXT_ +generate int*_t, int64x*_t + +/// Signed saturating rounding shift left +name = vqrshl +multi_fn = vdup_n-in_ntt-noext, a:in_ntt, a +multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b +multi_fn = simd_extract, {vqrshl-in_ntt-noext, a, b}, 0 +a = 1 +b = 2 +validate 4 + +aarch64 = sqrshl +generate i8, i16 + +/// Unsigned saturating rounding shift left +name = vqrshl +out-suffix +a = 2, MIN, MAX, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +validate 8, 0, MAX, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 + +aarch64 = uqrshl +link-aarch64 = uqrshl._EXT_ +generate u32:i32:u32, u64:i64:u64 + +arm = vqrshl +link-arm = vqrshiftu._EXT_ +generate uint8x8_t:int8x8_t:uint8x8_t, uint8x16_t:int8x16_t:uint8x16_t, uint16x4_t:int16x4_t:uint16x4_t, uint16x8_t:int16x8_t:uint16x8_t +generate uint32x2_t:int32x2_t:uint32x2_t, uint32x4_t:int32x4_t:uint32x4_t, uint64x1_t:int64x1_t:uint64x1_t, uint64x2_t:int64x2_t:uint64x2_t + +/// Unsigned saturating rounding shift left +name = vqrshl +out-suffix +multi_fn = vdup_n-out_ntt-noext, a:out_ntt, a +multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b +multi_fn = simd_extract, {vqrshl-out_ntt-noext, a, b}, 0 +a = 1 +b = 2 +validate 4 + +aarch64 = uqrshl +generate u8:i8:u8, u16:i16:u16 + +/// Signed saturating rounded shift right narrow +name = vqrshrn +noq-n-suffix +constn = N +multi_fn = static_assert-N-1-halfbits +a = MIN, 4, 8, 12, 16, 20, 24, 28 +n = 2 +validate MIN, 1, 2, 3, 4, 5, 6, 7 + +aarch64 = sqrshrn +link-aarch64 = sqrshrn._EXT2_ +const-aarch64 = N + +arm = vqrshrn +link-arm = vqrshiftns._EXT2_ +const-arm = -N as ttn +arm-aarch64-separate +generate int16x8_t:int8x8_t, int32x4_t:int16x4_t, int64x2_t:int32x2_t + +/// Signed saturating rounded shift right narrow +name = vqrshrn +noq-n-suffix +constn = N +multi_fn = static_assert-N-1-halfbits +multi_fn = vdupq_n-in_ntt-noext, a:in_long_ntt, a +multi_fn = simd_extract, {vqrshrn_n-in_ntt-::<N>, a}, 0 +a = 4 +n = 2 +validate 1 + +aarch64 = sqrshrn +generate i16:i8, i32:i16, i64:i32 + +/// Signed saturating rounded shift right narrow +name = vqrshrn_high +noq-n-suffix +constn = N +multi_fn = static_assert-N-1-halfbits +multi_fn = simd_shuffle-out_len-!, a, {vqrshrn_n-noqself-::<N>, b}, {asc-0-out_len} +a = 0, 1, 2, 3, 2, 3, 6, 7 +b = 8, 12, 24, 28, 48, 52, 56, 60 +n = 2 +validate 0, 1, 2, 3, 2, 3, 6, 7, 2, 3, 6, 7, 12, 13, 14, 15 + +aarch64 = sqrshrn2 +generate int8x8_t:int16x8_t:int8x16_t, int16x4_t:int32x4_t:int16x8_t, int32x2_t:int64x2_t:int32x4_t + +/// Unsigned saturating rounded shift right narrow +name = vqrshrn +noq-n-suffix +constn = N +multi_fn = static_assert-N-1-halfbits +a = MIN, 4, 8, 12, 16, 20, 24, 28 +n = 2 +validate 0, 1, 2, 3, 4, 5, 6, 7 + +aarch64 = uqrshrn +link-aarch64 = uqrshrn._EXT2_ +const-aarch64 = N + +arm = vqrshrn +link-arm = vqrshiftnu._EXT2_ +const-arm = -N as ttn +arm-aarch64-separate +generate uint16x8_t:uint8x8_t, uint32x4_t:uint16x4_t, 
uint64x2_t:uint32x2_t + +/// Unsigned saturating rounded shift right narrow +name = vqrshrn +noq-n-suffix +constn = N +multi_fn = static_assert-N-1-halfbits +multi_fn = vdupq_n-in_ntt-noext, a:in_long_ntt, a +multi_fn = simd_extract, {vqrshrn_n-in_ntt-::<N>, a}, 0 +a = 4 +n = 2 +validate 1 + +aarch64 = uqrshrn +generate u16:u8, u32:u16, u64:u32 + +/// Unsigned saturating rounded shift right narrow +name = vqrshrn_high +noq-n-suffix +constn = N +multi_fn = static_assert-N-1-halfbits +multi_fn = simd_shuffle-out_len-!, a, {vqrshrn_n-noqself-::<N>, b}, {asc-0-out_len} +a = 0, 1, 2, 3, 2, 3, 6, 7 +b = 8, 12, 24, 28, 48, 52, 56, 60 +n = 2 +validate 0, 1, 2, 3, 2, 3, 6, 7, 2, 3, 6, 7, 12, 13, 14, 15 + +aarch64 = uqrshrn2 +generate uint8x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint16x8_t, uint32x2_t:uint64x2_t:uint32x4_t + +/// Signed saturating rounded shift right unsigned narrow +name = vqrshrun +noq-n-suffix +constn = N +multi_fn = static_assert-N-1-halfbits +a = 0, 4, 8, 12, 16, 20, 24, 28 +n = 2 +validate 0, 1, 2, 3, 4, 5, 6, 7 + +aarch64 = sqrshrun +link-aarch64 = sqrshrun._EXT2_ +const-aarch64 = N + +arm = vqrshrun +link-arm = vqrshiftnsu._EXT2_ +const-arm = -N as ttn +arm-aarch64-separate +generate int16x8_t:uint8x8_t, int32x4_t:uint16x4_t, int64x2_t:uint32x2_t + +/// Signed saturating rounded shift right unsigned narrow +name = vqrshrun +noq-n-suffix +constn = N +multi_fn = static_assert-N-1-halfbits +multi_fn = vdupq_n-in_ntt-noext, a:in_long_ntt, a +multi_fn = simd_extract, {vqrshrun_n-in_ntt-::<N>, a}, 0 +a = 4 +n = 2 +validate 1 + +aarch64 = sqrshrun +generate i16:u8, i32:u16, i64:u32 + +/// Signed saturating rounded shift right unsigned narrow +name = vqrshrun_high +noq-n-suffix +constn = N +multi_fn = static_assert-N-1-halfbits +multi_fn = simd_shuffle-out_len-!, a, {vqrshrun_n-noqself-::<N>, b}, {asc-0-out_len} +a = 0, 1, 2, 3, 2, 3, 6, 7 +b = 8, 12, 24, 28, 48, 52, 56, 60 +n = 2 +validate 0, 1, 2, 3, 2, 3, 6, 7, 2, 3, 6, 7, 12, 13, 14, 15 + +aarch64 = sqrshrun2 +generate uint8x8_t:int16x8_t:uint8x16_t, uint16x4_t:int32x4_t:uint16x8_t, uint32x2_t:int64x2_t:uint32x4_t + +/// Signed saturating shift left +name = vqshl +a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +validate 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 + +aarch64 = sqshl +link-aarch64 = sqshl._EXT_ +generate i64 + +arm = vqshl +link-arm = vqshifts._EXT_ +generate int*_t, int64x*_t + +/// Signed saturating shift left +name = vqshl +multi_fn = vqshl-in_ntt-noext, c:in_ntt, {vdup_n-in_ntt-noext, a}, {vdup_n-in_ntt-noext, b} +multi_fn = simd_extract, c, 0 +a = 1 +b = 2 +validate 4 + +aarch64 = sqshl +generate i8, i16, i32 + +/// Unsigned saturating shift left +name = vqshl +out-suffix +a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +validate 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 + +aarch64 = uqshl +link-aarch64 = uqshl._EXT_ +generate u64:i64:u64 + +arm = vqshl +link-arm = vqshiftu._EXT_ +generate uint8x8_t:int8x8_t:uint8x8_t, uint8x16_t:int8x16_t:uint8x16_t, uint16x4_t:int16x4_t:uint16x4_t, uint16x8_t:int16x8_t:uint16x8_t +generate uint32x2_t:int32x2_t:uint32x2_t, uint32x4_t:int32x4_t:uint32x4_t, uint64x1_t:int64x1_t:uint64x1_t, uint64x2_t:int64x2_t:uint64x2_t + +/// Unsigned saturating shift left +name = vqshl +out-suffix +multi_fn = vqshl-out_ntt-noext, c:out_ntt, {vdup_n-out_ntt-noext, a}, {vdup_n-in_ntt-noext, b} +multi_fn = simd_extract, c, 0 +a = 
1 +b = 2 +validate 4 + +aarch64 = uqshl +generate u8:i8:u8, u16:i16:u16, u32:i32:u32 + +/// Signed saturating shift left +name = vqshl +n-suffix +constn = N +multi_fn = static_assert_imm-out_bits_exp_len-N +multi_fn = vqshl-self-noext, a, {vdup-nself-noext, N as _} +a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +n = 2 +validate 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 + +aarch64 = sqshl +arm = vqshl +generate int*_t, int64x*_t + +/// Signed saturating shift left +name = vqshl +n-suffix +constn = N +multi_fn = static_assert_imm-out_bits_exp_len-N +multi_fn = simd_extract, {vqshl_n-in_ntt-::<N>, {vdup_n-in_ntt-noext, a}}, 0 +a = 1 +n = 2 +validate 4 + +aarch64 = sqshl +generate i8, i16, i32, i64 + +/// Unsigned saturating shift left +name = vqshl +n-suffix +constn = N +multi_fn = static_assert_imm-out_bits_exp_len-N +multi_fn = vqshl-self-noext, a, {vdup-nsigned-noext, N as _} +a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +n = 2 +validate 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 + +aarch64 = uqshl +arm = vqshl +generate uint*_t, uint64x*_t + +/// Unsigned saturating shift left +name = vqshl +n-suffix +constn = N +multi_fn = static_assert_imm-out_bits_exp_len-N +multi_fn = simd_extract, {vqshl_n-in_ntt-::<N>, {vdup_n-in_ntt-noext, a}}, 0 +a = 1 +n = 2 +validate 4 + +aarch64 = uqshl +generate u8, u16, u32, u64 + +/// Signed saturating shift left unsigned +name = vqshlu +n-suffix +constn = N +multi_fn = static_assert_imm-out_bits_exp_len-N +a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +n = 2 +validate 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 +arm-aarch64-separate + +aarch64 = sqshlu +link-aarch64 = sqshlu._EXT_ +const-aarch64 = {dup-in_len-N as ttn} +arm = vqshlu +link-arm = vqshiftsu._EXT_ +const-arm = N as ttn +generate int8x8_t:uint8x8_t, int16x4_t:uint16x4_t, int32x2_t:uint32x2_t, int64x1_t:uint64x1_t +generate int8x16_t:uint8x16_t, int16x8_t:uint16x8_t, int32x4_t:uint32x4_t, int64x2_t:uint64x2_t + +/// Signed saturating shift left unsigned +name = vqshlu +n-suffix +constn = N +multi_fn = static_assert_imm-out_bits_exp_len-N +multi_fn = simd_extract, {vqshlu_n-in_ntt-::<N>, {vdup_n-in_ntt-noext, a}}, 0 +a = 1 +n = 2 +validate 4 + +aarch64 = sqshlu +generate i8:u8, i16:u16, i32:u32, i64:u64 + +/// Signed saturating shift right narrow +name = vqshrn +noq-n-suffix +constn = N +multi_fn = static_assert-N-1-halfbits +a = 0, 4, 8, 12, 16, 20, 24, 28 +n = 2 +validate 0, 1, 2, 3, 4, 5, 6, 7 +arm-aarch64-separate + +aarch64 = sqshrn +link-aarch64 = sqshrn._EXT2_ +const-aarch64 = N +generate i64:i32 + +arm = vqshrn +link-arm = vqshiftns._EXT2_ +const-arm = -N as ttn +generate int16x8_t:int8x8_t, int32x4_t:int16x4_t, int64x2_t:int32x2_t + +/// Signed saturating shift right narrow +name = vqshrn +noq-n-suffix +constn = N +multi_fn = static_assert-N-1-halfbits +multi_fn = simd_extract, {vqshrn_n-in_ntt-::<N>, {vdupq_n-in_ntt-noext, a}}, 0 +a = 4 +n = 2 +validate 1 + +aarch64 = sqshrn +generate i16:i8, i32:i16 + +/// Signed saturating shift right narrow +name = vqshrn_high +noq-n-suffix +constn = N +multi_fn = static_assert-N-1-halfbits +multi_fn = simd_shuffle-out_len-!, a, {vqshrn_n-noqself-::<N>, b}, {asc-0-out_len} +a = 0, 1, 8, 9, 8, 9, 10, 11 +b = 32, 36, 40, 44, 48, 52, 56, 60 +n = 2 +validate 0, 1, 8, 9, 8, 9, 10, 11, 8, 9, 10, 11, 12, 13, 14, 15 + +aarch64 = sqshrn2 +generate int8x8_t:int16x8_t:int8x16_t, int16x4_t:int32x4_t:int16x8_t, int32x2_t:int64x2_t:int32x4_t + +/// Unsigned saturating shift right 
narrow +name = vqshrn +noq-n-suffix +constn = N +multi_fn = static_assert-N-1-halfbits +a = 0, 4, 8, 12, 16, 20, 24, 28 +n = 2 +validate 0, 1, 2, 3, 4, 5, 6, 7 +arm-aarch64-separate + +aarch64 = uqshrn +link-aarch64 = uqshrn._EXT2_ +const-aarch64 = N +generate u64:u32 + +arm = vqshrn +link-arm = vqshiftnu._EXT2_ +const-arm = -N as ttn +generate uint16x8_t:uint8x8_t, uint32x4_t:uint16x4_t, uint64x2_t:uint32x2_t + +/// Unsigned saturating shift right narrow +name = vqshrn +noq-n-suffix +constn = N +multi_fn = static_assert-N-1-halfbits +multi_fn = simd_extract, {vqshrn_n-in_ntt-::<N>, {vdupq_n-in_ntt-noext, a}}, 0 +a = 4 +n = 2 +validate 1 + +aarch64 = uqshrn +generate u16:u8, u32:u16 + +/// Unsigned saturating shift right narrow +name = vqshrn_high +noq-n-suffix +constn = N +multi_fn = static_assert-N-1-halfbits +multi_fn = simd_shuffle-out_len-!, a, {vqshrn_n-noqself-::<N>, b}, {asc-0-out_len} +a = 0, 1, 8, 9, 8, 9, 10, 11 +b = 32, 36, 40, 44, 48, 52, 56, 60 +n = 2 +validate 0, 1, 8, 9, 8, 9, 10, 11, 8, 9, 10, 11, 12, 13, 14, 15 + +aarch64 = uqshrn2 +generate uint8x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint16x8_t, uint32x2_t:uint64x2_t:uint32x4_t + +/// Signed saturating shift right unsigned narrow +name = vqshrun +noq-n-suffix +constn = N +multi_fn = static_assert-N-1-halfbits +a = 0, 4, 8, 12, 16, 20, 24, 28 +n = 2 +validate 0, 1, 2, 3, 4, 5, 6, 7 +arm-aarch64-separate + +aarch64 = sqshrun +link-aarch64 = sqshrun._EXT2_ +const-aarch64 = N + +arm = vqshrun +link-arm = vqshiftnsu._EXT2_ +const-arm = -N as ttn +generate int16x8_t:uint8x8_t, int32x4_t:uint16x4_t, int64x2_t:uint32x2_t + +/// Signed saturating shift right unsigned narrow +name = vqshrun +noq-n-suffix +constn = N +multi_fn = static_assert-N-1-halfbits +multi_fn = simd_extract, {vqshrun_n-in_ntt-::<N>, {vdupq_n-in_ntt-noext, a}}, 0 +a = 4 +n = 2 +validate 1 + +aarch64 = sqshrun +generate i16:u8, i32:u16, i64:u32 + +/// Signed saturating shift right unsigned narrow +name = vqshrun_high +noq-n-suffix +constn = N +multi_fn = static_assert-N-1-halfbits +multi_fn = simd_shuffle-out_len-!, a, {vqshrun_n-noqself-::<N>, b}, {asc-0-out_len} +a = 0, 1, 8, 9, 8, 9, 10, 11 +b = 32, 36, 40, 44, 48, 52, 56, 60 +n = 2 +validate 0, 1, 8, 9, 8, 9, 10, 11, 8, 9, 10, 11, 12, 13, 14, 15 + +aarch64 = sqshrun2 +generate uint8x8_t:int16x8_t:uint8x16_t, uint16x4_t:int32x4_t:uint16x8_t, uint32x2_t:int64x2_t:uint32x4_t + +/// Unsigned saturating accumulate of signed value +name = vsqadd +out-suffix +multi_fn = simd_extract, {vsqadd-out_ntt-noext, {vdup_n-out_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0 +a = 2 +b = 2 +validate 4 + +aarch64 = usqadd +generate u8:i8:u8, u16:i16:u16 + +/// Unsigned saturating accumulate of signed value +name = vsqadd +out-suffix +a = 2 +b = 2 +validate 4 + +aarch64 = usqadd +link-aarch64 = usqadd._EXT_ +generate u32:i32:u32, u64:i64:u64 + +/// Calculates the square root of each lane. +name = vsqrt +fn = simd_fsqrt +a = 4.0, 9.0, 16.0, 25.0 +validate 2.0, 3.0, 4.0, 5.0 + +aarch64 = fsqrt +generate float*_t, float64x*_t + +/// Reciprocal square-root estimate. 
+name = vrsqrte +a = 1.0, 2.0, 3.0, 4.0 +validate 0.998046875, 0.705078125, 0.576171875, 0.4990234375 + +aarch64 = frsqrte +link-aarch64 = frsqrte._EXT_ +generate float64x*_t, f32, f64 + +arm = vrsqrte +link-arm = vrsqrte._EXT_ +generate float*_t + +/// Unsigned reciprocal square root estimate +name = vrsqrte +a = 1, 2, 3, 4 +validate 4294967295, 4294967295, 4294967295, 4294967295 + +aarch64 = ursqrte +link-aarch64 = ursqrte._EXT_ +arm = vrsqrte +link-arm = vrsqrte._EXT_ +generate uint32x2_t, uint32x4_t + +/// Floating-point reciprocal square root step +name = vrsqrts +a = 1.0, 2.0, 3.0, 4.0 +b = 1.0, 2.0, 3.0, 4.0 +validate 1., -0.5, -3.0, -6.5 + +aarch64 = frsqrts +link-aarch64 = frsqrts._EXT_ +generate float64x*_t, f32, f64 + +arm = vrsqrts +link-arm = vrsqrts._EXT_ +generate float*_t + +/// Reciprocal estimate. +name = vrecpe +a = 4.0, 3.0, 2.0, 1.0 +validate 0.24951171875, 0.3330078125, 0.4990234375, 0.998046875 + +aarch64 = frecpe +link-aarch64 = frecpe._EXT_ +generate float64x*_t, f32, f64 + +arm = vrecpe +link-arm = vrecpe._EXT_ +generate float*_t + +/// Unsigned reciprocal estimate +name = vrecpe +a = 4, 3, 2, 1 +validate 4294967295, 4294967295, 4294967295, 4294967295 + +aarch64 = urecpe +link-aarch64 = urecpe._EXT_ +arm = vrecpe +link-arm = vrecpe._EXT_ +generate uint32x2_t, uint32x4_t + +/// Floating-point reciprocal step +name = vrecps +a = 4.0, 3.0, 2.0, 1.0 +b = 4.0, 3.0, 2.0, 1.0 +validate -14., -7., -2., 1. + +aarch64 = frecps +link-aarch64 = frecps._EXT_ +generate float64x*_t, f32, f64 + +arm = vrecps +link-arm = vrecps._EXT_ +generate float*_t + +/// Floating-point reciprocal exponent +name = vrecpx +a = 4.0 +validate 0.5 + +aarch64 = frecpx +link-aarch64 = frecpx._EXT_ +generate f32, f64 + +/// Vector reinterpret cast operation +name = vreinterpret +double-suffixes +fn = transmute +a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + +aarch64 = nop +generate poly64x1_t:int64x1_t, poly64x1_t:uint64x1_t, int64x1_t:poly64x1_t, uint64x1_t:poly64x1_t +generate poly64x2_t:int64x2_t, poly64x2_t:uint64x2_t, int64x2_t:poly64x2_t, uint64x2_t:poly64x2_t + +arm = nop +generate uint8x8_t:int8x8_t, poly8x8_t:int8x8_t, poly16x4_t:int16x4_t, uint16x4_t:int16x4_t, uint32x2_t:int32x2_t, uint64x1_t:int64x1_t +generate uint8x16_t:int8x16_t, poly8x16_t:int8x16_t, poly16x8_t:int16x8_t, uint16x8_t:int16x8_t, uint32x4_t:int32x4_t, uint64x2_t:int64x2_t +generate poly8x8_t:uint8x8_t, int8x8_t:uint8x8_t, poly16x4_t:uint16x4_t, int16x4_t:uint16x4_t, int32x2_t:uint32x2_t, int64x1_t:uint64x1_t +generate poly8x16_t:uint8x16_t, int8x16_t:uint8x16_t, poly16x8_t:uint16x8_t, int16x8_t:uint16x8_t, int32x4_t:uint32x4_t, int64x2_t:uint64x2_t +generate int8x8_t:poly8x8_t, uint8x8_t:poly8x8_t, int16x4_t:poly16x4_t, uint16x4_t:poly16x4_t +generate int8x16_t:poly8x16_t, uint8x16_t:poly8x16_t, int16x8_t:poly16x8_t, uint16x8_t:poly16x8_t + +/// Vector reinterpret cast operation +name = vreinterpret +double-suffixes +fn = transmute +a = 0, 1, 2, 3, 4, 5, 6, 7 +validate 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0 + +aarch64 = nop +arm = nop +generate int16x4_t:int8x8_t, uint16x4_t:int8x8_t, poly16x4_t:int8x8_t, int32x2_t:int16x4_t, uint32x2_t:int16x4_t, int64x1_t:int32x2_t, uint64x1_t:int32x2_t +generate int16x8_t:int8x16_t, uint16x8_t:int8x16_t, poly16x8_t:int8x16_t, int32x4_t:int16x8_t, uint32x4_t:int16x8_t, int64x2_t:int32x4_t, uint64x2_t:int32x4_t +generate poly16x4_t:uint8x8_t, int16x4_t:uint8x8_t, uint16x4_t:uint8x8_t, 
int32x2_t:uint16x4_t, uint32x2_t:uint16x4_t, int64x1_t:uint32x2_t, uint64x1_t:uint32x2_t +generate poly16x8_t:uint8x16_t, int16x8_t:uint8x16_t, uint16x8_t:uint8x16_t, int32x4_t:uint16x8_t, uint32x4_t:uint16x8_t, int64x2_t:uint32x4_t, uint64x2_t:uint32x4_t +generate poly16x4_t:poly8x8_t, int16x4_t:poly8x8_t, uint16x4_t:poly8x8_t, int32x2_t:poly16x4_t, uint32x2_t:poly16x4_t +generate poly16x8_t:poly8x16_t, int16x8_t:poly8x16_t, uint16x8_t:poly8x16_t, int32x4_t:poly16x8_t, uint32x4_t:poly16x8_t +target = aes +generate poly64x1_t:int32x2_t, poly64x1_t:uint32x2_t +generate poly64x2_t:int32x4_t, poly64x2_t:uint32x4_t +generate p128:int64x2_t, p128:uint64x2_t, p128:poly64x2_t + +/// Vector reinterpret cast operation +name = vreinterpret +double-suffixes +fn = transmute +a = 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0 +validate 0, 1, 2, 3, 4, 5, 6, 7 + +aarch64 = nop +arm = nop +generate poly8x8_t:int16x4_t, int8x8_t:int16x4_t, uint8x8_t:int16x4_t, poly16x4_t:int32x2_t, int16x4_t:int32x2_t, uint16x4_t:int32x2_t, int32x2_t:int64x1_t, uint32x2_t:int64x1_t +generate poly8x16_t:int16x8_t, int8x16_t:int16x8_t, uint8x16_t:int16x8_t, poly16x8_t:int32x4_t, int16x8_t:int32x4_t, uint16x8_t:int32x4_t, int32x4_t:int64x2_t, uint32x4_t:int64x2_t +generate poly8x8_t:uint16x4_t, int8x8_t:uint16x4_t, uint8x8_t:uint16x4_t, poly16x4_t:uint32x2_t, int16x4_t:uint32x2_t, uint16x4_t:uint32x2_t, int32x2_t:uint64x1_t, uint32x2_t:uint64x1_t +generate poly8x16_t:uint16x8_t, int8x16_t:uint16x8_t, uint8x16_t:uint16x8_t, poly16x8_t:uint32x4_t, int16x8_t:uint32x4_t, uint16x8_t:uint32x4_t, int32x4_t:uint64x2_t, uint32x4_t:uint64x2_t +generate poly8x8_t:poly16x4_t, int8x8_t:poly16x4_t, uint8x8_t:poly16x4_t +generate poly8x16_t:poly16x8_t, int8x16_t:poly16x8_t, uint8x16_t:poly16x8_t +target = aes +generate int32x2_t:poly64x1_t, uint32x2_t:poly64x1_t +generate int32x4_t:poly64x2_t, uint32x4_t:poly64x2_t +generate int64x2_t:p128, uint64x2_t:p128, poly64x2_t:p128 + +/// Vector reinterpret cast operation +name = vreinterpret +double-suffixes +fn = transmute +a = 0, 1, 2, 3 +validate 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0 + +aarch64 = nop +arm = nop +generate int32x2_t:int8x8_t, uint32x2_t:int8x8_t, int64x1_t:int16x4_t, uint64x1_t:int16x4_t +generate int32x4_t:int8x16_t, uint32x4_t:int8x16_t, int64x2_t:int16x8_t, uint64x2_t:int16x8_t +generate int32x2_t:uint8x8_t, uint32x2_t:uint8x8_t, int64x1_t:uint16x4_t, uint64x1_t:uint16x4_t +generate int32x4_t:uint8x16_t, uint32x4_t:uint8x16_t, int64x2_t:uint16x8_t, uint64x2_t:uint16x8_t +generate int32x2_t:poly8x8_t, uint32x2_t:poly8x8_t, int64x1_t:poly16x4_t, uint64x1_t:poly16x4_t +generate int32x4_t:poly8x16_t, uint32x4_t:poly8x16_t, int64x2_t:poly16x8_t, uint64x2_t:poly16x8_t +target = aes +generate poly64x1_t:int16x4_t, poly64x1_t:uint16x4_t, poly64x1_t:poly16x4_t +generate poly64x2_t:int16x8_t, poly64x2_t:uint16x8_t, poly64x2_t:poly16x8_t +generate p128:int32x4_t, p128:uint32x4_t + +/// Vector reinterpret cast operation +name = vreinterpret +double-suffixes +fn = transmute +a = 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0 +validate 0, 1, 2, 3 + +aarch64 = nop +arm = nop +generate poly8x8_t:int32x2_t, int8x8_t:int32x2_t, uint8x8_t:int32x2_t, poly16x4_t:int64x1_t, int16x4_t:int64x1_t, uint16x4_t:int64x1_t +generate poly8x16_t:int32x4_t, int8x16_t:int32x4_t, uint8x16_t:int32x4_t, poly16x8_t:int64x2_t, int16x8_t:int64x2_t, uint16x8_t:int64x2_t +generate poly8x8_t:uint32x2_t, int8x8_t:uint32x2_t, uint8x8_t:uint32x2_t, poly16x4_t:uint64x1_t, int16x4_t:uint64x1_t, uint16x4_t:uint64x1_t 
+generate poly8x16_t:uint32x4_t, int8x16_t:uint32x4_t, uint8x16_t:uint32x4_t, poly16x8_t:uint64x2_t, int16x8_t:uint64x2_t, uint16x8_t:uint64x2_t +target = aes +generate poly16x4_t:poly64x1_t, int16x4_t:poly64x1_t, uint16x4_t:poly64x1_t +generate poly16x8_t:poly64x2_t, int16x8_t:poly64x2_t, uint16x8_t:poly64x2_t +generate int32x4_t:p128, uint32x4_t:p128 + +/// Vector reinterpret cast operation +name = vreinterpret +double-suffixes +fn = transmute +a = 0, 1 +validate 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 + +aarch64 = nop +arm = nop +generate int64x1_t:int8x8_t, uint64x1_t:int8x8_t, int64x1_t:uint8x8_t, uint64x1_t:uint8x8_t, int64x1_t:poly8x8_t, uint64x1_t:poly8x8_t +generate int64x2_t:int8x16_t, uint64x2_t:int8x16_t, int64x2_t:uint8x16_t, uint64x2_t:uint8x16_t, int64x2_t:poly8x16_t, uint64x2_t:poly8x16_t +target = aes +generate poly64x1_t:int8x8_t, poly64x1_t:uint8x8_t, poly64x1_t:poly8x8_t +generate poly64x2_t:int8x16_t, poly64x2_t:uint8x16_t, poly64x2_t:poly8x16_t +generate p128:int16x8_t, p128:uint16x8_t, p128:poly16x8_t + +/// Vector reinterpret cast operation +name = vreinterpret +double-suffixes +fn = transmute +a = 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 +validate 0, 1 + +aarch64 = nop +arm = nop +generate poly8x8_t:int64x1_t, int8x8_t:int64x1_t, uint8x8_t:int64x1_t, poly8x8_t:uint64x1_t, int8x8_t:uint64x1_t, uint8x8_t:uint64x1_t +generate poly8x16_t:int64x2_t, int8x16_t:int64x2_t, uint8x16_t:int64x2_t, poly8x16_t:uint64x2_t, int8x16_t:uint64x2_t, uint8x16_t:uint64x2_t +target = aes +generate poly8x8_t:poly64x1_t, int8x8_t:poly64x1_t, uint8x8_t:poly64x1_t +generate poly8x16_t:poly64x2_t, int8x16_t:poly64x2_t, uint8x16_t:poly64x2_t +generate int16x8_t:p128, uint16x8_t:p128, poly16x8_t:p128 + +/// Vector reinterpret cast operation +name = vreinterpret +double-suffixes +fn = transmute +a = 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +validate 1 +target = aes + +aarch64 = nop +arm = nop +generate int8x16_t:p128, uint8x16_t:p128, poly8x16_t:p128 + +/// Vector reinterpret cast operation +name = vreinterpret +double-suffixes +fn = transmute +a = 1 +validate 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +target = aes + +aarch64 = nop +arm = nop +generate p128:int8x16_t, p128:uint8x16_t, p128:poly8x16_t + +/// Vector reinterpret cast operation +name = vreinterpret +double-suffixes +fn = transmute +a = 0., 0., 0., 0., 0., 0., 0., 0. 
+validate 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + +aarch64 = nop +generate float64x1_t:int8x8_t, float64x1_t:int16x4_t, float64x1_t:int32x2_t, float64x1_t:int64x1_t +generate float64x2_t:int8x16_t, float64x2_t:int16x8_t, float64x2_t:int32x4_t, float64x2_t:int64x2_t +generate float64x1_t:uint8x8_t, float64x1_t:uint16x4_t, float64x1_t:uint32x2_t, float64x1_t:uint64x1_t +generate float64x2_t:uint8x16_t, float64x2_t:uint16x8_t, float64x2_t:uint32x4_t, float64x2_t:uint64x2_t +generate float64x1_t:poly8x8_t, float64x1_t:poly16x4_t, float32x2_t:poly64x1_t, float64x1_t:poly64x1_t +generate float64x2_t:poly8x16_t, float64x2_t:poly16x8_t, float32x4_t:poly64x2_t, float64x2_t:poly64x2_t +generate float64x2_t:p128 + +arm = nop +generate float32x2_t:int8x8_t, float32x2_t:int16x4_t, float32x2_t:int32x2_t, float32x2_t:int64x1_t +generate float32x4_t:int8x16_t, float32x4_t:int16x8_t, float32x4_t:int32x4_t, float32x4_t:int64x2_t +generate float32x2_t:uint8x8_t, float32x2_t:uint16x4_t, float32x2_t:uint32x2_t, float32x2_t:uint64x1_t +generate float32x4_t:uint8x16_t, float32x4_t:uint16x8_t, float32x4_t:uint32x4_t, float32x4_t:uint64x2_t +generate float32x2_t:poly8x8_t, float32x2_t:poly16x4_t +generate float32x4_t:poly8x16_t, float32x4_t:poly16x8_t +generate float32x4_t:p128 + +/// Vector reinterpret cast operation +name = vreinterpret +double-suffixes +fn = transmute +a = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +validate 0., 0., 0., 0., 0., 0., 0., 0. + +aarch64 = nop +generate int8x8_t:float64x1_t, int16x4_t:float64x1_t, int32x2_t:float64x1_t, int64x1_t:float64x1_t +generate int8x16_t:float64x2_t, int16x8_t:float64x2_t, int32x4_t:float64x2_t, int64x2_t:float64x2_t +generate poly8x8_t:float64x1_t, uint16x4_t:float64x1_t, uint32x2_t:float64x1_t, uint64x1_t:float64x1_t +generate poly8x16_t:float64x2_t, uint16x8_t:float64x2_t, uint32x4_t:float64x2_t, uint64x2_t:float64x2_t +generate uint8x8_t:float64x1_t, poly16x4_t:float64x1_t, poly64x1_t:float64x1_t, poly64x1_t:float32x2_t +generate uint8x16_t:float64x2_t, poly16x8_t:float64x2_t, poly64x2_t:float64x2_t, poly64x2_t:float32x4_t +generate p128:float64x2_t + +arm = nop +generate int8x8_t:float32x2_t, int16x4_t:float32x2_t, int32x2_t:float32x2_t, int64x1_t:float32x2_t +generate int8x16_t:float32x4_t, int16x8_t:float32x4_t, int32x4_t:float32x4_t, int64x2_t:float32x4_t +generate uint8x8_t:float32x2_t, uint16x4_t:float32x2_t, uint32x2_t:float32x2_t, uint64x1_t:float32x2_t +generate uint8x16_t:float32x4_t, uint16x8_t:float32x4_t, uint32x4_t:float32x4_t, uint64x2_t:float32x4_t +generate poly8x8_t:float32x2_t, poly16x4_t:float32x2_t +generate poly8x16_t:float32x4_t, poly16x8_t:float32x4_t +generate p128:float32x4_t + +/// Vector reinterpret cast operation +name = vreinterpret +double-suffixes +fn = transmute +a = 0., 0., 0., 0., 0., 0., 0., 0. +validate 0., 0., 0., 0., 0., 0., 0., 0. 
+ +aarch64 = nop +generate float32x2_t:float64x1_t, float64x1_t:float32x2_t +generate float32x4_t:float64x2_t, float64x2_t:float32x4_t + +/// Signed rounding shift left +name = vrshl +a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +validate 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 + +aarch64 = srshl +link-aarch64 = srshl._EXT_ +generate i64 + +arm = vrshl +link-arm = vrshifts._EXT_ +generate int*_t, int64x*_t + +/// Unsigned rounding shift left +name = vrshl +out-suffix +a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +validate 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 + +aarch64 = urshl +link-aarch64 = urshl._EXT_ +generate u64:i64:u64 + +arm = vrshl +link-arm = vrshiftu._EXT_ +generate uint8x8_t:int8x8_t:uint8x8_t, uint8x16_t:int8x16_t:uint8x16_t, uint16x4_t:int16x4_t:uint16x4_t, uint16x8_t:int16x8_t:uint16x8_t +generate uint32x2_t:int32x2_t:uint32x2_t, uint32x4_t:int32x4_t:uint32x4_t, uint64x1_t:int64x1_t:uint64x1_t, uint64x2_t:int64x2_t:uint64x2_t + +/// Signed rounding shift right +name = vrshr +n-suffix +constn = N +multi_fn = static_assert-N-1-bits +multi_fn = vrshl-self-noext, a, {vdup-nself-noext, (-N) as _} +a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 +n = 2 +validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + +aarch64 = srshr +arm = vrshr +generate int*_t, int64x*_t + +/// Signed rounding shift right +name = vrshr +n-suffix +constn = N +multi_fn = static_assert-N-1-bits +multi_fn = vrshl-self-noext, a, -N as i64 +a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 +n = 2 +validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + +aarch64 = srshr +generate i64 + +/// Unsigned rounding shift right +name = vrshr +n-suffix +constn = N +multi_fn = static_assert-N-1-bits +multi_fn = vrshl-self-noext, a, {vdup-nsigned-noext, (-N) as _} +a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 +n = 2 +validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + +aarch64 = urshr +arm = vrshr +generate uint*_t, uint64x*_t + +/// Unsigned rounding shift right +name = vrshr +n-suffix +constn = N +multi_fn = static_assert-N-1-bits +multi_fn = vrshl-self-noext, a, -N as i64 +a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 +n = 2 +validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + +aarch64 = urshr +generate u64 + +/// Rounding shift right narrow +name = vrshrn +noq-n-suffix +constn = N +multi_fn = static_assert-N-1-halfbits +a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 +n = 2 +validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +arm-aarch64-separate + +aarch64 = rshrn +link-aarch64 = rshrn._EXT2_ +const-aarch64 = N + +arm = vrshrn +link-arm = vrshiftn._EXT2_ +const-arm = -N as ttn +generate int16x8_t:int8x8_t, int32x4_t:int16x4_t, int64x2_t:int32x2_t + +/// Rounding shift right narrow +name = vrshrn +noq-n-suffix +constn = N +multi_fn = static_assert-N-1-halfbits +multi_fn = transmute, {vrshrn_n-noqsigned-::<N>, transmute(a)} +a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 +n = 2 +validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + +aarch64 = rshrn +arm = vrshrn +generate uint16x8_t:uint8x8_t, uint32x4_t:uint16x4_t, uint64x2_t:uint32x2_t + +/// Rounding shift right narrow +name = vrshrn_high +noq-n-suffix +constn = N +multi_fn = static_assert-N-1-halfbits +multi_fn = 
simd_shuffle-out_len-!, a, {vrshrn_n-noqself-::<N>, b}, {asc-0-out_len} +a = 0, 1, 8, 9, 8, 9, 10, 11 +b = 32, 36, 40, 44, 48, 52, 56, 60 +n = 2 +validate 0, 1, 8, 9, 8, 9, 10, 11, 8, 9, 10, 11, 12, 13, 14, 15 + +aarch64 = rshrn2 +generate int8x8_t:int16x8_t:int8x16_t, int16x4_t:int32x4_t:int16x8_t, int32x2_t:int64x2_t:int32x4_t +generate uint8x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint16x8_t, uint32x2_t:uint64x2_t:uint32x4_t + +/// Signed rounding shift right and accumulate +name = vrsra +n-suffix +constn = N +multi_fn = static_assert-N-1-bits +multi_fn = simd_add, a, {vrshr-nself-::<N>, b} +a = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 +b = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 +n = 2 +validate 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 + +aarch64 = srsra +arm = vrsra +generate int*_t, int64x*_t + +/// Unsigned rounding shift right and accumulate +name = vrsra +n-suffix +constn = N +multi_fn = static_assert-N-1-bits +multi_fn = simd_add, a, {vrshr-nself-::<N>, b} +a = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 +b = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 +n = 2 +validate 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 + +aarch64 = ursra +arm = vrsra +generate uint*_t, uint64x*_t + +/// Signed rounding shift right and accumulate. +name = vrsra +n-suffix +constn = N +multi_fn = static_assert-N-1-bits +multi_fn = vrshr-nself-::<N>, b:in_t, b +multi_fn = a.wrapping_add(b) +a = 1 +b = 4 +n = 2 +validate 2 + +aarch64 = srsra +generate i64 + +/// Unsigned rounding shift right and accumulate. +name = vrsra +n-suffix +constn = N +multi_fn = static_assert-N-1-bits +multi_fn = vrshr-nself-::<N>, b:in_t, b +multi_fn = a.wrapping_add(b) +a = 1 +b = 4 +n = 2 +validate 2 + +aarch64 = ursra +generate u64 + +/// Rounding subtract returning high narrow +name = vrsubhn +no-q +a = MAX, MIN, 0, 4, 5, 6, 7, 8 +b = 1, 2, 3, 4, 5, 6, 7, 8 +validate MIN, MIN, 0, 0, 0, 0, 0, 0 + +aarch64 = rsubhn +link-aarch64 = rsubhn._EXT2_ +arm = vrsubhn +link-arm = vrsubhn._EXT2_ +generate int16x8_t:int16x8_t:int8x8_t, int32x4_t:int32x4_t:int16x4_t, int64x2_t:int64x2_t:int32x2_t + +/// Rounding subtract returning high narrow +name = vrsubhn +no-q +multi_fn = transmute, {vrsubhn-noqsigned-noext, {transmute, a}, {transmute, b}} +a = MAX, MIN, 3, 4, 5, 6, 7, 8 +b = 1, 2, 3, 4, 5, 6, 7, 8 +validate 0, 0, 0, 0, 0, 0, 0, 0 + +aarch64 = rsubhn +arm = vrsubhn +generate uint16x8_t:uint16x8_t:uint8x8_t, uint32x4_t:uint32x4_t:uint16x4_t, uint64x2_t:uint64x2_t:uint32x2_t + +/// Rounding subtract returning high narrow +name = vrsubhn_high +no-q +multi_fn = vrsubhn-noqself-noext, x:in_t0, b, c +multi_fn = simd_shuffle-out_len-!, a, x, {asc-0-out_len} +a = 1, 2, 0, 0, 0, 0, 0, 0 +b = 1, 2, 3, 4, 5, 6, 7, 8 +c = 1, 2, 3, 4, 5, 6, 7, 8 +validate 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + +aarch64 = rsubhn2 +generate int8x8_t:int16x8_t:int16x8_t:int8x16_t, int16x4_t:int32x4_t:int32x4_t:int16x8_t, int32x2_t:int64x2_t:int64x2_t:int32x4_t +generate uint8x8_t:uint16x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint32x4_t:uint16x8_t, uint32x2_t:uint64x2_t:uint64x2_t:uint32x4_t + +/// Insert vector element from another vector element +name = vset_lane +constn = LANE +multi_fn = static_assert_imm-in_exp_len-LANE +multi_fn = simd_insert, b, LANE as u32, a +a = 1 +b = 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +n = 0 +validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + +aarch64 = nop +arm = nop +generate i8:int8x8_t:int8x8_t, 
i16:int16x4_t:int16x4_t +generate i32:int32x2_t:int32x2_t, i64:int64x1_t:int64x1_t +generate u8:uint8x8_t:uint8x8_t, u16:uint16x4_t:uint16x4_t +generate u32:uint32x2_t:uint32x2_t, u64:uint64x1_t:uint64x1_t +generate p8:poly8x8_t:poly8x8_t, p16:poly16x4_t:poly16x4_t + +target = aes +generate p64:poly64x1_t:poly64x1_t + +/// Insert vector element from another vector element +name = vsetq_lane +no-q +constn = LANE +multi_fn = static_assert_imm-in_exp_len-LANE +multi_fn = simd_insert, b, LANE as u32, a +a = 1 +b = 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +n = 0 +validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + +aarch64 = nop +arm = nop +generate i8:int8x16_t:int8x16_t, i16:int16x8_t:int16x8_t +generate i32:int32x4_t:int32x4_t, i64:int64x2_t:int64x2_t +generate u8:uint8x16_t:uint8x16_t, u16:uint16x8_t:uint16x8_t +generate u32:uint32x4_t:uint32x4_t, u64:uint64x2_t:uint64x2_t +generate p8:poly8x16_t:poly8x16_t, p16:poly16x8_t:poly16x8_t + +target = aes +generate p64:poly64x2_t:poly64x2_t + +/// Insert vector element from another vector element +name = vset_lane +constn = LANE +multi_fn = static_assert_imm-in_exp_len-LANE +multi_fn = simd_insert, b, LANE as u32, a +a = 1. +b = 0., 2., 3., 4. +n = 0 +validate 1., 2., 3., 4. + +aarch64 = nop +generate f64:float64x1_t:float64x1_t + +arm = nop +generate f32:float32x2_t:float32x2_t + +/// Insert vector element from another vector element +name = vsetq_lane +no-q +constn = LANE +multi_fn = static_assert_imm-in_exp_len-LANE +multi_fn = simd_insert, b, LANE as u32, a +a = 1. +b = 0., 2., 3., 4. +n = 0 +validate 1., 2., 3., 4. + +aarch64 = nop +generate f64:float64x2_t:float64x2_t + +arm = nop +generate f32:float32x4_t:float32x4_t + +/// Signed Shift left +name = vshl +a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +validate 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 + +aarch64 = sshl +link-aarch64 = sshl._EXT_ +arm = vshl +link-arm = vshifts._EXT_ +generate int*_t, int64x*_t + +/// Signed Shift left +name = vshl +multi_fn = transmute, {vshl-in_ntt-noext, transmute(a), transmute(b)} +a = 1 +b = 2 +validate 4 + +aarch64 = sshl +generate i64 + +/// Unsigned Shift left +name = vshl +out-suffix +a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +validate 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 + +aarch64 = ushl +link-aarch64 = ushl._EXT_ +arm = vshl +link-arm = vshiftu._EXT_ +generate uint8x8_t:int8x8_t:uint8x8_t, uint8x16_t:int8x16_t:uint8x16_t, uint16x4_t:int16x4_t:uint16x4_t, uint16x8_t:int16x8_t:uint16x8_t +generate uint32x2_t:int32x2_t:uint32x2_t, uint32x4_t:int32x4_t:uint32x4_t, uint64x1_t:int64x1_t:uint64x1_t, uint64x2_t:int64x2_t:uint64x2_t + +/// Unsigned Shift left +out-suffix +name = vshl +multi_fn = transmute, {vshl-out_ntt-noext, transmute(a), transmute(b)} +a = 1 +b = 2 +validate 4 + +aarch64 = ushl +generate u64:i64:u64 + +/// Shift left +name = vshl +n-suffix +constn = N +multi_fn = static_assert_imm-out_bits_exp_len-N +multi_fn = simd_shl, a, {vdup-nself-noext, N as _} +a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +n = 2 +validate 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 + +arm = vshl +aarch64 = shl +generate int*_t, uint*_t, int64x*_t, uint64x*_t + +/// Signed shift left long +name = vshll +n-suffix +constn = N +multi_fn = static_assert-N-0-bits +multi_fn = simd_shl, {simd_cast, a}, {vdup-nout-noext, N as _} +a = 1, 2, 3, 
4, 5, 6, 7, 8 +n = 2 +validate 4, 8, 12, 16, 20, 24, 28, 32 + +arm = vshll.s +aarch64 = sshll +generate int8x8_t:int16x8_t, int16x4_t:int32x4_t, int32x2_t:int64x2_t +aarch64 = ushll +generate uint8x8_t:uint16x8_t, uint16x4_t:uint32x4_t, uint32x2_t:uint64x2_t + +/// Signed shift left long +name = vshll_high_n +no-q +constn = N +multi_fn = static_assert-N-0-bits +multi_fn = simd_shuffle-out_len-!, b:half, a, a, {asc-halflen-halflen} +multi_fn = vshll_n-noqself-::<N>, b +a = 0, 0, 1, 2, 1, 2, 3, 4, 1, 2, 3, 4, 5, 6, 7, 8 +n = 2 +validate 4, 8, 12, 16, 20, 24, 28, 32 + +aarch64 = sshll2 +generate int8x16_t:int16x8_t, int16x8_t:int32x4_t, int32x4_t:int64x2_t +aarch64 = ushll2 +generate uint8x16_t:uint16x8_t, uint16x8_t:uint32x4_t, uint32x4_t:uint64x2_t + +/// Shift right +name = vshr +n-suffix +constn = N +multi_fn = static_assert-N-1-bits +multi_fn = fix_right_shift_imm-N-bits +multi_fn = simd_shr, a, {vdup-nself-noext, n as _} +a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 +n = 2 +validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + +arm = vshr.s +aarch64 = sshr +generate int*_t, int64x*_t +aarch64 = ushr +generate uint*_t, uint64x*_t + +/// Shift right narrow +name = vshrn_n +no-q +constn = N +multi_fn = static_assert-N-1-halfbits +multi_fn = simd_cast, {simd_shr, a, {vdup-nself-noext, N as _}} +a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 +n = 2 +validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + +arm = vshrn. +aarch64 = shrn +generate int16x8_t:int8x8_t, int32x4_t:int16x4_t, int64x2_t:int32x2_t +generate uint16x8_t:uint8x8_t, uint32x4_t:uint16x4_t, uint64x2_t:uint32x2_t + +/// Shift right narrow +name = vshrn_high_n +no-q +constn = N +multi_fn = static_assert-N-1-halfbits +multi_fn = simd_shuffle-out_len-!, a, {vshrn_n-noqself-::<N>, b}, {asc-0-out_len} +a = 1, 2, 5, 6, 5, 6, 7, 8 +b = 20, 24, 28, 32, 52, 56, 60, 64 +n = 2 +validate 1, 2, 5, 6, 5, 6, 7, 8, 5, 6, 7, 8, 13, 14, 15, 16 + +aarch64 = shrn2 +generate int8x8_t:int16x8_t:int8x16_t, int16x4_t:int32x4_t:int16x8_t, int32x2_t:int64x2_t:int32x4_t +generate uint8x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint16x8_t, uint32x2_t:uint64x2_t:uint32x4_t + +/// Signed shift right and accumulate +name = vsra +n-suffix +constn = N +multi_fn = static_assert-N-1-bits +multi_fn = simd_add, a, {vshr-nself-::<N>, b} +a = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 +b = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 +n = 2 +validate 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 + +aarch64 = ssra +arm = vsra +generate int*_t, int64x*_t + +/// Unsigned shift right and accumulate +name = vsra +n-suffix +constn = N +multi_fn = static_assert-N-1-bits +multi_fn = simd_add, a, {vshr-nself-::<N>, b} +a = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 +b = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 +n = 2 +validate 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 + +aarch64 = usra +arm = vsra +generate uint*_t, uint64x*_t + +/// SM3PARTW1 +name = vsm3partw1 +a = 1, 2, 3, 4 +b = 1, 2, 3, 4 +c = 1, 2, 3, 4 +validate 2147549312, 3221323968, 131329, 2684362752 +target = sm4 + +aarch64 = sm3partw1 +link-aarch64 = llvm.aarch64.crypto.sm3partw1 +generate uint32x4_t + +/// SM3PARTW2 +name = vsm3partw2 +a = 1, 2, 3, 4 +b = 1, 2, 3, 4 +c = 1, 2, 3, 4 +validate 128, 256, 384, 1077977696 +target = sm4 + +aarch64 = sm3partw2 +link-aarch64 = llvm.aarch64.crypto.sm3partw2 +generate uint32x4_t + +/// SM3SS1 +name = vsm3ss1 +a = 1, 2, 3, 4 +b = 1, 2, 3, 4 
+c = 1, 2, 3, 4 +validate 0, 0, 0, 2098176 +target = sm4 + +aarch64 = sm3ss1 +link-aarch64 = llvm.aarch64.crypto.sm3ss1 +generate uint32x4_t + +/// SM4 key +name = vsm4ekey +a = 1, 2, 3, 4 +b = 1, 2, 3, 4 +validate 1784948604, 136020997, 2940231695, 3789947679 +target = sm4 + +aarch64 = sm4ekey +link-aarch64 = llvm.aarch64.crypto.sm4ekey +generate uint32x4_t + +/// SM4 encode +name = vsm4e +a = 1, 2, 3, 4 +b = 1, 2, 3, 4 +validate 1093874472, 3616769504, 3878330411, 2765298765 +target = sm4 + +aarch64 = sm4e +link-aarch64 = llvm.aarch64.crypto.sm4e +generate uint32x4_t + +/// Rotate and exclusive OR +name = vrax1 +a = 1, 2 +b = 3, 4 +validate 7, 10 +target = sha3 + +aarch64 = rax1 +link-aarch64 = llvm.aarch64.crypto.rax1 +generate uint64x2_t + +/// SHA512 hash update part 1 +name = vsha512h +a = 1, 2 +b = 3, 4 +c = 5, 6 +validate 11189044327219203, 7177611956453380 +target = sha3 + +aarch64 = sha512h +link-aarch64 = llvm.aarch64.crypto.sha512h +generate uint64x2_t + +/// SHA512 hash update part 2 +name = vsha512h2 +a = 1, 2 +b = 3, 4 +c = 5, 6 +validate 5770237651009406214, 349133864969 +target = sha3 + +aarch64 = sha512h2 +link-aarch64 = llvm.aarch64.crypto.sha512h2 +generate uint64x2_t + +/// SHA512 schedule update 0 +name = vsha512su0 +a = 1, 2 +b = 3, 4 +validate 144115188075855874, 9439544818968559619 +target = sha3 + +aarch64 = sha512su0 +link-aarch64 = llvm.aarch64.crypto.sha512su0 +generate uint64x2_t + +/// SHA512 schedule update 1 +name = vsha512su1 +a = 1, 2 +b = 3, 4 +c = 5, 6 +validate 105553116266526, 140737488355368 +target = sha3 + +aarch64 = sha512su1 +link-aarch64 = llvm.aarch64.crypto.sha512su1 +generate uint64x2_t + +/// Floating-point round to 32-bit integer, using current rounding mode +name = vrnd32x +a = 1.1, 1.9, -1.7, -2.3 +validate 1.0, 2.0, -2.0, -2.0 +target = frintts + +aarch64 = frint32x +link-aarch64 = frint32x._EXT_ +generate float32x2_t, float32x4_t + +/// Floating-point round to 32-bit integer toward zero +name = vrnd32z +a = 1.1, 1.9, -1.7, -2.3 +validate 1.0, 1.0, -1.0, -2.0 +target = frintts + +aarch64 = frint32z +link-aarch64 = frint32z._EXT_ +generate float32x2_t, float32x4_t + +/// Floating-point round to 64-bit integer, using current rounding mode +name = vrnd64x +a = 1.1, 1.9, -1.7, -2.3 +validate 1.0, 2.0, -2.0, -2.0 +target = frintts + +aarch64 = frint64x +link-aarch64 = frint64x._EXT_ +generate float32x2_t, float32x4_t + +/// Floating-point round to 64-bit integer toward zero +name = vrnd64z +a = 1.1, 1.9, -1.7, -2.3 +validate 1.0, 1.0, -1.0, -2.0 +target = frintts + +aarch64 = frint64z +link-aarch64 = frint64z._EXT_ +generate float32x2_t, float32x4_t + +/// Transpose elements +name = vtrn +multi_fn = simd_shuffle-in_len-!, a1:in_t, a, b, {transpose-1-in_len} +multi_fn = simd_shuffle-in_len-!, b1:in_t, a, b, {transpose-2-in_len} +multi_fn = transmute, (a1, b1) +a = 0, 2, 2, 6, 2, 10, 6, 14, 2, 18, 6, 22, 10, 26, 14, 30 +b = 1, 3, 3, 7, 3, 1, 7, 15, 3, 19, 7, 23, 1, 27, 15, 31 +validate 0, 1, 2, 3, 2, 3, 6, 7, 2, 3, 6, 7, 10, 1, 14, 15, 2, 3, 6, 7, 10, 1, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 + +aarch64 = trn +arm = vtrn +generate int8x8_t:int8x8_t:int8x8x2_t, int16x4_t:int16x4_t:int16x4x2_t, int8x16_t:int8x16_t:int8x16x2_t, int16x8_t:int16x8_t:int16x8x2_t, int32x4_t:int32x4_t:int32x4x2_t +generate uint8x8_t:uint8x8_t:uint8x8x2_t, uint16x4_t:uint16x4_t:uint16x4x2_t, uint8x16_t:uint8x16_t:uint8x16x2_t, uint16x8_t:uint16x8_t:uint16x8x2_t, uint32x4_t:uint32x4_t:uint32x4x2_t +generate poly8x8_t:poly8x8_t:poly8x8x2_t, 
poly16x4_t:poly16x4_t:poly16x4x2_t, poly8x16_t:poly8x16_t:poly8x16x2_t, poly16x8_t:poly16x8_t:poly16x8x2_t +aarch64 = zip +generate int32x2_t:int32x2_t:int32x2x2_t, uint32x2_t:uint32x2_t:uint32x2x2_t + +/// Transpose elements +name = vtrn +multi_fn = simd_shuffle-in_len-!, a1:in_t, a, b, {transpose-1-in_len} +multi_fn = simd_shuffle-in_len-!, b1:in_t, a, b, {transpose-2-in_len} +multi_fn = transmute, (a1, b1) +a = 0., 2., 2., 6. +b = 1., 3., 3., 7. +validate 0., 1., 2., 3., 2., 3., 6., 7. + +aarch64 = zip +arm = vtrn +generate float32x2_t:float32x2_t:float32x2x2_t +aarch64 = trn +generate float32x4_t:float32x4_t:float32x4x2_t + +/// Transpose vectors +name = vtrn1 +multi_fn = simd_shuffle-in_len-!, a, b, {transpose-1-in_len} +a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 +b = 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 +validate 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 + +aarch64 = trn1 +generate int8x8_t, int8x16_t, int16x4_t, int16x8_t, int32x4_t, uint8x8_t, uint8x16_t, uint16x4_t, uint16x8_t, uint32x4_t, poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t + +aarch64 = zip1 +generate int32x2_t, int64x2_t, uint32x2_t, uint64x2_t, poly64x2_t + +/// Transpose vectors +name = vtrn1 +multi_fn = simd_shuffle-in_len-!, a, b, {transpose-1-in_len} +a = 0., 2., 4., 6., 8., 10., 12., 14. +b = 1., 3., 5., 7., 9., 11., 13., 15. +validate 0., 1., 4., 5., 8., 9., 12., 13. + +aarch64 = trn1 +generate float32x4_t + +aarch64 = zip1 +generate float32x2_t, float64x2_t + +/// Transpose vectors +name = vtrn2 +multi_fn = simd_shuffle-in_len-!, a, b, {transpose-2-in_len} +a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 +b = 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 +validate 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 + +aarch64 = trn2 +generate int8x8_t, int8x16_t, int16x4_t, int16x8_t, int32x4_t, uint8x8_t, uint8x16_t, uint16x4_t, uint16x8_t, uint32x4_t, poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t + +aarch64 = zip2 +generate int32x2_t, int64x2_t, uint32x2_t, uint64x2_t, poly64x2_t + +/// Transpose vectors +name = vtrn2 +multi_fn = simd_shuffle-in_len-!, a, b, {transpose-2-in_len} +a = 0., 2., 4., 6., 8., 10., 12., 14. +b = 1., 3., 5., 7., 9., 11., 13., 15. +validate 2., 3., 6., 7., 10., 11., 14., 15. 
+ +aarch64 = trn2 +generate float32x4_t + +aarch64 = zip2 +generate float32x2_t, float64x2_t + +/// Zip vectors +name = vzip +multi_fn = simd_shuffle-in_len-!, a0:in_t, a, b, {zip-1-in_len} +multi_fn = simd_shuffle-in_len-!, b0:in_t, a, b, {zip-2-in_len} +multi_fn = transmute, (a0, b0) +a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 +b = 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 +validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + +aarch64 = zip +arm = vzip +generate int8x8_t:int8x8_t:int8x8x2_t, int16x4_t:int16x4_t:int16x4x2_t +generate uint8x8_t:uint8x8_t:uint8x8x2_t, uint16x4_t:uint16x4_t:uint16x4x2_t +generate poly8x8_t:poly8x8_t:poly8x8x2_t, poly16x4_t:poly16x4_t:poly16x4x2_t +arm = vtrn +generate int32x2_t:int32x2_t:int32x2x2_t, uint32x2_t:uint32x2_t:uint32x2x2_t +aarch64 = ext +arm = vorr +generate int8x16_t:int8x16_t:int8x16x2_t, int16x8_t:int16x8_t:int16x8x2_t, int32x4_t:int32x4_t:int32x4x2_t +generate uint8x16_t:uint8x16_t:uint8x16x2_t, uint16x8_t:uint16x8_t:uint16x8x2_t, uint32x4_t:uint32x4_t:uint32x4x2_t +generate poly8x16_t:poly8x16_t:poly8x16x2_t, poly16x8_t:poly16x8_t:poly16x8x2_t + +/// Zip vectors +name = vzip +multi_fn = simd_shuffle-in_len-!, a0:in_t, a, b, {zip-1-in_len} +multi_fn = simd_shuffle-in_len-!, b0:in_t, a, b, {zip-2-in_len} +multi_fn = transmute, (a0, b0) +a = 1., 2., 3., 4. +b = 5., 6., 7., 8. +validate 1., 5., 2., 6., 3., 7., 4., 8. + +aarch64 = zip +arm = vtrn +generate float32x2_t:float32x2_t:float32x2x2_t +aarch64 = ext +arm = vorr +generate float32x4_t:float32x4_t:float32x4x2_t + +/// Zip vectors +name = vzip1 +multi_fn = simd_shuffle-in_len-!, a, b, {zip-1-in_len} +a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 +b = 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 +validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + +aarch64 = zip1 +generate int*_t, int64x2_t, uint*_t, uint64x2_t, poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t, poly64x2_t + +/// Zip vectors +name = vzip1 +multi_fn = simd_shuffle-in_len-!, a, b, {zip-1-in_len} +a = 0., 2., 4., 6., 8., 10., 12., 14. +b = 1., 3., 5., 7., 9., 11., 13., 15. +validate 0., 1., 2., 3., 4., 5., 6., 7. + +aarch64 = zip1 +generate float32x2_t, float32x4_t, float64x2_t + +/// Zip vectors +name = vzip2 +multi_fn = simd_shuffle-in_len-!, a, b, {zip-2-in_len} +a = 0, 16, 16, 18, 16, 18, 20, 22, 16, 18, 20, 22, 24, 26, 28, 30 +b = 1, 17, 17, 19, 17, 19, 21, 23, 17, 19, 21, 23, 25, 27, 29, 31 +validate 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + +aarch64 = zip2 +generate int*_t, int64x2_t, uint*_t, uint64x2_t, poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t, poly64x2_t + +/// Zip vectors +name = vzip2 +multi_fn = simd_shuffle-in_len-!, a, b, {zip-2-in_len} +a = 0., 8., 8., 10., 8., 10., 12., 14. +b = 1., 9., 9., 11., 9., 11., 13., 15. +validate 8., 9., 10., 11., 12., 13., 14., 15. 
+ +aarch64 = zip2 +generate float32x2_t, float32x4_t, float64x2_t + +/// Unzip vectors +name = vuzp +multi_fn = simd_shuffle-in_len-!, a0:in_t, a, b, {unzip-1-in_len} +multi_fn = simd_shuffle-in_len-!, b0:in_t, a, b, {unzip-2-in_len} +multi_fn = transmute, (a0, b0) +a = 1, 2, 2, 3, 2, 3, 3, 8, 2, 3, 3, 8, 3, 15, 8, 16 +b = 2, 3, 3, 8, 3, 15, 8, 16, 3, 29, 8, 30, 15, 31, 16, 32 +validate 1, 2, 2, 3, 2, 3, 3, 8, 2, 3, 3, 8, 3, 8, 15, 16, 2, 3, 3, 8, 3, 8, 15, 16, 3, 8, 15, 16, 29, 30, 31, 32 + +aarch64 = uzp +arm = vuzp +generate int8x8_t:int8x8_t:int8x8x2_t, int16x4_t:int16x4_t:int16x4x2_t, int8x16_t:int8x16_t:int8x16x2_t, int16x8_t:int16x8_t:int16x8x2_t, int32x4_t:int32x4_t:int32x4x2_t +generate uint8x8_t:uint8x8_t:uint8x8x2_t, uint16x4_t:uint16x4_t:uint16x4x2_t, uint8x16_t:uint8x16_t:uint8x16x2_t, uint16x8_t:uint16x8_t:uint16x8x2_t, uint32x4_t:uint32x4_t:uint32x4x2_t +generate poly8x8_t:poly8x8_t:poly8x8x2_t, poly16x4_t:poly16x4_t:poly16x4x2_t, poly8x16_t:poly8x16_t:poly8x16x2_t, poly16x8_t:poly16x8_t:poly16x8x2_t +aarch64 = zip +arm = vtrn +generate int32x2_t:int32x2_t:int32x2x2_t, uint32x2_t:uint32x2_t:uint32x2x2_t + +/// Unzip vectors +name = vuzp +multi_fn = simd_shuffle-in_len-!, a0:in_t, a, b, {unzip-1-in_len} +multi_fn = simd_shuffle-in_len-!, b0:in_t, a, b, {unzip-2-in_len} +multi_fn = transmute, (a0, b0) +a = 1., 2., 2., 4. +b = 2., 6., 6., 8. +validate 1., 2., 2., 6., 2., 4., 6., 8. + +aarch64 = zip +arm = vtrn +generate float32x2_t:float32x2_t:float32x2x2_t +aarch64 = uzp +arm = vuzp +generate float32x4_t:float32x4_t:float32x4x2_t + +/// Unzip vectors +name = vuzp1 +multi_fn = simd_shuffle-in_len-!, a, b, {unzip-1-in_len} +a = 1, 0, 2, 0, 2, 0, 3, 0, 2, 0, 3, 0, 7, 0, 8, 0 +b = 2, 0, 3, 0, 7, 0, 8, 0, 13, 0, 14, 0, 15, 0, 16, 0 +validate 1, 2, 2, 3, 2, 3, 7, 8, 2, 3, 7, 8, 13, 14, 15, 16 + +aarch64 = uzp1 +generate int8x8_t, int8x16_t, int16x4_t, int16x8_t, int32x4_t, uint8x8_t, uint8x16_t, uint16x4_t, uint16x8_t, uint32x4_t, poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t + +aarch64 = zip1 +generate int32x2_t, int64x2_t, uint32x2_t, uint64x2_t, poly64x2_t + +/// Unzip vectors +name = vuzp1 +multi_fn = simd_shuffle-in_len-!, a, b, {unzip-1-in_len} +a = 0., 8., 1., 9., 4., 12., 5., 13. +b = 1., 10., 3., 11., 6., 14., 7., 15. +validate 0., 1., 1., 3., 4., 5., 6., 7. + +aarch64 = uzp1 +generate float32x4_t + +aarch64 = zip1 +generate float32x2_t, float64x2_t + +/// Unzip vectors +name = vuzp2 +multi_fn = simd_shuffle-in_len-!, a, b, {unzip-2-in_len} +a = 0, 17, 0, 18, 0, 18, 0, 19, 0, 18, 0, 19, 0, 23, 0, 24 +b = 0, 18, 0, 19, 0, 23, 0, 24, 0, 29, 0, 30, 0, 31, 0, 32 +validate 17, 18, 18, 19, 18, 19, 23, 24, 18, 19, 23, 24, 29, 30, 31, 32 + +aarch64 = uzp2 +generate int8x8_t, int8x16_t, int16x4_t, int16x8_t, int32x4_t, uint8x8_t, uint8x16_t, uint16x4_t, uint16x8_t, uint32x4_t, poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t + +aarch64 = zip2 +generate int32x2_t, int64x2_t, uint32x2_t, uint64x2_t, poly64x2_t + +/// Unzip vectors +name = vuzp2 +multi_fn = simd_shuffle-in_len-!, a, b, {unzip-2-in_len} +a = 0., 8., 1., 9., 4., 12., 5., 13. +b = 2., 9., 3., 11., 6., 14., 7., 15. +validate 8., 9., 9., 11., 12., 13., 14., 15. 
+ +aarch64 = uzp2 +generate float32x4_t + +aarch64 = zip2 +generate float32x2_t, float64x2_t + +//////////////////// +// Unsigned Absolute difference and Accumulate Long +//////////////////// + +/// Unsigned Absolute difference and Accumulate Long +name = vabal +multi_fn = vabd-unsigned-noext, b, c, d:in_t +multi_fn = simd_add, a, {simd_cast, d} +a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +c = 10, 10, 10, 10, 10, 10, 10, 10, 20, 0, 2, 4, 6, 8, 10, 12 +validate 10, 10, 10, 10, 10, 10, 10, 10, 20, 20, 20, 20, 20, 20, 20, 20 + +arm = vabal.s +aarch64 = uabal +generate uint16x8_t:uint8x8_t:uint8x8_t:uint16x8_t, uint32x4_t:uint16x4_t:uint16x4_t:uint32x4_t, uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t + +/// Unsigned Absolute difference and Accumulate Long +name = vabal_high +no-q +multi_fn = simd_shuffle8!, d:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15] +multi_fn = simd_shuffle8!, e:uint8x8_t, c, c, [8, 9, 10, 11, 12, 13, 14, 15] +multi_fn = vabd_u8, d, e, f:uint8x8_t +multi_fn = simd_add, a, {simd_cast, f} +a = 9, 10, 11, 12, 13, 14, 15, 16 +b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +c = 10, 10, 10, 10, 10, 10, 10, 10, 20, 0, 2, 4, 6, 8, 10, 12 +validate 20, 20, 20, 20, 20, 20, 20, 20 + +aarch64 = uabal +generate uint16x8_t:uint8x16_t:uint8x16_t:uint16x8_t + +/// Unsigned Absolute difference and Accumulate Long +name = vabal_high +no-q +multi_fn = simd_shuffle4!, d:uint16x4_t, b, b, [4, 5, 6, 7] +multi_fn = simd_shuffle4!, e:uint16x4_t, c, c, [4, 5, 6, 7] +multi_fn = vabd_u16, d, e, f:uint16x4_t +multi_fn = simd_add, a, {simd_cast, f} +a = 9, 10, 11, 12 +b = 1, 2, 3, 4, 9, 10, 11, 12 +c = 10, 10, 10, 10, 20, 0, 2, 4 +validate 20, 20, 20, 20 + +aarch64 = uabal +generate uint32x4_t:uint16x8_t:uint16x8_t:uint32x4_t + +/// Unsigned Absolute difference and Accumulate Long +name = vabal_high +no-q +multi_fn = simd_shuffle2!, d:uint32x2_t, b, b, [2, 3] +multi_fn = simd_shuffle2!, e:uint32x2_t, c, c, [2, 3] +multi_fn = vabd_u32, d, e, f:uint32x2_t +multi_fn = simd_add, a, {simd_cast, f} +a = 15, 16 +b = 1, 2, 15, 16 +c = 10, 10, 10, 12 +validate 20, 20 + +aarch64 = uabal +generate uint64x2_t:uint32x4_t:uint32x4_t:uint64x2_t + +//////////////////// +// Signed Absolute difference and Accumulate Long +//////////////////// + +/// Signed Absolute difference and Accumulate Long +name = vabal +multi_fn = vabd-signed-noext, b, c, d:int8x8_t +multi_fn = simd_cast, e:uint8x8_t, d +multi_fn = simd_add, a, {simd_cast, e} +a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +c = 10, 10, 10, 10, 10, 10, 10, 10, 20, 0, 2, 4, 6, 8, 10, 12 +validate 10, 10, 10, 10, 10, 10, 10, 10, 20, 20, 20, 20, 20, 20, 20, 20 + +arm = vabal.s +aarch64 = sabal +generate int16x8_t:int8x8_t:int8x8_t:int16x8_t + +/// Signed Absolute difference and Accumulate Long +name = vabal +multi_fn = vabd-signed-noext, b, c, d:int16x4_t +multi_fn = simd_cast, e:uint16x4_t, d +multi_fn = simd_add, a, {simd_cast, e} +a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +c = 10, 10, 10, 10, 10, 10, 10, 10, 20, 0, 2, 4, 6, 8, 10, 12 +validate 10, 10, 10, 10, 10, 10, 10, 10, 20, 20, 20, 20, 20, 20, 20, 20 + +arm = vabal.s +aarch64 = sabal +generate int32x4_t:int16x4_t:int16x4_t:int32x4_t + +/// Signed Absolute difference and Accumulate Long +name = vabal +multi_fn = vabd-signed-noext, b, c, d:int32x2_t +multi_fn = simd_cast, 
e:uint32x2_t, d +multi_fn = simd_add, a, {simd_cast, e} +a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +c = 10, 10, 10, 10, 10, 10, 10, 10, 20, 0, 2, 4, 6, 8, 10, 12 +validate 10, 10, 10, 10, 10, 10, 10, 10, 20, 20, 20, 20, 20, 20, 20, 20 + +arm = vabal.s +aarch64 = sabal +generate int64x2_t:int32x2_t:int32x2_t:int64x2_t + +/// Signed Absolute difference and Accumulate Long +name = vabal_high +no-q +multi_fn = simd_shuffle8!, d:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15] +multi_fn = simd_shuffle8!, e:int8x8_t, c, c, [8, 9, 10, 11, 12, 13, 14, 15] +multi_fn = vabd_s8, d, e, f:int8x8_t +multi_fn = simd_cast, f:uint8x8_t, f +multi_fn = simd_add, a, {simd_cast, f} +a = 9, 10, 11, 12, 13, 14, 15, 16 +b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +c = 10, 10, 10, 10, 10, 10, 10, 10, 20, 0, 2, 4, 6, 8, 10, 12 +validate 20, 20, 20, 20, 20, 20, 20, 20 + +aarch64 = sabal +generate int16x8_t:int8x16_t:int8x16_t:int16x8_t + +/// Signed Absolute difference and Accumulate Long +name = vabal_high +no-q +multi_fn = simd_shuffle4!, d:int16x4_t, b, b, [4, 5, 6, 7] +multi_fn = simd_shuffle4!, e:int16x4_t, c, c, [4, 5, 6, 7] +multi_fn = vabd_s16, d, e, f:int16x4_t +multi_fn = simd_cast, f:uint16x4_t, f +multi_fn = simd_add, a, {simd_cast, f} +a = 9, 10, 11, 12 +b = 1, 2, 3, 4, 9, 10, 11, 12 +c = 10, 10, 10, 10, 20, 0, 2, 4 +validate 20, 20, 20, 20 + +aarch64 = sabal +generate int32x4_t:int16x8_t:int16x8_t:int32x4_t + +/// Signed Absolute difference and Accumulate Long +name = vabal_high +no-q +multi_fn = simd_shuffle2!, d:int32x2_t, b, b, [2, 3] +multi_fn = simd_shuffle2!, e:int32x2_t, c, c, [2, 3] +multi_fn = vabd_s32, d, e, f:int32x2_t +multi_fn = simd_cast, f:uint32x2_t, f +multi_fn = simd_add, a, {simd_cast, f} +a = 15, 16 +b = 1, 2, 15, 16 +c = 10, 10, 10, 12 +validate 20, 20 + +aarch64 = sabal +generate int64x2_t:int32x4_t:int32x4_t:int64x2_t + +//////////////////// +// Singned saturating Absolute value +//////////////////// + +/// Singned saturating Absolute value +name = vqabs +a = MIN, MAX, -6, -5, -4, -3, -2, -1, 0, -127, 127, 1, 2, 3, 4, 5 +validate MAX, MAX, 6, 5, 4, 3, 2, 1, 0, 127, 127, 1, 2, 3, 4, 5 + +arm = vqabs.s +aarch64 = sqabs +link-arm = vqabs._EXT_ +link-aarch64 = sqabs._EXT_ +generate int*_t + +/// Singned saturating Absolute value +name = vqabs +a = MIN, -7 +validate MAX, 7 + +aarch64 = sqabs +link-aarch64 = sqabs._EXT_ +generate int64x*_t + +/// Signed saturating absolute value +name = vqabs +multi_fn = simd_extract, {vqabs-in_ntt-noext, {vdup_n-in_ntt-noext, a}}, 0 +a = -7 +validate 7 + +aarch64 = sqabs +generate i8:i8, i16:i16 + +/// Signed saturating absolute value +name = vqabs +a = -7 +validate 7 + +aarch64 = sqabs +link-aarch64 = sqabs._EXT_ +generate i32:i32, i64:i64 + +/// Shift left and insert +name = vsli +n-suffix +constn = N +multi_fn = static_assert-N-0-63 +multi_fn = transmute, {vsli_n-in_ntt-::<N>, transmute(a), transmute(b)} +a = 333 +b = 2042 +n = 2 +validate 8169 + +aarch64 = sli +generate i64, u64 + +/// Shift right and insert +name = vsri +n-suffix +constn = N +multi_fn = static_assert-N-1-bits +multi_fn = transmute, {vsri_n-in_ntt-::<N>, transmute(a), transmute(b)} +a = 333 +b = 2042 +n = 2 +validate 510 + +aarch64 = sri +generate i64, u64 diff --git a/library/stdarch/crates/stdarch-gen/src/main.rs b/library/stdarch/crates/stdarch-gen/src/main.rs new file mode 100644 index 000000000..a2ae250a7 --- /dev/null +++ b/library/stdarch/crates/stdarch-gen/src/main.rs @@ -0,0 +1,3391 @@ 
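The spec entries above drive name generation purely from the operand types: the generator appends a per-type suffix to the `name` prefix, so the `vqabs` entry instantiated for `int8x16_t` becomes `vqabsq_s8` (a leading `q` for 128-bit vectors, then the element sign and width). What follows is a minimal, hypothetical sketch of that convention only — the generator's real lookup is `type_to_suffix` in the `src/main.rs` source shown below:

```rust
// Illustrative sketch (not the generator's code): map a NEON vector type to
// the suffix appended to an intrinsic's `name` prefix. `q` marks 128-bit
// vectors; the rest encodes element sign and width.
fn suffix(ty: &str) -> &'static str {
    match ty {
        "int8x8_t" => "_s8",
        "int8x16_t" => "q_s8",
        "uint32x2_t" => "_u32",
        "float32x4_t" => "q_f32",
        other => panic!("type not covered in this sketch: {}", other),
    }
}

fn main() {
    // e.g. the `vqabs` spec entry above, generated for int8x16_t:
    assert_eq!(format!("vqabs{}", suffix("int8x16_t")), "vqabsq_s8");
}
```

The full mapping — including scalar, polynomial, and tuple types — is what `type_to_suffix` below implements.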
+use self::Suffix::*; +use self::TargetFeature::*; +use std::env; +use std::fs::File; +use std::io::prelude::*; +use std::io::{self, BufReader}; +use std::path::PathBuf; + +const IN: &str = "neon.spec"; +const ARM_OUT: &str = "generated.rs"; +const AARCH64_OUT: &str = "generated.rs"; + +const UINT_TYPES: [&str; 6] = [ + "uint8x8_t", + "uint8x16_t", + "uint16x4_t", + "uint16x8_t", + "uint32x2_t", + "uint32x4_t", +]; + +const UINT_TYPES_64: [&str; 2] = ["uint64x1_t", "uint64x2_t"]; + +const INT_TYPES: [&str; 6] = [ + "int8x8_t", + "int8x16_t", + "int16x4_t", + "int16x8_t", + "int32x2_t", + "int32x4_t", +]; + +const INT_TYPES_64: [&str; 2] = ["int64x1_t", "int64x2_t"]; + +const FLOAT_TYPES: [&str; 2] = [ + //"float8x8_t", not supported by rust + //"float8x16_t", not supported by rust + //"float16x4_t", not supported by rust + //"float16x8_t", not supported by rust + "float32x2_t", + "float32x4_t", +]; + +const FLOAT_TYPES_64: [&str; 2] = [ + //"float8x8_t", not supported by rust + //"float8x16_t", not supported by rust + //"float16x4_t", not supported by rust + //"float16x8_t", not supported by rust + "float64x1_t", + "float64x2_t", +]; + +fn type_len(t: &str) -> usize { + let s: Vec<_> = t.split("x").collect(); + if s.len() == 2 { + match &s[1][0..2] { + "1_" => 1, + "2_" => 2, + "4_" => 4, + "8_" => 8, + "16" => 16, + _ => panic!("unknown type: {}", t), + } + } else if s.len() == 3 { + s[1].parse::<usize>().unwrap() * type_sub_len(t) + } else { + 1 + } +} + +fn type_sub_len(t: &str) -> usize { + let s: Vec<_> = t.split('x').collect(); + if s.len() != 3 { + 1 + } else { + match s[2] { + "2_t" => 2, + "3_t" => 3, + "4_t" => 4, + _ => panic!("unknown type len: {}", t), + } + } +} + +fn type_bits(t: &str) -> usize { + match t { + "int8x8_t" | "int8x16_t" | "uint8x8_t" | "uint8x16_t" | "poly8x8_t" | "poly8x16_t" + | "i8" | "u8" => 8, + "int16x4_t" | "int16x8_t" | "uint16x4_t" | "uint16x8_t" | "poly16x4_t" | "poly16x8_t" + | "i16" | "u16" => 16, + "int32x2_t" | "int32x4_t" | "uint32x2_t" | "uint32x4_t" | "i32" | "u32" | "float32x2_t" + | "float32x4_t" | "f32" => 32, + "int64x1_t" | "int64x2_t" | "uint64x1_t" | "uint64x2_t" | "poly64x1_t" | "poly64x2_t" + | "i64" | "u64" | "float64x1_t" | "float64x2_t" | "f64" => 64, + _ => panic!("unknown type: {}", t), + } +} + +fn type_exp_len(t: &str, base_len: usize) -> usize { + let t = type_to_sub_type(t); + let len = type_len(&t) / base_len; + match len { + 1 => 0, + 2 => 1, + 4 => 2, + 8 => 3, + 16 => 4, + _ => panic!("unknown type: {}", t), + } +} + +fn type_bits_exp_len(t: &str) -> usize { + match t { + "int8x8_t" | "int8x16_t" | "uint8x8_t" | "uint8x16_t" | "poly8x8_t" | "poly8x16_t" + | "i8" | "u8" => 3, + "int16x4_t" | "int16x8_t" | "uint16x4_t" | "uint16x8_t" | "poly16x4_t" | "poly16x8_t" + | "i16" | "u16" => 4, + "int32x2_t" | "int32x4_t" | "uint32x2_t" | "uint32x4_t" | "i32" | "u32" => 5, + "int64x1_t" | "int64x2_t" | "uint64x1_t" | "uint64x2_t" | "poly64x1_t" | "poly64x2_t" + | "i64" | "u64" => 6, + _ => panic!("unknown type: {}", t), + } +} + +fn type_to_suffix(t: &str) -> &str { + match t { + "int8x8_t" => "_s8", + "int8x16_t" => "q_s8", + "int16x4_t" => "_s16", + "int16x8_t" => "q_s16", + "int32x2_t" => "_s32", + "int32x4_t" => "q_s32", + "int64x1_t" => "_s64", + "int64x2_t" => "q_s64", + "uint8x8_t" => "_u8", + "uint8x16_t" => "q_u8", + "uint16x4_t" => "_u16", + "uint16x8_t" => "q_u16", + "uint32x2_t" => "_u32", + "uint32x4_t" => "q_u32", + "uint64x1_t" => "_u64", + "uint64x2_t" => "q_u64", + "float16x4_t" => "_f16", + "float16x8_t" => 
"q_f16", + "float32x2_t" => "_f32", + "float32x4_t" => "q_f32", + "float64x1_t" => "_f64", + "float64x2_t" => "q_f64", + "poly8x8_t" => "_p8", + "poly8x16_t" => "q_p8", + "poly16x4_t" => "_p16", + "poly16x8_t" => "q_p16", + "poly64x1_t" => "_p64", + "poly64x2_t" => "q_p64", + "int8x8x2_t" => "_s8_x2", + "int8x8x3_t" => "_s8_x3", + "int8x8x4_t" => "_s8_x4", + "int16x4x2_t" => "_s16_x2", + "int16x4x3_t" => "_s16_x3", + "int16x4x4_t" => "_s16_x4", + "int32x2x2_t" => "_s32_x2", + "int32x2x3_t" => "_s32_x3", + "int32x2x4_t" => "_s32_x4", + "int64x1x2_t" => "_s64_x2", + "int64x1x3_t" => "_s64_x3", + "int64x1x4_t" => "_s64_x4", + "uint8x8x2_t" => "_u8_x2", + "uint8x8x3_t" => "_u8_x3", + "uint8x8x4_t" => "_u8_x4", + "uint16x4x2_t" => "_u16_x2", + "uint16x4x3_t" => "_u16_x3", + "uint16x4x4_t" => "_u16_x4", + "uint32x2x2_t" => "_u32_x2", + "uint32x2x3_t" => "_u32_x3", + "uint32x2x4_t" => "_u32_x4", + "uint64x1x2_t" => "_u64_x2", + "uint64x1x3_t" => "_u64_x3", + "uint64x1x4_t" => "_u64_x4", + "poly8x8x2_t" => "_p8_x2", + "poly8x8x3_t" => "_p8_x3", + "poly8x8x4_t" => "_p8_x4", + "poly16x4x2_t" => "_p16_x2", + "poly16x4x3_t" => "_p16_x3", + "poly16x4x4_t" => "_p16_x4", + "poly64x1x2_t" => "_p64_x2", + "poly64x1x3_t" => "_p64_x3", + "poly64x1x4_t" => "_p64_x4", + "float32x2x2_t" => "_f32_x2", + "float32x2x3_t" => "_f32_x3", + "float32x2x4_t" => "_f32_x4", + "float64x1x2_t" => "_f64_x2", + "float64x1x3_t" => "_f64_x3", + "float64x1x4_t" => "_f64_x4", + "int8x16x2_t" => "q_s8_x2", + "int8x16x3_t" => "q_s8_x3", + "int8x16x4_t" => "q_s8_x4", + "int16x8x2_t" => "q_s16_x2", + "int16x8x3_t" => "q_s16_x3", + "int16x8x4_t" => "q_s16_x4", + "int32x4x2_t" => "q_s32_x2", + "int32x4x3_t" => "q_s32_x3", + "int32x4x4_t" => "q_s32_x4", + "int64x2x2_t" => "q_s64_x2", + "int64x2x3_t" => "q_s64_x3", + "int64x2x4_t" => "q_s64_x4", + "uint8x16x2_t" => "q_u8_x2", + "uint8x16x3_t" => "q_u8_x3", + "uint8x16x4_t" => "q_u8_x4", + "uint16x8x2_t" => "q_u16_x2", + "uint16x8x3_t" => "q_u16_x3", + "uint16x8x4_t" => "q_u16_x4", + "uint32x4x2_t" => "q_u32_x2", + "uint32x4x3_t" => "q_u32_x3", + "uint32x4x4_t" => "q_u32_x4", + "uint64x2x2_t" => "q_u64_x2", + "uint64x2x3_t" => "q_u64_x3", + "uint64x2x4_t" => "q_u64_x4", + "poly8x16x2_t" => "q_p8_x2", + "poly8x16x3_t" => "q_p8_x3", + "poly8x16x4_t" => "q_p8_x4", + "poly16x8x2_t" => "q_p16_x2", + "poly16x8x3_t" => "q_p16_x3", + "poly16x8x4_t" => "q_p16_x4", + "poly64x2x2_t" => "q_p64_x2", + "poly64x2x3_t" => "q_p64_x3", + "poly64x2x4_t" => "q_p64_x4", + "float32x4x2_t" => "q_f32_x2", + "float32x4x3_t" => "q_f32_x3", + "float32x4x4_t" => "q_f32_x4", + "float64x2x2_t" => "q_f64_x2", + "float64x2x3_t" => "q_f64_x3", + "float64x2x4_t" => "q_f64_x4", + "i8" => "b_s8", + "i16" => "h_s16", + "i32" => "s_s32", + "i64" => "d_s64", + "u8" => "b_u8", + "u16" => "h_u16", + "u32" => "s_u32", + "u64" => "d_u64", + "f32" => "s_f32", + "f64" => "d_f64", + "p8" => "b_p8", + "p16" => "h_p16", + "p128" => "q_p128", + _ => panic!("unknown type: {}", t), + } +} + +fn type_to_dup_suffix(t: &str) -> String { + let s: Vec<_> = type_to_suffix(t).split('_').collect(); + assert_eq!(s.len(), 2); + format!("{}_dup_{}", s[0], s[1]) +} + +fn type_to_lane_suffix(t: &str) -> String { + let s: Vec<_> = type_to_suffix(t).split('_').collect(); + assert_eq!(s.len(), 2); + format!("{}_lane_{}", s[0], s[1]) +} + +fn type_to_n_suffix(t: &str) -> &str { + match t { + "int8x8_t" => "_n_s8", + "int8x16_t" => "q_n_s8", + "int16x4_t" => "_n_s16", + "int16x8_t" => "q_n_s16", + "int32x2_t" => "_n_s32", + "int32x4_t" => "q_n_s32", + 
"int64x1_t" => "_n_s64", + "int64x2_t" => "q_n_s64", + "uint8x8_t" => "_n_u8", + "uint8x16_t" => "q_n_u8", + "uint16x4_t" => "_n_u16", + "uint16x8_t" => "q_n_u16", + "uint32x2_t" => "_n_u32", + "uint32x4_t" => "q_n_u32", + "uint64x1_t" => "_n_u64", + "uint64x2_t" => "q_n_u64", + "float16x4_t" => "_n_f16", + "float16x8_t" => "q_n_f16", + "float32x2_t" => "_n_f32", + "float32x4_t" => "q_n_f32", + "float64x1_t" => "_n_f64", + "float64x2_t" => "q_n_f64", + "poly8x8_t" => "_n_p8", + "poly8x16_t" => "q_n_p8", + "poly16x4_t" => "_n_p16", + "poly16x8_t" => "q_n_p16", + "poly64x1_t" => "_n_p64", + "poly64x2_t" => "q_n_p64", + "i8" => "b_n_s8", + "i16" => "h_n_s16", + "i32" => "s_n_s32", + "i64" => "d_n_s64", + "u8" => "b_n_u8", + "u16" => "h_n_u16", + "u32" => "s_n_u32", + "u64" => "d_n_u64", + _ => panic!("unknown type: {}", t), + } +} + +fn type_to_noq_n_suffix(t: &str) -> &str { + match t { + "int8x8_t" | "int8x16_t" => "_n_s8", + "int16x4_t" | "int16x8_t" => "_n_s16", + "int32x2_t" | "int32x4_t" => "_n_s32", + "int64x1_t" | "int64x2_t" => "_n_s64", + "uint8x8_t" | "uint8x16_t" => "_n_u8", + "uint16x4_t" | "uint16x8_t" => "_n_u16", + "uint32x2_t" | "uint32x4_t" => "_n_u32", + "uint64x1_t" | "uint64x2_t" => "_n_u64", + "float16x4_t" | "float16x8_t" => "_n_f16", + "float32x2_t" | "float32x4_t" => "_n_f32", + "float64x1_t" | "float64x2_t" => "_n_f64", + "poly8x8_t" | "poly8x16_t" => "_n_p8", + "poly16x4_t" | "poly16x8_t" => "_n_p16", + "poly64x1_t" | "poly64x2_t" => "_n_p64", + "i8" => "b_n_s8", + "i16" => "h_n_s16", + "i32" => "s_n_s32", + "i64" => "d_n_s64", + "u8" => "b_n_u8", + "u16" => "h_n_u16", + "u32" => "s_n_u32", + "u64" => "d_n_u64", + _ => panic!("unknown type: {}", t), + } +} + +fn type_to_lane_suffixes<'a>(out_t: &'a str, in_t: &'a str, re_to_out: bool) -> String { + let mut str = String::new(); + let suf = type_to_suffix(out_t); + if !suf.starts_with("_") { + str.push_str(&suf[0..1]); + } + str.push_str("_lane"); + if !re_to_out { + str.push_str(type_to_suffix(in_t)); + } else { + if type_to_suffix(in_t).starts_with("q") { + str.push_str("q"); + }; + let suf2 = type_to_noq_suffix(out_t); + str.push_str(suf2); + } + str +} + +fn type_to_rot_suffix(c_name: &str, suf: &str) -> String { + let ns: Vec<_> = c_name.split('_').collect(); + assert_eq!(ns.len(), 2); + if suf.starts_with("q") { + format!("{}q_{}{}", ns[0], ns[1], &suf[1..]) + } else { + format!("{}{}", c_name, suf) + } +} + +fn type_to_signed(t: &str) -> String { + let s = t.replace("uint", "int"); + let s = s.replace("poly", "int"); + s +} + +fn type_to_unsigned(t: &str) -> String { + if t.contains("uint") { + return t.to_string(); + } + let s = t.replace("int", "uint"); + let s = s.replace("poly", "uint"); + s +} + +fn type_to_double_suffixes<'a>(out_t: &'a str, in_t: &'a str) -> String { + let mut str = String::new(); + let suf = type_to_suffix(in_t); + if suf.starts_with("q") && type_to_suffix(out_t).starts_with("q") { + str.push_str("q"); + } + if !suf.starts_with("_") && !suf.starts_with("q") { + str.push_str(&suf[0..1]); + } + str.push_str(type_to_noq_suffix(out_t)); + str.push_str(type_to_noq_suffix(in_t)); + str +} + +fn type_to_double_n_suffixes<'a>(out_t: &'a str, in_t: &'a str) -> String { + let mut str = String::new(); + let suf = type_to_suffix(in_t); + if suf.starts_with("q") && type_to_suffix(out_t).starts_with("q") { + str.push_str("q"); + } + if !suf.starts_with("_") && !suf.starts_with("q") { + str.push_str(&suf[0..1]); + } + str.push_str("_n"); + str.push_str(type_to_noq_suffix(out_t)); + 
str.push_str(type_to_noq_suffix(in_t)); + str +} + +fn type_to_noq_double_suffixes<'a>(out_t: &'a str, in_t: &'a str) -> String { + let mut str = String::new(); + str.push_str(type_to_noq_suffix(out_t)); + str.push_str(type_to_noq_suffix(in_t)); + str +} + +fn type_to_noq_suffix(t: &str) -> &str { + match t { + "int8x8_t" | "int8x16_t" | "i8" => "_s8", + "int16x4_t" | "int16x8_t" | "i16" => "_s16", + "int32x2_t" | "int32x4_t" | "i32" => "_s32", + "int64x1_t" | "int64x2_t" | "i64" => "_s64", + "uint8x8_t" | "uint8x16_t" | "u8" => "_u8", + "uint16x4_t" | "uint16x8_t" | "u16" => "_u16", + "uint32x2_t" | "uint32x4_t" | "u32" => "_u32", + "uint64x1_t" | "uint64x2_t" | "u64" => "_u64", + "float16x4_t" | "float16x8_t" => "_f16", + "float32x2_t" | "float32x4_t" | "f32" => "_f32", + "float64x1_t" | "float64x2_t" | "f64" => "_f64", + "poly8x8_t" | "poly8x16_t" => "_p8", + "poly16x4_t" | "poly16x8_t" => "_p16", + "poly64x1_t" | "poly64x2_t" | "p64" => "_p64", + "p128" => "_p128", + _ => panic!("unknown type: {}", t), + } +} + +#[derive(Clone, Copy)] +enum Suffix { + Normal, + Double, + NoQ, + NoQDouble, + NSuffix, + DoubleN, + NoQNSuffix, + OutSuffix, + OutNSuffix, + OutNox, + In1Nox, + OutDupNox, + OutLaneNox, + In1LaneNox, + Lane, + In2, + In2Lane, + OutLane, + Rot, + RotLane, +} + +#[derive(Clone, Copy)] +enum TargetFeature { + Default, + ArmV7, + Vfp4, + FPArmV8, + AES, + FCMA, + Dotprod, + I8MM, + SHA3, + RDM, + SM4, + FTTS, +} + +#[derive(Clone, Copy)] +enum Fntype { + Normal, + Load, + Store, +} + +fn type_to_global_type(t: &str) -> &str { + match t { + "int8x8_t" | "int8x8x2_t" | "int8x8x3_t" | "int8x8x4_t" => "i8x8", + "int8x16_t" | "int8x16x2_t" | "int8x16x3_t" | "int8x16x4_t" => "i8x16", + "int16x4_t" | "int16x4x2_t" | "int16x4x3_t" | "int16x4x4_t" => "i16x4", + "int16x8_t" | "int16x8x2_t" | "int16x8x3_t" | "int16x8x4_t" => "i16x8", + "int32x2_t" | "int32x2x2_t" | "int32x2x3_t" | "int32x2x4_t" => "i32x2", + "int32x4_t" | "int32x4x2_t" | "int32x4x3_t" | "int32x4x4_t" => "i32x4", + "int64x1_t" | "int64x1x2_t" | "int64x1x3_t" | "int64x1x4_t" => "i64x1", + "int64x2_t" | "int64x2x2_t" | "int64x2x3_t" | "int64x2x4_t" => "i64x2", + "uint8x8_t" | "uint8x8x2_t" | "uint8x8x3_t" | "uint8x8x4_t" => "u8x8", + "uint8x16_t" | "uint8x16x2_t" | "uint8x16x3_t" | "uint8x16x4_t" => "u8x16", + "uint16x4_t" | "uint16x4x2_t" | "uint16x4x3_t" | "uint16x4x4_t" => "u16x4", + "uint16x8_t" | "uint16x8x2_t" | "uint16x8x3_t" | "uint16x8x4_t" => "u16x8", + "uint32x2_t" | "uint32x2x2_t" | "uint32x2x3_t" | "uint32x2x4_t" => "u32x2", + "uint32x4_t" | "uint32x4x2_t" | "uint32x4x3_t" | "uint32x4x4_t" => "u32x4", + "uint64x1_t" | "uint64x1x2_t" | "uint64x1x3_t" | "uint64x1x4_t" => "u64x1", + "uint64x2_t" | "uint64x2x2_t" | "uint64x2x3_t" | "uint64x2x4_t" => "u64x2", + "float16x4_t" => "f16x4", + "float16x8_t" => "f16x8", + "float32x2_t" | "float32x2x2_t" | "float32x2x3_t" | "float32x2x4_t" => "f32x2", + "float32x4_t" | "float32x4x2_t" | "float32x4x3_t" | "float32x4x4_t" => "f32x4", + "float64x1_t" | "float64x1x2_t" | "float64x1x3_t" | "float64x1x4_t" => "f64", + "float64x2_t" | "float64x2x2_t" | "float64x2x3_t" | "float64x2x4_t" => "f64x2", + "poly8x8_t" | "poly8x8x2_t" | "poly8x8x3_t" | "poly8x8x4_t" => "i8x8", + "poly8x16_t" | "poly8x16x2_t" | "poly8x16x3_t" | "poly8x16x4_t" => "i8x16", + "poly16x4_t" | "poly16x4x2_t" | "poly16x4x3_t" | "poly16x4x4_t" => "i16x4", + "poly16x8_t" | "poly16x8x2_t" | "poly16x8x3_t" | "poly16x8x4_t" => "i16x8", + "poly64x1_t" | "poly64x1x2_t" | "poly64x1x3_t" | "poly64x1x4_t" => "i64x1", + 
"poly64x2_t" | "poly64x2x2_t" | "poly64x2x3_t" | "poly64x2x4_t" => "i64x2", + "i8" => "i8", + "i16" => "i16", + "i32" => "i32", + "i64" => "i64", + "u8" => "u8", + "u16" => "u16", + "u32" => "u32", + "u64" => "u64", + "f32" => "f32", + "f64" => "f64", + "p8" => "p8", + "p16" => "p16", + "p64" => "p64", + "p128" => "p128", + _ => panic!("unknown type: {}", t), + } +} + +fn type_to_sub_type(t: &str) -> String { + let s: Vec<_> = t.split('x').collect(); + match s.len() { + 2 => String::from(t), + 3 => format!("{}x{}_t", s[0], s[1]), + _ => panic!("unknown type: {}", t), + } +} + +fn type_to_native_type(t: &str) -> String { + let s: Vec<_> = t.split('x').collect(); + match s.len() { + 1 => { + assert!(t.contains("*const") || t.contains("*mut")); + let sub: Vec<_> = t.split(' ').collect(); + String::from(sub[1]) + } + 2 | 3 => match &s[0][0..3] { + "int" => format!("i{}", &s[0][3..]), + "uin" => format!("u{}", &s[0][4..]), + "flo" => format!("f{}", &s[0][5..]), + "pol" => format!("u{}", &s[0][4..]), + _ => panic!("unknown type: {}", t), + }, + _ => panic!("unknown type: {}", t), + } +} + +fn native_type_to_type(t: &str) -> &str { + match t { + "i8" => "int8x8_t", + "i16" => "int16x4_t", + "i32" => "int32x2_t", + "i64" => "int64x1_t", + "u8" => "uint8x8_t", + "u16" => "uint16x4_t", + "u32" => "uint32x2_t", + "u64" => "uint64x1_t", + "f16" => "float16x4_t", + "f32" => "float32x2_t", + "f64" => "float64x1_t", + _ => panic!("unknown type: {}", t), + } +} + +fn native_type_to_long_type(t: &str) -> &str { + match t { + "i8" => "int8x16_t", + "i16" => "int16x8_t", + "i32" => "int32x4_t", + "i64" => "int64x2_t", + "u8" => "uint8x16_t", + "u16" => "uint16x8_t", + "u32" => "uint32x4_t", + "u64" => "uint64x2_t", + "f16" => "float16x8_t", + "f32" => "float32x4_t", + "f64" => "float64x2_t", + _ => panic!("unknown type: {}", t), + } +} + +fn type_to_half(t: &str) -> &str { + match t { + "int8x16_t" => "int8x8_t", + "int16x8_t" => "int16x4_t", + "int32x4_t" => "int32x2_t", + "int64x2_t" => "int64x1_t", + "uint8x16_t" => "uint8x8_t", + "uint16x8_t" => "uint16x4_t", + "uint32x4_t" => "uint32x2_t", + "uint64x2_t" => "uint64x1_t", + "poly8x16_t" => "poly8x8_t", + "poly16x8_t" => "poly16x4_t", + "float32x4_t" => "float32x2_t", + "float64x2_t" => "float64x1_t", + _ => panic!("unknown half type for {}", t), + } +} + +fn asc(start: i32, len: usize) -> String { + let mut s = String::from("["); + for i in 0..len { + if i != 0 { + s.push_str(", "); + } + let n = start + i as i32; + s.push_str(&n.to_string()); + } + s.push_str("]"); + s +} + +fn transpose1(x: usize) -> &'static str { + match x { + 2 => "[0, 2]", + 4 => "[0, 4, 2, 6]", + 8 => "[0, 8, 2, 10, 4, 12, 6, 14]", + 16 => "[0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30]", + _ => panic!("unknown transpose order of len {}", x), + } +} + +fn transpose2(x: usize) -> &'static str { + match x { + 2 => "[1, 3]", + 4 => "[1, 5, 3, 7]", + 8 => "[1, 9, 3, 11, 5, 13, 7, 15]", + 16 => "[1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31]", + _ => panic!("unknown transpose order of len {}", x), + } +} + +fn zip1(x: usize) -> &'static str { + match x { + 2 => "[0, 2]", + 4 => "[0, 4, 1, 5]", + 8 => "[0, 8, 1, 9, 2, 10, 3, 11]", + 16 => "[0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23]", + _ => panic!("unknown zip order of len {}", x), + } +} + +fn zip2(x: usize) -> &'static str { + match x { + 2 => "[1, 3]", + 4 => "[2, 6, 3, 7]", + 8 => "[4, 12, 5, 13, 6, 14, 7, 15]", + 16 => "[8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31]", + _ => 
panic!("unknown zip order of len {}", x), + } +} + +fn unzip1(x: usize) -> &'static str { + match x { + 2 => "[0, 2]", + 4 => "[0, 2, 4, 6]", + 8 => "[0, 2, 4, 6, 8, 10, 12, 14]", + 16 => "[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]", + _ => panic!("unknown unzip order of len {}", x), + } +} + +fn unzip2(x: usize) -> &'static str { + match x { + 2 => "[1, 3]", + 4 => "[1, 3, 5, 7]", + 8 => "[1, 3, 5, 7, 9, 11, 13, 15]", + 16 => "[1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31]", + _ => panic!("unknown unzip order of len {}", x), + } +} + +fn values(t: &str, vs: &[String]) -> String { + if vs.len() == 1 && !t.contains('x') { + format!(": {} = {}", t, vs[0]) + } else if vs.len() == 1 && type_to_global_type(t) == "f64" { + format!(": {} = {}", type_to_global_type(t), vs[0]) + } else { + let s: Vec<_> = t.split('x').collect(); + if s.len() == 3 { + format!( + ": [{}; {}] = [{}]", + type_to_native_type(t), + type_len(t), + vs.iter() + .map(|v| map_val(type_to_global_type(t), v)) + //.map(|v| format!("{}{}", v, type_to_native_type(t))) + .collect::<Vec<_>>() + .join(", ") + ) + } else { + format!( + ": {} = {}::new({})", + type_to_global_type(t), + type_to_global_type(t), + vs.iter() + .map(|v| map_val(type_to_global_type(t), v)) + //.map(|v| format!("{}{}", v, type_to_native_type(t))) + .collect::<Vec<_>>() + .join(", ") + ) + } + } +} + +fn max_val(t: &str) -> &'static str { + match &t[..3] { + "u8x" => "0xFF", + "u16" => "0xFF_FF", + "u32" => "0xFF_FF_FF_FF", + "u64" => "0xFF_FF_FF_FF_FF_FF_FF_FF", + "i8x" => "0x7F", + "i16" => "0x7F_FF", + "i32" => "0x7F_FF_FF_FF", + "i64" => "0x7F_FF_FF_FF_FF_FF_FF_FF", + "f32" => "3.40282347e+38", + "f64" => "1.7976931348623157e+308", + _ => panic!("No TRUE for type {}", t), + } +} + +fn min_val(t: &str) -> &'static str { + match &t[..3] { + "u8x" => "0", + "u16" => "0", + "u32" => "0", + "u64" => "0", + "i8x" => "-128", + "i16" => "-32768", + "i32" => "-2147483648", + "i64" => "-9223372036854775808", + "f32" => "-3.40282347e+38", + "f64" => "-1.7976931348623157e+308", + _ => panic!("No TRUE for type {}", t), + } +} + +fn true_val(t: &str) -> &'static str { + match &t[..3] { + "u8x" => "0xFF", + "u16" => "0xFF_FF", + "u32" => "0xFF_FF_FF_FF", + "u64" => "0xFF_FF_FF_FF_FF_FF_FF_FF", + _ => panic!("No TRUE for type {}", t), + } +} + +fn ff_val(t: &str) -> &'static str { + match &t[..3] { + "u8x" => "0xFF", + "u16" => "0xFF_FF", + "u32" => "0xFF_FF_FF_FF", + "u64" => "0xFF_FF_FF_FF_FF_FF_FF_FF", + "i8x" => "0xFF", + "i16" => "0xFF_FF", + "i32" => "0xFF_FF_FF_FF", + "i64" => "0xFF_FF_FF_FF_FF_FF_FF_FF", + _ => panic!("No TRUE for type {}", t), + } +} + +fn false_val(_t: &str) -> &'static str { + "0" +} + +fn bits(t: &str) -> &'static str { + match &t[..3] { + "u8x" => "8", + "u16" => "16", + "u32" => "32", + "u64" => "64", + "i8x" => "8", + "i16" => "16", + "i32" => "32", + "i64" => "64", + "p8x" => "8", + "p16" => "16", + "p64" => "64", + _ => panic!("Unknown bits for type {}", t), + } +} + +fn bits_minus_one(t: &str) -> &'static str { + match &t[..3] { + "u8x" => "7", + "u16" => "15", + "u32" => "31", + "u64" => "63", + "i8x" => "7", + "i16" => "15", + "i32" => "31", + "i64" => "63", + "p8x" => "7", + "p16" => "15", + "p64" => "63", + _ => panic!("Unknown bits for type {}", t), + } +} + +fn half_bits(t: &str) -> &'static str { + match &t[..3] { + "u8x" => "4", + "u16" => "8", + "u32" => "16", + "u64" => "32", + "i8x" => "4", + "i16" => "8", + "i32" => "16", + "i64" => "32", + "p8x" => "4", + "p16" => "8", + "p64" => "32", + _ => 
panic!("Unknown bits for type {}", t), + } +} + +fn type_len_str(t: &str) -> &'static str { + match t { + "int8x8_t" => "8", + "int8x16_t" => "16", + "int16x4_t" => "4", + "int16x8_t" => "8", + "int32x2_t" => "2", + "int32x4_t" => "4", + "int64x1_t" => "1", + "int64x2_t" => "2", + "uint8x8_t" => "8", + "uint8x16_t" => "16", + "uint16x4_t" => "4", + "uint16x8_t" => "8", + "uint32x2_t" => "2", + "uint32x4_t" => "4", + "uint64x1_t" => "1", + "uint64x2_t" => "2", + "float16x4_t" => "4", + "float16x8_t" => "8", + "float32x2_t" => "2", + "float32x4_t" => "4", + "float64x1_t" => "1", + "float64x2_t" => "2", + "poly8x8_t" => "8", + "poly8x16_t" => "16", + "poly16x4_t" => "4", + "poly16x8_t" => "8", + "poly64x1_t" => "1", + "poly64x2_t" => "2", + _ => panic!("unknown type: {}", t), + } +} + +fn type_half_len_str(t: &str) -> &'static str { + match t { + "int8x8_t" => "4", + "int8x16_t" => "8", + "int16x4_t" => "2", + "int16x8_t" => "4", + "int32x2_t" => "1", + "int32x4_t" => "2", + "int64x1_t" => "0", + "int64x2_t" => "1", + "uint8x8_t" => "4", + "uint8x16_t" => "8", + "uint16x4_t" => "2", + "uint16x8_t" => "4", + "uint32x2_t" => "1", + "uint32x4_t" => "2", + "uint64x1_t" => "0", + "uint64x2_t" => "1", + "float16x4_t" => "2", + "float16x8_t" => "4", + "float32x2_t" => "1", + "float32x4_t" => "2", + "float64x1_t" => "0", + "float64x2_t" => "1", + "poly8x8_t" => "4", + "poly8x16_t" => "8", + "poly16x4_t" => "2", + "poly16x8_t" => "4", + "poly64x1_t" => "0", + "poly64x2_t" => "1", + _ => panic!("unknown type: {}", t), + } +} + +fn map_val<'v>(t: &str, v: &'v str) -> &'v str { + match v { + "FALSE" => false_val(t), + "TRUE" => true_val(t), + "MAX" => max_val(t), + "MIN" => min_val(t), + "FF" => ff_val(t), + "BITS" => bits(t), + "BITS_M1" => bits_minus_one(t), + "HFBITS" => half_bits(t), + "LEN" => type_len_str(t), + "HFLEN" => type_half_len_str(t), + o => o, + } +} + +fn type_to_ext(t: &str, v: bool, r: bool, pi8: bool) -> String { + if !t.contains('x') { + return t.replace("u", "i"); + } + let native = type_to_native_type(t); + let sub_ext = match type_sub_len(t) { + 1 => String::new(), + _ if v => format!( + ".p0v{}{}", + &type_len(&type_to_sub_type(t)).to_string(), + native + ), + _ if pi8 => format!(".p0i8"), + _ => format!(".p0{}", native), + }; + let sub_type = match &native[0..1] { + "i" | "f" => native, + "u" => native.replace("u", "i"), + _ => panic!("unknown type: {}", t), + }; + let ext = format!( + "v{}{}{}", + &type_len(&type_to_sub_type(t)).to_string(), + sub_type, + sub_ext + ); + if r { + let ss: Vec<_> = ext.split('.').collect(); + if ss.len() != 2 { + ext + } else { + format!("{}.{}", ss[1], ss[0]) + } + } else { + ext + } +} + +fn ext(s: &str, in_t: &[&str; 3], out_t: &str) -> String { + s.replace("_EXT_", &type_to_ext(in_t[0], false, false, false)) + .replace("_EXT2_", &type_to_ext(out_t, false, false, false)) + .replace("_EXT3_", &type_to_ext(in_t[1], false, false, false)) + .replace("_EXT4_", &type_to_ext(in_t[2], false, false, false)) + .replace("_EXTr3_", &type_to_ext(in_t[1], false, true, false)) + .replace("_EXTv2_", &type_to_ext(out_t, true, false, false)) + .replace("_EXTpi8_", &type_to_ext(in_t[1], false, false, true)) + .replace("_EXTpi82_", &type_to_ext(out_t, false, false, true)) + .replace("_EXTpi8r_", &type_to_ext(in_t[1], false, true, true)) +} + +fn is_vldx(name: &str) -> bool { + let s: Vec<_> = name.split('_').collect(); + &name[0..3] == "vld" + && name[3..4].parse::<i32>().unwrap() > 1 + && (s.last().unwrap().starts_with("s") || s.last().unwrap().starts_with("f")) 
+} + +fn is_vstx(name: &str) -> bool { + let s: Vec<_> = name.split('_').collect(); + s.len() == 2 + && &name[0..3] == "vst" + && name[3..4].parse::<i32>().unwrap() > 1 + && (s[1].starts_with("s") || s[1].starts_with("f")) +} + +#[allow(clippy::too_many_arguments)] +fn gen_aarch64( + current_comment: &str, + current_fn: &Option<String>, + current_name: &str, + current_aarch64: &Option<String>, + link_aarch64: &Option<String>, + const_aarch64: &Option<String>, + constn: &Option<String>, + in_t: &[&str; 3], + out_t: &str, + current_tests: &[( + Vec<String>, + Vec<String>, + Vec<String>, + Option<String>, + Vec<String>, + )], + suffix: Suffix, + para_num: i32, + target: TargetFeature, + fixed: &Vec<String>, + multi_fn: &Vec<String>, + fn_type: Fntype, +) -> (String, String) { + let name = match suffix { + Normal => format!("{}{}", current_name, type_to_suffix(in_t[1])), + NoQ => format!("{}{}", current_name, type_to_noq_suffix(in_t[1])), + Double => format!( + "{}{}", + current_name, + type_to_double_suffixes(out_t, in_t[1]) + ), + NoQDouble => format!( + "{}{}", + current_name, + type_to_noq_double_suffixes(out_t, in_t[1]) + ), + NSuffix => format!("{}{}", current_name, type_to_n_suffix(in_t[1])), + DoubleN => format!( + "{}{}", + current_name, + type_to_double_n_suffixes(out_t, in_t[1]) + ), + NoQNSuffix => format!("{}{}", current_name, type_to_noq_n_suffix(in_t[1])), + OutSuffix => format!("{}{}", current_name, type_to_suffix(out_t)), + OutNSuffix => format!("{}{}", current_name, type_to_n_suffix(out_t)), + OutNox => format!( + "{}{}", + current_name, + type_to_suffix(&type_to_sub_type(out_t)) + ), + In1Nox => format!( + "{}{}", + current_name, + type_to_suffix(&type_to_sub_type(in_t[1])) + ), + OutDupNox => format!( + "{}{}", + current_name, + type_to_dup_suffix(&type_to_sub_type(out_t)) + ), + OutLaneNox => format!( + "{}{}", + current_name, + type_to_lane_suffix(&type_to_sub_type(out_t)) + ), + In1LaneNox => format!( + "{}{}", + current_name, + type_to_lane_suffix(&type_to_sub_type(in_t[1])) + ), + Lane => format!( + "{}{}", + current_name, + type_to_lane_suffixes(out_t, in_t[1], false) + ), + In2 => format!("{}{}", current_name, type_to_suffix(in_t[2])), + In2Lane => format!( + "{}{}", + current_name, + type_to_lane_suffixes(out_t, in_t[2], false) + ), + OutLane => format!( + "{}{}", + current_name, + type_to_lane_suffixes(out_t, in_t[2], true) + ), + Rot => type_to_rot_suffix(current_name, type_to_suffix(out_t)), + RotLane => type_to_rot_suffix(current_name, &type_to_lane_suffixes(out_t, in_t[2], false)), + }; + let current_target = match target { + Default => "neon", + ArmV7 => "neon", + Vfp4 => "neon", + FPArmV8 => "neon", + AES => "neon,aes", + FCMA => "neon,fcma", + Dotprod => "neon,dotprod", + I8MM => "neon,i8mm", + SHA3 => "neon,sha3", + RDM => "rdm", + SM4 => "neon,sm4", + FTTS => "neon,frintts", + }; + let current_fn = if let Some(current_fn) = current_fn.clone() { + if link_aarch64.is_some() { + panic!("[{}] Can't specify link and fn at the same time.", name) + } + current_fn + } else if link_aarch64.is_some() { + format!("{}_", name) + } else { + if multi_fn.is_empty() { + panic!( + "[{}] Either (multi) fn or link-aarch have to be specified.", + name + ) + } + String::new() + }; + let current_aarch64 = current_aarch64.clone().unwrap(); + let mut link_t: Vec<String> = vec![ + in_t[0].to_string(), + in_t[1].to_string(), + in_t[2].to_string(), + out_t.to_string(), + ]; + let mut ext_c = String::new(); + if let Some(mut link_aarch64) = link_aarch64.clone() { + if 
link_aarch64.contains(":") { + let links: Vec<_> = link_aarch64.split(':').map(|v| v.to_string()).collect(); + assert_eq!(links.len(), 5); + link_aarch64 = links[0].to_string(); + link_t = vec![ + links[1].clone(), + links[2].clone(), + links[3].clone(), + links[4].clone(), + ]; + } + let link_aarch64 = if link_aarch64.starts_with("llvm") { + ext(&link_aarch64, in_t, out_t) + } else { + let mut link = String::from("llvm.aarch64.neon."); + link.push_str(&link_aarch64); + ext(&link, in_t, out_t) + }; + let (ext_inputs, ext_output) = { + if const_aarch64.is_some() { + if !matches!(fn_type, Fntype::Normal) { + let ptr_type = match fn_type { + Fntype::Load => "*const i8", + Fntype::Store => "*mut i8", + _ => panic!("unsupported fn type"), + }; + let sub = type_to_sub_type(in_t[1]); + ( + match type_sub_len(in_t[1]) { + 1 => format!("a: {}, n: i64, ptr: {}", sub, ptr_type), + 2 => format!("a: {}, b: {}, n: i64, ptr: {}", sub, sub, ptr_type), + 3 => format!( + "a: {}, b: {}, c: {}, n: i64, ptr: {}", + sub, sub, sub, ptr_type + ), + 4 => format!( + "a: {}, b: {}, c: {}, d: {}, n: i64, ptr: {}", + sub, sub, sub, sub, ptr_type + ), + _ => panic!("unsupported type: {}", in_t[1]), + }, + if out_t != "void" { + format!(" -> {}", out_t) + } else { + String::new() + }, + ) + } else { + ( + match para_num { + 1 => format!("a: {}, n: i32", in_t[0]), + 2 => format!("a: {}, b: {}, n: i32", in_t[0], in_t[1]), + 3 => format!("a: {}, b: {}, c: {}, n: i32", in_t[0], in_t[1], in_t[2]), + _ => unimplemented!("unknown para_num"), + }, + format!(" -> {}", out_t), + ) + } + } else if matches!(fn_type, Fntype::Store) { + let sub = type_to_sub_type(in_t[1]); + let ptr_type = if is_vstx(&name) { + "i8".to_string() + } else { + type_to_native_type(in_t[1]) + }; + let subs = match type_sub_len(in_t[1]) { + 1 => format!("a: {}", sub), + 2 => format!("a: {}, b: {}", sub, sub), + 3 => format!("a: {}, b: {}, c: {}", sub, sub, sub), + 4 => format!("a: {}, b: {}, c: {}, d: {}", sub, sub, sub, sub), + _ => panic!("unsupported type: {}", in_t[1]), + }; + (format!("{}, ptr: *mut {}", subs, ptr_type), String::new()) + } else if is_vldx(&name) { + let ptr_type = if name.contains("dup") { + type_to_native_type(out_t) + } else { + type_to_sub_type(out_t) + }; + ( + format!("ptr: *const {}", ptr_type), + format!(" -> {}", out_t), + ) + } else { + ( + match para_num { + 1 => format!("a: {}", link_t[0]), + 2 => format!("a: {}, b: {}", link_t[0], link_t[1]), + 3 => format!("a: {}, b: {}, c: {}", link_t[0], link_t[1], link_t[2]), + _ => unimplemented!("unknown para_num"), + }, + format!(" -> {}", link_t[3]), + ) + } + }; + ext_c = format!( + r#"#[allow(improper_ctypes)] + extern "unadjusted" {{ + #[cfg_attr(target_arch = "aarch64", link_name = "{}")] + fn {}({}){}; + }} + "#, + link_aarch64, current_fn, ext_inputs, ext_output, + ); + }; + let const_declare = if let Some(constn) = constn { + if constn.contains(":") { + let constns: Vec<_> = constn.split(':').map(|v| v.to_string()).collect(); + assert_eq!(constns.len(), 2); + format!(r#"<const {}: i32, const {}: i32>"#, constns[0], constns[1]) + } else { + format!(r#"<const {}: i32>"#, constn) + } + } else { + String::new() + }; + let multi_calls = if !multi_fn.is_empty() { + let mut calls = String::new(); + for i in 0..multi_fn.len() { + if i > 0 { + calls.push_str("\n "); + } + calls.push_str(&get_call( + &multi_fn[i], + current_name, + &const_declare, + in_t, + out_t, + fixed, + None, + true, + )); + } + calls + } else { + String::new() + }; + let const_assert = if let Some(constn) = 
constn { + if constn.contains(":") { + let constns: Vec<_> = constn.split(':').map(|v| v.to_string()).collect(); + let const_test = current_tests[0].3.as_ref().unwrap(); + let const_tests: Vec<_> = const_test.split(':').map(|v| v.to_string()).collect(); + assert_eq!(constns.len(), 2); + assert_eq!(const_tests.len(), 2); + format!( + r#", {} = {}, {} = {}"#, + constns[0], + map_val(in_t[1], &const_tests[0]), + constns[1], + map_val(in_t[1], &const_tests[1]), + ) + } else { + format!( + r#", {} = {}"#, + constn, + map_val(in_t[1], current_tests[0].3.as_ref().unwrap()) + ) + } + } else { + String::new() + }; + let const_legacy = if let Some(constn) = constn { + if constn.contains(":") { + format!( + "\n#[rustc_legacy_const_generics({}, {})]", + para_num - 1, + para_num + 1 + ) + } else { + format!("\n#[rustc_legacy_const_generics({})]", para_num) + } + } else { + String::new() + }; + let fn_decl = { + let fn_output = if out_t == "void" { + String::new() + } else { + format!("-> {} ", out_t) + }; + let fn_inputs = match para_num { + 1 => format!("(a: {})", in_t[0]), + 2 => format!("(a: {}, b: {})", in_t[0], in_t[1]), + 3 => format!("(a: {}, b: {}, c: {})", in_t[0], in_t[1], in_t[2]), + _ => panic!("unsupported parameter number"), + }; + format!( + "pub unsafe fn {}{}{} {}", + name, const_declare, fn_inputs, fn_output + ) + }; + let call_params = { + if let (Some(const_aarch64), Some(_)) = (const_aarch64, link_aarch64) { + if !matches!(fn_type, Fntype::Normal) { + let subs = match type_sub_len(in_t[1]) { + 1 => "b", + 2 => "b.0, b.1", + 3 => "b.0, b.1, b.2", + 4 => "b.0, b.1, b.2, b.3", + _ => panic!("unsupported type: {}", in_t[1]), + }; + format!( + r#"{} + {}{}({}, {} as i64, a as _)"#, + multi_calls, + ext_c, + current_fn, + subs, + constn.as_deref().unwrap() + ) + } else { + match para_num { + 1 => format!( + r#"{} + {}{}(a, {})"#, + multi_calls, ext_c, current_fn, const_aarch64 + ), + 2 => format!( + r#"{} + {}{}(a, b, {})"#, + multi_calls, ext_c, current_fn, const_aarch64 + ), + _ => String::new(), + } + } + } else if link_aarch64.is_some() && matches!(fn_type, Fntype::Store) { + let cast = if is_vstx(&name) { " as _" } else { "" }; + match type_sub_len(in_t[1]) { + 1 => format!(r#"{}{}(b, a{})"#, ext_c, current_fn, cast), + 2 => format!(r#"{}{}(b.0, b.1, a{})"#, ext_c, current_fn, cast), + 3 => format!(r#"{}{}(b.0, b.1, b.2, a{})"#, ext_c, current_fn, cast), + 4 => format!(r#"{}{}(b.0, b.1, b.2, b.3, a{})"#, ext_c, current_fn, cast), + _ => panic!("unsupported type: {}", in_t[1]), + } + } else if link_aarch64.is_some() && is_vldx(&name) { + format!(r#"{}{}(a as _)"#, ext_c, current_fn,) + } else { + let trans: [&str; 2] = if link_t[3] != out_t { + ["transmute(", ")"] + } else { + ["", ""] + }; + match (multi_calls.len(), para_num, fixed.len()) { + (0, 1, 0) => format!(r#"{}{}{}(a){}"#, ext_c, trans[0], current_fn, trans[1]), + (0, 1, _) => { + let fixed: Vec<String> = + fixed.iter().take(type_len(in_t[0])).cloned().collect(); + format!( + r#"let b{}; + {}{}{}(a, transmute(b)){}"#, + values(in_t[0], &fixed), + ext_c, + trans[0], + current_fn, + trans[1], + ) + } + (0, 2, _) => format!(r#"{}{}{}(a, b){}"#, ext_c, trans[0], current_fn, trans[1],), + (0, 3, _) => format!(r#"{}{}(a, b, c)"#, ext_c, current_fn,), + (_, 1, _) => format!(r#"{}{}"#, ext_c, multi_calls,), + (_, 2, _) => format!(r#"{}{}"#, ext_c, multi_calls,), + (_, 3, _) => format!(r#"{}{}"#, ext_c, multi_calls,), + (_, _, _) => String::new(), + } + } + }; + let stable = match target { + Default | ArmV7 | Vfp4 | FPArmV8 | AES => 
{ + String::from("\n#[stable(feature = \"neon_intrinsics\", since = \"1.59.0\")]") + } + RDM => String::from("\n#[stable(feature = \"rdm_intrinsics\", since = \"1.62.0\")]"), + _ => String::new(), + }; + let function = format!( + r#" +{} +#[inline] +#[target_feature(enable = "{}")] +#[cfg_attr(test, assert_instr({}{}))]{}{} +{}{{ + {} +}} +"#, + current_comment, + current_target, + current_aarch64, + const_assert, + const_legacy, + stable, + fn_decl, + call_params + ); + let test_target = match target { + I8MM => "neon,i8mm", + SM4 => "neon,sm4", + SHA3 => "neon,sha3", + FTTS => "neon,frintts", + _ => "neon", + }; + let test = match fn_type { + Fntype::Normal => gen_test( + &name, + in_t, + &out_t, + current_tests, + [type_len(in_t[0]), type_len(in_t[1]), type_len(in_t[2])], + type_len(out_t), + para_num, + test_target, + ), + Fntype::Load => gen_load_test(&name, in_t, &out_t, current_tests, type_len(out_t)), + Fntype::Store => gen_store_test(&name, in_t, &out_t, current_tests, type_len(in_t[1])), + }; + (function, test) +} + +fn gen_load_test( + name: &str, + in_t: &[&str; 3], + out_t: &str, + current_tests: &[( + Vec<String>, + Vec<String>, + Vec<String>, + Option<String>, + Vec<String>, + )], + type_len: usize, +) -> String { + let mut test = format!( + r#" + #[simd_test(enable = "neon")] + unsafe fn test_{}() {{"#, + name, + ); + for (a, b, _, n, e) in current_tests { + let a: Vec<String> = a.iter().take(type_len + 1).cloned().collect(); + let e: Vec<String> = e.iter().take(type_len).cloned().collect(); + let has_b = b.len() > 0; + let has_n = n.is_some(); + let mut input = String::from("["); + for i in 0..type_len + 1 { + if i != 0 { + input.push_str(", "); + } + input.push_str(&a[i]) + } + input.push_str("]"); + let output = |v: &Vec<String>| { + let mut output = String::from("["); + for i in 0..type_sub_len(out_t) { + if i != 0 { + output.push_str(", "); + } + let sub_len = type_len / type_sub_len(out_t); + if type_to_global_type(out_t) != "f64" { + let mut sub_output = format!("{}::new(", type_to_global_type(out_t)); + for j in 0..sub_len { + if j != 0 { + sub_output.push_str(", "); + } + sub_output.push_str(&v[i * sub_len + j]); + } + sub_output.push_str(")"); + output.push_str(&sub_output); + } else { + output.push_str(&v[i]); + } + } + output.push_str("]"); + output + }; + let input_b = if has_b { + let b: Vec<String> = b.iter().take(type_len).cloned().collect(); + format!( + r#" + let b: [{}; {}] = {};"#, + type_to_global_type(in_t[1]), + type_sub_len(in_t[1]), + output(&b), + ) + } else { + String::new() + }; + let t = format!( + r#" + let a: [{}; {}] = {};{} + let e: [{}; {}] = {}; + let r: [{}; {}] = transmute({}{}(a[1..].as_ptr(){})); + assert_eq!(r, e); +"#, + type_to_native_type(out_t), + type_len + 1, + input, + input_b, + type_to_global_type(out_t), + type_sub_len(out_t), + output(&e), + type_to_global_type(out_t), + type_sub_len(out_t), + name, + if has_n { + format!("::<{}>", n.as_deref().unwrap()) + } else { + String::new() + }, + if has_b { ", transmute(b)" } else { "" }, + ); + test.push_str(&t); + } + test.push_str(" }\n"); + test +} + +fn gen_store_test( + name: &str, + in_t: &[&str; 3], + _out_t: &str, + current_tests: &[( + Vec<String>, + Vec<String>, + Vec<String>, + Option<String>, + Vec<String>, + )], + type_len: usize, +) -> String { + let mut test = format!( + r#" + #[simd_test(enable = "neon")] + unsafe fn test_{}() {{"#, + name, + ); + for (a, _, _, constn, e) in current_tests { + let a: Vec<String> = a.iter().take(type_len + 1).cloned().collect(); + let 
e: Vec<String> = e.iter().take(type_len).cloned().collect(); + let mut input = String::from("["); + for i in 0..type_len + 1 { + if i != 0 { + input.push_str(", "); + } + input.push_str(&a[i]) + } + input.push_str("]"); + let mut output = String::from("["); + for i in 0..type_len { + if i != 0 { + output.push_str(", "); + } + output.push_str(&e[i]) + } + output.push_str("]"); + let const_n = constn + .as_deref() + .map_or(String::new(), |n| format!("::<{}>", n.to_string())); + let t = format!( + r#" + let a: [{}; {}] = {}; + let e: [{}; {}] = {}; + let mut r: [{}; {}] = [0{}; {}]; + {}{}(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _)); + assert_eq!(r, e); +"#, + type_to_native_type(in_t[1]), + type_len + 1, + input, + type_to_native_type(in_t[1]), + type_len, + output, + type_to_native_type(in_t[1]), + type_len, + type_to_native_type(in_t[1]), + type_len, + name, + const_n, + ); + test.push_str(&t); + } + test.push_str(" }\n"); + test +} + +fn gen_test( + name: &str, + in_t: &[&str; 3], + out_t: &str, + current_tests: &[( + Vec<String>, + Vec<String>, + Vec<String>, + Option<String>, + Vec<String>, + )], + len_in: [usize; 3], + len_out: usize, + para_num: i32, + target: &str, +) -> String { + let mut test = format!( + r#" + #[simd_test(enable = "{}")] + unsafe fn test_{}() {{"#, + target, name, + ); + for (a, b, c, n, e) in current_tests { + let a: Vec<String> = a.iter().take(len_in[0]).cloned().collect(); + let b: Vec<String> = b.iter().take(len_in[1]).cloned().collect(); + let c: Vec<String> = c.iter().take(len_in[2]).cloned().collect(); + let e: Vec<String> = e.iter().take(len_out).cloned().collect(); + let const_value = if let Some(constn) = n { + if constn.contains(":") { + let constns: Vec<_> = constn.split(':').map(|v| v.to_string()).collect(); + format!( + r#"::<{}, {}>"#, + map_val(in_t[1], &constns[0]), + map_val(in_t[1], &constns[1]) + ) + } else { + format!(r#"::<{}>"#, map_val(in_t[1], constn)) + } + } else { + String::new() + }; + let r_type = match type_sub_len(out_t) { + 1 => type_to_global_type(out_t).to_string(), + _ => format!("[{}; {}]", type_to_native_type(out_t), type_len(out_t)), + }; + let t = { + match para_num { + 1 => { + format!( + r#" + let a{}; + let e{}; + let r: {} = transmute({}{}(transmute(a))); + assert_eq!(r, e); +"#, + values(in_t[0], &a), + values(out_t, &e), + r_type, + name, + const_value + ) + } + 2 => { + format!( + r#" + let a{}; + let b{}; + let e{}; + let r: {} = transmute({}{}(transmute(a), transmute(b))); + assert_eq!(r, e); +"#, + values(in_t[0], &a), + values(in_t[1], &b), + values(out_t, &e), + r_type, + name, + const_value + ) + } + 3 => { + format!( + r#" + let a{}; + let b{}; + let c{}; + let e{}; + let r: {} = transmute({}{}(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); +"#, + values(in_t[0], &a), + values(in_t[1], &b), + values(in_t[2], &c), + values(out_t, &e), + r_type, + name, + const_value + ) + } + _ => { + panic!("no support para_num:{}", para_num.to_string()) + } + } + }; + + test.push_str(&t); + } + test.push_str(" }\n"); + test +} + +#[allow(clippy::too_many_arguments)] +fn gen_arm( + current_comment: &str, + current_fn: &Option<String>, + current_name: &str, + current_arm: &str, + link_arm: &Option<String>, + current_aarch64: &Option<String>, + link_aarch64: &Option<String>, + const_arm: &Option<String>, + const_aarch64: &Option<String>, + constn: &Option<String>, + in_t: &[&str; 3], + out_t: &str, + current_tests: &[( + Vec<String>, + Vec<String>, + Vec<String>, + Option<String>, + 
Vec<String>, + )], + suffix: Suffix, + para_num: i32, + target: TargetFeature, + fixed: &Vec<String>, + multi_fn: &Vec<String>, + fn_type: Fntype, + separate: bool, +) -> (String, String) { + let name = match suffix { + Normal => format!("{}{}", current_name, type_to_suffix(in_t[1])), + NoQ => format!("{}{}", current_name, type_to_noq_suffix(in_t[1])), + Double => format!( + "{}{}", + current_name, + type_to_double_suffixes(out_t, in_t[1]) + ), + NoQDouble => format!( + "{}{}", + current_name, + type_to_noq_double_suffixes(out_t, in_t[1]) + ), + NSuffix => format!("{}{}", current_name, type_to_n_suffix(in_t[1])), + DoubleN => format!( + "{}{}", + current_name, + type_to_double_n_suffixes(out_t, in_t[1]) + ), + NoQNSuffix => format!("{}{}", current_name, type_to_noq_n_suffix(in_t[1])), + OutSuffix => format!("{}{}", current_name, type_to_suffix(out_t)), + OutNSuffix => format!("{}{}", current_name, type_to_n_suffix(out_t)), + OutNox => format!( + "{}{}", + current_name, + type_to_suffix(&type_to_sub_type(out_t)) + ), + In1Nox => format!( + "{}{}", + current_name, + type_to_suffix(&type_to_sub_type(in_t[1])) + ), + OutDupNox => format!( + "{}{}", + current_name, + type_to_dup_suffix(&type_to_sub_type(out_t)) + ), + OutLaneNox => format!( + "{}{}", + current_name, + type_to_lane_suffix(&type_to_sub_type(out_t)) + ), + In1LaneNox => format!( + "{}{}", + current_name, + type_to_lane_suffix(&type_to_sub_type(in_t[1])) + ), + Lane => format!( + "{}{}", + current_name, + type_to_lane_suffixes(out_t, in_t[1], false) + ), + In2 => format!("{}{}", current_name, type_to_suffix(in_t[2])), + In2Lane => format!( + "{}{}", + current_name, + type_to_lane_suffixes(out_t, in_t[2], false) + ), + OutLane => format!( + "{}{}", + current_name, + type_to_lane_suffixes(out_t, in_t[2], true) + ), + Rot => type_to_rot_suffix(current_name, type_to_suffix(out_t)), + RotLane => type_to_rot_suffix(current_name, &type_to_lane_suffixes(out_t, in_t[2], false)), + }; + let current_aarch64 = current_aarch64 + .clone() + .unwrap_or_else(|| current_arm.to_string()); + let current_target_aarch64 = match target { + Default => "neon", + ArmV7 => "neon", + Vfp4 => "neon", + FPArmV8 => "neon", + AES => "neon,aes", + FCMA => "neon,fcma", + Dotprod => "neon,dotprod", + I8MM => "neon,i8mm", + SHA3 => "neon,sha3", + RDM => "rdm", + SM4 => "neon,sm4", + FTTS => "neon,frintts", + }; + let current_target_arm = match target { + Default => "v7", + ArmV7 => "v7", + Vfp4 => "vfp4", + FPArmV8 => "fp-armv8,v8", + AES => "aes,v8", + FCMA => "v8", // v8.3a + Dotprod => "v8", // v8.2a + I8MM => "v8,i8mm", + RDM => unreachable!(), + SM4 => unreachable!(), + SHA3 => unreachable!(), + FTTS => unreachable!(), + }; + let current_fn = if let Some(current_fn) = current_fn.clone() { + if link_aarch64.is_some() || link_arm.is_some() { + panic!( + "[{}] Can't specify link and function at the same time. 
{} / {:?} / {:?}", + name, current_fn, link_aarch64, link_arm + ) + } + current_fn + } else if link_aarch64.is_some() || link_arm.is_some() { + format!("{}_", name) + } else { + if multi_fn.is_empty() { + panic!( + "[{}] Either fn or link-arm and link-aarch have to be specified.", + name + ) + } + String::new() + }; + let mut ext_c = String::new(); + let mut ext_c_arm = if multi_fn.is_empty() || link_arm.is_none() { + String::new() + } else { + String::from( + r#" + "#, + ) + }; + let mut ext_c_aarch64 = if multi_fn.is_empty() || link_aarch64.is_none() { + String::new() + } else { + String::from( + r#" + "#, + ) + }; + let mut link_arm_t: Vec<String> = vec![ + in_t[0].to_string(), + in_t[1].to_string(), + in_t[2].to_string(), + out_t.to_string(), + ]; + let mut link_aarch64_t: Vec<String> = vec![ + in_t[0].to_string(), + in_t[1].to_string(), + in_t[2].to_string(), + out_t.to_string(), + ]; + if let (Some(mut link_arm), Some(mut link_aarch64)) = (link_arm.clone(), link_aarch64.clone()) { + if link_arm.contains(":") { + let links: Vec<_> = link_arm.split(':').map(|v| v.to_string()).collect(); + assert_eq!(links.len(), 5); + link_arm = links[0].to_string(); + link_arm_t = vec![ + links[1].clone(), + links[2].clone(), + links[3].clone(), + links[4].clone(), + ]; + } + if link_aarch64.contains(":") { + let links: Vec<_> = link_aarch64.split(':').map(|v| v.to_string()).collect(); + assert_eq!(links.len(), 5); + link_aarch64 = links[0].to_string(); + link_aarch64_t = vec![ + links[1].clone(), + links[2].clone(), + links[3].clone(), + links[4].clone(), + ]; + } + let link_arm = if link_arm.starts_with("llvm") { + ext(&link_arm, in_t, out_t) + } else { + let mut link = String::from("llvm.arm.neon."); + link.push_str(&link_arm); + ext(&link, in_t, out_t) + }; + let link_aarch64 = if link_aarch64.starts_with("llvm") { + ext(&link_aarch64, in_t, out_t) + } else { + let mut link = String::from("llvm.aarch64.neon."); + link.push_str(&link_aarch64); + ext(&link, in_t, out_t) + }; + if out_t == link_arm_t[3] && out_t == link_aarch64_t[3] { + ext_c = format!( + r#"#[allow(improper_ctypes)] + extern "unadjusted" {{ + #[cfg_attr(target_arch = "arm", link_name = "{}")] + #[cfg_attr(target_arch = "aarch64", link_name = "{}")] + fn {}({}) -> {}; + }} +"#, + link_arm, + link_aarch64, + current_fn, + match para_num { + 1 => format!("a: {}", in_t[0]), + 2 => format!("a: {}, b: {}", in_t[0], in_t[1]), + 3 => format!("a: {}, b: {}, c: {}", in_t[0], in_t[1], in_t[2]), + _ => unimplemented!("unknown para_num"), + }, + out_t + ); + }; + let (arm_ext_inputs, arm_ext_output) = { + if let Some(const_arm) = const_arm { + if !matches!(fn_type, Fntype::Normal) { + let ptr_type = match fn_type { + Fntype::Load => "*const i8", + Fntype::Store => "*mut i8", + _ => panic!("unsupported fn type"), + }; + let sub_type = type_to_sub_type(in_t[1]); + let inputs = match type_sub_len(in_t[1]) { + 1 => format!("a: {}", sub_type), + 2 => format!("a: {}, b: {}", sub_type, sub_type,), + 3 => format!("a: {}, b: {}, c: {}", sub_type, sub_type, sub_type,), + 4 => format!( + "a: {}, b: {}, c: {}, d: {}", + sub_type, sub_type, sub_type, sub_type, + ), + _ => panic!("unknown type: {}", in_t[1]), + }; + let out = if out_t == "void" { + String::new() + } else { + format!(" -> {}", out_t) + }; + ( + format!("ptr: {}, {}, n: i32, size: i32", ptr_type, inputs), + out, + ) + } else { + let (_, const_type) = if const_arm.contains(":") { + let consts: Vec<_> = + const_arm.split(':').map(|v| v.trim().to_string()).collect(); + (consts[0].clone(), 
consts[1].clone()) + } else { + ( + const_arm.to_string(), + in_t[para_num as usize - 1].to_string(), + ) + }; + ( + match para_num { + 1 => format!("a: {}, n: {}", in_t[0], const_type), + 2 => format!("a: {}, b: {}, n: {}", in_t[0], in_t[1], const_type), + 3 => format!( + "a: {}, b: {}, c: {}, n: {}", + in_t[0], in_t[1], in_t[2], const_type + ), + _ => unimplemented!("unknown para_num"), + }, + format!(" -> {}", out_t), + ) + } + } else if out_t != link_arm_t[3] { + ( + match para_num { + 1 => format!("a: {}", link_arm_t[0]), + 2 => format!("a: {}, b: {}", link_arm_t[0], link_arm_t[1]), + 3 => format!( + "a: {}, b: {}, c: {}", + link_arm_t[0], link_arm_t[1], link_arm_t[2] + ), + _ => unimplemented!("unknown para_num"), + }, + format!(" -> {}", link_arm_t[3]), + ) + } else if matches!(fn_type, Fntype::Store) { + let sub_type = type_to_sub_type(in_t[1]); + let inputs = match type_sub_len(in_t[1]) { + 1 => format!("a: {}", sub_type), + 2 => format!("a: {}, b: {}", sub_type, sub_type,), + 3 => format!("a: {}, b: {}, c: {}", sub_type, sub_type, sub_type,), + 4 => format!( + "a: {}, b: {}, c: {}, d: {}", + sub_type, sub_type, sub_type, sub_type, + ), + _ => panic!("unknown type: {}", in_t[1]), + }; + let (ptr_type, size) = if is_vstx(&name) { + ("i8".to_string(), ", size: i32") + } else { + (type_to_native_type(in_t[1]), "") + }; + ( + format!("ptr: *mut {}, {}{}", ptr_type, inputs, size), + String::new(), + ) + } else if is_vldx(&name) { + ( + format!("ptr: *const i8, size: i32"), + format!(" -> {}", out_t), + ) + } else { + (String::new(), String::new()) + } + }; + ext_c_arm.push_str(&format!( + r#"#[allow(improper_ctypes)] + extern "unadjusted" {{ + #[cfg_attr(target_arch = "arm", link_name = "{}")] + fn {}({}){}; + }} +"#, + link_arm, current_fn, arm_ext_inputs, arm_ext_output, + )); + let (aarch64_ext_inputs, aarch64_ext_output) = { + if let Some(const_aarch64) = const_aarch64 { + if !matches!(fn_type, Fntype::Normal) { + let ptr_type = match fn_type { + Fntype::Load => "*const i8", + Fntype::Store => "*mut i8", + _ => panic!("unsupported fn type"), + }; + let sub_type = type_to_sub_type(in_t[1]); + let mut inputs = match type_sub_len(in_t[1]) { + 1 => format!("a: {}", sub_type,), + 2 => format!("a: {}, b: {}", sub_type, sub_type,), + 3 => format!("a: {}, b: {}, c: {}", sub_type, sub_type, sub_type,), + 4 => format!( + "a: {}, b: {}, c: {}, d: {}", + sub_type, sub_type, sub_type, sub_type, + ), + _ => panic!("unknown type: {}", in_t[1]), + }; + inputs.push_str(&format!(", n: i64, ptr: {}", ptr_type)); + let out = if out_t == "void" { + String::new() + } else { + format!(" -> {}", out_t) + }; + (inputs, out) + } else if const_aarch64.contains("dup-in_len-N as ttn") { + ( + match para_num { + 1 => format!("a: {}, n: {}", in_t[0], in_t[0]), + 2 => format!("a: {}, b: {}, n: {}", in_t[0], in_t[1], in_t[1]), + 3 => format!( + "a: {}, b: {}, c: {}, n: {}", + in_t[0], in_t[1], in_t[2], in_t[1] + ), + _ => unimplemented!("unknown para_num"), + }, + format!(" -> {}", out_t), + ) + } else { + ( + match para_num { + 1 => format!("a: {}, n: i32", in_t[0]), + 2 => format!("a: {}, b: {}, n: i32", in_t[0], in_t[1]), + 3 => format!("a: {}, b: {}, c: {}, n: i32", in_t[0], in_t[1], in_t[2]), + _ => unimplemented!("unknown para_num"), + }, + format!(" -> {}", out_t), + ) + } + } else if out_t != link_aarch64_t[3] { + ( + match para_num { + 1 => format!("a: {}", link_aarch64_t[0]), + 2 => format!("a: {}, b: {}", link_aarch64_t[0], link_aarch64_t[1]), + 3 => format!( + "a: {}, b: {}, c: {}", + 
link_aarch64_t[0], link_aarch64_t[1], link_aarch64_t[2] + ), + _ => unimplemented!("unknown para_num"), + }, + format!(" -> {}", link_aarch64_t[3]), + ) + } else if matches!(fn_type, Fntype::Store) { + let sub_type = type_to_sub_type(in_t[1]); + let mut inputs = match type_sub_len(in_t[1]) { + 1 => format!("a: {}", sub_type,), + 2 => format!("a: {}, b: {}", sub_type, sub_type,), + 3 => format!("a: {}, b: {}, c: {}", sub_type, sub_type, sub_type,), + 4 => format!( + "a: {}, b: {}, c: {}, d: {}", + sub_type, sub_type, sub_type, sub_type, + ), + _ => panic!("unknown type: {}", in_t[1]), + }; + let ptr_type = if is_vstx(&name) { + "i8".to_string() + } else { + type_to_native_type(in_t[1]) + }; + inputs.push_str(&format!(", ptr: *mut {}", ptr_type)); + (inputs, String::new()) + } else if is_vldx(&name) { + let ptr_type = if name.contains("dup") { + type_to_native_type(out_t) + } else { + type_to_sub_type(out_t) + }; + ( + format!("ptr: *const {}", ptr_type), + format!(" -> {}", out_t), + ) + } else { + (String::new(), String::new()) + } + }; + ext_c_aarch64.push_str(&format!( + r#"#[allow(improper_ctypes)] + extern "unadjusted" {{ + #[cfg_attr(target_arch = "aarch64", link_name = "{}")] + fn {}({}){}; + }} +"#, + link_aarch64, current_fn, aarch64_ext_inputs, aarch64_ext_output, + )); + }; + let const_declare = if let Some(constn) = constn { + format!(r#"<const {}: i32>"#, constn) + } else { + String::new() + }; + let multi_calls = if !multi_fn.is_empty() { + let mut calls = String::new(); + for i in 0..multi_fn.len() { + if i > 0 { + calls.push_str("\n "); + } + calls.push_str(&get_call( + &multi_fn[i], + current_name, + &const_declare, + in_t, + out_t, + fixed, + None, + false, + )); + } + calls + } else { + String::new() + }; + let const_assert = if let Some(constn) = constn { + format!( + r#", {} = {}"#, + constn, + map_val(in_t[1], current_tests[0].3.as_ref().unwrap()) + ) + } else { + String::new() + }; + let const_legacy = if constn.is_some() { + format!("\n#[rustc_legacy_const_generics({})]", para_num) + } else { + String::new() + }; + let fn_decl = { + let fn_output = if out_t == "void" { + String::new() + } else { + format!("-> {} ", out_t) + }; + let fn_inputs = match para_num { + 1 => format!("(a: {})", in_t[0]), + 2 => format!("(a: {}, b: {})", in_t[0], in_t[1]), + 3 => format!("(a: {}, b: {}, c: {})", in_t[0], in_t[1], in_t[2]), + _ => panic!("unsupported parameter number"), + }; + format!( + "pub unsafe fn {}{}{} {}", + name, const_declare, fn_inputs, fn_output + ) + }; + let function = if separate { + let call_arm = { + let arm_params = if let (Some(const_arm), Some(_)) = (const_arm, link_arm) { + if !matches!(fn_type, Fntype::Normal) { + let subs = match type_sub_len(in_t[1]) { + 1 => "b", + 2 => "b.0, b.1", + 3 => "b.0, b.1, b.2", + 4 => "b.0, b.1, b.2, b.3", + _ => "", + }; + format!( + "{}(a as _, {}, {}, {})", + current_fn, + subs, + constn.as_deref().unwrap(), + type_bits(&type_to_sub_type(in_t[1])) / 8, + ) + } else { + let cnt = if const_arm.contains(':') { + let consts: Vec<_> = + const_arm.split(':').map(|v| v.trim().to_string()).collect(); + consts[0].clone() + } else { + let const_arm = const_arm.replace("ttn", &type_to_native_type(in_t[1])); + let mut cnt = String::from(in_t[1]); + cnt.push_str("("); + for i in 0..type_len(in_t[1]) { + if i != 0 { + cnt.push_str(", "); + } + cnt.push_str(&const_arm); + } + cnt.push_str(")"); + cnt + }; + match para_num { + 1 => format!("{}(a, {})", current_fn, cnt), + 2 => format!("{}(a, b, {})", current_fn, cnt), + _ => 
String::new(), + } + } + } else if out_t != link_arm_t[3] { + match para_num { + 1 => format!("transmute({}(a))", current_fn,), + 2 => format!("transmute({}(transmute(a), transmute(b)))", current_fn,), + _ => String::new(), + } + } else if matches!(fn_type, Fntype::Store) { + let (cast, size) = if is_vstx(&name) { + ( + " as _", + format!(", {}", type_bits(&type_to_sub_type(in_t[1])) / 8), + ) + } else { + ("", String::new()) + }; + match type_sub_len(in_t[1]) { + 1 => format!("{}(a{}, b{})", current_fn, cast, size), + 2 => format!("{}(a{}, b.0, b.1{})", current_fn, cast, size), + 3 => format!("{}(a{}, b.0, b.1, b.2{})", current_fn, cast, size), + 4 => format!("{}(a{}, b.0, b.1, b.2, b.3{})", current_fn, cast, size), + _ => String::new(), + } + } else if link_arm.is_some() && is_vldx(&name) { + format!( + "{}(a as *const i8, {})", + current_fn, + type_bits(&type_to_sub_type(out_t)) / 8 + ) + } else { + String::new() + }; + format!( + r#"{}{{ + {}{}{} +}}"#, + fn_decl, multi_calls, ext_c_arm, arm_params + ) + }; + let call_aarch64 = { + let aarch64_params = + if let (Some(const_aarch64), Some(_)) = (const_aarch64, link_aarch64) { + if !matches!(fn_type, Fntype::Normal) { + let subs = match type_sub_len(in_t[1]) { + 1 => "b", + 2 => "b.0, b.1", + 3 => "b.0, b.1, b.2", + 4 => "b.0, b.1, b.2, b.3", + _ => "", + }; + format!( + "{}({}, {} as i64, a as _)", + current_fn, + subs, + constn.as_deref().unwrap() + ) + } else if const_aarch64.contains("dup-in_len-N as ttn") { + let const_aarch64 = format!("N as {}", type_to_native_type(in_t[1])); + let mut cnt = String::from(in_t[1]); + cnt.push_str("("); + for i in 0..type_len(in_t[1]) { + if i != 0 { + cnt.push_str(", "); + } + cnt.push_str(&const_aarch64); + } + cnt.push_str(")"); + format!("{}(a, {})", current_fn, cnt) + } else { + match para_num { + 1 => format!("{}(a, {})", current_fn, const_aarch64), + 2 => format!("{}(a, b, {})", current_fn, const_aarch64), + _ => String::new(), + } + } + } else if out_t != link_aarch64_t[3] { + match para_num { + 1 => format!("transmute({}(a))", current_fn,), + 2 => format!("transmute({}(a, b))", current_fn,), + _ => String::new(), + } + } else if matches!(fn_type, Fntype::Store) { + let cast = if is_vstx(&name) { " as _" } else { "" }; + match type_sub_len(in_t[1]) { + 1 => format!("{}(b, a{})", current_fn, cast), + 2 => format!("{}(b.0, b.1, a{})", current_fn, cast), + 3 => format!("{}(b.0, b.1, b.2, a{})", current_fn, cast), + 4 => format!("{}(b.0, b.1, b.2, b.3, a{})", current_fn, cast), + _ => String::new(), + } + } else if link_aarch64.is_some() && is_vldx(&name) { + format!("{}(a as _)", current_fn) + } else { + String::new() + }; + format!( + r#"{}{{ + {}{}{} +}}"#, + fn_decl, multi_calls, ext_c_aarch64, aarch64_params + ) + }; + let stable_aarch64 = match target { + Default | ArmV7 | Vfp4 | FPArmV8 | AES => { + String::from("\n#[stable(feature = \"neon_intrinsics\", since = \"1.59.0\")]") + } + RDM => String::from("\n#[stable(feature = \"rdm_intrinsics\", since = \"1.62.0\")]"), + _ => String::new(), + }; + format!( + r#" +{} +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,{}")] +#[cfg_attr(test, assert_instr({}{}))]{} +{} + +{} +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "{}")] +#[cfg_attr(test, assert_instr({}{}))]{}{} +{} +"#, + current_comment, + current_target_arm, + expand_intrinsic(¤t_arm, in_t[1]), + const_assert, + const_legacy, + call_arm, + current_comment, + current_target_aarch64, + expand_intrinsic(¤t_aarch64, in_t[1]), + const_assert, 
+ const_legacy, + stable_aarch64, + call_aarch64, + ) + } else { + let call = { + let stmts = match (multi_calls.len(), para_num, fixed.len()) { + (0, 1, 0) => format!(r#"{}{}(a)"#, ext_c, current_fn,), + (0, 1, _) => { + let fixed: Vec<String> = + fixed.iter().take(type_len(in_t[0])).cloned().collect(); + format!( + r#"let b{}; + {}{}(a, transmute(b))"#, + values(in_t[0], &fixed), + ext_c, + current_fn, + ) + } + (0, 2, _) => format!(r#"{}{}(a, b)"#, ext_c, current_fn,), + (0, 3, _) => format!(r#"{}{}(a, b, c)"#, ext_c, current_fn,), + (_, 1, _) => format!(r#"{}{}"#, ext_c, multi_calls,), + (_, 2, _) => format!(r#"{}{}"#, ext_c, multi_calls,), + (_, 3, _) => format!(r#"{}{}"#, ext_c, multi_calls,), + (_, _, _) => String::new(), + }; + if stmts != String::new() { + format!( + r#"{}{{ + {} +}}"#, + fn_decl, stmts + ) + } else { + String::new() + } + }; + let stable_aarch64 = match target { + Default | ArmV7 | Vfp4 | FPArmV8 | AES => String::from("\n#[cfg_attr(target_arch = \"aarch64\", stable(feature = \"neon_intrinsics\", since = \"1.59.0\"))]"), + RDM => String::from("\n#[cfg_attr(target_arch = \"aarch64\", stable(feature = \"rdm_intrinsics\", since = \"1.62.0\"))]"), + _ => String::new(), + }; + format!( + r#" +{} +#[inline] +#[target_feature(enable = "{}")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "{}"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr({}{}))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr({}{}))]{}{} +{} +"#, + current_comment, + current_target_aarch64, + current_target_arm, + expand_intrinsic(¤t_arm, in_t[1]), + const_assert, + expand_intrinsic(¤t_aarch64, in_t[1]), + const_assert, + const_legacy, + stable_aarch64, + call, + ) + }; + let test_target = match target { + I8MM => "neon,i8mm", + SM4 => "neon,sm4", + SHA3 => "neon,sha3", + FTTS => "neon,frintts", + _ => "neon", + }; + let test = match fn_type { + Fntype::Normal => gen_test( + &name, + in_t, + &out_t, + current_tests, + [type_len(in_t[0]), type_len(in_t[1]), type_len(in_t[2])], + type_len(out_t), + para_num, + test_target, + ), + Fntype::Load => gen_load_test(&name, in_t, &out_t, current_tests, type_len(out_t)), + Fntype::Store => gen_store_test(&name, in_t, &out_t, current_tests, type_len(in_t[1])), + }; + (function, test) +} + +fn expand_intrinsic(intr: &str, t: &str) -> String { + if intr.ends_with('.') { + let ext = match t { + "int8x8_t" => "i8", + "int8x16_t" => "i8", + "int16x4_t" => "i16", + "int16x8_t" => "i16", + "int32x2_t" => "i32", + "int32x4_t" => "i32", + "int64x1_t" => "i64", + "int64x2_t" => "i64", + "uint8x8_t" => "i8", + "uint8x16_t" => "i8", + "uint16x4_t" => "i16", + "uint16x8_t" => "i16", + "uint32x2_t" => "i32", + "uint32x4_t" => "i32", + "uint64x1_t" => "i64", + "uint64x2_t" => "i64", + "float16x4_t" => "f16", + "float16x8_t" => "f16", + "float32x2_t" => "f32", + "float32x4_t" => "f32", + "float64x1_t" => "f64", + "float64x2_t" => "f64", + "poly8x8_t" => "i8", + "poly8x16_t" => "i8", + "poly16x4_t" => "i16", + "poly16x8_t" => "i16", + /* + "poly64x1_t" => "i64x1", + "poly64x2_t" => "i64x2", + */ + _ => panic!("unknown type for extension: {}", t), + }; + format!(r#""{}{}""#, intr, ext) + } else if intr.ends_with(".s") { + let ext = match t { + "int8x8_t" => "s8", + "int8x16_t" => "s8", + "int16x4_t" => "s16", + "int16x8_t" => "s16", + "int32x2_t" => "s32", + "int32x4_t" => "s32", + "int64x1_t" => "s64", + "int64x2_t" => "s64", + "uint8x8_t" => "u8", + "uint8x16_t" => "u8", + "uint16x4_t" => "u16", + "uint16x8_t" => "u16", + "uint32x2_t" => "u32", 
+ "uint32x4_t" => "u32", + "uint64x1_t" => "u64", + "uint64x2_t" => "u64", + "poly8x8_t" => "p8", + "poly8x16_t" => "p8", + "poly16x4_t" => "p16", + "poly16x8_t" => "p16", + "float16x4_t" => "f16", + "float16x8_t" => "f16", + "float32x2_t" => "f32", + "float32x4_t" => "f32", + "float64x1_t" => "f64", + "float64x2_t" => "f64", + /* + "poly64x1_t" => "i64x1", + "poly64x2_t" => "i64x2", + */ + _ => panic!("unknown type for extension: {}", t), + }; + format!(r#""{}{}""#, &intr[..intr.len() - 1], ext) + } else if intr.ends_with(".l") { + let ext = match t { + "int8x8_t" => "8", + "int8x16_t" => "8", + "int16x4_t" => "16", + "int16x8_t" => "16", + "int32x2_t" => "32", + "int32x4_t" => "32", + "int64x1_t" => "64", + "int64x2_t" => "64", + "uint8x8_t" => "8", + "uint8x16_t" => "8", + "uint16x4_t" => "16", + "uint16x8_t" => "16", + "uint32x2_t" => "32", + "uint32x4_t" => "32", + "uint64x1_t" => "64", + "uint64x2_t" => "64", + "poly8x8_t" => "8", + "poly8x16_t" => "8", + "poly16x4_t" => "16", + "poly16x8_t" => "16", + "float16x4_t" => "16", + "float16x8_t" => "16", + "float32x2_t" => "32", + "float32x4_t" => "32", + "float64x1_t" => "64", + "float64x2_t" => "64", + "poly64x1_t" => "64", + "poly64x2_t" => "64", + _ => panic!("unknown type for extension: {}", t), + }; + format!(r#""{}{}""#, &intr[..intr.len() - 1], ext) + } else { + intr.to_string() + } +} + +fn get_call( + in_str: &str, + current_name: &str, + const_declare: &str, + in_t: &[&str; 3], + out_t: &str, + fixed: &Vec<String>, + n: Option<i32>, + aarch64: bool, +) -> String { + let params: Vec<_> = in_str.split(',').map(|v| v.trim().to_string()).collect(); + assert!(params.len() > 0); + let mut fn_name = params[0].clone(); + if fn_name == "a" { + return String::from("a"); + } + if fn_name == "transpose-1-in_len" { + return transpose1(type_len(in_t[1])).to_string(); + } + if fn_name == "transpose-2-in_len" { + return transpose2(type_len(in_t[1])).to_string(); + } + if fn_name == "zip-1-in_len" { + return zip1(type_len(in_t[1])).to_string(); + } + if fn_name == "zip-2-in_len" { + return zip2(type_len(in_t[1])).to_string(); + } + if fn_name == "unzip-1-in_len" { + return unzip1(type_len(in_t[1])).to_string(); + } + if fn_name == "unzip-2-in_len" { + return unzip2(type_len(in_t[1])).to_string(); + } + if fn_name.starts_with("dup") { + let fn_format: Vec<_> = fn_name.split('-').map(|v| v.to_string()).collect(); + let len = match &*fn_format[1] { + "out_len" => type_len(out_t), + "in_len" => type_len(in_t[1]), + "in0_len" => type_len(in_t[0]), + "halflen" => type_len(in_t[1]) / 2, + _ => 0, + }; + let mut s = format!("{} [", const_declare); + for i in 0..len { + if i != 0 { + s.push_str(", "); + } + s.push_str(&fn_format[2]); + } + s.push_str("]"); + return s; + } + if fn_name.starts_with("asc") { + let fn_format: Vec<_> = fn_name.split('-').map(|v| v.to_string()).collect(); + let start = match &*fn_format[1] { + "0" => 0, + "n" => n.unwrap(), + "out_len" => type_len(out_t) as i32, + "halflen" => (type_len(in_t[1]) / 2) as i32, + s => s.parse::<i32>().unwrap(), + }; + let len = match &*fn_format[2] { + "out_len" => type_len(out_t), + "in_len" => type_len(in_t[1]), + "in0_len" => type_len(in_t[0]), + "halflen" => type_len(in_t[1]) / 2, + _ => 0, + }; + return asc(start, len); + } + if fn_name.starts_with("base") { + let fn_format: Vec<_> = fn_name.split('-').map(|v| v.to_string()).collect(); + assert_eq!(fn_format.len(), 3); + let mut s = format!("<const {}: i32> [", &fn_format[2]); + let base_len = fn_format[1].parse::<usize>().unwrap(); + for i 
in 0..type_len(in_t[1]) / base_len { + for j in 0..base_len { + if i != 0 || j != 0 { + s.push_str(", "); + } + s.push_str(&format!("{} * {} as u32", base_len, &fn_format[2])); + if j != 0 { + s.push_str(&format!(" + {}", j)); + } + } + } + s.push_str("]"); + return s; + } + if fn_name.starts_with("as") { + let fn_format: Vec<_> = fn_name.split('-').map(|v| v.to_string()).collect(); + assert_eq!(fn_format.len(), 3); + let t = match &*fn_format[2] { + "in_ttn" => type_to_native_type(in_t[1]), + _ => String::new(), + }; + return format!("{} as {}", &fn_format[1], t); + } + if fn_name.starts_with("ins") { + let fn_format: Vec<_> = fn_name.split('-').map(|v| v.to_string()).collect(); + let n = n.unwrap(); + let len = match &*fn_format[1] { + "out_len" => type_len(out_t), + "in_len" => type_len(in_t[1]), + "in0_len" => type_len(in_t[0]), + _ => 0, + }; + let offset = match &*fn_format[2] { + "out_len" => type_len(out_t), + "in_len" => type_len(in_t[1]), + "in0_len" => type_len(in_t[0]), + _ => 0, + }; + let mut s = format!("{} [", const_declare); + for i in 0..len { + if i != 0 { + s.push_str(", "); + } + if i == n as usize { + s.push_str(&format!("{} + {} as u32", offset.to_string(), fn_format[3])); + } else { + s.push_str(&i.to_string()); + } + } + s.push_str("]"); + return s; + } + if fn_name.starts_with("static_assert_imm") { + let fn_format: Vec<_> = fn_name.split('-').map(|v| v.to_string()).collect(); + let len = match &*fn_format[1] { + "out_exp_len" => type_exp_len(out_t, 1), + "out_bits_exp_len" => type_bits_exp_len(out_t), + "in_exp_len" => type_exp_len(in_t[1], 1), + "in_bits_exp_len" => type_bits_exp_len(in_t[1]), + "in0_exp_len" => type_exp_len(in_t[0], 1), + "in1_exp_len" => type_exp_len(in_t[1], 1), + "in2_exp_len" => type_exp_len(in_t[2], 1), + "in2_rot" => type_exp_len(in_t[2], 2), + "in2_dot" => type_exp_len(in_t[2], 4), + _ => 0, + }; + if len == 0 { + return format!( + r#"static_assert!({} : i32 where {} == 0);"#, + fn_format[2], fn_format[2] + ); + } else { + return format!(r#"static_assert_imm{}!({});"#, len, fn_format[2]); + } + } + if fn_name.starts_with("static_assert") { + let fn_format: Vec<_> = fn_name.split('-').map(|v| v.to_string()).collect(); + let lim1 = if fn_format[2] == "bits" { + type_bits(in_t[1]).to_string() + } else if fn_format[2] == "halfbits" { + (type_bits(in_t[1]) / 2).to_string() + } else { + fn_format[2].clone() + }; + let lim2 = if fn_format[3] == "bits" { + type_bits(in_t[1]).to_string() + } else if fn_format[3] == "halfbits" { + (type_bits(in_t[1]) / 2).to_string() + } else { + fn_format[3].clone() + }; + if lim1 == lim2 { + return format!( + r#"static_assert!({} : i32 where {} == {});"#, + fn_format[1], fn_format[1], lim1 + ); + } else { + return format!( + r#"static_assert!({} : i32 where {} >= {} && {} <= {});"#, + fn_format[1], fn_format[1], lim1, fn_format[1], lim2 + ); + } + } + if fn_name.starts_with("fix_right_shift_imm") { + let fn_format: Vec<_> = fn_name.split('-').map(|v| v.to_string()).collect(); + let lim = if fn_format[2] == "bits" { + type_bits(in_t[1]).to_string() + } else { + fn_format[2].clone() + }; + let fixed = if in_t[1].starts_with('u') { + format!("return vdup{nself}(0);", nself = type_to_n_suffix(in_t[1])) + } else { + (lim.parse::<i32>().unwrap() - 1).to_string() + }; + + return format!( + r#"let {name}: i32 = if {const_name} == {upper} {{ {fixed} }} else {{ N }};"#, + name = fn_format[1].to_lowercase(), + const_name = fn_format[1], + upper = lim, + fixed = fixed, + ); + } + + if fn_name.starts_with("matchn") { + let 
fn_format: Vec<_> = fn_name.split('-').map(|v| v.to_string()).collect(); + let len = match &*fn_format[1] { + "out_exp_len" => type_exp_len(out_t, 1), + "in_exp_len" => type_exp_len(in_t[1], 1), + "in0_exp_len" => type_exp_len(in_t[0], 1), + _ => 0, + }; + let mut call = format!("match {} & 0b{} {{\n", &fn_format[2], "1".repeat(len)); + let mut sub_call = String::new(); + for p in 1..params.len() { + if !sub_call.is_empty() { + sub_call.push_str(", "); + } + sub_call.push_str(¶ms[p]); + } + for i in 0..(2u32.pow(len as u32) as usize) { + let sub_match = format!( + " {} => {},\n", + i, + get_call( + &sub_call, + current_name, + const_declare, + in_t, + out_t, + fixed, + Some(i as i32), + aarch64 + ) + ); + call.push_str(&sub_match); + } + call.push_str(" _ => unreachable_unchecked(),\n }"); + return call; + } + let mut re: Option<(String, String)> = None; + let mut param_str = String::new(); + let mut i = 1; + while i < params.len() { + let s = ¶ms[i]; + if s.starts_with('{') { + let mut sub_fn = String::new(); + let mut parentheses = 0; + while i < params.len() { + if !sub_fn.is_empty() { + sub_fn.push_str(", "); + } + sub_fn.push_str(¶ms[i]); + let l = params[i].len(); + for j in 0..l { + if ¶ms[i][j..j + 1] == "{" { + parentheses += 1; + } else { + break; + } + } + for j in 0..l { + if ¶ms[i][l - j - 1..l - j] == "}" { + parentheses -= 1; + } else { + break; + } + } + if parentheses == 0 { + break; + } + i += 1; + } + let sub_call = get_call( + &sub_fn[1..sub_fn.len() - 1], + current_name, + const_declare, + in_t, + out_t, + fixed, + n.clone(), + aarch64, + ); + if !param_str.is_empty() { + param_str.push_str(", "); + } + param_str.push_str(&sub_call); + } else if s.contains(':') { + let re_params: Vec<_> = s.split(':').map(|v| v.to_string()).collect(); + if re_params[1] == "" { + re = Some((re_params[0].clone(), in_t[1].to_string())); + } else if re_params[1] == "in_t" { + re = Some((re_params[0].clone(), in_t[1].to_string())); + } else if re_params[1] == "signed" { + re = Some((re_params[0].clone(), type_to_signed(in_t[1]))); + } else if re_params[1] == "unsigned" { + re = Some((re_params[0].clone(), type_to_unsigned(in_t[1]))); + } else if re_params[1] == "in_t0" { + re = Some((re_params[0].clone(), in_t[0].to_string())); + } else if re_params[1] == "in_t1" { + re = Some((re_params[0].clone(), in_t[1].to_string())); + } else if re_params[1] == "out_t" { + re = Some((re_params[0].clone(), out_t.to_string())); + } else if re_params[1] == "half" { + re = Some((re_params[0].clone(), type_to_half(in_t[1]).to_string())); + } else if re_params[1] == "in_ntt" { + re = Some(( + re_params[0].clone(), + native_type_to_type(in_t[1]).to_string(), + )); + } else if re_params[1] == "in_long_ntt" { + re = Some(( + re_params[0].clone(), + native_type_to_long_type(in_t[1]).to_string(), + )); + } else if re_params[1] == "out_ntt" { + re = Some((re_params[0].clone(), native_type_to_type(out_t).to_string())); + } else if re_params[1] == "out_long_ntt" { + re = Some(( + re_params[0].clone(), + native_type_to_long_type(out_t).to_string(), + )); + } else { + re = Some((re_params[0].clone(), re_params[1].clone())); + } + } else { + if !param_str.is_empty() { + param_str.push_str(", "); + } + param_str.push_str(s); + } + i += 1; + } + if fn_name == "fixed" { + let (re_name, re_type) = re.unwrap(); + let fixed: Vec<String> = fixed.iter().take(type_len(in_t[1])).cloned().collect(); + return format!(r#"let {}{};"#, re_name, values(&re_type, &fixed)); + } + if fn_name == "fixed-half-right" { + let fixed: 
Vec<String> = fixed.iter().take(type_len(in_t[1])).cloned().collect(); + let half = fixed[type_len(in_t[1]) / 2..] + .iter() + .fold(String::new(), |mut s, fix| { + s.push_str(fix); + s.push_str(", "); + s + }); + return format!(r#"[{}]"#, &half[..half.len() - 2]); + } + if fn_name == "a - b" { + return fn_name; + } + if fn_name == "-a" { + return fn_name; + } + if fn_name.contains('-') { + let fn_format: Vec<_> = fn_name.split('-').map(|v| v.to_string()).collect(); + assert_eq!(fn_format.len(), 3); + fn_name = if fn_format[0] == "self" { + current_name.to_string() + } else { + fn_format[0].clone() + }; + if fn_format[1] == "self" { + fn_name.push_str(type_to_suffix(in_t[1])); + } else if fn_format[1] == "nself" { + fn_name.push_str(type_to_n_suffix(in_t[1])); + } else if fn_format[1] == "nselfvfp4" { + fn_name.push_str(type_to_n_suffix(in_t[1])); + if !aarch64 { + fn_name.push_str("_vfp4"); + } + } else if fn_format[1] == "out" { + fn_name.push_str(type_to_suffix(out_t)); + } else if fn_format[1] == "in0" { + fn_name.push_str(type_to_suffix(in_t[0])); + } else if fn_format[1] == "in2" { + fn_name.push_str(type_to_suffix(in_t[2])); + } else if fn_format[1] == "in2lane" { + fn_name.push_str(&type_to_lane_suffixes(out_t, in_t[2], false)); + } else if fn_format[1] == "outlane" { + fn_name.push_str(&type_to_lane_suffixes(out_t, in_t[2], true)); + } else if fn_format[1] == "signed" { + fn_name.push_str(type_to_suffix(&type_to_signed(&String::from(in_t[1])))); + } else if fn_format[1] == "outsigned" { + fn_name.push_str(type_to_suffix(&type_to_signed(&String::from(out_t)))); + } else if fn_format[1] == "outsignednox" { + fn_name.push_str(&type_to_suffix(&type_to_sub_type(&type_to_signed( + &String::from(out_t), + )))); + } else if fn_format[1] == "in1signednox" { + fn_name.push_str(&type_to_suffix(&type_to_sub_type(&type_to_signed( + &String::from(in_t[1]), + )))); + } else if fn_format[1] == "outsigneddupnox" { + fn_name.push_str(&type_to_dup_suffix(&type_to_sub_type(&type_to_signed( + &String::from(out_t), + )))); + } else if fn_format[1] == "outsignedlanenox" { + fn_name.push_str(&type_to_lane_suffix(&type_to_sub_type(&type_to_signed( + &String::from(out_t), + )))); + } else if fn_format[1] == "in1signedlanenox" { + fn_name.push_str(&type_to_lane_suffix(&type_to_sub_type(&type_to_signed( + &String::from(in_t[1]), + )))); + } else if fn_format[1] == "unsigned" { + fn_name.push_str(type_to_suffix(&type_to_unsigned(in_t[1]))); + } else if fn_format[1] == "doubleself" { + fn_name.push_str(&type_to_double_suffixes(out_t, in_t[1])); + } else if fn_format[1] == "noq_doubleself" { + fn_name.push_str(&type_to_noq_double_suffixes(out_t, in_t[1])); + } else if fn_format[1] == "noqself" { + fn_name.push_str(type_to_noq_suffix(in_t[1])); + } else if fn_format[1] == "noqsigned" { + fn_name.push_str(type_to_noq_suffix(&type_to_signed(&String::from(in_t[1])))); + } else if fn_format[1] == "nosuffix" { + } else if fn_format[1] == "in_len" { + fn_name.push_str(&type_len(in_t[1]).to_string()); + } else if fn_format[1] == "in0_len" { + fn_name.push_str(&type_len(in_t[0]).to_string()); + } else if fn_format[1] == "out_len" { + fn_name.push_str(&type_len(out_t).to_string()); + } else if fn_format[1] == "halflen" { + fn_name.push_str(&(type_len(in_t[1]) / 2).to_string()); + } else if fn_format[1] == "nout" { + fn_name.push_str(type_to_n_suffix(out_t)); + } else if fn_format[1] == "nin0" { + fn_name.push_str(type_to_n_suffix(in_t[0])); + } else if fn_format[1] == "nsigned" { + 
fn_name.push_str(type_to_n_suffix(&type_to_signed(&String::from(in_t[1])))); + } else if fn_format[1] == "in_ntt" { + fn_name.push_str(type_to_suffix(native_type_to_type(in_t[1]))); + } else if fn_format[1] == "out_ntt" { + fn_name.push_str(type_to_suffix(native_type_to_type(out_t))); + } else if fn_format[1] == "rot" { + fn_name = type_to_rot_suffix(&fn_name, type_to_suffix(out_t)); + } else { + fn_name.push_str(&fn_format[1]); + }; + if fn_format[2] == "ext" { + fn_name.push_str("_"); + } else if fn_format[2] == "noext" { + } else if fn_format[2].starts_with("<") { + assert!(fn_format[2].ends_with(">")); + let types: Vec<_> = fn_format[2][1..fn_format[2].len() - 1] + .split(' ') + .map(|v| v.to_string()) + .collect(); + assert_eq!(types.len(), 2); + let type1 = if types[0] == "element_t" { + type_to_native_type(in_t[1]) + } else { + String::from(&types[0]) + }; + let type2 = if types[1] == "element_t" { + type_to_native_type(in_t[1]) + } else { + String::from(&types[1]) + }; + fn_name.push_str(&format!("::<{}, {}>", &type1, &type2)); + } else { + fn_name.push_str(&fn_format[2]); + } + } + if param_str.is_empty() { + return fn_name.replace("out_t", out_t); + } + let fn_str = if let Some((re_name, re_type)) = re.clone() { + format!( + r#"let {}: {} = {}({});"#, + re_name, re_type, fn_name, param_str + ) + } else if fn_name.starts_with("*") { + format!(r#"{} = {};"#, fn_name, param_str) + } else { + format!(r#"{}({})"#, fn_name, param_str) + }; + return fn_str; +} + +fn main() -> io::Result<()> { + let args: Vec<String> = env::args().collect(); + let in_file = args.get(1).cloned().unwrap_or_else(|| IN.to_string()); + + let f = File::open(in_file).expect("Failed to open neon.spec"); + let f = BufReader::new(f); + + let mut current_comment = String::new(); + let mut current_name: Option<String> = None; + let mut current_fn: Option<String> = None; + let mut current_arm: Option<String> = None; + let mut current_aarch64: Option<String> = None; + let mut link_arm: Option<String> = None; + let mut link_aarch64: Option<String> = None; + let mut const_arm: Option<String> = None; + let mut const_aarch64: Option<String> = None; + let mut constn: Option<String> = None; + let mut para_num = 2; + let mut suffix: Suffix = Normal; + let mut a: Vec<String> = Vec::new(); + let mut b: Vec<String> = Vec::new(); + let mut c: Vec<String> = Vec::new(); + let mut n: Option<String> = None; + let mut fixed: Vec<String> = Vec::new(); + let mut current_tests: Vec<( + Vec<String>, + Vec<String>, + Vec<String>, + Option<String>, + Vec<String>, + )> = Vec::new(); + let mut multi_fn: Vec<String> = Vec::new(); + let mut target: TargetFeature = Default; + let mut fn_type: Fntype = Fntype::Normal; + let mut separate = false; + + // + // THIS FILE IS GENERATED FORM neon.spec DO NOT CHANGE IT MANUALLY + // + let mut out_arm = String::from( + r#"// This code is automatically generated. DO NOT MODIFY. 
+// +// Instead, modify `crates/stdarch-gen/neon.spec` and run the following command to re-generate this file: +// +// ``` +// OUT_DIR=`pwd`/crates/core_arch cargo run -p stdarch-gen -- crates/stdarch-gen/neon.spec +// ``` +use super::*; +#[cfg(test)] +use stdarch_test::assert_instr; +"#, + ); + let mut tests_arm = String::from( + r#" +#[cfg(test)] +#[allow(overflowing_literals)] +mod test { + use super::*; + use crate::core_arch::simd::*; + use std::mem::transmute; + use stdarch_test::simd_test; +"#, + ); + // + // THIS FILE IS GENERATED FORM neon.spec DO NOT CHANGE IT MANUALLY + // + let mut out_aarch64 = String::from( + r#"// This code is automatically generated. DO NOT MODIFY. +// +// Instead, modify `crates/stdarch-gen/neon.spec` and run the following command to re-generate this file: +// +// ``` +// OUT_DIR=`pwd`/crates/core_arch cargo run -p stdarch-gen -- crates/stdarch-gen/neon.spec +// ``` +use super::*; +#[cfg(test)] +use stdarch_test::assert_instr; +"#, + ); + let mut tests_aarch64 = String::from( + r#" +#[cfg(test)] +mod test { + use super::*; + use crate::core_arch::simd::*; + use std::mem::transmute; + use stdarch_test::simd_test; +"#, + ); + + for line in f.lines() { + let line = line.unwrap(); + if line.is_empty() { + continue; + } + if line.starts_with("/// ") { + current_comment = line; + current_name = None; + current_fn = None; + current_arm = None; + current_aarch64 = None; + link_aarch64 = None; + link_arm = None; + const_aarch64 = None; + const_arm = None; + current_tests = Vec::new(); + constn = None; + para_num = 2; + suffix = Normal; + a = Vec::new(); + b = Vec::new(); + c = Vec::new(); + fixed = Vec::new(); + n = None; + multi_fn = Vec::new(); + target = Default; + fn_type = Fntype::Normal; + separate = false; + } else if line.starts_with("//") { + } else if line.starts_with("name = ") { + current_name = Some(String::from(&line[7..])); + } else if line.starts_with("fn = ") { + current_fn = Some(String::from(&line[5..])); + } else if line.starts_with("multi_fn = ") { + multi_fn.push(String::from(&line[11..])); + } else if line.starts_with("constn = ") { + constn = Some(String::from(&line[9..])); + } else if line.starts_with("arm = ") { + current_arm = Some(String::from(&line[6..])); + } else if line.starts_with("aarch64 = ") { + current_aarch64 = Some(String::from(&line[10..])); + } else if line.starts_with("double-suffixes") { + suffix = Double; + } else if line.starts_with("no-q") { + suffix = NoQ; + } else if line.starts_with("noq-double-suffixes") { + suffix = NoQDouble; + } else if line.starts_with("n-suffix") { + suffix = NSuffix; + } else if line.starts_with("double-n-suffixes") { + suffix = DoubleN; + } else if line.starts_with("out-n-suffix") { + suffix = OutNSuffix; + } else if line.starts_with("noq-n-suffix") { + suffix = NoQNSuffix; + } else if line.starts_with("out-suffix") { + suffix = OutSuffix; + } else if line.starts_with("out-nox") { + suffix = OutNox; + } else if line.starts_with("in1-nox") { + suffix = In1Nox; + } else if line.starts_with("out-dup-nox") { + suffix = OutDupNox; + } else if line.starts_with("out-lane-nox") { + suffix = OutLaneNox; + } else if line.starts_with("in1-lane-nox") { + suffix = In1LaneNox; + } else if line.starts_with("lane-suffixes") { + suffix = Lane; + } else if line.starts_with("in2-suffix") { + suffix = In2; + } else if line.starts_with("in2-lane-suffixes") { + suffix = In2Lane; + } else if line.starts_with("out-lane-suffixes") { + suffix = OutLane; + } else if line.starts_with("rot-suffix") { + suffix = Rot; + } 
else if line.starts_with("rot-lane-suffixes") {
+            suffix = RotLane;
+        } else if line.starts_with("a = ") {
+            a = line[4..].split(',').map(|v| v.trim().to_string()).collect();
+        } else if line.starts_with("b = ") {
+            b = line[4..].split(',').map(|v| v.trim().to_string()).collect();
+        } else if line.starts_with("c = ") {
+            c = line[4..].split(',').map(|v| v.trim().to_string()).collect();
+        } else if line.starts_with("n = ") {
+            n = Some(String::from(&line[4..]));
+        } else if line.starts_with("fixed = ") {
+            fixed = line[8..].split(',').map(|v| v.trim().to_string()).collect();
+        } else if line.starts_with("validate ") {
+            let e = line[9..].split(',').map(|v| v.trim().to_string()).collect();
+            current_tests.push((a.clone(), b.clone(), c.clone(), n.clone(), e));
+        } else if line.starts_with("link-aarch64 = ") {
+            link_aarch64 = Some(String::from(&line[15..]));
+        } else if line.starts_with("const-aarch64 = ") {
+            const_aarch64 = Some(String::from(&line[16..]));
+        } else if line.starts_with("link-arm = ") {
+            link_arm = Some(String::from(&line[11..]));
+        } else if line.starts_with("const-arm = ") {
+            const_arm = Some(String::from(&line[12..]));
+        } else if line.starts_with("load_fn") {
+            fn_type = Fntype::Load;
+        } else if line.starts_with("store_fn") {
+            fn_type = Fntype::Store;
+        } else if line.starts_with("arm-aarch64-separate") {
+            separate = true;
+        } else if line.starts_with("target = ") {
+            target = match Some(String::from(&line[9..])) {
+                Some(input) => match input.as_str() {
+                    "v7" => ArmV7,
+                    "vfp4" => Vfp4,
+                    "fp-armv8" => FPArmV8,
+                    "aes" => AES,
+                    "fcma" => FCMA,
+                    "dotprod" => Dotprod,
+                    "i8mm" => I8MM,
+                    "sha3" => SHA3,
+                    "rdm" => RDM,
+                    "sm4" => SM4,
+                    "frintts" => FTTS,
+                    _ => Default,
+                },
+                _ => Default,
+            }
+        } else if line.starts_with("generate ") {
+            let line = &line[9..];
+            let types: Vec<String> = line
+                .split(',')
+                .map(|v| v.trim().to_string())
+                .flat_map(|v| match v.as_str() {
+                    "uint*_t" => UINT_TYPES.iter().map(|v| v.to_string()).collect(),
+                    "uint64x*_t" => UINT_TYPES_64.iter().map(|v| v.to_string()).collect(),
+                    "int*_t" => INT_TYPES.iter().map(|v| v.to_string()).collect(),
+                    "int64x*_t" => INT_TYPES_64.iter().map(|v| v.to_string()).collect(),
+                    "float*_t" => FLOAT_TYPES.iter().map(|v| v.to_string()).collect(),
+                    "float64x*_t" => FLOAT_TYPES_64.iter().map(|v| v.to_string()).collect(),
+                    _ => vec![v],
+                })
+                .collect();
+
+            for line in types {
+                let spec: Vec<&str> = line.split(':').map(|e| e.trim()).collect();
+                let in_t: [&str; 3];
+                let out_t;
+                if spec.len() == 1 {
+                    in_t = [spec[0], spec[0], spec[0]];
+                    out_t = spec[0];
+                } else if spec.len() == 2 {
+                    in_t = [spec[0], spec[0], spec[0]];
+                    out_t = spec[1];
+                } else if spec.len() == 3 {
+                    in_t = [spec[0], spec[1], spec[1]];
+                    out_t = spec[2];
+                } else if spec.len() == 4 {
+                    in_t = [spec[0], spec[1], spec[2]];
+                    out_t = spec[3];
+                } else {
+                    panic!("Bad spec: {}", line)
+                }
+                if b.len() == 0 {
+                    if matches!(fn_type, Fntype::Store) {
+                        para_num = 2;
+                    } else {
+                        para_num = 1;
+                    }
+                } else if c.len() != 0 {
+                    para_num = 3;
+                }
+                let current_name = current_name.clone().unwrap();
+                if let Some(current_arm) = current_arm.clone() {
+                    let (function, test) = gen_arm(
+                        &current_comment,
+                        &current_fn,
+                        &current_name,
+                        &current_arm,
+                        &link_arm,
+                        &current_aarch64,
+                        &link_aarch64,
+                        &const_arm,
+                        &const_aarch64,
+                        &constn,
+                        &in_t,
+                        &out_t,
+                        &current_tests,
+                        suffix,
+                        para_num,
+                        target,
+                        &fixed,
+                        &multi_fn,
+                        fn_type,
+                        separate,
+                    );
+                    out_arm.push_str(&function);
+                    tests_arm.push_str(&test);
+                } else {
+                    let (function, test) = gen_aarch64(
+                        &current_comment,
+                        &current_fn,
+                        &current_name,
+                        &current_aarch64,
+                        &link_aarch64,
+                        &const_aarch64,
+                        &constn,
+                        &in_t,
+                        &out_t,
+                        &current_tests,
+                        suffix,
+                        para_num,
+                        target,
+                        &fixed,
+                        &multi_fn,
+                        fn_type,
+                    );
+                    out_aarch64.push_str(&function);
+                    tests_aarch64.push_str(&test);
+                }
+            }
+        }
+    }
+    tests_arm.push('}');
+    tests_arm.push('\n');
+    tests_aarch64.push('}');
+    tests_aarch64.push('\n');
+
+    let arm_out_path: PathBuf =
+        PathBuf::from(env::var("OUT_DIR").unwrap_or("crates/core_arch".to_string()))
+            .join("src")
+            .join("arm_shared")
+            .join("neon");
+    std::fs::create_dir_all(&arm_out_path)?;
+
+    let mut file_arm = File::create(arm_out_path.join(ARM_OUT))?;
+    file_arm.write_all(out_arm.as_bytes())?;
+    file_arm.write_all(tests_arm.as_bytes())?;
+
+    let aarch64_out_path: PathBuf =
+        PathBuf::from(env::var("OUT_DIR").unwrap_or("crates/core_arch".to_string()))
+            .join("src")
+            .join("aarch64")
+            .join("neon");
+    std::fs::create_dir_all(&aarch64_out_path)?;
+
+    let mut file_aarch = File::create(aarch64_out_path.join(AARCH64_OUT))?;
+    file_aarch.write_all(out_aarch64.as_bytes())?;
+    file_aarch.write_all(tests_aarch64.as_bytes())?;
+    /*
+    if let Err(e) = Command::new("rustfmt")
+        .arg(&arm_out_path)
+        .arg(&aarch64_out_path)
+        .status() {
+            eprintln!("Could not format `{}`: {}", arm_out_path.to_str().unwrap(), e);
+            eprintln!("Could not format `{}`: {}", aarch64_out_path.to_str().unwrap(), e);
+    };
+    */
+    Ok(())
+}
|
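
For reference, below is a minimal sketch of the kind of item the format strings above assemble for a shared (non-`separate`) two-argument intrinsic: a doc comment, `target_feature`/`assert_instr` attributes, an `extern "unadjusted"` block carrying the LLVM link names, and a thin public wrapper. The intrinsic name `vfoo_s8`, the `llvm.*.neon.vfoo.v8i8` link names, and the `assert_instr` operands are hypothetical placeholders, not generator output; the snippet only illustrates the shape and is assumed to live inside `core_arch`, where the NEON vector types and `assert_instr` are in scope.

```rust
// Illustrative only: placeholder names, real attribute/extern-block layout.

/// Doc comment taken from the spec section header
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vfoo.s8"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(foo))]
#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
pub unsafe fn vfoo_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
    // The extern block is what `ext_c` above expands to for para_num == 2.
    #[allow(improper_ctypes)]
    extern "unadjusted" {
        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vfoo.v8i8")]
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.vfoo.v8i8")]
        fn vfoo_s8_(a: int8x8_t, b: int8x8_t) -> int8x8_t;
    }
    vfoo_s8_(a, b)
}
```

When `arm-aarch64-separate` is set, the generator instead emits two such items, one gated on `#[cfg(target_arch = "arm")]` and one on `#[cfg(target_arch = "aarch64")]`, each with its own extern block and parameter handling.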