// ARM Neon intrinsic specification.
//
// This file contains the specification for a number of
// intrinsics that allow us to generate them along with
// their test cases.
//
// As to the syntax of the file - it's not very intelligently parsed!
//
// # Comments
// Comments start with AT LEAST two, or four or more slashes, so // is a
// comment and /////// is too.
//
// # Sections
// Sections start with EXACTLY three slashes followed
// by AT LEAST one space. Sections are used for two things:
//
// 1) they serve as the doc comment for the given intrinsics.
// 2) they reset all variables (name, fn, etc.)
//
// # Variables
//
// name    - The prefix of the function; suffixes are auto
//           generated from the type they get passed.
//
// fn      - The function to call in rust-land.
//
// aarch64 - The intrinsic to check on the aarch64 architecture.
//           If this is given but no arm intrinsic is provided,
//           the function is generated exclusively for aarch64.
//           This is used to generate both aarch64-specific and
//           shared intrinsics by first specifying only the aarch64
//           variant and then the arm variant.
//
// arm     - The ARMv7 intrinsic used to check arm code
//           generation. All NEON functions available on arm are
//           also available on aarch64. If no aarch64 intrinsic was
//           set, they are assumed to be the same.
//           Intrinsics ending with a `.` get a size suffix
//           appended (such as `i8` or `i64`) that is not sign specific.
//           Intrinsics ending with a `.s` get a size suffix
//           appended (such as `s8` or `u64`) that is sign specific.
//
// a       - First input for tests; it gets scaled to the size of
//           the type.
//
// b       - Second input for tests; it gets scaled to the size of
//           the type.
//
// # Special values
//
// TRUE  - 'true', all bits are set to 1
// FALSE - 'false', all bits are set to 0
// FF    - same as 'true'
// MIN   - minimal value (either 0 or the lowest negative number)
// MAX   - maximal value, prone to overflow
//
// # validate
// Validates a and b against the expected result of the test.
// The special values 'TRUE' and 'FALSE' can be used to
// represent the correct NEON representation of true or
// false values. It too gets scaled to the type.
//
// validate needs to be called before generate as it sets
// up the rules for validation that get generated for each
// type.
//
// # generate
// The generate command generates the intrinsics; it uses the
// variables set so far and can be called multiple times while
// overwriting some of the variables.
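//
// # Example (illustrative only, kept in comments so it is not parsed)
//
// A minimal entry could look like the `vand` section below:
//
//     /// Vector bitwise and
//     name = vand
//     fn = simd_and
//     arm = vand
//     aarch64 = and
//     a = 0x00, 0x01, 0x02, 0x03
//     b = 0x0F, 0x0F, 0x0F, 0x0F
//     validate 0x00, 0x01, 0x02, 0x03
//     generate int*_t
//
// This asks the generator to emit one intrinsic per instantiated type
// (vand_s8, vandq_s8, vand_s16, ...) plus a test that checks the result
// against the validate line, with a/b/validate scaled to each lane type.
// The emitted Rust is roughly of this shape (exact attributes and test
// scaffolding are up to the generator):
//
//     #[inline]
//     #[target_feature(enable = "neon")]
//     pub unsafe fn vand_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
//         simd_and(a, b)
//     }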
/// Vector bitwise and
name = vand
fn = simd_and
arm = vand
aarch64 = and
a = 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x00
b = 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F
validate 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x00
b = 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
validate 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
generate int*_t, uint*_t, int64x*_t, uint64x*_t

/// Vector bitwise or (immediate, inclusive)
name = vorr
fn = simd_or
arm = vorr
aarch64 = orr
a = 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F
b = 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
validate 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F
generate int*_t, uint*_t, int64x*_t, uint64x*_t

/// Vector bitwise exclusive or (vector)
name = veor
fn = simd_xor
arm = veor
aarch64 = eor
a = 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F
b = 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
validate 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F
generate int*_t, uint*_t, int64x*_t, uint64x*_t

/// Three-way exclusive OR
name = veor3
a = 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F
b = 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
c = 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
validate 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F
target = sha3

aarch64 = eor3
link-aarch64 = llvm.aarch64.crypto.eor3s._EXT_
generate int8x16_t, int16x8_t, int32x4_t, int64x2_t
link-aarch64 = llvm.aarch64.crypto.eor3u._EXT_
generate uint8x16_t, uint16x8_t, uint32x4_t, uint64x2_t

////////////////////
// Absolute difference between the arguments
////////////////////

/// Absolute difference between the arguments
name = vabd
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
validate 15, 13, 11, 9, 7, 5, 3, 1, 1, 3, 5, 7, 9, 11, 13, 15

arm = vabd.s
aarch64 = sabd
link-arm = vabds._EXT_
link-aarch64 = sabd._EXT_
generate int*_t

arm = vabd.s
aarch64 = uabd
link-arm = vabdu._EXT_
link-aarch64 = uabd._EXT_
generate uint*_t

/// Absolute difference between the floating-point arguments
name = vabd
a = 1.0, 2.0, 5.0, -4.0
b = 9.0, 3.0, 2.0, 8.0
validate 8.0, 1.0, 3.0, 12.0

aarch64 = fabd
link-aarch64 = fabd._EXT_
generate float64x*_t

arm = vabd.s
aarch64 = fabd
link-arm = vabds._EXT_
link-aarch64 = fabd._EXT_
generate float*_t

/// Floating-point absolute difference
name = vabd
multi_fn = simd_extract, {vabd-in_ntt-noext, {vdup_n-in_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0
a = 1.0
b = 9.0
validate 8.0

aarch64 = fabd
generate f32, f64

////////////////////
// Absolute difference Long
////////////////////

/// Unsigned Absolute difference Long
name = vabdl
multi_fn = simd_cast, {vabd-unsigned-noext, a, b}
a = 1, 2, 3, 4, 4, 3, 2, 1
b = 10, 10, 10, 10, 10, 10, 10, 10
validate 9, 8, 7, 6, 6, 7, 8, 9

arm = vabdl.s
aarch64 = uabdl
generate uint8x8_t:uint8x8_t:uint16x8_t, uint16x4_t:uint16x4_t:uint32x4_t, uint32x2_t:uint32x2_t:uint64x2_t

/// Signed Absolute difference Long
name = vabdl
multi_fn = simd_cast, c:uint8x8_t, {vabd-signed-noext, a, b}
multi_fn = simd_cast, c
a = 1, 2, 3, 4, 4, 3, 2, 1
b = 10, 10, 10, 10, 10, 10, 10, 10
validate 9, 8, 7, 6, 6, 7, 8, 9

arm = vabdl.s
aarch64 = sabdl
generate int8x8_t:int8x8_t:int16x8_t

/// Signed Absolute difference Long
name = vabdl
multi_fn = simd_cast, c:uint16x4_t, {vabd-signed-noext, a, b}
multi_fn = simd_cast, c
a = 1, 2, 11, 12
b = 10, 10, 10, 10
validate 9, 8, 1, 2

arm = vabdl.s
aarch64 = sabdl
generate int16x4_t:int16x4_t:int32x4_t

/// Signed Absolute difference Long
name = vabdl
multi_fn = simd_cast, c:uint32x2_t, {vabd-signed-noext, a, b}
multi_fn = simd_cast, c
a = 1, 11
b = 10, 10
validate 9, 1

arm = vabdl.s
aarch64 = sabdl
generate int32x2_t:int32x2_t:int64x2_t

/// Unsigned Absolute difference Long
name = vabdl_high
no-q
multi_fn = simd_shuffle8!, c:uint8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_shuffle8!, d:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_cast, {vabd_u8, c, d}
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10
validate 1, 0, 1, 2, 3, 4, 5, 6

aarch64 = uabdl
generate uint8x16_t:uint8x16_t:uint16x8_t

/// Unsigned Absolute difference Long
name = vabdl_high
no-q
multi_fn = simd_shuffle4!, c:uint16x4_t, a, a, [4, 5, 6, 7]
multi_fn = simd_shuffle4!, d:uint16x4_t, b, b, [4, 5, 6, 7]
multi_fn = simd_cast, {vabd_u16, c, d}
a = 1, 2, 3, 4, 8, 9, 11, 12
b = 10, 10, 10, 10, 10, 10, 10, 10
validate 2, 1, 1, 2

aarch64 = uabdl
generate uint16x8_t:uint16x8_t:uint32x4_t

/// Unsigned Absolute difference Long
name = vabdl_high
no-q
multi_fn = simd_shuffle2!, c:uint32x2_t, a, a, [2, 3]
multi_fn = simd_shuffle2!, d:uint32x2_t, b, b, [2, 3]
multi_fn = simd_cast, {vabd_u32, c, d}
a = 1, 2, 3, 4
b = 10, 10, 10, 10
validate 7, 6

aarch64 = uabdl
generate uint32x4_t:uint32x4_t:uint64x2_t

/// Signed Absolute difference Long
name = vabdl_high
no-q
multi_fn = simd_shuffle8!, c:int8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_shuffle8!, d:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_cast, e:uint8x8_t, {vabd_s8, c, d}
multi_fn = simd_cast, e
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10
validate 1, 0, 1, 2, 3, 4, 5, 6

aarch64 = sabdl
generate int8x16_t:int8x16_t:int16x8_t

/// Signed Absolute difference Long
name = vabdl_high
no-q
multi_fn = simd_shuffle4!, c:int16x4_t, a, a, [4, 5, 6, 7]
multi_fn = simd_shuffle4!, d:int16x4_t, b, b, [4, 5, 6, 7]
multi_fn = simd_cast, e:uint16x4_t, {vabd_s16, c, d}
multi_fn = simd_cast, e
a = 1, 2, 3, 4, 9, 10, 11, 12
b = 10, 10, 10, 10, 10, 10, 10, 10
validate 1, 0, 1, 2

aarch64 = sabdl
generate int16x8_t:int16x8_t:int32x4_t

/// Signed Absolute difference Long
name = vabdl_high
no-q
multi_fn = simd_shuffle2!, c:int32x2_t, a, a, [2, 3]
multi_fn = simd_shuffle2!, d:int32x2_t, b, b, [2, 3]
multi_fn = simd_cast, e:uint32x2_t, {vabd_s32, c, d}
multi_fn = simd_cast, e
a = 1, 2, 3, 4
b = 10, 10, 10, 10
validate 7, 6

aarch64 = sabdl
generate int32x4_t:int32x4_t:int64x2_t

////////////////////
// equality
////////////////////

/// Compare bitwise Equal (vector)
name = vceq
fn = simd_eq
a = MIN, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, MAX
b = MIN, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, MAX
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
a = MIN, MIN, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xCC, 0x0D, 0xEE, MAX
b = MIN, MAX, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08, 0x08, 0x00, 0x0A, 0x0A, 0xCC, 0xD0, 0xEE, MIN
validate TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE

aarch64 = cmeq
generate uint64x*_t, int64x1_t:uint64x1_t, int64x2_t:uint64x2_t, poly64x1_t:uint64x1_t, poly64x2_t:uint64x2_t

arm = vceq.
generate uint*_t, int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, poly8x8_t:uint8x8_t, poly8x16_t:uint8x16_t

/// Floating-point compare equal
name = vceq
fn = simd_eq
a = 1.2, 3.4, 5.6, 7.8
b = 1.2, 3.4, 5.6, 7.8
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE

aarch64 = fcmeq
generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t

arm = vceq.
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t

/// Compare bitwise equal
name = vceq
multi_fn = transmute, {vceq-in_ntt-noext, {transmute, a}, {transmute, b}}
a = 1
b = 2
validate 0

aarch64 = cmp
generate i64:u64, u64

/// Floating-point compare equal
name = vceq
multi_fn = simd_extract, {vceq-in_ntt-noext, {vdup_n-in_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0
a = 1.
b = 2.
validate 0

aarch64 = fcmp
generate f32:u32, f64:u64

/// Signed compare bitwise equal to zero
name = vceqz
fn = simd_eq
a = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX
fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
validate FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE

aarch64 = cmeq
generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, int64x1_t:uint64x1_t, int64x2_t:uint64x2_t, poly8x8_t:uint8x8_t, poly8x16_t:uint8x16_t, poly64x1_t:uint64x1_t, poly64x2_t:uint64x2_t

/// Unsigned compare bitwise equal to zero
name = vceqz
fn = simd_eq
a = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX
fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
validate TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE

aarch64 = cmeq
generate uint*_t, uint64x*_t

/// Floating-point compare bitwise equal to zero
name = vceqz
fn = simd_eq
a = 0.0, 1.2, 3.4, 5.6
fixed = 0.0, 0.0, 0.0, 0.0
validate TRUE, FALSE, FALSE, FALSE

aarch64 = fcmeq
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t

/// Compare bitwise equal to zero
name = vceqz
multi_fn = transmute, {vceqz-in_ntt-noext, {transmute, a}}
a = 1
validate 0

aarch64 = cmp
generate i64:u64, u64

/// Floating-point compare bitwise equal to zero
name = vceqz
multi_fn = simd_extract, {vceqz-in_ntt-noext, {vdup_n-in_ntt-noext, a}}, 0
a = 1.
validate 0

aarch64 = fcmp
generate f32:u32, f64:u64

/// Signed compare bitwise Test bits nonzero
name = vtst
multi_fn = simd_and, c:in_t, a, b
multi_fn = fixed, d:in_t
multi_fn = simd_ne, c, transmute(d)
a = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX
b = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX
fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
validate TRUE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE

aarch64 = cmtst
generate int64x1_t:uint64x1_t, int64x2_t:uint64x2_t, poly64x1_t:uint64x1_t, poly64x2_t:uint64x2_t

arm = vtst
generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, poly8x8_t:uint8x8_t, poly8x16_t:uint8x16_t, poly16x4_t:uint16x4_t, poly16x8_t:uint16x8_t

/// Unsigned compare bitwise Test bits nonzero
name = vtst
multi_fn = simd_and, c:in_t, a, b
multi_fn = fixed, d:in_t
multi_fn = simd_ne, c, transmute(d)
a = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX
b = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX
fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
validate FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE

aarch64 = cmtst
generate uint64x*_t

arm = vtst
generate uint*_t

/// Compare bitwise test bits nonzero
name = vtst
multi_fn = transmute, {vtst-in_ntt-noext, {transmute, a}, {transmute, b}}
a = 0
b = 0
validate 0

aarch64 = tst
generate i64:i64:u64, u64

/// Signed saturating accumulate of unsigned value
name = vuqadd
out-suffix
a = 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4
b = 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4
validate 2, 4, 6, 8, 2, 4, 6, 8, 2, 4, 6, 8, 2, 4, 6, 8

aarch64 = suqadd
link-aarch64 = suqadd._EXT_
generate i32:u32:i32, i64:u64:i64

/// Signed saturating accumulate of unsigned value
name = vuqadd
out-suffix
multi_fn = simd_extract, {vuqadd-out_ntt-noext, {vdup_n-out_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0
a = 1
b = 2
validate 3

aarch64 = suqadd
generate i8:u8:i8, i16:u16:i16

////////////////////
// Floating-point absolute value
////////////////////

/// Floating-point absolute value
name = vabs
fn = simd_fabs
a = -0.1, -2.2, -3.3, -6.6
validate 0.1, 2.2, 3.3, 6.6

aarch64 = fabs
generate float64x1_t:float64x1_t, float64x2_t:float64x2_t

arm = vabs
generate float32x2_t:float32x2_t, float32x4_t:float32x4_t

////////////////////
// greater than
////////////////////

/// Compare signed greater than
name = vcgt
fn = simd_gt
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE

aarch64 = cmgt
generate int64x1_t:uint64x1_t, int64x2_t:uint64x2_t

arm = vcgt.s
generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t

/// Compare unsigned higher
name = vcgt
fn = simd_gt
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE

aarch64 = cmhi
generate uint64x*_t

arm = vcgt.s
generate uint*_t

/// Floating-point compare greater than
name = vcgt
fn = simd_gt
a = 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8, 8.9
b = 0.1, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE

aarch64 = fcmgt
generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t

arm = vcgt.s
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t

/// Compare greater than
name = vcgt
multi_fn = transmute, {vcgt-in_ntt-noext, {transmute, a}, {transmute, b}}
a = 1
b = 2
validate 0

aarch64 = cmp
generate i64:u64, u64

/// Floating-point compare greater than
name = vcgt
multi_fn = simd_extract, {vcgt-in_ntt-noext, {vdup_n-in_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0
a = 1.
b = 2.
validate 0

aarch64 = fcmp
generate f32:u32, f64:u64

////////////////////
// less than
////////////////////

/// Compare signed less than
name = vclt
fn = simd_lt
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE

aarch64 = cmgt
generate int64x1_t:uint64x1_t, int64x2_t:uint64x2_t

arm = vcgt.s
generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t

/// Compare unsigned less than
name = vclt
fn = simd_lt
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE

aarch64 = cmhi
generate uint64x*_t

arm = vcgt.s
generate uint*_t

/// Floating-point compare less than
name = vclt
fn = simd_lt
a = 0.1, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8
b = 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8, 8.9
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE

aarch64 = fcmgt
generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t

arm = vcgt.s
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t

/// Compare less than
name = vclt
multi_fn = transmute, {vclt-in_ntt-noext, {transmute, a}, {transmute, b}}
a = 2
b = 1
validate 0

aarch64 = cmp
generate i64:u64, u64

/// Floating-point compare less than
name = vclt
multi_fn = simd_extract, {vclt-in_ntt-noext, {vdup_n-in_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0
a = 2.
b = 1.
validate 0

aarch64 = fcmp
generate f32:u32, f64:u64

////////////////////
// less than or equal
////////////////////

/// Compare signed less than or equal
name = vcle
fn = simd_le
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE

aarch64 = cmge
generate int64x1_t:uint64x1_t, int64x2_t:uint64x2_t

arm = vcge.s
generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t

/// Compare greater than or equal
name = vcge
multi_fn = transmute, {vcge-in_ntt-noext, {transmute, a}, {transmute, b}}
a = 1
b = 2
validate 0

aarch64 = cmp
generate i64:u64, u64

/// Floating-point compare greater than or equal
name = vcge
multi_fn = simd_extract, {vcge-in_ntt-noext, {vdup_n-in_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0
a = 1.
b = 2.
validate 0

aarch64 = fcmp
generate f32:u32, f64:u64

/// Compare unsigned less than or equal
name = vcle
fn = simd_le
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE

aarch64 = cmhs
generate uint64x*_t

arm = vcge.s
generate uint*_t

/// Floating-point compare less than or equal
name = vcle
fn = simd_le
a = 0.1, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8
b = 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8, 8.9
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE

aarch64 = fcmge
generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t

arm = vcge.s
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t

/// Compare less than or equal
name = vcle
multi_fn = transmute, {vcle-in_ntt-noext, {transmute, a}, {transmute, b}}
a = 2
b = 1
validate 0

aarch64 = cmp
generate i64:u64, u64

/// Floating-point compare less than or equal
name = vcle
multi_fn = simd_extract, {vcle-in_ntt-noext, {vdup_n-in_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0
a = 2.
b = 1.
validate 0

aarch64 = fcmp
generate f32:u32, f64:u64

////////////////////
// greater than or equal
////////////////////

/// Compare signed greater than or equal
name = vcge
fn = simd_ge
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE

aarch64 = cmge
generate int64x1_t:uint64x1_t, int64x2_t:uint64x2_t

arm = vcge.s
generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t

/// Compare unsigned greater than or equal
name = vcge
fn = simd_ge
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE

aarch64 = cmhs
generate uint64x*_t

arm = vcge.s
generate uint*_t

/// Floating-point compare greater than or equal
name = vcge
fn = simd_ge
a = 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8, 8.9
b = 0.1, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE

aarch64 = fcmge
generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t

arm = vcge.s
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t

/// Compare signed greater than or equal to zero
name = vcgez
fn = simd_ge
a = MIN, -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, MAX
fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
validate FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE

aarch64 = cmge
generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, int64x1_t:uint64x1_t, int64x2_t:uint64x2_t

/// Floating-point compare greater than or equal to zero
name = vcgez
fn = simd_ge
a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7
fixed = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
validate FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE

aarch64 = fcmge
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t

/// Compare signed greater than or equal to zero
name = vcgez
multi_fn = transmute, {vcgez-in_ntt-noext, {transmute, a}}
a = -1
validate 0

aarch64 = eor
generate i64:u64

/// Floating-point compare greater than or equal to zero
name = vcgez
multi_fn = simd_extract, {vcgez-in_ntt-noext, {vdup_n-in_ntt-noext, a}}, 0
a = -1.
validate 0

aarch64 = fcmp
generate f32:u32, f64:u64

/// Compare signed greater than zero
name = vcgtz
fn = simd_gt
a = MIN, -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, MAX
fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
validate FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE

aarch64 = cmgt
generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, int64x1_t:uint64x1_t, int64x2_t:uint64x2_t

/// Floating-point compare greater than zero
name = vcgtz
fn = simd_gt
a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7
fixed = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
validate FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE

aarch64 = fcmgt
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t

/// Compare signed greater than zero
name = vcgtz
multi_fn = transmute, {vcgtz-in_ntt-noext, {transmute, a}}
a = -1
validate 0

aarch64 = cmp
generate i64:u64

/// Floating-point compare greater than zero
name = vcgtz
multi_fn = simd_extract, {vcgtz-in_ntt-noext, {vdup_n-in_ntt-noext, a}}, 0
a = -1.
validate 0

aarch64 = fcmp
generate f32:u32, f64:u64

/// Compare signed less than or equal to zero
name = vclez
fn = simd_le
a = MIN, -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, MAX
fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
validate TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE

aarch64 = cmgt
generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, int64x1_t:uint64x1_t, int64x2_t:uint64x2_t

/// Floating-point compare less than or equal to zero
name = vclez
fn = simd_le
a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7
fixed = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
validate TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE

aarch64 = fcmle
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t

/// Compare less than or equal to zero
name = vclez
multi_fn = transmute, {vclez-in_ntt-noext, {transmute, a}}
a = 2
validate 0

aarch64 = cmp
generate i64:u64

/// Floating-point compare less than or equal to zero
name = vclez
multi_fn = simd_extract, {vclez-in_ntt-noext, {vdup_n-in_ntt-noext, a}}, 0
a = 2.
validate 0

aarch64 = fcmp
generate f32:u32, f64:u64

/// Compare signed less than zero
name = vcltz
fn = simd_lt
a = MIN, -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, MAX
fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
validate TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE

aarch64 = cmlt
generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, int64x1_t:uint64x1_t, int64x2_t:uint64x2_t

/// Floating-point compare less than zero
name = vcltz
fn = simd_lt
a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7
fixed = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
validate TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE

aarch64 = fcmlt
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t

/// Compare less than zero
name = vcltz
multi_fn = transmute, {vcltz-in_ntt-noext, {transmute, a}}
a = 2
validate 0

aarch64 = asr
generate i64:u64

/// Floating-point compare less than zero
name = vcltz
multi_fn = simd_extract, {vcltz-in_ntt-noext, {vdup_n-in_ntt-noext, a}}, 0
a = 2.
validate 0

aarch64 = fcmp
generate f32:u32, f64:u64

/// Count leading sign bits
name = vcls
a = MIN, -1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, MAX
validate 0, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, 0

arm = vcls.s
aarch64 = cls
link-arm = vcls._EXT_
link-aarch64 = cls._EXT_
generate int*_t

/// Count leading sign bits
name = vcls
multi_fn = transmute, {vcls-signed-noext, {transmute, a}}
a = MIN, MAX, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, MAX
validate BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1

arm = vcls
aarch64 = cls
generate uint8x8_t:int8x8_t, uint8x16_t:int8x16_t, uint16x4_t:int16x4_t, uint16x8_t:int16x8_t, uint32x2_t:int32x2_t, uint32x4_t:int32x4_t

/// Count leading zero bits
name = vclz
multi_fn = self-signed-ext, a
a = MIN, -1, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, MAX
validate 0, 0, BITS, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, 1

arm = vclz.
aarch64 = clz
generate int*_t

/// Count leading zero bits
name = vclz
multi_fn = transmute, {self-signed-ext, transmute(a)}
a = MIN, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, MAX
validate BITS, BITS, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, 0

arm = vclz.
aarch64 = clz
generate uint*_t

/// Floating-point absolute compare greater than
name = vcagt
a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7
b = -1.1, 0.0, 1.1, 2.4, 3.3, 4.6, 5.5, 6.8
validate !0, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE

aarch64 = facgt
link-aarch64 = facgt._EXT2_._EXT_
generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t, f32:u32, f64:u64

arm = vacgt.s
link-arm = vacgt._EXT2_._EXT_
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t

/// Floating-point absolute compare greater than or equal
name = vcage
a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7
b = -1.1, 0.0, 1.1, 2.4, 3.3, 4.6, 5.5, 6.8
validate !0, TRUE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE

aarch64 = facge
link-aarch64 = facge._EXT2_._EXT_
generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t, f32:u32, f64:u64

arm = vacge.s
link-arm = vacge._EXT2_._EXT_
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t

/// Floating-point absolute compare less than
name = vcalt
multi_fn = vcagt-self-noext, b, a
a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7
b = -1.1, 0.0, 1.1, 2.4, 3.3, 4.6, 5.5, 6.8
validate 0, FALSE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE

aarch64 = facgt
generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t, f32:u32, f64:u64

arm = vacgt.s
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t

/// Floating-point absolute compare less than or equal
name = vcale
multi_fn = vcage-self-noext, b, a
a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7
b = -1.1, 0.0, 1.1, 2.4, 3.3, 4.6, 5.5, 6.8
validate 0, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE

aarch64 = facge
generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t, f32:u32, f64:u64

arm = vacge.s
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t

/// Insert vector element from another vector element
name = vcopy
lane-suffixes
constn = LANE1:LANE2
multi_fn = static_assert_imm-in0_exp_len-LANE1
multi_fn = static_assert_imm-in_exp_len-LANE2
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in0_len-LANE2}
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 0, MAX, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
n = 0:1
validate MAX, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16

aarch64 = mov
generate int8x8_t, int8x16_t, int16x4_t, int16x8_t, int32x2_t, int32x4_t, int64x2_t
generate uint8x8_t, uint8x16_t, uint16x4_t, uint16x8_t, uint32x2_t, uint32x4_t, uint64x2_t
generate poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t, poly64x2_t

/// Insert vector element from another vector element
name = vcopy
lane-suffixes
constn = LANE1:LANE2
multi_fn = static_assert_imm-in0_exp_len-LANE1
multi_fn = static_assert_imm-in_exp_len-LANE2
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in0_len-LANE2}
a = 1., 2., 3., 4.
b = 0., 0.5, 0., 0.
n = 0:1
validate 0.5, 2., 3., 4.
aarch64 = mov
generate float32x2_t, float32x4_t, float64x2_t

/// Insert vector element from another vector element
name = vcopy
lane-suffixes
constn = LANE1:LANE2
multi_fn = static_assert_imm-in0_exp_len-LANE1
multi_fn = static_assert_imm-in_exp_len-LANE2
multi_fn = simd_shuffle-in_len-!, a:in_t, a, a, {asc-0-in_len}
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in_len-LANE2}
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 0, MAX, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
n = 0:1
validate MAX, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16

aarch64 = mov
generate int8x8_t:int8x16_t:int8x8_t, int16x4_t:int16x8_t:int16x4_t, int32x2_t:int32x4_t:int32x2_t
generate uint8x8_t:uint8x16_t:uint8x8_t, uint16x4_t:uint16x8_t:uint16x4_t, uint32x2_t:uint32x4_t:uint32x2_t
generate poly8x8_t:poly8x16_t:poly8x8_t, poly16x4_t:poly16x8_t:poly16x4_t

/// Insert vector element from another vector element
name = vcopy
lane-suffixes
constn = LANE1:LANE2
multi_fn = static_assert_imm-in0_exp_len-LANE1
multi_fn = static_assert_imm-in_exp_len-LANE2
multi_fn = simd_shuffle-in_len-!, a:in_t, a, a, {asc-0-in_len}
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in_len-LANE2}
a = 1., 2., 3., 4.
b = 0., 0.5, 0., 0.
n = 0:1
validate 0.5, 2., 3., 4.

aarch64 = mov
generate float32x2_t:float32x4_t:float32x2_t

/// Insert vector element from another vector element
name = vcopy
lane-suffixes
constn = LANE1:LANE2
multi_fn = static_assert_imm-in0_exp_len-LANE1
multi_fn = static_assert_imm-in_exp_len-LANE2
multi_fn = simd_shuffle-in0_len-!, b:in_t0, b, b, {asc-0-in0_len}
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in0_len-LANE2}
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 0, MAX, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
n = 0:1
validate MAX, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16

aarch64 = mov
generate int8x16_t:int8x8_t:int8x16_t, int16x8_t:int16x4_t:int16x8_t, int32x4_t:int32x2_t:int32x4_t
generate uint8x16_t:uint8x8_t:uint8x16_t, uint16x8_t:uint16x4_t:uint16x8_t, uint32x4_t:uint32x2_t:uint32x4_t
generate poly8x16_t:poly8x8_t:poly8x16_t, poly16x8_t:poly16x4_t:poly16x8_t

/// Insert vector element from another vector element
name = vcopy
lane-suffixes
constn = LANE1:LANE2
multi_fn = static_assert_imm-in0_exp_len-LANE1
multi_fn = static_assert_imm-in_exp_len-LANE2
multi_fn = simd_shuffle-in0_len-!, b:in_t0, b, b, {asc-0-in0_len}
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in0_len-LANE2}
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = MAX, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
n = 1:0
validate 1, MAX

aarch64 = mov
generate int64x2_t:int64x1_t:int64x2_t, uint64x2_t:uint64x1_t:uint64x2_t, poly64x2_t:poly64x1_t:poly64x2_t

/// Insert vector element from another vector element
name = vcopy
lane-suffixes
constn = LANE1:LANE2
multi_fn = static_assert_imm-in0_exp_len-LANE1
multi_fn = static_assert_imm-in_exp_len-LANE2
multi_fn = simd_shuffle-in0_len-!, b:in_t0, b, b, {asc-0-in0_len}
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in0_len-LANE2}
a = 1., 2., 3., 4.
b = 0.5, 0., 0., 0.
n = 1:0
validate 1., 0.5, 3., 4.
aarch64 = mov
generate float32x4_t:float32x2_t:float32x4_t

aarch64 = mov
generate float64x2_t:float64x1_t:float64x2_t

/// Create a vector from a 64-bit pattern
name = vcreate
out-suffix
multi_fn = transmute, a
a = 1
validate 1, 0, 0, 0, 0, 0, 0, 0

aarch64 = nop
arm = nop
generate u64:int8x8_t, u64:int16x4_t, u64:int32x2_t, u64:int64x1_t
generate u64:uint8x8_t, u64:uint16x4_t, u64:uint32x2_t, u64:uint64x1_t
generate u64:poly8x8_t, u64:poly16x4_t
target = aes
generate u64:poly64x1_t

/// Create a vector from a 64-bit pattern
name = vcreate
out-suffix
multi_fn = transmute, a
a = 0
validate 0., 0.

aarch64 = nop
generate u64:float64x1_t
arm = nop
generate u64:float32x2_t

/// Fixed-point convert to floating-point
name = vcvt
double-suffixes
fn = simd_cast
a = 1, 2, 3, 4
validate 1., 2., 3., 4.

aarch64 = scvtf
generate int64x1_t:float64x1_t, int64x2_t:float64x2_t
aarch64 = ucvtf
generate uint64x1_t:float64x1_t, uint64x2_t:float64x2_t

arm = vcvt
aarch64 = scvtf
generate int32x2_t:float32x2_t, int32x4_t:float32x4_t
aarch64 = ucvtf
generate uint32x2_t:float32x2_t, uint32x4_t:float32x4_t

/// Floating-point convert to higher precision long
name = vcvt
double-suffixes
fn = simd_cast
a = -1.2, 1.2
validate -1.2f32 as f64, 1.2f32 as f64

aarch64 = fcvtl
generate float32x2_t:float64x2_t

/// Floating-point convert to higher precision long
name = vcvt_high
noq-double-suffixes
multi_fn = simd_shuffle2!, b:float32x2_t, a, a, [2, 3]
multi_fn = simd_cast, b
a = -1.2, 1.2, 2.3, 3.4
validate 2.3f32 as f64, 3.4f32 as f64

aarch64 = fcvtl
generate float32x4_t:float64x2_t

/// Floating-point convert to lower precision narrow
name = vcvt
double-suffixes
fn = simd_cast
a = -1.2, 1.2
validate -1.2f64 as f32, 1.2f64 as f32

aarch64 = fcvtn
generate float64x2_t:float32x2_t

/// Floating-point convert to lower precision narrow
name = vcvt_high
noq-double-suffixes
multi_fn = simd_shuffle4!, a, {simd_cast, b}, [0, 1, 2, 3]
a = -1.2, 1.2
b = -2.3, 3.4
validate -1.2, 1.2, -2.3f64 as f32, 3.4f64 as f32

aarch64 = fcvtn
generate float32x2_t:float64x2_t:float32x4_t

/// Floating-point convert to lower precision narrow, rounding to odd
name = vcvtx
double-suffixes
a = -1.0, 2.0
validate -1.0, 2.0

aarch64 = fcvtxn
link-aarch64 = fcvtxn._EXT2_._EXT_
generate float64x2_t:float32x2_t

/// Floating-point convert to lower precision narrow, rounding to odd
name = vcvtx
double-suffixes
multi_fn = simd_extract, {vcvtx-_f32_f64-noext, {vdupq_n-in_ntt-noext, a}}, 0
a = -1.0
validate -1.0

aarch64 = fcvtxn
generate f64:f32

/// Floating-point convert to lower precision narrow, rounding to odd
name = vcvtx_high
noq-double-suffixes
multi_fn = simd_shuffle4!, a, {vcvtx-noq_doubleself-noext, b}, [0, 1, 2, 3]
a = -1.0, 2.0
b = -3.0, 4.0
validate -1.0, 2.0, -3.0, 4.0

aarch64 = fcvtxn
generate float32x2_t:float64x2_t:float32x4_t

/// Fixed-point convert to floating-point
name = vcvt
double-n-suffixes
constn = N
multi_fn = static_assert-N-1-bits
a = 1, 2, 3, 4
n = 2
validate 0.25, 0.5, 0.75, 1.
arm-aarch64-separate

aarch64 = scvtf
link-aarch64 = vcvtfxs2fp._EXT2_._EXT_
const-aarch64 = N
generate int64x1_t:float64x1_t, int64x2_t:float64x2_t, i32:f32, i64:f64
aarch64 = ucvtf
link-aarch64 = vcvtfxu2fp._EXT2_._EXT_
const-aarch64 = N
generate uint64x1_t:float64x1_t, uint64x2_t:float64x2_t, u32:f32, u64:f64

aarch64 = scvtf
link-aarch64 = vcvtfxs2fp._EXT2_._EXT_
arm = vcvt
link-arm = vcvtfxs2fp._EXT2_._EXT_
const-arm = N:i32
generate int32x2_t:float32x2_t, int32x4_t:float32x4_t
aarch64 = ucvtf
link-aarch64 = vcvtfxu2fp._EXT2_._EXT_
arm = vcvt
link-arm = vcvtfxu2fp._EXT2_._EXT_
const-arm = N:i32
generate uint32x2_t:float32x2_t, uint32x4_t:float32x4_t

/// Floating-point convert to fixed-point, rounding toward zero
name = vcvt
double-n-suffixes
constn = N
multi_fn = static_assert-N-1-bits
a = 0.25, 0.5, 0.75, 1.
n = 2
validate 1, 2, 3, 4
arm-aarch64-separate

aarch64 = fcvtzs
link-aarch64 = vcvtfp2fxs._EXT2_._EXT_
const-aarch64 = N
generate float64x1_t:int64x1_t, float64x2_t:int64x2_t, f32:i32, f64:i64
aarch64 = fcvtzu
link-aarch64 = vcvtfp2fxu._EXT2_._EXT_
const-aarch64 = N
generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t, f32:u32, f64:u64

aarch64 = fcvtzs
link-aarch64 = vcvtfp2fxs._EXT2_._EXT_
arm = vcvt
link-arm = vcvtfp2fxs._EXT2_._EXT_
const-arm = N:i32
generate float32x2_t:int32x2_t, float32x4_t:int32x4_t
aarch64 = fcvtzu
link-aarch64 = vcvtfp2fxu._EXT2_._EXT_
arm = vcvt
link-arm = vcvtfp2fxu._EXT2_._EXT_
const-arm = N:i32
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t

/// Fixed-point convert to floating-point
name = vcvt
double-suffixes
multi_fn = a as out_t
a = 1
validate 1.

aarch64 = scvtf
generate i32:f32, i64:f64
aarch64 = ucvtf
generate u32:f32, u64:f64

/// Floating-point convert to fixed-point, rounding toward zero
name = vcvt
double-suffixes
multi_fn = a as out_t
a = 1.
validate 1

aarch64 = fcvtzs
generate f32:i32, f64:i64
aarch64 = fcvtzu
generate f32:u32, f64:u64

/// Floating-point convert to signed fixed-point, rounding toward zero
name = vcvt
double-suffixes
link-aarch64 = llvm.fptosi.sat._EXT2_._EXT_
a = -1.1, 2.1, -2.9, 3.9
validate -1, 2, -2, 3

aarch64 = fcvtzs
generate float64x1_t:int64x1_t, float64x2_t:int64x2_t

link-arm = llvm.fptosi.sat._EXT2_._EXT_
arm = vcvt
generate float32x2_t:int32x2_t, float32x4_t:int32x4_t

/// Floating-point convert to unsigned fixed-point, rounding toward zero
name = vcvt
double-suffixes
link-aarch64 = llvm.fptoui.sat._EXT2_._EXT_
a = 1.1, 2.1, 2.9, 3.9
validate 1, 2, 2, 3

aarch64 = fcvtzu
generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t

link-arm = llvm.fptoui.sat._EXT2_._EXT_
arm = vcvt
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t

/// Floating-point convert to signed integer, rounding to nearest with ties to away
name = vcvta
double-suffixes
a = -1.1, 2.1, -2.9, 3.9
validate -1, 2, -3, 4

aarch64 = fcvtas
link-aarch64 = fcvtas._EXT2_._EXT_
generate float32x2_t:int32x2_t, float32x4_t:int32x4_t, float64x1_t:int64x1_t, float64x2_t:int64x2_t

/// Floating-point convert to integer, rounding to nearest with ties to away
name = vcvta
double-suffixes
a = 2.9
validate 3

aarch64 = fcvtas
link-aarch64 = fcvtas._EXT2_._EXT_
generate f32:i32, f64:i64
aarch64 = fcvtau
link-aarch64 = fcvtau._EXT2_._EXT_
generate f32:u32, f64:u64

/// Floating-point convert to signed integer, rounding to nearest with ties to even
name = vcvtn
double-suffixes
a = -1.5, 2.1, -2.9, 3.9
validate -2, 2, -3, 4

aarch64 = fcvtns
link-aarch64 = fcvtns._EXT2_._EXT_
generate float32x2_t:int32x2_t, float32x4_t:int32x4_t, float64x1_t:int64x1_t, float64x2_t:int64x2_t, f32:i32, f64:i64

/// Floating-point convert to signed integer, rounding toward minus infinity
name = vcvtm
double-suffixes
a = -1.1, 2.1, -2.9, 3.9
validate -2, 2, -3, 3

aarch64 = fcvtms
link-aarch64 = fcvtms._EXT2_._EXT_
generate float32x2_t:int32x2_t, float32x4_t:int32x4_t, float64x1_t:int64x1_t, float64x2_t:int64x2_t, f32:i32, f64:i64

/// Floating-point convert to signed integer, rounding toward plus infinity
name = vcvtp
double-suffixes
a = -1.1, 2.1, -2.9, 3.9
validate -1, 3, -2, 4

aarch64 = fcvtps
link-aarch64 = fcvtps._EXT2_._EXT_
generate float32x2_t:int32x2_t, float32x4_t:int32x4_t, float64x1_t:int64x1_t, float64x2_t:int64x2_t, f32:i32, f64:i64

/// Floating-point convert to unsigned integer, rounding to nearest with ties to away
name = vcvta
double-suffixes
a = 1.1, 2.1, 2.9, 3.9
validate 1, 2, 3, 4

aarch64 = fcvtau
link-aarch64 = fcvtau._EXT2_._EXT_
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t

/// Floating-point convert to unsigned integer, rounding to nearest with ties to even
name = vcvtn
double-suffixes
a = 1.5, 2.1, 2.9, 3.9
validate 2, 2, 3, 4

aarch64 = fcvtnu
link-aarch64 = fcvtnu._EXT2_._EXT_
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t, f32:u32, f64:u64

/// Floating-point convert to unsigned integer, rounding toward minus infinity
name = vcvtm
double-suffixes
a = 1.1, 2.1, 2.9, 3.9
validate 1, 2, 2, 3

aarch64 = fcvtmu
link-aarch64 = fcvtmu._EXT2_._EXT_
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t, f32:u32, f64:u64

/// Floating-point convert to unsigned integer, rounding toward plus infinity
name = vcvtp
double-suffixes
a = 1.1, 2.1, 2.9, 3.9
validate 2, 3, 3, 4

aarch64 = fcvtpu
link-aarch64 = fcvtpu._EXT2_._EXT_
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t, f32:u32, f64:u64

/// Set all vector lanes to the same value
name = vdup
lane-suffixes
constn = N
multi_fn = static_assert_imm-in_exp_len-N
multi_fn = simd_shuffle-out_len-!, a, a, {dup-out_len-N as u32}
a = 1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16
n = HFLEN
validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

aarch64 = dup
generate poly64x2_t, poly64x1_t:poly64x2_t

arm = vdup.l
generate int*_t
generate int8x16_t:int8x8_t, int16x8_t:int16x4_t, int32x4_t:int32x2_t
generate int8x8_t:int8x16_t, int16x4_t:int16x8_t, int32x2_t:int32x4_t
generate uint*_t
generate uint8x16_t:uint8x8_t, uint16x8_t:uint16x4_t, uint32x4_t:uint32x2_t
generate uint8x8_t:uint8x16_t, uint16x4_t:uint16x8_t, uint32x2_t:uint32x4_t
generate poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t
generate poly8x16_t:poly8x8_t, poly16x8_t:poly16x4_t
generate poly8x8_t:poly8x16_t, poly16x4_t:poly16x8_t

/// Set all vector lanes to the same value
name = vdup
lane-suffixes
constn = N
multi_fn = static_assert_imm-in_exp_len-N
multi_fn = simd_shuffle-out_len-!, a, a, {dup-out_len-N as u32}
a = 1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16
n = HFLEN
validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

aarch64 = dup
arm = vmov
generate int64x2_t, int64x1_t:int64x2_t, uint64x2_t, uint64x1_t:uint64x2_t

/// Set all vector lanes to the same value
name = vdup
lane-suffixes
constn = N
multi_fn = static_assert_imm-in_exp_len-N
multi_fn = simd_shuffle-out_len-!, a, a, {dup-out_len-N as u32}
a = 1., 1., 1., 4.
n = HFLEN
validate 1., 1., 1., 1.

aarch64 = dup
generate float64x2_t, float64x1_t:float64x2_t

arm = vdup.l
generate float*_t, float32x4_t:float32x2_t, float32x2_t:float32x4_t

/// Set all vector lanes to the same value
name = vdup
lane-suffixes
constn = N
multi_fn = static_assert_imm-in_exp_len-N
multi_fn = a
a = 0
n = HFLEN
validate 0

aarch64 = nop
generate poly64x1_t

arm = nop
generate int64x1_t, uint64x1_t

/// Set all vector lanes to the same value
name = vdup
lane-suffixes
constn = N
multi_fn = static_assert_imm-in_exp_len-N
multi_fn = a
a = 0.
n = HFLEN
validate 0.

aarch64 = nop
generate float64x1_t

/// Set all vector lanes to the same value
name = vdup
lane-suffixes
constn = N
multi_fn = static_assert_imm-in_exp_len-N
multi_fn = transmute--, {simd_extract, a, N as u32}
a = 0, 1
n = HFLEN
validate 1

aarch64 = nop
generate poly64x2_t:poly64x1_t

arm = vmov
generate int64x2_t:int64x1_t, uint64x2_t:uint64x1_t

/// Set all vector lanes to the same value
name = vdup
lane-suffixes
constn = N
multi_fn = static_assert_imm-in_exp_len-N
multi_fn = transmute--, {simd_extract, a, N as u32}
a = 0., 1.
n = HFLEN
validate 1.
aarch64 = nop
generate float64x2_t:float64x1_t

/// Set all vector lanes to the same value
name = vdup
lane-suffixes
constn = N
multi_fn = static_assert_imm-in_exp_len-N
multi_fn = simd_extract, a, N as u32
a = 1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16
n = HFLEN
validate 1

aarch64 = nop
generate int8x8_t:i8, int8x16_t:i8, int16x4_t:i16, int16x8_t:i16, int32x2_t:i32, int32x4_t:i32, int64x1_t:i64, int64x2_t:i64
generate uint8x8_t:u8, uint8x16_t:u8, uint16x4_t:u16, uint16x8_t:u16, uint32x2_t:u32, uint32x4_t:u32, uint64x1_t:u64, uint64x2_t:u64
generate poly8x8_t:p8, poly8x16_t:p8, poly16x4_t:p16, poly16x8_t:p16

/// Set all vector lanes to the same value
name = vdup
lane-suffixes
constn = N
multi_fn = static_assert_imm-in_exp_len-N
multi_fn = simd_extract, a, N as u32
a = 1., 1., 1., 4.
n = HFLEN
validate 1.

aarch64 = nop
generate float32x2_t:f32, float32x4_t:f32, float64x1_t:f64, float64x2_t:f64

/// Extract vector from pair of vectors
name = vext
constn = N
multi_fn = static_assert_imm-out_exp_len-N
multi_fn = matchn-out_exp_len-N, simd_shuffle-out_len-!, a, b, {asc-n-out_len}
a = 0, 8, 8, 9, 8, 9, 9, 11, 8, 9, 9, 11, 9, 11, 14, 15
b = 9, 11, 14, 15, 16, 17, 18, 19, 0, 8, 8, 9, 8, 9, 9, 11
n = HFLEN
validate 8, 9, 9, 11, 9, 11, 14, 15, 9, 11, 14, 15, 16, 17, 18, 19

arm = "vext.8"
aarch64 = ext
generate int*_t, uint*_t, poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t

/// Extract vector from pair of vectors
name = vext
constn = N
multi_fn = static_assert_imm-out_exp_len-N
multi_fn = matchn-out_exp_len-N, simd_shuffle-out_len-!, a, b, {asc-n-out_len}
a = 0, 8, 8, 9, 8, 9, 9, 11, 8, 9, 9, 11, 9, 11, 14, 15
b = 9, 11, 14, 15, 16, 17, 18, 19, 0, 8, 8, 9, 8, 9, 9, 11
n = HFLEN
validate 8, 9, 9, 11, 9, 11, 14, 15, 9, 11, 14, 15, 16, 17, 18, 19

aarch64 = ext
generate poly64x2_t

arm = vmov
generate int64x2_t, uint64x2_t

/// Extract vector from pair of vectors
name = vext
constn = N
multi_fn = static_assert_imm-out_exp_len-N
multi_fn = matchn-out_exp_len-N, simd_shuffle-out_len-!, a, b, {asc-n-out_len}
a = 0., 2., 2., 3.
b = 3., 4., 5., 6.
n = HFLEN
validate 2., 3., 3., 4.

aarch64 = ext
generate float64x2_t

arm = "vext.8"
generate float*_t

/// Multiply-add to accumulator
name = vmla
multi_fn = simd_add, a, {simd_mul, b, c}
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21

arm = vmla.
aarch64 = mla
generate int*_t, uint*_t

/// Floating-point multiply-add to accumulator
name = vmla
multi_fn = simd_add, a, {simd_mul, b, c}
a = 0., 1., 2., 3.
b = 2., 2., 2., 2.
c = 3., 3., 3., 3.
validate 6., 7., 8., 9.

aarch64 = fmul
generate float64x*_t

arm = vmla.
generate float*_t

/// Vector multiply accumulate with scalar
name = vmla
n-suffix
multi_fn = vmla-self-noext, a, b, {vdup-nself-noext, c}
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
c = 3
validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21

aarch64 = mla
arm = vmla.
generate int16x4_t:int16x4_t:i16:int16x4_t, int16x8_t:int16x8_t:i16:int16x8_t, int32x2_t:int32x2_t:i32:int32x2_t, int32x4_t:int32x4_t:i32:int32x4_t
generate uint16x4_t:uint16x4_t:u16:uint16x4_t, uint16x8_t:uint16x8_t:u16:uint16x8_t, uint32x2_t:uint32x2_t:u32:uint32x2_t, uint32x4_t:uint32x4_t:u32:uint32x4_t

/// Vector multiply accumulate with scalar
name = vmla
n-suffix
multi_fn = vmla-self-noext, a, b, {vdup-nself-noext, c}
a = 0., 1., 2., 3.
b = 2., 2., 2., 2.
c = 3.
validate 6., 7., 8., 9.

aarch64 = fmul
arm = vmla.
generate float32x2_t:float32x2_t:f32:float32x2_t, float32x4_t:float32x4_t:f32:float32x4_t

/// Vector multiply accumulate with scalar
name = vmla
in2-lane-suffixes
constn = LANE
multi_fn = static_assert_imm-in2_exp_len-LANE
multi_fn = vmla-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}}
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
c = 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
n = 1
validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21

aarch64 = mla
arm = vmla.
generate int16x4_t, int16x4_t:int16x4_t:int16x8_t:int16x4_t, int16x8_t:int16x8_t:int16x4_t:int16x8_t, int16x8_t
generate int32x2_t, int32x2_t:int32x2_t:int32x4_t:int32x2_t, int32x4_t:int32x4_t:int32x2_t:int32x4_t, int32x4_t
generate uint16x4_t, uint16x4_t:uint16x4_t:uint16x8_t:uint16x4_t, uint16x8_t:uint16x8_t:uint16x4_t:uint16x8_t, uint16x8_t
generate uint32x2_t, uint32x2_t:uint32x2_t:uint32x4_t:uint32x2_t, uint32x4_t:uint32x4_t:uint32x2_t:uint32x4_t, uint32x4_t

/// Vector multiply accumulate with scalar
name = vmla
in2-lane-suffixes
constn = LANE
multi_fn = static_assert_imm-in2_exp_len-LANE
multi_fn = vmla-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}}
a = 0., 1., 2., 3.
b = 2., 2., 2., 2.
c = 0., 3., 0., 0.
n = 1
validate 6., 7., 8., 9.

aarch64 = fmul
arm = vmla.
generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t

/// Signed multiply-add long
name = vmlal
multi_fn = simd_add, a, {vmull-self-noext, b, c}
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21

arm = vmlal.s
aarch64 = smlal
generate int16x8_t:int8x8_t:int8x8_t:int16x8_t, int32x4_t:int16x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int32x2_t:int64x2_t

/// Unsigned multiply-add long
name = vmlal
multi_fn = simd_add, a, {vmull-self-noext, b, c}
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21

arm = vmlal.s
aarch64 = umlal
generate uint16x8_t:uint8x8_t:uint8x8_t:uint16x8_t, uint32x4_t:uint16x4_t:uint16x4_t:uint32x4_t, uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t

/// Vector widening multiply accumulate with scalar
name = vmlal
n-suffix
multi_fn = vmlal-self-noext, a, b, {vdup-nself-noext, c}
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
c = 3
validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21

arm = vmlal.s
aarch64 = smlal
generate int32x4_t:int16x4_t:i16:int32x4_t, int64x2_t:int32x2_t:i32:int64x2_t
aarch64 = umlal
generate uint32x4_t:uint16x4_t:u16:uint32x4_t, uint64x2_t:uint32x2_t:u32:uint64x2_t

/// Vector widening multiply accumulate with scalar
name = vmlal_lane
in2-suffix
constn = LANE
multi_fn = static_assert_imm-in2_exp_len-LANE
multi_fn = vmlal-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}}
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
c = 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
n = 1
validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21

arm = vmlal.s
aarch64 = smlal
generate int32x4_t:int16x4_t:int16x4_t:int32x4_t, int32x4_t:int16x4_t:int16x8_t:int32x4_t
generate int64x2_t:int32x2_t:int32x2_t:int64x2_t, int64x2_t:int32x2_t:int32x4_t:int64x2_t
aarch64 = umlal
generate uint32x4_t:uint16x4_t:uint16x4_t:uint32x4_t, uint32x4_t:uint16x4_t:uint16x8_t:uint32x4_t
generate uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t, uint64x2_t:uint32x2_t:uint32x4_t:uint64x2_t

/// Signed multiply-add long
name = vmlal_high
no-q
multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right}
multi_fn = simd_shuffle-out_len-!, c:half, c, c, {fixed-half-right}
multi_fn = vmlal-noqself-noext, a, b, c
a = 8, 7, 6, 5, 4, 3, 2, 1
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
c = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7
fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
validate 8, 9, 10, 11, 12, 13, 14, 15

aarch64 = smlal2
generate int16x8_t:int8x16_t:int8x16_t:int16x8_t, int32x4_t:int16x8_t:int16x8_t:int32x4_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t

/// Unsigned multiply-add long
name = vmlal_high
no-q
multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right}
multi_fn = simd_shuffle-out_len-!, c:half, c, c, {fixed-half-right}
multi_fn = vmlal-noqself-noext, a, b, c
a = 8, 7, 6, 5, 4, 3, 2, 1
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
c = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7
fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
validate 8, 9, 10, 11, 12, 13, 14, 15

aarch64 = umlal2
generate uint16x8_t:uint8x16_t:uint8x16_t:uint16x8_t, uint32x4_t:uint16x8_t:uint16x8_t:uint32x4_t, uint64x2_t:uint32x4_t:uint32x4_t:uint64x2_t

/// Multiply-add long
name = vmlal_high_n
no-q
multi_fn = vmlal_high-noqself-noext, a, b, {vdupq_n-noqself-noext, c}
a = 8, 7, 6, 5, 4, 3, 2, 1
b = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7
c = 2
validate 8, 9, 10, 11, 12, 13, 14, 15

aarch64 = smlal2
generate int32x4_t:int16x8_t:i16:int32x4_t, int64x2_t:int32x4_t:i32:int64x2_t
aarch64 = umlal2
generate uint32x4_t:uint16x8_t:u16:uint32x4_t, uint64x2_t:uint32x4_t:u32:uint64x2_t

/// Multiply-add long
name = vmlal_high_lane
in2-suffix
constn = LANE
multi_fn = static_assert_imm-in2_exp_len-LANE
multi_fn = vmlal_high-noqself-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}}
a = 8, 7, 6, 5, 4, 3, 2, 1
b = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7
c = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
n = 1
validate 8, 9, 10, 11, 12, 13, 14, 15

aarch64 = smlal2
generate int32x4_t:int16x8_t:int16x4_t:int32x4_t, int32x4_t:int16x8_t:int16x8_t:int32x4_t
generate int64x2_t:int32x4_t:int32x2_t:int64x2_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t
aarch64 = umlal2
generate uint32x4_t:uint16x8_t:uint16x4_t:uint32x4_t, uint32x4_t:uint16x8_t:uint16x8_t:uint32x4_t
generate uint64x2_t:uint32x4_t:uint32x2_t:uint64x2_t, uint64x2_t:uint32x4_t:uint32x4_t:uint64x2_t

/// Multiply-subtract from accumulator
name = vmls
multi_fn = simd_sub, a, {simd_mul, b, c}
a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15

arm = vmls.
aarch64 = mls
generate int*_t, uint*_t

/// Floating-point multiply-subtract from accumulator
name = vmls
multi_fn = simd_sub, a, {simd_mul, b, c}
a = 6., 7., 8., 9.
b = 2., 2., 2., 2.
c = 3., 3., 3., 3.
validate 0., 1., 2., 3.

aarch64 = fmul
generate float64x*_t

arm = vmls.
generate float*_t

/// Vector multiply subtract with scalar
name = vmls
n-suffix
multi_fn = vmls-self-noext, a, b, {vdup-nself-noext, c}
a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
c = 3
validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15

aarch64 = mls
arm = vmls.
generate int16x4_t:int16x4_t:i16:int16x4_t, int16x8_t:int16x8_t:i16:int16x8_t, int32x2_t:int32x2_t:i32:int32x2_t, int32x4_t:int32x4_t:i32:int32x4_t
generate uint16x4_t:uint16x4_t:u16:uint16x4_t, uint16x8_t:uint16x8_t:u16:uint16x8_t, uint32x2_t:uint32x2_t:u32:uint32x2_t, uint32x4_t:uint32x4_t:u32:uint32x4_t

/// Vector multiply subtract with scalar
name = vmls
n-suffix
multi_fn = vmls-self-noext, a, b, {vdup-nself-noext, c}
a = 6., 7., 8., 9.
b = 2., 2., 2., 2.
c = 3.
validate 0., 1., 2., 3.

aarch64 = fmul
arm = vmls.
generate float32x2_t:float32x2_t:f32:float32x2_t, float32x4_t:float32x4_t:f32:float32x4_t

/// Vector multiply subtract with scalar
name = vmls
in2-lane-suffixes
constn = LANE
multi_fn = static_assert_imm-in2_exp_len-LANE
multi_fn = vmls-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}}
a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
c = 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
n = 1
validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15

aarch64 = mls
arm = vmls.
generate int16x4_t, int16x4_t:int16x4_t:int16x8_t:int16x4_t, int16x8_t:int16x8_t:int16x4_t:int16x8_t, int16x8_t generate int32x2_t, int32x2_t:int32x2_t:int32x4_t:int32x2_t, int32x4_t:int32x4_t:int32x2_t:int32x4_t, int32x4_t generate uint16x4_t, uint16x4_t:uint16x4_t:uint16x8_t:uint16x4_t, uint16x8_t:uint16x8_t:uint16x4_t:uint16x8_t, uint16x8_t generate uint32x2_t, uint32x2_t:uint32x2_t:uint32x4_t:uint32x2_t, uint32x4_t:uint32x4_t:uint32x2_t:uint32x4_t, uint32x4_t /// Vector multiply subtract with scalar name = vmls in2-lane-suffixes constn = LANE multi_fn = static_assert_imm-in2_exp_len-LANE multi_fn = vmls-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}} a = 6., 7., 8., 9. b = 2., 2., 2., 2. c = 0., 3., 0., 0. n = 1 validate 0., 1., 2., 3. aarch64 = fmul arm = vmls. generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t /// Signed multiply-subtract long name = vmlsl multi_fn = simd_sub, a, {vmull-self-noext, b, c} a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 arm = vmlsl.s aarch64 = smlsl generate int16x8_t:int8x8_t:int8x8_t:int16x8_t, int32x4_t:int16x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int32x2_t:int64x2_t /// Unsigned multiply-subtract long name = vmlsl multi_fn = simd_sub, a, {vmull-self-noext, b, c} a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 arm = vmlsl.s aarch64 = umlsl generate uint16x8_t:uint8x8_t:uint8x8_t:uint16x8_t, uint32x4_t:uint16x4_t:uint16x4_t:uint32x4_t, uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t /// Vector widening multiply subtract with scalar name = vmlsl n-suffix multi_fn = vmlsl-self-noext, a, b, {vdup-nself-noext, c} a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 c = 3 validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 arm = vmlsl.s aarch64 = smlsl generate int32x4_t:int16x4_t:i16:int32x4_t, int64x2_t:int32x2_t:i32:int64x2_t aarch64 = umlsl generate uint32x4_t:uint16x4_t:u16:uint32x4_t, uint64x2_t:uint32x2_t:u32:uint64x2_t /// Vector widening multiply subtract with scalar name = vmlsl_lane in2-suffix constn = LANE multi_fn = static_assert_imm-in2_exp_len-LANE multi_fn = vmlsl-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}} a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 c = 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 n = 1 validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 arm = vmlsl.s aarch64 = smlsl generate int32x4_t:int16x4_t:int16x4_t:int32x4_t, int32x4_t:int16x4_t:int16x8_t:int32x4_t generate int64x2_t:int32x2_t:int32x2_t:int64x2_t, int64x2_t:int32x2_t:int32x4_t:int64x2_t aarch64 = umlsl generate uint32x4_t:uint16x4_t:uint16x4_t:uint32x4_t, uint32x4_t:uint16x4_t:uint16x8_t:uint32x4_t generate uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t, uint64x2_t:uint32x2_t:uint32x4_t:uint64x2_t /// Signed multiply-subtract long name = vmlsl_high no-q multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right} multi_fn = simd_shuffle-out_len-!, c:half, c, c, {fixed-half-right} multi_fn = vmlsl-noqself-noext, a, b, c a = 
14, 15, 16, 17, 18, 19, 20, 21 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 c = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7 fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 validate 14, 13, 12, 11, 10, 9, 8, 7 aarch64 = smlsl2 generate int16x8_t:int8x16_t:int8x16_t:int16x8_t, int32x4_t:int16x8_t:int16x8_t:int32x4_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t /// Unsigned multiply-subtract long name = vmlsl_high no-q multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right} multi_fn = simd_shuffle-out_len-!, c:half, c, c, {fixed-half-right} multi_fn = vmlsl-noqself-noext, a, b, c a = 14, 15, 16, 17, 18, 19, 20, 21 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 c = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7 fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 validate 14, 13, 12, 11, 10, 9, 8, 7 aarch64 = umlsl2 generate uint16x8_t:uint8x16_t:uint8x16_t:uint16x8_t, uint32x4_t:uint16x8_t:uint16x8_t:uint32x4_t, uint64x2_t:uint32x4_t:uint32x4_t:uint64x2_t /// Multiply-subtract long name = vmlsl_high_n no-q multi_fn = vmlsl_high-noqself-noext, a, b, {vdupq_n-noqself-noext, c} a = 14, 15, 16, 17, 18, 19, 20, 21 b = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7 c = 2 validate 14, 13, 12, 11, 10, 9, 8, 7 aarch64 = smlsl2 generate int32x4_t:int16x8_t:i16:int32x4_t, int64x2_t:int32x4_t:i32:int64x2_t aarch64 = umlsl2 generate uint32x4_t:uint16x8_t:u16:uint32x4_t, uint64x2_t:uint32x4_t:u32:uint64x2_t /// Multiply-subtract long name = vmlsl_high_lane in2-suffix constn = LANE multi_fn = static_assert_imm-in2_exp_len-LANE multi_fn = vmlsl_high-noqself-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}} a = 14, 15, 16, 17, 18, 19, 20, 21 b = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7 c = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 n = 1 validate 14, 13, 12, 11, 10, 9, 8, 7 aarch64 = smlsl2 generate int32x4_t:int16x8_t:int16x4_t:int32x4_t, int32x4_t:int16x8_t:int16x8_t:int32x4_t generate int64x2_t:int32x4_t:int32x2_t:int64x2_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t aarch64 = umlsl2 generate uint32x4_t:uint16x8_t:uint16x4_t:uint32x4_t, uint32x4_t:uint16x8_t:uint16x8_t:uint32x4_t generate uint64x2_t:uint32x4_t:uint32x2_t:uint64x2_t, uint64x2_t:uint32x4_t:uint32x4_t:uint64x2_t /// Extract narrow name = vmovn_high no-q multi_fn = simd_cast, c:in_t0, b multi_fn = simd_shuffle-out_len-!, a, c, {asc-0-out_len} a = 0, 1, 2, 3, 2, 3, 4, 5 b = 2, 3, 4, 5, 12, 13, 14, 15 validate 0, 1, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 12, 13, 14, 15 aarch64 = xtn2 generate int8x8_t:int16x8_t:int8x16_t, int16x4_t:int32x4_t:int16x8_t, int32x2_t:int64x2_t:int32x4_t generate uint8x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint16x8_t, uint32x2_t:uint64x2_t:uint32x4_t /// Negate name = vneg fn = simd_neg a = 0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, 6, -6, 7, -7, 8 validate 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7, -8 aarch64 = neg generate int64x*_t arm = vneg.s generate int*_t /// Negate name = vneg multi_fn = a.wrapping_neg() a = 1 validate -1 aarch64 = neg generate i64 /// Negate name = vneg fn = simd_neg a = 0., 1., -1., 2., -2., 3., -3., 4. validate 0., -1., 1., -2., 2., -3., 3., -4. 
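// Note (editorial illustration, not generator input): the *_high forms above operate on the
// upper halves of their 128-bit sources; assuming the generated name vmlsl_high_s16, result
// lane i is a[i] - (b[4 + i] as i32) * (c[4 + i] as i32) for i in 0..4, and vmovn_high keeps
// a as the lower half of the result while narrowing b into the upper half.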
aarch64 = fneg generate float64x*_t arm = vneg.s generate float*_t /// Signed saturating negate name = vqneg a = MIN, 0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, 6, -6, 7, -7 validate MAX, 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7 link-arm = vqneg._EXT_ link-aarch64 = sqneg._EXT_ aarch64 = sqneg generate int64x*_t arm = vqneg.s generate int*_t /// Signed saturating negate name = vqneg multi_fn = simd_extract, {vqneg-in_ntt-noext, {vdup_n-in_ntt-noext, a}}, 0 a = 1 validate -1 aarch64 = sqneg generate i8, i16, i32, i64 /// Saturating subtract name = vqsub a = 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42 b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 validate 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26 arm = vqsub.s aarch64 = uqsub link-arm = llvm.usub.sat._EXT_ link-aarch64 = uqsub._EXT_ generate uint*_t, uint64x*_t arm = vqsub.s aarch64 = sqsub link-arm = llvm.ssub.sat._EXT_ link-aarch64 = sqsub._EXT_ generate int*_t, int64x*_t /// Saturating subtract name = vqsub multi_fn = vdup_n-in_ntt-noext, a:in_ntt, a multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b multi_fn = simd_extract, {vqsub-in_ntt-noext, a, b}, 0 a = 42 b = 1 validate 41 aarch64 = sqsub generate i8, i16 aarch64 = uqsub generate u8, u16 /// Saturating subtract name = vqsub a = 42 b = 1 validate 41 aarch64 = uqsub link-aarch64 = uqsub._EXT_ generate u32, u64 aarch64 = sqsub link-aarch64 = sqsub._EXT_ generate i32, i64 /// Halving add name = vhadd a = 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42 b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 validate 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29 arm = vhadd.s aarch64 = uhadd link-aarch64 = uhadd._EXT_ link-arm = vhaddu._EXT_ generate uint*_t arm = vhadd.s aarch64 = shadd link-aarch64 = shadd._EXT_ link-arm = vhadds._EXT_ generate int*_t /// Reverse bit order name = vrbit a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 validate 0, 64, 32, 96, 16, 80, 48, 112, 8, 72, 40, 104, 24, 88, 56, 120 aarch64 = rbit link-aarch64 = rbit._EXT_ generate int8x8_t, int8x16_t /// Reverse bit order name = vrbit multi_fn = transmute, {vrbit-signed-noext, transmute(a)} a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 validate 0, 64, 32, 96, 16, 80, 48, 112, 8, 72, 40, 104, 24, 88, 56, 120 aarch64 = rbit generate uint8x8_t, uint8x16_t, poly8x8_t, poly8x16_t /// Rounding halving add name = vrhadd a = 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42 b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 validate 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29 arm = vrhadd.s aarch64 = urhadd link-arm = vrhaddu._EXT_ link-aarch64 = urhadd._EXT_ generate uint*_t arm = vrhadd.s aarch64 = srhadd link-arm = vrhadds._EXT_ link-aarch64 = srhadd._EXT_ generate int*_t /// Floating-point round to integral exact, using current rounding mode name = vrndx a = -1.5, 0.5, 1.5, 2.5 validate -2.0, 0.0, 2.0, 2.0 aarch64 = frintx link-aarch64 = llvm.rint._EXT_ generate float*_t, float64x*_t /// Floating-point round to integral, to nearest with ties to away name = vrnda a = -1.5, 0.5, 1.5, 2.5 validate -2.0, 1.0, 2.0, 3.0 aarch64 = frinta link-aarch64 = llvm.round._EXT_ generate float*_t, float64x*_t /// Floating-point round to integral, to nearest with ties to even name = vrndn a = -1.5, 0.5, 1.5, 2.5 validate -2.0, 0.0, 2.0, 2.0 link-aarch64 = frintn._EXT_ aarch64 = frintn generate float64x*_t target = fp-armv8 arm = vrintn link-arm = vrintn._EXT_ generate float*_t /// 
Floating-point round to integral, to nearest with ties to even name = vrndn a = -1.5 validate -2.0 aarch64 = frintn link-aarch64 = llvm.roundeven._EXT_ generate f32 /// Floating-point round to integral, toward minus infinity name = vrndm a = -1.5, 0.5, 1.5, 2.5 validate -2.0, 0.0, 1.0, 2.0 aarch64 = frintm link-aarch64 = llvm.floor._EXT_ generate float*_t, float64x*_t /// Floating-point round to integral, toward plus infinity name = vrndp a = -1.5, 0.5, 1.5, 2.5 validate -1.0, 1.0, 2.0, 3.0 aarch64 = frintp link-aarch64 = llvm.ceil._EXT_ generate float*_t, float64x*_t /// Floating-point round to integral, toward zero name = vrnd a = -1.5, 0.5, 1.5, 2.5 validate -1.0, 0.0, 1.0, 2.0 aarch64 = frintz link-aarch64 = llvm.trunc._EXT_ generate float*_t, float64x*_t /// Floating-point round to integral, using current rounding mode name = vrndi a = -1.5, 0.5, 1.5, 2.5 validate -2.0, 0.0, 2.0, 2.0 aarch64 = frinti link-aarch64 = llvm.nearbyint._EXT_ generate float*_t, float64x*_t /// Saturating add name = vqadd a = 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42 b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 validate 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58 arm = vqadd.s aarch64 = uqadd link-arm = llvm.uadd.sat._EXT_ link-aarch64 = uqadd._EXT_ generate uint*_t, uint64x*_t arm = vqadd.s aarch64 = sqadd link-arm = llvm.sadd.sat._EXT_ link-aarch64 = sqadd._EXT_ generate int*_t, int64x*_t /// Saturating add name = vqadd multi_fn = vdup_n-in_ntt-noext, a:in_ntt, a multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b multi_fn = simd_extract, {vqadd-in_ntt-noext, a, b}, 0 a = 42 b = 1 validate 43 aarch64 = sqadd generate i8, i16 aarch64 = uqadd generate u8, u16 /// Saturating add name = vqadd a = 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42 b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 validate 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58 aarch64 = uqadd link-aarch64 = uqadd._EXT_ generate u32, u64 aarch64 = sqadd link-aarch64 = sqadd._EXT_ generate i32, i64 /// Load multiple single-element structures to one, two, three, or four registers name = vld1 out-suffix a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 load_fn aarch64 = ld1 link-aarch64 = ld1x2._EXT2_ arm = vld1 link-arm = vld1x2._EXT2_ generate *const i8:int8x8x2_t, *const i16:int16x4x2_t, *const i32:int32x2x2_t, *const i64:int64x1x2_t generate *const i8:int8x16x2_t, *const i16:int16x8x2_t, *const i32:int32x4x2_t, *const i64:int64x2x2_t link-aarch64 = ld1x3._EXT2_ link-arm = vld1x3._EXT2_ generate *const i8:int8x8x3_t, *const i16:int16x4x3_t, *const i32:int32x2x3_t, *const i64:int64x1x3_t generate *const i8:int8x16x3_t, *const i16:int16x8x3_t, *const i32:int32x4x3_t, *const i64:int64x2x3_t link-aarch64 = ld1x4._EXT2_ link-arm = vld1x4._EXT2_ generate *const i8:int8x8x4_t, *const i16:int16x4x4_t, *const i32:int32x2x4_t, *const i64:int64x1x4_t generate *const i8:int8x16x4_t, *const i16:int16x8x4_t, *const i32:int32x4x4_t, *const i64:int64x2x4_t /// Load multiple single-element structures to one, two, three, or four registers name = 
vld1 out-suffix multi_fn = transmute, {vld1-outsigned-noext, transmute(a)} a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 load_fn aarch64 = ld1 arm = vld1 generate *const u8:uint8x8x2_t, *const u16:uint16x4x2_t, *const u32:uint32x2x2_t, *const u64:uint64x1x2_t generate *const u8:uint8x16x2_t, *const u16:uint16x8x2_t, *const u32:uint32x4x2_t, *const u64:uint64x2x2_t generate *const u8:uint8x8x3_t, *const u16:uint16x4x3_t, *const u32:uint32x2x3_t, *const u64:uint64x1x3_t generate *const u8:uint8x16x3_t, *const u16:uint16x8x3_t, *const u32:uint32x4x3_t, *const u64:uint64x2x3_t generate *const u8:uint8x8x4_t, *const u16:uint16x4x4_t, *const u32:uint32x2x4_t, *const u64:uint64x1x4_t generate *const u8:uint8x16x4_t, *const u16:uint16x8x4_t, *const u32:uint32x4x4_t, *const u64:uint64x2x4_t generate *const p8:poly8x8x2_t, *const p8:poly8x8x3_t, *const p8:poly8x8x4_t generate *const p8:poly8x16x2_t, *const p8:poly8x16x3_t, *const p8:poly8x16x4_t generate *const p16:poly16x4x2_t, *const p16:poly16x4x3_t, *const p16:poly16x4x4_t generate *const p16:poly16x8x2_t, *const p16:poly16x8x3_t, *const p16:poly16x8x4_t target = aes generate *const p64:poly64x1x2_t arm = nop generate *const p64:poly64x1x3_t, *const p64:poly64x1x4_t generate *const p64:poly64x2x2_t, *const p64:poly64x2x3_t, *const p64:poly64x2x4_t /// Load multiple single-element structures to one, two, three, or four registers name = vld1 out-suffix a = 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16. validate 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16. 
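// Note (editorial illustration, not generator input): the vld1_*_x2/_x3/_x4 forms load two,
// three or four vectors from consecutive memory with no de-interleaving; assuming the
// generated name vld1_u8_x2, `let v = vld1_u8_x2(ptr);` fills v.0 from ptr[0..8] and v.1
// from ptr[8..16].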
load_fn aarch64 = ld1 link-aarch64 = ld1x2._EXT2_ generate *const f64:float64x1x2_t, *const f64:float64x2x2_t link-aarch64 = ld1x3._EXT2_ generate *const f64:float64x1x3_t, *const f64:float64x2x3_t link-aarch64 = ld1x4._EXT2_ generate *const f64:float64x1x4_t, *const f64:float64x2x4_t arm = vld1 link-aarch64 = ld1x2._EXT2_ link-arm = vld1x2._EXT2_ generate *const f32:float32x2x2_t, *const f32:float32x4x2_t link-aarch64 = ld1x3._EXT2_ link-arm = vld1x3._EXT2_ generate *const f32:float32x2x3_t, *const f32:float32x4x3_t link-aarch64 = ld1x4._EXT2_ link-arm = vld1x4._EXT2_ generate *const f32:float32x2x4_t, *const f32:float32x4x4_t /// Load multiple 2-element structures to two registers name = vld2 out-nox a = 0, 1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17 validate 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 load_fn arm-aarch64-separate aarch64 = ld2 link-aarch64 = ld2._EXTv2_ generate *const i64:int64x2x2_t arm = vld2 link-arm = vld2._EXTpi82_ generate *const i8:int8x8x2_t, *const i16:int16x4x2_t, *const i32:int32x2x2_t generate *const i8:int8x16x2_t, *const i16:int16x8x2_t, *const i32:int32x4x2_t arm = nop aarch64 = nop generate *const i64:int64x1x2_t /// Load multiple 2-element structures to two registers name = vld2 out-nox multi_fn = transmute, {vld2-outsignednox-noext, transmute(a)} a = 0, 1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17 validate 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 load_fn aarch64 = ld2 generate *const u64:uint64x2x2_t target = aes generate *const p64:poly64x2x2_t target = default arm = vld2 generate *const u8:uint8x8x2_t, *const u16:uint16x4x2_t, *const u32:uint32x2x2_t generate *const u8:uint8x16x2_t, *const u16:uint16x8x2_t, *const u32:uint32x4x2_t generate *const p8:poly8x8x2_t, *const p16:poly16x4x2_t, *const p8:poly8x16x2_t, *const p16:poly16x8x2_t arm = nop aarch64 = nop generate *const u64:uint64x1x2_t target = aes generate *const p64:poly64x1x2_t /// Load multiple 2-element structures to two registers name = vld2 out-nox a = 0., 1., 2., 2., 3., 2., 4., 3., 5., 2., 6., 3., 7., 4., 8., 5., 9. validate 1., 2., 2., 3., 2., 3., 4., 5., 2., 3., 4., 5., 6., 7., 8., 9. 
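// Note (editorial illustration, not generator input): vld2 de-interleaves 2-element
// structures, so memory laid out as x0, y0, x1, y1, ... comes back as the pair
// (x0, x1, ...) / (y0, y1, ...); assuming the generated name vld2_u8,
// `let v = vld2_u8(ptr);` puts the even-indexed elements in v.0 and the odd-indexed in v.1.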
load_fn arm-aarch64-separate aarch64 = nop link-aarch64 = ld2._EXTv2_ generate *const f64:float64x1x2_t aarch64 = ld2 generate *const f64:float64x2x2_t arm = vld2 link-arm = vld2._EXTpi82_ generate *const f32:float32x2x2_t, *const f32:float32x4x2_t /// Load single 2-element structure and replicate to all lanes of two registers name = vld2 out-dup-nox a = 0, 1, 1, 2, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17 validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 load_fn arm-aarch64-separate aarch64 = ld2r link-aarch64 = ld2r._EXT2_ generate *const i64:int64x2x2_t arm = vld2 link-arm = vld2dup._EXTpi82_ generate *const i8:int8x8x2_t, *const i16:int16x4x2_t, *const i32:int32x2x2_t generate *const i8:int8x16x2_t, *const i16:int16x8x2_t, *const i32:int32x4x2_t arm = nop generate *const i64:int64x1x2_t /// Load single 2-element structure and replicate to all lanes of two registers name = vld2 out-dup-nox multi_fn = transmute, {vld2-outsigneddupnox-noext, transmute(a)} a = 0, 1, 1, 2, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17 validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 load_fn aarch64 = ld2r generate *const u64:uint64x2x2_t target = aes generate *const p64:poly64x2x2_t target = default arm = vld2 generate *const u8:uint8x8x2_t, *const u16:uint16x4x2_t, *const u32:uint32x2x2_t generate *const u8:uint8x16x2_t, *const u16:uint16x8x2_t, *const u32:uint32x4x2_t generate *const p8:poly8x8x2_t, *const p16:poly16x4x2_t, *const p8:poly8x16x2_t, *const p16:poly16x8x2_t arm = nop generate *const u64:uint64x1x2_t target = aes generate *const p64:poly64x1x2_t /// Load single 2-element structure and replicate to all lanes of two registers name = vld2 out-dup-nox a = 0., 1., 1., 2., 3., 1., 4., 3., 5. validate 1., 1., 1., 1., 1., 1., 1., 1. 
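// Note (editorial illustration, not generator input): the out-dup-nox (LD2R) form reads a
// single 2-element structure and replicates it, so v.0 holds the first loaded element in
// every lane and v.1 the second, which is why the expected output above is uniform.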
load_fn arm-aarch64-separate aarch64 = ld2r link-aarch64 = ld2r._EXT2_ generate *const f64:float64x1x2_t, *const f64:float64x2x2_t arm = vld2 link-arm = vld2dup._EXTpi82_ generate *const f32:float32x2x2_t, *const f32:float32x4x2_t /// Load multiple 2-element structures to two registers name = vld2 out-lane-nox multi_fn = static_assert_imm-in_exp_len-LANE constn = LANE a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 b = 0, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26 n = 0 validate 1, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26, 2, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26 load_fn arm-aarch64-separate aarch64 = ld2 const-aarch64 = LANE link-aarch64 = ld2lane._EXTpi82_ generate *const i8:int8x16x2_t:int8x16x2_t, *const i64:int64x1x2_t:int64x1x2_t, *const i64:int64x2x2_t:int64x2x2_t arm = vld2 const-arm = LANE link-arm = vld2lane._EXTpi82_ generate *const i8:int8x8x2_t:int8x8x2_t, *const i16:int16x4x2_t:int16x4x2_t, *const i32:int32x2x2_t:int32x2x2_t generate *const i16:int16x8x2_t:int16x8x2_t, *const i32:int32x4x2_t:int32x4x2_t /// Load multiple 2-element structures to two registers name = vld2 out-lane-nox multi_fn = static_assert_imm-in_exp_len-LANE multi_fn = transmute, {vld2-outsignedlanenox-::, transmute(a), transmute(b)} constn = LANE a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 b = 0, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26 n = 0 validate 1, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26, 2, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26 load_fn aarch64 = ld2 const-aarch64 = LANE target = aes generate *const p64:poly64x1x2_t:poly64x1x2_t, *const p64:poly64x2x2_t:poly64x2x2_t target = default generate *const u8:uint8x16x2_t:uint8x16x2_t, *const u64:uint64x1x2_t:uint64x1x2_t, *const u64:uint64x2x2_t:uint64x2x2_t generate *const p8:poly8x16x2_t:poly8x16x2_t arm = vld2 const-arm = LANE generate *const u8:uint8x8x2_t:uint8x8x2_t, *const u16:uint16x4x2_t:uint16x4x2_t, *const u32:uint32x2x2_t:uint32x2x2_t generate *const u16:uint16x8x2_t:uint16x8x2_t, *const u32:uint32x4x2_t:uint32x4x2_t generate *const p8:poly8x8x2_t:poly8x8x2_t, *const p16:poly16x4x2_t:poly16x4x2_t generate *const p16:poly16x8x2_t:poly16x8x2_t /// Load multiple 2-element structures to two registers name = vld2 out-lane-nox multi_fn = static_assert_imm-in_exp_len-LANE constn = LANE a = 0., 1., 2., 3., 4., 5., 6., 7., 8. b = 0., 2., 2., 14., 2., 16., 17., 18. n = 0 validate 1., 2., 2., 14., 2., 16., 17., 18. 
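// Note (editorial illustration, not generator input): the out-lane-nox (LD2 single
// structure) form loads one 2-element structure into lane LANE of the vectors passed in and
// leaves all other lanes unchanged; assuming the generated name vld2_lane_u8,
// `vld2_lane_u8::<0>(ptr, src)` replaces only lane 0 of src.0 and src.1.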
load_fn arm-aarch64-separate aarch64 = ld2 const-aarch64 = LANE link-aarch64 = ld2lane._EXTpi82_ generate *const f64:float64x1x2_t:float64x1x2_t, *const f64:float64x2x2_t:float64x2x2_t arm = vld2 const-arm = LANE link-arm = vld2lane._EXTpi82_ generate *const f32:float32x2x2_t:float32x2x2_t, *const f32:float32x4x2_t:float32x4x2_t /// Load multiple 3-element structures to three registers name = vld3 out-nox a = 0, 1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16, 2, 25, 41, 4, 26, 42, 7, 27, 43, 8, 28, 44, 13, 29, 45, 14, 30, 46, 15, 31, 47, 16, 32, 48 validate 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32, 2, 4, 7, 8, 13, 14, 15, 16, 41, 42, 43, 44, 45, 46, 47, 48 load_fn arm-aarch64-separate aarch64 = ld3 link-aarch64 = ld3._EXTv2_ generate *const i64:int64x2x3_t arm = vld3 link-arm = vld3._EXTpi82_ generate *const i8:int8x8x3_t, *const i16:int16x4x3_t, *const i32:int32x2x3_t generate *const i8:int8x16x3_t, *const i16:int16x8x3_t, *const i32:int32x4x3_t arm = nop aarch64 = nop generate *const i64:int64x1x3_t /// Load multiple 3-element structures to three registers name = vld3 out-nox multi_fn = transmute, {vld3-outsignednox-noext, transmute(a)} a = 0, 1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16, 2, 25, 41, 4, 26, 42, 7, 27, 43, 8, 28, 44, 13, 29, 45, 14, 30, 46, 15, 31, 47, 16, 32, 48 validate 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32, 2, 4, 7, 8, 13, 14, 15, 16, 41, 42, 43, 44, 45, 46, 47, 48 load_fn aarch64 = ld3 generate *const u64:uint64x2x3_t target = aes generate *const p64:poly64x2x3_t target = default arm = vld3 generate *const u8:uint8x8x3_t, *const u16:uint16x4x3_t, *const u32:uint32x2x3_t generate *const u8:uint8x16x3_t, *const u16:uint16x8x3_t, *const u32:uint32x4x3_t generate *const p8:poly8x8x3_t, *const p16:poly16x4x3_t, *const p8:poly8x16x3_t, *const p16:poly16x8x3_t arm = nop aarch64 = nop generate *const u64:uint64x1x3_t target = aes generate *const p64:poly64x1x3_t /// Load multiple 3-element structures to three registers name = vld3 out-nox a = 0., 1., 2., 2., 2., 4., 4., 2., 7., 7., 4., 8., 8. validate 1., 2., 2., 4., 2., 4., 7., 8., 2., 4., 7., 8. 
load_fn arm-aarch64-separate aarch64 = nop link-aarch64 = ld3._EXTv2_ generate *const f64:float64x1x3_t aarch64 = ld3 generate *const f64:float64x2x3_t arm = vld3 link-arm = vld3._EXTpi82_ generate *const f32:float32x2x3_t, *const f32:float32x4x3_t /// Load single 3-element structure and replicate to all lanes of three registers name = vld3 out-dup-nox a = 0, 1, 1, 1, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17, 6, 14, 7, 15, 8, 16, 9, 17, 6, 14, 7, 15, 8, 16, 9, 17 validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 load_fn arm-aarch64-separate aarch64 = ld3r link-aarch64 = ld3r._EXT2_ generate *const i64:int64x2x3_t arm = vld3 link-arm = vld3dup._EXTpi82_ generate *const i8:int8x8x3_t, *const i16:int16x4x3_t, *const i32:int32x2x3_t generate *const i8:int8x16x3_t, *const i16:int16x8x3_t, *const i32:int32x4x3_t arm = nop generate *const i64:int64x1x3_t /// Load single 3-element structure and replicate to all lanes of three registers name = vld3 out-dup-nox multi_fn = transmute, {vld3-outsigneddupnox-noext, transmute(a)} a = 0, 1, 1, 1, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17, 6, 14, 7, 15, 8, 16, 9, 17, 6, 14, 7, 15, 8, 16, 9, 17 validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 load_fn aarch64 = ld3r generate *const u64:uint64x2x3_t target = aes generate *const p64:poly64x2x3_t target = default arm = vld3 generate *const u8:uint8x8x3_t, *const u16:uint16x4x3_t, *const u32:uint32x2x3_t generate *const u8:uint8x16x3_t, *const u16:uint16x8x3_t, *const u32:uint32x4x3_t generate *const p8:poly8x8x3_t, *const p16:poly16x4x3_t, *const p8:poly8x16x3_t, *const p16:poly16x8x3_t arm = nop generate *const u64:uint64x1x3_t target = aes generate *const p64:poly64x1x3_t /// Load single 3-element structure and replicate to all lanes of three registers name = vld3 out-dup-nox a = 0., 1., 1., 1., 3., 1., 4., 3., 5., 1., 4., 3., 5. validate 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1. 
load_fn arm-aarch64-separate aarch64 = ld3r link-aarch64 = ld3r._EXT2_ generate *const f64:float64x1x3_t, *const f64:float64x2x3_t arm = vld3 link-arm = vld3dup._EXTpi82_ generate *const f32:float32x2x3_t, *const f32:float32x4x3_t /// Load multiple 3-element structures to three registers name = vld3 out-lane-nox multi_fn = static_assert_imm-in_exp_len-LANE constn = LANE a = 0, 1, 2, 2, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 b = 0, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 n = 0 validate 1, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26, 2, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 2, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 load_fn arm-aarch64-separate aarch64 = ld3 const-aarch64 = LANE link-aarch64 = ld3lane._EXTpi82_ generate *const i8:int8x16x3_t:int8x16x3_t, *const i64:int64x1x3_t:int64x1x3_t, *const i64:int64x2x3_t:int64x2x3_t arm = vld3 const-arm = LANE link-arm = vld3lane._EXTpi82_ generate *const i8:int8x8x3_t:int8x8x3_t, *const i16:int16x4x3_t:int16x4x3_t, *const i32:int32x2x3_t:int32x2x3_t generate *const i16:int16x8x3_t:int16x8x3_t, *const i32:int32x4x3_t:int32x4x3_t /// Load multiple 3-element structures to three registers name = vld3 out-lane-nox multi_fn = static_assert_imm-in_exp_len-LANE multi_fn = transmute, {vld3-outsignedlanenox-::, transmute(a), transmute(b)} constn = LANE a = 0, 1, 2, 2, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 b = 0, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 n = 0 validate 1, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26, 2, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 2, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 load_fn aarch64 = ld3 const-aarch64 = LANE target = aes generate *const p64:poly64x1x3_t:poly64x1x3_t, *const p64:poly64x2x3_t:poly64x2x3_t target = default generate *const p8:poly8x16x3_t:poly8x16x3_t, *const u8:uint8x16x3_t:uint8x16x3_t, *const u64:uint64x1x3_t:uint64x1x3_t, *const u64:uint64x2x3_t:uint64x2x3_t arm = vld3 const-arm = LANE generate *const u8:uint8x8x3_t:uint8x8x3_t, *const u16:uint16x4x3_t:uint16x4x3_t, *const u32:uint32x2x3_t:uint32x2x3_t generate *const u16:uint16x8x3_t:uint16x8x3_t, *const u32:uint32x4x3_t:uint32x4x3_t generate *const p8:poly8x8x3_t:poly8x8x3_t, *const p16:poly16x4x3_t:poly16x4x3_t generate *const p16:poly16x8x3_t:poly16x8x3_t /// Load multiple 3-element structures to three registers name = vld3 out-lane-nox multi_fn = static_assert_imm-in_exp_len-LANE constn = LANE a = 0., 1., 2., 2., 4., 5., 6., 7., 8., 5., 6., 7., 8. b = 0., 2., 2., 14., 9., 16., 17., 18., 5., 6., 7., 8. n = 0 validate 1., 2., 2., 14., 2., 16., 17., 18., 2., 6., 7., 8. 
load_fn arm-aarch64-separate aarch64 = ld3 const-aarch64 = LANE link-aarch64 = ld3lane._EXTpi82_ generate *const f64:float64x1x3_t:float64x1x3_t, *const f64:float64x2x3_t:float64x2x3_t arm = vld3 const-arm = LANE link-arm = vld3lane._EXTpi82_ generate *const f32:float32x2x3_t:float32x2x3_t, *const f32:float32x4x3_t:float32x4x3_t /// Load multiple 4-element structures to four registers name = vld4 out-nox a = 0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64 validate 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64 load_fn arm-aarch64-separate aarch64 = ld4 link-aarch64 = ld4._EXTv2_ generate *const i64:int64x2x4_t arm = vld4 link-arm = vld4._EXTpi82_ generate *const i8:int8x8x4_t, *const i16:int16x4x4_t, *const i32:int32x2x4_t generate *const i8:int8x16x4_t, *const i16:int16x8x4_t, *const i32:int32x4x4_t aarch64 = nop arm = nop generate *const i64:int64x1x4_t /// Load multiple 4-element structures to four registers name = vld4 out-nox multi_fn = transmute, {vld4-outsignednox-noext, transmute(a)} a = 0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64 validate 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64 load_fn aarch64 = ld4 generate *const u64:uint64x2x4_t target = aes generate *const p64:poly64x2x4_t target = default arm = vld4 generate *const u8:uint8x8x4_t, *const u16:uint16x4x4_t, *const u32:uint32x2x4_t generate *const u8:uint8x16x4_t, *const u16:uint16x8x4_t, *const u32:uint32x4x4_t generate *const p8:poly8x8x4_t, *const p16:poly16x4x4_t, *const p8:poly8x16x4_t, *const p16:poly16x8x4_t aarch64 = nop arm = nop generate *const u64:uint64x1x4_t target = aes generate *const p64:poly64x1x4_t /// Load multiple 4-element structures to four registers name = vld4 out-nox a = 0., 1., 2., 2., 6., 2., 6., 6., 8., 2., 6., 6., 8., 6., 8., 15., 16. validate 1., 2., 2., 6., 2., 6., 6., 8., 2., 6., 6., 15., 6., 8., 8., 16. 
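// Note (editorial illustration, not generator input): vld3 and vld4 follow the same pattern
// as vld2, de-interleaving 3- and 4-element structures into three or four vectors; their dup
// (LD3R/LD4R) and lane forms behave analogously to the vld2 variants above.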
load_fn arm-aarch64-separate aarch64 = nop link-aarch64 = ld4._EXTv2_ generate *const f64:float64x1x4_t aarch64 = ld4 generate *const f64:float64x2x4_t arm = vld4 link-arm = vld4._EXTpi82_ generate *const f32:float32x2x4_t, *const f32:float32x4x4_t /// Load single 4-element structure and replicate to all lanes of four registers name = vld4 out-dup-nox a = 0, 1, 1, 1, 1, 2, 4, 3, 5, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9 validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 load_fn arm-aarch64-separate aarch64 = ld4r link-aarch64 = ld4r._EXT2_ generate *const i64:int64x2x4_t arm = vld4 link-arm = vld4dup._EXTpi82_ generate *const i8:int8x8x4_t, *const i16:int16x4x4_t, *const i32:int32x2x4_t generate *const i8:int8x16x4_t, *const i16:int16x8x4_t, *const i32:int32x4x4_t arm = nop generate *const i64:int64x1x4_t /// Load single 4-element structure and replicate to all lanes of four registers name = vld4 out-dup-nox multi_fn = transmute, {vld4-outsigneddupnox-noext, transmute(a)} a = 0, 1, 1, 1, 1, 2, 4, 3, 5, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9 validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 load_fn aarch64 = ld4r generate *const u64:uint64x2x4_t target = aes generate *const p64:poly64x2x4_t target = default arm = vld4 generate *const u8:uint8x8x4_t, *const u16:uint16x4x4_t, *const u32:uint32x2x4_t generate *const u8:uint8x16x4_t, *const u16:uint16x8x4_t, *const u32:uint32x4x4_t generate *const p8:poly8x8x4_t, *const p16:poly16x4x4_t, *const p8:poly8x16x4_t, *const p16:poly16x8x4_t arm = nop generate *const u64:uint64x1x4_t target = aes generate *const p64:poly64x1x4_t /// Load single 4-element structure and replicate to all lanes of four registers name = vld4 out-dup-nox a = 0., 1., 1., 1., 1., 6., 4., 3., 5., 7., 4., 3., 5., 8., 4., 3., 5., 9., 4., 3., 5. validate 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1. 
load_fn arm-aarch64-separate aarch64 = ld4r link-aarch64 = ld4r._EXT2_ generate *const f64:float64x1x4_t, *const f64:float64x2x4_t arm = vld4 link-arm = vld4dup._EXTpi82_ generate *const f32:float32x2x4_t, *const f32:float32x4x4_t /// Load multiple 4-element structures to four registers name = vld4 out-lane-nox multi_fn = static_assert_imm-in_exp_len-LANE constn = LANE a = 0, 1, 2, 2, 2, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16 b = 0, 2, 2, 2, 2, 16, 2, 18, 2, 20, 21, 22, 2, 24, 25, 26, 11, 12, 13, 14, 15, 16, 2, 18, 2, 20, 21, 22, 23, 24, 25, 26, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16 n = 0 validate 1, 2, 2, 2, 2, 16, 2, 18, 2, 20, 21, 22, 2, 24, 25, 26, 2, 12, 13, 14, 15, 16, 2, 18, 2, 20, 21, 22, 23, 24, 25, 26, 2, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 2, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16 load_fn arm-aarch64-separate aarch64 = ld4 const-aarch64 = LANE link-aarch64 = ld4lane._EXTpi82_ generate *const i8:int8x16x4_t:int8x16x4_t, *const i64:int64x1x4_t:int64x1x4_t, *const i64:int64x2x4_t:int64x2x4_t arm = vld4 const-arm = LANE link-arm = vld4lane._EXTpi82_ generate *const i8:int8x8x4_t:int8x8x4_t, *const i16:int16x4x4_t:int16x4x4_t, *const i32:int32x2x4_t:int32x2x4_t generate *const i16:int16x8x4_t:int16x8x4_t, *const i32:int32x4x4_t:int32x4x4_t /// Load multiple 4-element structures to four registers name = vld4 out-lane-nox multi_fn = static_assert_imm-in_exp_len-LANE multi_fn = transmute, {vld4-outsignedlanenox-::, transmute(a), transmute(b)} constn = LANE a = 0, 1, 2, 2, 2, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16 b = 0, 2, 2, 2, 2, 16, 2, 18, 2, 20, 21, 22, 2, 24, 25, 26, 11, 12, 13, 14, 15, 16, 2, 18, 2, 20, 21, 22, 23, 24, 25, 26, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16 n = 0 validate 1, 2, 2, 2, 2, 16, 2, 18, 2, 20, 21, 22, 2, 24, 25, 26, 2, 12, 13, 14, 15, 16, 2, 18, 2, 20, 21, 22, 23, 24, 25, 26, 2, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 2, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16 load_fn aarch64 = ld4 const-aarch64 = LANE target = aes generate *const p64:poly64x1x4_t:poly64x1x4_t, *const p64:poly64x2x4_t:poly64x2x4_t target = default generate *const p8:poly8x16x4_t:poly8x16x4_t, *const u8:uint8x16x4_t:uint8x16x4_t, *const u64:uint64x1x4_t:uint64x1x4_t, *const u64:uint64x2x4_t:uint64x2x4_t arm = vld4 const-arm = LANE generate *const u8:uint8x8x4_t:uint8x8x4_t, *const u16:uint16x4x4_t:uint16x4x4_t, *const u32:uint32x2x4_t:uint32x2x4_t generate *const u16:uint16x8x4_t:uint16x8x4_t, *const u32:uint32x4x4_t:uint32x4x4_t generate *const p8:poly8x8x4_t:poly8x8x4_t, *const p16:poly16x4x4_t:poly16x4x4_t generate *const p16:poly16x8x4_t:poly16x8x4_t /// Load multiple 4-element structures to four registers name = vld4 out-lane-nox multi_fn = static_assert_imm-in_exp_len-LANE constn = LANE a = 0., 1., 2., 2., 2., 5., 6., 7., 8., 5., 6., 7., 8., 1., 4., 3., 5. b = 0., 2., 2., 2., 2., 16., 2., 18., 5., 6., 7., 8., 1., 4., 3., 5. n = 0 validate 1., 2., 2., 2., 2., 16., 2., 18., 2., 6., 7., 8., 2., 4., 3., 5. 
load_fn arm-aarch64-separate aarch64 = ld4 const-aarch64 = LANE link-aarch64 = ld4lane._EXTpi82_ generate *const f64:float64x1x4_t:float64x1x4_t, *const f64:float64x2x4_t:float64x2x4_t arm = vld4 const-arm = LANE link-arm = vld4lane._EXTpi82_ generate *const f32:float32x2x4_t:float32x2x4_t, *const f32:float32x4x4_t:float32x4x4_t /// Store multiple single-element structures from one, two, three, or four registers name = vst1 in1-lane-nox multi_fn = static_assert_imm-in_exp_len-LANE multi_fn = *a, {simd_extract, b, LANE as u32} constn = LANE a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 n = 0 validate 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 store_fn aarch64 = nop arm = nop generate *mut i8:int8x8_t:void, *mut i16:int16x4_t:void, *mut i32:int32x2_t:void, *mut i64:int64x1_t:void generate *mut i8:int8x16_t:void, *mut i16:int16x8_t:void, *mut i32:int32x4_t:void, *mut i64:int64x2_t:void generate *mut u8:uint8x8_t:void, *mut u16:uint16x4_t:void, *mut u32:uint32x2_t:void, *mut u64:uint64x1_t:void generate *mut u8:uint8x16_t:void, *mut u16:uint16x8_t:void, *mut u32:uint32x4_t:void, *mut u64:uint64x2_t:void generate *mut p8:poly8x8_t:void, *mut p16:poly16x4_t:void, *mut p8:poly8x16_t:void, *mut p16:poly16x8_t:void target = aes generate *mut p64:poly64x1_t:void, *mut p64:poly64x2_t:void /// Store multiple single-element structures from one, two, three, or four registers name = vst1 in1-lane-nox multi_fn = static_assert_imm-in_exp_len-LANE multi_fn = *a, {simd_extract, b, LANE as u32} constn = LANE a = 0., 1., 2., 3., 4., 5., 6., 7., 8. n = 0 validate 1., 0., 0., 0., 0., 0., 0., 0. store_fn aarch64 = nop generate *mut f64:float64x1_t:void, *mut f64:float64x2_t:void arm = nop generate *mut f32:float32x2_t:void, *mut f32:float32x4_t:void /// Store multiple single-element structures from one, two, three, or four registers name = vst1 a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 store_fn arm-aarch64-separate aarch64 = st1 link-aarch64 = st1x2._EXT3_ arm = vst1 link-arm = vst1x2._EXTr3_ generate *mut i8:int8x8x2_t:void, *mut i16:int16x4x2_t:void, *mut i32:int32x2x2_t:void, *mut i64:int64x1x2_t:void generate *mut i8:int8x16x2_t:void, *mut i16:int16x8x2_t:void, *mut i32:int32x4x2_t:void, *mut i64:int64x2x2_t:void link-aarch64 = st1x3._EXT3_ link-arm = vst1x3._EXTr3_ generate *mut i8:int8x8x3_t:void, *mut i16:int16x4x3_t:void, *mut i32:int32x2x3_t:void, *mut i64:int64x1x3_t:void generate *mut i8:int8x16x3_t:void, *mut i16:int16x8x3_t:void, *mut i32:int32x4x3_t:void, *mut i64:int64x2x3_t:void link-aarch64 = st1x4._EXT3_ link-arm = vst1x4._EXTr3_ generate *mut i8:int8x8x4_t:void, *mut i16:int16x4x4_t:void, *mut i32:int32x2x4_t:void, *mut i64:int64x1x4_t:void generate *mut i8:int8x16x4_t:void, *mut i16:int16x8x4_t:void, *mut i32:int32x4x4_t:void, *mut i64:int64x2x4_t:void /// Store multiple single-element structures to one, two, three, or four registers name = vst1 multi_fn = vst1-signed-noext, transmute(a), transmute(b) a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 
4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 store_fn aarch64 = st1 arm = vst1 generate *mut u8:uint8x8x2_t:void, *mut u16:uint16x4x2_t:void, *mut u32:uint32x2x2_t:void, *mut u64:uint64x1x2_t:void generate *mut u8:uint8x16x2_t:void, *mut u16:uint16x8x2_t:void, *mut u32:uint32x4x2_t:void, *mut u64:uint64x2x2_t:void generate *mut u8:uint8x8x3_t:void, *mut u16:uint16x4x3_t:void, *mut u32:uint32x2x3_t:void, *mut u64:uint64x1x3_t:void generate *mut u8:uint8x16x3_t:void, *mut u16:uint16x8x3_t:void, *mut u32:uint32x4x3_t:void, *mut u64:uint64x2x3_t:void generate *mut u8:uint8x8x4_t:void, *mut u16:uint16x4x4_t:void, *mut u32:uint32x2x4_t:void, *mut u64:uint64x1x4_t:void generate *mut u8:uint8x16x4_t:void, *mut u16:uint16x8x4_t:void, *mut u32:uint32x4x4_t:void, *mut u64:uint64x2x4_t:void generate *mut p8:poly8x8x2_t:void, *mut p8:poly8x8x3_t:void, *mut p8:poly8x8x4_t:void generate *mut p8:poly8x16x2_t:void, *mut p8:poly8x16x3_t:void, *mut p8:poly8x16x4_t:void generate *mut p16:poly16x4x2_t:void, *mut p16:poly16x4x3_t:void, *mut p16:poly16x4x4_t:void generate *mut p16:poly16x8x2_t:void, *mut p16:poly16x8x3_t:void, *mut p16:poly16x8x4_t:void target = aes generate *mut p64:poly64x1x2_t:void arm = nop generate *mut p64:poly64x1x3_t:void, *mut p64:poly64x1x4_t:void generate *mut p64:poly64x2x2_t:void, *mut p64:poly64x2x3_t:void, *mut p64:poly64x2x4_t:void /// Store multiple single-element structures to one, two, three, or four registers name = vst1 a = 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16. validate 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16. 
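// Note (editorial illustration, not generator input): the vst1_*_x2/_x3/_x4 forms are the
// stores matching vld1_*_xN: they write two, three or four vectors to consecutive memory
// without interleaving; assuming the generated name vst1_u8_x2, `vst1_u8_x2(ptr, v);` writes
// v.0 to ptr[0..8] and v.1 to ptr[8..16].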
store_fn arm-aarch64-separate aarch64 = st1 link-aarch64 = st1x2._EXT3_ generate *mut f64:float64x1x2_t:void, *mut f64:float64x2x2_t:void link-aarch64 = st1x3._EXT3_ generate *mut f64:float64x1x3_t:void, *mut f64:float64x2x3_t:void link-aarch64 = st1x4._EXT3_ generate *mut f64:float64x1x4_t:void, *mut f64:float64x2x4_t:void arm = vst1 link-aarch64 = st1x2._EXT3_ link-arm = vst1x2._EXTr3_ generate *mut f32:float32x2x2_t:void, *mut f32:float32x4x2_t:void link-aarch64 = st1x3._EXT3_ link-arm = vst1x3._EXTr3_ generate *mut f32:float32x2x3_t:void, *mut f32:float32x4x3_t:void link-aarch64 = st1x4._EXT3_ link-arm = vst1x4._EXTr3_ generate *mut f32:float32x2x4_t:void, *mut f32:float32x4x4_t:void /// Store multiple 2-element structures from two registers name = vst2 in1-nox a = 0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 validate 1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17 store_fn arm-aarch64-separate aarch64 = st2 link-aarch64 = st2._EXTpi8_ generate *mut i64:int64x2x2_t:void arm = vst2 link-arm = vst2._EXTpi8r_ generate *mut i8:int8x8x2_t:void, *mut i16:int16x4x2_t:void, *mut i32:int32x2x2_t:void generate *mut i8:int8x16x2_t:void, *mut i16:int16x8x2_t:void, *mut i32:int32x4x2_t:void arm = nop aarch64 = nop generate *mut i64:int64x1x2_t:void /// Store multiple 2-element structures from two registers name = vst2 multi_fn = transmute, {vst2-in1signednox-noext, transmute(a), transmute(b)} in1-nox a = 0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 validate 1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17 store_fn aarch64 = st2 generate *mut u64:uint64x2x2_t:void target = aes generate *mut p64:poly64x2x2_t:void target = default arm = vst2 generate *mut u8:uint8x8x2_t:void, *mut u16:uint16x4x2_t:void, *mut u32:uint32x2x2_t:void generate *mut u8:uint8x16x2_t:void, *mut u16:uint16x8x2_t:void, *mut u32:uint32x4x2_t:void generate *mut p8:poly8x8x2_t:void, *mut p16:poly16x4x2_t:void, *mut p8:poly8x16x2_t:void, *mut p16:poly16x8x2_t:void arm = nop aarch64 = nop generate *mut u64:uint64x1x2_t:void target = aes generate *mut p64:poly64x1x2_t:void /// Store multiple 2-element structures from two registers name = vst2 in1-nox a = 0., 1., 2., 2., 3., 2., 3., 4., 5., 2., 3., 4., 5., 6., 7., 8., 9. validate 1., 2., 2., 3., 2., 4., 3., 5., 2., 6., 3., 7., 4., 8., 5., 9. 
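// Note (editorial illustration, not generator input): vst2 is the inverse of vld2 and
// interleaves on the way out, storing v.0[0], v.1[0], v.0[1], v.1[1], ...; assuming the
// generated name vst2_u8, `vst2_u8(ptr, v);` writes 16 interleaved bytes.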
store_fn arm-aarch64-separate aarch64 = st1 link-aarch64 = st2._EXTpi8_ generate *mut f64:float64x1x2_t:void aarch64 = st2 generate *mut f64:float64x2x2_t:void arm = vst2 link-arm = vst2._EXTpi8r_ generate *mut f32:float32x2x2_t:void, *mut f32:float32x4x2_t:void /// Store multiple 2-element structures from two registers name = vst2 in1-lane-nox constn = LANE multi_fn = static_assert_imm-in_exp_len-LANE a = 0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 n = 0 validate 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 store_fn arm-aarch64-separate aarch64 = st2 link-aarch64 = st2lane._EXTpi8_ const-aarch64 = LANE generate *mut i8:int8x16x2_t:void, *mut i64:int64x1x2_t:void, *mut i64:int64x2x2_t:void arm = vst2 link-arm = vst2lane._EXTpi8r_ const-arm = LANE generate *mut i8:int8x8x2_t:void, *mut i16:int16x4x2_t:void, *mut i32:int32x2x2_t:void generate *mut i16:int16x8x2_t:void, *mut i32:int32x4x2_t:void /// Store multiple 2-element structures from two registers name = vst2 in1-lane-nox constn = LANE multi_fn = static_assert_imm-in_exp_len-LANE multi_fn = transmute, {vst2-in1signedlanenox-::, transmute(a), transmute(b)} a = 0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 n = 0 validate 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 store_fn aarch64 = st2 generate *mut u8:uint8x16x2_t:void, *mut u64:uint64x1x2_t:void, *mut u64:uint64x2x2_t:void, *mut p8:poly8x16x2_t:void target = aes generate *mut p64:poly64x1x2_t:void, *mut p64:poly64x2x2_t:void target = default arm = vst2 generate *mut u8:uint8x8x2_t:void, *mut u16:uint16x4x2_t:void, *mut u32:uint32x2x2_t:void generate *mut u16:uint16x8x2_t:void, *mut u32:uint32x4x2_t:void generate *mut p8:poly8x8x2_t:void, *mut p16:poly16x4x2_t:void, *mut p16:poly16x8x2_t:void /// Store multiple 2-element structures from two registers name = vst2 in1-lane-nox constn = LANE multi_fn = static_assert_imm-in_exp_len-LANE a = 0., 1., 2., 2., 3., 2., 3., 4., 5., 2., 3., 4., 5., 6., 7., 8., 9. n = 0 validate 1., 2., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0. 
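// Note (editorial illustration, not generator input): the in1-lane-nox (ST2 single
// structure) form stores only lane LANE of each register as one 2-element structure, so just
// two elements are written, which matches the mostly-zero expected values above.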
store_fn arm-aarch64-separate aarch64 = st2 link-aarch64 = st2lane._EXTpi8_ const-aarch64 = LANE generate *mut f64:float64x1x2_t:void, *mut f64:float64x2x2_t:void arm = vst2 link-arm = vst2lane._EXTpi8r_ const-arm = LANE generate *mut f32:float32x2x2_t:void, *mut f32:float32x4x2_t:void /// Store multiple 3-element structures from three registers name = vst3 in1-nox a = 0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32, 2, 4, 7, 8, 13, 14, 15, 16, 41, 42, 43, 44, 45, 46, 47, 48 validate 1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16, 2, 25, 41, 4, 26, 42, 7, 27, 43, 8, 28, 44, 13, 29, 45, 14, 30, 46, 15, 31, 47, 16, 32, 48 store_fn arm-aarch64-separate aarch64 = st3 link-aarch64 = st3._EXTpi8_ generate *mut i64:int64x2x3_t:void arm = vst3 link-arm = vst3._EXTpi8r_ generate *mut i8:int8x8x3_t:void, *mut i16:int16x4x3_t:void, *mut i32:int32x2x3_t:void generate *mut i8:int8x16x3_t:void, *mut i16:int16x8x3_t:void, *mut i32:int32x4x3_t:void arm = nop aarch64 = nop generate *mut i64:int64x1x3_t:void /// Store multiple 3-element structures from three registers name = vst3 multi_fn = transmute, {vst3-in1signednox-noext, transmute(a), transmute(b)} in1-nox a = 0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32, 2, 4, 7, 8, 13, 14, 15, 16, 41, 42, 43, 44, 45, 46, 47, 48 validate 1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16, 2, 25, 41, 4, 26, 42, 7, 27, 43, 8, 28, 44, 13, 29, 45, 14, 30, 46, 15, 31, 47, 16, 32, 48 store_fn aarch64 = st3 generate *mut u64:uint64x2x3_t:void target = aes generate *mut p64:poly64x2x3_t:void target = default arm = vst3 generate *mut u8:uint8x8x3_t:void, *mut u16:uint16x4x3_t:void, *mut u32:uint32x2x3_t:void generate *mut u8:uint8x16x3_t:void, *mut u16:uint16x8x3_t:void, *mut u32:uint32x4x3_t:void generate *mut p8:poly8x8x3_t:void, *mut p16:poly16x4x3_t:void, *mut p8:poly8x16x3_t:void, *mut p16:poly16x8x3_t:void arm = nop aarch64 = nop generate *mut u64:uint64x1x3_t:void target = aes generate *mut p64:poly64x1x3_t:void /// Store multiple 3-element structures from three registers name = vst3 in1-nox a = 0., 1., 2., 2., 4., 2., 4., 7., 8., 2., 4., 7., 8., 13., 14., 15., 16 validate 1., 2., 2., 2., 4., 4., 2., 7., 7., 4., 8., 8., 2., 13., 13., 4. 
store_fn arm-aarch64-separate aarch64 = nop link-aarch64 = st3._EXTpi8_ generate *mut f64:float64x1x3_t:void aarch64 = st3 generate *mut f64:float64x2x3_t:void arm = vst3 link-arm = vst3._EXTpi8r_ generate *mut f32:float32x2x3_t:void, *mut f32:float32x4x3_t:void /// Store multiple 3-element structures from three registers name = vst3 in1-lane-nox constn = LANE multi_fn = static_assert_imm-in_exp_len-LANE a = 0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32, 2, 4, 7, 8, 13, 14, 15, 16, 41, 42, 43, 44, 45, 46, 47, 48 n = 0 validate 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 store_fn arm-aarch64-separate aarch64 = st3 link-aarch64 = st3lane._EXTpi8_ const-aarch64 = LANE generate *mut i8:int8x16x3_t:void, *mut i64:int64x1x3_t:void, *mut i64:int64x2x3_t:void arm = vst3 link-arm = vst3lane._EXTpi8r_ const-arm = LANE generate *mut i8:int8x8x3_t:void, *mut i16:int16x4x3_t:void, *mut i32:int32x2x3_t:void generate *mut i16:int16x8x3_t:void, *mut i32:int32x4x3_t:void /// Store multiple 3-element structures from three registers name = vst3 in1-lane-nox constn = LANE multi_fn = static_assert_imm-in_exp_len-LANE multi_fn = transmute, {vst3-in1signedlanenox-::, transmute(a), transmute(b)} a = 0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32, 2, 4, 7, 8, 13, 14, 15, 16, 41, 42, 43, 44, 45, 46, 47, 48 n = 0 validate 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 store_fn aarch64 = st3 generate *mut u8:uint8x16x3_t:void, *mut u64:uint64x1x3_t:void, *mut u64:uint64x2x3_t:void, *mut p8:poly8x16x3_t:void target = aes generate *mut p64:poly64x1x3_t:void, *mut p64:poly64x2x3_t:void target = default arm = vst3 generate *mut u8:uint8x8x3_t:void, *mut u16:uint16x4x3_t:void, *mut u32:uint32x2x3_t:void generate *mut u16:uint16x8x3_t:void, *mut u32:uint32x4x3_t:void generate *mut p8:poly8x8x3_t:void, *mut p16:poly16x4x3_t:void, *mut p16:poly16x8x3_t:void /// Store multiple 3-element structures from three registers name = vst3 in1-lane-nox constn = LANE multi_fn = static_assert_imm-in_exp_len-LANE a = 0., 1., 2., 2., 3., 2., 3., 4., 5., 2., 3., 4., 5., 6., 7., 8., 9. n = 0 validate 1., 2., 2., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0. 
store_fn arm-aarch64-separate aarch64 = st3 link-aarch64 = st3lane._EXTpi8_ const-aarch64 = LANE generate *mut f64:float64x1x3_t:void, *mut f64:float64x2x3_t:void arm = vst3 link-arm = vst3lane._EXTpi8r_ const-arm = LANE generate *mut f32:float32x2x3_t:void, *mut f32:float32x4x3_t:void /// Store multiple 4-element structures from four registers name = vst4 in1-nox a = 0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64 validate 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64 store_fn arm-aarch64-separate aarch64 = st4 link-aarch64 = st4._EXTpi8_ generate *mut i64:int64x2x4_t:void arm = vst4 link-arm = vst4._EXTpi8r_ generate *mut i8:int8x8x4_t:void, *mut i16:int16x4x4_t:void, *mut i32:int32x2x4_t:void generate *mut i8:int8x16x4_t:void, *mut i16:int16x8x4_t:void, *mut i32:int32x4x4_t:void arm = nop aarch64 = nop generate *mut i64:int64x1x4_t:void /// Store multiple 4-element structures from four registers name = vst4 multi_fn = transmute, {vst4-in1signednox-noext, transmute(a), transmute(b)} in1-nox a = 0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64 validate 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64 store_fn aarch64 = st4 generate *mut u64:uint64x2x4_t:void target = aes generate *mut p64:poly64x2x4_t:void target = default arm = vst4 generate *mut u8:uint8x8x4_t:void, *mut u16:uint16x4x4_t:void, *mut u32:uint32x2x4_t:void generate *mut u8:uint8x16x4_t:void, *mut u16:uint16x8x4_t:void, *mut u32:uint32x4x4_t:void generate *mut p8:poly8x8x4_t:void, *mut p16:poly16x4x4_t:void, *mut p8:poly8x16x4_t:void, *mut p16:poly16x8x4_t:void arm = nop aarch64 = nop generate *mut u64:uint64x1x4_t:void target = aes generate *mut p64:poly64x1x4_t:void /// Store multiple 4-element structures from four registers name = vst4 in1-nox a = 0., 1., 2., 2., 6., 2., 6., 6., 8., 2., 6., 6., 8., 6., 8., 8., 16. validate 1., 2., 2., 6., 2., 6., 6., 8., 2., 6., 6., 8., 6., 8., 8., 16. 
store_fn arm-aarch64-separate aarch64 = nop link-aarch64 = st4._EXTpi8_ generate *mut f64:float64x1x4_t:void aarch64 = st4 generate *mut f64:float64x2x4_t:void arm = vst4 link-arm = vst4._EXTpi8r_ generate *mut f32:float32x2x4_t:void, *mut f32:float32x4x4_t:void /// Store multiple 4-element structures from four registers name = vst4 in1-lane-nox constn = LANE multi_fn = static_assert_imm-in_exp_len-LANE a = 0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64 n = 0 validate 1, 2, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 store_fn arm-aarch64-separate aarch64 = st4 link-aarch64 = st4lane._EXTpi8_ const-aarch64 = LANE generate *mut i8:int8x16x4_t:void, *mut i64:int64x1x4_t:void, *mut i64:int64x2x4_t:void arm = vst4 link-arm = vst4lane._EXTpi8r_ const-arm = LANE generate *mut i8:int8x8x4_t:void, *mut i16:int16x4x4_t:void, *mut i32:int32x2x4_t:void generate *mut i16:int16x8x4_t:void, *mut i32:int32x4x4_t:void /// Store multiple 4-element structures from four registers name = vst4 in1-lane-nox constn = LANE multi_fn = static_assert_imm-in_exp_len-LANE multi_fn = transmute, {vst4-in1signedlanenox-::, transmute(a), transmute(b)} a = 0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64 n = 0 validate 1, 2, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 store_fn aarch64 = st4 generate *mut u8:uint8x16x4_t:void, *mut u64:uint64x1x4_t:void, *mut u64:uint64x2x4_t:void, *mut p8:poly8x16x4_t:void target = aes generate *mut p64:poly64x1x4_t:void, *mut p64:poly64x2x4_t:void target = default arm = vst4 generate *mut u8:uint8x8x4_t:void, *mut u16:uint16x4x4_t:void, *mut u32:uint32x2x4_t:void generate *mut u16:uint16x8x4_t:void, *mut u32:uint32x4x4_t:void generate *mut p8:poly8x8x4_t:void, *mut p16:poly16x4x4_t:void, *mut p16:poly16x8x4_t:void /// Store multiple 4-element structures from four registers name = vst4 in1-lane-nox constn = LANE multi_fn = static_assert_imm-in_exp_len-LANE a = 0., 1., 2., 2., 6., 2., 6., 6., 8., 2., 6., 6., 8., 6., 8., 8., 16. n = 0 validate 1., 2., 2., 6., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0. 
store_fn arm-aarch64-separate aarch64 = st4 link-aarch64 = st4lane._EXTpi8_ const-aarch64 = LANE generate *mut f64:float64x1x4_t:void, *mut f64:float64x2x4_t:void arm = vst4 link-arm = vst4lane._EXTpi8r_ const-arm = LANE generate *mut f32:float32x2x4_t:void, *mut f32:float32x4x4_t:void /// Dot product index form with signed and unsigned integers name = vsudot out-lane-suffixes constn = LANE multi_fn = static_assert_imm-in2_dot-LANE multi_fn = simd_shuffle-in_len-!, c:unsigned, c, c, {base-4-LANE} multi_fn = vsudot-outlane-_, a, b, c a = 1, 2, 1, 2 b = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 c = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 n = 0 validate 31, 72, 31, 72 target = dotprod aarch64 = sudot link-aarch64 = usdot._EXT2_._EXT4_:int32x2_t:int8x8_t:uint8x8_t:int32x2_t // LLVM ERROR: Cannot select: intrinsic %llvm.aarch64.neon.usdot //generate int32x2_t:int8x8_t:uint8x8_t:int32x2_t, int32x2_t:int8x8_t:uint8x16_t:int32x2_t link-aarch64 = usdot._EXT2_._EXT4_:int32x4_t:int8x16_t:uint8x16_t:int32x4_t // LLVM ERROR: Cannot select: intrinsic %llvm.aarch64.neon.usdot //generate int32x4_t:int8x16_t:uint8x8_t:int32x4_t, int32x4_t:int8x16_t:uint8x16_t:int32x4_t /// Multiply name = vmul a = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2 b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 validate 1, 4, 3, 8, 5, 12, 7, 16, 9, 20, 11, 24, 13, 28, 15, 32 arm = vmul. aarch64 = mul fn = simd_mul generate int*_t, uint*_t /// Polynomial multiply name = vmul a = 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3 b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 validate 1, 6, 3, 12, 5, 10, 7, 24, 9, 30, 11, 20, 13, 18, 15, 48 aarch64 = pmul link-aarch64 = pmul._EXT_ arm = vmul link-arm = vmulp._EXT_ generate poly8x8_t, poly8x16_t /// Multiply name = vmul fn = simd_mul a = 1.0, 2.0, 1.0, 2.0 b = 2.0, 3.0, 4.0, 5.0 validate 2.0, 6.0, 4.0, 10.0 aarch64 = fmul generate float64x*_t arm = vmul. generate float*_t /// Vector multiply by scalar name = vmul out-n-suffix multi_fn = simd_mul, a, {vdup-nout-noext, b} a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 b = 2 validate 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32 arm = vmul aarch64 = mul generate int16x4_t:i16:int16x4_t, int16x8_t:i16:int16x8_t, int32x2_t:i32:int32x2_t, int32x4_t:i32:int32x4_t generate uint16x4_t:u16:uint16x4_t, uint16x8_t:u16:uint16x8_t, uint32x2_t:u32:uint32x2_t, uint32x4_t:u32:uint32x4_t /// Vector multiply by scalar name = vmul out-n-suffix multi_fn = simd_mul, a, {vdup-nout-noext, b} a = 1., 2., 3., 4. b = 2. validate 2., 4., 6., 8. 
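// Note: the `_n` forms broadcast the scalar operand and then perform an ordinary
// lane-wise multiply; roughly, vmul_n_f32(a, 2.0) behaves like
// vmul_f32(a, vdup_n_f32(2.0)), which is what the multi_fn above expands to.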
aarch64 = fmul generate float64x1_t:f64:float64x1_t, float64x2_t:f64:float64x2_t arm = vmul generate float32x2_t:f32:float32x2_t, float32x4_t:f32:float32x4_t /// Multiply name = vmul lane-suffixes constn = LANE multi_fn = static_assert_imm-in_exp_len-LANE multi_fn = simd_mul, a, {simd_shuffle-out_len-!, b, b, {dup-out_len-LANE as u32}} a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 b = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 n = 1 validate 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32 aarch64 = mul arm = vmul generate int16x4_t, int16x4_t:int16x8_t:int16x4_t, int16x8_t:int16x4_t:int16x8_t, int16x8_t generate int32x2_t, int32x2_t:int32x4_t:int32x2_t, int32x4_t:int32x2_t:int32x4_t, int32x4_t generate uint16x4_t, uint16x4_t:uint16x8_t:uint16x4_t, uint16x8_t:uint16x4_t:uint16x8_t, uint16x8_t generate uint32x2_t, uint32x2_t:uint32x4_t:uint32x2_t, uint32x4_t:uint32x2_t:uint32x4_t, uint32x4_t /// Floating-point multiply name = vmul lane-suffixes constn = LANE multi_fn = static_assert_imm-in_exp_len-LANE multi_fn = simd_mul, a, {transmute--, {simd_extract, b, LANE as u32}} a = 1., 2., 3., 4. b = 2., 0., 0., 0. n = 0 validate 2., 4., 6., 8. aarch64 = fmul generate float64x1_t, float64x1_t:float64x2_t:float64x1_t /// Floating-point multiply name = vmul lane-suffixes constn = LANE multi_fn = static_assert_imm-in_exp_len-LANE multi_fn = simd_mul, a, {simd_shuffle-out_len-!, b, b, {dup-out_len-LANE as u32}} a = 1., 2., 3., 4. b = 2., 0., 0., 0. n = 0 validate 2., 4., 6., 8. aarch64 = fmul generate float64x2_t:float64x1_t:float64x2_t, float64x2_t arm = vmul generate float32x2_t, float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x2_t:float32x4_t, float32x4_t /// Floating-point multiply name = vmuls_lane constn = LANE multi_fn = static_assert_imm-in_exp_len-LANE multi_fn = simd_extract, b:f32, b, LANE as u32 multi_fn = a * b a = 1. b = 2., 0., 0., 0. n = 0 validate 2. aarch64 = fmul generate f32:float32x2_t:f32, f32:float32x4_t:f32 /// Floating-point multiply name = vmuld_lane constn = LANE multi_fn = static_assert_imm-in_exp_len-LANE multi_fn = simd_extract, b:f64, b, LANE as u32 multi_fn = a * b a = 1. b = 2., 0. n = 0 validate 2. 
aarch64 = fmul generate f64:float64x1_t:f64, f64:float64x2_t:f64 /// Signed multiply long name = vmull a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2 validate 1, 4, 3, 8, 5, 12, 7, 16, 9, 20, 11, 24, 13, 28, 15, 32 arm = vmull.s aarch64 = smull link-arm = vmulls._EXT_ link-aarch64 = smull._EXT_ generate int8x8_t:int8x8_t:int16x8_t, int16x4_t:int16x4_t:int32x4_t, int32x2_t:int32x2_t:int64x2_t /// Signed multiply long name = vmull_high no-q multi_fn = simd_shuffle-out_len-!, a:half, a, a, {fixed-half-right} multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right} multi_fn = vmull-noqself-noext, a, b a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16 b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2 fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 validate 9, 20, 11, 24, 13, 28, 15, 32 aarch64 = smull2 generate int8x16_t:int8x16_t:int16x8_t, int16x8_t:int16x8_t:int32x4_t, int32x4_t:int32x4_t:int64x2_t /// Unsigned multiply long name = vmull a = 1, 2, 3, 4, 5, 6, 7, 8 b = 1, 2, 1, 2, 1, 2, 1, 2 validate 1, 4, 3, 8, 5, 12, 7, 16 arm = vmull.s aarch64 = umull link-arm = vmullu._EXT_ link-aarch64 = umull._EXT_ generate uint8x8_t:uint8x8_t:uint16x8_t, uint16x4_t:uint16x4_t:uint32x4_t, uint32x2_t:uint32x2_t:uint64x2_t /// Unsigned multiply long name = vmull_high no-q multi_fn = simd_shuffle-out_len-!, a:half, a, a, {fixed-half-right} multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right} multi_fn = vmull-noqself-noext, a, b a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16 b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2 fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 validate 9, 20, 11, 24, 13, 28, 15, 32 aarch64 = umull2 generate uint8x16_t:uint8x16_t:uint16x8_t, uint16x8_t:uint16x8_t:uint32x4_t, uint32x4_t:uint32x4_t:uint64x2_t /// Polynomial multiply long name = vmull a = 1, 2, 3, 4, 5, 6, 7, 8 b = 1, 3, 1, 3, 1, 3, 1, 3 validate 1, 6, 3, 12, 5, 10, 7, 24 arm = vmull.s aarch64 = pmull link-arm = vmullp._EXT_ link-aarch64 = pmull._EXT_ generate poly8x8_t:poly8x8_t:poly16x8_t /// Polynomial multiply long name = vmull no-q a = 15 b = 3 validate 17 target = aes aarch64 = pmull link-aarch64 = pmull64:p64:p64:p64:int8x16_t // Because of the support status of llvm, vmull_p64 is currently only available on arm // arm = vmull // link-arm = vmullp.v2i64:int64x1_t:int64x1_t:int64x1_t:int64x2_t generate p64:p64:p128 /// Polynomial multiply long name = vmull_high no-q multi_fn = simd_shuffle-out_len-!, a:half, a, a, {fixed-half-right} multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right} multi_fn = vmull-noqself-noext, a, b a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16 b = 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3 fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 validate 9, 30, 11, 20, 13, 18, 15, 48 aarch64 = pmull generate poly8x16_t:poly8x16_t:poly16x8_t /// Polynomial multiply long name = vmull_high no-q multi_fn = vmull-noqself-noext, {simd_extract, a, 1}, {simd_extract, b, 1} a = 1, 15 b = 1, 3 validate 17 target = aes aarch64 = pmull generate poly64x2_t:poly64x2_t:p128 /// Vector long multiply with scalar name = vmull_n no-q multi_fn = vmull-in0-noext, a, {vdup-nin0-noext, b} a = 1, 2, 3, 4, 5, 6, 7, 8 b = 2 validate 2, 4, 6, 8, 10, 12, 14, 16 arm = vmull aarch64 = smull generate int16x4_t:i16:int32x4_t, int32x2_t:i32:int64x2_t aarch64 = umull generate uint16x4_t:u16:uint32x4_t, uint32x2_t:u32:uint64x2_t /// Vector long 
multiply by scalar name = vmull_lane constn = LANE multi_fn = static_assert_imm-in_exp_len-LANE multi_fn = vmull-in0-noext, a, {simd_shuffle-in0_len-!, b, b, {dup-in0_len-LANE as u32}} a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 b = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 n = 1 validate 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32 arm = vmull aarch64 = smull generate int16x4_t:int16x4_t:int32x4_t, int16x4_t:int16x8_t:int32x4_t generate int32x2_t:int32x2_t:int64x2_t, int32x2_t:int32x4_t:int64x2_t aarch64 = umull generate uint16x4_t:uint16x4_t:uint32x4_t, uint16x4_t:uint16x8_t:uint32x4_t generate uint32x2_t:uint32x2_t:uint64x2_t, uint32x2_t:uint32x4_t:uint64x2_t /// Multiply long name = vmull_high_n no-q multi_fn = vmull_high-noqself-noext, a, {vdup-nin0-noext, b} a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16 b = 2 validate 18, 20, 22, 24, 26, 28, 30, 32 aarch64 = smull2 generate int16x8_t:i16:int32x4_t, int32x4_t:i32:int64x2_t aarch64 = umull2 generate uint16x8_t:u16:uint32x4_t, uint32x4_t:u32:uint64x2_t /// Multiply long name = vmull_high_lane constn = LANE multi_fn = static_assert_imm-in_exp_len-LANE multi_fn = vmull_high-noqself-noext, a, {simd_shuffle-in0_len-!, b, b, {dup-in0_len-LANE as u32}} a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16 b = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 n = 1 validate 18, 20, 22, 24, 26, 28, 30, 32 aarch64 = smull2 generate int16x8_t:int16x4_t:int32x4_t, int16x8_t:int16x8_t:int32x4_t generate int32x4_t:int32x2_t:int64x2_t, int32x4_t:int32x4_t:int64x2_t aarch64 = umull2 generate uint16x8_t:uint16x4_t:uint32x4_t, uint16x8_t:uint16x8_t:uint32x4_t generate uint32x4_t:uint32x2_t:uint64x2_t, uint32x4_t:uint32x4_t:uint64x2_t /// Floating-point multiply extended name = vmulx a = 1., 2., 3., 4. b = 2., 2., 2., 2. validate 2., 4., 6., 8. aarch64 = fmulx link-aarch64 = fmulx._EXT_ generate float*_t, float64x*_t /// Floating-point multiply extended name = vmulx lane-suffixes constn = LANE multi_fn = static_assert_imm-in_exp_len-LANE multi_fn = vmulx-in0-noext, a, {transmute--, {simd_extract, b, LANE as u32}} a = 1. b = 2., 0. n = 0 validate 2. aarch64 = fmulx generate float64x1_t, float64x1_t:float64x2_t:float64x1_t /// Floating-point multiply extended name = vmulx lane-suffixes constn = LANE multi_fn = static_assert_imm-in_exp_len-LANE multi_fn = vmulx-in0-noext, a, {simd_shuffle-in0_len-!, b, b, {dup-in0_len-LANE as u32}} a = 1., 2., 3., 4. b = 2., 0., 0., 0. n = 0 validate 2., 4., 6., 8. aarch64 = fmulx generate float32x2_t, float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x2_t:float32x4_t, float32x4_t generate float64x2_t:float64x1_t:float64x2_t, float64x2_t /// Floating-point multiply extended name = vmulx a = 2. b = 3. validate 6. aarch64 = fmulx link-aarch64 = fmulx._EXT_ generate f32, f64 /// Floating-point multiply extended name = vmulx lane-suffixes constn = LANE multi_fn = static_assert_imm-in_exp_len-LANE multi_fn = vmulx-out-noext, a, {simd_extract, b, LANE as u32} a = 2. b = 3., 0., 0., 0. n = 0 validate 6. 
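// Note: FMULX behaves like an ordinary floating-point multiply except that
// 0.0 * +/-infinity returns +/-2.0 instead of NaN, which is why vmulx gets its own
// LLVM link instead of reusing simd_mul.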
aarch64 = fmulx generate f32:float32x2_t:f32, f32:float32x4_t:f32, f64:float64x1_t:f64, f64:float64x2_t:f64 /// Floating-point fused Multiply-Add to accumulator(vector) name = vfma multi_fn = vfma-self-_, b, c, a a = 8.0, 18.0, 12.0, 10.0 b = 6.0, 4.0, 7.0, 8.0 c = 2.0, 3.0, 4.0, 5.0 validate 20.0, 30.0, 40.0, 50.0 link-aarch64 = llvm.fma._EXT_ aarch64 = fmadd generate float64x1_t aarch64 = fmla generate float64x2_t target = vfp4 arm = vfma link-arm = llvm.fma._EXT_ generate float*_t /// Floating-point fused Multiply-Add to accumulator(vector) name = vfma n-suffix multi_fn = vfma-self-noext, a, b, {vdup-nselfvfp4-noext, c} a = 2.0, 3.0, 4.0, 5.0 b = 6.0, 4.0, 7.0, 8.0 c = 8.0 validate 50.0, 35.0, 60.0, 69.0 aarch64 = fmadd generate float64x1_t:float64x1_t:f64:float64x1_t aarch64 = fmla generate float64x2_t:float64x2_t:f64:float64x2_t target = vfp4 arm = vfma generate float32x2_t:float32x2_t:f32:float32x2_t, float32x4_t:float32x4_t:f32:float32x4_t /// Floating-point fused multiply-add to accumulator name = vfma in2-lane-suffixes constn = LANE multi_fn = static_assert_imm-in2_exp_len-LANE multi_fn = vfma-out-noext, a, b, {vdup-nout-noext, {simd_extract, c, LANE as u32}} a = 2., 3., 4., 5. b = 6., 4., 7., 8. c = 2., 0., 0., 0. n = 0 validate 14., 11., 18., 21. aarch64 = fmla generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t aarch64 = fmadd generate float64x1_t aarch64 = fmla generate float64x1_t:float64x1_t:float64x2_t:float64x1_t, float64x2_t:float64x2_t:float64x1_t:float64x2_t, float64x2_t /// Floating-point fused multiply-add to accumulator name = vfma in2-lane-suffixes constn = LANE multi_fn = static_assert_imm-in2_exp_len-LANE multi_fn = simd_extract, c:out_t, c, LANE as u32 multi_fn = vfma-in2lane-_, b, c, a a = 2. b = 6. c = 3., 0., 0., 0. n = 0 validate 20. aarch64 = fmla link-aarch64 = llvm.fma._EXT_:f32:f32:f32:f32 generate f32:f32:float32x2_t:f32, f32:f32:float32x4_t:f32 link-aarch64 = llvm.fma._EXT_:f64:f64:f64:f64 aarch64 = fmadd generate f64:f64:float64x1_t:f64 aarch64 = fmla generate f64:f64:float64x2_t:f64 /// Floating-point fused multiply-subtract from accumulator name = vfms multi_fn = simd_neg, b:in_t, b multi_fn = vfma-self-noext, a, b, c a = 20.0, 30.0, 40.0, 50.0 b = 6.0, 4.0, 7.0, 8.0 c = 2.0, 3.0, 4.0, 5.0 validate 8.0, 18.0, 12.0, 10.0 aarch64 = fmsub generate float64x1_t aarch64 = fmls generate float64x2_t target = vfp4 arm = vfms generate float*_t /// Floating-point fused Multiply-subtract to accumulator(vector) name = vfms n-suffix multi_fn = vfms-self-noext, a, b, {vdup-nselfvfp4-noext, c} a = 50.0, 35.0, 60.0, 69.0 b = 6.0, 4.0, 7.0, 8.0 c = 8.0 validate 2.0, 3.0, 4.0, 5.0 aarch64 = fmsub generate float64x1_t:float64x1_t:f64:float64x1_t aarch64 = fmls generate float64x2_t:float64x2_t:f64:float64x2_t target = vfp4 arm = vfms generate float32x2_t:float32x2_t:f32:float32x2_t, float32x4_t:float32x4_t:f32:float32x4_t /// Floating-point fused multiply-subtract to accumulator name = vfms in2-lane-suffixes constn = LANE multi_fn = static_assert_imm-in2_exp_len-LANE multi_fn = vfms-out-noext, a, b, {vdup-nout-noext, {simd_extract, c, LANE as u32}} a = 14., 11., 18., 21. b = 6., 4., 7., 8. c = 2., 0., 0., 0. n = 0 validate 2., 3., 4., 5. 
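// Worked check: with LANE = 0 the selected scalar is c[0] = 2, so each lane computes
// a - b*2: 14 - 12 = 2, 11 - 8 = 3, 18 - 14 = 4, 21 - 16 = 5, matching the validate above.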
aarch64 = fmls generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t aarch64 = fmsub generate float64x1_t aarch64 = fmls generate float64x1_t:float64x1_t:float64x2_t:float64x1_t, float64x2_t:float64x2_t:float64x1_t:float64x2_t, float64x2_t /// Floating-point fused multiply-subtract to accumulator name = vfms in2-lane-suffixes constn = LANE multi_fn = vfma-in2lane-::, a, -b, c a = 14. b = 6. c = 2., 0., 0., 0. n = 0 validate 2. aarch64 = fmls generate f32:f32:float32x2_t:f32, f32:f32:float32x4_t:f32 aarch64 = fmsub generate f64:f64:float64x1_t:f64 aarch64 = fmls generate f64:f64:float64x2_t:f64 /// Divide name = vdiv fn = simd_div a = 2.0, 6.0, 4.0, 10.0 b = 1.0, 2.0, 1.0, 2.0 validate 2.0, 3.0, 4.0, 5.0 aarch64 = fdiv generate float*_t, float64x*_t /// Subtract name = vsub a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2 validate 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14 arm = vsub. aarch64 = sub fn = simd_sub generate int*_t, uint*_t, int64x*_t, uint64x*_t /// Subtract name = vsub fn = simd_sub a = 1.0, 4.0, 3.0, 8.0 b = 1.0, 2.0, 3.0, 4.0 validate 0.0, 2.0, 0.0, 4.0 aarch64 = fsub generate float64x*_t arm = vsub. generate float*_t /// Subtract name = vsub multi_fn = a.wrapping_sub(b) a = 3 b = 2 validate 1 aarch64 = nop generate i64, u64 /// Add name = vadd multi_fn = a.wrapping_add(b) a = 1 b = 2 validate 3 aarch64 = nop generate i64, u64 /// Bitwise exclusive OR name = vadd multi_fn = simd_xor, a, b a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 b = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 validate 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17 aarch64 = nop arm = nop generate poly8x8_t, poly16x4_t, poly8x16_t, poly16x8_t, poly64x1_t, poly64x2_t /// Bitwise exclusive OR name = vaddq no-q multi_fn = a ^ b a = 16 b = 1 validate 17 aarch64 = nop arm = nop generate p128 /// Floating-point add across vector name = vaddv a = 1., 2., 0., 0. validate 3. 
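// Note: AArch64 has no dedicated floating-point add-across-vector instruction, so the
// reduction is lowered to pairwise adds; the assembly check below therefore expects
// faddp rather than a hypothetical faddv.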
aarch64 = faddp link-aarch64 = faddv._EXT2_._EXT_ generate float32x2_t:f32, float32x4_t:f32, float64x2_t:f64 /// Signed Add Long across Vector name = vaddlv a = 1, 2, 3, 4 validate 10 aarch64 = saddlv link-aarch64 = llvm.aarch64.neon.saddlv.i32._EXT_ generate int16x4_t:i32 /// Signed Add Long across Vector name = vaddlv a = 1, 2, 3, 4, 5, 6, 7, 8 validate 36 aarch64 = saddlv link-aarch64 = llvm.aarch64.neon.saddlv.i32._EXT_ generate int16x8_t:i32 /// Signed Add Long across Vector name = vaddlv a = 1, 2 validate 3 aarch64 = saddlp link-aarch64 = llvm.aarch64.neon.saddlv.i64._EXT_ generate int32x2_t:i64 /// Signed Add Long across Vector name = vaddlv a = 1, 2, 3, 4 validate 10 aarch64 = saddlv link-aarch64 = llvm.aarch64.neon.saddlv.i64._EXT_ generate int32x4_t:i64 /// Unsigned Add Long across Vector name = vaddlv a = 1, 2, 3, 4 validate 10 aarch64 = uaddlv link-aarch64 = llvm.aarch64.neon.uaddlv.i32._EXT_ generate uint16x4_t:u32 /// Unsigned Add Long across Vector name = vaddlv a = 1, 2, 3, 4, 5, 6, 7, 8 validate 36 aarch64 = uaddlv link-aarch64 = llvm.aarch64.neon.uaddlv.i32._EXT_ generate uint16x8_t:u32 /// Unsigned Add Long across Vector name = vaddlv a = 1, 2 validate 3 aarch64 = uaddlp link-aarch64 = llvm.aarch64.neon.uaddlv.i64._EXT_ generate uint32x2_t:u64 /// Unsigned Add Long across Vector name = vaddlv a = 1, 2, 3, 4 validate 10 aarch64 = uaddlv link-aarch64 = llvm.aarch64.neon.uaddlv.i64._EXT_ generate uint32x4_t:u64 /// Subtract returning high narrow name = vsubhn no-q multi_fn = fixed, c:in_t multi_fn = simd_cast, {simd_shr, {simd_sub, a, b}, transmute(c)} a = MAX, MIN, 1, 1, MAX, MIN, 1, 1 b = 1, 0, 0, 0, 1, 0, 0, 0 fixed = HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS validate MAX, MIN, 0, 0, MAX, MIN, 0, 0 arm = vsubhn aarch64 = subhn generate int16x8_t:int8x8_t, int32x4_t:int16x4_t, int64x2_t:int32x2_t generate uint16x8_t:uint8x8_t, uint32x4_t:uint16x4_t, uint64x2_t:uint32x2_t /// Subtract returning high narrow name = vsubhn_high no-q multi_fn = vsubhn-noqself-noext, d:in_t0, b, c multi_fn = simd_shuffle-out_len-!, a, d, {asc-0-out_len} a = MAX, 0, MAX, 0, MAX, 0, MAX, 0 b = MAX, 1, MAX, 1, MAX, 1, MAX, 1 c = 1, 0, 1, 0, 1, 0, 1, 0 validate MAX, 0, MAX, 0, MAX, 0, MAX, 0, MAX, 0, MAX, 0, MAX, 0, MAX, 0 arm = vsubhn aarch64 = subhn2 generate int8x8_t:int16x8_t:int16x8_t:int8x16_t, int16x4_t:int32x4_t:int32x4_t:int16x8_t, int32x2_t:int64x2_t:int64x2_t:int32x4_t generate uint8x8_t:uint16x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint32x4_t:uint16x8_t, uint32x2_t:uint64x2_t:uint64x2_t:uint32x4_t /// Signed halving subtract name = vhsub a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2 validate 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7 arm = vhsub.s aarch64 = uhsub link-arm = vhsubu._EXT_ link-aarch64 = uhsub._EXT_ generate uint*_t arm = vhsub.s aarch64 = shsub link-arm = vhsubs._EXT_ link-aarch64 = shsub._EXT_ generate int*_t /// Signed Subtract Wide name = vsubw no-q multi_fn = simd_sub, a, {simd_cast, b} a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16 b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16 validate 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 arm = vsubw aarch64 = ssubw generate int16x8_t:int8x8_t:int16x8_t, int32x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int64x2_t /// Unsigned Subtract Wide name = vsubw no-q multi_fn = simd_sub, a, {simd_cast, b} a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 
14, 15, 16 b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16 validate 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 arm = vsubw aarch64 = usubw generate uint16x8_t:uint8x8_t:uint16x8_t, uint32x4_t:uint16x4_t:uint32x4_t, uint64x2_t:uint32x2_t:uint64x2_t /// Signed Subtract Wide name = vsubw_high no-q multi_fn = simd_shuffle8!, c:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15] multi_fn = simd_sub, a, {simd_cast, c} a = 8, 9, 10, 12, 13, 14, 15, 16 b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16 validate 0, 0, 0, 0, 0, 0, 0, 0 aarch64 = ssubw generate int16x8_t:int8x16_t:int16x8_t /// Signed Subtract Wide name = vsubw_high no-q multi_fn = simd_shuffle4!, c:int16x4_t, b, b, [4, 5, 6, 7] multi_fn = simd_sub, a, {simd_cast, c} a = 8, 9, 10, 11 b = 0, 1, 2, 3, 8, 9, 10, 11 validate 0, 0, 0, 0 aarch64 = ssubw generate int32x4_t:int16x8_t:int32x4_t /// Signed Subtract Wide name = vsubw_high no-q multi_fn = simd_shuffle2!, c:int32x2_t, b, b, [2, 3] multi_fn = simd_sub, a, {simd_cast, c} a = 8, 9 b = 6, 7, 8, 9 validate 0, 0 aarch64 = ssubw generate int64x2_t:int32x4_t:int64x2_t /// Unsigned Subtract Wide name = vsubw_high no-q multi_fn = simd_shuffle8!, c:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15] multi_fn = simd_sub, a, {simd_cast, c} a = 8, 9, 10, 11, 12, 13, 14, 15 b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 validate 0, 0, 0, 0, 0, 0, 0, 0 aarch64 = usubw generate uint16x8_t:uint8x16_t:uint16x8_t /// Unsigned Subtract Wide name = vsubw_high no-q multi_fn = simd_shuffle4!, c:uint16x4_t, b, b, [4, 5, 6, 7] multi_fn = simd_sub, a, {simd_cast, c} a = 8, 9, 10, 11 b = 0, 1, 2, 3, 8, 9, 10, 11 validate 0, 0, 0, 0 aarch64 = usubw generate uint32x4_t:uint16x8_t:uint32x4_t /// Unsigned Subtract Wide name = vsubw_high no-q multi_fn = simd_shuffle2!, c:uint32x2_t, b, b, [2, 3] multi_fn = simd_sub, a, {simd_cast, c} a = 8, 9 b = 6, 7, 8, 9 validate 0, 0 aarch64 = usubw generate uint64x2_t:uint32x4_t:uint64x2_t /// Signed Subtract Long name = vsubl no-q multi_fn = simd_cast, c:out_t, a multi_fn = simd_cast, d:out_t, b multi_fn = simd_sub, c, d a = MAX, MIN, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 b = MAX, MIN, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 validate 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 arm = vsubl aarch64 = ssubl generate int8x8_t:int8x8_t:int16x8_t, int16x4_t:int16x4_t:int32x4_t, int32x2_t:int32x2_t:int64x2_t /// Unsigned Subtract Long name = vsubl no-q multi_fn = simd_cast, c:out_t, a multi_fn = simd_cast, d:out_t, b multi_fn = simd_sub, c, d a = MAX, MIN, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 b = MAX, MIN, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 validate 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 arm = vsubl aarch64 = usubl generate uint8x8_t:uint8x8_t:uint16x8_t, uint16x4_t:uint16x4_t:uint32x4_t, uint32x2_t:uint32x2_t:uint64x2_t /// Signed Subtract Long name = vsubl_high no-q multi_fn = simd_shuffle8!, c:int8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15] multi_fn = simd_cast, d:out_t, c multi_fn = simd_shuffle8!, e:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15] multi_fn = simd_cast, f:out_t, e multi_fn = simd_sub, d, f a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 b = 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2 validate 6, 7, 8, 9, 10, 11, 12, 13 aarch64 = ssubl generate int8x16_t:int8x16_t:int16x8_t /// Signed Subtract Long name = vsubl_high no-q multi_fn = simd_shuffle4!, c:int16x4_t, a, a, [4, 5, 6, 7] multi_fn = simd_cast, d:out_t, c multi_fn = simd_shuffle4!, e:int16x4_t, b, b, [4, 5, 6, 7] multi_fn = 
simd_cast, f:out_t, e multi_fn = simd_sub, d, f a = 8, 9, 10, 11, 12, 13, 14, 15 b = 6, 6, 6, 6, 8, 8, 8, 8 validate 4, 5, 6, 7 aarch64 = ssubl generate int16x8_t:int16x8_t:int32x4_t /// Signed Subtract Long name = vsubl_high no-q multi_fn = simd_shuffle2!, c:int32x2_t, a, a, [2, 3] multi_fn = simd_cast, d:out_t, c multi_fn = simd_shuffle2!, e:int32x2_t, b, b, [2, 3] multi_fn = simd_cast, f:out_t, e multi_fn = simd_sub, d, f a = 12, 13, 14, 15 b = 6, 6, 8, 8 validate 6, 7 aarch64 = ssubl generate int32x4_t:int32x4_t:int64x2_t /// Unsigned Subtract Long name = vsubl_high no-q multi_fn = simd_shuffle8!, c:uint8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15] multi_fn = simd_cast, d:out_t, c multi_fn = simd_shuffle8!, e:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15] multi_fn = simd_cast, f:out_t, e multi_fn = simd_sub, d, f a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 b = 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2 validate 6, 7, 8, 9, 10, 11, 12, 13 aarch64 = usubl generate uint8x16_t:uint8x16_t:uint16x8_t /// Unsigned Subtract Long name = vsubl_high no-q multi_fn = simd_shuffle4!, c:uint16x4_t, a, a, [4, 5, 6, 7] multi_fn = simd_cast, d:out_t, c multi_fn = simd_shuffle4!, e:uint16x4_t, b, b, [4, 5, 6, 7] multi_fn = simd_cast, f:out_t, e multi_fn = simd_sub, d, f a = 8, 9, 10, 11, 12, 13, 14, 15 b = 6, 6, 6, 6, 8, 8, 8, 8 validate 4, 5, 6, 7 aarch64 = usubl generate uint16x8_t:uint16x8_t:uint32x4_t /// Unsigned Subtract Long name = vsubl_high no-q multi_fn = simd_shuffle2!, c:uint32x2_t, a, a, [2, 3] multi_fn = simd_cast, d:out_t, c multi_fn = simd_shuffle2!, e:uint32x2_t, b, b, [2, 3] multi_fn = simd_cast, f:out_t, e multi_fn = simd_sub, d, f a = 12, 13, 14, 15 b = 6, 6, 8, 8 validate 6, 7 aarch64 = usubl generate uint32x4_t:uint32x4_t:uint64x2_t /// Bit clear and exclusive OR name = vbcax a = 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0 b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 c = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 validate 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 target = sha3 aarch64 = bcax link-aarch64 = llvm.aarch64.crypto.bcaxs._EXT_ generate int8x16_t, int16x8_t, int32x4_t, int64x2_t link-aarch64 = llvm.aarch64.crypto.bcaxu._EXT_ generate uint8x16_t, uint16x8_t, uint32x4_t, uint64x2_t /// Floating-point complex add name = vcadd_rot270 no-q a = 1., -1., 1., -1. b = -1., 1., -1., 1. validate 2., 0., 2., 0. target = fcma aarch64 = fcadd link-aarch64 = vcadd.rot270._EXT_ generate float32x2_t name = vcaddq_rot270 generate float32x4_t, float64x2_t /// Floating-point complex add name = vcadd_rot90 no-q a = 1., -1., 1., -1. b = -1., 1., -1., 1. validate 0., -2., 0., -2. target = fcma aarch64 = fcadd link-aarch64 = vcadd.rot90._EXT_ generate float32x2_t name = vcaddq_rot90 generate float32x4_t, float64x2_t /// Floating-point complex multiply accumulate name = vcmla a = 1., -1., 1., -1. b = -1., 1., -1., 1. c = 1., 1., -1., -1. validate 0., -2., 2., 0. target = fcma aarch64 = fcmla link-aarch64 = vcmla.rot0._EXT_ generate float32x2_t, float32x4_t, float64x2_t /// Floating-point complex multiply accumulate name = vcmla_rot90 rot-suffix a = 1., 1., 1., 1. b = 1., -1., 1., -1. c = 1., 1., 1., 1. validate 2., 0., 2., 0. target = fcma aarch64 = fcmla link-aarch64 = vcmla.rot90._EXT_ generate float32x2_t, float32x4_t, float64x2_t /// Floating-point complex multiply accumulate name = vcmla_rot180 rot-suffix a = 1., 1., 1., 1. b = 1., -1., 1., -1. c = 1., 1., 1., 1. validate 0., 0., 0., 0. 
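// Rough sketch of the FCMLA rotation rules used by this and the surrounding entries,
// per complex (re, im) pair, accumulating into a:
//   rot0:   re += b.re*c.re;  im += b.re*c.im
//   rot90:  re -= b.im*c.im;  im += b.im*c.re
//   rot180: re -= b.re*c.re;  im -= b.re*c.im
//   rot270: re += b.im*c.im;  im -= b.im*c.re
// For this rot180 test, a = (1, 1), b = (1, -1), c = (1, 1): re = 1 - 1*1 = 0 and
// im = 1 - 1*1 = 0, giving the all-zero validate line.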
target = fcma aarch64 = fcmla link-aarch64 = vcmla.rot180._EXT_ generate float32x2_t, float32x4_t, float64x2_t /// Floating-point complex multiply accumulate name = vcmla_rot270 rot-suffix a = 1., 1., 1., 1. b = 1., -1., 1., -1. c = 1., 1., 1., 1. validate 0., 2., 0., 2. target = fcma aarch64 = fcmla link-aarch64 = vcmla.rot270._EXT_ generate float32x2_t, float32x4_t, float64x2_t /// Floating-point complex multiply accumulate name = vcmla in2-lane-suffixes constn = LANE multi_fn = static_assert_imm-in2_rot-LANE multi_fn = simd_shuffle-out_len-!, c:out_t, c, c, {base-2-LANE} multi_fn = vcmla-self-noext, a, b, c a = 1., -1., 1., -1. b = -1., 1., -1., 1. c = 1., 1., -1., -1. n = 0 validate 0., -2., 0., -2. target = fcma aarch64 = fcmla generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t generate float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t /// Floating-point complex multiply accumulate name = vcmla_rot90 rot-lane-suffixes constn = LANE multi_fn = static_assert_imm-in2_rot-LANE multi_fn = simd_shuffle-out_len-!, c:out_t, c, c, {base-2-LANE} multi_fn = vcmla_rot90-rot-noext, a, b, c a = 1., -1., 1., -1. b = -1., 1., -1., 1. c = 1., 1., -1., -1. n = 0 validate 0., 0., 0., 0. target = fcma aarch64 = fcmla generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t generate float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t /// Floating-point complex multiply accumulate name = vcmla_rot180 rot-lane-suffixes constn = LANE multi_fn = static_assert_imm-in2_rot-LANE multi_fn = simd_shuffle-out_len-!, c:out_t, c, c, {base-2-LANE} multi_fn = vcmla_rot180-rot-noext, a, b, c a = 1., -1., 1., -1. b = -1., 1., -1., 1. c = 1., 1., -1., -1. n = 0 validate 2., 0., 2., 0. target = fcma aarch64 = fcmla generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t generate float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t /// Floating-point complex multiply accumulate name = vcmla_rot270 rot-lane-suffixes constn = LANE multi_fn = static_assert_imm-in2_rot-LANE multi_fn = simd_shuffle-out_len-!, c:out_t, c, c, {base-2-LANE} multi_fn = vcmla_rot270-rot-noext, a, b, c a = 1., -1., 1., -1. b = -1., 1., -1., 1. c = 1., 1., -1., -1. n = 0 validate 2., -2., 2., -2. 
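// Worked check: the lane forms broadcast one complex pair of c; with LANE = 0 that pair
// is (1, 1), so for a = (1, -1), b = (-1, 1) the rot270 rule gives re = 1 + 1*1 = 2 and
// im = -1 - 1*1 = -2, i.e. 2, -2 repeated across the vector.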
target = fcma aarch64 = fcmla generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t generate float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t /// Dot product arithmetic name = vdot out-suffix a = 1, 2, 1, 2 b = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 c = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 validate 31, 176, 31, 176 target = dotprod aarch64 = sdot link-aarch64 = sdot._EXT_._EXT3_ generate int32x2_t:int8x8_t:int8x8_t:int32x2_t, int32x4_t:int8x16_t:int8x16_t:int32x4_t aarch64 = udot link-aarch64 = udot._EXT_._EXT3_ generate uint32x2_t:uint8x8_t:uint8x8_t:uint32x2_t, uint32x4_t:uint8x16_t:uint8x16_t:uint32x4_t /// Dot product arithmetic name = vdot out-lane-suffixes constn = LANE multi_fn = static_assert_imm-in2_dot-LANE multi_fn = simd_shuffle-in_len-!, c:in_t, c, c, {base-4-LANE} multi_fn = vdot-out-noext, a, b, c a = 1, 2, 1, 2 b = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 c = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 n = 0 validate 31, 72, 31, 72 target = dotprod aarch64 = sdot generate int32x2_t:int8x8_t:int8x8_t:int32x2_t, int32x2_t:int8x8_t:int8x16_t:int32x2_t generate int32x4_t:int8x16_t:int8x8_t:int32x4_t, int32x4_t:int8x16_t:int8x16_t:int32x4_t aarch64 = udot generate uint32x2_t:uint8x8_t:uint8x8_t:uint32x2_t, uint32x2_t:uint8x8_t:uint8x16_t:uint32x2_t generate uint32x4_t:uint8x16_t:uint8x8_t:uint32x4_t, uint32x4_t:uint8x16_t:uint8x16_t:uint32x4_t /// Maximum (vector) name = vmax a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 b = 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 validate 16, 15, 14, 13, 12, 11, 10, 9, 9, 10, 11, 12, 13, 14, 15, 16 arm = vmax aarch64 = smax link-arm = vmaxs._EXT_ link-aarch64 = smax._EXT_ generate int*_t arm = vmax aarch64 = umax link-arm = vmaxu._EXT_ link-aarch64 = umax._EXT_ generate uint*_t /// Maximum (vector) name = vmax a = 1.0, -2.0, 3.0, -4.0 b = 0.0, 3.0, 2.0, 8.0 validate 1.0, 3.0, 3.0, 8.0 aarch64 = fmax link-aarch64 = fmax._EXT_ generate float64x*_t arm = vmax aarch64 = fmax link-arm = vmaxs._EXT_ link-aarch64 = fmax._EXT_ generate float*_t /// Floating-point Maximum Number (vector) name = vmaxnm a = 1.0, 2.0, 3.0, -4.0 b = 8.0, 16.0, -1.0, 6.0 validate 8.0, 16.0, 3.0, 6.0 aarch64 = fmaxnm link-aarch64 = fmaxnm._EXT_ generate float64x*_t target = fp-armv8 arm = vmaxnm aarch64 = fmaxnm link-arm = vmaxnm._EXT_ link-aarch64 = fmaxnm._EXT_ generate float*_t /// Floating-point maximum number across vector name = vmaxnmv a = 1., 2., 0., 1. validate 2. aarch64 = fmaxnmp link-aarch64 = fmaxnmv._EXT2_._EXT_ generate float32x2_t:f32, float64x2_t:f64 aarch64 = fmaxnmv generate float32x4_t:f32 /// Floating-point Maximum Number Pairwise (vector). name = vpmaxnm a = 1.0, 2.0 b = 6.0, -3.0 validate 2.0, 6.0 aarch64 = fmaxnmp link-aarch64 = fmaxnmp._EXT_ generate float32x2_t:float32x2_t:float32x2_t, float64x2_t:float64x2_t:float64x2_t /// Floating-point Maximum Number Pairwise (vector). name = vpmaxnm a = 1.0, 2.0, 3.0, -4.0 b = 8.0, 16.0, -1.0, 6.0 validate 2.0, 3.0, 16.0, 6.0 aarch64 = fmaxnmp link-aarch64 = fmaxnmp._EXT_ generate float32x4_t:float32x4_t:float32x4_t /// Floating-point maximum number pairwise name = vpmaxnm out-suffix a = 1., 2. validate 2. aarch64 = fmaxnmp link-aarch64 = fmaxnmv._EXT2_._EXT_ generate float32x2_t:f32 name = vpmaxnmq generate float64x2_t:f64 /// Floating-point maximum pairwise name = vpmax out-suffix a = 1., 2. validate 2. 
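// Note: the scalar pairwise forms here and in the vpmin entries below just reduce a
// two-lane vector, so the result is max(a[0], a[1]) (respectively min), hence the single
// validate value.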
aarch64 = fmaxp link-aarch64 = fmaxv._EXT2_._EXT_ generate float32x2_t:f32 name = vpmaxq generate float64x2_t:f64 /// Minimum (vector) name = vmin a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 b = 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 validate 1, 2, 3, 4, 5, 6, 7, 8, 8, 7, 6, 5, 4, 3, 2, 1 arm = vmin aarch64 = smin link-arm = vmins._EXT_ link-aarch64 = smin._EXT_ generate int*_t arm = vmin aarch64 = umin link-arm = vminu._EXT_ link-aarch64 = umin._EXT_ generate uint*_t /// Minimum (vector) name = vmin a = 1.0, -2.0, 3.0, -4.0 b = 0.0, 3.0, 2.0, 8.0 validate 0.0, -2.0, 2.0, -4.0 aarch64 = fmin link-aarch64 = fmin._EXT_ generate float64x*_t arm = vmin aarch64 = fmin link-arm = vmins._EXT_ link-aarch64 = fmin._EXT_ generate float*_t /// Floating-point Minimum Number (vector) name = vminnm a = 1.0, 2.0, 3.0, -4.0 b = 8.0, 16.0, -1.0, 6.0 validate 1.0, 2.0, -1.0, -4.0 aarch64 = fminnm link-aarch64 = fminnm._EXT_ generate float64x*_t target = fp-armv8 arm = vminnm aarch64 = fminnm link-arm = vminnm._EXT_ link-aarch64 = fminnm._EXT_ generate float*_t /// Floating-point minimum number across vector name = vminnmv a = 1., 0., 2., 3. validate 0. aarch64 = fminnmp link-aarch64 = fminnmv._EXT2_._EXT_ generate float32x2_t:f32, float64x2_t:f64 aarch64 = fminnmv generate float32x4_t:f32 /// Vector move name = vmovl_high no-q multi_fn = simd_shuffle-out_len-!, a:half, a, a, {asc-halflen-halflen} multi_fn = vmovl-noqself-noext, a a = 1, 2, 3, 4, 3, 4, 5, 6, 3, 4, 5, 6, 7, 8, 9, 10 validate 3, 4, 5, 6, 7, 8, 9, 10 aarch64 = sxtl2 generate int8x16_t:int16x8_t, int16x8_t:int32x4_t, int32x4_t:int64x2_t aarch64 = uxtl2 generate uint8x16_t:uint16x8_t, uint16x8_t:uint32x4_t, uint32x4_t:uint64x2_t /// Floating-point add pairwise name = vpadd a = 1., 2., 3., 4. b = 3., 4., 5., 6. validate 3., 7., 7., 11. aarch64 = faddp link-aarch64 = faddp._EXT_ generate float32x4_t, float64x2_t arm = vpadd link-arm = vpadd._EXT_ generate float32x2_t /// Floating-point add pairwise name = vpadd out-suffix multi_fn = simd_extract, a1:out_t, a, 0 multi_fn = simd_extract, a2:out_t, a, 1 multi_fn = a1 + a2 a = 1., 2. validate 3. aarch64 = nop generate float32x2_t:f32, float64x2_t:f64 /// Floating-point Minimum Number Pairwise (vector). name = vpminnm a = 1.0, 2.0 b = 6.0, -3.0 validate 1.0, -3.0 aarch64 = fminnmp link-aarch64 = fminnmp._EXT_ generate float32x2_t:float32x2_t:float32x2_t, float64x2_t:float64x2_t:float64x2_t /// Floating-point Minimum Number Pairwise (vector). name = vpminnm a = 1.0, 2.0, 3.0, -4.0 b = 8.0, 16.0, -1.0, 6.0 validate 1.0, -4.0, 8.0, -1.0 aarch64 = fminnmp link-aarch64 = fminnmp._EXT_ generate float32x4_t:float32x4_t:float32x4_t /// Floating-point minimum number pairwise name = vpminnm out-suffix a = 1., 2. validate 1. aarch64 = fminnmp link-aarch64 = fminnmv._EXT2_._EXT_ generate float32x2_t:f32 name = vpminnmq generate float64x2_t:f64 /// Floating-point minimum pairwise name = vpmin out-suffix a = 1., 2. validate 1. 
aarch64 = fminp link-aarch64 = fminv._EXT2_._EXT_ generate float32x2_t:f32 name = vpminq generate float64x2_t:f64 /// Signed saturating doubling multiply long name = vqdmull a = 0, 1, 2, 3, 4, 5, 6, 7 b = 1, 2, 3, 4, 5, 6, 7, 8 validate 0, 4, 12, 24, 40, 60, 84, 108 aarch64 = sqdmull link-aarch64 = sqdmull._EXT2_ arm = vqdmull link-arm = vqdmull._EXT2_ generate int16x4_t:int16x4_t:int32x4_t, int32x2_t:int32x2_t:int64x2_t /// Signed saturating doubling multiply long name = vqdmull multi_fn = vdup_n-in_ntt-noext, a:in_ntt, a multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b multi_fn = simd_extract, {vqdmull-in_ntt-noext, a, b}, 0 a = 2 b = 3 validate 12 aarch64 = sqdmull generate i16:i16:i32 /// Signed saturating doubling multiply long name = vqdmull a = 2 b = 3 validate 12 aarch64 = sqdmull link-aarch64 = sqdmulls.scalar generate i32:i32:i64 /// Vector saturating doubling long multiply with scalar name = vqdmull_n no-q multi_fn = vqdmull-in_ntt-noext, a, {vdup_n-in_ntt-noext, b} a = 2, 4, 6, 8 b = 2 validate 8, 16, 24, 32 aarch64 = sqdmull arm = vqdmull generate int16x4_t:i16:int32x4_t, int32x2_t:i32:int64x2_t /// Signed saturating doubling multiply long name = vqdmull_high no-q multi_fn = simd_shuffle-out_len-!, a:half, a, a, {asc-halflen-halflen} multi_fn = simd_shuffle-out_len-!, b:half, b, b, {asc-halflen-halflen} multi_fn = vqdmull-noqself-noext, a, b a = 0, 1, 4, 5, 4, 5, 6, 7 b = 1, 2, 5, 6, 5, 6, 7, 8 validate 40, 60, 84, 112 aarch64 = sqdmull2 generate int16x8_t:int16x8_t:int32x4_t, int32x4_t:int32x4_t:int64x2_t /// Signed saturating doubling multiply long name = vqdmull_high_n no-q multi_fn = simd_shuffle-out_len-!, a:in_ntt, a, a, {asc-out_len-out_len} multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b multi_fn = vqdmull-in_ntt-noext, a, b a = 0, 2, 8, 10, 8, 10, 12, 14 b = 2 validate 32, 40, 48, 56 aarch64 = sqdmull2 generate int16x8_t:i16:int32x4_t, int32x4_t:i32:int64x2_t /// Vector saturating doubling long multiply by scalar name = vqdmull_lane constn = N multi_fn = static_assert_imm-in_exp_len-N multi_fn = simd_shuffle-out_len-!, b:in_t0, b, b, {dup-out_len-N as u32} multi_fn = vqdmull-noqself-noext, a, b a = 1, 2, 3, 4 b = 0, 2, 2, 0, 2, 0, 0, 0 n = HFLEN validate 4, 8, 12, 16 aarch64 = sqdmull generate int16x4_t:int16x8_t:int32x4_t, int32x2_t:int32x4_t:int64x2_t arm = vqdmull generate int16x4_t:int16x4_t:int32x4_t, int32x2_t:int32x2_t:int64x2_t /// Signed saturating doubling multiply long name = vqdmullh_lane constn = N multi_fn = static_assert_imm-in_exp_len-N multi_fn = simd_extract, b:in_t0, b, N as u32 multi_fn = vqdmullh-noqself-noext, a, b a = 2 b = 0, 2, 2, 0, 2, 0, 0, 0 n = HFLEN validate 8 aarch64 = sqdmull generate i16:int16x4_t:i32, i16:int16x8_t:i32 /// Signed saturating doubling multiply long name = vqdmulls_lane constn = N multi_fn = static_assert_imm-in_exp_len-N multi_fn = simd_extract, b:in_t0, b, N as u32 multi_fn = vqdmulls-noqself-noext, a, b a = 2 b = 0, 2, 2, 0, 2, 0, 0, 0 n = HFLEN validate 8 aarch64 = sqdmull generate i32:int32x2_t:i64, i32:int32x4_t:i64 /// Signed saturating doubling multiply long name = vqdmull_high_lane constn = N multi_fn = static_assert_imm-in_exp_len-N multi_fn = simd_shuffle-out_len-!, a:in_t, a, a, {asc-out_len-out_len} multi_fn = simd_shuffle-out_len-!, b:in_t, b, b, {dup-out_len-N as u32} multi_fn = vqdmull-self-noext, a, b a = 0, 1, 4, 5, 4, 5, 6, 7 b = 0, 2, 2, 0, 2, 0, 0, 0 n = HFLEN validate 16, 20, 24, 28 aarch64 = sqdmull2 generate int16x8_t:int16x4_t:int32x4_t, int32x4_t:int32x2_t:int64x2_t /// Signed saturating doubling 
multiply long name = vqdmull_high_lane constn = N multi_fn = static_assert_imm-in_exp_len-N multi_fn = simd_shuffle-out_len-!, a:half, a, a, {asc-out_len-out_len} multi_fn = simd_shuffle-out_len-!, b:half, b, b, {dup-out_len-N as u32} multi_fn = vqdmull-noqself-noext, a, b a = 0, 1, 4, 5, 4, 5, 6, 7 b = 0, 2, 2, 0, 2, 0, 0, 0 n = HFLEN validate 16, 20, 24, 28 aarch64 = sqdmull2 generate int16x8_t:int16x8_t:int32x4_t, int32x4_t:int32x4_t:int64x2_t /// Signed saturating doubling multiply-add long name = vqdmlal multi_fn = vqadd-out-noext, a, {vqdmull-self-noext, b, c} a = 1, 1, 1, 1 b = 1, 2, 3, 4 c = 2, 2, 2, 2 validate 5, 9, 13, 17 aarch64 = sqdmlal arm = vqdmlal generate int32x4_t:int16x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int32x2_t:int64x2_t /// Vector widening saturating doubling multiply accumulate with scalar name = vqdmlal n-suffix multi_fn = vqadd-out-noext, a, {vqdmull_n-self-noext, b, c} a = 1, 1, 1, 1 b = 1, 2, 3, 4 c = 2 validate 5, 9, 13, 17 aarch64 = sqdmlal arm = vqdmlal generate int32x4_t:int16x4_t:i16:int32x4_t, int64x2_t:int32x2_t:i32:int64x2_t /// Signed saturating doubling multiply-add long name = vqdmlal_high no-q multi_fn = vqadd-out-noext, a, {vqdmull_high-noqself-noext, b, c} a = 1, 2, 3, 4 b = 0, 1, 4, 5, 4, 5, 6, 7 c = 1, 2, 5, 6, 5, 6, 7, 8 validate 41, 62, 87, 116 aarch64 = sqdmlal2 generate int32x4_t:int16x8_t:int16x8_t:int32x4_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t /// Signed saturating doubling multiply-add long name = vqdmlal_high_n no-q multi_fn = vqadd-out-noext, a, {vqdmull_high_n-noqself-noext, b, c} a = 1, 2, 3, 4 b = 0, 2, 8, 10, 8, 10, 12, 14 c = 2 validate 33, 42, 51, 60 aarch64 = sqdmlal2 generate int32x4_t:int16x8_t:i16:int32x4_t, int64x2_t:int32x4_t:i32:int64x2_t /// Vector widening saturating doubling multiply accumulate with scalar name = vqdmlal_lane in2-suffix constn = N multi_fn = static_assert_imm-in2_exp_len-N multi_fn = vqadd-out-noext, a, {vqdmull_lane-in2-::, b, c} a = 1, 2, 3, 4 b = 1, 2, 3, 4 c = 0, 2, 2, 0, 2, 0, 0, 0 n = HFLEN validate 5, 10, 15, 20 aarch64 = sqdmlal generate int32x4_t:int16x4_t:int16x8_t:int32x4_t, int64x2_t:int32x2_t:int32x4_t:int64x2_t arm = vqdmlal generate int32x4_t:int16x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int32x2_t:int64x2_t /// Signed saturating doubling multiply-add long name = vqdmlal_high_lane in2-suffix constn = N multi_fn = static_assert_imm-in2_exp_len-N multi_fn = vqadd-out-noext, a, {vqdmull_high_lane-in2-::, b, c} a = 1, 2, 3, 4 b = 0, 1, 4, 5, 4, 5, 6, 7 c = 0, 2, 0, 0, 0, 0, 0, 0 n = 1 validate 17, 22, 27, 32 aarch64 = sqdmlal2 generate int32x4_t:int16x8_t:int16x4_t:int32x4_t, int32x4_t:int16x8_t:int16x8_t:int32x4_t, int64x2_t: int32x4_t:int32x2_t:int64x2_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t /// Signed saturating doubling multiply-add long name = vqdmlal multi_fn = vqdmull-in_ntt-noext, x:out_long_ntt, {vdup_n-in_ntt-noext, b}, {vdup_n-in_ntt-noext, c} multi_fn = vqadd-out-noext, a, {simd_extract, x, 0} a = 1 b = 1 c = 2 validate 5 aarch64 = sqdmull generate i32:i16:i16:i32, i64:i32:i32:i64 /// Signed saturating doubling multiply-add long name = vqdmlalh_lane in2-suffix constn = LANE multi_fn = static_assert_imm-in2_exp_len-LANE multi_fn = vqdmlal-self-noext, a, b, {simd_extract, c, LANE as u32} a = 1 b = 1 c = 2, 1, 1, 1, 1, 1, 1, 1 n = 0 validate 5 aarch64 = sqdmlal generate i32:i16:int16x4_t:i32, i32:i16:int16x8_t:i32 name = vqdmlals_lane aarch64 = sqdmull generate i64:i32:int32x2_t:i64, i64:i32:int32x4_t:i64 /// Signed saturating doubling multiply-subtract long name = 
vqdmlsl multi_fn = vqsub-out-noext, a, {vqdmull-self-noext, b, c} a = 3, 7, 11, 15 b = 1, 2, 3, 4 c = 2, 2, 2, 2 validate -1, -1, -1, -1 aarch64 = sqdmlsl arm = vqdmlsl generate int32x4_t:int16x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int32x2_t:int64x2_t /// Vector widening saturating doubling multiply subtract with scalar name = vqdmlsl n-suffix multi_fn = vqsub-out-noext, a, {vqdmull_n-self-noext, b, c} a = 3, 7, 11, 15 b = 1, 2, 3, 4 c = 2 validate -1, -1, -1, -1 aarch64 = sqdmlsl arm = vqdmlsl generate int32x4_t:int16x4_t:i16:int32x4_t, int64x2_t:int32x2_t:i32:int64x2_t /// Signed saturating doubling multiply-subtract long name = vqdmlsl_high no-q multi_fn = vqsub-out-noext, a, {vqdmull_high-noqself-noext, b, c} a = 39, 58, 81, 108 b = 0, 1, 4, 5, 4, 5, 6, 7 c = 1, 2, 5, 6, 5, 6, 7, 8 validate -1, -2, -3, -4 aarch64 = sqdmlsl2 generate int32x4_t:int16x8_t:int16x8_t:int32x4_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t /// Signed saturating doubling multiply-subtract long name = vqdmlsl_high_n no-q multi_fn = vqsub-out-noext, a, {vqdmull_high_n-noqself-noext, b, c} a = 31, 38, 45, 52 b = 0, 2, 8, 10, 8, 10, 12, 14 c = 2 validate -1, -2, -3, -4 aarch64 = sqdmlsl2 generate int32x4_t:int16x8_t:i16:int32x4_t, int64x2_t:int32x4_t:i32:int64x2_t /// Vector widening saturating doubling multiply subtract with scalar name = vqdmlsl_lane in2-suffix constn = N multi_fn = static_assert_imm-in2_exp_len-N multi_fn = vqsub-out-noext, a, {vqdmull_lane-in2-::, b, c} a = 3, 6, 9, 12 b = 1, 2, 3, 4 c = 0, 2, 2, 0, 2, 0, 0, 0 n = HFLEN validate -1, -2, -3, -4 aarch64 = sqdmlsl generate int32x4_t:int16x4_t:int16x8_t:int32x4_t, int64x2_t:int32x2_t:int32x4_t:int64x2_t arm = vqdmlsl generate int32x4_t:int16x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int32x2_t:int64x2_t /// Signed saturating doubling multiply-subtract long name = vqdmlsl_high_lane in2-suffix constn = N multi_fn = static_assert_imm-in2_exp_len-N multi_fn = vqsub-out-noext, a, {vqdmull_high_lane-in2-::, b, c} a = 15, 18, 21, 24 b = 0, 1, 4, 5, 4, 5, 6, 7 c = 0, 2, 0, 0, 0, 0, 0, 0 n = 1 validate -1, -2, -3, -4 aarch64 = sqdmlsl2 generate int32x4_t:int16x8_t:int16x4_t:int32x4_t, int32x4_t:int16x8_t:int16x8_t:int32x4_t, int64x2_t: int32x4_t:int32x2_t:int64x2_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t /// Signed saturating doubling multiply-subtract long name = vqdmlsl multi_fn = vqdmull-in_ntt-noext, x:out_long_ntt, {vdup_n-in_ntt-noext, b}, {vdup_n-in_ntt-noext, c} multi_fn = vqsub-out-noext, a, {simd_extract, x, 0} a = 10 b = 1 c = 2 validate 6 aarch64 = sqdmull generate i32:i16:i16:i32, i64:i32:i32:i64 /// Signed saturating doubling multiply-subtract long name = vqdmlslh_lane in2-suffix constn = LANE multi_fn = static_assert_imm-in2_exp_len-LANE multi_fn = vqdmlsl-self-noext, a, b, {simd_extract, c, LANE as u32} a = 10 b = 1 c = 2, 1, 1, 1, 1, 1, 1, 1 n = 0 validate 6 aarch64 = sqdmlsl generate i32:i16:int16x4_t:i32, i32:i16:int16x8_t:i32 name = vqdmlsls_lane aarch64 = sqdmull generate i64:i32:int32x2_t:i64, i64:i32:int32x4_t:i64 /// Signed saturating doubling multiply returning high half name = vqdmulh a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX b = 2, 2, 2, 2, 2, 2, 2, 2 validate 1, 1, 1, 1, 1, 1, 1, 1 aarch64 = sqdmulh link-aarch64 = sqdmulh._EXT_ arm = vqdmulh link-arm = vqdmulh._EXT_ generate int16x4_t, int16x8_t, int32x2_t, int32x4_t /// Signed saturating doubling multiply returning high half name = vqdmulh multi_fn = vdup_n-in_ntt-noext, a:in_ntt, a multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b multi_fn = simd_extract, 
{vqdmulh-in_ntt-noext, a, b}, 0 a = 1 b = 2 validate 0 aarch64 = sqdmulh generate i16, i32 /// Vector saturating doubling multiply high with scalar name = vqdmulh_n out-suffix multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b multi_fn = vqdmulh-out-noext, a, b a = MAX, MAX, MAX, MAX b = 2 validate 1, 1, 1, 1 aarch64 = sqdmulh arm = vqdmulh generate int16x4_t:i16:int16x4_t, int32x2_t:i32:int32x2_t /// Vector saturating doubling multiply high with scalar name = vqdmulhq_n no-q multi_fn = vdupq_n-in_ntt-noext, b:out_t, b multi_fn = vqdmulh-out-noext, a, b a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX b = 2 validate 1, 1, 1, 1, 1, 1, 1, 1 aarch64 = sqdmulh arm = vqdmulh generate int16x8_t:i16:int16x8_t, int32x4_t:i32:int32x4_t /// Signed saturating doubling multiply returning high half name = vqdmulhh_lane constn = N multi_fn = static_assert_imm-in_exp_len-N multi_fn = simd_extract, b:in_t0, b, N as u32 multi_fn = vqdmulhh-out_ntt-noext, a, b a = 2 b = 0, 0, MAX, 0, 0, 0, 0, 0 n = 2 validate 1 aarch64 = sqdmulh generate i16:int16x4_t:i16, i16:int16x8_t:i16 /// Signed saturating doubling multiply returning high half name = vqdmulhs_lane constn = N multi_fn = static_assert_imm-in_exp_len-N multi_fn = simd_extract, b:in_t0, b, N as u32 multi_fn = vqdmulhs-out_ntt-noext, a, b a = 2 b = 0, MAX, 0, 0 n = 1 validate 1 aarch64 = sqdmulh generate i32:int32x2_t:i32, i32:int32x4_t:i32 /// Vector saturating doubling multiply high by scalar name = vqdmulh lane-suffixes constn = LANE multi_fn = static_assert_imm-in2_exp_len-LANE multi_fn = vqdmulh-out-noext, a, {vdup-nout-noext, {simd_extract, b, LANE as u32}} a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX b = 2, 1, 1, 1, 1, 1, 1, 1 n = 0 validate 1, 1, 1, 1, 1, 1, 1, 1 aarch64 = sqdmulh generate int16x4_t, int16x8_t:int16x4_t:int16x8_t generate int32x2_t, int32x4_t:int32x2_t:int32x4_t arm = vqdmulh generate int16x8_t, int16x4_t:int16x8_t:int16x4_t generate int32x4_t, int32x2_t:int32x4_t:int32x2_t /// Signed saturating extract narrow name = vqmovn no-q a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX validate MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX aarch64 = sqxtn link-aarch64 = sqxtn._EXT2_ arm = vqmovn link-arm = vqmovns._EXT2_ generate int16x8_t:int8x8_t, int32x4_t:int16x4_t, int64x2_t:int32x2_t /// Unsigned saturating extract narrow name = vqmovn no-q a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX validate MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX aarch64 = uqxtn link-aarch64 = uqxtn._EXT2_ arm = vqmovn link-arm = vqmovnu._EXT2_ generate uint16x8_t:uint8x8_t, uint32x4_t:uint16x4_t, uint64x2_t:uint32x2_t /// Saturating extract narrow name = vqmovn multi_fn = simd_extract, {vqmovn-in_ntt-noext, {vdupq_n-in_ntt-noext, a}}, 0 a = 1 validate 1 aarch64 = sqxtn generate i16:i8, i32:i16 aarch64 = uqxtn generate u16:u8, u32:u16 /// Saturating extract narrow name = vqmovn a = 1 validate 1 aarch64 = sqxtn link-aarch64 = scalar.sqxtn._EXT2_._EXT_ generate i64:i32 aarch64 = uqxtn link-aarch64 = scalar.uqxtn._EXT2_._EXT_ generate u64:u32 /// Signed saturating extract narrow name = vqmovn_high no-q multi_fn = simd_shuffle-out_len-!, a, {vqmovn-noqself-noext, b}, {asc-0-out_len} a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX b = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX validate MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX aarch64 = sqxtn2 generate int8x8_t:int16x8_t:int8x16_t, int16x4_t:int32x4_t:int16x8_t, int32x2_t:int64x2_t:int32x4_t aarch64 = uqxtn2 generate uint8x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint16x8_t, uint32x2_t:uint64x2_t:uint32x4_t /// 
Signed saturating extract unsigned narrow name = vqmovun no-q a = -1, -1, -1, -1, -1, -1, -1, -1 validate 0, 0, 0, 0, 0, 0, 0, 0 aarch64 = sqxtun link-aarch64 = sqxtun._EXT2_ arm = vqmovun link-arm = vqmovnsu._EXT2_ generate int16x8_t:uint8x8_t, int32x4_t:uint16x4_t, int64x2_t:uint32x2_t /// Signed saturating extract unsigned narrow name = vqmovun multi_fn = simd_extract, {vqmovun-in_ntt-noext, {vdupq_n-in_ntt-noext, a}}, 0 a = 1 validate 1 aarch64 = sqxtun generate i16:u8, i32:u16, i64:u32 /// Signed saturating extract unsigned narrow name = vqmovun_high no-q multi_fn = simd_shuffle-out_len-!, a, {vqmovun-noqself-noext, b}, {asc-0-out_len} a = 0, 0, 0, 0, 0, 0, 0, 0 b = -1, -1, -1, -1, -1, -1, -1, -1 validate 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 aarch64 = sqxtun2 generate uint8x8_t:int16x8_t:uint8x16_t, uint16x4_t:int32x4_t:uint16x8_t, uint32x2_t:int64x2_t:uint32x4_t /// Signed saturating rounding doubling multiply returning high half name = vqrdmulh a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX b = 2, 2, 2, 2, 2, 2, 2, 2 validate 2, 2, 2, 2, 2, 2, 2, 2 aarch64 = sqrdmulh link-aarch64 = sqrdmulh._EXT_ arm = vqrdmulh link-arm = vqrdmulh._EXT_ generate int16x4_t, int16x8_t, int32x2_t, int32x4_t /// Signed saturating rounding doubling multiply returning high half name = vqrdmulh multi_fn = simd_extract, {vqrdmulh-in_ntt-noext, {vdup_n-in_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0 a = 1 b = 2 validate 0 aarch64 = sqrdmulh generate i16, i32 /// Vector saturating rounding doubling multiply high with scalar name = vqrdmulh out-n-suffix multi_fn = vqrdmulh-out-noext, a, {vdup-nout-noext, b} a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX b = 2 validate 2, 2, 2, 2, 2, 2, 2, 2 aarch64 = sqrdmulh arm = vqrdmulh generate int16x4_t:i16:int16x4_t, int16x8_t:i16:int16x8_t, int32x2_t:i32:int32x2_t, int32x4_t:i32:int32x4_t /// Vector rounding saturating doubling multiply high by scalar name = vqrdmulh lane-suffixes constn = LANE multi_fn = static_assert_imm-in_exp_len-LANE multi_fn = simd_shuffle-out_len-!, b:out_t, b, b, {dup-out_len-LANE as u32} multi_fn = vqrdmulh-out-noext, a, b a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX b = 0, 2, 0, 0, 0, 0, 0, 0, n = 1 validate 2, 2, 2, 2, 2, 2, 2, 2 aarch64 = sqrdmulh arm = vqrdmulh generate int16x4_t, int16x4_t:int16x8_t:int16x4_t, int16x8_t:int16x4_t:int16x8_t, int16x8_t generate int32x2_t, int32x2_t:int32x4_t:int32x2_t, int32x4_t:int32x2_t:int32x4_t, int32x4_t /// Signed saturating rounding doubling multiply returning high half name = vqrdmulh lane-suffixes constn = LANE multi_fn = static_assert_imm-in_exp_len-LANE multi_fn = vqrdmulh-out-noext, a, {simd_extract, b, LANE as u32} a = 1 b = 0, 2, 0, 0, 0, 0, 0, 0, n = 1 validate 0 aarch64 = sqrdmulh generate i16:int16x4_t:i16, i16:int16x8_t:i16, i32:int32x2_t:i32, i32:int32x4_t:i32 /// Signed saturating rounding doubling multiply accumulate returning high half name = vqrdmlah a = 1, 1, 1, 1, 1, 1, 1, 1 b = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX c = 2, 2, 2, 2, 2, 2, 2, 2 validate 3, 3, 3, 3, 3, 3, 3, 3 aarch64 = sqrdmlah link-aarch64 = sqrdmlah._EXT_ target = rdm generate int16x4_t, int16x8_t, int32x2_t, int32x4_t /// Signed saturating rounding doubling multiply accumulate returning high half name = vqrdmlah multi_fn = vdup_n-in_ntt-noext, a:in_ntt, a multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b multi_fn = vdup_n-in_ntt-noext, c:in_ntt, c multi_fn = simd_extract, {vqrdmlah-in_ntt-noext, a, b, c}, 0 a = 1 b = 1 c = 2 validate 1 aarch64 = sqrdmlah target = rdm generate i16, i32 /// Signed saturating rounding 
doubling multiply accumulate returning high half name = vqrdmlah in2-lane-suffixes constn = LANE multi_fn = static_assert_imm-in2_exp_len-LANE multi_fn = simd_shuffle-out_len-!, c:out_t, c, c, {dup-out_len-LANE as u32} multi_fn = vqrdmlah-out-noext, a, b, c a = 1, 1, 1, 1, 1, 1, 1, 1 b = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX c = 0, 2, 0, 0, 0, 0, 0, 0 n = 1 validate 3, 3, 3, 3, 3, 3, 3, 3 aarch64 = sqrdmlah target = rdm generate int16x4_t, int16x4_t:int16x4_t:int16x8_t:int16x4_t, int16x8_t:int16x8_t:int16x4_t:int16x8_t, int16x8_t generate int32x2_t, int32x2_t:int32x2_t:int32x4_t:int32x2_t, int32x4_t:int32x4_t:int32x2_t:int32x4_t, int32x4_t /// Signed saturating rounding doubling multiply accumulate returning high half name = vqrdmlah in2-lane-suffixes constn = LANE multi_fn = static_assert_imm-in2_exp_len-LANE multi_fn = vqrdmlah-self-noext, a, b, {simd_extract, c, LANE as u32} a = 1 b = 1 c = 0, 2, 0, 0, 0, 0, 0, 0 n = 1 validate 1 aarch64 = sqrdmlah target = rdm generate i16:i16:int16x4_t:i16, i16:i16:int16x8_t:i16, i32:i32:int32x2_t:i32, i32:i32:int32x4_t:i32 /// Signed saturating rounding doubling multiply subtract returning high half name = vqrdmlsh link-aarch64 = sqrdmlsh._EXT_ a = 1, 1, 1, 1, 1, 1, 1, 1 b = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX c = 2, 2, 2, 2, 2, 2, 2, 2 validate -1, -1, -1, -1, -1, -1, -1, -1 aarch64 = sqrdmlsh target = rdm generate int16x4_t, int16x8_t, int32x2_t, int32x4_t /// Signed saturating rounding doubling multiply subtract returning high half name = vqrdmlsh multi_fn = vdup_n-in_ntt-noext, a:in_ntt, a multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b multi_fn = vdup_n-in_ntt-noext, c:in_ntt, c multi_fn = simd_extract, {vqrdmlsh-in_ntt-noext, a, b, c}, 0 a = 1 b = 1 c = 2 validate 1 aarch64 = sqrdmlsh target = rdm generate i16, i32 /// Signed saturating rounding doubling multiply subtract returning high half name = vqrdmlsh in2-lane-suffixes constn = LANE multi_fn = static_assert_imm-in2_exp_len-LANE multi_fn = simd_shuffle-out_len-!, c:out_t, c, c, {dup-out_len-LANE as u32} multi_fn = vqrdmlsh-out-noext, a, b, c a = 1, 1, 1, 1, 1, 1, 1, 1 b = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX c = 0, 2, 0, 0, 0, 0, 0, 0 n = 1 validate -1, -1, -1, -1, -1, -1, -1, -1 aarch64 = sqrdmlsh target = rdm generate int16x4_t, int16x4_t:int16x4_t:int16x8_t:int16x4_t, int16x8_t:int16x8_t:int16x4_t:int16x8_t, int16x8_t generate int32x2_t, int32x2_t:int32x2_t:int32x4_t:int32x2_t, int32x4_t:int32x4_t:int32x2_t:int32x4_t, int32x4_t /// Signed saturating rounding doubling multiply subtract returning high half name = vqrdmlsh in2-lane-suffixes constn = LANE multi_fn = static_assert_imm-in2_exp_len-LANE multi_fn = vqrdmlsh-self-noext, a, b, {simd_extract, c, LANE as u32} a = 1 b = 1 c = 0, 2, 0, 0, 0, 0, 0, 0 n = 1 validate 1 aarch64 = sqrdmlsh target = rdm generate i16:i16:int16x4_t:i16, i16:i16:int16x8_t:i16, i32:i32:int32x2_t:i32, i32:i32:int32x4_t:i32 /// Signed saturating rounding shift left name = vqrshl a = 2, MIN, MAX, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 validate 8, MIN, MAX, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 aarch64 = sqrshl link-aarch64 = sqrshl._EXT_ generate i32, i64 arm = vqrshl link-arm = vqrshifts._EXT_ generate int*_t, int64x*_t /// Signed saturating rounding shift left name = vqrshl multi_fn = vdup_n-in_ntt-noext, a:in_ntt, a multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b multi_fn = simd_extract, {vqrshl-in_ntt-noext, a, b}, 0 a = 1 b = 2 validate 4 aarch64 = sqrshl generate i8, i16 /// Unsigned signed 
saturating rounding shift left name = vqrshl out-suffix a = 2, MIN, MAX, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 validate 8, 0, MAX, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 aarch64 = uqrshl link-aarch64 = uqrshl._EXT_ generate u32:i32:u32, u64:i64:u64 arm = vqrshl link-arm = vqrshiftu._EXT_ generate uint8x8_t:int8x8_t:uint8x8_t, uint8x16_t:int8x16_t:uint8x16_t, uint16x4_t:int16x4_t:uint16x4_t, uint16x8_t:int16x8_t:uint16x8_t generate uint32x2_t:int32x2_t:uint32x2_t, uint32x4_t:int32x4_t:uint32x4_t, uint64x1_t:int64x1_t:uint64x1_t, uint64x2_t:int64x2_t:uint64x2_t /// Unsigned signed saturating rounding shift left name = vqrshl out-suffix multi_fn = vdup_n-out_ntt-noext, a:out_ntt, a multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b multi_fn = simd_extract, {vqrshl-out_ntt-noext, a, b}, 0 a = 1 b = 2 validate 4 aarch64 = uqrshl generate u8:i8:u8, u16:i16:u16 /// Signed saturating rounded shift right narrow name = vqrshrn noq-n-suffix constn = N multi_fn = static_assert-N-1-halfbits a = MIN, 4, 8, 12, 16, 20, 24, 28 n = 2 validate MIN, 1, 2, 3, 4, 5, 6, 7 aarch64 = sqrshrn link-aarch64 = sqrshrn._EXT2_ const-aarch64 = N arm = vqrshrn link-arm = vqrshiftns._EXT2_ const-arm = -N as ttn arm-aarch64-separate generate int16x8_t:int8x8_t, int32x4_t:int16x4_t, int64x2_t:int32x2_t /// Signed saturating rounded shift right narrow name = vqrshrn noq-n-suffix constn = N multi_fn = static_assert-N-1-halfbits multi_fn = vdupq_n-in_ntt-noext, a:in_long_ntt, a multi_fn = simd_extract, {vqrshrn_n-in_ntt-::, a}, 0 a = 4 n = 2 validate 1 aarch64 = sqrshrn generate i16:i8, i32:i16, i64:i32 /// Signed saturating rounded shift right narrow name = vqrshrn_high noq-n-suffix constn = N multi_fn = static_assert-N-1-halfbits multi_fn = simd_shuffle-out_len-!, a, {vqrshrn_n-noqself-::, b}, {asc-0-out_len} a = 0, 1, 2, 3, 2, 3, 6, 7 b = 8, 12, 24, 28, 48, 52, 56, 60 n = 2 validate 0, 1, 2, 3, 2, 3, 6, 7, 2, 3, 6, 7, 12, 13, 14, 15 aarch64 = sqrshrn2 generate int8x8_t:int16x8_t:int8x16_t, int16x4_t:int32x4_t:int16x8_t, int32x2_t:int64x2_t:int32x4_t /// Unsigned signed saturating rounded shift right narrow name = vqrshrn noq-n-suffix constn = N multi_fn = static_assert-N-1-halfbits a = MIN, 4, 8, 12, 16, 20, 24, 28 n = 2 validate 0, 1, 2, 3, 4, 5, 6, 7 aarch64 = uqrshrn link-aarch64 = uqrshrn._EXT2_ const-aarch64 = N arm = vqrshrn link-arm = vqrshiftnu._EXT2_ const-arm = -N as ttn arm-aarch64-separate generate uint16x8_t:uint8x8_t, uint32x4_t:uint16x4_t, uint64x2_t:uint32x2_t /// Unsigned saturating rounded shift right narrow name = vqrshrn noq-n-suffix constn = N multi_fn = static_assert-N-1-halfbits multi_fn = vdupq_n-in_ntt-noext, a:in_long_ntt, a multi_fn = simd_extract, {vqrshrn_n-in_ntt-::, a}, 0 a = 4 n = 2 validate 1 aarch64 = uqrshrn generate u16:u8, u32:u16, u64:u32 /// Unsigned saturating rounded shift right narrow name = vqrshrn_high noq-n-suffix constn = N multi_fn = static_assert-N-1-halfbits multi_fn = simd_shuffle-out_len-!, a, {vqrshrn_n-noqself-::, b}, {asc-0-out_len} a = 0, 1, 2, 3, 2, 3, 6, 7 b = 8, 12, 24, 28, 48, 52, 56, 60 n = 2 validate 0, 1, 2, 3, 2, 3, 6, 7, 2, 3, 6, 7, 12, 13, 14, 15 aarch64 = uqrshrn2 generate uint8x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint16x8_t, uint32x2_t:uint64x2_t:uint32x4_t /// Signed saturating rounded shift right unsigned narrow name = vqrshrun noq-n-suffix constn = N multi_fn = static_assert-N-1-halfbits a = 0, 4, 8, 12, 16, 20, 24, 28 n = 2 validate 0, 1, 2, 3, 4, 5, 6, 7 aarch64 = sqrshrun 
link-aarch64 = sqrshrun._EXT2_ const-aarch64 = N arm = vqrshrun link-arm = vqrshiftnsu._EXT2_ const-arm = -N as ttn arm-aarch64-separate generate int16x8_t:uint8x8_t, int32x4_t:uint16x4_t, int64x2_t:uint32x2_t /// Signed saturating rounded shift right unsigned narrow name = vqrshrun noq-n-suffix constn = N multi_fn = static_assert-N-1-halfbits multi_fn = vdupq_n-in_ntt-noext, a:in_long_ntt, a multi_fn = simd_extract, {vqrshrun_n-in_ntt-::, a}, 0 a = 4 n = 2 validate 1 aarch64 = sqrshrun generate i16:u8, i32:u16, i64:u32 /// Signed saturating rounded shift right unsigned narrow name = vqrshrun_high noq-n-suffix constn = N multi_fn = static_assert-N-1-halfbits multi_fn = simd_shuffle-out_len-!, a, {vqrshrun_n-noqself-::, b}, {asc-0-out_len} a = 0, 1, 2, 3, 2, 3, 6, 7 b = 8, 12, 24, 28, 48, 52, 56, 60 n = 2 validate 0, 1, 2, 3, 2, 3, 6, 7, 2, 3, 6, 7, 12, 13, 14, 15 aarch64 = sqrshrun2 generate uint8x8_t:int16x8_t:uint8x16_t, uint16x4_t:int32x4_t:uint16x8_t, uint32x2_t:int64x2_t:uint32x4_t /// Signed saturating shift left name = vqshl a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 validate 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 aarch64 = sqshl link-aarch64 = sqshl._EXT_ generate i64 arm = vqshl link-arm = vqshifts._EXT_ generate int*_t, int64x*_t /// Signed saturating shift left name = vqshl multi_fn = vqshl-in_ntt-noext, c:in_ntt, {vdup_n-in_ntt-noext, a}, {vdup_n-in_ntt-noext, b} multi_fn = simd_extract, c, 0 a = 1 b = 2 validate 4 aarch64 = sqshl generate i8, i16, i32 /// Unsigned saturating shift left name = vqshl out-suffix a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 validate 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 aarch64 = uqshl link-aarch64 = uqshl._EXT_ generate u64:i64:u64 arm = vqshl link-arm = vqshiftu._EXT_ generate uint8x8_t:int8x8_t:uint8x8_t, uint8x16_t:int8x16_t:uint8x16_t, uint16x4_t:int16x4_t:uint16x4_t, uint16x8_t:int16x8_t:uint16x8_t generate uint32x2_t:int32x2_t:uint32x2_t, uint32x4_t:int32x4_t:uint32x4_t, uint64x1_t:int64x1_t:uint64x1_t, uint64x2_t:int64x2_t:uint64x2_t /// Unsigned saturating shift left name = vqshl out-suffix multi_fn = vqshl-out_ntt-noext, c:out_ntt, {vdup_n-out_ntt-noext, a}, {vdup_n-in_ntt-noext, b} multi_fn = simd_extract, c, 0 a = 1 b = 2 validate 4 aarch64 = uqshl generate u8:i8:u8, u16:i16:u16, u32:i32:u32 /// Signed saturating shift left name = vqshl n-suffix constn = N multi_fn = static_assert_imm-out_bits_exp_len-N multi_fn = vqshl-self-noext, a, {vdup-nself-noext, N as _} a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 n = 2 validate 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 aarch64 = sqshl arm = vqshl generate int*_t, int64x*_t /// Signed saturating shift left name = vqshl n-suffix constn = N multi_fn = static_assert_imm-out_bits_exp_len-N multi_fn = simd_extract, {vqshl_n-in_ntt-::, {vdup_n-in_ntt-noext, a}}, 0 a = 1 n = 2 validate 4 aarch64 = sqshl generate i8, i16, i32, i64 /// Unsigned saturating shift left name = vqshl n-suffix constn = N multi_fn = static_assert_imm-out_bits_exp_len-N multi_fn = vqshl-self-noext, a, {vdup-nsigned-noext, N as _} a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 n = 2 validate 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 aarch64 = uqshl arm = vqshl generate uint*_t, uint64x*_t /// Unsigned saturating shift left name = vqshl n-suffix constn = N multi_fn = static_assert_imm-out_bits_exp_len-N 
multi_fn = simd_extract, {vqshl_n-in_ntt-::, {vdup_n-in_ntt-noext, a}}, 0 a = 1 n = 2 validate 4 aarch64 = uqshl generate u8, u16, u32, u64 /// Signed saturating shift left unsigned name = vqshlu n-suffix constn = N multi_fn = static_assert_imm-out_bits_exp_len-N a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 n = 2 validate 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 arm-aarch64-separate aarch64 = sqshlu link-aarch64 = sqshlu._EXT_ const-aarch64 = {dup-in_len-N as ttn} arm = vqshlu link-arm = vqshiftsu._EXT_ const-arm = N as ttn generate int8x8_t:uint8x8_t, int16x4_t:uint16x4_t, int32x2_t:uint32x2_t, int64x1_t:uint64x1_t generate int8x16_t:uint8x16_t, int16x8_t:uint16x8_t, int32x4_t:uint32x4_t, int64x2_t:uint64x2_t /// Signed saturating shift left unsigned name = vqshlu n-suffix constn = N multi_fn = static_assert_imm-out_bits_exp_len-N multi_fn = simd_extract, {vqshlu_n-in_ntt-::, {vdup_n-in_ntt-noext, a}}, 0 a = 1 n = 2 validate 4 aarch64 = sqshlu generate i8:u8, i16:u16, i32:u32, i64:u64 /// Signed saturating shift right narrow name = vqshrn noq-n-suffix constn = N multi_fn = static_assert-N-1-halfbits a = 0, 4, 8, 12, 16, 20, 24, 28 n = 2 validate 0, 1, 2, 3, 4, 5, 6, 7 arm-aarch64-separate aarch64 = sqshrn link-aarch64 = sqshrn._EXT2_ const-aarch64 = N generate i64:i32 arm = vqshrn link-arm = vqshiftns._EXT2_ const-arm = -N as ttn generate int16x8_t:int8x8_t, int32x4_t:int16x4_t, int64x2_t:int32x2_t /// Signed saturating shift right narrow name = vqshrn noq-n-suffix constn = N multi_fn = static_assert-N-1-halfbits multi_fn = simd_extract, {vqshrn_n-in_ntt-::, {vdupq_n-in_ntt-noext, a}}, 0 a = 4 n = 2 validate 1 aarch64 = sqshrn generate i16:i8, i32:i16 /// Signed saturating shift right narrow name = vqshrn_high noq-n-suffix constn = N multi_fn = static_assert-N-1-halfbits multi_fn = simd_shuffle-out_len-!, a, {vqshrn_n-noqself-::, b}, {asc-0-out_len} a = 0, 1, 8, 9, 8, 9, 10, 11 b = 32, 36, 40, 44, 48, 52, 56, 60 n = 2 validate 0, 1, 8, 9, 8, 9, 10, 11, 8, 9, 10, 11, 12, 13, 14, 15 aarch64 = sqshrn2 generate int8x8_t:int16x8_t:int8x16_t, int16x4_t:int32x4_t:int16x8_t, int32x2_t:int64x2_t:int32x4_t /// Unsigned saturating shift right narrow name = vqshrn noq-n-suffix constn = N multi_fn = static_assert-N-1-halfbits a = 0, 4, 8, 12, 16, 20, 24, 28 n = 2 validate 0, 1, 2, 3, 4, 5, 6, 7 arm-aarch64-separate aarch64 = uqshrn link-aarch64 = uqshrn._EXT2_ const-aarch64 = N generate u64:u32 arm = vqshrn link-arm = vqshiftnu._EXT2_ const-arm = -N as ttn generate uint16x8_t:uint8x8_t, uint32x4_t:uint16x4_t, uint64x2_t:uint32x2_t /// Unsigned saturating shift right narrow name = vqshrn noq-n-suffix constn = N multi_fn = static_assert-N-1-halfbits multi_fn = simd_extract, {vqshrn_n-in_ntt-::, {vdupq_n-in_ntt-noext, a}}, 0 a = 4 n = 2 validate 1 aarch64 = uqshrn generate u16:u8, u32:u16 /// Unsigned saturating shift right narrow name = vqshrn_high noq-n-suffix constn = N multi_fn = static_assert-N-1-halfbits multi_fn = simd_shuffle-out_len-!, a, {vqshrn_n-noqself-::, b}, {asc-0-out_len} a = 0, 1, 8, 9, 8, 9, 10, 11 b = 32, 36, 40, 44, 48, 52, 56, 60 n = 2 validate 0, 1, 8, 9, 8, 9, 10, 11, 8, 9, 10, 11, 12, 13, 14, 15 aarch64 = uqshrn2 generate uint8x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint16x8_t, uint32x2_t:uint64x2_t:uint32x4_t /// Signed saturating shift right unsigned narrow name = vqshrun noq-n-suffix constn = N multi_fn = static_assert-N-1-halfbits a = 0, 4, 8, 12, 16, 20, 24, 28 n = 2 validate 0, 1, 2, 3, 4, 5, 6, 7 arm-aarch64-separate aarch64 = 
sqshrun link-aarch64 = sqshrun._EXT2_ const-aarch64 = N arm = vqshrun link-arm = vqshiftnsu._EXT2_ const-arm = -N as ttn generate int16x8_t:uint8x8_t, int32x4_t:uint16x4_t, int64x2_t:uint32x2_t /// Signed saturating shift right unsigned narrow name = vqshrun noq-n-suffix constn = N multi_fn = static_assert-N-1-halfbits multi_fn = simd_extract, {vqshrun_n-in_ntt-::, {vdupq_n-in_ntt-noext, a}}, 0 a = 4 n = 2 validate 1 aarch64 = sqshrun generate i16:u8, i32:u16, i64:u32 /// Signed saturating shift right unsigned narrow name = vqshrun_high noq-n-suffix constn = N multi_fn = static_assert-N-1-halfbits multi_fn = simd_shuffle-out_len-!, a, {vqshrun_n-noqself-::, b}, {asc-0-out_len} a = 0, 1, 8, 9, 8, 9, 10, 11 b = 32, 36, 40, 44, 48, 52, 56, 60 n = 2 validate 0, 1, 8, 9, 8, 9, 10, 11, 8, 9, 10, 11, 12, 13, 14, 15 aarch64 = sqshrun2 generate uint8x8_t:int16x8_t:uint8x16_t, uint16x4_t:int32x4_t:uint16x8_t, uint32x2_t:int64x2_t:uint32x4_t /// Unsigned saturating accumulate of signed value name = vsqadd out-suffix multi_fn = simd_extract, {vsqadd-out_ntt-noext, {vdup_n-out_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0 a = 2 b = 2 validate 4 aarch64 = usqadd generate u8:i8:u8, u16:i16:u16 /// Unsigned saturating accumulate of signed value name = vsqadd out-suffix a = 2 b = 2 validate 4 aarch64 = usqadd link-aarch64 = usqadd._EXT_ generate u32:i32:u32, u64:i64:u64 /// Calculates the square root of each lane. name = vsqrt fn = simd_fsqrt a = 4.0, 9.0, 16.0, 25.0 validate 2.0, 3.0, 4.0, 5.0 aarch64 = fsqrt generate float*_t, float64x*_t /// Reciprocal square-root estimate. name = vrsqrte a = 1.0, 2.0, 3.0, 4.0 validate 0.998046875, 0.705078125, 0.576171875, 0.4990234375 aarch64 = frsqrte link-aarch64 = frsqrte._EXT_ generate float64x*_t, f32, f64 arm = vrsqrte link-arm = vrsqrte._EXT_ generate float*_t /// Unsigned reciprocal square root estimate name = vrsqrte a = 1, 2, 3, 4 validate 4294967295, 4294967295, 4294967295, 4294967295 aarch64 = ursqrte link-aarch64 = ursqrte._EXT_ arm = vrsqrte link-arm = vrsqrte._EXT_ generate uint32x2_t, uint32x4_t /// Floating-point reciprocal square root step name = vrsqrts a = 1.0, 2.0, 3.0, 4.0 b = 1.0, 2.0, 3.0, 4.0 validate 1., -0.5, -3.0, -6.5 aarch64 = frsqrts link-aarch64 = frsqrts._EXT_ generate float64x*_t, f32, f64 arm = vrsqrts link-arm = vrsqrts._EXT_ generate float*_t /// Reciprocal estimate. name = vrecpe a = 4.0, 3.0, 2.0, 1.0 validate 0.24951171875, 0.3330078125, 0.4990234375, 0.998046875 aarch64 = frecpe link-aarch64 = frecpe._EXT_ generate float64x*_t, f32, f64 arm = vrecpe link-arm = vrecpe._EXT_ generate float*_t /// Unsigned reciprocal estimate name = vrecpe a = 4, 3, 2, 1 validate 4294967295, 4294967295, 4294967295, 4294967295 aarch64 = urecpe link-aarch64 = urecpe._EXT_ arm = vrecpe link-arm = vrecpe._EXT_ generate uint32x2_t, uint32x4_t /// Floating-point reciprocal step name = vrecps a = 4.0, 3.0, 2.0, 1.0 b = 4.0, 3.0, 2.0, 1.0 validate -14., -7., -2., 1. 
aarch64 = frecps link-aarch64 = frecps._EXT_ generate float64x*_t, f32, f64 arm = vrecps link-arm = vrecps._EXT_ generate float*_t /// Floating-point reciprocal exponent name = vrecpx a = 4.0 validate 0.5 aarch64 = frecpx link-aarch64 = frecpx._EXT_ generate f32, f64 /// Vector reinterpret cast operation name = vreinterpret double-suffixes fn = transmute a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 aarch64 = nop generate poly64x1_t:int64x1_t, poly64x1_t:uint64x1_t, int64x1_t:poly64x1_t, uint64x1_t:poly64x1_t generate poly64x2_t:int64x2_t, poly64x2_t:uint64x2_t, int64x2_t:poly64x2_t, uint64x2_t:poly64x2_t arm = nop generate uint8x8_t:int8x8_t, poly8x8_t:int8x8_t, poly16x4_t:int16x4_t, uint16x4_t:int16x4_t, uint32x2_t:int32x2_t, uint64x1_t:int64x1_t generate uint8x16_t:int8x16_t, poly8x16_t:int8x16_t, poly16x8_t:int16x8_t, uint16x8_t:int16x8_t, uint32x4_t:int32x4_t, uint64x2_t:int64x2_t generate poly8x8_t:uint8x8_t, int8x8_t:uint8x8_t, poly16x4_t:uint16x4_t, int16x4_t:uint16x4_t, int32x2_t:uint32x2_t, int64x1_t:uint64x1_t generate poly8x16_t:uint8x16_t, int8x16_t:uint8x16_t, poly16x8_t:uint16x8_t, int16x8_t:uint16x8_t, int32x4_t:uint32x4_t, int64x2_t:uint64x2_t generate int8x8_t:poly8x8_t, uint8x8_t:poly8x8_t, int16x4_t:poly16x4_t, uint16x4_t:poly16x4_t generate int8x16_t:poly8x16_t, uint8x16_t:poly8x16_t, int16x8_t:poly16x8_t, uint16x8_t:poly16x8_t /// Vector reinterpret cast operation name = vreinterpret double-suffixes fn = transmute a = 0, 1, 2, 3, 4, 5, 6, 7 validate 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0 aarch64 = nop arm = nop generate int16x4_t:int8x8_t, uint16x4_t:int8x8_t, poly16x4_t:int8x8_t, int32x2_t:int16x4_t, uint32x2_t:int16x4_t, int64x1_t:int32x2_t, uint64x1_t:int32x2_t generate int16x8_t:int8x16_t, uint16x8_t:int8x16_t, poly16x8_t:int8x16_t, int32x4_t:int16x8_t, uint32x4_t:int16x8_t, int64x2_t:int32x4_t, uint64x2_t:int32x4_t generate poly16x4_t:uint8x8_t, int16x4_t:uint8x8_t, uint16x4_t:uint8x8_t, int32x2_t:uint16x4_t, uint32x2_t:uint16x4_t, int64x1_t:uint32x2_t, uint64x1_t:uint32x2_t generate poly16x8_t:uint8x16_t, int16x8_t:uint8x16_t, uint16x8_t:uint8x16_t, int32x4_t:uint16x8_t, uint32x4_t:uint16x8_t, int64x2_t:uint32x4_t, uint64x2_t:uint32x4_t generate poly16x4_t:poly8x8_t, int16x4_t:poly8x8_t, uint16x4_t:poly8x8_t, int32x2_t:poly16x4_t, uint32x2_t:poly16x4_t generate poly16x8_t:poly8x16_t, int16x8_t:poly8x16_t, uint16x8_t:poly8x16_t, int32x4_t:poly16x8_t, uint32x4_t:poly16x8_t target = aes generate poly64x1_t:int32x2_t, poly64x1_t:uint32x2_t generate poly64x2_t:int32x4_t, poly64x2_t:uint32x4_t generate p128:int64x2_t, p128:uint64x2_t, p128:poly64x2_t /// Vector reinterpret cast operation name = vreinterpret double-suffixes fn = transmute a = 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0 validate 0, 1, 2, 3, 4, 5, 6, 7 aarch64 = nop arm = nop generate poly8x8_t:int16x4_t, int8x8_t:int16x4_t, uint8x8_t:int16x4_t, poly16x4_t:int32x2_t, int16x4_t:int32x2_t, uint16x4_t:int32x2_t, int32x2_t:int64x1_t, uint32x2_t:int64x1_t generate poly8x16_t:int16x8_t, int8x16_t:int16x8_t, uint8x16_t:int16x8_t, poly16x8_t:int32x4_t, int16x8_t:int32x4_t, uint16x8_t:int32x4_t, int32x4_t:int64x2_t, uint32x4_t:int64x2_t generate poly8x8_t:uint16x4_t, int8x8_t:uint16x4_t, uint8x8_t:uint16x4_t, poly16x4_t:uint32x2_t, int16x4_t:uint32x2_t, uint16x4_t:uint32x2_t, int32x2_t:uint64x1_t, uint32x2_t:uint64x1_t generate poly8x16_t:uint16x8_t, int8x16_t:uint16x8_t, uint8x16_t:uint16x8_t, poly16x8_t:uint32x4_t, 
int16x8_t:uint32x4_t, uint16x8_t:uint32x4_t, int32x4_t:uint64x2_t, uint32x4_t:uint64x2_t generate poly8x8_t:poly16x4_t, int8x8_t:poly16x4_t, uint8x8_t:poly16x4_t generate poly8x16_t:poly16x8_t, int8x16_t:poly16x8_t, uint8x16_t:poly16x8_t target = aes generate int32x2_t:poly64x1_t, uint32x2_t:poly64x1_t generate int32x4_t:poly64x2_t, uint32x4_t:poly64x2_t generate int64x2_t:p128, uint64x2_t:p128, poly64x2_t:p128 /// Vector reinterpret cast operation name = vreinterpret double-suffixes fn = transmute a = 0, 1, 2, 3 validate 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0 aarch64 = nop arm = nop generate int32x2_t:int8x8_t, uint32x2_t:int8x8_t, int64x1_t:int16x4_t, uint64x1_t:int16x4_t generate int32x4_t:int8x16_t, uint32x4_t:int8x16_t, int64x2_t:int16x8_t, uint64x2_t:int16x8_t generate int32x2_t:uint8x8_t, uint32x2_t:uint8x8_t, int64x1_t:uint16x4_t, uint64x1_t:uint16x4_t generate int32x4_t:uint8x16_t, uint32x4_t:uint8x16_t, int64x2_t:uint16x8_t, uint64x2_t:uint16x8_t generate int32x2_t:poly8x8_t, uint32x2_t:poly8x8_t, int64x1_t:poly16x4_t, uint64x1_t:poly16x4_t generate int32x4_t:poly8x16_t, uint32x4_t:poly8x16_t, int64x2_t:poly16x8_t, uint64x2_t:poly16x8_t target = aes generate poly64x1_t:int16x4_t, poly64x1_t:uint16x4_t, poly64x1_t:poly16x4_t generate poly64x2_t:int16x8_t, poly64x2_t:uint16x8_t, poly64x2_t:poly16x8_t generate p128:int32x4_t, p128:uint32x4_t /// Vector reinterpret cast operation name = vreinterpret double-suffixes fn = transmute a = 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0 validate 0, 1, 2, 3 aarch64 = nop arm = nop generate poly8x8_t:int32x2_t, int8x8_t:int32x2_t, uint8x8_t:int32x2_t, poly16x4_t:int64x1_t, int16x4_t:int64x1_t, uint16x4_t:int64x1_t generate poly8x16_t:int32x4_t, int8x16_t:int32x4_t, uint8x16_t:int32x4_t, poly16x8_t:int64x2_t, int16x8_t:int64x2_t, uint16x8_t:int64x2_t generate poly8x8_t:uint32x2_t, int8x8_t:uint32x2_t, uint8x8_t:uint32x2_t, poly16x4_t:uint64x1_t, int16x4_t:uint64x1_t, uint16x4_t:uint64x1_t generate poly8x16_t:uint32x4_t, int8x16_t:uint32x4_t, uint8x16_t:uint32x4_t, poly16x8_t:uint64x2_t, int16x8_t:uint64x2_t, uint16x8_t:uint64x2_t target = aes generate poly16x4_t:poly64x1_t, int16x4_t:poly64x1_t, uint16x4_t:poly64x1_t generate poly16x8_t:poly64x2_t, int16x8_t:poly64x2_t, uint16x8_t:poly64x2_t generate int32x4_t:p128, uint32x4_t:p128 /// Vector reinterpret cast operation name = vreinterpret double-suffixes fn = transmute a = 0, 1 validate 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 aarch64 = nop arm = nop generate int64x1_t:int8x8_t, uint64x1_t:int8x8_t, int64x1_t:uint8x8_t, uint64x1_t:uint8x8_t, int64x1_t:poly8x8_t, uint64x1_t:poly8x8_t generate int64x2_t:int8x16_t, uint64x2_t:int8x16_t, int64x2_t:uint8x16_t, uint64x2_t:uint8x16_t, int64x2_t:poly8x16_t, uint64x2_t:poly8x16_t target = aes generate poly64x1_t:int8x8_t, poly64x1_t:uint8x8_t, poly64x1_t:poly8x8_t generate poly64x2_t:int8x16_t, poly64x2_t:uint8x16_t, poly64x2_t:poly8x16_t generate p128:int16x8_t, p128:uint16x8_t, p128:poly16x8_t /// Vector reinterpret cast operation name = vreinterpret double-suffixes fn = transmute a = 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 validate 0, 1 aarch64 = nop arm = nop generate poly8x8_t:int64x1_t, int8x8_t:int64x1_t, uint8x8_t:int64x1_t, poly8x8_t:uint64x1_t, int8x8_t:uint64x1_t, uint8x8_t:uint64x1_t generate poly8x16_t:int64x2_t, int8x16_t:int64x2_t, uint8x16_t:int64x2_t, poly8x16_t:uint64x2_t, int8x16_t:uint64x2_t, uint8x16_t:uint64x2_t target = aes generate poly8x8_t:poly64x1_t, int8x8_t:poly64x1_t, uint8x8_t:poly64x1_t generate 
poly8x16_t:poly64x2_t, int8x16_t:poly64x2_t, uint8x16_t:poly64x2_t generate int16x8_t:p128, uint16x8_t:p128, poly16x8_t:p128 /// Vector reinterpret cast operation name = vreinterpret double-suffixes fn = transmute a = 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 validate 1 target = aes aarch64 = nop arm = nop generate int8x16_t:p128, uint8x16_t:p128, poly8x16_t:p128 /// Vector reinterpret cast operation name = vreinterpret double-suffixes fn = transmute a = 1 validate 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 target = aes aarch64 = nop arm = nop generate p128:int8x16_t, p128:uint8x16_t, p128:poly8x16_t /// Vector reinterpret cast operation name = vreinterpret double-suffixes fn = transmute a = 0., 0., 0., 0., 0., 0., 0., 0. validate 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 aarch64 = nop generate float64x1_t:int8x8_t, float64x1_t:int16x4_t, float64x1_t:int32x2_t, float64x1_t:int64x1_t generate float64x2_t:int8x16_t, float64x2_t:int16x8_t, float64x2_t:int32x4_t, float64x2_t:int64x2_t generate float64x1_t:uint8x8_t, float64x1_t:uint16x4_t, float64x1_t:uint32x2_t, float64x1_t:uint64x1_t generate float64x2_t:uint8x16_t, float64x2_t:uint16x8_t, float64x2_t:uint32x4_t, float64x2_t:uint64x2_t generate float64x1_t:poly8x8_t, float64x1_t:poly16x4_t, float32x2_t:poly64x1_t, float64x1_t:poly64x1_t generate float64x2_t:poly8x16_t, float64x2_t:poly16x8_t, float32x4_t:poly64x2_t, float64x2_t:poly64x2_t generate float64x2_t:p128 arm = nop generate float32x2_t:int8x8_t, float32x2_t:int16x4_t, float32x2_t:int32x2_t, float32x2_t:int64x1_t generate float32x4_t:int8x16_t, float32x4_t:int16x8_t, float32x4_t:int32x4_t, float32x4_t:int64x2_t generate float32x2_t:uint8x8_t, float32x2_t:uint16x4_t, float32x2_t:uint32x2_t, float32x2_t:uint64x1_t generate float32x4_t:uint8x16_t, float32x4_t:uint16x8_t, float32x4_t:uint32x4_t, float32x4_t:uint64x2_t generate float32x2_t:poly8x8_t, float32x2_t:poly16x4_t generate float32x4_t:poly8x16_t, float32x4_t:poly16x8_t generate float32x4_t:p128 /// Vector reinterpret cast operation name = vreinterpret double-suffixes fn = transmute a = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 validate 0., 0., 0., 0., 0., 0., 0., 0. aarch64 = nop generate int8x8_t:float64x1_t, int16x4_t:float64x1_t, int32x2_t:float64x1_t, int64x1_t:float64x1_t generate int8x16_t:float64x2_t, int16x8_t:float64x2_t, int32x4_t:float64x2_t, int64x2_t:float64x2_t generate poly8x8_t:float64x1_t, uint16x4_t:float64x1_t, uint32x2_t:float64x1_t, uint64x1_t:float64x1_t generate poly8x16_t:float64x2_t, uint16x8_t:float64x2_t, uint32x4_t:float64x2_t, uint64x2_t:float64x2_t generate uint8x8_t:float64x1_t, poly16x4_t:float64x1_t, poly64x1_t:float64x1_t, poly64x1_t:float32x2_t generate uint8x16_t:float64x2_t, poly16x8_t:float64x2_t, poly64x2_t:float64x2_t, poly64x2_t:float32x4_t generate p128:float64x2_t arm = nop generate int8x8_t:float32x2_t, int16x4_t:float32x2_t, int32x2_t:float32x2_t, int64x1_t:float32x2_t generate int8x16_t:float32x4_t, int16x8_t:float32x4_t, int32x4_t:float32x4_t, int64x2_t:float32x4_t generate uint8x8_t:float32x2_t, uint16x4_t:float32x2_t, uint32x2_t:float32x2_t, uint64x1_t:float32x2_t generate uint8x16_t:float32x4_t, uint16x8_t:float32x4_t, uint32x4_t:float32x4_t, uint64x2_t:float32x4_t generate poly8x8_t:float32x2_t, poly16x4_t:float32x2_t generate poly8x16_t:float32x4_t, poly16x8_t:float32x4_t generate p128:float32x4_t /// Vector reinterpret cast operation name = vreinterpret double-suffixes fn = transmute a = 0., 0., 0., 0., 0., 0., 0., 0. validate 0., 0., 0., 0., 0., 0., 0., 0. 
aarch64 = nop generate float32x2_t:float64x1_t, float64x1_t:float32x2_t generate float32x4_t:float64x2_t, float64x2_t:float32x4_t /// Signed rounding shift left name = vrshl a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 validate 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 aarch64 = srshl link-aarch64 = srshl._EXT_ generate i64 arm = vrshl link-arm = vrshifts._EXT_ generate int*_t, int64x*_t /// Unsigned rounding shift left name = vrshl out-suffix a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 validate 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 aarch64 = urshl link-aarch64 = urshl._EXT_ generate u64:i64:u64 arm = vrshl link-arm = vrshiftu._EXT_ generate uint8x8_t:int8x8_t:uint8x8_t, uint8x16_t:int8x16_t:uint8x16_t, uint16x4_t:int16x4_t:uint16x4_t, uint16x8_t:int16x8_t:uint16x8_t generate uint32x2_t:int32x2_t:uint32x2_t, uint32x4_t:int32x4_t:uint32x4_t, uint64x1_t:int64x1_t:uint64x1_t, uint64x2_t:int64x2_t:uint64x2_t /// Signed rounding shift right name = vrshr n-suffix constn = N multi_fn = static_assert-N-1-bits multi_fn = vrshl-self-noext, a, {vdup-nself-noext, (-N) as _} a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 n = 2 validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 aarch64 = srshr arm = vrshr generate int*_t, int64x*_t /// Signed rounding shift right name = vrshr n-suffix constn = N multi_fn = static_assert-N-1-bits multi_fn = vrshl-self-noext, a, -N as i64 a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 n = 2 validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 aarch64 = srshr generate i64 /// Unsigned rounding shift right name = vrshr n-suffix constn = N multi_fn = static_assert-N-1-bits multi_fn = vrshl-self-noext, a, {vdup-nsigned-noext, (-N) as _} a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 n = 2 validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 aarch64 = urshr arm = vrshr generate uint*_t, uint64x*_t /// Unsigned rounding shift right name = vrshr n-suffix constn = N multi_fn = static_assert-N-1-bits multi_fn = vrshl-self-noext, a, -N as i64 a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 n = 2 validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 aarch64 = urshr generate u64 /// Rounding shift right narrow name = vrshrn noq-n-suffix constn = N multi_fn = static_assert-N-1-halfbits a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 n = 2 validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 arm-aarch64-separate aarch64 = rshrn link-aarch64 = rshrn._EXT2_ const-aarch64 = N arm = vrshrn link-arm = vrshiftn._EXT2_ const-arm = -N as ttn generate int16x8_t:int8x8_t, int32x4_t:int16x4_t, int64x2_t:int32x2_t /// Rounding shift right narrow name = vrshrn noq-n-suffix constn = N multi_fn = static_assert-N-1-halfbits multi_fn = transmute, {vrshrn_n-noqsigned-::, transmute(a)} a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 n = 2 validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 aarch64 = rshrn arm = vrshrn generate uint16x8_t:uint8x8_t, uint32x4_t:uint16x4_t, uint64x2_t:uint32x2_t /// Rounding shift right narrow name = vrshrn_high noq-n-suffix constn = N multi_fn = static_assert-N-1-halfbits multi_fn = simd_shuffle-out_len-!, a, {vrshrn_n-noqself-::, b}, {asc-0-out_len} a = 0, 1, 8, 9, 8, 9, 10, 11 b = 32, 36, 40, 44, 48, 52, 56, 60 n = 2 validate 0, 1, 8, 9, 
8, 9, 10, 11, 8, 9, 10, 11, 12, 13, 14, 15 aarch64 = rshrn2 generate int8x8_t:int16x8_t:int8x16_t, int16x4_t:int32x4_t:int16x8_t, int32x2_t:int64x2_t:int32x4_t generate uint8x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint16x8_t, uint32x2_t:uint64x2_t:uint32x4_t /// Signed rounding shift right and accumulate name = vrsra n-suffix constn = N multi_fn = static_assert-N-1-bits multi_fn = simd_add, a, {vrshr-nself-::, b} a = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 b = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 n = 2 validate 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 aarch64 = srsra arm = vrsra generate int*_t, int64x*_t /// Unsigned rounding shift right and accumulate name = vrsra n-suffix constn = N multi_fn = static_assert-N-1-bits multi_fn = simd_add, a, {vrshr-nself-::, b} a = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 b = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 n = 2 validate 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 aarch64 = ursra arm = vrsra generate uint*_t, uint64x*_t /// Signed rounding shift right and accumulate. name = vrsra n-suffix constn = N multi_fn = static_assert-N-1-bits multi_fn = vrshr-nself-::, b:in_t, b multi_fn = a.wrapping_add(b) a = 1 b = 4 n = 2 validate 2 aarch64 = srsra generate i64 /// Unsigned rounding shift right and accumulate. name = vrsra n-suffix constn = N multi_fn = static_assert-N-1-bits multi_fn = vrshr-nself-::, b:in_t, b multi_fn = a.wrapping_add(b) a = 1 b = 4 n = 2 validate 2 aarch64 = ursra generate u64 /// Rounding subtract returning high narrow name = vrsubhn no-q a = MAX, MIN, 0, 4, 5, 6, 7, 8 b = 1, 2, 3, 4, 5, 6, 7, 8 validate MIN, MIN, 0, 0, 0, 0, 0, 0 aarch64 = rsubhn link-aarch64 = rsubhn._EXT2_ arm = vrsubhn link-arm = vrsubhn._EXT2_ generate int16x8_t:int16x8_t:int8x8_t, int32x4_t:int32x4_t:int16x4_t, int64x2_t:int64x2_t:int32x2_t /// Rounding subtract returning high narrow name = vrsubhn no-q multi_fn = transmute, {vrsubhn-noqsigned-noext, {transmute, a}, {transmute, b}} a = MAX, MIN, 3, 4, 5, 6, 7, 8 b = 1, 2, 3, 4, 5, 6, 7, 8 validate 0, 0, 0, 0, 0, 0, 0, 0 aarch64 = rsubhn arm = vrsubhn generate uint16x8_t:uint16x8_t:uint8x8_t, uint32x4_t:uint32x4_t:uint16x4_t, uint64x2_t:uint64x2_t:uint32x2_t /// Rounding subtract returning high narrow name = vrsubhn_high no-q multi_fn = vrsubhn-noqself-noext, x:in_t0, b, c multi_fn = simd_shuffle-out_len-!, a, x, {asc-0-out_len} a = 1, 2, 0, 0, 0, 0, 0, 0 b = 1, 2, 3, 4, 5, 6, 7, 8 c = 1, 2, 3, 4, 5, 6, 7, 8 validate 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 aarch64 = rsubhn2 generate int8x8_t:int16x8_t:int16x8_t:int8x16_t, int16x4_t:int32x4_t:int32x4_t:int16x8_t, int32x2_t:int64x2_t:int64x2_t:int32x4_t generate uint8x8_t:uint16x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint32x4_t:uint16x8_t, uint32x2_t:uint64x2_t:uint64x2_t:uint32x4_t /// Insert vector element from another vector element name = vset_lane constn = LANE multi_fn = static_assert_imm-in_exp_len-LANE multi_fn = simd_insert, b, LANE as u32, a a = 1 b = 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 n = 0 validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 aarch64 = nop arm = nop generate i8:int8x8_t:int8x8_t, i16:int16x4_t:int16x4_t generate i32:int32x2_t:int32x2_t, i64:int64x1_t:int64x1_t generate u8:uint8x8_t:uint8x8_t, u16:uint16x4_t:uint16x4_t generate u32:uint32x2_t:uint32x2_t, u64:uint64x1_t:uint64x1_t generate p8:poly8x8_t:poly8x8_t, p16:poly16x4_t:poly16x4_t target = aes generate p64:poly64x1_t:poly64x1_t /// Insert
vector element from another vector element name = vsetq_lane no-q constn = LANE multi_fn = static_assert_imm-in_exp_len-LANE multi_fn = simd_insert, b, LANE as u32, a a = 1 b = 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 n = 0 validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 aarch64 = nop arm = nop generate i8:int8x16_t:int8x16_t, i16:int16x8_t:int16x8_t generate i32:int32x4_t:int32x4_t, i64:int64x2_t:int64x2_t generate u8:uint8x16_t:uint8x16_t, u16:uint16x8_t:uint16x8_t generate u32:uint32x4_t:uint32x4_t, u64:uint64x2_t:uint64x2_t generate p8:poly8x16_t:poly8x16_t, p16:poly16x8_t:poly16x8_t target = aes generate p64:poly64x2_t:poly64x2_t /// Insert vector element from another vector element name = vset_lane constn = LANE multi_fn = static_assert_imm-in_exp_len-LANE multi_fn = simd_insert, b, LANE as u32, a a = 1. b = 0., 2., 3., 4. n = 0 validate 1., 2., 3., 4. aarch64 = nop generate f64:float64x1_t:float64x1_t arm = nop generate f32:float32x2_t:float32x2_t /// Insert vector element from another vector element name = vsetq_lane no-q constn = LANE multi_fn = static_assert_imm-in_exp_len-LANE multi_fn = simd_insert, b, LANE as u32, a a = 1. b = 0., 2., 3., 4. n = 0 validate 1., 2., 3., 4. aarch64 = nop generate f64:float64x2_t:float64x2_t arm = nop generate f32:float32x4_t:float32x4_t /// Signed Shift left name = vshl a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 validate 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 aarch64 = sshl link-aarch64 = sshl._EXT_ arm = vshl link-arm = vshifts._EXT_ generate int*_t, int64x*_t /// Signed Shift left name = vshl multi_fn = transmute, {vshl-in_ntt-noext, transmute(a), transmute(b)} a = 1 b = 2 validate 4 aarch64 = sshl generate i64 /// Unsigned Shift left name = vshl out-suffix a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 validate 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 aarch64 = ushl link-aarch64 = ushl._EXT_ arm = vshl link-arm = vshiftu._EXT_ generate uint8x8_t:int8x8_t:uint8x8_t, uint8x16_t:int8x16_t:uint8x16_t, uint16x4_t:int16x4_t:uint16x4_t, uint16x8_t:int16x8_t:uint16x8_t generate uint32x2_t:int32x2_t:uint32x2_t, uint32x4_t:int32x4_t:uint32x4_t, uint64x1_t:int64x1_t:uint64x1_t, uint64x2_t:int64x2_t:uint64x2_t /// Unsigned Shift left out-suffix name = vshl multi_fn = transmute, {vshl-out_ntt-noext, transmute(a), transmute(b)} a = 1 b = 2 validate 4 aarch64 = ushl generate u64:i64:u64 /// Shift left name = vshl n-suffix constn = N multi_fn = static_assert_imm-out_bits_exp_len-N multi_fn = simd_shl, a, {vdup-nself-noext, N as _} a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 n = 2 validate 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 arm = vshl aarch64 = shl generate int*_t, uint*_t, int64x*_t, uint64x*_t /// Signed shift left long name = vshll n-suffix constn = N multi_fn = static_assert-N-0-bits multi_fn = simd_shl, {simd_cast, a}, {vdup-nout-noext, N as _} a = 1, 2, 3, 4, 5, 6, 7, 8 n = 2 validate 4, 8, 12, 16, 20, 24, 28, 32 arm = vshll.s aarch64 = sshll generate int8x8_t:int16x8_t, int16x4_t:int32x4_t, int32x2_t:int64x2_t aarch64 = ushll generate uint8x8_t:uint16x8_t, uint16x4_t:uint32x4_t, uint32x2_t:uint64x2_t /// Signed shift left long name = vshll_high_n no-q constn = N multi_fn = static_assert-N-0-bits multi_fn = simd_shuffle-out_len-!, b:half, a, a, {asc-halflen-halflen} multi_fn = vshll_n-noqself-::, b a = 0, 0, 1, 2, 1, 2, 
3, 4, 1, 2, 3, 4, 5, 6, 7, 8 n = 2 validate 4, 8, 12, 16, 20, 24, 28, 32 aarch64 = sshll2 generate int8x16_t:int16x8_t, int16x8_t:int32x4_t, int32x4_t:int64x2_t aarch64 = ushll2 generate uint8x16_t:uint16x8_t, uint16x8_t:uint32x4_t, uint32x4_t:uint64x2_t /// Shift right name = vshr n-suffix constn = N multi_fn = static_assert-N-1-bits multi_fn = fix_right_shift_imm-N-bits multi_fn = simd_shr, a, {vdup-nself-noext, n as _} a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 n = 2 validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 arm = vshr.s aarch64 = sshr generate int*_t, int64x*_t aarch64 = ushr generate uint*_t, uint64x*_t /// Shift right narrow name = vshrn_n no-q constn = N multi_fn = static_assert-N-1-halfbits multi_fn = simd_cast, {simd_shr, a, {vdup-nself-noext, N as _}} a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 n = 2 validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 arm = vshrn. aarch64 = shrn generate int16x8_t:int8x8_t, int32x4_t:int16x4_t, int64x2_t:int32x2_t generate uint16x8_t:uint8x8_t, uint32x4_t:uint16x4_t, uint64x2_t:uint32x2_t /// Shift right narrow name = vshrn_high_n no-q constn = N multi_fn = static_assert-N-1-halfbits multi_fn = simd_shuffle-out_len-!, a, {vshrn_n-noqself-::, b}, {asc-0-out_len} a = 1, 2, 5, 6, 5, 6, 7, 8 b = 20, 24, 28, 32, 52, 56, 60, 64 n = 2 validate 1, 2, 5, 6, 5, 6, 7, 8, 5, 6, 7, 8, 13, 14, 15, 16 aarch64 = shrn2 generate int8x8_t:int16x8_t:int8x16_t, int16x4_t:int32x4_t:int16x8_t, int32x2_t:int64x2_t:int32x4_t generate uint8x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint16x8_t, uint32x2_t:uint64x2_t:uint32x4_t /// Signed shift right and accumulate name = vsra n-suffix constn = N multi_fn = static_assert-N-1-bits multi_fn = simd_add, a, {vshr-nself-::, b} a = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 b = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 n = 2 validate 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 aarch64 = ssra arm = vsra generate int*_t, int64x*_t /// Unsigned shift right and accumulate name = vsra n-suffix constn = N multi_fn = static_assert-N-1-bits multi_fn = simd_add, a, {vshr-nself-::, b} a = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 b = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 n = 2 validate 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 aarch64 = usra arm = vsra generate uint*_t, uint64x*_t /// SM3PARTW1 name = vsm3partw1 a = 1, 2, 3, 4 b = 1, 2, 3, 4 c = 1, 2, 3, 4 validate 2147549312, 3221323968, 131329, 2684362752 target = sm4 aarch64 = sm3partw1 link-aarch64 = llvm.aarch64.crypto.sm3partw1 generate uint32x4_t /// SM3PARTW2 name = vsm3partw2 a = 1, 2, 3, 4 b = 1, 2, 3, 4 c = 1, 2, 3, 4 validate 128, 256, 384, 1077977696 target = sm4 aarch64 = sm3partw2 link-aarch64 = llvm.aarch64.crypto.sm3partw2 generate uint32x4_t /// SM3SS1 name = vsm3ss1 a = 1, 2, 3, 4 b = 1, 2, 3, 4 c = 1, 2, 3, 4 validate 0, 0, 0, 2098176 target = sm4 aarch64 = sm3ss1 link-aarch64 = llvm.aarch64.crypto.sm3ss1 generate uint32x4_t /// SM4 key name = vsm4ekey a = 1, 2, 3, 4 b = 1, 2, 3, 4 validate 1784948604, 136020997, 2940231695, 3789947679 target = sm4 aarch64 = sm4ekey link-aarch64 = llvm.aarch64.crypto.sm4ekey generate uint32x4_t /// SM4 encode name = vsm4e a = 1, 2, 3, 4 b = 1, 2, 3, 4 validate 1093874472, 3616769504, 3878330411, 2765298765 target = sm4 aarch64 = sm4e link-aarch64 = llvm.aarch64.crypto.sm4e generate uint32x4_t /// Rotate and exclusive OR name = vrax1 a = 1, 2 b = 3, 4 validate 7, 10 target = sha3 
aarch64 = rax1 link-aarch64 = llvm.aarch64.crypto.rax1 generate uint64x2_t /// SHA512 hash update part 1 name = vsha512h a = 1, 2 b = 3, 4 c = 5, 6 validate 11189044327219203, 7177611956453380 target = sha3 aarch64 = sha512h link-aarch64 = llvm.aarch64.crypto.sha512h generate uint64x2_t /// SHA512 hash update part 2 name = vsha512h2 a = 1, 2 b = 3, 4 c = 5, 6 validate 5770237651009406214, 349133864969 target = sha3 aarch64 = sha512h2 link-aarch64 = llvm.aarch64.crypto.sha512h2 generate uint64x2_t /// SHA512 schedule update 0 name = vsha512su0 a = 1, 2 b = 3, 4 validate 144115188075855874, 9439544818968559619 target = sha3 aarch64 = sha512su0 link-aarch64 = llvm.aarch64.crypto.sha512su0 generate uint64x2_t /// SHA512 schedule update 1 name = vsha512su1 a = 1, 2 b = 3, 4 c = 5, 6 validate 105553116266526, 140737488355368 target = sha3 aarch64 = sha512su1 link-aarch64 = llvm.aarch64.crypto.sha512su1 generate uint64x2_t /// Floating-point round to 32-bit integer, using current rounding mode name = vrnd32x a = 1.1, 1.9, -1.7, -2.3 validate 1.0, 2.0, -2.0, -2.0 target = frintts aarch64 = frint32x link-aarch64 = frint32x._EXT_ generate float32x2_t, float32x4_t /// Floating-point round to 32-bit integer toward zero name = vrnd32z a = 1.1, 1.9, -1.7, -2.3 validate 1.0, 1.0, -1.0, -2.0 target = frintts aarch64 = frint32z link-aarch64 = frint32z._EXT_ generate float32x2_t, float32x4_t /// Floating-point round to 64-bit integer, using current rounding mode name = vrnd64x a = 1.1, 1.9, -1.7, -2.3 validate 1.0, 2.0, -2.0, -2.0 target = frintts aarch64 = frint64x link-aarch64 = frint64x._EXT_ generate float32x2_t, float32x4_t /// Floating-point round to 64-bit integer toward zero name = vrnd64z a = 1.1, 1.9, -1.7, -2.3 validate 1.0, 1.0, -1.0, -2.0 target = frintts aarch64 = frint64z link-aarch64 = frint64z._EXT_ generate float32x2_t, float32x4_t /// Transpose elements name = vtrn multi_fn = simd_shuffle-in_len-!, a1:in_t, a, b, {transpose-1-in_len} multi_fn = simd_shuffle-in_len-!, b1:in_t, a, b, {transpose-2-in_len} multi_fn = transmute, (a1, b1) a = 0, 2, 2, 6, 2, 10, 6, 14, 2, 18, 6, 22, 10, 26, 14, 30 b = 1, 3, 3, 7, 3, 1, 7, 15, 3, 19, 7, 23, 1, 27, 15, 31 validate 0, 1, 2, 3, 2, 3, 6, 7, 2, 3, 6, 7, 10, 1, 14, 15, 2, 3, 6, 7, 10, 1, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 aarch64 = trn arm = vtrn generate int8x8_t:int8x8_t:int8x8x2_t, int16x4_t:int16x4_t:int16x4x2_t, int8x16_t:int8x16_t:int8x16x2_t, int16x8_t:int16x8_t:int16x8x2_t, int32x4_t:int32x4_t:int32x4x2_t generate uint8x8_t:uint8x8_t:uint8x8x2_t, uint16x4_t:uint16x4_t:uint16x4x2_t, uint8x16_t:uint8x16_t:uint8x16x2_t, uint16x8_t:uint16x8_t:uint16x8x2_t, uint32x4_t:uint32x4_t:uint32x4x2_t generate poly8x8_t:poly8x8_t:poly8x8x2_t, poly16x4_t:poly16x4_t:poly16x4x2_t, poly8x16_t:poly8x16_t:poly8x16x2_t, poly16x8_t:poly16x8_t:poly16x8x2_t aarch64 = zip generate int32x2_t:int32x2_t:int32x2x2_t, uint32x2_t:uint32x2_t:uint32x2x2_t /// Transpose elements name = vtrn multi_fn = simd_shuffle-in_len-!, a1:in_t, a, b, {transpose-1-in_len} multi_fn = simd_shuffle-in_len-!, b1:in_t, a, b, {transpose-2-in_len} multi_fn = transmute, (a1, b1) a = 0., 2., 2., 6. b = 1., 3., 3., 7. validate 0., 1., 2., 3., 2., 3., 6., 7. 
aarch64 = zip arm = vtrn generate float32x2_t:float32x2_t:float32x2x2_t aarch64 = trn generate float32x4_t:float32x4_t:float32x4x2_t /// Transpose vectors name = vtrn1 multi_fn = simd_shuffle-in_len-!, a, b, {transpose-1-in_len} a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 b = 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 validate 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 aarch64 = trn1 generate int8x8_t, int8x16_t, int16x4_t, int16x8_t, int32x4_t, uint8x8_t, uint8x16_t, uint16x4_t, uint16x8_t, uint32x4_t, poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t aarch64 = zip1 generate int32x2_t, int64x2_t, uint32x2_t, uint64x2_t, poly64x2_t /// Transpose vectors name = vtrn1 multi_fn = simd_shuffle-in_len-!, a, b, {transpose-1-in_len} a = 0., 2., 4., 6., 8., 10., 12., 14. b = 1., 3., 5., 7., 9., 11., 13., 15. validate 0., 1., 4., 5., 8., 9., 12., 13. aarch64 = trn1 generate float32x4_t aarch64 = zip1 generate float32x2_t, float64x2_t /// Transpose vectors name = vtrn2 multi_fn = simd_shuffle-in_len-!, a, b, {transpose-2-in_len} a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 b = 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 validate 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 aarch64 = trn2 generate int8x8_t, int8x16_t, int16x4_t, int16x8_t, int32x4_t, uint8x8_t, uint8x16_t, uint16x4_t, uint16x8_t, uint32x4_t, poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t aarch64 = zip2 generate int32x2_t, int64x2_t, uint32x2_t, uint64x2_t, poly64x2_t /// Transpose vectors name = vtrn2 multi_fn = simd_shuffle-in_len-!, a, b, {transpose-2-in_len} a = 0., 2., 4., 6., 8., 10., 12., 14. b = 1., 3., 5., 7., 9., 11., 13., 15. validate 2., 3., 6., 7., 10., 11., 14., 15. aarch64 = trn2 generate float32x4_t aarch64 = zip2 generate float32x2_t, float64x2_t /// Zip vectors name = vzip multi_fn = simd_shuffle-in_len-!, a0:in_t, a, b, {zip-1-in_len} multi_fn = simd_shuffle-in_len-!, b0:in_t, a, b, {zip-2-in_len} multi_fn = transmute, (a0, b0) a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 b = 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 aarch64 = zip arm = vzip generate int8x8_t:int8x8_t:int8x8x2_t, int16x4_t:int16x4_t:int16x4x2_t generate uint8x8_t:uint8x8_t:uint8x8x2_t, uint16x4_t:uint16x4_t:uint16x4x2_t generate poly8x8_t:poly8x8_t:poly8x8x2_t, poly16x4_t:poly16x4_t:poly16x4x2_t arm = vtrn generate int32x2_t:int32x2_t:int32x2x2_t, uint32x2_t:uint32x2_t:uint32x2x2_t aarch64 = ext arm = vorr generate int8x16_t:int8x16_t:int8x16x2_t, int16x8_t:int16x8_t:int16x8x2_t, int32x4_t:int32x4_t:int32x4x2_t generate uint8x16_t:uint8x16_t:uint8x16x2_t, uint16x8_t:uint16x8_t:uint16x8x2_t, uint32x4_t:uint32x4_t:uint32x4x2_t generate poly8x16_t:poly8x16_t:poly8x16x2_t, poly16x8_t:poly16x8_t:poly16x8x2_t /// Zip vectors name = vzip multi_fn = simd_shuffle-in_len-!, a0:in_t, a, b, {zip-1-in_len} multi_fn = simd_shuffle-in_len-!, b0:in_t, a, b, {zip-2-in_len} multi_fn = transmute, (a0, b0) a = 1., 2., 3., 4. b = 5., 6., 7., 8. validate 1., 5., 2., 6., 3., 7., 4., 8. 
aarch64 = zip arm = vtrn generate float32x2_t:float32x2_t:float32x2x2_t aarch64 = ext arm = vorr generate float32x4_t:float32x4_t:float32x4x2_t /// Zip vectors name = vzip1 multi_fn = simd_shuffle-in_len-!, a, b, {zip-1-in_len} a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 b = 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 aarch64 = zip1 generate int*_t, int64x2_t, uint*_t, uint64x2_t, poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t, poly64x2_t /// Zip vectors name = vzip1 multi_fn = simd_shuffle-in_len-!, a, b, {zip-1-in_len} a = 0., 2., 4., 6., 8., 10., 12., 14. b = 1., 3., 5., 7., 9., 11., 13., 15. validate 0., 1., 2., 3., 4., 5., 6., 7. aarch64 = zip1 generate float32x2_t, float32x4_t, float64x2_t /// Zip vectors name = vzip2 multi_fn = simd_shuffle-in_len-!, a, b, {zip-2-in_len} a = 0, 16, 16, 18, 16, 18, 20, 22, 16, 18, 20, 22, 24, 26, 28, 30 b = 1, 17, 17, 19, 17, 19, 21, 23, 17, 19, 21, 23, 25, 27, 29, 31 validate 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 aarch64 = zip2 generate int*_t, int64x2_t, uint*_t, uint64x2_t, poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t, poly64x2_t /// Zip vectors name = vzip2 multi_fn = simd_shuffle-in_len-!, a, b, {zip-2-in_len} a = 0., 8., 8., 10., 8., 10., 12., 14. b = 1., 9., 9., 11., 9., 11., 13., 15. validate 8., 9., 10., 11., 12., 13., 14., 15. aarch64 = zip2 generate float32x2_t, float32x4_t, float64x2_t /// Unzip vectors name = vuzp multi_fn = simd_shuffle-in_len-!, a0:in_t, a, b, {unzip-1-in_len} multi_fn = simd_shuffle-in_len-!, b0:in_t, a, b, {unzip-2-in_len} multi_fn = transmute, (a0, b0) a = 1, 2, 2, 3, 2, 3, 3, 8, 2, 3, 3, 8, 3, 15, 8, 16 b = 2, 3, 3, 8, 3, 15, 8, 16, 3, 29, 8, 30, 15, 31, 16, 32 validate 1, 2, 2, 3, 2, 3, 3, 8, 2, 3, 3, 8, 3, 8, 15, 16, 2, 3, 3, 8, 3, 8, 15, 16, 3, 8, 15, 16, 29, 30, 31, 32 aarch64 = uzp arm = vuzp generate int8x8_t:int8x8_t:int8x8x2_t, int16x4_t:int16x4_t:int16x4x2_t, int8x16_t:int8x16_t:int8x16x2_t, int16x8_t:int16x8_t:int16x8x2_t, int32x4_t:int32x4_t:int32x4x2_t generate uint8x8_t:uint8x8_t:uint8x8x2_t, uint16x4_t:uint16x4_t:uint16x4x2_t, uint8x16_t:uint8x16_t:uint8x16x2_t, uint16x8_t:uint16x8_t:uint16x8x2_t, uint32x4_t:uint32x4_t:uint32x4x2_t generate poly8x8_t:poly8x8_t:poly8x8x2_t, poly16x4_t:poly16x4_t:poly16x4x2_t, poly8x16_t:poly8x16_t:poly8x16x2_t, poly16x8_t:poly16x8_t:poly16x8x2_t aarch64 = zip arm = vtrn generate int32x2_t:int32x2_t:int32x2x2_t, uint32x2_t:uint32x2_t:uint32x2x2_t /// Unzip vectors name = vuzp multi_fn = simd_shuffle-in_len-!, a0:in_t, a, b, {unzip-1-in_len} multi_fn = simd_shuffle-in_len-!, b0:in_t, a, b, {unzip-2-in_len} multi_fn = transmute, (a0, b0) a = 1., 2., 2., 4. b = 2., 6., 6., 8. validate 1., 2., 2., 6., 2., 4., 6., 8. 
aarch64 = zip arm = vtrn generate float32x2_t:float32x2_t:float32x2x2_t aarch64 = uzp arm = vuzp generate float32x4_t:float32x4_t:float32x4x2_t /// Unzip vectors name = vuzp1 multi_fn = simd_shuffle-in_len-!, a, b, {unzip-1-in_len} a = 1, 0, 2, 0, 2, 0, 3, 0, 2, 0, 3, 0, 7, 0, 8, 0 b = 2, 0, 3, 0, 7, 0, 8, 0, 13, 0, 14, 0, 15, 0, 16, 0 validate 1, 2, 2, 3, 2, 3, 7, 8, 2, 3, 7, 8, 13, 14, 15, 16 aarch64 = uzp1 generate int8x8_t, int8x16_t, int16x4_t, int16x8_t, int32x4_t, uint8x8_t, uint8x16_t, uint16x4_t, uint16x8_t, uint32x4_t, poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t aarch64 = zip1 generate int32x2_t, int64x2_t, uint32x2_t, uint64x2_t, poly64x2_t /// Unzip vectors name = vuzp1 multi_fn = simd_shuffle-in_len-!, a, b, {unzip-1-in_len} a = 0., 8., 1., 9., 4., 12., 5., 13. b = 1., 10., 3., 11., 6., 14., 7., 15. validate 0., 1., 1., 3., 4., 5., 6., 7. aarch64 = uzp1 generate float32x4_t aarch64 = zip1 generate float32x2_t, float64x2_t /// Unzip vectors name = vuzp2 multi_fn = simd_shuffle-in_len-!, a, b, {unzip-2-in_len} a = 0, 17, 0, 18, 0, 18, 0, 19, 0, 18, 0, 19, 0, 23, 0, 24 b = 0, 18, 0, 19, 0, 23, 0, 24, 0, 29, 0, 30, 0, 31, 0, 32 validate 17, 18, 18, 19, 18, 19, 23, 24, 18, 19, 23, 24, 29, 30, 31, 32 aarch64 = uzp2 generate int8x8_t, int8x16_t, int16x4_t, int16x8_t, int32x4_t, uint8x8_t, uint8x16_t, uint16x4_t, uint16x8_t, uint32x4_t, poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t aarch64 = zip2 generate int32x2_t, int64x2_t, uint32x2_t, uint64x2_t, poly64x2_t /// Unzip vectors name = vuzp2 multi_fn = simd_shuffle-in_len-!, a, b, {unzip-2-in_len} a = 0., 8., 1., 9., 4., 12., 5., 13. b = 2., 9., 3., 11., 6., 14., 7., 15. validate 8., 9., 9., 11., 12., 13., 14., 15. aarch64 = uzp2 generate float32x4_t aarch64 = zip2 generate float32x2_t, float64x2_t //////////////////// // Unsigned Absolute difference and Accumulate Long //////////////////// /// Unsigned Absolute difference and Accumulate Long name = vabal multi_fn = vabd-unsigned-noext, b, c, d:in_t multi_fn = simd_add, a, {simd_cast, d} a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 c = 10, 10, 10, 10, 10, 10, 10, 10, 20, 0, 2, 4, 6, 8, 10, 12 validate 10, 10, 10, 10, 10, 10, 10, 10, 20, 20, 20, 20, 20, 20, 20, 20 arm = vabal.s aarch64 = uabal generate uint16x8_t:uint8x8_t:uint8x8_t:uint16x8_t, uint32x4_t:uint16x4_t:uint16x4_t:uint32x4_t, uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t /// Unsigned Absolute difference and Accumulate Long name = vabal_high no-q multi_fn = simd_shuffle8!, d:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15] multi_fn = simd_shuffle8!, e:uint8x8_t, c, c, [8, 9, 10, 11, 12, 13, 14, 15] multi_fn = vabd_u8, d, e, f:uint8x8_t multi_fn = simd_add, a, {simd_cast, f} a = 9, 10, 11, 12, 13, 14, 15, 16 b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 c = 10, 10, 10, 10, 10, 10, 10, 10, 20, 0, 2, 4, 6, 8, 10, 12 validate 20, 20, 20, 20, 20, 20, 20, 20 aarch64 = uabal generate uint16x8_t:uint8x16_t:uint8x16_t:uint16x8_t /// Unsigned Absolute difference and Accumulate Long name = vabal_high no-q multi_fn = simd_shuffle4!, d:uint16x4_t, b, b, [4, 5, 6, 7] multi_fn = simd_shuffle4!, e:uint16x4_t, c, c, [4, 5, 6, 7] multi_fn = vabd_u16, d, e, f:uint16x4_t multi_fn = simd_add, a, {simd_cast, f} a = 9, 10, 11, 12 b = 1, 2, 3, 4, 9, 10, 11, 12 c = 10, 10, 10, 10, 20, 0, 2, 4 validate 20, 20, 20, 20 aarch64 = uabal generate uint32x4_t:uint16x8_t:uint16x8_t:uint32x4_t /// Unsigned Absolute difference and Accumulate Long name = vabal_high no-q multi_fn 
= simd_shuffle2!, d:uint32x2_t, b, b, [2, 3] multi_fn = simd_shuffle2!, e:uint32x2_t, c, c, [2, 3] multi_fn = vabd_u32, d, e, f:uint32x2_t multi_fn = simd_add, a, {simd_cast, f} a = 15, 16 b = 1, 2, 15, 16 c = 10, 10, 10, 12 validate 20, 20 aarch64 = uabal generate uint64x2_t:uint32x4_t:uint32x4_t:uint64x2_t //////////////////// // Signed Absolute difference and Accumulate Long //////////////////// /// Signed Absolute difference and Accumulate Long name = vabal multi_fn = vabd-signed-noext, b, c, d:int8x8_t multi_fn = simd_cast, e:uint8x8_t, d multi_fn = simd_add, a, {simd_cast, e} a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 c = 10, 10, 10, 10, 10, 10, 10, 10, 20, 0, 2, 4, 6, 8, 10, 12 validate 10, 10, 10, 10, 10, 10, 10, 10, 20, 20, 20, 20, 20, 20, 20, 20 arm = vabal.s aarch64 = sabal generate int16x8_t:int8x8_t:int8x8_t:int16x8_t /// Signed Absolute difference and Accumulate Long name = vabal multi_fn = vabd-signed-noext, b, c, d:int16x4_t multi_fn = simd_cast, e:uint16x4_t, d multi_fn = simd_add, a, {simd_cast, e} a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 c = 10, 10, 10, 10, 10, 10, 10, 10, 20, 0, 2, 4, 6, 8, 10, 12 validate 10, 10, 10, 10, 10, 10, 10, 10, 20, 20, 20, 20, 20, 20, 20, 20 arm = vabal.s aarch64 = sabal generate int32x4_t:int16x4_t:int16x4_t:int32x4_t /// Signed Absolute difference and Accumulate Long name = vabal multi_fn = vabd-signed-noext, b, c, d:int32x2_t multi_fn = simd_cast, e:uint32x2_t, d multi_fn = simd_add, a, {simd_cast, e} a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 c = 10, 10, 10, 10, 10, 10, 10, 10, 20, 0, 2, 4, 6, 8, 10, 12 validate 10, 10, 10, 10, 10, 10, 10, 10, 20, 20, 20, 20, 20, 20, 20, 20 arm = vabal.s aarch64 = sabal generate int64x2_t:int32x2_t:int32x2_t:int64x2_t /// Signed Absolute difference and Accumulate Long name = vabal_high no-q multi_fn = simd_shuffle8!, d:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15] multi_fn = simd_shuffle8!, e:int8x8_t, c, c, [8, 9, 10, 11, 12, 13, 14, 15] multi_fn = vabd_s8, d, e, f:int8x8_t multi_fn = simd_cast, f:uint8x8_t, f multi_fn = simd_add, a, {simd_cast, f} a = 9, 10, 11, 12, 13, 14, 15, 16 b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 c = 10, 10, 10, 10, 10, 10, 10, 10, 20, 0, 2, 4, 6, 8, 10, 12 validate 20, 20, 20, 20, 20, 20, 20, 20 aarch64 = sabal generate int16x8_t:int8x16_t:int8x16_t:int16x8_t /// Signed Absolute difference and Accumulate Long name = vabal_high no-q multi_fn = simd_shuffle4!, d:int16x4_t, b, b, [4, 5, 6, 7] multi_fn = simd_shuffle4!, e:int16x4_t, c, c, [4, 5, 6, 7] multi_fn = vabd_s16, d, e, f:int16x4_t multi_fn = simd_cast, f:uint16x4_t, f multi_fn = simd_add, a, {simd_cast, f} a = 9, 10, 11, 12 b = 1, 2, 3, 4, 9, 10, 11, 12 c = 10, 10, 10, 10, 20, 0, 2, 4 validate 20, 20, 20, 20 aarch64 = sabal generate int32x4_t:int16x8_t:int16x8_t:int32x4_t /// Signed Absolute difference and Accumulate Long name = vabal_high no-q multi_fn = simd_shuffle2!, d:int32x2_t, b, b, [2, 3] multi_fn = simd_shuffle2!, e:int32x2_t, c, c, [2, 3] multi_fn = vabd_s32, d, e, f:int32x2_t multi_fn = simd_cast, f:uint32x2_t, f multi_fn = simd_add, a, {simd_cast, f} a = 15, 16 b = 1, 2, 15, 16 c = 10, 10, 10, 12 validate 20, 20 aarch64 = sabal generate int64x2_t:int32x4_t:int32x4_t:int64x2_t //////////////////// // Signed saturating Absolute value //////////////////// /// Signed saturating
Absolute value name = vqabs a = MIN, MAX, -6, -5, -4, -3, -2, -1, 0, -127, 127, 1, 2, 3, 4, 5 validate MAX, MAX, 6, 5, 4, 3, 2, 1, 0, 127, 127, 1, 2, 3, 4, 5 arm = vqabs.s aarch64 = sqabs link-arm = vqabs._EXT_ link-aarch64 = sqabs._EXT_ generate int*_t /// Signed saturating Absolute value name = vqabs a = MIN, -7 validate MAX, 7 aarch64 = sqabs link-aarch64 = sqabs._EXT_ generate int64x*_t /// Signed saturating absolute value name = vqabs multi_fn = simd_extract, {vqabs-in_ntt-noext, {vdup_n-in_ntt-noext, a}}, 0 a = -7 validate 7 aarch64 = sqabs generate i8:i8, i16:i16 /// Signed saturating absolute value name = vqabs a = -7 validate 7 aarch64 = sqabs link-aarch64 = sqabs._EXT_ generate i32:i32, i64:i64 /// Shift left and insert name = vsli n-suffix constn = N multi_fn = static_assert-N-0-63 multi_fn = transmute, {vsli_n-in_ntt-::, transmute(a), transmute(b)} a = 333 b = 2042 n = 2 validate 8169 aarch64 = sli generate i64, u64 /// Shift right and insert name = vsri n-suffix constn = N multi_fn = static_assert-N-1-bits multi_fn = transmute, {vsri_n-in_ntt-::, transmute(a), transmute(b)} a = 333 b = 2042 n = 2 validate 510 aarch64 = sri generate i64, u64
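
// The following blocks are explanatory comments only; they are not spec entries.
//
// The rounding right-shift family above (vrshr, vrshrn, vrsra and the saturating
// vqrshrn/vqrshrun variants) all add a rounding constant of 1 << (N - 1) before
// shifting. A minimal plain-Rust sketch of that arithmetic (the helper name
// rshr_s16 is made up for illustration; it is not generator syntax):
//
//     fn rshr_s16(x: i16, n: u32) -> i16 {
//         // add half of the discarded range, then shift right
//         ((x as i32 + (1 << (n - 1))) >> n) as i16
//     }
//
//     // With N = 2 this matches the vrshr/vrshrn rows above: 4 -> 1, 8 -> 2, 64 -> 16,
//     // and vrsra simply adds the result to its accumulator argument.
//     assert_eq!(rshr_s16(4, 2), 1);
//
// The narrowing variants additionally truncate to the half-width type, and the
// vq* forms saturate to it instead of truncating.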
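//
// The vtrn1/vtrn2, vzip1/vzip2 and vuzp1/vuzp2 entries above are pure lane
// permutations. As a plain-Rust reference for the shuffle patterns they expand to
// (hypothetical 4-lane helpers written only to mirror the validate rows; real
// lane counts vary by type):
//
//     fn trn1(a: [i32; 4], b: [i32; 4]) -> [i32; 4] { [a[0], b[0], a[2], b[2]] } // even lanes, pairwise
//     fn zip1(a: [i32; 4], b: [i32; 4]) -> [i32; 4] { [a[0], b[0], a[1], b[1]] } // interleave low halves
//     fn uzp1(a: [i32; 4], b: [i32; 4]) -> [i32; 4] { [a[0], a[2], b[0], b[2]] } // even lanes of a, then of b
//
//     // e.g. trn1([0, 2, 4, 6], [1, 3, 5, 7]) == [0, 1, 4, 5], matching the vtrn1 row above.
//
// The *2 variants take the odd lanes / high halves instead, and the two-register
// vtrn/vzip/vuzp forms return both halves of the permutation as a pair.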
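//
// The vreinterpret entries are plain bit-pattern casts (fn = transmute), so their
// a/validate rows simply spell out the little-endian lane layout. A small sketch of
// the 16-bit to 8-bit case used above (assumes a little-endian target, as the test
// vectors do):
//
//     let a: [u16; 4] = [0, 1, 2, 3];
//     let bytes: [u8; 8] = unsafe { core::mem::transmute(a) };
//     assert_eq!(bytes, [0, 0, 1, 0, 2, 0, 3, 0]); // same values as the validate row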
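//
// Shift-and-insert semantics for the vsli/vsri rows above, as a plain-Rust sketch
// (hypothetical helpers; N is assumed to stay inside the range the static_assert
// allows, and the n = 64 edge case of sri would need extra care in real code):
//
//     fn sli_n(a: u64, b: u64, n: u32) -> u64 {
//         (b << n) | (a & ((1 << n) - 1))   // keep the low n bits of a
//     }
//     fn sri_n(a: u64, b: u64, n: u32) -> u64 {
//         (b >> n) | (a & !(u64::MAX >> n)) // keep the high n bits of a
//     }
//
//     // Matches the test vectors: sli_n(333, 2042, 2) == 8169 and sri_n(333, 2042, 2) == 510.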