22 files changed, 1348 insertions, 203 deletions
diff --git a/library/stdarch/crates/core_arch/src/arm_shared/neon/mod.rs b/library/stdarch/crates/core_arch/src/arm_shared/neon/mod.rs
index 043f7ed51..0559aea83 100644
--- a/library/stdarch/crates/core_arch/src/arm_shared/neon/mod.rs
+++ b/library/stdarch/crates/core_arch/src/arm_shared/neon/mod.rs
@@ -12461,30 +12461,30 @@ mod tests {
     }
     #[simd_test(enable = "neon,i8mm")]
     unsafe fn test_vmmlaq_s32() {
-        let a: i32x4 = i32x4::new(1, 3, 4, 9);
-        let b: i8x16 = i8x16::new(1, 21, 31, 14, 5, 6, 17, 8, 9, 13, 15, 12, 13, 19, 20, 16);
-        let c: i8x16 = i8x16::new(12, 22, 3, 4, 5, 56, 7, 8, 91, 10, 11, 15, 13, 14, 17, 16);
-        let e: i32x4 = i32x4::new(1, 2, 3, 4);
+        let a = i32x4::new(1, 3, 4, -0x10000);
+        let b = i8x16::new(1, 21, 31, 14, 5, 6, -128, 8, 9, 13, 15, 12, 13, -1, 20, 16);
+        let c = i8x16::new(12, 22, 3, 4, -1, 56, 7, 8, 91, 10, -128, 15, 13, 14, 17, 16);
+        let e = i32x4::new(123, -5353, 690, -65576);
         let r: i32x4 = transmute(vmmlaq_s32(transmute(a), transmute(b), transmute(c)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon,i8mm")]
     unsafe fn test_vmmlaq_u32() {
-        let a: u32x4 = u32x4::new(1, 3, 4, 9);
-        let b: i8x16 = i8x16::new(1, 21, 31, 14, 5, 6, 17, 8, 9, 13, 15, 12, 13, 19, 20, 16);
-        let c: i8x16 = i8x16::new(12, 22, 3, 4, 5, 56, 7, 8, 91, 10, 11, 15, 13, 14, 17, 16);
-        let e: u32x4 = u32x4::new(1, 2, 3, 4);
+        let a = u32x4::new(1, 3, 4, 0xffff0000);
+        let b = u8x16::new(1, 21, 31, 14, 5, 6, 128, 8, 9, 13, 15, 12, 13, 255, 20, 16);
+        let c = u8x16::new(12, 22, 3, 4, 255, 56, 7, 8, 91, 10, 128, 15, 13, 14, 17, 16);
+        let e = u32x4::new(3195, 6935, 18354, 4294909144);
         let r: u32x4 = transmute(vmmlaq_u32(transmute(a), transmute(b), transmute(c)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon,i8mm")]
     unsafe fn test_vusmmlaq_s32() {
-        let a: i32x4 = i32x4::new(1, 3, 4, 9);
-        let b: i8x16 = i8x16::new(1, 21, 31, 14, 5, 6, 17, 8, 9, 13, 15, 12, 13, 19, 20, 16);
-        let c: i8x16 = i8x16::new(12, 22, 3, 4, 5, 56, 7, 8, 91, 10, 11, 15, 13, 14, 17, 16);
-        let e: i32x4 = i32x4::new(1, 2, 3, 4);
+        let a = i32x4::new(1, 3, 4, -0x10000);
+        let b = u8x16::new(1, 21, 31, 14, 5, 6, 128, 8, 9, 13, 15, 12, 13, 255, 20, 16);
+        let c = i8x16::new(12, 22, 3, 4, -1, 56, 7, 8, 91, 10, -128, 15, 13, 14, 17, 16);
+        let e = i32x4::new(1915, -1001, 15026, -61992);
         let r: i32x4 = transmute(vusmmlaq_s32(transmute(a), transmute(b), transmute(c)));
         assert_eq!(r, e);
     }
diff --git a/library/stdarch/crates/core_arch/src/lib.rs b/library/stdarch/crates/core_arch/src/lib.rs
index 9240d0e84..5a9727a0a 100644
--- a/library/stdarch/crates/core_arch/src/lib.rs
+++ b/library/stdarch/crates/core_arch/src/lib.rs
@@ -19,6 +19,7 @@
     doc_cfg,
     tbm_target_feature,
     sse4a_target_feature,
+    riscv_target_feature,
     arm_target_feature,
     cmpxchg16b_target_feature,
     avx512_target_feature,
@@ -30,8 +31,8 @@
     f16c_target_feature,
     allow_internal_unstable,
     decl_macro,
-    bench_black_box,
-    asm_const
+    asm_const,
+    target_feature_11
 )]
 #![cfg_attr(test, feature(test, abi_vectorcall))]
 #![deny(clippy::missing_inline_in_public_items)]
diff --git a/library/stdarch/crates/core_arch/src/macros.rs b/library/stdarch/crates/core_arch/src/macros.rs
index 1e6a3f405..1c917c52b 100644
--- a/library/stdarch/crates/core_arch/src/macros.rs
+++ b/library/stdarch/crates/core_arch/src/macros.rs
@@ -101,11 +101,11 @@ macro_rules! simd_shuffle2 {
             const IDX: [u32; 2] = $idx;
         }
 
-        simd_shuffle2($x, $y, ConstParam::<$($imm),+>::IDX)
+        simd_shuffle($x, $y, ConstParam::<$($imm),+>::IDX)
     }};
     ($x:expr, $y:expr, $idx:expr $(,)?) => {{
         const IDX: [u32; 2] = $idx;
-        simd_shuffle2($x, $y, IDX)
+        simd_shuffle($x, $y, IDX)
     }};
 }
 
@@ -117,11 +117,11 @@ macro_rules! simd_shuffle4 {
             const IDX: [u32; 4] = $idx;
         }
 
-        simd_shuffle4($x, $y, ConstParam::<$($imm),+>::IDX)
+        simd_shuffle($x, $y, ConstParam::<$($imm),+>::IDX)
     }};
     ($x:expr, $y:expr, $idx:expr $(,)?) => {{
         const IDX: [u32; 4] = $idx;
-        simd_shuffle4($x, $y, IDX)
+        simd_shuffle($x, $y, IDX)
     }};
 }
 
@@ -133,11 +133,11 @@ macro_rules! simd_shuffle8 {
             const IDX: [u32; 8] = $idx;
         }
 
-        simd_shuffle8($x, $y, ConstParam::<$($imm),+>::IDX)
+        simd_shuffle($x, $y, ConstParam::<$($imm),+>::IDX)
     }};
     ($x:expr, $y:expr, $idx:expr $(,)?) => {{
         const IDX: [u32; 8] = $idx;
-        simd_shuffle8($x, $y, IDX)
+        simd_shuffle($x, $y, IDX)
     }};
 }
 
@@ -149,11 +149,11 @@ macro_rules! simd_shuffle16 {
             const IDX: [u32; 16] = $idx;
         }
 
-        simd_shuffle16($x, $y, ConstParam::<$($imm),+>::IDX)
+        simd_shuffle($x, $y, ConstParam::<$($imm),+>::IDX)
     }};
     ($x:expr, $y:expr, $idx:expr $(,)?) => {{
         const IDX: [u32; 16] = $idx;
-        simd_shuffle16($x, $y, IDX)
+        simd_shuffle($x, $y, IDX)
     }};
 }
 
@@ -165,11 +165,11 @@ macro_rules! simd_shuffle32 {
             const IDX: [u32; 32] = $idx;
         }
 
-        simd_shuffle32($x, $y, ConstParam::<$($imm),+>::IDX)
+        simd_shuffle($x, $y, ConstParam::<$($imm),+>::IDX)
     }};
     ($x:expr, $y:expr, $idx:expr $(,)?) => {{
         const IDX: [u32; 32] = $idx;
-        simd_shuffle32($x, $y, IDX)
+        simd_shuffle($x, $y, IDX)
     }};
 }
 
@@ -181,10 +181,10 @@ macro_rules! simd_shuffle64 {
             const IDX: [u32; 64] = $idx;
         }
 
-        simd_shuffle64($x, $y, ConstParam::<$($imm),+>::IDX)
+        simd_shuffle($x, $y, ConstParam::<$($imm),+>::IDX)
     }};
     ($x:expr, $y:expr, $idx:expr $(,)?) => {{
         const IDX: [u32; 64] = $idx;
-        simd_shuffle64($x, $y, IDX)
+        simd_shuffle($x, $y, IDX)
     }};
 }
diff --git a/library/stdarch/crates/core_arch/src/mod.rs b/library/stdarch/crates/core_arch/src/mod.rs
index 20751eeec..2f7af22cb 100644
--- a/library/stdarch/crates/core_arch/src/mod.rs
+++ b/library/stdarch/crates/core_arch/src/mod.rs
@@ -3,6 +3,9 @@
 #[macro_use]
 mod macros;
 
+#[cfg(any(target_arch = "riscv32", target_arch = "riscv64", doc))]
+mod riscv_shared;
+
 #[cfg(any(target_arch = "arm", target_arch = "aarch64", doc))]
 mod arm_shared;
 
@@ -276,10 +279,6 @@ mod aarch64;
 #[doc(cfg(any(target_arch = "arm")))]
 mod arm;
 
-#[cfg(any(target_arch = "riscv32", target_arch = "riscv64", doc))]
-#[doc(cfg(any(target_arch = "riscv32", target_arch = "riscv64")))]
-mod riscv_shared;
-
 #[cfg(any(target_arch = "riscv64", doc))]
 #[doc(cfg(any(target_arch = "riscv64")))]
 mod riscv64;
diff --git a/library/stdarch/crates/core_arch/src/riscv_shared/mod.rs b/library/stdarch/crates/core_arch/src/riscv_shared/mod.rs
index 347735df1..0e35fe1f1 100644
--- a/library/stdarch/crates/core_arch/src/riscv_shared/mod.rs
+++ b/library/stdarch/crates/core_arch/src/riscv_shared/mod.rs
@@ -1,4 +1,7 @@
 //! Shared RISC-V intrinsics
+mod p;
+
+pub use p::*;
 
 use crate::arch::asm;
 
@@ -469,6 +472,17 @@ pub unsafe fn hinval_gvma_vmid(vmid: usize) {
     asm!(".insn r 0x73, 0, 0x33, x0, x0, {}", in(reg) vmid, options(nostack))
 }
 
+/// Invalidate hypervisor translation cache for all virtual machines and guest physical addresses
+///
+/// This instruction invalidates any address-translation cache entries that an
+/// `HFENCE.GVMA` instruction with the same values of `gaddr` and `vmid` would invalidate.
+///
+/// This fence specifies all guest physical addresses and all virtual machines.
+#[inline]
+pub unsafe fn hinval_gvma_all() {
+    asm!(".insn r 0x73, 0, 0x33, x0, x0, x0", options(nostack))
+}
+
 /// Reads the floating-point control and status register `fcsr`
 ///
 /// Register `fcsr` is a 32-bit read/write register that selects the dynamic rounding mode
@@ -574,17 +588,6 @@ pub fn fsflags(value: u32) -> u32 {
     original
 }
 
-/// Invalidate hypervisor translation cache for all virtual machines and guest physical addresses
-///
-/// This instruction invalidates any address-translation cache entries that an
-/// `HFENCE.GVMA` instruction with the same values of `gaddr` and `vmid` would invalidate.
-///
-/// This fence specifies all guest physical addresses and all virtual machines.
-#[inline]
-pub unsafe fn hinval_gvma_all() {
-    asm!(".insn r 0x73, 0, 0x33, x0, x0, x0", options(nostack))
-}
-
 /// `P0` transformation function as is used in the SM3 hash algorithm
 ///
 /// This function is included in `Zksh` extension. It's defined as:
@@ -602,12 +605,10 @@ pub unsafe fn hinval_gvma_all() {
 /// According to RISC-V Cryptography Extensions, Volume I, the execution latency of
 /// this instruction must always be independent from the data it operates on.
 #[inline]
+#[target_feature(enable = "zksh")]
 pub fn sm3p0(x: u32) -> u32 {
     let ans: u32;
-    unsafe {
-        // asm!("sm3p0 {}, {}", out(reg) ans, in(reg) x, options(nomem, nostack))
-        asm!(".insn i 0x13, 0x1, {}, {}, 0x108", out(reg) ans, in(reg) x, options(nomem, nostack))
-    };
+    unsafe { asm!("sm3p0 {}, {}", lateout(reg) ans, in(reg) x, options(pure, nomem, nostack)) };
     ans
 }
 
@@ -634,12 +635,10 @@ pub fn sm3p0(x: u32) -> u32 {
 /// According to RISC-V Cryptography Extensions, Volume I, the execution latency of
 /// this instruction must always be independent from the data it operates on.
 #[inline]
+#[target_feature(enable = "zksh")]
 pub fn sm3p1(x: u32) -> u32 {
     let ans: u32;
-    unsafe {
-        // asm!("sm3p1 {}, {}", out(reg) ans, in(reg) x, options(nomem, nostack))
-        asm!(".insn i 0x13, 0x1, {}, {}, 0x109", out(reg) ans, in(reg) x, options(nomem, nostack))
-    };
+    unsafe { asm!("sm3p1 {}, {}", lateout(reg) ans, in(reg) x, options(pure, nomem, nostack)) };
     ans
 }
 
@@ -674,33 +673,28 @@ pub fn sm3p1(x: u32) -> u32 {
 /// It can be implemented by `sm4ed` instruction like:
 ///
 /// ```no_run
+/// # #[cfg(any(target_arch = "riscv32", target_arch = "riscv64"))]
+/// # fn round_function(x0: u32, x1: u32, x2: u32, x3: u32, rk: u32) -> u32 {
+/// # #[cfg(target_arch = "riscv32")] use core::arch::riscv32::sm4ed;
+/// # #[cfg(target_arch = "riscv64")] use core::arch::riscv64::sm4ed;
 /// let a = x1 ^ x2 ^ x3 ^ rk;
 /// let c0 = sm4ed::<0>(x0, a);
 /// let c1 = sm4ed::<1>(c0, a); // c1 represents c[0..=1], etc.
 /// let c2 = sm4ed::<2>(c1, a);
 /// let c3 = sm4ed::<3>(c2, a);
 /// return c3; // c3 represents c[0..=3]
+/// # }
 /// ```
 ///
 /// According to RISC-V Cryptography Extensions, Volume I, the execution latency of
 /// this instruction must always be independent from the data it operates on.
+#[inline]
+#[target_feature(enable = "zksed")]
 pub fn sm4ed<const BS: u8>(x: u32, a: u32) -> u32 {
     static_assert!(BS: u8 where BS <= 3);
     let ans: u32;
-    match BS {
-        0 => unsafe {
-            asm!(".insn r 0x33, 0, 0x18, {}, {}, {}", out(reg) ans, in(reg) x, in(reg) a, options(nomem, nostack))
-        },
-        1 => unsafe {
-            asm!(".insn r 0x33, 0, 0x38, {}, {}, {}", out(reg) ans, in(reg) x, in(reg) a, options(nomem, nostack))
-        },
-        2 => unsafe {
-            asm!(".insn r 0x33, 0, 0x58, {}, {}, {}", out(reg) ans, in(reg) x, in(reg) a, options(nomem, nostack))
-        },
-        3 => unsafe {
-            asm!(".insn r 0x33, 0, 0x78, {}, {}, {}", out(reg) ans, in(reg) x, in(reg) a, options(nomem, nostack))
-        },
-        _ => unreachable!(),
+    unsafe {
+        asm!("sm4ed {}, {}, {}, {}", lateout(reg) ans, in(reg) x, in(reg) a, const BS, options(pure, nomem, nostack))
     };
     ans
 }
@@ -739,33 +733,28 @@ pub fn sm4ed<const BS: u8>(x: u32, a: u32) -> u32 {
 /// Hence, the key schedule operation can be implemented by `sm4ks` instruction like:
 ///
 /// ```no_run
+/// # #[cfg(any(target_arch = "riscv32", target_arch = "riscv64"))]
+/// # fn key_schedule(k0: u32, k1: u32, k2: u32, k3: u32, ck_i: u32) -> u32 {
+/// # #[cfg(target_arch = "riscv32")] use core::arch::riscv32::sm4ks;
+/// # #[cfg(target_arch = "riscv64")] use core::arch::riscv64::sm4ks;
 /// let k = k1 ^ k2 ^ k3 ^ ck_i;
 /// let c0 = sm4ks::<0>(k0, k);
 /// let c1 = sm4ks::<1>(c0, k); // c1 represents c[0..=1], etc.
 /// let c2 = sm4ks::<2>(c1, k);
 /// let c3 = sm4ks::<3>(c2, k);
 /// return c3; // c3 represents c[0..=3]
+/// # }
 /// ```
 ///
 /// According to RISC-V Cryptography Extensions, Volume I, the execution latency of
 /// this instruction must always be independent from the data it operates on.
+#[inline]
+#[target_feature(enable = "zksed")]
 pub fn sm4ks<const BS: u8>(x: u32, k: u32) -> u32 {
     static_assert!(BS: u8 where BS <= 3);
     let ans: u32;
-    match BS {
-        0 => unsafe {
-            asm!(".insn r 0x33, 0, 0x1A, {}, {}, {}", out(reg) ans, in(reg) x, in(reg) k, options(nomem, nostack))
-        },
-        1 => unsafe {
-            asm!(".insn r 0x33, 0, 0x3A, {}, {}, {}", out(reg) ans, in(reg) x, in(reg) k, options(nomem, nostack))
-        },
-        2 => unsafe {
-            asm!(".insn r 0x33, 0, 0x5A, {}, {}, {}", out(reg) ans, in(reg) x, in(reg) k, options(nomem, nostack))
-        },
-        3 => unsafe {
-            asm!(".insn r 0x33, 0, 0x7A, {}, {}, {}", out(reg) ans, in(reg) x, in(reg) k, options(nomem, nostack))
-        },
-        _ => unreachable!(),
+    unsafe {
+        asm!("sm4ks {}, {}, {}, {}", lateout(reg) ans, in(reg) x, in(reg) k, const BS, options(pure, nomem, nostack))
     };
     ans
 }
diff --git a/library/stdarch/crates/core_arch/src/riscv_shared/p.rs b/library/stdarch/crates/core_arch/src/riscv_shared/p.rs
new file mode 100644
index 000000000..a26044aee
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/riscv_shared/p.rs
@@ -0,0 +1,1061 @@
+//! RISC-V Packed SIMD intrinsics; shared part.
+//!
+//! RV64 only part is placed in riscv64 folder.
+use crate::arch::asm;
+
+/// Adds packed 16-bit signed numbers, discarding overflow bits
+#[inline]
+pub fn add16(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x20, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Halves the sum of packed 16-bit signed numbers, dropping least bits
+#[inline]
+pub fn radd16(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x00, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Halves the sum of packed 16-bit unsigned numbers, dropping least bits
+#[inline]
+pub fn uradd16(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x10, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Adds packed 16-bit signed numbers, saturating at the numeric bounds
+#[inline]
+pub fn kadd16(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x08, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Adds packed 16-bit unsigned numbers, saturating at the numeric bounds
+#[inline]
+pub fn ukadd16(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x18, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Subtracts packed 16-bit signed numbers, discarding overflow bits
+#[inline]
+pub fn sub16(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x21, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Halves the subtraction result of packed 16-bit signed numbers, dropping least bits
+#[inline]
+pub fn rsub16(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x01, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Halves the subtraction result of packed 16-bit unsigned numbers, dropping least bits
+#[inline]
+pub fn ursub16(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x11, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Subtracts packed 16-bit signed numbers, saturating at the numeric bounds
+#[inline]
+pub fn ksub16(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x09, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Subtracts packed 16-bit unsigned numbers, saturating at the numeric bounds
+#[inline]
+pub fn uksub16(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x19, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Cross adds and subtracts packed 16-bit signed numbers, discarding overflow bits
+#[inline]
+pub fn cras16(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x22, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Cross halves of adds and subtracts packed 16-bit signed numbers, dropping least bits
+#[inline]
+pub fn rcras16(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x02, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Cross halves of adds and subtracts packed 16-bit unsigned numbers, dropping least bits
+#[inline]
+pub fn urcras16(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x12, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Cross adds and subtracts packed 16-bit signed numbers, saturating at the numeric bounds
+#[inline]
+pub fn kcras16(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x0A, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Cross adds and subtracts packed 16-bit unsigned numbers, saturating at the numeric bounds
+#[inline]
+pub fn ukcras16(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x1A, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Cross subtracts and adds packed 16-bit signed numbers, discarding overflow bits
+#[inline]
+pub fn crsa16(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x23, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Cross halves of subtracts and adds packed 16-bit signed numbers, dropping least bits
+#[inline]
+pub fn rcrsa16(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x03, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Cross halves of subtracts and adds packed 16-bit unsigned numbers, dropping least bits
+#[inline]
+pub fn urcrsa16(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x13, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Cross subtracts and adds packed 16-bit signed numbers, saturating at the numeric bounds
+#[inline]
+pub fn kcrsa16(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x0B, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Cross subtracts and adds packed 16-bit unsigned numbers, saturating at the numeric bounds
+#[inline]
+pub fn ukcrsa16(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x1B, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Straight adds and subtracts packed 16-bit signed numbers, discarding overflow bits
+#[inline]
+pub fn stas16(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x7A, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Straight halves of adds and subtracts packed 16-bit signed numbers, dropping least bits
+#[inline]
+pub fn rstas16(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x5A, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Straight halves of adds and subtracts packed 16-bit unsigned numbers, dropping least bits
+#[inline]
+pub fn urstas16(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x6A, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Straight adds and subtracts packed 16-bit signed numbers, saturating at the numeric bounds
+#[inline]
+pub fn kstas16(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x62, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Straight adds and subtracts packed 16-bit unsigned numbers, saturating at the numeric bounds
+#[inline]
+pub fn ukstas16(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x72, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Straight subtracts and adds packed 16-bit signed numbers, discarding overflow bits
+#[inline]
+pub fn stsa16(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x7B, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Straight halves of subtracts and adds packed 16-bit signed numbers, dropping least bits
+#[inline]
+pub fn rstsa16(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x5B, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Straight halves of subtracts and adds packed 16-bit unsigned numbers, dropping least bits
+#[inline]
+pub fn urstsa16(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x6B, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Straight subtracts and adds packed 16-bit signed numbers, saturating at the numeric bounds
+#[inline]
+pub fn kstsa16(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x63, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Straight subtracts and adds packed 16-bit unsigned numbers, saturating at the numeric bounds
+#[inline]
+pub fn ukstsa16(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x73, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Adds packed 8-bit signed numbers, discarding overflow bits
+#[inline]
+pub fn add8(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x24, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Halves the sum of packed 8-bit signed numbers, dropping least bits
+#[inline]
+pub fn radd8(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x04, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Halves the sum of packed 8-bit unsigned numbers, dropping least bits
+#[inline]
+pub fn uradd8(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x14, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Adds packed 8-bit signed numbers, saturating at the numeric bounds
+#[inline]
+pub fn kadd8(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x0C, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Adds packed 8-bit unsigned numbers, saturating at the numeric bounds
+#[inline]
+pub fn ukadd8(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x1C, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Subtracts packed 8-bit signed numbers, discarding overflow bits
+#[inline]
+pub fn sub8(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x25, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Halves the subtraction result of packed 8-bit signed numbers, dropping least bits
+#[inline]
+pub fn rsub8(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x05, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Halves the subtraction result of packed 8-bit unsigned numbers, dropping least bits
+#[inline]
+pub fn ursub8(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x15, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Subtracts packed 8-bit signed numbers, saturating at the numeric bounds
+#[inline]
+pub fn ksub8(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x0D, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Subtracts packed 8-bit unsigned numbers, saturating at the numeric bounds
+#[inline]
+pub fn uksub8(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x1D, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Arithmetic right shift packed 16-bit elements without rounding up
+#[inline]
+pub fn sra16(a: usize, b: u32) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x28, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Arithmetic right shift packed 16-bit elements with rounding up
+#[inline]
+pub fn sra16u(a: usize, b: u32) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x30, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Logical right shift packed 16-bit elements without rounding up
+#[inline]
+pub fn srl16(a: usize, b: u32) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x29, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Logical right shift packed 16-bit elements with rounding up
+#[inline]
+pub fn srl16u(a: usize, b: u32) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x31, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Logical left shift packed 16-bit elements, discarding overflow bits
+#[inline]
+pub fn sll16(a: usize, b: u32) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x2A, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Logical left shift packed 16-bit elements, saturating at the numeric bounds
+#[inline]
+pub fn ksll16(a: usize, b: u32) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x32, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Logical saturating left then arithmetic right shift packed 16-bit elements
+#[inline]
+pub fn kslra16(a: usize, b: i32) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x2B, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Logical saturating left then arithmetic right shift packed 16-bit elements
+#[inline]
+pub fn kslra16u(a: usize, b: i32) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x33, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Arithmetic right shift packed 8-bit elements without rounding up
+#[inline]
+pub fn sra8(a: usize, b: u32) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x2C, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Arithmetic right shift packed 8-bit elements with rounding up
+#[inline]
+pub fn sra8u(a: usize, b: u32) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x34, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Logical right shift packed 8-bit elements without rounding up
+#[inline]
+pub fn srl8(a: usize, b: u32) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x2D, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Logical right shift packed 8-bit elements with rounding up
+#[inline]
+pub fn srl8u(a: usize, b: u32) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x35, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Logical left shift packed 8-bit elements, discarding overflow bits
+#[inline]
+pub fn sll8(a: usize, b: u32) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x2E, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Logical left shift packed 8-bit elements, saturating at the numeric bounds
+#[inline]
+pub fn ksll8(a: usize, b: u32) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x36, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Logical saturating left then arithmetic right shift packed 8-bit elements
+#[inline]
+pub fn kslra8(a: usize, b: i32) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x2F, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Logical saturating left then arithmetic right shift packed 8-bit elements
+#[inline]
+pub fn kslra8u(a: usize, b: i32) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x37, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Compare equality for packed 16-bit elements
+#[inline]
+pub fn cmpeq16(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x26, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Compare whether 16-bit packed signed integers are less than the others
+#[inline]
+pub fn scmplt16(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x06, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Compare whether 16-bit packed signed integers are less than or equal to the others
+#[inline]
+pub fn scmple16(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x0E, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Compare whether 16-bit packed unsigned integers are less than the others
+#[inline]
+pub fn ucmplt16(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x16, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Compare whether 16-bit packed unsigned integers are less than or equal to the others
+#[inline]
+pub fn ucmple16(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x1E, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Compare equality for packed 8-bit elements
+#[inline]
+pub fn cmpeq8(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x27, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Compare whether 8-bit packed signed integers are less than the others
+#[inline]
+pub fn scmplt8(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x07, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Compare whether 8-bit packed signed integers are less than or equal to the others
+#[inline]
+pub fn scmple8(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x0F, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Compare whether 8-bit packed unsigned integers are less than the others
+#[inline]
+pub fn ucmplt8(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x17, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Compare whether 8-bit packed unsigned integers are less than or equal to the others
+#[inline]
+pub fn ucmple8(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x1F, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Get minimum values from 16-bit packed signed integers
+#[inline]
+pub fn smin16(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x40, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Get minimum values from 16-bit packed unsigned integers
+#[inline]
+pub fn umin16(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x48, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Get maximum values from 16-bit packed signed integers
+#[inline]
+pub fn smax16(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x41, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Get maximum values from 16-bit packed unsigned integers
+#[inline]
+pub fn umax16(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x49, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/* todo: sclip16, uclip16 */
+
+/// Compute the absolute value of packed 16-bit signed integers
+#[inline]
+pub fn kabs16(a: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn i 0x77, 0x0, {}, {}, 0xAD1", lateout(reg) value, in(reg) a, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Count the number of redundant sign bits of the packed 16-bit elements
+#[inline]
+pub fn clrs16(a: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn i 0x77, 0x0, {}, {}, 0xAE8", lateout(reg) value, in(reg) a, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Count the number of leading zero bits of the packed 16-bit elements
+#[inline]
+pub fn clz16(a: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn i 0x77, 0x0, {}, {}, 0xAE9", lateout(reg) value, in(reg) a, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Swap the 16-bit halfwords within each 32-bit word of a register
+#[inline]
+pub fn swap16(a: usize) -> usize {
+    let value: usize;
+    // this instruction is an alias for `pkbt rd, rs1, rs1`.
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x0F, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) a, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Get minimum values from 8-bit packed signed integers
+#[inline]
+pub fn smin8(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x44, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Get minimum values from 8-bit packed unsigned integers
+#[inline]
+pub fn umin8(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x4C, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Get maximum values from 8-bit packed signed integers
+#[inline]
+pub fn smax8(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x45, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Get maximum values from 8-bit packed unsigned integers
+#[inline]
+pub fn umax8(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x4D, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/* todo: sclip8, uclip8 */
+
+/// Compute the absolute value of packed 8-bit signed integers
+#[inline]
+pub fn kabs8(a: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn i 0x77, 0x0, {}, {}, 0xAD0", lateout(reg) value, in(reg) a, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Count the number of redundant sign bits of the packed 8-bit elements
+#[inline]
+pub fn clrs8(a: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn i 0x77, 0x0, {}, {}, 0xAE0", lateout(reg) value, in(reg) a, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Count the number of leading zero bits of the packed 8-bit elements
+#[inline]
+pub fn clz8(a: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn i 0x77, 0x0, {}, {}, 0xAE1", lateout(reg) value, in(reg) a, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Swap the 8-bit bytes within each 16-bit halfword of a register.
+#[inline]
+pub fn swap8(a: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn i 0x77, 0x0, {}, {}, 0xAD8", lateout(reg) value, in(reg) a, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Unpack first and zeroth into two 16-bit signed halfwords in each 32-bit chunk
+#[inline]
+pub fn sunpkd810(a: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn i 0x77, 0x0, {}, {}, 0xAC8", lateout(reg) value, in(reg) a, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Unpack second and zeroth into two 16-bit signed halfwords in each 32-bit chunk
+#[inline]
+pub fn sunpkd820(a: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn i 0x77, 0x0, {}, {}, 0xAC9", lateout(reg) value, in(reg) a, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Unpack third and zeroth into two 16-bit signed halfwords in each 32-bit chunk
+#[inline]
+pub fn sunpkd830(a: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn i 0x77, 0x0, {}, {}, 0xACA", lateout(reg) value, in(reg) a, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Unpack third and first into two 16-bit signed halfwords in each 32-bit chunk
+#[inline]
+pub fn sunpkd831(a: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn i 0x77, 0x0, {}, {}, 0xACB", lateout(reg) value, in(reg) a, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Unpack third and second into two 16-bit signed halfwords in each 32-bit chunk
+#[inline]
+pub fn sunpkd832(a: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn i 0x77, 0x0, {}, {}, 0xAD3", lateout(reg) value, in(reg) a, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Unpack first and zeroth into two 16-bit unsigned halfwords in each 32-bit chunk
+#[inline]
+pub fn zunpkd810(a: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn i 0x77, 0x0, {}, {}, 0xACC", lateout(reg) value, in(reg) a, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Unpack second and zeroth into two 16-bit unsigned halfwords in each 32-bit chunk
+#[inline]
+pub fn zunpkd820(a: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn i 0x77, 0x0, {}, {}, 0xACD", lateout(reg) value, in(reg) a, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Unpack third and zeroth into two 16-bit unsigned halfwords in each 32-bit chunk
+#[inline]
+pub fn zunpkd830(a: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn i 0x77, 0x0, {}, {}, 0xACE", lateout(reg) value, in(reg) a, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Unpack third and first into two 16-bit unsigned halfwords in each 32-bit chunk
+#[inline]
+pub fn zunpkd831(a: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn i 0x77, 0x0, {}, {}, 0xACF", lateout(reg) value, in(reg) a, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Unpack third and second into two 16-bit unsigned halfwords in each 32-bit chunk
+#[inline]
+pub fn zunpkd832(a: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn i 0x77, 0x0, {}, {}, 0xAD7", lateout(reg) value, in(reg) a, options(pure, nomem, nostack))
+    }
+    value
+}
+
+// todo: pkbb16, pktt16
+
+/// Pack two 16-bit data from bottom and top half from 32-bit chunks
+#[inline]
+pub fn pkbt16(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x1, 0x0F, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Pack two 16-bit data from top and bottom half from 32-bit chunks
+#[inline]
+pub fn pktb16(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x1, 0x1F, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Count the number of redundant sign bits of the packed 32-bit elements
+#[inline]
+pub fn clrs32(a: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn i 0x77, 0x0, {}, {}, 0xAF8", lateout(reg) value, in(reg) a, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Count the number of leading zero bits of the packed 32-bit elements
+#[inline]
+pub fn clz32(a: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn i 0x77, 0x0, {}, {}, 0xAF9", lateout(reg) value, in(reg) a, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Calculate the sum of absolute difference of unsigned 8-bit data elements
+#[inline]
+pub fn pbsad(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x7E, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Calculate and accumulate the sum of absolute difference of unsigned 8-bit data elements
+#[inline]
+pub fn pbsada(t: usize, a: usize, b: usize) -> usize {
+    let mut value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x7F, {}, {}, {}", inlateout(reg) t => value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Multiply signed 8-bit elements and add 16-bit elements on results for packed 32-bit chunks
+#[inline]
+pub fn smaqa(t: usize, a: usize, b: usize) -> usize {
+    let mut value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x64, {}, {}, {}", inlateout(reg) t => value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Multiply unsigned 8-bit elements and add 16-bit elements on results for packed 32-bit chunks
+#[inline]
+pub fn umaqa(t: usize, a: usize, b: usize) -> usize {
+    let mut value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x66, {}, {}, {}", inlateout(reg) t => value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Multiply signed to unsigned 8-bit and add 16-bit elements on results for packed 32-bit chunks
+#[inline]
+pub fn smaqasu(t: usize, a: usize, b: usize) -> usize {
+    let mut value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x65, {}, {}, {}", inlateout(reg) t => value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Adds signed lower 16-bit content of two registers with Q15 saturation
+#[inline]
+pub fn kaddh(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x1, 0x02, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Subtracts signed lower 16-bit content of two registers with Q15 saturation
+#[inline]
+pub fn ksubh(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x1, 0x03, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Adds signed lower 16-bit content of two registers with U16 saturation
+#[inline]
+pub fn ukaddh(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x1, 0x0A, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Subtracts signed lower 16-bit content of two registers with U16 saturation
+#[inline]
+pub fn uksubh(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x1, 0x0B, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
diff --git a/library/stdarch/crates/core_arch/src/simd_llvm.rs b/library/stdarch/crates/core_arch/src/simd_llvm.rs
index 1970e5c69..decdecaaf 100644
--- a/library/stdarch/crates/core_arch/src/simd_llvm.rs
+++ b/library/stdarch/crates/core_arch/src/simd_llvm.rs
@@ -9,13 +9,7 @@ extern "platform-intrinsic" {
     pub fn simd_gt<T, U>(x: T, y: T) -> U;
     pub fn simd_ge<T, U>(x: T, y: T) -> U;
 
-    pub fn simd_shuffle2<T, U>(x: T, y: T, idx: [u32; 2]) -> U;
-    pub fn simd_shuffle4<T, U>(x: T, y: T, idx: [u32; 4]) -> U;
-    pub fn simd_shuffle8<T, U>(x: T, y: T, idx: [u32; 8]) -> U;
-    pub fn simd_shuffle16<T, U>(x: T, y: T, idx: [u32; 16]) -> U;
-    pub fn simd_shuffle32<T, U>(x: T, y: T, idx: [u32; 32]) -> U;
-    pub fn simd_shuffle64<T, U>(x: T, y: T, idx: [u32; 64]) -> U;
-    pub fn simd_shuffle128<T, U>(x: T, y: T, idx: [u32; 128]) -> U;
+    pub fn simd_shuffle<T, U, V>(x: T, y: T, idx: U) -> V;
 
     #[rustc_const_unstable(feature = "const_simd_insert", issue = "none")]
     pub fn simd_insert<T, U>(x: T, idx: u32, val: U) -> T;
diff --git a/library/stdarch/crates/core_arch/src/x86/avx2.rs b/library/stdarch/crates/core_arch/src/x86/avx2.rs
index 24f9c0301..16add3dbb 100644
--- a/library/stdarch/crates/core_arch/src/x86/avx2.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx2.rs
@@ -2001,7 +2001,7 @@ pub unsafe fn _mm256_min_epu8(a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpmovmskb))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_movemask_epi8(a: __m256i) -> i32 {
-    pmovmskb(a.as_i8x32())
+    simd_bitmask::<_, u32>(a.as_i8x32()) as i32
 }
 
 /// Computes the sum of absolute differences (SADs) of quadruplets of unsigned
@@ -3642,8 +3642,6 @@ extern "C" {
     fn pminud(a: u32x8, b: u32x8) -> u32x8;
     #[link_name = "llvm.x86.avx2.pminu.b"]
     fn pminub(a: u8x32, b: u8x32) -> u8x32;
-    #[link_name = "llvm.x86.avx2.pmovmskb"]
-    fn pmovmskb(a: i8x32) -> i32;
     #[link_name = "llvm.x86.avx2.mpsadbw"]
     fn mpsadbw(a: u8x32, b: u8x32, imm8: i32) -> u16x16;
     #[link_name = "llvm.x86.avx2.pmulhu.w"]
diff --git a/library/stdarch/crates/core_arch/src/x86/cpuid.rs b/library/stdarch/crates/core_arch/src/x86/cpuid.rs
index 6b90295ef..2624e8bdf 100644
--- a/library/stdarch/crates/core_arch/src/x86/cpuid.rs
+++ b/library/stdarch/crates/core_arch/src/x86/cpuid.rs
@@ -62,27 +62,27 @@ pub unsafe fn __cpuid_count(leaf: u32, sub_leaf: u32) -> CpuidResult {
     #[cfg(target_arch = "x86")]
     {
         asm!(
-            "movl %ebx, {0}",
+            "mov {0}, ebx",
             "cpuid",
-            "xchgl %ebx, {0}",
-            lateout(reg) ebx,
-            inlateout("eax") leaf => eax,
-            inlateout("ecx") sub_leaf => ecx,
-            lateout("edx") edx,
-            options(nostack, preserves_flags, att_syntax),
+            "xchg {0}, ebx",
+            out(reg) ebx,
+            inout("eax") leaf => eax,
+            inout("ecx") sub_leaf => ecx,
+            out("edx") edx,
+            options(nostack, preserves_flags),
         );
     }
     #[cfg(target_arch = "x86_64")]
     {
         asm!(
-            "movq %rbx, {0:r}",
+            "mov {0:r}, rbx",
             "cpuid",
-            "xchgq %rbx, {0:r}",
-            lateout(reg) ebx,
-            inlateout("eax") leaf => eax,
-            inlateout("ecx") sub_leaf => ecx,
-            lateout("edx") edx,
-            options(nostack, preserves_flags, att_syntax),
+            "xchg {0:r}, rbx",
+            out(reg) ebx,
+            inout("eax") leaf => eax,
+            inout("ecx") sub_leaf => ecx,
+            out("edx") edx,
+            options(nostack, preserves_flags),
         );
     }
     CpuidResult { eax, ebx, ecx, edx }
diff --git a/library/stdarch/crates/core_arch/src/x86/mod.rs b/library/stdarch/crates/core_arch/src/x86/mod.rs
index 547bfe67d..6b50e95b2 100644
--- a/library/stdarch/crates/core_arch/src/x86/mod.rs
+++ b/library/stdarch/crates/core_arch/src/x86/mod.rs
@@ -306,7 +306,7 @@ types! {
 
     /// 256-bit wide set of 16 'u16' types, x86-specific
     ///
-    /// This type is the same as the `__m128bh` type defined by Intel,
+    /// This type is the same as the `__m256bh` type defined by Intel,
     /// representing a 256-bit SIMD register which internally is consisted of
     /// 16 packed `u16` instances. Its purpose is for bf16 related intrinsic
     /// implementations.
@@ -317,7 +317,7 @@ types! {
 
     /// 512-bit wide set of 32 'u16' types, x86-specific
     ///
-    /// This type is the same as the `__m128bh` type defined by Intel,
+    /// This type is the same as the `__m512bh` type defined by Intel,
     /// representing a 512-bit SIMD register which internally is consisted of
     /// 32 packed `u16` instances. Its purpose is for bf16 related intrinsic
     /// implementations.
diff --git a/library/stdarch/crates/core_arch/src/x86/sse2.rs b/library/stdarch/crates/core_arch/src/x86/sse2.rs
index d82b8641f..3e79b3539 100644
--- a/library/stdarch/crates/core_arch/src/x86/sse2.rs
+++ b/library/stdarch/crates/core_arch/src/x86/sse2.rs
@@ -1378,7 +1378,7 @@ pub unsafe fn _mm_insert_epi16<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
 #[cfg_attr(test, assert_instr(pmovmskb))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_movemask_epi8(a: __m128i) -> i32 {
-    pmovmskb(a.as_i8x16())
+    simd_bitmask::<_, u16>(a.as_i8x16()) as u32 as i32
 }
 
 /// Shuffles 32-bit integers in `a` using the control in `IMM8`.
@@ -2856,8 +2856,6 @@ extern "C" {
     fn packssdw(a: i32x4, b: i32x4) -> i16x8;
     #[link_name = "llvm.x86.sse2.packuswb.128"]
     fn packuswb(a: i16x8, b: i16x8) -> u8x16;
-    #[link_name = "llvm.x86.sse2.pmovmskb.128"]
-    fn pmovmskb(a: i8x16) -> i32;
     #[link_name = "llvm.x86.sse2.max.sd"]
     fn maxsd(a: __m128d, b: __m128d) -> __m128d;
     #[link_name = "llvm.x86.sse2.max.pd"]
diff --git a/library/stdarch/crates/core_arch/src/x86/sse3.rs b/library/stdarch/crates/core_arch/src/x86/sse3.rs
index ab0dd38fe..61f8a4e78 100644
--- a/library/stdarch/crates/core_arch/src/x86/sse3.rs
+++ b/library/stdarch/crates/core_arch/src/x86/sse3.rs
@@ -1,11 +1,7 @@
 //! Streaming SIMD Extensions 3 (SSE3)
 
 use crate::{
-    core_arch::{
-        simd::*,
-        simd_llvm::{simd_shuffle2, simd_shuffle4},
-        x86::*,
-    },
+    core_arch::{simd::*, simd_llvm::simd_shuffle, x86::*},
     mem::transmute,
 };
 
diff --git a/library/stdarch/crates/std_detect/Cargo.toml b/library/stdarch/crates/std_detect/Cargo.toml
index 1ca0d9c5d..3a482564e 100644
--- a/library/stdarch/crates/std_detect/Cargo.toml
+++ b/library/stdarch/crates/std_detect/Cargo.toml
@@ -22,7 +22,7 @@ maintenance = { status = "experimental" }
 
 [dependencies]
 libc = { version = "0.2", optional = true, default-features = false }
-cfg-if = "0.1.10"
+cfg-if = "1.0.0"
 
 # When built as part of libstd
 core = { version = "1.0.0", optional = true, package = "rustc-std-workspace-core" }
diff --git a/library/stdarch/crates/std_detect/src/detect/os/linux/aarch64.rs b/library/stdarch/crates/std_detect/src/detect/os/linux/aarch64.rs
index b6a2e5218..6c79ba86d 100644
--- a/library/stdarch/crates/std_detect/src/detect/os/linux/aarch64.rs
+++ b/library/stdarch/crates/std_detect/src/detect/os/linux/aarch64.rs
@@ -23,58 +23,62 @@ pub(crate) fn detect_features() -> cache::Initializer {
 /// The names match those used for cpuinfo.
 ///
 /// [hwcap]: https://github.com/torvalds/linux/blob/master/arch/arm64/include/uapi/asm/hwcap.h
+#[derive(Debug, Default, PartialEq)]
 struct AtHwcap {
-    fp: bool,    // 0
-    asimd: bool, // 1
-    // evtstrm: bool, // 2 No LLVM support
-    aes: bool,     // 3
-    pmull: bool,   // 4
-    sha1: bool,    // 5
-    sha2: bool,    // 6
-    crc32: bool,   // 7
-    atomics: bool, // 8
-    fphp: bool,    // 9
-    asimdhp: bool, // 10
-    // cpuid: bool, // 11 No LLVM support
-    asimdrdm: bool, // 12
-    jscvt: bool,    // 13
-    fcma: bool,     // 14
-    lrcpc: bool,    // 15
-    dcpop: bool,    // 16
-    sha3: bool,     // 17
-    sm3: bool,      // 18
-    sm4: bool,      // 19
-    asimddp: bool,  // 20
-    sha512: bool,   // 21
-    sve: bool,      // 22
-    fhm: bool,      // 23
-    dit: bool,      // 24
-    uscat: bool,    // 25
-    ilrcpc: bool,   // 26
-    flagm: bool,    // 27
-    ssbs: bool,     // 28
-    sb: bool,       // 29
-    paca: bool,     // 30
-    pacg: bool,     // 31
-    dcpodp: bool,   // 32
-    sve2: bool,     // 33
-    sveaes: bool,   // 34
-    // svepmull: bool, // 35 No LLVM support
-    svebitperm: bool, // 36
-    svesha3: bool,    // 37
-    svesm4: bool,     // 38
-    // flagm2: bool, // 39 No LLVM support
-    frint: bool, // 40
-    // svei8mm: bool, // 41 See i8mm feature
-    svef32mm: bool, // 42
-    svef64mm: bool, // 43
-    // svebf16: bool, // 44 See bf16 feature
-    i8mm: bool, // 45
-    bf16: bool, // 46
-    // dgh: bool, // 47 No LLVM support
-    rng: bool, // 48
-    bti: bool, // 49
-    mte: bool, // 50
+    // AT_HWCAP
+    fp: bool,
+    asimd: bool,
+    // evtstrm: No LLVM support.
+    aes: bool,
+    pmull: bool,
+    sha1: bool,
+    sha2: bool,
+    crc32: bool,
+    atomics: bool,
+    fphp: bool,
+    asimdhp: bool,
+    // cpuid: No LLVM support.
+    asimdrdm: bool,
+    jscvt: bool,
+    fcma: bool,
+    lrcpc: bool,
+    dcpop: bool,
+    sha3: bool,
+    sm3: bool,
+    sm4: bool,
+    asimddp: bool,
+    sha512: bool,
+    sve: bool,
+    fhm: bool,
+    dit: bool,
+    uscat: bool,
+    ilrcpc: bool,
+    flagm: bool,
+    ssbs: bool,
+    sb: bool,
+    paca: bool,
+    pacg: bool,
+
+    // AT_HWCAP2
+    dcpodp: bool,
+    sve2: bool,
+    sveaes: bool,
+    // svepmull: No LLVM support.
+    svebitperm: bool,
+    svesha3: bool,
+    svesm4: bool,
+    // flagm2: No LLVM support.
+    frint: bool,
+    // svei8mm: See i8mm feature.
+    svef32mm: bool,
+    svef64mm: bool,
+    // svebf16: See bf16 feature.
+    i8mm: bool,
+    bf16: bool,
+    // dgh: No LLVM support.
+    rng: bool,
+    bti: bool,
+    mte: bool,
 }
 
 impl From<auxvec::AuxVec> for AtHwcap {
@@ -113,25 +117,25 @@ impl From<auxvec::AuxVec> for AtHwcap {
             sb: bit::test(auxv.hwcap, 29),
             paca: bit::test(auxv.hwcap, 30),
             pacg: bit::test(auxv.hwcap, 31),
-            dcpodp: bit::test(auxv.hwcap, 32),
-            sve2: bit::test(auxv.hwcap, 33),
-            sveaes: bit::test(auxv.hwcap, 34),
-            // svepmull: bit::test(auxv.hwcap, 35),
-            svebitperm: bit::test(auxv.hwcap, 36),
-            svesha3: bit::test(auxv.hwcap, 37),
-            svesm4: bit::test(auxv.hwcap, 38),
-            // flagm2: bit::test(auxv.hwcap, 39),
-            frint: bit::test(auxv.hwcap, 40),
-            // svei8mm: bit::test(auxv.hwcap, 41),
-            svef32mm: bit::test(auxv.hwcap, 42),
-            svef64mm: bit::test(auxv.hwcap, 43),
-            // svebf16: bit::test(auxv.hwcap, 44),
-            i8mm: bit::test(auxv.hwcap, 45),
-            bf16: bit::test(auxv.hwcap, 46),
-            // dgh: bit::test(auxv.hwcap, 47),
-            rng: bit::test(auxv.hwcap, 48),
-            bti: bit::test(auxv.hwcap, 49),
-            mte: bit::test(auxv.hwcap, 50),
+            dcpodp: bit::test(auxv.hwcap2, 0),
+            sve2: bit::test(auxv.hwcap2, 1),
+            sveaes: bit::test(auxv.hwcap2, 2),
+            // svepmull: bit::test(auxv.hwcap2, 3),
+            svebitperm: bit::test(auxv.hwcap2, 4),
+            svesha3: bit::test(auxv.hwcap2, 5),
+            svesm4: bit::test(auxv.hwcap2, 6),
+            // flagm2: bit::test(auxv.hwcap2, 7),
+            frint: bit::test(auxv.hwcap2, 8),
+            // svei8mm: bit::test(auxv.hwcap2, 9),
+            svef32mm: bit::test(auxv.hwcap2, 10),
+            svef64mm: bit::test(auxv.hwcap2, 11),
+            // svebf16: bit::test(auxv.hwcap2, 12),
+            i8mm: bit::test(auxv.hwcap2, 13),
+            bf16: bit::test(auxv.hwcap2, 14),
+            // dgh: bit::test(auxv.hwcap2, 15),
+            rng: bit::test(auxv.hwcap2, 16),
+            bti: bit::test(auxv.hwcap2, 17),
+            mte: bit::test(auxv.hwcap2, 18),
         }
     }
 }
@@ -288,3 +292,86 @@ impl AtHwcap {
         value
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[cfg(feature = "std_detect_file_io")]
+    mod auxv_from_file {
+        use super::auxvec::auxv_from_file;
+        use super::*;
+        // The baseline hwcaps used in the (artificial) auxv test files.
+        fn baseline_hwcaps() -> AtHwcap {
+            AtHwcap {
+                fp: true,
+                asimd: true,
+                aes: true,
+                pmull: true,
+                sha1: true,
+                sha2: true,
+                crc32: true,
+                atomics: true,
+                fphp: true,
+                asimdhp: true,
+                asimdrdm: true,
+                lrcpc: true,
+                dcpop: true,
+                asimddp: true,
+                ssbs: true,
+                ..AtHwcap::default()
+            }
+        }
+
+        #[test]
+        fn linux_empty_hwcap2_aarch64() {
+            let file = concat!(
+                env!("CARGO_MANIFEST_DIR"),
+                "/src/detect/test_data/linux-empty-hwcap2-aarch64.auxv"
+            );
+            println!("file: {}", file);
+            let v = auxv_from_file(file).unwrap();
+            println!("HWCAP : 0x{:0x}", v.hwcap);
+            println!("HWCAP2: 0x{:0x}", v.hwcap2);
+            assert_eq!(AtHwcap::from(v), baseline_hwcaps());
+        }
+        #[test]
+        fn linux_no_hwcap2_aarch64() {
+            let file = concat!(
+                env!("CARGO_MANIFEST_DIR"),
+                "/src/detect/test_data/linux-no-hwcap2-aarch64.auxv"
+            );
+            println!("file: {}", file);
+            let v = auxv_from_file(file).unwrap();
+            println!("HWCAP : 0x{:0x}", v.hwcap);
+            println!("HWCAP2: 0x{:0x}", v.hwcap2);
+            assert_eq!(AtHwcap::from(v), baseline_hwcaps());
+        }
+        #[test]
+        fn linux_hwcap2_aarch64() {
+            let file = concat!(
+                env!("CARGO_MANIFEST_DIR"),
+                "/src/detect/test_data/linux-hwcap2-aarch64.auxv"
+            );
+            println!("file: {}", file);
+            let v = auxv_from_file(file).unwrap();
+            println!("HWCAP : 0x{:0x}", v.hwcap);
+            println!("HWCAP2: 0x{:0x}", v.hwcap2);
+            assert_eq!(
+                AtHwcap::from(v),
+                AtHwcap {
+                    // Some other HWCAP bits.
+                    paca: true,
+                    pacg: true,
+                    // HWCAP2-only bits.
+                    dcpodp: true,
+                    frint: true,
+                    rng: true,
+                    bti: true,
+                    mte: true,
+                    ..baseline_hwcaps()
+                }
+            );
+        }
+    }
+}
diff --git a/library/stdarch/crates/std_detect/src/detect/os/linux/auxvec.rs b/library/stdarch/crates/std_detect/src/detect/os/linux/auxvec.rs
index e6447d0cd..c903903bd 100644
--- a/library/stdarch/crates/std_detect/src/detect/os/linux/auxvec.rs
+++ b/library/stdarch/crates/std_detect/src/detect/os/linux/auxvec.rs
@@ -7,6 +7,7 @@ pub(crate) const AT_NULL: usize = 0;
 pub(crate) const AT_HWCAP: usize = 16;
 /// Key to access the CPU Hardware capabilities 2 bitfield.
 #[cfg(any(
+    target_arch = "aarch64",
     target_arch = "arm",
     target_arch = "powerpc",
     target_arch = "powerpc64"
@@ -21,6 +22,7 @@ pub(crate) const AT_HWCAP2: usize = 26;
 pub(crate) struct AuxVec {
     pub hwcap: usize,
     #[cfg(any(
+        target_arch = "aarch64",
         target_arch = "arm",
         target_arch = "powerpc",
         target_arch = "powerpc64"
@@ -64,13 +66,14 @@ pub(crate) fn auxv() -> Result<AuxVec, ()> {
         if let Ok(hwcap) = getauxval(AT_HWCAP) {
             // Targets with only AT_HWCAP:
             #[cfg(any(
-                target_arch = "aarch64",
                 target_arch = "riscv32",
                 target_arch = "riscv64",
                 target_arch = "mips",
                 target_arch = "mips64"
             ))]
             {
+                // Zero could indicate that no features were detected, but it's also used to
+                // indicate an error. In either case, try the fallback.
                 if hwcap != 0 {
                     return Ok(AuxVec { hwcap });
                 }
@@ -78,13 +81,18 @@ pub(crate) fn auxv() -> Result<AuxVec, ()> {
 
             // Targets with AT_HWCAP and AT_HWCAP2:
             #[cfg(any(
+                target_arch = "aarch64",
                 target_arch = "arm",
                 target_arch = "powerpc",
                 target_arch = "powerpc64"
             ))]
             {
                 if let Ok(hwcap2) = getauxval(AT_HWCAP2) {
-                    if hwcap != 0 && hwcap2 != 0 {
+                    // Zero could indicate that no features were detected, but it's also used to
+                    // indicate an error. In particular, on many platforms AT_HWCAP2 will be
+                    // legitimately zero, since it contains the most recent feature flags. Use the
+                    // fallback only if no features were detected at all.
+                    if hwcap != 0 || hwcap2 != 0 {
                         return Ok(AuxVec { hwcap, hwcap2 });
                     }
                 }
@@ -97,7 +105,6 @@ pub(crate) fn auxv() -> Result<AuxVec, ()> {
     {
         // Targets with only AT_HWCAP:
         #[cfg(any(
-            target_arch = "aarch64",
             target_arch = "riscv32",
             target_arch = "riscv64",
             target_arch = "mips",
@@ -105,6 +112,8 @@ pub(crate) fn auxv() -> Result<AuxVec, ()> {
         ))]
         {
             let hwcap = unsafe { libc::getauxval(AT_HWCAP as libc::c_ulong) as usize };
+            // Zero could indicate that no features were detected, but it's also used to indicate
+            // an error. In either case, try the fallback.
             if hwcap != 0 {
                 return Ok(AuxVec { hwcap });
             }
@@ -112,6 +121,7 @@ pub(crate) fn auxv() -> Result<AuxVec, ()> {
 
         // Targets with AT_HWCAP and AT_HWCAP2:
         #[cfg(any(
+            target_arch = "aarch64",
             target_arch = "arm",
             target_arch = "powerpc",
             target_arch = "powerpc64"
@@ -119,7 +129,11 @@ pub(crate) fn auxv() -> Result<AuxVec, ()> {
         {
             let hwcap = unsafe { libc::getauxval(AT_HWCAP as libc::c_ulong) as usize };
             let hwcap2 = unsafe { libc::getauxval(AT_HWCAP2 as libc::c_ulong) as usize };
-            if hwcap != 0 && hwcap2 != 0 {
+            // Zero could indicate that no features were detected, but it's also used to indicate
+            // an error. In particular, on many platforms AT_HWCAP2 will be legitimately zero,
+            // since it contains the most recent feature flags. Use the fallback only if no
+            // features were detected at all.
+            if hwcap != 0 || hwcap2 != 0 {
                 return Ok(AuxVec { hwcap, hwcap2 });
             }
         }
@@ -158,7 +172,7 @@ fn getauxval(key: usize) -> Result<usize, ()> {
 /// Tries to read the auxiliary vector from the `file`. If this fails, this
 /// function returns `Err`.
 #[cfg(feature = "std_detect_file_io")]
-fn auxv_from_file(file: &str) -> Result<AuxVec, ()> {
+pub(super) fn auxv_from_file(file: &str) -> Result<AuxVec, ()> {
     let file = super::read_file(file)?;
 
     // See <https://github.com/torvalds/linux/blob/v3.19/include/uapi/linux/auxvec.h>.
@@ -181,7 +195,6 @@ fn auxv_from_file(file: &str) -> Result<AuxVec, ()> {
 fn auxv_from_buf(buf: &[usize; 64]) -> Result<AuxVec, ()> {
     // Targets with only AT_HWCAP:
     #[cfg(any(
-        target_arch = "aarch64",
         target_arch = "riscv32",
         target_arch = "riscv64",
         target_arch = "mips",
@@ -198,23 +211,25 @@ fn auxv_from_buf(buf: &[usize; 64]) -> Result<AuxVec, ()> {
     }
     // Targets with AT_HWCAP and AT_HWCAP2:
     #[cfg(any(
+        target_arch = "aarch64",
         target_arch = "arm",
         target_arch = "powerpc",
         target_arch = "powerpc64"
     ))]
     {
         let mut hwcap = None;
-        let mut hwcap2 = None;
+        // For some platforms, AT_HWCAP2 was added recently, so let it default to zero.
+        let mut hwcap2 = 0;
         for el in buf.chunks(2) {
             match el[0] {
                 AT_NULL => break,
                 AT_HWCAP => hwcap = Some(el[1]),
-                AT_HWCAP2 => hwcap2 = Some(el[1]),
+                AT_HWCAP2 => hwcap2 = el[1],
                 _ => (),
             }
         }
 
-        if let (Some(hwcap), Some(hwcap2)) = (hwcap, hwcap2) {
+        if let Some(hwcap) = hwcap {
             return Ok(AuxVec { hwcap, hwcap2 });
         }
     }
@@ -256,7 +271,6 @@ mod tests {
     // FIXME: on mips/mips64 getauxval returns 0, and /proc/self/auxv
     // does not always contain the AT_HWCAP key under qemu.
     #[cfg(any(
-        target_arch = "aarch64",
         target_arch = "arm",
         target_arch = "powerpc",
         target_arch = "powerpc64"
@@ -271,6 +285,7 @@ mod tests {
 
         // Targets with AT_HWCAP and AT_HWCAP2:
         #[cfg(any(
+            target_arch = "aarch64",
             target_arch = "arm",
             target_arch = "powerpc",
             target_arch = "powerpc64"
@@ -305,24 +320,31 @@ mod tests {
             }
 
             #[test]
-            #[should_panic]
             fn linux_macos_vb() {
                 let file = concat!(env!("CARGO_MANIFEST_DIR"), "/src/detect/test_data/macos-virtualbox-linux-x86-4850HQ.auxv");
                 println!("file: {}", file);
+                // The file contains HWCAP but not HWCAP2. In that case, we treat HWCAP2 as zero.
                 let v = auxv_from_file(file).unwrap();
-                // this file is incomplete (contains hwcap but not hwcap2), we
-                // want to fall back to /proc/cpuinfo in this case, so
-                // reading should fail. assert_eq!(v.hwcap, 126614527);
-                // assert_eq!(v.hwcap2, 0);
-                let _ = v;
+                assert_eq!(v.hwcap, 126614527);
+                assert_eq!(v.hwcap2, 0);
             }
         } else if #[cfg(target_arch = "aarch64")] {
             #[test]
-            fn linux_x64() {
-                let file = concat!(env!("CARGO_MANIFEST_DIR"), "/src/detect/test_data/linux-x64-i7-6850k.auxv");
+            fn linux_artificial_aarch64() {
+                let file = concat!(env!("CARGO_MANIFEST_DIR"), "/src/detect/test_data/linux-artificial-aarch64.auxv");
                 println!("file: {}", file);
                 let v = auxv_from_file(file).unwrap();
-                assert_eq!(v.hwcap, 3219913727);
+                assert_eq!(v.hwcap, 0x0123456789abcdef);
+                assert_eq!(v.hwcap2, 0x02468ace13579bdf);
+            }
+            #[test]
+            fn linux_no_hwcap2_aarch64() {
+                let file = concat!(env!("CARGO_MANIFEST_DIR"), "/src/detect/test_data/linux-no-hwcap2-aarch64.auxv");
+                println!("file: {}", file);
+                let v = auxv_from_file(file).unwrap();
+                // An absent HWCAP2 is treated as zero, and does not prevent acceptance of HWCAP.
+                assert_ne!(v.hwcap, 0);
+                assert_eq!(v.hwcap2, 0);
             }
         }
     }
@@ -353,6 +375,7 @@ mod tests {
 
         // Targets with AT_HWCAP and AT_HWCAP2:
         #[cfg(any(
+            target_arch = "aarch64",
             target_arch = "arm",
             target_arch = "powerpc",
             target_arch = "powerpc64"
diff --git a/library/stdarch/crates/std_detect/src/detect/test_data/linux-artificial-aarch64.auxv b/library/stdarch/crates/std_detect/src/detect/test_data/linux-artificial-aarch64.auxv
new file mode 100644
index 000000000..ec826afcf
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/test_data/linux-artificial-aarch64.auxv
diff --git a/library/stdarch/crates/std_detect/src/detect/test_data/linux-empty-hwcap2-aarch64.auxv b/library/stdarch/crates/std_detect/src/detect/test_data/linux-empty-hwcap2-aarch64.auxv
new file mode 100644
index 000000000..95537b73f
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/test_data/linux-empty-hwcap2-aarch64.auxv
diff --git a/library/stdarch/crates/std_detect/src/detect/test_data/linux-hwcap2-aarch64.auxv b/library/stdarch/crates/std_detect/src/detect/test_data/linux-hwcap2-aarch64.auxv
new file mode 100644
index 000000000..1d87264b2
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/test_data/linux-hwcap2-aarch64.auxv
diff --git a/library/stdarch/crates/std_detect/src/detect/test_data/linux-no-hwcap2-aarch64.auxv b/library/stdarch/crates/std_detect/src/detect/test_data/linux-no-hwcap2-aarch64.auxv
new file mode 100644
index 000000000..35f01cc76
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/test_data/linux-no-hwcap2-aarch64.auxv
diff --git a/library/stdarch/crates/std_detect/src/detect/test_data/linux-x64-i7-6850k.auxv b/library/stdarch/crates/std_detect/src/detect/test_data/linux-x64-i7-6850k.auxv
deleted file mode 100644
index 6afe1b3b4..000000000
--- a/library/stdarch/crates/std_detect/src/detect/test_data/linux-x64-i7-6850k.auxv
+++ /dev/null
diff --git a/library/stdarch/crates/stdarch-test/Cargo.toml b/library/stdarch/crates/stdarch-test/Cargo.toml
index 9ac1057be..012b4e959 100644
--- a/library/stdarch/crates/stdarch-test/Cargo.toml
+++ b/library/stdarch/crates/stdarch-test/Cargo.toml
@@ -10,7 +10,7 @@ simd-test-macro = { path = "../simd-test-macro" }
 cc = "1.0"
 lazy_static = "1.0"
 rustc-demangle = "0.1.8"
-cfg-if = "0.1"
+cfg-if = "1.0"
 
 # We use a crates.io dependency to disassemble wasm binaries to look for
 # instructions for `#[assert_instr]`. Note that we use an `=` dependency here
diff --git a/library/stdarch/crates/stdarch-test/src/lib.rs b/library/stdarch/crates/stdarch-test/src/lib.rs
index 078736c66..eba17771c 100644
--- a/library/stdarch/crates/stdarch-test/src/lib.rs
+++ b/library/stdarch/crates/stdarch-test/src/lib.rs
@@ -3,7 +3,6 @@
 //! This basically just disassembles the current executable and then parses the
 //! output once globally and then provides the `assert` function which makes
 //! assertions about the disassembly of a function.
-#![feature(bench_black_box)] // For black_box
 #![deny(rust_2018_idioms)]
 #![allow(clippy::missing_docs_in_private_items, clippy::print_stdout)]