Diffstat (limited to 'vendor/compiler_builtins/src')
-rw-r--r--  vendor/compiler_builtins/src/int/shift.rs    18
-rw-r--r--  vendor/compiler_builtins/src/lib.rs           1
-rw-r--r--  vendor/compiler_builtins/src/math.rs         32
-rw-r--r--  vendor/compiler_builtins/src/mem/impls.rs    10
-rw-r--r--  vendor/compiler_builtins/src/mem/mod.rs       8
-rw-r--r--  vendor/compiler_builtins/src/mem/x86_64.rs  130
-rw-r--r--  vendor/compiler_builtins/src/riscv.rs         8
7 files changed, 186 insertions, 21 deletions
diff --git a/vendor/compiler_builtins/src/int/shift.rs b/vendor/compiler_builtins/src/int/shift.rs
index 2d2c081a6..c90cf1de3 100644
--- a/vendor/compiler_builtins/src/int/shift.rs
+++ b/vendor/compiler_builtins/src/int/shift.rs
@@ -12,7 +12,7 @@ trait Ashl: DInt {
         } else {
             Self::from_lo_hi(
                 self.lo().wrapping_shl(shl),
-                self.lo().logical_shr(n_h - shl) | self.hi().wrapping_shl(shl),
+                self.lo().logical_shr(n_h.wrapping_sub(shl)) | self.hi().wrapping_shl(shl),
             )
         }
     }
@@ -36,7 +36,7 @@ trait Ashr: DInt {
             self
         } else {
             Self::from_lo_hi(
-                self.lo().logical_shr(shr) | self.hi().wrapping_shl(n_h - shr),
+                self.lo().logical_shr(shr) | self.hi().wrapping_shl(n_h.wrapping_sub(shr)),
                 self.hi().wrapping_shr(shr),
             )
         }
@@ -57,7 +57,7 @@ trait Lshr: DInt {
             self
         } else {
             Self::from_lo_hi(
-                self.lo().logical_shr(shr) | self.hi().wrapping_shl(n_h - shr),
+                self.lo().logical_shr(shr) | self.hi().wrapping_shl(n_h.wrapping_sub(shr)),
                 self.hi().logical_shr(shr),
             )
         }
@@ -78,8 +78,8 @@ intrinsics! {
     #[avr_skip]
     #[maybe_use_optimized_c_shim]
     #[arm_aeabi_alias = __aeabi_llsl]
-    pub extern "C" fn __ashldi3(a: u64, b: u32) -> u64 {
-        a.ashl(b)
+    pub extern "C" fn __ashldi3(a: u64, b: core::ffi::c_uint) -> u64 {
+        a.ashl(b as u32)
     }
 
     #[avr_skip]
@@ -96,8 +96,8 @@ intrinsics! {
     #[avr_skip]
     #[maybe_use_optimized_c_shim]
     #[arm_aeabi_alias = __aeabi_lasr]
-    pub extern "C" fn __ashrdi3(a: i64, b: u32) -> i64 {
-        a.ashr(b)
+    pub extern "C" fn __ashrdi3(a: i64, b: core::ffi::c_uint) -> i64 {
+        a.ashr(b as u32)
     }
 
     #[avr_skip]
@@ -114,8 +114,8 @@ intrinsics! {
     #[avr_skip]
     #[maybe_use_optimized_c_shim]
     #[arm_aeabi_alias = __aeabi_llsr]
-    pub extern "C" fn __lshrdi3(a: u64, b: u32) -> u64 {
-        a.lshr(b)
+    pub extern "C" fn __lshrdi3(a: u64, b: core::ffi::c_uint) -> u64 {
+        a.lshr(b as u32)
    }
 
     #[avr_skip]
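Note on the shift.rs hunks above: the enclosing branch already guarantees `0 < shl < n_h`, so `n_h - shl` can never underflow; the `wrapping_sub` form merely keeps debug overflow checks (and their panic machinery, which these builtins must not pull in) out of the generated code. The `b: core::ffi::c_uint` signature change matches the C prototype of these functions; on the platforms this crate supports, `c_uint` is `u32`, so the `as u32` cast is free. A minimal sketch of the same double-word decomposition, written against plain `u128`/`u64` instead of the crate's `DInt` trait (`ashl_sketch` is a hypothetical stand-in, not part of the crate):

// 128-bit shift-left assembled from 64-bit halves, mirroring the Ashl logic above.
fn ashl_sketch(x: u128, shl: u32) -> u128 {
    let (lo, hi) = (x as u64, (x >> 64) as u64);
    let n_h = u64::BITS; // 64, the width of one half
    let (new_lo, new_hi) = if shl & n_h != 0 {
        // 64 <= shl < 128: the low half lands entirely in the high half;
        // wrapping_shl masks the count mod 64, exactly as the crate's code relies on.
        (0, lo.wrapping_shl(shl))
    } else if shl == 0 {
        (lo, hi)
    } else {
        // 0 < shl < 64: bits shifted out of `lo` fill the low end of `hi`.
        // wrapping_sub cannot underflow here, but avoids an overflow-check panic path.
        (lo << shl, (hi << shl) | (lo >> n_h.wrapping_sub(shl)))
    };
    ((new_hi as u128) << 64) | new_lo as u128
}

// Spot check against the native operator:
// assert_eq!(ashl_sketch(0x1234_5678_9ABC_DEF0, 72), 0x1234_5678_9ABC_DEF0u128 << 72);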
diff --git a/vendor/compiler_builtins/src/lib.rs b/vendor/compiler_builtins/src/lib.rs
index 10b4aafec..71f249c8e 100644
--- a/vendor/compiler_builtins/src/lib.rs
+++ b/vendor/compiler_builtins/src/lib.rs
@@ -47,6 +47,7 @@ pub mod int;
     all(target_arch = "x86_64", target_os = "none"),
     all(target_arch = "x86_64", target_os = "uefi"),
     all(target_arch = "arm", target_os = "none"),
+    all(target_arch = "xtensa", target_os = "none"),
     target_os = "xous",
     all(target_vendor = "fortanix", target_env = "sgx")
 ))]
diff --git a/vendor/compiler_builtins/src/math.rs b/vendor/compiler_builtins/src/math.rs
index c64984e9e..498e4d85f 100644
--- a/vendor/compiler_builtins/src/math.rs
+++ b/vendor/compiler_builtins/src/math.rs
@@ -86,7 +86,36 @@ no_mangle! {
     fn tanf(n: f32) -> f32;
 }
 
-#[cfg(any(target_os = "xous", target_os = "uefi"))]
+#[cfg(any(
+    all(
+        target_family = "wasm",
+        target_os = "unknown",
+        not(target_env = "wasi")
+    ),
+    target_os = "xous",
+    all(target_arch = "x86_64", target_os = "uefi"),
+    all(target_arch = "xtensa", target_os = "none"),
+    all(target_vendor = "fortanix", target_env = "sgx")
+))]
+intrinsics! {
+    pub extern "C" fn lgamma_r(x: f64, s: &mut i32) -> f64 {
+        let r = self::libm::lgamma_r(x);
+        *s = r.1;
+        r.0
+    }
+
+    pub extern "C" fn lgammaf_r(x: f32, s: &mut i32) -> f32 {
+        let r = self::libm::lgammaf_r(x);
+        *s = r.1;
+        r.0
+    }
+}
+
+#[cfg(any(
+    target_os = "xous",
+    target_os = "uefi",
+    all(target_arch = "xtensa", target_os = "none"),
+))]
 no_mangle! {
     fn sqrtf(x: f32) -> f32;
     fn sqrt(x: f64) -> f64;
@@ -94,6 +123,7 @@ no_mangle! {
 
 #[cfg(any(
     all(target_vendor = "fortanix", target_env = "sgx"),
+    all(target_arch = "xtensa", target_os = "none"),
     target_os = "xous",
     target_os = "uefi"
 ))]
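The `lgamma_r` wrappers added above adapt the tuple-returning functions of the vendored `libm` to the C convention, where the sign of Γ(x) comes back through an out-parameter. A hypothetical caller-side view (the `extern` declaration here is illustrative only, not part of the diff):

// The exported symbol writes sign(gamma(x)) through `signp` and
// returns ln|gamma(x)|, matching C's lgamma_r.
extern "C" {
    fn lgamma_r(x: f64, signp: &mut i32) -> f64;
}

fn ln_abs_gamma(x: f64) -> (f64, i32) {
    let mut sign = 0;
    let val = unsafe { lgamma_r(x, &mut sign) };
    (val, sign) // e.g. for x = -0.5, gamma(x) < 0, so sign == -1
}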
diff --git a/vendor/compiler_builtins/src/mem/impls.rs b/vendor/compiler_builtins/src/mem/impls.rs
index 72003a5c4..23c9d8d32 100644
--- a/vendor/compiler_builtins/src/mem/impls.rs
+++ b/vendor/compiler_builtins/src/mem/impls.rs
@@ -279,3 +279,13 @@ pub unsafe fn compare_bytes(s1: *const u8, s2: *const u8, n: usize) -> i32 {
     }
     0
 }
+
+#[inline(always)]
+pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize {
+    let mut n = 0;
+    while *s != 0 {
+        n += 1;
+        s = s.add(1);
+    }
+    n
+}
diff --git a/vendor/compiler_builtins/src/mem/mod.rs b/vendor/compiler_builtins/src/mem/mod.rs
index c5b0ddc16..be118778b 100644
--- a/vendor/compiler_builtins/src/mem/mod.rs
+++ b/vendor/compiler_builtins/src/mem/mod.rs
@@ -63,13 +63,7 @@ intrinsics! {
 
     #[mem_builtin]
     #[cfg_attr(not(all(target_os = "windows", target_env = "gnu")), linkage = "weak")]
     pub unsafe extern "C" fn strlen(s: *const core::ffi::c_char) -> usize {
-        let mut n = 0;
-        let mut s = s;
-        while *s != 0 {
-            n += 1;
-            s = s.offset(1);
-        }
-        n
+        impls::c_string_length(s)
     }
 }
diff --git a/vendor/compiler_builtins/src/mem/x86_64.rs b/vendor/compiler_builtins/src/mem/x86_64.rs
index 17b461f79..40b67093f 100644
--- a/vendor/compiler_builtins/src/mem/x86_64.rs
+++ b/vendor/compiler_builtins/src/mem/x86_64.rs
@@ -173,6 +173,136 @@ pub unsafe fn compare_bytes(a: *const u8, b: *const u8, n: usize) -> i32 {
     c16(a.cast(), b.cast(), n)
 }
 
+// In order to process more than one byte simultaneously when executing strlen,
+// two things must be considered:
+// * An n-byte read with an n-byte aligned address will never cross
+//   a page boundary and will always succeed. Any smaller alignment
+//   may result in a read that crosses a page boundary, which may
+//   trigger an access violation.
+// * Surface Rust considers any kind of out-of-bounds read to be undefined
+//   behaviour. To sidestep this, the memory accesses are written
+//   using inline assembly.
+
+#[cfg(target_feature = "sse2")]
+#[inline(always)]
+pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize {
+    use core::arch::x86_64::{__m128i, _mm_cmpeq_epi8, _mm_movemask_epi8, _mm_set1_epi8};
+
+    let mut n = 0;
+
+    // The use of _mm_movemask_epi8 and company allows for speedups,
+    // but these operations aren't cheap by themselves. Thus, potentially
+    // short strings are handled in simple loops.
+
+    for _ in 0..4 {
+        if *s == 0 {
+            return n;
+        }
+
+        n += 1;
+        s = s.add(1);
+    }
+
+    // Shave off the least significant bits to align the address to a 16-byte
+    // boundary. The shaved-off bits are used to correct the first iteration.
+
+    let align = s as usize & 15;
+    let mut s = ((s as usize) - align) as *const __m128i;
+    let zero = _mm_set1_epi8(0);
+
+    let x = {
+        let r;
+        asm!(
+            "movdqa ({addr}), {dest}",
+            addr = in(reg) s,
+            dest = out(xmm_reg) r,
+            options(att_syntax, nostack),
+        );
+        r
+    };
+    let cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(x, zero)) >> align;
+
+    if cmp != 0 {
+        return n + cmp.trailing_zeros() as usize;
+    }
+
+    n += 16 - align;
+    s = s.add(1);
+
+    loop {
+        let x = {
+            let r;
+            asm!(
+                "movdqa ({addr}), {dest}",
+                addr = in(reg) s,
+                dest = out(xmm_reg) r,
+                options(att_syntax, nostack),
+            );
+            r
+        };
+        let cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(x, zero)) as u32;
+        if cmp == 0 {
+            n += 16;
+            s = s.add(1);
+        } else {
+            return n + cmp.trailing_zeros() as usize;
+        }
+    }
+}
+
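The alignment fixup above reads the entire aligned 16-byte block containing `s` and then shifts the comparison mask right by `align`, so the bytes sitting before `s` in that block cannot register as matches. A safe-Rust model of what `_mm_cmpeq_epi8` plus `_mm_movemask_epi8` plus the shift compute (`first_zero_in_block` is an illustrative helper, not part of the crate):

// Bit i of `mask` is set iff block[i] == 0; the low `align` bits are
// discarded because those bytes precede `s` in the aligned block.
fn first_zero_in_block(block: [u8; 16], align: usize) -> Option<usize> {
    let mut mask: u32 = 0;
    for (i, &b) in block.iter().enumerate() {
        mask |= ((b == 0) as u32) << i; // what _mm_movemask_epi8 builds
    }
    mask >>= align;
    (mask != 0).then(|| mask.trailing_zeros() as usize)
}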
+// Provided for scenarios like kernel development, where SSE might not
+// be available.
+#[cfg(not(target_feature = "sse2"))]
+#[inline(always)]
+pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize {
+    let mut n = 0;
+
+    // Check bytes in steps of one until
+    // either a zero byte is discovered or
+    // the pointer is aligned to an eight-byte boundary.
+
+    while s as usize & 7 != 0 {
+        if *s == 0 {
+            return n;
+        }
+        n += 1;
+        s = s.add(1);
+    }
+
+    // Check bytes in steps of eight until a zero
+    // byte is discovered.
+
+    let mut s = s as *const u64;
+
+    loop {
+        let mut cs = {
+            let r: u64;
+            asm!(
+                "mov ({addr}), {dest}",
+                addr = in(reg) s,
+                dest = out(reg) r,
+                options(att_syntax, nostack),
+            );
+            r
+        };
+        // Detect if a word has a zero byte, taken from
+        // https://graphics.stanford.edu/~seander/bithacks.html
+        if (cs.wrapping_sub(0x0101010101010101) & !cs & 0x8080808080808080) != 0 {
+            loop {
+                if cs & 255 == 0 {
+                    return n;
+                } else {
+                    cs >>= 8;
+                    n += 1;
+                }
+            }
+        } else {
+            n += 8;
+            s = s.add(1);
+        }
+    }
+}
+
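The zero-byte test in the fallback is the classic SWAR trick: for each byte b, (b - 0x01) & !b & 0x80 sets the byte's high bit only when b == 0, and borrows from the word-wide subtraction can only corrupt lanes above the first zero byte, so the first hit is always reported correctly. A quick self-contained check (hypothetical test code, not part of the crate):

fn has_zero_byte(v: u64) -> bool {
    (v.wrapping_sub(0x0101_0101_0101_0101) & !v & 0x8080_8080_8080_8080) != 0
}

fn main() {
    // "abcdefgh" packed little-endian: no zero byte anywhere.
    assert!(!has_zero_byte(u64::from_le_bytes(*b"abcdefgh")));
    // "abc\0efgh": the NUL in byte 3 trips the test, and the index of the
    // first zero byte falls out of the mask as trailing_zeros / 8.
    let v = u64::from_le_bytes(*b"abc\0efgh");
    let mask = v.wrapping_sub(0x0101_0101_0101_0101) & !v & 0x8080_8080_8080_8080;
    assert!(has_zero_byte(v));
    assert_eq!(mask.trailing_zeros() / 8, 3);
}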
 /// Determine optimal parameters for a `rep` instruction.
 fn rep_param(dest: *mut u8, mut count: usize) -> (usize, usize, usize) {
     // Unaligned writes are still slow on modern processors, so align the destination address.
diff --git a/vendor/compiler_builtins/src/riscv.rs b/vendor/compiler_builtins/src/riscv.rs
index ae361b33a..bf3125533 100644
--- a/vendor/compiler_builtins/src/riscv.rs
+++ b/vendor/compiler_builtins/src/riscv.rs
@@ -19,11 +19,11 @@ intrinsics! {
     // https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/builtins/riscv/int_mul_impl.inc
     pub extern "C" fn __mulsi3(a: u32, b: u32) -> u32 {
         let (mut a, mut b) = (a, b);
-        let mut r = 0;
+        let mut r: u32 = 0;
 
         while a > 0 {
             if a & 1 > 0 {
-                r += b;
+                r = r.wrapping_add(b);
             }
             a >>= 1;
             b <<= 1;
@@ -35,11 +35,11 @@ intrinsics! {
     #[cfg(not(target_feature = "m"))]
     pub extern "C" fn __muldi3(a: u64, b: u64) -> u64 {
         let (mut a, mut b) = (a, b);
-        let mut r = 0;
+        let mut r: u64 = 0;
 
         while a > 0 {
             if a & 1 > 0 {
-                r += b;
+                r = r.wrapping_add(b);
             }
             a >>= 1;
             b <<= 1;
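Both multiply routines use the textbook shift-and-add scheme: each set bit i of `a` contributes `b << i` to the product. The switch to `r = r.wrapping_add(b)` makes overflowing products wrap like hardware multiplication instead of tripping the overflow check in debug builds, whose panic path the builtins cannot afford. A standalone model mirroring `__mulsi3` (hypothetical test, not part of the crate):

fn mulsi3_model(mut a: u32, mut b: u32) -> u32 {
    let mut r: u32 = 0;
    while a > 0 {
        if a & 1 > 0 {
            r = r.wrapping_add(b); // wraps mod 2^32, like the mul instruction
        }
        a >>= 1;
        b <<= 1; // high bits fall off; exactly the bits wrapping discards
    }
    r
}

fn main() {
    assert_eq!(mulsi3_model(7, 6), 42);
    // Overflowing case: matches wrapping (i.e. hardware) multiplication.
    assert_eq!(mulsi3_model(u32::MAX, 3), u32::MAX.wrapping_mul(3));
}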