From fbaf0bb26397aa498eb9156f06d5a6fe34dd7dd8 Mon Sep 17 00:00:00 2001
From: Daniel Baumann <daniel.baumann@progress-linux.org>
Date: Fri, 19 Apr 2024 03:14:29 +0200
Subject: Merging upstream version 125.0.1.

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
---
 media/libopus/celt/arm/arm_celt_map.c    | 31 ++++++++----
 media/libopus/celt/arm/armcpu.c          | 51 +++++++++++++++++++-
 media/libopus/celt/arm/armcpu.h          | 13 +++++
 media/libopus/celt/arm/celt_neon_intr.c  | 83 +++++++++++++++++++++++++++-----
 media/libopus/celt/arm/pitch_neon_intr.c |  7 +++
 5 files changed, 163 insertions(+), 22 deletions(-)

(limited to 'media/libopus/celt/arm')

diff --git a/media/libopus/celt/arm/arm_celt_map.c b/media/libopus/celt/arm/arm_celt_map.c
index ca988b66f5..cbaea49579 100644
--- a/media/libopus/celt/arm/arm_celt_map.c
+++ b/media/libopus/celt/arm/arm_celt_map.c
@@ -40,7 +40,8 @@ opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, c
   celt_inner_prod_c,   /* ARMv4 */
   celt_inner_prod_c,   /* EDSP */
   celt_inner_prod_c,   /* Media */
-  celt_inner_prod_neon /* NEON */
+  celt_inner_prod_neon,/* NEON */
+  celt_inner_prod_neon /* DOTPROD */
 };
 
 void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
@@ -48,7 +49,8 @@ void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const o
   dual_inner_prod_c,   /* ARMv4 */
   dual_inner_prod_c,   /* EDSP */
   dual_inner_prod_c,   /* Media */
-  dual_inner_prod_neon /* NEON */
+  dual_inner_prod_neon,/* NEON */
+  dual_inner_prod_neon /* DOTPROD */
 };
 # endif
 
@@ -61,7 +63,8 @@ opus_val32 (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
   celt_pitch_xcorr_c,               /* ARMv4 */
   MAY_HAVE_EDSP(celt_pitch_xcorr),  /* EDSP */
   MAY_HAVE_MEDIA(celt_pitch_xcorr), /* Media */
-  MAY_HAVE_NEON(celt_pitch_xcorr)   /* NEON */
+  MAY_HAVE_NEON(celt_pitch_xcorr),  /* NEON */
+  MAY_HAVE_NEON(celt_pitch_xcorr)   /* DOTPROD */
 };
 
 #  endif
@@ -72,7 +75,8 @@ void (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
   celt_pitch_xcorr_c,              /* ARMv4 */
   celt_pitch_xcorr_c,              /* EDSP */
   celt_pitch_xcorr_c,              /* Media */
-  celt_pitch_xcorr_float_neon      /* Neon */
+  celt_pitch_xcorr_float_neon,     /* Neon */
+  celt_pitch_xcorr_float_neon      /* DOTPROD */
 };
 #  endif
 # endif /* FIXED_POINT */
@@ -90,6 +94,7 @@ void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
   xcorr_kernel_c,                /* EDSP */
   xcorr_kernel_c,                /* Media */
   xcorr_kernel_neon_fixed,       /* Neon */
+  xcorr_kernel_neon_fixed        /* DOTPROD */
 };
 
 #endif
@@ -101,14 +106,16 @@ int (*const OPUS_FFT_ALLOC_ARCH_IMPL[OPUS_ARCHMASK+1])(kiss_fft_state *st) = {
    opus_fft_alloc_arch_c,        /* ARMv4 */
    opus_fft_alloc_arch_c,        /* EDSP */
    opus_fft_alloc_arch_c,        /* Media */
-   opus_fft_alloc_arm_neon       /* Neon with NE10 library support */
+   opus_fft_alloc_arm_neon,      /* Neon with NE10 library support */
+   opus_fft_alloc_arm_neon       /* DOTPROD with NE10 library support */
 };
 
 void (*const OPUS_FFT_FREE_ARCH_IMPL[OPUS_ARCHMASK+1])(kiss_fft_state *st) = {
    opus_fft_free_arch_c,         /* ARMv4 */
    opus_fft_free_arch_c,         /* EDSP */
    opus_fft_free_arch_c,         /* Media */
-   opus_fft_free_arm_neon        /* Neon with NE10 */
+   opus_fft_free_arm_neon,       /* Neon with NE10 */
+   opus_fft_free_arm_neon        /* DOTPROD with NE10 */
 };
 #   endif /* CUSTOM_MODES */
 
@@ -118,7 +125,8 @@ void (*const OPUS_FFT[OPUS_ARCHMASK+1])(const kiss_fft_state *cfg,
    opus_fft_c,                   /* ARMv4 */
    opus_fft_c,                   /* EDSP */
    opus_fft_c,                   /* Media */
-   opus_fft_neon                 /* Neon with NE10 */
+   opus_fft_neon,                /* Neon with NE10 */
+   opus_fft_neon                 /* DOTPROD with NE10 */
 };
 
 void (*const OPUS_IFFT[OPUS_ARCHMASK+1])(const kiss_fft_state *cfg,
@@ -127,7 +135,8 @@ void (*const OPUS_IFFT[OPUS_ARCHMASK+1])(const kiss_fft_state *cfg,
    opus_ifft_c,                   /* ARMv4 */
    opus_ifft_c,                   /* EDSP */
    opus_ifft_c,                   /* Media */
-   opus_ifft_neon                 /* Neon with NE10 */
+   opus_ifft_neon,                /* Neon with NE10 */
+   opus_ifft_neon                 /* DOTPROD with NE10 */
 };
 
 void (*const CLT_MDCT_FORWARD_IMPL[OPUS_ARCHMASK+1])(const mdct_lookup *l,
@@ -139,7 +148,8 @@ void (*const CLT_MDCT_FORWARD_IMPL[OPUS_ARCHMASK+1])(const mdct_lookup *l,
    clt_mdct_forward_c,           /* ARMv4 */
    clt_mdct_forward_c,           /* EDSP */
    clt_mdct_forward_c,           /* Media */
-   clt_mdct_forward_neon         /* Neon with NE10 */
+   clt_mdct_forward_neon,        /* Neon with NE10 */
+   clt_mdct_forward_neon         /* DOTPROD with NE10 */
 };
 
 void (*const CLT_MDCT_BACKWARD_IMPL[OPUS_ARCHMASK+1])(const mdct_lookup *l,
@@ -151,7 +161,8 @@ void (*const CLT_MDCT_BACKWARD_IMPL[OPUS_ARCHMASK+1])(const mdct_lookup *l,
    clt_mdct_backward_c,           /* ARMv4 */
    clt_mdct_backward_c,           /* EDSP */
    clt_mdct_backward_c,           /* Media */
-   clt_mdct_backward_neon         /* Neon with NE10 */
+   clt_mdct_backward_neon,        /* Neon with NE10 */
+   clt_mdct_backward_neon         /* DOTPROD with NE10 */
 };
 
 #  endif /* HAVE_ARM_NE10 */
diff --git a/media/libopus/celt/arm/armcpu.c b/media/libopus/celt/arm/armcpu.c
index c7d16e6d61..06a53435b8 100644
--- a/media/libopus/celt/arm/armcpu.c
+++ b/media/libopus/celt/arm/armcpu.c
@@ -43,6 +43,7 @@
 #define OPUS_CPU_ARM_EDSP_FLAG  (1<<OPUS_ARCH_ARM_EDSP)
 #define OPUS_CPU_ARM_MEDIA_FLAG (1<<OPUS_ARCH_ARM_MEDIA)
 #define OPUS_CPU_ARM_NEON_FLAG  (1<<OPUS_ARCH_ARM_NEON)
+#define OPUS_CPU_ARM_DOTPROD_FLAG  (1<<OPUS_ARCH_ARM_DOTPROD)
 
 #if defined(_MSC_VER)
 /*For GetExceptionCode() and EXCEPTION_ILLEGAL_INSTRUCTION.*/
@@ -126,6 +127,14 @@ opus_uint32 opus_cpu_capabilities(void)
         p = strstr(buf, " neon");
         if(p != NULL && (p[5] == ' ' || p[5] == '\n'))
           flags |= OPUS_CPU_ARM_NEON_FLAG;
+        p = strstr(buf, " asimd");
+        if(p != NULL && (p[6] == ' ' || p[6] == '\n'))
+          flags |= OPUS_CPU_ARM_NEON_FLAG | OPUS_CPU_ARM_MEDIA_FLAG | OPUS_CPU_ARM_EDSP_FLAG;
+#  endif
+#  if defined(OPUS_ARM_MAY_HAVE_DOTPROD)
+        p = strstr(buf, " asimddp");
+        if(p != NULL && (p[8] == ' ' || p[8] == '\n'))
+          flags |= OPUS_CPU_ARM_DOTPROD_FLAG;
 #  endif
       }
 # endif
@@ -144,10 +153,44 @@ opus_uint32 opus_cpu_capabilities(void)
 # endif
     }
 
+#if defined(OPUS_ARM_PRESUME_AARCH64_NEON_INTR)
+    flags |= OPUS_CPU_ARM_EDSP_FLAG | OPUS_CPU_ARM_MEDIA_FLAG | OPUS_CPU_ARM_NEON_FLAG;
+# if defined(OPUS_ARM_PRESUME_DOTPROD)
+    flags |= OPUS_CPU_ARM_DOTPROD_FLAG;
+# endif
+#endif
+
     fclose(cpuinfo);
   }
   return flags;
 }
+
+#elif defined(__APPLE__)
+#include <sys/types.h>
+#include <sys/sysctl.h>
+
+opus_uint32 opus_cpu_capabilities(void)
+{
+  opus_uint32 flags = 0;
+
+#if defined(OPUS_ARM_MAY_HAVE_DOTPROD)
+  size_t size = sizeof(uint32_t);
+  uint32_t value = 0;
+  if (!sysctlbyname("hw.optional.arm.FEAT_DotProd", &value, &size, NULL, 0) && value)
+  {
+    flags |= OPUS_CPU_ARM_DOTPROD_FLAG;
+  }
+#endif
+
+#if defined(OPUS_ARM_PRESUME_AARCH64_NEON_INTR)
+  flags |= OPUS_CPU_ARM_EDSP_FLAG | OPUS_CPU_ARM_MEDIA_FLAG | OPUS_CPU_ARM_NEON_FLAG;
+# if defined(OPUS_ARM_PRESUME_DOTPROD)
+  flags |= OPUS_CPU_ARM_DOTPROD_FLAG;
+# endif
+#endif
+  return flags;
+}
+
 #else
 /* The feature registers which can tell us what the processor supports are
  * accessible in priveleged modes only, so we can't have a general user-space
@@ -180,7 +223,13 @@ static int opus_select_arch_impl(void)
   }
   arch++;
 
-  celt_assert(arch == OPUS_ARCH_ARM_NEON);
+  if(!(flags & OPUS_CPU_ARM_DOTPROD_FLAG)) {
+    celt_assert(arch == OPUS_ARCH_ARM_NEON);
+    return arch;
+  }
+  arch++;
+
+  celt_assert(arch == OPUS_ARCH_ARM_DOTPROD);
   return arch;
 }
 
diff --git a/media/libopus/celt/arm/armcpu.h b/media/libopus/celt/arm/armcpu.h
index 820262ff5f..6d5803d81a 100644
--- a/media/libopus/celt/arm/armcpu.h
+++ b/media/libopus/celt/arm/armcpu.h
@@ -46,6 +46,12 @@
 #  define MAY_HAVE_NEON(name) MAY_HAVE_MEDIA(name)
 # endif
 
+# if defined(OPUS_ARM_MAY_HAVE_DOTPROD)
+#  define MAY_HAVE_DOTPROD(name) name ## _dotprod
+# else
+#  define MAY_HAVE_DOTPROD(name) MAY_HAVE_NEON(name)
+# endif
+
 # if defined(OPUS_ARM_PRESUME_EDSP)
 #  define PRESUME_EDSP(name) name ## _edsp
 # else
@@ -64,6 +70,12 @@
 #  define PRESUME_NEON(name) PRESUME_MEDIA(name)
 # endif
 
+# if defined(OPUS_ARM_PRESUME_DOTPROD)
+#  define PRESUME_DOTPROD(name) name ## _dotprod
+# else
+#  define PRESUME_DOTPROD(name) PRESUME_NEON(name)
+# endif
+
 # if defined(OPUS_HAVE_RTCD)
 int opus_select_arch(void);
 
@@ -71,6 +83,7 @@ int opus_select_arch(void);
 #define OPUS_ARCH_ARM_EDSP  (1)
 #define OPUS_ARCH_ARM_MEDIA (2)
 #define OPUS_ARCH_ARM_NEON  (3)
+#define OPUS_ARCH_ARM_DOTPROD  (4)
 
 # endif
 
diff --git a/media/libopus/celt/arm/celt_neon_intr.c b/media/libopus/celt/arm/celt_neon_intr.c
index effda769d0..250f836218 100644
--- a/media/libopus/celt/arm/celt_neon_intr.c
+++ b/media/libopus/celt/arm/celt_neon_intr.c
@@ -38,6 +38,8 @@
 #include "../pitch.h"
 
 #if defined(FIXED_POINT)
+#include <string.h>
+
 void xcorr_kernel_neon_fixed(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[4], int len)
 {
    int j;
@@ -47,7 +49,10 @@ void xcorr_kernel_neon_fixed(const opus_val16 * x, const opus_val16 * y, opus_va
    int16x4_t y0 = vld1_s16(y);
    y += 4;
 
-   for (j = 0; j + 8 <= len; j += 8)
+   /* This loop loads one y value more than we actually need.
+      Therefore we have to stop as soon as there are 8 or fewer samples left
+       (instead of 7), to avoid reading past the end of the array. */
+   for (j = 0; j + 8 < len; j += 8)
    {
       /* Load x[0...7] */
       int16x8_t xx = vld1q_s16(x);
@@ -80,23 +85,79 @@ void xcorr_kernel_neon_fixed(const opus_val16 * x, const opus_val16 * y, opus_va
       x += 8;
       y += 8;
    }
-
-   for (; j < len; j++)
-   {
-      int16x4_t x0 = vld1_dup_s16(x);  /* load next x */
+   if (j + 4 < len) {
+      /* Load x[0...3] */
+      int16x4_t x0 = vld1_s16(x);
+      /* Load y[4...7] */
+      int16x4_t y4 = vld1_s16(y);
+      int32x4_t a0 = vmlal_lane_s16(a, y0, x0, 0);
+      int16x4_t y1 = vext_s16(y0, y4, 1);
+      int32x4_t a1 = vmlal_lane_s16(a0, y1, x0, 1);
+      int16x4_t y2 = vext_s16(y0, y4, 2);
+      int32x4_t a2 = vmlal_lane_s16(a1, y2, x0, 2);
+      int16x4_t y3 = vext_s16(y0, y4, 3);
+      int32x4_t a3 = vmlal_lane_s16(a2, y3, x0, 3);
+      y0 = y4;
+      a = a3;
+      x += 4;
+      y += 4;
+      j += 4;
+   }
+   if (j + 2 < len) {
+      /* Load x[0...1] */
+      int16x4x2_t xx = vld2_dup_s16(x);
+      int16x4_t x0 = xx.val[0];
+      int16x4_t x1 = xx.val[1];
+      /* Load y[4...5].
+         We would like to use vld1_dup_s32(), but casting the pointer would
+          break strict aliasing rules and potentially have alignment issues.
+         Fortunately the compiler seems capable of translating this memcpy()
+          and vdup_n_s32() into the equivalent vld1_dup_s32().*/
+      int32_t yy;
+      memcpy(&yy, y, sizeof(yy));
+      int16x4_t y4 = vreinterpret_s16_s32(vdup_n_s32(yy));
       int32x4_t a0 = vmlal_s16(a, y0, x0);
-
-      int16x4_t y4 = vld1_dup_s16(y);  /* load next y */
-      y0 = vext_s16(y0, y4, 1);
+      int16x4_t y1 = vext_s16(y0, y4, 1);
+      /* Replace bottom copy of {y[5], y[4]} in y4 with {y[3], y[2]} from y0,
+          using VSRI instead of VEXT, since it's a data-processing
+          instruction. */
+      y0 = vreinterpret_s16_s64(vsri_n_s64(vreinterpret_s64_s16(y4),
+       vreinterpret_s64_s16(y0), 32));
+      int32x4_t a1 = vmlal_s16(a0, y1, x1);
+      a = a1;
+      x += 2;
+      y += 2;
+      j += 2;
+   }
+   if (j + 1 < len) {
+      /* Load next x. */
+      int16x4_t x0 = vld1_dup_s16(x);
+      int32x4_t a0 = vmlal_s16(a, y0, x0);
+      /* Load last y. */
+      int16x4_t y4 = vld1_dup_s16(y);
+      y0 = vreinterpret_s16_s64(vsri_n_s64(vreinterpret_s64_s16(y4),
+       vreinterpret_s64_s16(y0), 16));
       a = a0;
       x++;
-      y++;
    }
-
-   vst1q_s32(sum, a);
+   /* Load last x. */
+   int16x4_t x0 = vld1_dup_s16(x);
+   int32x4_t a0 = vmlal_s16(a, y0, x0);
+   vst1q_s32(sum, a0);
 }
 
 #else
+
+#if defined(__ARM_FEATURE_FMA) && defined(__ARM_ARCH_ISA_A64)
+/* If we can, force the compiler to use an FMA instruction rather than break
+ *    vmlaq_f32() into fmul/fadd. */
+#ifdef vmlaq_lane_f32
+#undef vmlaq_lane_f32
+#endif
+#define vmlaq_lane_f32(a,b,c,lane) vfmaq_lane_f32(a,b,c,lane)
+#endif
+
+
 /*
  * Function: xcorr_kernel_neon_float
  * ---------------------------------
diff --git a/media/libopus/celt/arm/pitch_neon_intr.c b/media/libopus/celt/arm/pitch_neon_intr.c
index 35cc46e2c2..43885f528c 100644
--- a/media/libopus/celt/arm/pitch_neon_intr.c
+++ b/media/libopus/celt/arm/pitch_neon_intr.c
@@ -130,6 +130,13 @@ void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus
 
 /* ========================================================================== */
 
+#ifdef __ARM_FEATURE_FMA
+/* If we can, force the compiler to use an FMA instruction rather than break
+   vmlaq_f32() into fmul/fadd. */
+#define vmlaq_f32(a,b,c) vfmaq_f32(a,b,c)
+#endif
+
+
 #ifdef OPUS_CHECK_ASM
 
 /* This part of code simulates floating-point NEON operations. */
-- 
cgit v1.2.3