summaryrefslogtreecommitdiffstats
path: root/media/libopus/celt/arm
diff options
context:
space:
mode:
Diffstat (limited to 'media/libopus/celt/arm')
-rw-r--r--media/libopus/celt/arm/arm_celt_map.c31
-rw-r--r--media/libopus/celt/arm/armcpu.c51
-rw-r--r--media/libopus/celt/arm/armcpu.h13
-rw-r--r--media/libopus/celt/arm/celt_neon_intr.c83
-rw-r--r--media/libopus/celt/arm/pitch_neon_intr.c7
5 files changed, 163 insertions, 22 deletions
diff --git a/media/libopus/celt/arm/arm_celt_map.c b/media/libopus/celt/arm/arm_celt_map.c
index ca988b66f5..cbaea49579 100644
--- a/media/libopus/celt/arm/arm_celt_map.c
+++ b/media/libopus/celt/arm/arm_celt_map.c
@@ -40,7 +40,8 @@ opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, c
celt_inner_prod_c, /* ARMv4 */
celt_inner_prod_c, /* EDSP */
celt_inner_prod_c, /* Media */
- celt_inner_prod_neon /* NEON */
+ celt_inner_prod_neon,/* NEON */
+ celt_inner_prod_neon /* DOTPROD */
};
void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
@@ -48,7 +49,8 @@ void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const o
dual_inner_prod_c, /* ARMv4 */
dual_inner_prod_c, /* EDSP */
dual_inner_prod_c, /* Media */
- dual_inner_prod_neon /* NEON */
+ dual_inner_prod_neon,/* NEON */
+ dual_inner_prod_neon /* DOTPROD */
};
# endif
@@ -61,7 +63,8 @@ opus_val32 (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
celt_pitch_xcorr_c, /* ARMv4 */
MAY_HAVE_EDSP(celt_pitch_xcorr), /* EDSP */
MAY_HAVE_MEDIA(celt_pitch_xcorr), /* Media */
- MAY_HAVE_NEON(celt_pitch_xcorr) /* NEON */
+ MAY_HAVE_NEON(celt_pitch_xcorr), /* NEON */
+ MAY_HAVE_NEON(celt_pitch_xcorr) /* DOTPROD */
};
# endif
@@ -72,7 +75,8 @@ void (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
celt_pitch_xcorr_c, /* ARMv4 */
celt_pitch_xcorr_c, /* EDSP */
celt_pitch_xcorr_c, /* Media */
- celt_pitch_xcorr_float_neon /* Neon */
+ celt_pitch_xcorr_float_neon, /* Neon */
+ celt_pitch_xcorr_float_neon /* DOTPROD */
};
# endif
# endif /* FIXED_POINT */
@@ -90,6 +94,7 @@ void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
xcorr_kernel_c, /* EDSP */
xcorr_kernel_c, /* Media */
xcorr_kernel_neon_fixed, /* Neon */
+ xcorr_kernel_neon_fixed /* DOTPROD */
};
#endif
@@ -101,14 +106,16 @@ int (*const OPUS_FFT_ALLOC_ARCH_IMPL[OPUS_ARCHMASK+1])(kiss_fft_state *st) = {
opus_fft_alloc_arch_c, /* ARMv4 */
opus_fft_alloc_arch_c, /* EDSP */
opus_fft_alloc_arch_c, /* Media */
- opus_fft_alloc_arm_neon /* Neon with NE10 library support */
+ opus_fft_alloc_arm_neon, /* Neon with NE10 library support */
+ opus_fft_alloc_arm_neon /* DOTPROD with NE10 library support */
};
void (*const OPUS_FFT_FREE_ARCH_IMPL[OPUS_ARCHMASK+1])(kiss_fft_state *st) = {
opus_fft_free_arch_c, /* ARMv4 */
opus_fft_free_arch_c, /* EDSP */
opus_fft_free_arch_c, /* Media */
- opus_fft_free_arm_neon /* Neon with NE10 */
+ opus_fft_free_arm_neon, /* Neon with NE10 */
+ opus_fft_free_arm_neon /* DOTPROD with NE10 */
};
# endif /* CUSTOM_MODES */
@@ -118,7 +125,8 @@ void (*const OPUS_FFT[OPUS_ARCHMASK+1])(const kiss_fft_state *cfg,
opus_fft_c, /* ARMv4 */
opus_fft_c, /* EDSP */
opus_fft_c, /* Media */
- opus_fft_neon /* Neon with NE10 */
+ opus_fft_neon, /* Neon with NE10 */
+ opus_fft_neon /* DOTPROD with NE10 */
};
void (*const OPUS_IFFT[OPUS_ARCHMASK+1])(const kiss_fft_state *cfg,
@@ -127,7 +135,8 @@ void (*const OPUS_IFFT[OPUS_ARCHMASK+1])(const kiss_fft_state *cfg,
opus_ifft_c, /* ARMv4 */
opus_ifft_c, /* EDSP */
opus_ifft_c, /* Media */
- opus_ifft_neon /* Neon with NE10 */
+ opus_ifft_neon, /* Neon with NE10 */
+ opus_ifft_neon /* DOTPROD with NE10 */
};
void (*const CLT_MDCT_FORWARD_IMPL[OPUS_ARCHMASK+1])(const mdct_lookup *l,
@@ -139,7 +148,8 @@ void (*const CLT_MDCT_FORWARD_IMPL[OPUS_ARCHMASK+1])(const mdct_lookup *l,
clt_mdct_forward_c, /* ARMv4 */
clt_mdct_forward_c, /* EDSP */
clt_mdct_forward_c, /* Media */
- clt_mdct_forward_neon /* Neon with NE10 */
+ clt_mdct_forward_neon, /* Neon with NE10 */
+ clt_mdct_forward_neon /* DOTPROD with NE10 */
};
void (*const CLT_MDCT_BACKWARD_IMPL[OPUS_ARCHMASK+1])(const mdct_lookup *l,
@@ -151,7 +161,8 @@ void (*const CLT_MDCT_BACKWARD_IMPL[OPUS_ARCHMASK+1])(const mdct_lookup *l,
clt_mdct_backward_c, /* ARMv4 */
clt_mdct_backward_c, /* EDSP */
clt_mdct_backward_c, /* Media */
- clt_mdct_backward_neon /* Neon with NE10 */
+ clt_mdct_backward_neon, /* Neon with NE10 */
+ clt_mdct_backward_neon /* DOTPROD with NE10 */
};
# endif /* HAVE_ARM_NE10 */
diff --git a/media/libopus/celt/arm/armcpu.c b/media/libopus/celt/arm/armcpu.c
index c7d16e6d61..06a53435b8 100644
--- a/media/libopus/celt/arm/armcpu.c
+++ b/media/libopus/celt/arm/armcpu.c
@@ -43,6 +43,7 @@
#define OPUS_CPU_ARM_EDSP_FLAG (1<<OPUS_ARCH_ARM_EDSP)
#define OPUS_CPU_ARM_MEDIA_FLAG (1<<OPUS_ARCH_ARM_MEDIA)
#define OPUS_CPU_ARM_NEON_FLAG (1<<OPUS_ARCH_ARM_NEON)
+#define OPUS_CPU_ARM_DOTPROD_FLAG (1<<OPUS_ARCH_ARM_DOTPROD)
#if defined(_MSC_VER)
/*For GetExceptionCode() and EXCEPTION_ILLEGAL_INSTRUCTION.*/
@@ -126,6 +127,14 @@ opus_uint32 opus_cpu_capabilities(void)
p = strstr(buf, " neon");
if(p != NULL && (p[5] == ' ' || p[5] == '\n'))
flags |= OPUS_CPU_ARM_NEON_FLAG;
+ p = strstr(buf, " asimd");
+ if(p != NULL && (p[6] == ' ' || p[6] == '\n'))
+ flags |= OPUS_CPU_ARM_NEON_FLAG | OPUS_CPU_ARM_MEDIA_FLAG | OPUS_CPU_ARM_EDSP_FLAG;
+# endif
+# if defined(OPUS_ARM_MAY_HAVE_DOTPROD)
+ p = strstr(buf, " asimddp");
+ if(p != NULL && (p[8] == ' ' || p[8] == '\n'))
+ flags |= OPUS_CPU_ARM_DOTPROD_FLAG;
# endif
}
# endif
@@ -144,10 +153,44 @@ opus_uint32 opus_cpu_capabilities(void)
# endif
}
+#if defined(OPUS_ARM_PRESUME_AARCH64_NEON_INTR)
+ flags |= OPUS_CPU_ARM_EDSP_FLAG | OPUS_CPU_ARM_MEDIA_FLAG | OPUS_CPU_ARM_NEON_FLAG;
+# if defined(OPUS_ARM_PRESUME_DOTPROD)
+ flags |= OPUS_CPU_ARM_DOTPROD_FLAG;
+# endif
+#endif
+
fclose(cpuinfo);
}
return flags;
}
+
+#elif defined(__APPLE__)
+#include <sys/types.h>
+#include <sys/sysctl.h>
+
+opus_uint32 opus_cpu_capabilities(void)
+{
+ opus_uint32 flags = 0;
+
+#if defined(OPUS_ARM_MAY_HAVE_DOTPROD)
+ size_t size = sizeof(uint32_t);
+ uint32_t value = 0;
+ if (!sysctlbyname("hw.optional.arm.FEAT_DotProd", &value, &size, NULL, 0) && value)
+ {
+ flags |= OPUS_CPU_ARM_DOTPROD_FLAG;
+ }
+#endif
+
+#if defined(OPUS_ARM_PRESUME_AARCH64_NEON_INTR)
+ flags |= OPUS_CPU_ARM_EDSP_FLAG | OPUS_CPU_ARM_MEDIA_FLAG | OPUS_CPU_ARM_NEON_FLAG;
+# if defined(OPUS_ARM_PRESUME_DOTPROD)
+ flags |= OPUS_CPU_ARM_DOTPROD_FLAG;
+# endif
+#endif
+ return flags;
+}
+
#else
/* The feature registers which can tell us what the processor supports are
* accessible in priveleged modes only, so we can't have a general user-space
@@ -180,7 +223,13 @@ static int opus_select_arch_impl(void)
}
arch++;
- celt_assert(arch == OPUS_ARCH_ARM_NEON);
+ if(!(flags & OPUS_CPU_ARM_DOTPROD_FLAG)) {
+ celt_assert(arch == OPUS_ARCH_ARM_NEON);
+ return arch;
+ }
+ arch++;
+
+ celt_assert(arch == OPUS_ARCH_ARM_DOTPROD);
return arch;
}
diff --git a/media/libopus/celt/arm/armcpu.h b/media/libopus/celt/arm/armcpu.h
index 820262ff5f..6d5803d81a 100644
--- a/media/libopus/celt/arm/armcpu.h
+++ b/media/libopus/celt/arm/armcpu.h
@@ -46,6 +46,12 @@
# define MAY_HAVE_NEON(name) MAY_HAVE_MEDIA(name)
# endif
+# if defined(OPUS_ARM_MAY_HAVE_DOTPROD)
+# define MAY_HAVE_DOTPROD(name) name ## _dotprod
+# else
+# define MAY_HAVE_DOTPROD(name) MAY_HAVE_NEON(name)
+# endif
+
# if defined(OPUS_ARM_PRESUME_EDSP)
# define PRESUME_EDSP(name) name ## _edsp
# else
@@ -64,6 +70,12 @@
# define PRESUME_NEON(name) PRESUME_MEDIA(name)
# endif
+# if defined(OPUS_ARM_PRESUME_DOTPROD)
+# define PRESUME_DOTPROD(name) name ## _dotprod
+# else
+# define PRESUME_DOTPROD(name) PRESUME_NEON(name)
+# endif
+
# if defined(OPUS_HAVE_RTCD)
int opus_select_arch(void);
@@ -71,6 +83,7 @@ int opus_select_arch(void);
#define OPUS_ARCH_ARM_EDSP (1)
#define OPUS_ARCH_ARM_MEDIA (2)
#define OPUS_ARCH_ARM_NEON (3)
+#define OPUS_ARCH_ARM_DOTPROD (4)
# endif
diff --git a/media/libopus/celt/arm/celt_neon_intr.c b/media/libopus/celt/arm/celt_neon_intr.c
index effda769d0..250f836218 100644
--- a/media/libopus/celt/arm/celt_neon_intr.c
+++ b/media/libopus/celt/arm/celt_neon_intr.c
@@ -38,6 +38,8 @@
#include "../pitch.h"
#if defined(FIXED_POINT)
+#include <string.h>
+
void xcorr_kernel_neon_fixed(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[4], int len)
{
int j;
@@ -47,7 +49,10 @@ void xcorr_kernel_neon_fixed(const opus_val16 * x, const opus_val16 * y, opus_va
int16x4_t y0 = vld1_s16(y);
y += 4;
- for (j = 0; j + 8 <= len; j += 8)
+ /* This loop loads one y value more than we actually need.
+ Therefore we have to stop as soon as there are 8 or fewer samples left
+ (instead of 7), to avoid reading past the end of the array. */
+ for (j = 0; j + 8 < len; j += 8)
{
/* Load x[0...7] */
int16x8_t xx = vld1q_s16(x);
@@ -80,23 +85,79 @@ void xcorr_kernel_neon_fixed(const opus_val16 * x, const opus_val16 * y, opus_va
x += 8;
y += 8;
}
-
- for (; j < len; j++)
- {
- int16x4_t x0 = vld1_dup_s16(x); /* load next x */
+ if (j + 4 < len) {
+ /* Load x[0...3] */
+ int16x4_t x0 = vld1_s16(x);
+ /* Load y[4...7] */
+ int16x4_t y4 = vld1_s16(y);
+ int32x4_t a0 = vmlal_lane_s16(a, y0, x0, 0);
+ int16x4_t y1 = vext_s16(y0, y4, 1);
+ int32x4_t a1 = vmlal_lane_s16(a0, y1, x0, 1);
+ int16x4_t y2 = vext_s16(y0, y4, 2);
+ int32x4_t a2 = vmlal_lane_s16(a1, y2, x0, 2);
+ int16x4_t y3 = vext_s16(y0, y4, 3);
+ int32x4_t a3 = vmlal_lane_s16(a2, y3, x0, 3);
+ y0 = y4;
+ a = a3;
+ x += 4;
+ y += 4;
+ j += 4;
+ }
+ if (j + 2 < len) {
+ /* Load x[0...1] */
+ int16x4x2_t xx = vld2_dup_s16(x);
+ int16x4_t x0 = xx.val[0];
+ int16x4_t x1 = xx.val[1];
+ /* Load y[4...5].
+ We would like to use vld1_dup_s32(), but casting the pointer would
+ break strict aliasing rules and potentially have alignment issues.
+ Fortunately the compiler seems capable of translating this memcpy()
+ and vdup_n_s32() into the equivalent vld1_dup_s32().*/
+ int32_t yy;
+ memcpy(&yy, y, sizeof(yy));
+ int16x4_t y4 = vreinterpret_s16_s32(vdup_n_s32(yy));
int32x4_t a0 = vmlal_s16(a, y0, x0);
-
- int16x4_t y4 = vld1_dup_s16(y); /* load next y */
- y0 = vext_s16(y0, y4, 1);
+ int16x4_t y1 = vext_s16(y0, y4, 1);
+ /* Replace bottom copy of {y[5], y[4]} in y4 with {y[3], y[2]} from y0,
+ using VSRI instead of VEXT, since it's a data-processing
+ instruction. */
+ y0 = vreinterpret_s16_s64(vsri_n_s64(vreinterpret_s64_s16(y4),
+ vreinterpret_s64_s16(y0), 32));
+ int32x4_t a1 = vmlal_s16(a0, y1, x1);
+ a = a1;
+ x += 2;
+ y += 2;
+ j += 2;
+ }
+ if (j + 1 < len) {
+ /* Load next x. */
+ int16x4_t x0 = vld1_dup_s16(x);
+ int32x4_t a0 = vmlal_s16(a, y0, x0);
+ /* Load last y. */
+ int16x4_t y4 = vld1_dup_s16(y);
+ y0 = vreinterpret_s16_s64(vsri_n_s64(vreinterpret_s64_s16(y4),
+ vreinterpret_s64_s16(y0), 16));
a = a0;
x++;
- y++;
}
-
- vst1q_s32(sum, a);
+ /* Load last x. */
+ int16x4_t x0 = vld1_dup_s16(x);
+ int32x4_t a0 = vmlal_s16(a, y0, x0);
+ vst1q_s32(sum, a0);
}
#else
+
+#if defined(__ARM_FEATURE_FMA) && defined(__ARM_ARCH_ISA_A64)
+/* If we can, force the compiler to use an FMA instruction rather than break
+ * vmlaq_f32() into fmul/fadd. */
+#ifdef vmlaq_lane_f32
+#undef vmlaq_lane_f32
+#endif
+#define vmlaq_lane_f32(a,b,c,lane) vfmaq_lane_f32(a,b,c,lane)
+#endif
+
+
/*
* Function: xcorr_kernel_neon_float
* ---------------------------------
diff --git a/media/libopus/celt/arm/pitch_neon_intr.c b/media/libopus/celt/arm/pitch_neon_intr.c
index 35cc46e2c2..43885f528c 100644
--- a/media/libopus/celt/arm/pitch_neon_intr.c
+++ b/media/libopus/celt/arm/pitch_neon_intr.c
@@ -130,6 +130,13 @@ void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus
/* ========================================================================== */
+#ifdef __ARM_FEATURE_FMA
+/* If we can, force the compiler to use an FMA instruction rather than break
+ vmlaq_f32() into fmul/fadd. */
+#define vmlaq_f32(a,b,c) vfmaq_f32(a,b,c)
+#endif
+
+
#ifdef OPUS_CHECK_ASM
/* This part of code simulates floating-point NEON operations. */