summaryrefslogtreecommitdiffstats
path: root/media/libopus/celt
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-19 01:13:27 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-19 01:13:27 +0000
commit40a355a42d4a9444dc753c04c6608dade2f06a23 (patch)
tree871fc667d2de662f171103ce5ec067014ef85e61 /media/libopus/celt
parentAdding upstream version 124.0.1. (diff)
downloadfirefox-upstream/125.0.1.tar.xz
firefox-upstream/125.0.1.zip
Adding upstream version 125.0.1.upstream/125.0.1
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'media/libopus/celt')
-rw-r--r--media/libopus/celt/arm/arm_celt_map.c31
-rw-r--r--media/libopus/celt/arm/armcpu.c51
-rw-r--r--media/libopus/celt/arm/armcpu.h13
-rw-r--r--media/libopus/celt/arm/celt_neon_intr.c83
-rw-r--r--media/libopus/celt/arm/pitch_neon_intr.c7
-rw-r--r--media/libopus/celt/celt.h25
-rw-r--r--media/libopus/celt/celt_decoder.c360
-rw-r--r--media/libopus/celt/celt_encoder.c68
-rw-r--r--media/libopus/celt/celt_lpc.c27
-rw-r--r--media/libopus/celt/celt_lpc.h2
-rw-r--r--media/libopus/celt/cpu_support.h7
-rw-r--r--media/libopus/celt/entdec.c21
-rw-r--r--media/libopus/celt/entdec.h10
-rw-r--r--media/libopus/celt/entenc.c11
-rw-r--r--media/libopus/celt/entenc.h9
-rw-r--r--media/libopus/celt/laplace.c101
-rw-r--r--media/libopus/celt/laplace.h9
-rw-r--r--media/libopus/celt/mathops.h6
-rw-r--r--media/libopus/celt/mips/celt_mipsr1.h6
-rw-r--r--media/libopus/celt/mips/mdct_mipsr1.h6
-rw-r--r--media/libopus/celt/mips/vq_mipsr1.h6
-rw-r--r--media/libopus/celt/os_support.h14
-rw-r--r--media/libopus/celt/pitch.c11
-rw-r--r--media/libopus/celt/pitch.h11
-rw-r--r--media/libopus/celt/stack_alloc.h2
-rw-r--r--media/libopus/celt/x86/celt_lpc_sse4_1.c13
-rw-r--r--media/libopus/celt/x86/pitch_avx.c101
-rw-r--r--media/libopus/celt/x86/pitch_sse.h40
-rw-r--r--media/libopus/celt/x86/vq_sse.h6
-rw-r--r--media/libopus/celt/x86/vq_sse2.c8
-rw-r--r--media/libopus/celt/x86/x86_arch_macros.h47
-rw-r--r--media/libopus/celt/x86/x86_celt_map.c20
-rw-r--r--media/libopus/celt/x86/x86cpu.c16
-rw-r--r--media/libopus/celt/x86/x86cpu.h49
34 files changed, 1014 insertions, 183 deletions
diff --git a/media/libopus/celt/arm/arm_celt_map.c b/media/libopus/celt/arm/arm_celt_map.c
index ca988b66f5..cbaea49579 100644
--- a/media/libopus/celt/arm/arm_celt_map.c
+++ b/media/libopus/celt/arm/arm_celt_map.c
@@ -40,7 +40,8 @@ opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, c
celt_inner_prod_c, /* ARMv4 */
celt_inner_prod_c, /* EDSP */
celt_inner_prod_c, /* Media */
- celt_inner_prod_neon /* NEON */
+ celt_inner_prod_neon,/* NEON */
+ celt_inner_prod_neon /* DOTPROD */
};
void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
@@ -48,7 +49,8 @@ void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const o
dual_inner_prod_c, /* ARMv4 */
dual_inner_prod_c, /* EDSP */
dual_inner_prod_c, /* Media */
- dual_inner_prod_neon /* NEON */
+ dual_inner_prod_neon,/* NEON */
+ dual_inner_prod_neon /* DOTPROD */
};
# endif
@@ -61,7 +63,8 @@ opus_val32 (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
celt_pitch_xcorr_c, /* ARMv4 */
MAY_HAVE_EDSP(celt_pitch_xcorr), /* EDSP */
MAY_HAVE_MEDIA(celt_pitch_xcorr), /* Media */
- MAY_HAVE_NEON(celt_pitch_xcorr) /* NEON */
+ MAY_HAVE_NEON(celt_pitch_xcorr), /* NEON */
+ MAY_HAVE_NEON(celt_pitch_xcorr) /* DOTPROD */
};
# endif
@@ -72,7 +75,8 @@ void (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
celt_pitch_xcorr_c, /* ARMv4 */
celt_pitch_xcorr_c, /* EDSP */
celt_pitch_xcorr_c, /* Media */
- celt_pitch_xcorr_float_neon /* Neon */
+ celt_pitch_xcorr_float_neon, /* Neon */
+ celt_pitch_xcorr_float_neon /* DOTPROD */
};
# endif
# endif /* FIXED_POINT */
@@ -90,6 +94,7 @@ void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
xcorr_kernel_c, /* EDSP */
xcorr_kernel_c, /* Media */
xcorr_kernel_neon_fixed, /* Neon */
+ xcorr_kernel_neon_fixed /* DOTPROD */
};
#endif
@@ -101,14 +106,16 @@ int (*const OPUS_FFT_ALLOC_ARCH_IMPL[OPUS_ARCHMASK+1])(kiss_fft_state *st) = {
opus_fft_alloc_arch_c, /* ARMv4 */
opus_fft_alloc_arch_c, /* EDSP */
opus_fft_alloc_arch_c, /* Media */
- opus_fft_alloc_arm_neon /* Neon with NE10 library support */
+ opus_fft_alloc_arm_neon, /* Neon with NE10 library support */
+ opus_fft_alloc_arm_neon /* DOTPROD with NE10 library support */
};
void (*const OPUS_FFT_FREE_ARCH_IMPL[OPUS_ARCHMASK+1])(kiss_fft_state *st) = {
opus_fft_free_arch_c, /* ARMv4 */
opus_fft_free_arch_c, /* EDSP */
opus_fft_free_arch_c, /* Media */
- opus_fft_free_arm_neon /* Neon with NE10 */
+ opus_fft_free_arm_neon, /* Neon with NE10 */
+ opus_fft_free_arm_neon /* DOTPROD with NE10 */
};
# endif /* CUSTOM_MODES */
@@ -118,7 +125,8 @@ void (*const OPUS_FFT[OPUS_ARCHMASK+1])(const kiss_fft_state *cfg,
opus_fft_c, /* ARMv4 */
opus_fft_c, /* EDSP */
opus_fft_c, /* Media */
- opus_fft_neon /* Neon with NE10 */
+ opus_fft_neon, /* Neon with NE10 */
+ opus_fft_neon /* DOTPROD with NE10 */
};
void (*const OPUS_IFFT[OPUS_ARCHMASK+1])(const kiss_fft_state *cfg,
@@ -127,7 +135,8 @@ void (*const OPUS_IFFT[OPUS_ARCHMASK+1])(const kiss_fft_state *cfg,
opus_ifft_c, /* ARMv4 */
opus_ifft_c, /* EDSP */
opus_ifft_c, /* Media */
- opus_ifft_neon /* Neon with NE10 */
+ opus_ifft_neon, /* Neon with NE10 */
+ opus_ifft_neon /* DOTPROD with NE10 */
};
void (*const CLT_MDCT_FORWARD_IMPL[OPUS_ARCHMASK+1])(const mdct_lookup *l,
@@ -139,7 +148,8 @@ void (*const CLT_MDCT_FORWARD_IMPL[OPUS_ARCHMASK+1])(const mdct_lookup *l,
clt_mdct_forward_c, /* ARMv4 */
clt_mdct_forward_c, /* EDSP */
clt_mdct_forward_c, /* Media */
- clt_mdct_forward_neon /* Neon with NE10 */
+ clt_mdct_forward_neon, /* Neon with NE10 */
+ clt_mdct_forward_neon /* DOTPROD with NE10 */
};
void (*const CLT_MDCT_BACKWARD_IMPL[OPUS_ARCHMASK+1])(const mdct_lookup *l,
@@ -151,7 +161,8 @@ void (*const CLT_MDCT_BACKWARD_IMPL[OPUS_ARCHMASK+1])(const mdct_lookup *l,
clt_mdct_backward_c, /* ARMv4 */
clt_mdct_backward_c, /* EDSP */
clt_mdct_backward_c, /* Media */
- clt_mdct_backward_neon /* Neon with NE10 */
+ clt_mdct_backward_neon, /* Neon with NE10 */
+ clt_mdct_backward_neon /* DOTPROD with NE10 */
};
# endif /* HAVE_ARM_NE10 */
diff --git a/media/libopus/celt/arm/armcpu.c b/media/libopus/celt/arm/armcpu.c
index c7d16e6d61..06a53435b8 100644
--- a/media/libopus/celt/arm/armcpu.c
+++ b/media/libopus/celt/arm/armcpu.c
@@ -43,6 +43,7 @@
#define OPUS_CPU_ARM_EDSP_FLAG (1<<OPUS_ARCH_ARM_EDSP)
#define OPUS_CPU_ARM_MEDIA_FLAG (1<<OPUS_ARCH_ARM_MEDIA)
#define OPUS_CPU_ARM_NEON_FLAG (1<<OPUS_ARCH_ARM_NEON)
+#define OPUS_CPU_ARM_DOTPROD_FLAG (1<<OPUS_ARCH_ARM_DOTPROD)
#if defined(_MSC_VER)
/*For GetExceptionCode() and EXCEPTION_ILLEGAL_INSTRUCTION.*/
@@ -126,6 +127,14 @@ opus_uint32 opus_cpu_capabilities(void)
p = strstr(buf, " neon");
if(p != NULL && (p[5] == ' ' || p[5] == '\n'))
flags |= OPUS_CPU_ARM_NEON_FLAG;
+ p = strstr(buf, " asimd");
+ if(p != NULL && (p[6] == ' ' || p[6] == '\n'))
+ flags |= OPUS_CPU_ARM_NEON_FLAG | OPUS_CPU_ARM_MEDIA_FLAG | OPUS_CPU_ARM_EDSP_FLAG;
+# endif
+# if defined(OPUS_ARM_MAY_HAVE_DOTPROD)
+ p = strstr(buf, " asimddp");
+ if(p != NULL && (p[8] == ' ' || p[8] == '\n'))
+ flags |= OPUS_CPU_ARM_DOTPROD_FLAG;
# endif
}
# endif
@@ -144,10 +153,44 @@ opus_uint32 opus_cpu_capabilities(void)
# endif
}
+#if defined(OPUS_ARM_PRESUME_AARCH64_NEON_INTR)
+ flags |= OPUS_CPU_ARM_EDSP_FLAG | OPUS_CPU_ARM_MEDIA_FLAG | OPUS_CPU_ARM_NEON_FLAG;
+# if defined(OPUS_ARM_PRESUME_DOTPROD)
+ flags |= OPUS_CPU_ARM_DOTPROD_FLAG;
+# endif
+#endif
+
fclose(cpuinfo);
}
return flags;
}
+
+#elif defined(__APPLE__)
+#include <sys/types.h>
+#include <sys/sysctl.h>
+
+opus_uint32 opus_cpu_capabilities(void) /* Variant compiled under __APPLE__: returns a bitmask of OPUS_CPU_ARM_*_FLAG bits. */
+{
+ opus_uint32 flags = 0; /* Starts empty; bits are only ever added below. */
+
+#if defined(OPUS_ARM_MAY_HAVE_DOTPROD) /* Run-time probe, compiled in only when the build may use DOTPROD. */
+ size_t size = sizeof(uint32_t); /* Byte size of value, passed by address to sysctlbyname(). */
+ uint32_t value = 0; /* Stays 0 unless the query below overwrites it. */
+ if (!sysctlbyname("hw.optional.arm.FEAT_DotProd", &value, &size, NULL, 0) && value) /* Call returned 0 and the value read is non-zero. */
+ {
+ flags |= OPUS_CPU_ARM_DOTPROD_FLAG;
+ }
+#endif
+
+#if defined(OPUS_ARM_PRESUME_AARCH64_NEON_INTR) /* Build-time presumption: set EDSP, MEDIA and NEON without probing. */
+ flags |= OPUS_CPU_ARM_EDSP_FLAG | OPUS_CPU_ARM_MEDIA_FLAG | OPUS_CPU_ARM_NEON_FLAG;
+# if defined(OPUS_ARM_PRESUME_DOTPROD) /* Build-time presumption: set DOTPROD regardless of the probe result above. */
+ flags |= OPUS_CPU_ARM_DOTPROD_FLAG;
+# endif
+#endif
+ return flags;
+}
+
#else
/* The feature registers which can tell us what the processor supports are
* accessible in privileged modes only, so we can't have a general user-space
@@ -180,7 +223,13 @@ static int opus_select_arch_impl(void)
}
arch++;
- celt_assert(arch == OPUS_ARCH_ARM_NEON);
+ if(!(flags & OPUS_CPU_ARM_DOTPROD_FLAG)) {
+ celt_assert(arch == OPUS_ARCH_ARM_NEON);
+ return arch;
+ }
+ arch++;
+
+ celt_assert(arch == OPUS_ARCH_ARM_DOTPROD);
return arch;
}
diff --git a/media/libopus/celt/arm/armcpu.h b/media/libopus/celt/arm/armcpu.h
index 820262ff5f..6d5803d81a 100644
--- a/media/libopus/celt/arm/armcpu.h
+++ b/media/libopus/celt/arm/armcpu.h
@@ -46,6 +46,12 @@
# define MAY_HAVE_NEON(name) MAY_HAVE_MEDIA(name)
# endif
+# if defined(OPUS_ARM_MAY_HAVE_DOTPROD)
+# define MAY_HAVE_DOTPROD(name) name ## _dotprod
+# else
+# define MAY_HAVE_DOTPROD(name) MAY_HAVE_NEON(name)
+# endif
+
# if defined(OPUS_ARM_PRESUME_EDSP)
# define PRESUME_EDSP(name) name ## _edsp
# else
@@ -64,6 +70,12 @@
# define PRESUME_NEON(name) PRESUME_MEDIA(name)
# endif
+# if defined(OPUS_ARM_PRESUME_DOTPROD)
+# define PRESUME_DOTPROD(name) name ## _dotprod
+# else
+# define PRESUME_DOTPROD(name) PRESUME_NEON(name)
+# endif
+
# if defined(OPUS_HAVE_RTCD)
int opus_select_arch(void);
@@ -71,6 +83,7 @@ int opus_select_arch(void);
#define OPUS_ARCH_ARM_EDSP (1)
#define OPUS_ARCH_ARM_MEDIA (2)
#define OPUS_ARCH_ARM_NEON (3)
+#define OPUS_ARCH_ARM_DOTPROD (4)
# endif
diff --git a/media/libopus/celt/arm/celt_neon_intr.c b/media/libopus/celt/arm/celt_neon_intr.c
index effda769d0..250f836218 100644
--- a/media/libopus/celt/arm/celt_neon_intr.c
+++ b/media/libopus/celt/arm/celt_neon_intr.c
@@ -38,6 +38,8 @@
#include "../pitch.h"
#if defined(FIXED_POINT)
+#include <string.h>
+
void xcorr_kernel_neon_fixed(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[4], int len)
{
int j;
@@ -47,7 +49,10 @@ void xcorr_kernel_neon_fixed(const opus_val16 * x, const opus_val16 * y, opus_va
int16x4_t y0 = vld1_s16(y);
y += 4;
- for (j = 0; j + 8 <= len; j += 8)
+ /* This loop loads one y value more than we actually need.
+ Therefore we have to stop as soon as there are 8 or fewer samples left
+ (instead of 7), to avoid reading past the end of the array. */
+ for (j = 0; j + 8 < len; j += 8)
{
/* Load x[0...7] */
int16x8_t xx = vld1q_s16(x);
@@ -80,23 +85,79 @@ void xcorr_kernel_neon_fixed(const opus_val16 * x, const opus_val16 * y, opus_va
x += 8;
y += 8;
}
-
- for (; j < len; j++)
- {
- int16x4_t x0 = vld1_dup_s16(x); /* load next x */
+ if (j + 4 < len) {
+ /* Load x[0...3] */
+ int16x4_t x0 = vld1_s16(x);
+ /* Load y[4...7] */
+ int16x4_t y4 = vld1_s16(y);
+ int32x4_t a0 = vmlal_lane_s16(a, y0, x0, 0);
+ int16x4_t y1 = vext_s16(y0, y4, 1);
+ int32x4_t a1 = vmlal_lane_s16(a0, y1, x0, 1);
+ int16x4_t y2 = vext_s16(y0, y4, 2);
+ int32x4_t a2 = vmlal_lane_s16(a1, y2, x0, 2);
+ int16x4_t y3 = vext_s16(y0, y4, 3);
+ int32x4_t a3 = vmlal_lane_s16(a2, y3, x0, 3);
+ y0 = y4;
+ a = a3;
+ x += 4;
+ y += 4;
+ j += 4;
+ }
+ if (j + 2 < len) {
+ /* Load x[0...1] */
+ int16x4x2_t xx = vld2_dup_s16(x);
+ int16x4_t x0 = xx.val[0];
+ int16x4_t x1 = xx.val[1];
+ /* Load y[4...5].
+ We would like to use vld1_dup_s32(), but casting the pointer would
+ break strict aliasing rules and potentially have alignment issues.
+ Fortunately the compiler seems capable of translating this memcpy()
+ and vdup_n_s32() into the equivalent vld1_dup_s32().*/
+ int32_t yy;
+ memcpy(&yy, y, sizeof(yy));
+ int16x4_t y4 = vreinterpret_s16_s32(vdup_n_s32(yy));
int32x4_t a0 = vmlal_s16(a, y0, x0);
-
- int16x4_t y4 = vld1_dup_s16(y); /* load next y */
- y0 = vext_s16(y0, y4, 1);
+ int16x4_t y1 = vext_s16(y0, y4, 1);
+ /* Replace bottom copy of {y[5], y[4]} in y4 with {y[3], y[2]} from y0,
+ using VSRI instead of VEXT, since it's a data-processing
+ instruction. */
+ y0 = vreinterpret_s16_s64(vsri_n_s64(vreinterpret_s64_s16(y4),
+ vreinterpret_s64_s16(y0), 32));
+ int32x4_t a1 = vmlal_s16(a0, y1, x1);
+ a = a1;
+ x += 2;
+ y += 2;
+ j += 2;
+ }
+ if (j + 1 < len) {
+ /* Load next x. */
+ int16x4_t x0 = vld1_dup_s16(x);
+ int32x4_t a0 = vmlal_s16(a, y0, x0);
+ /* Load last y. */
+ int16x4_t y4 = vld1_dup_s16(y);
+ y0 = vreinterpret_s16_s64(vsri_n_s64(vreinterpret_s64_s16(y4),
+ vreinterpret_s64_s16(y0), 16));
a = a0;
x++;
- y++;
}
-
- vst1q_s32(sum, a);
+ /* Load last x. */
+ int16x4_t x0 = vld1_dup_s16(x);
+ int32x4_t a0 = vmlal_s16(a, y0, x0);
+ vst1q_s32(sum, a0);
}
#else
+
+#if defined(__ARM_FEATURE_FMA) && defined(__ARM_ARCH_ISA_A64)
+/* If we can, force the compiler to use an FMA instruction rather than break
+ * vmlaq_lane_f32() into fmul/fadd. */
+#ifdef vmlaq_lane_f32
+#undef vmlaq_lane_f32
+#endif
+#define vmlaq_lane_f32(a,b,c,lane) vfmaq_lane_f32(a,b,c,lane)
+#endif
+
+
/*
* Function: xcorr_kernel_neon_float
* ---------------------------------
diff --git a/media/libopus/celt/arm/pitch_neon_intr.c b/media/libopus/celt/arm/pitch_neon_intr.c
index 35cc46e2c2..43885f528c 100644
--- a/media/libopus/celt/arm/pitch_neon_intr.c
+++ b/media/libopus/celt/arm/pitch_neon_intr.c
@@ -130,6 +130,13 @@ void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus
/* ========================================================================== */
+#ifdef __ARM_FEATURE_FMA
+/* If we can, force the compiler to use an FMA instruction rather than break
+ vmlaq_f32() into fmul/fadd. */
+#define vmlaq_f32(a,b,c) vfmaq_f32(a,b,c)
+#endif
+
+
#ifdef OPUS_CHECK_ASM
/* This part of code simulates floating-point NEON operations. */
diff --git a/media/libopus/celt/celt.h b/media/libopus/celt/celt.h
index 24b6b2b520..2f501951d5 100644
--- a/media/libopus/celt/celt.h
+++ b/media/libopus/celt/celt.h
@@ -42,6 +42,10 @@
#include "entdec.h"
#include "arch.h"
+#ifdef ENABLE_DEEP_PLC
+#include "lpcnet.h"
+#endif
+
#ifdef __cplusplus
extern "C" {
#endif
@@ -149,6 +153,13 @@ int celt_decoder_get_size(int channels);
int celt_decoder_init(CELTDecoder *st, opus_int32 sampling_rate, int channels);
+int celt_decode_with_ec_dred(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data,
+ int len, opus_val16 * OPUS_RESTRICT pcm, int frame_size, ec_dec *dec, int accum
+#ifdef ENABLE_DEEP_PLC
+ ,LPCNetPLCState *lpcnet
+#endif
+ );
+
int celt_decode_with_ec(OpusCustomDecoder * OPUS_RESTRICT st, const unsigned char *data,
int len, opus_val16 * OPUS_RESTRICT pcm, int frame_size, ec_dec *dec, int accum);
@@ -225,23 +236,13 @@ void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
opus_val16 g0, opus_val16 g1, int tapset0, int tapset1,
const opus_val16 *window, int overlap, int arch);
-#ifdef NON_STATIC_COMB_FILTER_CONST_C
-void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N,
- opus_val16 g10, opus_val16 g11, opus_val16 g12);
-#endif
-
-#ifndef OVERRIDE_COMB_FILTER_CONST
-# define comb_filter_const(y, x, T, N, g10, g11, g12, arch) \
- ((void)(arch),comb_filter_const_c(y, x, T, N, g10, g11, g12))
-#endif
-
void init_caps(const CELTMode *m,int *cap,int LM,int C);
#ifdef RESYNTH
-void deemphasis(celt_sig *in[], opus_val16 *pcm, int N, int C, int downsample, const opus_val16 *coef, celt_sig *mem);
+void deemphasis(celt_sig *in[], opus_val16 *pcm, int N, int C, int downsample, const opus_val16 *coef, celt_sig *mem, int accum);
void celt_synthesis(const CELTMode *mode, celt_norm *X, celt_sig * out_syn[],
opus_val16 *oldBandE, int start, int effEnd, int C, int CC, int isTransient,
- int LM, int downsample, int silence);
+ int LM, int downsample, int silence, int arch);
#endif
#ifdef __cplusplus
diff --git a/media/libopus/celt/celt_decoder.c b/media/libopus/celt/celt_decoder.c
index 883dae15d2..743c2031bc 100644
--- a/media/libopus/celt/celt_decoder.c
+++ b/media/libopus/celt/celt_decoder.c
@@ -51,6 +51,11 @@
#include "celt_lpc.h"
#include "vq.h"
+#ifdef ENABLE_DEEP_PLC
+#include "lpcnet.h"
+#include "lpcnet_private.h"
+#endif
+
/* The maximum pitch lag to allow in the pitch-based PLC. It's possible to save
CPU time in the PLC pitch search by making this smaller than MAX_PERIOD. The
current value corresponds to a pitch of 66.67 Hz. */
@@ -59,9 +64,6 @@
pitch of 480 Hz. */
#define PLC_PITCH_LAG_MIN (100)
-#if defined(SMALL_FOOTPRINT) && defined(FIXED_POINT)
-#define NORM_ALIASING_HACK
-#endif
/**********************************************************************/
/* */
/* DECODER */
@@ -69,6 +71,9 @@
/**********************************************************************/
#define DECODE_BUFFER_SIZE 2048
+#define PLC_UPDATE_FRAMES 4
+#define PLC_UPDATE_SAMPLES (PLC_UPDATE_FRAMES*FRAME_SIZE)
+
/** Decoder state
@brief Decoder state
*/
@@ -82,6 +87,7 @@ struct OpusCustomDecoder {
int start, end;
int signalling;
int disable_inv;
+ int complexity;
int arch;
/* Everything beyond this point gets cleared on a reset */
@@ -98,11 +104,18 @@ struct OpusCustomDecoder {
opus_val16 postfilter_gain_old;
int postfilter_tapset;
int postfilter_tapset_old;
+ int prefilter_and_fold;
celt_sig preemph_memD[2];
+#ifdef ENABLE_DEEP_PLC
+ opus_int16 plc_pcm[PLC_UPDATE_SAMPLES];
+ int plc_fill;
+ float plc_preemphasis_mem;
+#endif
+
celt_sig _decode_mem[1]; /* Size = channels*(DECODE_BUFFER_SIZE+mode->overlap) */
- /* opus_val16 lpc[], Size = channels*LPC_ORDER */
+ /* opus_val16 lpc[], Size = channels*CELT_LPC_ORDER */
/* opus_val16 oldEBands[], Size = 2*mode->nbEBands */
/* opus_val16 oldLogE[], Size = 2*mode->nbEBands */
/* opus_val16 oldLogE2[], Size = 2*mode->nbEBands */
@@ -157,7 +170,7 @@ OPUS_CUSTOM_NOSTATIC int opus_custom_decoder_get_size(const CELTMode *mode, int
{
int size = sizeof(struct CELTDecoder)
+ (channels*(DECODE_BUFFER_SIZE+mode->overlap)-1)*sizeof(celt_sig)
- + channels*LPC_ORDER*sizeof(opus_val16)
+ + channels*CELT_LPC_ORDER*sizeof(opus_val16)
+ 4*2*mode->nbEBands*sizeof(opus_val16);
return size;
}
@@ -499,7 +512,100 @@ static int celt_plc_pitch_search(celt_sig *decode_mem[2], int C, int arch)
return pitch_index;
}
-static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
+static void prefilter_and_fold(CELTDecoder * OPUS_RESTRICT st, int N) /* Per channel: comb-filters the overlap samples starting at DECODE_BUFFER_SIZE-N with negated post-filter gains, then window-folds them in place. */
+{
+ int c;
+ int CC;
+ int i;
+ int overlap;
+ celt_sig *decode_mem[2];
+ const OpusCustomMode *mode;
+ VARDECL(opus_val32, etmp); /* Scratch buffer that receives the comb_filter() output. */
+ mode = st->mode;
+ overlap = st->overlap;
+ CC = st->channels;
+ ALLOC(etmp, overlap, opus_val32); /* One entry per overlap sample; reused for every channel. */
+ c=0; do {
+ decode_mem[c] = st->_decode_mem + c*(DECODE_BUFFER_SIZE+overlap); /* Channel c occupies a stride of DECODE_BUFFER_SIZE+overlap in _decode_mem. */
+ } while (++c<CC);
+
+ c=0; do {
+ /* Apply the pre-filter to the MDCT overlap for the next frame because
+ the post-filter will be re-applied in the decoder after the MDCT
+ overlap. */
+ comb_filter(etmp, decode_mem[c]+DECODE_BUFFER_SIZE-N,
+ st->postfilter_period_old, st->postfilter_period, overlap,
+ -st->postfilter_gain_old, -st->postfilter_gain,
+ st->postfilter_tapset_old, st->postfilter_tapset, NULL, 0, st->arch); /* Gains are negated; the trailing NULL, 0 are the window and overlap arguments of comb_filter(). */
+
+ /* Simulate TDAC on the concealed audio so that it blends with the
+ MDCT of the next frame. */
+ for (i=0;i<overlap/2;i++) /* Only the first overlap/2 samples of the region are rewritten. */
+ {
+ decode_mem[c][DECODE_BUFFER_SIZE-N+i] =
+ MULT16_32_Q15(mode->window[i], etmp[overlap-1-i])
+ + MULT16_32_Q15(mode->window[overlap-i-1], etmp[i]); /* Mirrored pairing: window[i] weights etmp[overlap-1-i], window[overlap-1-i] weights etmp[i]. */
+ }
+ } while (++c<CC);
+}
+
+#ifdef ENABLE_DEEP_PLC
+
+#define SINC_ORDER 48
+/* h=cos(pi/2*abs(sin([-24:24]/48*pi*23./24)).^2);
+ b=sinc([-24:24]/3*1.02).*h;
+ b=b/sum(b); */
+static const float sinc_filter[SINC_ORDER+1] = {
+ 4.2931e-05f, -0.000190293f, -0.000816132f, -0.000637162f, 0.00141662f, 0.00354764f, 0.00184368f, -0.00428274f,
+ -0.00856105f, -0.0034003f, 0.00930201f, 0.0159616f, 0.00489785f, -0.0169649f, -0.0259484f, -0.00596856f,
+ 0.0286551f, 0.0405872f, 0.00649994f, -0.0509284f, -0.0716655f, -0.00665212f, 0.134336f, 0.278927f,
+ 0.339995f, 0.278927f, 0.134336f, -0.00665212f, -0.0716655f, -0.0509284f, 0.00649994f, 0.0405872f,
+ 0.0286551f, -0.00596856f, -0.0259484f, -0.0169649f, 0.00489785f, 0.0159616f, 0.00930201f, -0.0034003f,
+ -0.00856105f, -0.00428274f, 0.00184368f, 0.00354764f, 0.00141662f, -0.000637162f, -0.000816132f, -0.000190293f,
+ 4.2931e-05f
+};
+
+void update_plc_state(LPCNetPLCState *lpcnet, celt_sig *decode_mem[2], float *plc_preemphasis_mem, int CC) /* Downmixes the decode history to one channel, filters and decimates it by 3, and feeds the result to lpcnet_plc_update(). */
+{
+ int i;
+ int tmp_read_post, tmp_fec_skip; /* Hold lpcnet->fec_read_pos and lpcnet->fec_skip across the update calls. */
+ int offset;
+ celt_sig buf48k[DECODE_BUFFER_SIZE]; /* Single-channel working copy; presumably 48 kHz per the name - TODO confirm. */
+ opus_int16 buf16k[PLC_UPDATE_SAMPLES]; /* Decimated output, one sample per 3 input samples. */
+ if (CC == 1) OPUS_COPY(buf48k, decode_mem[0], DECODE_BUFFER_SIZE); /* Mono: plain copy of channel 0. */
+ else {
+ for (i=0;i<DECODE_BUFFER_SIZE;i++) {
+ buf48k[i] = .5*(decode_mem[0][i] + decode_mem[1][i]); /* Otherwise: average of channels 0 and 1. */
+ }
+ }
+ /* Down-sample the last 40 ms. */
+ for (i=1;i<DECODE_BUFFER_SIZE;i++) buf48k[i] += PREEMPHASIS*buf48k[i-1]; /* In-place recursion: buf48k[i-1] has already been updated when it is read. */
+ *plc_preemphasis_mem = buf48k[DECODE_BUFFER_SIZE-1]; /* Hands the last filtered sample back to the caller. */
+ offset = DECODE_BUFFER_SIZE-SINC_ORDER-1 - 3*(PLC_UPDATE_SAMPLES-1); /* Chosen so the last tap of the last output reads index DECODE_BUFFER_SIZE-1, as asserted next. */
+ celt_assert(3*(PLC_UPDATE_SAMPLES-1) + SINC_ORDER + offset == DECODE_BUFFER_SIZE-1);
+ for (i=0;i<PLC_UPDATE_SAMPLES;i++) {
+ int j;
+ float sum = 0;
+ for (j=0;j<SINC_ORDER+1;j++) {
+ sum += buf48k[3*i + j + offset]*sinc_filter[j]; /* SINC_ORDER+1 taps, input advancing 3 samples per output sample. */
+ }
+ buf16k[i] = float2int(MIN32(32767.f, MAX32(-32767.f, sum))); /* Clamp to [-32767, 32767] before the integer conversion. */
+ }
+ tmp_read_post = lpcnet->fec_read_pos;
+ tmp_fec_skip = lpcnet->fec_skip;
+ for (i=0;i<PLC_UPDATE_FRAMES;i++) {
+ lpcnet_plc_update(lpcnet, &buf16k[FRAME_SIZE*i]); /* One call per FRAME_SIZE-sample frame, PLC_UPDATE_FRAMES calls in total. */
+ }
+ lpcnet->fec_read_pos = tmp_read_post; /* Restore both fields to the values saved before the update calls. */
+ lpcnet->fec_skip = tmp_fec_skip;
+}
+#endif
+
+static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM
+#ifdef ENABLE_DEEP_PLC
+ ,LPCNetPLCState *lpcnet
+#endif
+ )
{
int c;
int i;
@@ -527,22 +633,22 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
out_syn[c] = decode_mem[c]+DECODE_BUFFER_SIZE-N;
} while (++c<C);
lpc = (opus_val16*)(st->_decode_mem+(DECODE_BUFFER_SIZE+overlap)*C);
- oldBandE = lpc+C*LPC_ORDER;
+ oldBandE = lpc+C*CELT_LPC_ORDER;
oldLogE = oldBandE + 2*nbEBands;
oldLogE2 = oldLogE + 2*nbEBands;
backgroundLogE = oldLogE2 + 2*nbEBands;
loss_duration = st->loss_duration;
start = st->start;
+#ifdef ENABLE_DEEP_PLC
+ noise_based = start != 0 || (lpcnet->fec_fill_pos == 0 && (st->skip_plc || loss_duration >= 80));
+#else
noise_based = loss_duration >= 40 || start != 0 || st->skip_plc;
+#endif
if (noise_based)
{
/* Noise-based PLC/CNG */
-#ifdef NORM_ALIASING_HACK
- celt_norm *X;
-#else
VARDECL(celt_norm, X);
-#endif
opus_uint32 seed;
int end;
int effEnd;
@@ -550,18 +656,16 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
end = st->end;
effEnd = IMAX(start, IMIN(end, mode->effEBands));
-#ifdef NORM_ALIASING_HACK
- /* This is an ugly hack that breaks aliasing rules and would be easily broken,
- but it saves almost 4kB of stack. */
- X = (celt_norm*)(out_syn[C-1]+overlap/2);
-#else
ALLOC(X, C*N, celt_norm); /**< Interleaved normalised MDCTs */
-#endif
c=0; do {
OPUS_MOVE(decode_mem[c], decode_mem[c]+N,
- DECODE_BUFFER_SIZE-N+(overlap>>1));
+ DECODE_BUFFER_SIZE-N+overlap);
} while (++c<C);
+ if (st->prefilter_and_fold) {
+ prefilter_and_fold(st, N);
+ }
+
/* Energy decay */
decay = loss_duration==0 ? QCONST16(1.5f, DB_SHIFT) : QCONST16(.5f, DB_SHIFT);
c=0; do
@@ -590,6 +694,9 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
st->rng = seed;
celt_synthesis(mode, X, out_syn, oldBandE, start, effEnd, C, C, 0, LM, st->downsample, 0, st->arch);
+ st->prefilter_and_fold = 0;
+ /* Skip regular PLC until we get two consecutive packets. */
+ st->skip_plc = 1;
} else {
int exc_length;
/* Pitch-based PLC */
@@ -597,12 +704,14 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
opus_val16 *exc;
opus_val16 fade = Q15ONE;
int pitch_index;
- VARDECL(opus_val32, etmp);
VARDECL(opus_val16, _exc);
VARDECL(opus_val16, fir_tmp);
if (loss_duration == 0)
{
+#ifdef ENABLE_DEEP_PLC
+ if (lpcnet->loaded) update_plc_state(lpcnet, decode_mem, &st->plc_preemphasis_mem, C);
+#endif
st->last_pitch_index = pitch_index = celt_plc_pitch_search(decode_mem, C, st->arch);
} else {
pitch_index = st->last_pitch_index;
@@ -613,10 +722,9 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
decaying signal, but we can't get more than MAX_PERIOD. */
exc_length = IMIN(2*pitch_index, MAX_PERIOD);
- ALLOC(etmp, overlap, opus_val32);
- ALLOC(_exc, MAX_PERIOD+LPC_ORDER, opus_val16);
+ ALLOC(_exc, MAX_PERIOD+CELT_LPC_ORDER, opus_val16);
ALLOC(fir_tmp, exc_length, opus_val16);
- exc = _exc+LPC_ORDER;
+ exc = _exc+CELT_LPC_ORDER;
window = mode->window;
c=0; do {
opus_val16 decay;
@@ -628,16 +736,16 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
int j;
buf = decode_mem[c];
- for (i=0;i<MAX_PERIOD+LPC_ORDER;i++)
- exc[i-LPC_ORDER] = SROUND16(buf[DECODE_BUFFER_SIZE-MAX_PERIOD-LPC_ORDER+i], SIG_SHIFT);
+ for (i=0;i<MAX_PERIOD+CELT_LPC_ORDER;i++)
+ exc[i-CELT_LPC_ORDER] = SROUND16(buf[DECODE_BUFFER_SIZE-MAX_PERIOD-CELT_LPC_ORDER+i], SIG_SHIFT);
if (loss_duration == 0)
{
- opus_val32 ac[LPC_ORDER+1];
+ opus_val32 ac[CELT_LPC_ORDER+1];
/* Compute LPC coefficients for the last MAX_PERIOD samples before
the first loss so we can work in the excitation-filter domain. */
_celt_autocorr(exc, ac, window, overlap,
- LPC_ORDER, MAX_PERIOD, st->arch);
+ CELT_LPC_ORDER, MAX_PERIOD, st->arch);
/* Add a noise floor of -40 dB. */
#ifdef FIXED_POINT
ac[0] += SHR32(ac[0],13);
@@ -645,7 +753,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
ac[0] *= 1.0001f;
#endif
/* Use lag windowing to stabilize the Levinson-Durbin recursion. */
- for (i=1;i<=LPC_ORDER;i++)
+ for (i=1;i<=CELT_LPC_ORDER;i++)
{
/*ac[i] *= exp(-.5*(2*M_PI*.002*i)*(2*M_PI*.002*i));*/
#ifdef FIXED_POINT
@@ -654,7 +762,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
ac[i] -= ac[i]*(0.008f*0.008f)*i*i;
#endif
}
- _celt_lpc(lpc+c*LPC_ORDER, ac, LPC_ORDER);
+ _celt_lpc(lpc+c*CELT_LPC_ORDER, ac, CELT_LPC_ORDER);
#ifdef FIXED_POINT
/* For fixed-point, apply bandwidth expansion until we can guarantee that
no overflow can happen in the IIR filter. This means:
@@ -662,13 +770,13 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
while (1) {
opus_val16 tmp=Q15ONE;
opus_val32 sum=QCONST16(1., SIG_SHIFT);
- for (i=0;i<LPC_ORDER;i++)
- sum += ABS16(lpc[c*LPC_ORDER+i]);
+ for (i=0;i<CELT_LPC_ORDER;i++)
+ sum += ABS16(lpc[c*CELT_LPC_ORDER+i]);
if (sum < 65535) break;
- for (i=0;i<LPC_ORDER;i++)
+ for (i=0;i<CELT_LPC_ORDER;i++)
{
tmp = MULT16_16_Q15(QCONST16(.99f,15), tmp);
- lpc[c*LPC_ORDER+i] = MULT16_16_Q15(lpc[c*LPC_ORDER+i], tmp);
+ lpc[c*CELT_LPC_ORDER+i] = MULT16_16_Q15(lpc[c*CELT_LPC_ORDER+i], tmp);
}
}
#endif
@@ -678,8 +786,8 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
{
/* Compute the excitation for exc_length samples before the loss. We need the copy
because celt_fir() cannot filter in-place. */
- celt_fir(exc+MAX_PERIOD-exc_length, lpc+c*LPC_ORDER,
- fir_tmp, exc_length, LPC_ORDER, st->arch);
+ celt_fir(exc+MAX_PERIOD-exc_length, lpc+c*CELT_LPC_ORDER,
+ fir_tmp, exc_length, CELT_LPC_ORDER, st->arch);
OPUS_COPY(exc+MAX_PERIOD-exc_length, fir_tmp, exc_length);
}
@@ -737,15 +845,15 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
S1 += SHR32(MULT16_16(tmp, tmp), 10);
}
{
- opus_val16 lpc_mem[LPC_ORDER];
+ opus_val16 lpc_mem[CELT_LPC_ORDER];
/* Copy the last decoded samples (prior to the overlap region) to
synthesis filter memory so we can have a continuous signal. */
- for (i=0;i<LPC_ORDER;i++)
+ for (i=0;i<CELT_LPC_ORDER;i++)
lpc_mem[i] = SROUND16(buf[DECODE_BUFFER_SIZE-N-1-i], SIG_SHIFT);
/* Apply the synthesis filter to convert the excitation back into
the signal domain. */
- celt_iir(buf+DECODE_BUFFER_SIZE-N, lpc+c*LPC_ORDER,
- buf+DECODE_BUFFER_SIZE-N, extrapolation_len, LPC_ORDER,
+ celt_iir(buf+DECODE_BUFFER_SIZE-N, lpc+c*CELT_LPC_ORDER,
+ buf+DECODE_BUFFER_SIZE-N, extrapolation_len, CELT_LPC_ORDER,
lpc_mem, st->arch);
#ifdef FIXED_POINT
for (i=0; i < extrapolation_len; i++)
@@ -792,23 +900,65 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
}
}
- /* Apply the pre-filter to the MDCT overlap for the next frame because
- the post-filter will be re-applied in the decoder after the MDCT
- overlap. */
- comb_filter(etmp, buf+DECODE_BUFFER_SIZE,
- st->postfilter_period, st->postfilter_period, overlap,
- -st->postfilter_gain, -st->postfilter_gain,
- st->postfilter_tapset, st->postfilter_tapset, NULL, 0, st->arch);
-
- /* Simulate TDAC on the concealed audio so that it blends with the
- MDCT of the next frame. */
- for (i=0;i<overlap/2;i++)
- {
- buf[DECODE_BUFFER_SIZE+i] =
- MULT16_32_Q15(window[i], etmp[overlap-1-i])
- + MULT16_32_Q15(window[overlap-i-1], etmp[i]);
- }
} while (++c<C);
+
+#ifdef ENABLE_DEEP_PLC
+ if (lpcnet->loaded && (st->complexity >= 5 || lpcnet->fec_fill_pos > 0)) {
+ float overlap_mem;
+ int samples_needed16k;
+ celt_sig *buf;
+ VARDECL(float, buf_copy);
+ buf = decode_mem[0];
+ ALLOC(buf_copy, C*overlap, float);
+ c=0; do {
+ OPUS_COPY(buf_copy+c*overlap, &decode_mem[c][DECODE_BUFFER_SIZE-N], overlap);
+ } while (++c<C);
+
+ /* Need enough samples from the PLC to cover the frame size, resampling delay,
+ and the overlap at the end. */
+ samples_needed16k = (N+SINC_ORDER+overlap)/3;
+ if (loss_duration == 0) {
+ st->plc_fill = 0;
+ }
+ while (st->plc_fill < samples_needed16k) {
+ lpcnet_plc_conceal(lpcnet, &st->plc_pcm[st->plc_fill]);
+ st->plc_fill += FRAME_SIZE;
+ }
+ /* Resample to 48 kHz. */
+ for (i=0;i<(N+overlap)/3;i++) {
+ int j;
+ float sum;
+ for (sum=0, j=0;j<17;j++) sum += 3*st->plc_pcm[i+j]*sinc_filter[3*j];
+ buf[DECODE_BUFFER_SIZE-N+3*i] = sum;
+ for (sum=0, j=0;j<16;j++) sum += 3*st->plc_pcm[i+j+1]*sinc_filter[3*j+2];
+ buf[DECODE_BUFFER_SIZE-N+3*i+1] = sum;
+ for (sum=0, j=0;j<16;j++) sum += 3*st->plc_pcm[i+j+1]*sinc_filter[3*j+1];
+ buf[DECODE_BUFFER_SIZE-N+3*i+2] = sum;
+ }
+ OPUS_MOVE(st->plc_pcm, &st->plc_pcm[N/3], st->plc_fill-N/3);
+ st->plc_fill -= N/3;
+ for (i=0;i<N;i++) {
+ float tmp = buf[DECODE_BUFFER_SIZE-N+i];
+ buf[DECODE_BUFFER_SIZE-N+i] -= PREEMPHASIS*st->plc_preemphasis_mem;
+ st->plc_preemphasis_mem = tmp;
+ }
+ overlap_mem = st->plc_preemphasis_mem;
+ for (i=0;i<overlap;i++) {
+ float tmp = buf[DECODE_BUFFER_SIZE+i];
+ buf[DECODE_BUFFER_SIZE+i] -= PREEMPHASIS*overlap_mem;
+ overlap_mem = tmp;
+ }
+ /* For now, we just do mono PLC. */
+ if (C==2) OPUS_COPY(decode_mem[1], decode_mem[0], DECODE_BUFFER_SIZE+overlap);
+ c=0; do {
+ /* Cross-fade with 48-kHz non-neural PLC for the first 2.5 ms to avoid a discontinuity. */
+ if (loss_duration == 0) {
+ for (i=0;i<overlap;i++) decode_mem[c][DECODE_BUFFER_SIZE-N+i] = (1-window[i])*buf_copy[c*overlap+i] + (window[i])*decode_mem[c][DECODE_BUFFER_SIZE-N+i];
+ }
+ } while (++c<C);
+ }
+#endif
+ st->prefilter_and_fold = 1;
}
/* Saturate to something large to avoid wrap-around. */
@@ -817,18 +967,18 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
RESTORE_STACK;
}
-int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data,
- int len, opus_val16 * OPUS_RESTRICT pcm, int frame_size, ec_dec *dec, int accum)
+int celt_decode_with_ec_dred(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data,
+ int len, opus_val16 * OPUS_RESTRICT pcm, int frame_size, ec_dec *dec, int accum
+#ifdef ENABLE_DEEP_PLC
+ ,LPCNetPLCState *lpcnet
+#endif
+ )
{
int c, i, N;
int spread_decision;
opus_int32 bits;
ec_dec _dec;
-#ifdef NORM_ALIASING_HACK
- celt_norm *X;
-#else
VARDECL(celt_norm, X);
-#endif
VARDECL(int, fine_quant);
VARDECL(int, pulses);
VARDECL(int, cap);
@@ -881,7 +1031,7 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
frame_size *= st->downsample;
lpc = (opus_val16*)(st->_decode_mem+(DECODE_BUFFER_SIZE+overlap)*CC);
- oldBandE = lpc+CC*LPC_ORDER;
+ oldBandE = lpc+CC*CELT_LPC_ORDER;
oldLogE = oldBandE + 2*nbEBands;
oldLogE2 = oldLogE + 2*nbEBands;
backgroundLogE = oldLogE2 + 2*nbEBands;
@@ -935,15 +1085,25 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
if (data == NULL || len<=1)
{
- celt_decode_lost(st, N, LM);
+ celt_decode_lost(st, N, LM
+#ifdef ENABLE_DEEP_PLC
+ , lpcnet
+#endif
+ );
deemphasis(out_syn, pcm, N, CC, st->downsample, mode->preemph, st->preemph_memD, accum);
RESTORE_STACK;
return frame_size/st->downsample;
}
+#ifdef ENABLE_DEEP_PLC
+ else {
+ /* FIXME: This is a bit of a hack just to make sure opus_decode_native() knows we're no longer in PLC. */
+ if (lpcnet) lpcnet->blend = 0;
+ }
+#endif
/* Check if there are at least two packets received consecutively before
* turning on the pitch-based PLC */
- st->skip_plc = st->loss_duration != 0;
+ if (st->loss_duration == 0) st->skip_plc = 0;
if (dec == NULL)
{
@@ -1006,6 +1166,36 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
/* Decode the global flags (first symbols in the stream) */
intra_ener = tell+3<=total_bits ? ec_dec_bit_logp(dec, 3) : 0;
+ /* If recovering from packet loss, make the energy prediction safe to reduce the
+ risk of getting loud artifacts. */
+ if (!intra_ener && st->loss_duration != 0) {
+ c=0; do
+ {
+ opus_val16 safety = 0;
+ int missing = IMIN(10, st->loss_duration>>LM);
+ if (LM==0) safety = QCONST16(1.5f,DB_SHIFT);
+ else if (LM==1) safety = QCONST16(.5f,DB_SHIFT);
+ for (i=start;i<end;i++)
+ {
+ if (oldBandE[c*nbEBands+i] < MAX16(oldLogE[c*nbEBands+i], oldLogE2[c*nbEBands+i])) {
+ /* If energy is going down already, continue the trend. */
+ opus_val32 slope;
+ opus_val32 E0, E1, E2;
+ E0 = oldBandE[c*nbEBands+i];
+ E1 = oldLogE[c*nbEBands+i];
+ E2 = oldLogE2[c*nbEBands+i];
+ slope = MAX32(E1 - E0, HALF32(E2 - E0));
+ E0 -= MAX32(0, (1+missing)*slope);
+ oldBandE[c*nbEBands+i] = MAX32(-QCONST16(20.f,DB_SHIFT), E0);
+ } else {
+ /* Otherwise take the min of the last frames. */
+ oldBandE[c*nbEBands+i] = MIN16(MIN16(oldBandE[c*nbEBands+i], oldLogE[c*nbEBands+i]), oldLogE2[c*nbEBands+i]);
+ }
+ /* Shorter frames have more natural fluctuations -- play it safe. */
+ oldBandE[c*nbEBands+i] -= safety;
+ }
+ } while (++c<2);
+ }
/* Get band energies */
unquant_coarse_energy(mode, start, end, oldBandE,
intra_ener, dec, C, LM);
@@ -1073,19 +1263,13 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
unquant_fine_energy(mode, start, end, oldBandE, fine_quant, dec, C);
c=0; do {
- OPUS_MOVE(decode_mem[c], decode_mem[c]+N, DECODE_BUFFER_SIZE-N+overlap/2);
+ OPUS_MOVE(decode_mem[c], decode_mem[c]+N, DECODE_BUFFER_SIZE-N+overlap);
} while (++c<CC);
/* Decode fixed codebook */
ALLOC(collapse_masks, C*nbEBands, unsigned char);
-#ifdef NORM_ALIASING_HACK
- /* This is an ugly hack that breaks aliasing rules and would be easily broken,
- but it saves almost 4kB of stack. */
- X = (celt_norm*)(out_syn[CC-1]+overlap/2);
-#else
ALLOC(X, C*N, celt_norm); /**< Interleaved normalised MDCTs */
-#endif
quant_all_bands(0, mode, start, end, X, C==2 ? X+N : NULL, collapse_masks,
NULL, pulses, shortBlocks, spread_decision, dual_stereo, intensity, tf_res,
@@ -1109,7 +1293,9 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
for (i=0;i<C*nbEBands;i++)
oldBandE[i] = -QCONST16(28.f,DB_SHIFT);
}
-
+ if (st->prefilter_and_fold) {
+ prefilter_and_fold(st, N);
+ }
celt_synthesis(mode, X, out_syn, oldBandE, start, effEnd,
C, CC, isTransient, LM, st->downsample, silence, st->arch);
@@ -1173,6 +1359,7 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
deemphasis(out_syn, pcm, N, CC, st->downsample, mode->preemph, st->preemph_memD, accum);
st->loss_duration = 0;
+ st->prefilter_and_fold = 0;
RESTORE_STACK;
if (ec_tell(dec) > 8*len)
return OPUS_INTERNAL_ERROR;
@@ -1181,6 +1368,15 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
return frame_size/st->downsample;
}
+int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data,
+ int len, opus_val16 * OPUS_RESTRICT pcm, int frame_size, ec_dec *dec, int accum)
+{
+ return celt_decode_with_ec_dred(st, data, len, pcm, frame_size, dec, accum
+#ifdef ENABLE_DEEP_PLC
+ , NULL
+#endif
+ );
+}
#ifdef CUSTOM_MODES
@@ -1254,6 +1450,26 @@ int opus_custom_decoder_ctl(CELTDecoder * OPUS_RESTRICT st, int request, ...)
va_start(ap, request);
switch (request)
{
+ case OPUS_SET_COMPLEXITY_REQUEST:
+ {
+ opus_int32 value = va_arg(ap, opus_int32);
+ if(value<0 || value>10)
+ {
+ goto bad_arg;
+ }
+ st->complexity = value;
+ }
+ break;
+ case OPUS_GET_COMPLEXITY_REQUEST:
+ {
+ opus_int32 *value = va_arg(ap, opus_int32*);
+ if (!value)
+ {
+ goto bad_arg;
+ }
+ *value = st->complexity;
+ }
+ break;
case CELT_SET_START_BAND_REQUEST:
{
opus_int32 value = va_arg(ap, opus_int32);
@@ -1300,7 +1516,7 @@ int opus_custom_decoder_ctl(CELTDecoder * OPUS_RESTRICT st, int request, ...)
int i;
opus_val16 *lpc, *oldBandE, *oldLogE, *oldLogE2;
lpc = (opus_val16*)(st->_decode_mem+(DECODE_BUFFER_SIZE+st->overlap)*st->channels);
- oldBandE = lpc+st->channels*LPC_ORDER;
+ oldBandE = lpc+st->channels*CELT_LPC_ORDER;
oldLogE = oldBandE + 2*st->mode->nbEBands;
oldLogE2 = oldLogE + 2*st->mode->nbEBands;
OPUS_CLEAR((char*)&st->DECODER_RESET_START,
diff --git a/media/libopus/celt/celt_encoder.c b/media/libopus/celt/celt_encoder.c
index 637d442cf7..7f32a801c6 100644
--- a/media/libopus/celt/celt_encoder.c
+++ b/media/libopus/celt/celt_encoder.c
@@ -281,6 +281,9 @@ static int transient_analysis(const opus_val32 * OPUS_RESTRICT in, int len, int
/* High-pass filter: (1 - 2*z^-1 + z^-2) / (1 - z^-1 + .5*z^-2) */
for (i=0;i<len;i++)
{
+#ifndef FIXED_POINT
+ float mem00;
+#endif
opus_val32 x,y;
x = SHR32(in[i+c*len],SIG_SHIFT);
y = ADD32(mem0, x);
@@ -288,8 +291,13 @@ static int transient_analysis(const opus_val32 * OPUS_RESTRICT in, int len, int
mem0 = mem1 + y - SHL32(x,1);
mem1 = x - SHR32(y,1);
#else
+ /* Original code:
mem0 = mem1 + y - 2*x;
mem1 = x - .5f*y;
+ Modified code to shorten dependency chains: */
+ mem00=mem0;
+ mem0 = mem0 - x + .5f*mem1;
+ mem1 = x - mem00;
#endif
tmp[i] = SROUND16(y, 2);
/*printf("%f ", tmp[i]);*/
@@ -322,10 +330,11 @@ static int transient_analysis(const opus_val32 * OPUS_RESTRICT in, int len, int
#ifdef FIXED_POINT
/* FIXME: Use PSHR16() instead */
tmp[i] = mem0 + PSHR32(x2-mem0,forward_shift);
+ mem0 = tmp[i];
#else
- tmp[i] = mem0 + MULT16_16_P15(forward_decay,x2-mem0);
+ mem0 = x2 + (1.f-forward_decay)*mem0;
+ tmp[i] = forward_decay*mem0;
#endif
- mem0 = tmp[i];
}
mem0=0;
@@ -337,11 +346,13 @@ static int transient_analysis(const opus_val32 * OPUS_RESTRICT in, int len, int
#ifdef FIXED_POINT
/* FIXME: Use PSHR16() instead */
tmp[i] = mem0 + PSHR32(tmp[i]-mem0,3);
-#else
- tmp[i] = mem0 + MULT16_16_P15(QCONST16(0.125f,15),tmp[i]-mem0);
-#endif
mem0 = tmp[i];
maxE = MAX16(maxE, mem0);
+#else
+ mem0 = tmp[i] + 0.875f*mem0;
+ tmp[i] = 0.125f*mem0;
+ maxE = MAX16(maxE, 0.125f*mem0);
+#endif
}
/*for (i=0;i<len2;i++)printf("%f ", tmp[i]/mean);printf("\n");*/
@@ -967,7 +978,7 @@ static opus_val16 median_of_3(const opus_val16 *x)
return t0;
}
-static opus_val16 dynalloc_analysis(const opus_val16 *bandLogE, const opus_val16 *bandLogE2,
+static opus_val16 dynalloc_analysis(const opus_val16 *bandLogE, const opus_val16 *bandLogE2, const opus_val16 *oldBandE,
int nbEBands, int start, int end, int C, int *offsets, int lsb_depth, const opus_int16 *logN,
int isTransient, int vbr, int constrained_vbr, const opus_int16 *eBands, int LM,
int effectiveBytes, opus_int32 *tot_boost_, int lfe, opus_val16 *surround_dynalloc,
@@ -978,9 +989,11 @@ static opus_val16 dynalloc_analysis(const opus_val16 *bandLogE, const opus_val16
opus_val16 maxDepth;
VARDECL(opus_val16, follower);
VARDECL(opus_val16, noise_floor);
+ VARDECL(opus_val16, bandLogE3);
SAVE_STACK;
ALLOC(follower, C*nbEBands, opus_val16);
ALLOC(noise_floor, C*nbEBands, opus_val16);
+ ALLOC(bandLogE3, nbEBands, opus_val16);
OPUS_CLEAR(offsets, nbEBands);
/* Dynamic allocation code */
maxDepth=-QCONST16(31.9f, DB_SHIFT);
@@ -1033,8 +1046,10 @@ static opus_val16 dynalloc_analysis(const opus_val16 *bandLogE, const opus_val16
printf("%d ", spread_weight[i]);
printf("\n");*/
}
- /* Make sure that dynamic allocation can't make us bust the budget */
- if (effectiveBytes > 50 && LM>=1 && !lfe)
+ /* Make sure that dynamic allocation can't make us bust the budget.
+ We enable the feature starting at 24 kb/s for 20-ms frames
+ and 96 kb/s for 2.5 ms frames. */
+ if (effectiveBytes >= (30 + 5*LM) && !lfe)
{
int last=0;
c=0;do
@@ -1042,30 +1057,38 @@ static opus_val16 dynalloc_analysis(const opus_val16 *bandLogE, const opus_val16
opus_val16 offset;
opus_val16 tmp;
opus_val16 *f;
+ OPUS_COPY(bandLogE3, &bandLogE2[c*nbEBands], end);
+ if (LM==0) {
+ /* For 2.5 ms frames, the first 8 bands have just one bin, so the
+ energy is highly unreliable (high variance). For that reason,
+ we take the max with the previous energy so that at least 2 bins
+ are getting used. */
+ for (i=0;i<IMIN(8,end);i++) bandLogE3[i] = MAX16(bandLogE2[c*nbEBands+i], oldBandE[c*nbEBands+i]);
+ }
f = &follower[c*nbEBands];
- f[0] = bandLogE2[c*nbEBands];
+ f[0] = bandLogE3[0];
for (i=1;i<end;i++)
{
/* The last band to be at least 3 dB higher than the previous one
is the last we'll consider. Otherwise, we run into problems on
bandlimited signals. */
- if (bandLogE2[c*nbEBands+i] > bandLogE2[c*nbEBands+i-1]+QCONST16(.5f,DB_SHIFT))
+ if (bandLogE3[i] > bandLogE3[i-1]+QCONST16(.5f,DB_SHIFT))
last=i;
- f[i] = MIN16(f[i-1]+QCONST16(1.5f,DB_SHIFT), bandLogE2[c*nbEBands+i]);
+ f[i] = MIN16(f[i-1]+QCONST16(1.5f,DB_SHIFT), bandLogE3[i]);
}
for (i=last-1;i>=0;i--)
- f[i] = MIN16(f[i], MIN16(f[i+1]+QCONST16(2.f,DB_SHIFT), bandLogE2[c*nbEBands+i]));
+ f[i] = MIN16(f[i], MIN16(f[i+1]+QCONST16(2.f,DB_SHIFT), bandLogE3[i]));
/* Combine with a median filter to avoid dynalloc triggering unnecessarily.
The "offset" value controls how conservative we are -- a higher offset
reduces the impact of the median filter and makes dynalloc use more bits. */
offset = QCONST16(1.f, DB_SHIFT);
for (i=2;i<end-2;i++)
- f[i] = MAX16(f[i], median_of_5(&bandLogE2[c*nbEBands+i-2])-offset);
- tmp = median_of_3(&bandLogE2[c*nbEBands])-offset;
+ f[i] = MAX16(f[i], median_of_5(&bandLogE3[i-2])-offset);
+ tmp = median_of_3(&bandLogE3[0])-offset;
f[0] = MAX16(f[0], tmp);
f[1] = MAX16(f[1], tmp);
- tmp = median_of_3(&bandLogE2[c*nbEBands+end-3])-offset;
+ tmp = median_of_3(&bandLogE3[end-3])-offset;
f[end-2] = MAX16(f[end-2], tmp);
f[end-1] = MAX16(f[end-1], tmp);
@@ -1565,10 +1588,13 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm,
vbr_rate = 0;
tmp = st->bitrate*frame_size;
if (tell>1)
- tmp += tell;
+ tmp += tell*mode->Fs;
if (st->bitrate!=OPUS_BITRATE_MAX)
+ {
nbCompressedBytes = IMAX(2, IMIN(nbCompressedBytes,
(tmp+4*mode->Fs)/(8*mode->Fs)-!!st->signalling));
+ ec_enc_shrink(enc, nbCompressedBytes);
+ }
effectiveBytes = nbCompressedBytes - nbFilledBytes;
}
equiv_rate = ((opus_int32)nbCompressedBytes*8*50 << (3-LM)) - (40*C+20)*((400>>LM) - 50);
@@ -1882,7 +1908,7 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm,
ALLOC(importance, nbEBands, int);
ALLOC(spread_weight, nbEBands, int);
- maxDepth = dynalloc_analysis(bandLogE, bandLogE2, nbEBands, start, end, C, offsets,
+ maxDepth = dynalloc_analysis(bandLogE, bandLogE2, oldBandE, nbEBands, start, end, C, offsets,
st->lsb_depth, mode->logN, isTransient, st->vbr, st->constrained_vbr,
eBands, LM, effectiveBytes, &tot_boost, st->lfe, surround_dynalloc, &st->analysis, importance, spread_weight);
@@ -2246,7 +2272,7 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm,
if (anti_collapse_on)
{
anti_collapse(mode, X, collapse_masks, LM, C, N,
- start, end, oldBandE, oldLogE, oldLogE2, pulses, st->rng);
+ start, end, oldBandE, oldLogE, oldLogE2, pulses, st->rng, st->arch);
}
c=0; do {
@@ -2265,15 +2291,15 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm,
st->prefilter_period_old=IMAX(st->prefilter_period_old, COMBFILTER_MINPERIOD);
comb_filter(out_mem[c], out_mem[c], st->prefilter_period_old, st->prefilter_period, mode->shortMdctSize,
st->prefilter_gain_old, st->prefilter_gain, st->prefilter_tapset_old, st->prefilter_tapset,
- mode->window, overlap);
+ mode->window, overlap, st->arch);
if (LM!=0)
comb_filter(out_mem[c]+mode->shortMdctSize, out_mem[c]+mode->shortMdctSize, st->prefilter_period, pitch_index, N-mode->shortMdctSize,
st->prefilter_gain, gain1, st->prefilter_tapset, prefilter_tapset,
- mode->window, overlap);
+ mode->window, overlap, st->arch);
} while (++c<CC);
/* We reuse freq[] as scratch space for the de-emphasis */
- deemphasis(out_mem, (opus_val16*)pcm, N, CC, st->upsample, mode->preemph, st->preemph_memD);
+ deemphasis(out_mem, (opus_val16*)pcm, N, CC, st->upsample, mode->preemph, st->preemph_memD, 0);
st->prefilter_period_old = st->prefilter_period;
st->prefilter_gain_old = st->prefilter_gain;
st->prefilter_tapset_old = st->prefilter_tapset;
diff --git a/media/libopus/celt/celt_lpc.c b/media/libopus/celt/celt_lpc.c
index f91721bcab..fabca65cb3 100644
--- a/media/libopus/celt/celt_lpc.c
+++ b/media/libopus/celt/celt_lpc.c
@@ -44,7 +44,7 @@ int p
opus_val32 r;
opus_val32 error = ac[0];
#ifdef FIXED_POINT
- opus_val32 lpc[LPC_ORDER];
+ opus_val32 lpc[CELT_LPC_ORDER];
#else
float *lpc = _lpc;
#endif
@@ -158,7 +158,17 @@ void celt_fir_c(
sum[1] = SHL32(EXTEND32(x[i+1]), SIG_SHIFT);
sum[2] = SHL32(EXTEND32(x[i+2]), SIG_SHIFT);
sum[3] = SHL32(EXTEND32(x[i+3]), SIG_SHIFT);
- xcorr_kernel(rnum, x+i-ord, sum, ord, arch);
+#if defined(OPUS_CHECK_ASM) && defined(FIXED_POINT)
+ {
+ opus_val32 sum_c[4];
+ memcpy(sum_c, sum, sizeof(sum_c));
+ xcorr_kernel_c(rnum, x+i-ord, sum_c, ord);
+#endif
+ xcorr_kernel(rnum, x+i-ord, sum, ord, arch);
+#if defined(OPUS_CHECK_ASM) && defined(FIXED_POINT)
+ celt_assert(memcmp(sum, sum_c, sizeof(sum)) == 0);
+ }
+#endif
y[i ] = SROUND16(sum[0], SIG_SHIFT);
y[i+1] = SROUND16(sum[1], SIG_SHIFT);
y[i+2] = SROUND16(sum[2], SIG_SHIFT);
@@ -222,8 +232,17 @@ void celt_iir(const opus_val32 *_x,
sum[1]=_x[i+1];
sum[2]=_x[i+2];
sum[3]=_x[i+3];
- xcorr_kernel(rden, y+i, sum, ord, arch);
-
+#if defined(OPUS_CHECK_ASM) && defined(FIXED_POINT)
+ {
+ opus_val32 sum_c[4];
+ memcpy(sum_c, sum, sizeof(sum_c));
+ xcorr_kernel_c(rden, y+i, sum_c, ord);
+#endif
+ xcorr_kernel(rden, y+i, sum, ord, arch);
+#if defined(OPUS_CHECK_ASM) && defined(FIXED_POINT)
+ celt_assert(memcmp(sum, sum_c, sizeof(sum)) == 0);
+ }
+#endif
/* Patch up the result to compensate for the fact that this is an IIR */
y[i+ord ] = -SROUND16(sum[0],SIG_SHIFT);
_y[i ] = sum[0];
diff --git a/media/libopus/celt/celt_lpc.h b/media/libopus/celt/celt_lpc.h
index a4c5fd6ea5..97dee82f02 100644
--- a/media/libopus/celt/celt_lpc.h
+++ b/media/libopus/celt/celt_lpc.h
@@ -35,7 +35,7 @@
#include "x86/celt_lpc_sse.h"
#endif
-#define LPC_ORDER 24
+#define CELT_LPC_ORDER 24
void _celt_lpc(opus_val16 *_lpc, const opus_val32 *ac, int p);
diff --git a/media/libopus/celt/cpu_support.h b/media/libopus/celt/cpu_support.h
index 7b5c56ca90..9f13d8aecf 100644
--- a/media/libopus/celt/cpu_support.h
+++ b/media/libopus/celt/cpu_support.h
@@ -35,19 +35,20 @@
(defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR))
#include "arm/armcpu.h"
-/* We currently support 4 ARM variants:
+/* We currently support 5 ARM variants:
* arch[0] -> ARMv4
* arch[1] -> ARMv5E
* arch[2] -> ARMv6
* arch[3] -> NEON
+ * arch[4] -> NEON+DOTPROD
*/
-#define OPUS_ARCHMASK 3
+#define OPUS_ARCHMASK 7
#elif defined(OPUS_HAVE_RTCD) && \
((defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \
(defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \
(defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \
- (defined(OPUS_X86_MAY_HAVE_AVX) && !defined(OPUS_X86_PRESUME_AVX)))
+ (defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(OPUS_X86_PRESUME_AVX2)))
#include "x86/x86cpu.h"
/* We currently support 5 x86 variants:
diff --git a/media/libopus/celt/entdec.c b/media/libopus/celt/entdec.c
index 0b3433ed8b..027aa24bca 100644
--- a/media/libopus/celt/entdec.c
+++ b/media/libopus/celt/entdec.c
@@ -195,6 +195,27 @@ int ec_dec_icdf(ec_dec *_this,const unsigned char *_icdf,unsigned _ftb){
return ret;
}
+int ec_dec_icdf16(ec_dec *_this,const opus_uint16 *_icdf,unsigned _ftb){
+ opus_uint32 r;
+ opus_uint32 d;
+ opus_uint32 s;
+ opus_uint32 t;
+ int ret;
+ s=_this->rng;
+ d=_this->val;
+ r=s>>_ftb;
+ ret=-1;
+ do{
+ t=s;
+ s=IMUL32(r,_icdf[++ret]);
+ }
+ while(d<s);
+ _this->val=d-s;
+ _this->rng=t-s;
+ ec_dec_normalize(_this);
+ return ret;
+}
+
opus_uint32 ec_dec_uint(ec_dec *_this,opus_uint32 _ft){
unsigned ft;
unsigned s;
diff --git a/media/libopus/celt/entdec.h b/media/libopus/celt/entdec.h
index 025fc1870d..c81f26fdb2 100644
--- a/media/libopus/celt/entdec.h
+++ b/media/libopus/celt/entdec.h
@@ -81,6 +81,16 @@ int ec_dec_bit_logp(ec_dec *_this,unsigned _logp);
Return: The decoded symbol s.*/
int ec_dec_icdf(ec_dec *_this,const unsigned char *_icdf,unsigned _ftb);
+/*Decodes a symbol given an "inverse" CDF table.
+ No call to ec_dec_update() is necessary after this call.
+ _icdf: The "inverse" CDF, such that symbol s falls in the range
+ [s>0?ft-_icdf[s-1]:0,ft-_icdf[s]), where ft=1<<_ftb.
+ The values must be monotonically non-increasing, and the last value
+ must be 0.
+ _ftb: The number of bits of precision in the cumulative distribution.
+ Return: The decoded symbol s.*/
+int ec_dec_icdf16(ec_dec *_this,const opus_uint16 *_icdf,unsigned _ftb);
+
/*Extracts a raw unsigned integer with a non-power-of-2 range from the stream.
The bits must have been encoded with ec_enc_uint().
No call to ec_dec_update() is necessary after this call.
diff --git a/media/libopus/celt/entenc.c b/media/libopus/celt/entenc.c
index f1750d25b8..69c6f835d0 100644
--- a/media/libopus/celt/entenc.c
+++ b/media/libopus/celt/entenc.c
@@ -172,6 +172,17 @@ void ec_enc_icdf(ec_enc *_this,int _s,const unsigned char *_icdf,unsigned _ftb){
ec_enc_normalize(_this);
}
+void ec_enc_icdf16(ec_enc *_this,int _s,const opus_uint16 *_icdf,unsigned _ftb){
+ opus_uint32 r;
+ r=_this->rng>>_ftb;
+ if(_s>0){
+ _this->val+=_this->rng-IMUL32(r,_icdf[_s-1]);
+ _this->rng=IMUL32(r,_icdf[_s-1]-_icdf[_s]);
+ }
+ else _this->rng-=IMUL32(r,_icdf[_s]);
+ ec_enc_normalize(_this);
+}
+
void ec_enc_uint(ec_enc *_this,opus_uint32 _fl,opus_uint32 _ft){
unsigned ft;
unsigned fl;
diff --git a/media/libopus/celt/entenc.h b/media/libopus/celt/entenc.h
index f502eaf662..010874bbc1 100644
--- a/media/libopus/celt/entenc.h
+++ b/media/libopus/celt/entenc.h
@@ -64,6 +64,15 @@ void ec_enc_bit_logp(ec_enc *_this,int _val,unsigned _logp);
_ftb: The number of bits of precision in the cumulative distribution.*/
void ec_enc_icdf(ec_enc *_this,int _s,const unsigned char *_icdf,unsigned _ftb);
+/*Encodes a symbol given an "inverse" CDF table.
+ _s: The index of the symbol to encode.
+ _icdf: The "inverse" CDF, such that symbol _s falls in the range
+ [_s>0?ft-_icdf[_s-1]:0,ft-_icdf[_s]), where ft=1<<_ftb.
+ The values must be monotonically non-increasing, and the last value
+ must be 0.
+ _ftb: The number of bits of precision in the cumulative distribution.*/
+void ec_enc_icdf16(ec_enc *_this,int _s,const opus_uint16 *_icdf,unsigned _ftb);
+
/*Encodes a raw unsigned integer in the stream.
_fl: The integer to encode.
_ft: The number of integers that can be encoded (one more than the max).
diff --git a/media/libopus/celt/laplace.c b/media/libopus/celt/laplace.c
index a7bca874b6..2180966662 100644
--- a/media/libopus/celt/laplace.c
+++ b/media/libopus/celt/laplace.c
@@ -132,3 +132,104 @@ int ec_laplace_decode(ec_dec *dec, unsigned fs, int decay)
ec_dec_update(dec, fl, IMIN(fl+fs,32768), 32768);
return val;
}
+
+void ec_laplace_encode_p0(ec_enc *enc, int value, opus_uint16 p0, opus_uint16 decay)
+{
+ int s;
+ opus_uint16 sign_icdf[3];
+ sign_icdf[0] = 32768-p0;
+ sign_icdf[1] = sign_icdf[0]/2;
+ sign_icdf[2] = 0;
+ s = value == 0 ? 0 : (value > 0 ? 1 : 2);
+ ec_enc_icdf16(enc, s, sign_icdf, 15);
+ value = abs(value);
+ if (value)
+ {
+ int i;
+ opus_uint16 icdf[8];
+ icdf[0] = IMAX(7, decay);
+ for (i=1;i<7;i++)
+ {
+ icdf[i] = IMAX(7-i, (icdf[i-1] * (opus_int32)decay) >> 15);
+ }
+ icdf[7] = 0;
+ value--;
+ do {
+ ec_enc_icdf16(enc, IMIN(value, 7), icdf, 15);
+ value -= 7;
+ } while (value >= 0);
+ }
+}
+
+int ec_laplace_decode_p0(ec_dec *dec, opus_uint16 p0, opus_uint16 decay)
+{
+ int s;
+ int value;
+ opus_uint16 sign_icdf[3];
+ sign_icdf[0] = 32768-p0;
+ sign_icdf[1] = sign_icdf[0]/2;
+ sign_icdf[2] = 0;
+ s = ec_dec_icdf16(dec, sign_icdf, 15);
+ if (s==2) s = -1;
+ if (s != 0)
+ {
+ int i;
+ int v;
+ opus_uint16 icdf[8];
+ icdf[0] = IMAX(7, decay);
+ for (i=1;i<7;i++)
+ {
+ icdf[i] = IMAX(7-i, (icdf[i-1] * (opus_int32)decay) >> 15);
+ }
+ icdf[7] = 0;
+ value = 1;
+ do {
+ v = ec_dec_icdf16(dec, icdf, 15);
+ value += v;
+ } while (v == 7);
+ return s*value;
+ } else return 0;
+}
+
+#if 0
+
+#include <stdio.h>
+#define NB_VALS 10
+#define DATA_SIZE 10000
+int main() {
+ ec_enc enc;
+ ec_dec dec;
+ unsigned char *ptr;
+ int i;
+ int decay, p0;
+ int val[NB_VALS] = {6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+ /*for (i=0;i<NB_VALS;i++) {
+ val[i] = -log(rand()/(float)RAND_MAX);
+ if (rand()%2) val[i] = -val[i];
+ }*/
+ p0 = 16000;
+ decay = 16000;
+ ptr = (unsigned char *)malloc(DATA_SIZE);
+ ec_enc_init(&enc,ptr,DATA_SIZE);
+ for (i=0;i<NB_VALS;i++) {
+ printf("%d ", val[i]);
+ }
+ printf("\n");
+ for (i=0;i<NB_VALS;i++) {
+ ec_laplace_encode_p0(&enc, val[i], p0, decay);
+ }
+
+ ec_enc_done(&enc);
+
+ ec_dec_init(&dec,ec_get_buffer(&enc),ec_range_bytes(&enc));
+
+ for (i=0;i<NB_VALS;i++) {
+ val[i] = ec_laplace_decode_p0(&dec, p0, decay);
+ }
+ for (i=0;i<NB_VALS;i++) {
+ printf("%d ", val[i]);
+ }
+ printf("\n");
+}
+
+#endif
diff --git a/media/libopus/celt/laplace.h b/media/libopus/celt/laplace.h
index 46c14b5da5..8010ad9755 100644
--- a/media/libopus/celt/laplace.h
+++ b/media/libopus/celt/laplace.h
@@ -26,6 +26,9 @@
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
+#ifndef LAPLACE_H
+#define LAPLACE_H
+
#include "entenc.h"
#include "entdec.h"
@@ -46,3 +49,9 @@ void ec_laplace_encode(ec_enc *enc, int *value, unsigned fs, int decay);
@return Value decoded
*/
int ec_laplace_decode(ec_dec *dec, unsigned fs, int decay);
+
+
+int ec_laplace_decode_p0(ec_dec *dec, opus_uint16 p0, opus_uint16 decay);
+void ec_laplace_encode_p0(ec_enc *enc, int value, opus_uint16 p0, opus_uint16 decay);
+
+#endif
diff --git a/media/libopus/celt/mathops.h b/media/libopus/celt/mathops.h
index 478ac9187c..e2eece2937 100644
--- a/media/libopus/celt/mathops.h
+++ b/media/libopus/celt/mathops.h
@@ -230,6 +230,12 @@ static OPUS_INLINE opus_val32 celt_exp2_frac(opus_val16 x)
frac = SHL16(x, 4);
return ADD16(D0, MULT16_16_Q15(frac, ADD16(D1, MULT16_16_Q15(frac, ADD16(D2 , MULT16_16_Q15(D3,frac))))));
}
+
+#undef D0
+#undef D1
+#undef D2
+#undef D3
+
/** Base-2 exponential approximation (2^x). (Q10 input, Q16 output) */
static OPUS_INLINE opus_val32 celt_exp2(opus_val16 x)
{
diff --git a/media/libopus/celt/mips/celt_mipsr1.h b/media/libopus/celt/mips/celt_mipsr1.h
index c332fe0471..d1b25c204d 100644
--- a/media/libopus/celt/mips/celt_mipsr1.h
+++ b/media/libopus/celt/mips/celt_mipsr1.h
@@ -27,8 +27,8 @@
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __CELT_MIPSR1_H__
-#define __CELT_MIPSR1_H__
+#ifndef CELT_MIPSR1_H__
+#define CELT_MIPSR1_H__
#ifdef HAVE_CONFIG_H
#include "config.h"
@@ -149,4 +149,4 @@ void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
}
}
-#endif /* __CELT_MIPSR1_H__ */
+#endif /* CELT_MIPSR1_H__ */
diff --git a/media/libopus/celt/mips/mdct_mipsr1.h b/media/libopus/celt/mips/mdct_mipsr1.h
index 2934dab776..7456c181a5 100644
--- a/media/libopus/celt/mips/mdct_mipsr1.h
+++ b/media/libopus/celt/mips/mdct_mipsr1.h
@@ -38,8 +38,8 @@
MDCT implementation in FFMPEG, but has differences in signs, ordering
and scaling in many places.
*/
-#ifndef __MDCT_MIPSR1_H__
-#define __MDCT_MIPSR1_H__
+#ifndef MDCT_MIPSR1_H__
+#define MDCT_MIPSR1_H__
#ifndef SKIP_CONFIG_H
#ifdef HAVE_CONFIG_H
@@ -285,4 +285,4 @@ void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scala
}
}
}
-#endif /* __MDCT_MIPSR1_H__ */
+#endif /* MDCT_MIPSR1_H__ */
diff --git a/media/libopus/celt/mips/vq_mipsr1.h b/media/libopus/celt/mips/vq_mipsr1.h
index f26a33e755..1621c5624f 100644
--- a/media/libopus/celt/mips/vq_mipsr1.h
+++ b/media/libopus/celt/mips/vq_mipsr1.h
@@ -26,8 +26,8 @@
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __VQ_MIPSR1_H__
-#define __VQ_MIPSR1_H__
+#ifndef VQ_MIPSR1_H__
+#define VQ_MIPSR1_H__
#ifdef HAVE_CONFIG_H
#include "config.h"
@@ -113,4 +113,4 @@ void renormalise_vector(celt_norm *X, int N, opus_val16 gain, int arch)
/*return celt_sqrt(E);*/
}
-#endif /* __VQ_MIPSR1_H__ */
+#endif /* VQ_MIPSR1_H__ */
diff --git a/media/libopus/celt/os_support.h b/media/libopus/celt/os_support.h
index 009bf861da..7d2d378116 100644
--- a/media/libopus/celt/os_support.h
+++ b/media/libopus/celt/os_support.h
@@ -41,7 +41,7 @@
#include <string.h>
#include <stdlib.h>
-/** Opus wrapper for malloc(). To do your own dynamic allocation, all you need to do is replace this function and opus_free */
+/** Opus wrapper for malloc(). To do your own dynamic allocation replace this function, opus_realloc, and opus_free */
#ifndef OVERRIDE_OPUS_ALLOC
static OPUS_INLINE void *opus_alloc (size_t size)
{
@@ -49,7 +49,15 @@ static OPUS_INLINE void *opus_alloc (size_t size)
}
#endif
-/** Same as celt_alloc(), except that the area is only needed inside a CELT call (might cause problem with wideband though) */
+#ifndef OVERRIDE_OPUS_REALLOC
+static OPUS_INLINE void *opus_realloc (void *ptr, size_t size)
+{
+ return realloc(ptr, size);
+}
+#endif
+
+/** Used only for non-threadsafe pseudostack.
+ If desired, this can always return the same area of memory rather than allocating a new one every time. */
#ifndef OVERRIDE_OPUS_ALLOC_SCRATCH
static OPUS_INLINE void *opus_alloc_scratch (size_t size)
{
@@ -58,7 +66,7 @@ static OPUS_INLINE void *opus_alloc_scratch (size_t size)
}
#endif
-/** Opus wrapper for free(). To do your own dynamic allocation, all you need to do is replace this function and opus_alloc */
+/** Opus wrapper for free(). To do your own dynamic allocation replace this function, opus_alloc, and opus_realloc */
#ifndef OVERRIDE_OPUS_FREE
static OPUS_INLINE void opus_free (void *ptr)
{
diff --git a/media/libopus/celt/pitch.c b/media/libopus/celt/pitch.c
index 7998db4164..e33c60a3bf 100644
--- a/media/libopus/celt/pitch.c
+++ b/media/libopus/celt/pitch.c
@@ -262,7 +262,16 @@ celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y,
for (i=0;i<max_pitch-3;i+=4)
{
opus_val32 sum[4]={0,0,0,0};
- xcorr_kernel(_x, _y+i, sum, len, arch);
+#if defined(OPUS_CHECK_ASM) && defined(FIXED_POINT)
+ {
+ opus_val32 sum_c[4]={0,0,0,0};
+ xcorr_kernel_c(_x, _y+i, sum_c, len);
+#endif
+ xcorr_kernel(_x, _y+i, sum, len, arch);
+#if defined(OPUS_CHECK_ASM) && defined(FIXED_POINT)
+ celt_assert(memcmp(sum, sum_c, sizeof(sum)) == 0);
+ }
+#endif
xcorr[i]=sum[0];
xcorr[i+1]=sum[1];
xcorr[i+2]=sum[2];
diff --git a/media/libopus/celt/pitch.h b/media/libopus/celt/pitch.h
index e425f56aea..dd0e2bebd2 100644
--- a/media/libopus/celt/pitch.h
+++ b/media/libopus/celt/pitch.h
@@ -189,4 +189,15 @@ celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y,
# define celt_pitch_xcorr celt_pitch_xcorr_c
#endif
+#ifdef NON_STATIC_COMB_FILTER_CONST_C
+void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N,
+ opus_val16 g10, opus_val16 g11, opus_val16 g12);
+#endif
+
+#ifndef OVERRIDE_COMB_FILTER_CONST
+# define comb_filter_const(y, x, T, N, g10, g11, g12, arch) \
+ ((void)(arch),comb_filter_const_c(y, x, T, N, g10, g11, g12))
+#endif
+
+
#endif
diff --git a/media/libopus/celt/stack_alloc.h b/media/libopus/celt/stack_alloc.h
index ae40e2a165..e2739bdf66 100644
--- a/media/libopus/celt/stack_alloc.h
+++ b/media/libopus/celt/stack_alloc.h
@@ -141,7 +141,7 @@ extern char *global_stack_top;
#else
#define ALIGN(stack, size) ((stack) += ((size) - (long)(stack)) & ((size) - 1))
-#define PUSH(stack, size, type) (ALIGN((stack),sizeof(type)/sizeof(char)),(stack)+=(size)*(sizeof(type)/sizeof(char)),(type*)((stack)-(size)*(sizeof(type)/sizeof(char))))
+#define PUSH(stack, size, type) (ALIGN((stack),sizeof(type)/(sizeof(char))),(stack)+=(size)*(sizeof(type)/(sizeof(char))),(type*)((stack)-(size)*(sizeof(type)/(sizeof(char)))))
#if 0 /* Set this to 1 to instrument pseudostack usage */
#define RESTORE_STACK (printf("%ld %s:%d\n", global_stack-scratch_ptr, __FILE__, __LINE__),global_stack = _saved_stack)
#else
diff --git a/media/libopus/celt/x86/celt_lpc_sse4_1.c b/media/libopus/celt/x86/celt_lpc_sse4_1.c
index 5478568849..daf59d245a 100644
--- a/media/libopus/celt/x86/celt_lpc_sse4_1.c
+++ b/media/libopus/celt/x86/celt_lpc_sse4_1.c
@@ -64,9 +64,16 @@ void celt_fir_sse4_1(const opus_val16 *x,
{
opus_val32 sums[4] = {0};
__m128i vecSum, vecX;
-
- xcorr_kernel(rnum, x+i-ord, sums, ord, arch);
-
+#if defined(OPUS_CHECK_ASM)
+ {
+ opus_val32 sums_c[4] = {0};
+ xcorr_kernel_c(rnum, x+i-ord, sums_c, ord);
+#endif
+ xcorr_kernel(rnum, x+i-ord, sums, ord, arch);
+#if defined(OPUS_CHECK_ASM)
+ celt_assert(memcmp(sums, sums_c, sizeof(sums)) == 0);
+ }
+#endif
vecSum = _mm_loadu_si128((__m128i *)sums);
vecSum = _mm_add_epi32(vecSum, vecNoA);
vecSum = _mm_srai_epi32(vecSum, SIG_SHIFT);
diff --git a/media/libopus/celt/x86/pitch_avx.c b/media/libopus/celt/x86/pitch_avx.c
new file mode 100644
index 0000000000..f731762d84
--- /dev/null
+++ b/media/libopus/celt/x86/pitch_avx.c
@@ -0,0 +1,101 @@
+/* Copyright (c) 2023 Amazon */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+
+#include <immintrin.h>
+#include "x86cpu.h"
+#include "pitch.h"
+
+#if defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(FIXED_POINT)
+
+/* Like the "regular" xcorr_kernel(), but computes 8 results at a time. */
+static void xcorr_kernel_avx(const float *x, const float *y, float sum[8], int len)
+{
+ __m256 xsum0, xsum1, xsum2, xsum3, xsum4, xsum5, xsum6, xsum7;
+ xsum7 = xsum6 = xsum5 = xsum4 = xsum3 = xsum2 = xsum1 = xsum0 = _mm256_setzero_ps();
+ int i;
+ __m256 x0;
+ /* Compute 8 inner products using partial sums. */
+ for (i=0;i<len-7;i+=8)
+ {
+ x0 = _mm256_loadu_ps(x+i);
+ xsum0 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i ), xsum0);
+ xsum1 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+1), xsum1);
+ xsum2 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+2), xsum2);
+ xsum3 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+3), xsum3);
+ xsum4 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+4), xsum4);
+ xsum5 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+5), xsum5);
+ xsum6 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+6), xsum6);
+ xsum7 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+7), xsum7);
+ }
+ if (i != len) {
+ static const int mask[15] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0};
+ __m256i m;
+ m = _mm256_loadu_si256((__m256i*)(void*)(mask + 7+i-len));
+ x0 = _mm256_maskload_ps(x+i, m);
+ xsum0 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i , m), xsum0);
+ xsum1 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+1, m), xsum1);
+ xsum2 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+2, m), xsum2);
+ xsum3 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+3, m), xsum3);
+ xsum4 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+4, m), xsum4);
+ xsum5 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+5, m), xsum5);
+ xsum6 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+6, m), xsum6);
+ xsum7 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+7, m), xsum7);
+ }
+ /* 8 horizontal adds. */
+ /* Compute [0 4] [1 5] [2 6] [3 7] */
+ xsum0 = _mm256_add_ps(_mm256_permute2f128_ps(xsum0, xsum4, 2<<4), _mm256_permute2f128_ps(xsum0, xsum4, 1 | (3<<4)));
+ xsum1 = _mm256_add_ps(_mm256_permute2f128_ps(xsum1, xsum5, 2<<4), _mm256_permute2f128_ps(xsum1, xsum5, 1 | (3<<4)));
+ xsum2 = _mm256_add_ps(_mm256_permute2f128_ps(xsum2, xsum6, 2<<4), _mm256_permute2f128_ps(xsum2, xsum6, 1 | (3<<4)));
+ xsum3 = _mm256_add_ps(_mm256_permute2f128_ps(xsum3, xsum7, 2<<4), _mm256_permute2f128_ps(xsum3, xsum7, 1 | (3<<4)));
+ /* Compute [0 1 4 5] [2 3 6 7] */
+ xsum0 = _mm256_hadd_ps(xsum0, xsum1);
+ xsum1 = _mm256_hadd_ps(xsum2, xsum3);
+ /* Compute [0 1 2 3 4 5 6 7] */
+ xsum0 = _mm256_hadd_ps(xsum0, xsum1);
+ _mm256_storeu_ps(sum, xsum0);
+}
+
+void celt_pitch_xcorr_avx2(const float *_x, const float *_y, float *xcorr, int len, int max_pitch, int arch)
+{
+ int i;
+ celt_assert(max_pitch>0);
+ (void)arch;
+ for (i=0;i<max_pitch-7;i+=8)
+ {
+ xcorr_kernel_avx(_x, _y+i, &xcorr[i], len);
+ }
+ for (;i<max_pitch;i++)
+ {
+ xcorr[i] = celt_inner_prod(_x, _y+i, len, arch);
+ }
+}
+
+#endif
diff --git a/media/libopus/celt/x86/pitch_sse.h b/media/libopus/celt/x86/pitch_sse.h
index 964aef50db..127581f3e1 100644
--- a/media/libopus/celt/x86/pitch_sse.h
+++ b/media/libopus/celt/x86/pitch_sse.h
@@ -131,12 +131,6 @@ extern opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)
-#define OVERRIDE_DUAL_INNER_PROD
-#define OVERRIDE_COMB_FILTER_CONST
-
-#undef dual_inner_prod
-#undef comb_filter_const
-
void dual_inner_prod_sse(const opus_val16 *x,
const opus_val16 *y01,
const opus_val16 *y02,
@@ -154,13 +148,17 @@ void comb_filter_const_sse(opus_val32 *y,
#if defined(OPUS_X86_PRESUME_SSE)
+#define OVERRIDE_DUAL_INNER_PROD
+#define OVERRIDE_COMB_FILTER_CONST
# define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) \
((void)(arch),dual_inner_prod_sse(x, y01, y02, N, xy1, xy2))
# define comb_filter_const(y, x, T, N, g10, g11, g12, arch) \
((void)(arch),comb_filter_const_sse(y, x, T, N, g10, g11, g12))
-#else
+#elif defined(OPUS_HAVE_RTCD)
+#define OVERRIDE_DUAL_INNER_PROD
+#define OVERRIDE_COMB_FILTER_CONST
extern void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
const opus_val16 *x,
const opus_val16 *y01,
@@ -187,6 +185,32 @@ extern void (*const COMB_FILTER_CONST_IMPL[OPUS_ARCHMASK + 1])(
#define NON_STATIC_COMB_FILTER_CONST_C
#endif
-#endif
+
+void celt_pitch_xcorr_avx2(const float *_x, const float *_y, float *xcorr, int len, int max_pitch, int arch);
+
+#if defined(OPUS_X86_PRESUME_AVX2) /* AVX2 presumed at build time: celt_pitch_xcorr is bound directly to the AVX2 function */
+
+#define OVERRIDE_PITCH_XCORR
+# define celt_pitch_xcorr celt_pitch_xcorr_avx2
+
+#elif defined(OPUS_HAVE_RTCD) && defined(OPUS_X86_MAY_HAVE_AVX2) /* otherwise: call through a function-pointer table indexed by arch */
+
+#define OVERRIDE_PITCH_XCORR
+extern void (*const PITCH_XCORR_IMPL[OPUS_ARCHMASK + 1])(
+ const float *_x,
+ const float *_y,
+ float *xcorr,
+ int len,
+ int max_pitch,
+ int arch
+ );
+
+#define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
+ ((*PITCH_XCORR_IMPL[(arch) & OPUS_ARCHMASK])(_x, _y, xcorr, len, max_pitch, arch))
+
+
+#endif /* OPUS_X86_PRESUME_AVX2 / (OPUS_HAVE_RTCD && OPUS_X86_MAY_HAVE_AVX2) */
+
+#endif /* OPUS_X86_MAY_HAVE_SSE && !FIXED_POINT */
#endif
diff --git a/media/libopus/celt/x86/vq_sse.h b/media/libopus/celt/x86/vq_sse.h
index b4efe8f249..444503b630 100644
--- a/media/libopus/celt/x86/vq_sse.h
+++ b/media/libopus/celt/x86/vq_sse.h
@@ -28,16 +28,18 @@
#define VQ_SSE_H
#if defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(FIXED_POINT)
-#define OVERRIDE_OP_PVQ_SEARCH
opus_val16 op_pvq_search_sse2(celt_norm *_X, int *iy, int K, int N, int arch);
#if defined(OPUS_X86_PRESUME_SSE2)
+
+#define OVERRIDE_OP_PVQ_SEARCH
#define op_pvq_search(x, iy, K, N, arch) \
(op_pvq_search_sse2(x, iy, K, N, arch))
-#else
+#elif defined(OPUS_HAVE_RTCD)
+#define OVERRIDE_OP_PVQ_SEARCH
extern opus_val16 (*const OP_PVQ_SEARCH_IMPL[OPUS_ARCHMASK + 1])(
celt_norm *_X, int *iy, int K, int N, int arch);
diff --git a/media/libopus/celt/x86/vq_sse2.c b/media/libopus/celt/x86/vq_sse2.c
index 775042860d..4c4ebf8e2d 100644
--- a/media/libopus/celt/x86/vq_sse2.c
+++ b/media/libopus/celt/x86/vq_sse2.c
@@ -75,7 +75,7 @@ opus_val16 op_pvq_search_sse2(celt_norm *_X, int *iy, int K, int N, int arch)
sums = _mm_add_ps(sums, x4);
/* Clear y and iy in case we don't do the projection. */
_mm_storeu_ps(&y[j], _mm_setzero_ps());
- _mm_storeu_si128((__m128i*)&iy[j], _mm_setzero_si128());
+ _mm_storeu_si128((__m128i*)(void*)&iy[j], _mm_setzero_si128());
_mm_storeu_ps(&X[j], x4);
_mm_storeu_ps(&signy[j], s4);
}
@@ -116,7 +116,7 @@ opus_val16 op_pvq_search_sse2(celt_norm *_X, int *iy, int K, int N, int arch)
rx4 = _mm_mul_ps(x4, rcp4);
iy4 = _mm_cvttps_epi32(rx4);
pulses_sum = _mm_add_epi32(pulses_sum, iy4);
- _mm_storeu_si128((__m128i*)&iy[j], iy4);
+ _mm_storeu_si128((__m128i*)(void*)&iy[j], iy4);
y4 = _mm_cvtepi32_ps(iy4);
xy4 = _mm_add_ps(xy4, _mm_mul_ps(x4, y4));
yy4 = _mm_add_ps(yy4, _mm_mul_ps(y4, y4));
@@ -205,10 +205,10 @@ opus_val16 op_pvq_search_sse2(celt_norm *_X, int *iy, int K, int N, int arch)
{
__m128i y4;
__m128i s4;
- y4 = _mm_loadu_si128((__m128i*)&iy[j]);
+ y4 = _mm_loadu_si128((__m128i*)(void*)&iy[j]);
s4 = _mm_castps_si128(_mm_loadu_ps(&signy[j]));
y4 = _mm_xor_si128(_mm_add_epi32(y4, s4), s4);
- _mm_storeu_si128((__m128i*)&iy[j], y4);
+ _mm_storeu_si128((__m128i*)(void*)&iy[j], y4);
}
RESTORE_STACK;
return yy;
diff --git a/media/libopus/celt/x86/x86_arch_macros.h b/media/libopus/celt/x86/x86_arch_macros.h
new file mode 100644
index 0000000000..975b443e93
--- /dev/null
+++ b/media/libopus/celt/x86/x86_arch_macros.h
@@ -0,0 +1,47 @@
+/* Copyright (c) 2023 Amazon */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef _MSC_VER /* MSVC only: define __SSE__/__SSE2__/__SSE4_1__ (if not already defined) for each enabled OPUS_X86_MAY_HAVE_* option */
+
+# ifdef OPUS_X86_MAY_HAVE_SSE
+# ifndef __SSE__
+# define __SSE__
+# endif
+# endif
+
+# ifdef OPUS_X86_MAY_HAVE_SSE2
+# ifndef __SSE2__
+# define __SSE2__
+# endif
+# endif
+
+# ifdef OPUS_X86_MAY_HAVE_SSE4_1
+# ifndef __SSE4_1__
+# define __SSE4_1__
+# endif
+# endif
+
+#endif /* _MSC_VER */
diff --git a/media/libopus/celt/x86/x86_celt_map.c b/media/libopus/celt/x86/x86_celt_map.c
index d39d88edec..ba8eafe6ad 100644
--- a/media/libopus/celt/x86/x86_celt_map.c
+++ b/media/libopus/celt/x86/x86_celt_map.c
@@ -90,6 +90,26 @@ opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
# else
+#if defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(OPUS_X86_PRESUME_AVX2)
+
+void (*const PITCH_XCORR_IMPL[OPUS_ARCHMASK + 1])(
+ const float *_x,
+ const float *_y,
+ float *xcorr,
+ int len,
+ int max_pitch,
+ int arch
+) = {
+ celt_pitch_xcorr_c, /* non-sse */
+ celt_pitch_xcorr_c, /* NOTE(review): slots 1-3 are presumably the sse, sse2 and sse4.1 levels -- confirm against opus_select_arch_impl */
+ celt_pitch_xcorr_c,
+ celt_pitch_xcorr_c,
+ MAY_HAVE_AVX2(celt_pitch_xcorr) /* avx2 slot: the only entry that is not the plain C version */
+};
+
+#endif
+
+
#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)
void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
diff --git a/media/libopus/celt/x86/x86cpu.c b/media/libopus/celt/x86/x86cpu.c
index 6a1914dee7..2e7c32aeec 100644
--- a/media/libopus/celt/x86/x86cpu.c
+++ b/media/libopus/celt/x86/x86cpu.c
@@ -39,7 +39,7 @@
((defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \
(defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \
(defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \
- (defined(OPUS_X86_MAY_HAVE_AVX) && !defined(OPUS_X86_PRESUME_AVX)))
+ (defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(OPUS_X86_PRESUME_AVX2)))
#if defined(_MSC_VER)
@@ -105,7 +105,7 @@ typedef struct CPU_Feature{
int HW_SSE2;
int HW_SSE41;
/* SIMD: 256-bit */
- int HW_AVX;
+ int HW_AVX2;
} CPU_Feature;
static void opus_cpu_feature_check(CPU_Feature *cpu_feature)
@@ -121,13 +121,19 @@ static void opus_cpu_feature_check(CPU_Feature *cpu_feature)
cpu_feature->HW_SSE = (info[3] & (1 << 25)) != 0;
cpu_feature->HW_SSE2 = (info[3] & (1 << 26)) != 0;
cpu_feature->HW_SSE41 = (info[2] & (1 << 19)) != 0;
- cpu_feature->HW_AVX = (info[2] & (1 << 28)) != 0;
+ cpu_feature->HW_AVX2 = (info[2] & (1 << 28)) != 0 && (info[2] & (1 << 12)) != 0;
+ if (cpu_feature->HW_AVX2 && nIds >= 7) {
+ cpuid(info, 7);
+ cpu_feature->HW_AVX2 = cpu_feature->HW_AVX2 && (info[1] & (1 << 5)) != 0;
+ } else {
+ cpu_feature->HW_AVX2 = 0;
+ }
}
else {
cpu_feature->HW_SSE = 0;
cpu_feature->HW_SSE2 = 0;
cpu_feature->HW_SSE41 = 0;
- cpu_feature->HW_AVX = 0;
+ cpu_feature->HW_AVX2 = 0;
}
}
@@ -157,7 +163,7 @@ static int opus_select_arch_impl(void)
}
arch++;
- if (!cpu_feature.HW_AVX)
+ if (!cpu_feature.HW_AVX2)
{
return arch;
}
diff --git a/media/libopus/celt/x86/x86cpu.h b/media/libopus/celt/x86/x86cpu.h
index 04e80489b1..8ae9be8d8f 100644
--- a/media/libopus/celt/x86/x86cpu.h
+++ b/media/libopus/celt/x86/x86cpu.h
@@ -46,28 +46,53 @@
# define MAY_HAVE_SSE4_1(name) name ## _c
# endif
-# if defined(OPUS_X86_MAY_HAVE_AVX)
-# define MAY_HAVE_AVX(name) name ## _avx
+# if defined(OPUS_X86_MAY_HAVE_AVX2)
+# define MAY_HAVE_AVX2(name) name ## _avx2
# else
-# define MAY_HAVE_AVX(name) name ## _c
+# define MAY_HAVE_AVX2(name) name ## _c
# endif
-# if defined(OPUS_HAVE_RTCD)
+# if defined(OPUS_HAVE_RTCD) && \
+ ((defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \
+ (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \
+ (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \
+ (defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(OPUS_X86_PRESUME_AVX2)))
int opus_select_arch(void);
# endif
+# if defined(OPUS_X86_MAY_HAVE_SSE2)
+# include "opus_defines.h"
+
/*MOVD should not impose any alignment restrictions, but the C standard does,
and UBSan will report errors if we actually make unaligned accesses.
Use this to work around those restrictions (which should hopefully all get
- optimized to a single MOVD instruction).*/
-#define OP_LOADU_EPI32(x) \
- (int)((*(unsigned char *)(x) | *((unsigned char *)(x) + 1) << 8U |\
- *((unsigned char *)(x) + 2) << 16U | (opus_uint32)*((unsigned char *)(x) + 3) << 24U))
+ optimized to a single MOVD instruction).
+ GCC implemented _mm_loadu_si32() since GCC 11; HOWEVER, there is a bug!
+ https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99754 */
+# if !defined(_MSC_VER) && !OPUS_GNUC_PREREQ(11,3) && !(defined(__clang__) && (__clang_major__ >= 8))
+# include <string.h>
+# include <emmintrin.h>
+
+# ifdef _mm_loadu_si32
+# undef _mm_loadu_si32
+# endif
+# define _mm_loadu_si32 WORKAROUND_mm_loadu_si32
+static inline __m128i WORKAROUND_mm_loadu_si32(void const* mem_addr) { /* replacement bound to the name _mm_loadu_si32 by the #define just before it */
+ int val;
+ memcpy(&val, mem_addr, sizeof(val)); /* byte-wise copy, so mem_addr needs no particular alignment */
+ return _mm_cvtsi32_si128(val); /* places val in the low 32 bits of the vector; the remaining bits are zero */
+}
+# elif defined(_MSC_VER)
+ /* MSVC needs this for _mm_loadu_si32 */
+# include <immintrin.h>
+# endif
-#define OP_CVTEPI8_EPI32_M32(x) \
- (_mm_cvtepi8_epi32(_mm_cvtsi32_si128(OP_LOADU_EPI32(x))))
+# define OP_CVTEPI8_EPI32_M32(x) \
+ (_mm_cvtepi8_epi32(_mm_loadu_si32(x)))
-#define OP_CVTEPI16_EPI32_M64(x) \
- (_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(x))))
+# define OP_CVTEPI16_EPI32_M64(x) \
+ (_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(void*)(x))))
+
+# endif
#endif