From 086c044dc34dfc0f74fbe41f4ecb402b2cd34884 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Fri, 19 Apr 2024 03:13:33 +0200 Subject: Merging upstream version 125.0.1. Signed-off-by: Daniel Baumann --- media/libopus/celt/arm/arm_celt_map.c | 31 ++- media/libopus/celt/arm/armcpu.c | 51 ++++- media/libopus/celt/arm/armcpu.h | 13 ++ media/libopus/celt/arm/celt_neon_intr.c | 83 ++++++- media/libopus/celt/arm/pitch_neon_intr.c | 7 + media/libopus/celt/celt.h | 25 +-- media/libopus/celt/celt_decoder.c | 360 ++++++++++++++++++++++++------- media/libopus/celt/celt_encoder.c | 68 ++++-- media/libopus/celt/celt_lpc.c | 27 ++- media/libopus/celt/celt_lpc.h | 2 +- media/libopus/celt/cpu_support.h | 7 +- media/libopus/celt/entdec.c | 21 ++ media/libopus/celt/entdec.h | 10 + media/libopus/celt/entenc.c | 11 + media/libopus/celt/entenc.h | 9 + media/libopus/celt/laplace.c | 101 +++++++++ media/libopus/celt/laplace.h | 9 + media/libopus/celt/mathops.h | 6 + media/libopus/celt/mips/celt_mipsr1.h | 6 +- media/libopus/celt/mips/mdct_mipsr1.h | 6 +- media/libopus/celt/mips/vq_mipsr1.h | 6 +- media/libopus/celt/os_support.h | 14 +- media/libopus/celt/pitch.c | 11 +- media/libopus/celt/pitch.h | 11 + media/libopus/celt/stack_alloc.h | 2 +- media/libopus/celt/x86/celt_lpc_sse4_1.c | 13 +- media/libopus/celt/x86/pitch_avx.c | 101 +++++++++ media/libopus/celt/x86/pitch_sse.h | 40 +++- media/libopus/celt/x86/vq_sse.h | 6 +- media/libopus/celt/x86/vq_sse2.c | 8 +- media/libopus/celt/x86/x86_arch_macros.h | 47 ++++ media/libopus/celt/x86/x86_celt_map.c | 20 ++ media/libopus/celt/x86/x86cpu.c | 16 +- media/libopus/celt/x86/x86cpu.h | 49 +++-- 34 files changed, 1014 insertions(+), 183 deletions(-) create mode 100644 media/libopus/celt/x86/pitch_avx.c create mode 100644 media/libopus/celt/x86/x86_arch_macros.h (limited to 'media/libopus/celt') diff --git a/media/libopus/celt/arm/arm_celt_map.c b/media/libopus/celt/arm/arm_celt_map.c index ca988b66f5..cbaea49579 100644 --- a/media/libopus/celt/arm/arm_celt_map.c +++ b/media/libopus/celt/arm/arm_celt_map.c @@ -40,7 +40,8 @@ opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, c celt_inner_prod_c, /* ARMv4 */ celt_inner_prod_c, /* EDSP */ celt_inner_prod_c, /* Media */ - celt_inner_prod_neon /* NEON */ + celt_inner_prod_neon,/* NEON */ + celt_inner_prod_neon /* DOTPROD */ }; void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, @@ -48,7 +49,8 @@ void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const o dual_inner_prod_c, /* ARMv4 */ dual_inner_prod_c, /* EDSP */ dual_inner_prod_c, /* Media */ - dual_inner_prod_neon /* NEON */ + dual_inner_prod_neon,/* NEON */ + dual_inner_prod_neon /* DOTPROD */ }; # endif @@ -61,7 +63,8 @@ opus_val32 (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *, celt_pitch_xcorr_c, /* ARMv4 */ MAY_HAVE_EDSP(celt_pitch_xcorr), /* EDSP */ MAY_HAVE_MEDIA(celt_pitch_xcorr), /* Media */ - MAY_HAVE_NEON(celt_pitch_xcorr) /* NEON */ + MAY_HAVE_NEON(celt_pitch_xcorr), /* NEON */ + MAY_HAVE_NEON(celt_pitch_xcorr) /* DOTPROD */ }; # endif @@ -72,7 +75,8 @@ void (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *, celt_pitch_xcorr_c, /* ARMv4 */ celt_pitch_xcorr_c, /* EDSP */ celt_pitch_xcorr_c, /* Media */ - celt_pitch_xcorr_float_neon /* Neon */ + celt_pitch_xcorr_float_neon, /* Neon */ + celt_pitch_xcorr_float_neon /* DOTPROD */ }; # endif # endif /* FIXED_POINT */ @@ -90,6 +94,7 @@ void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])( xcorr_kernel_c, /* EDSP */ xcorr_kernel_c, /* Media */ xcorr_kernel_neon_fixed, /* Neon */ + xcorr_kernel_neon_fixed /* DOTPROD */ }; #endif @@ -101,14 +106,16 @@ int (*const OPUS_FFT_ALLOC_ARCH_IMPL[OPUS_ARCHMASK+1])(kiss_fft_state *st) = { opus_fft_alloc_arch_c, /* ARMv4 */ opus_fft_alloc_arch_c, /* EDSP */ opus_fft_alloc_arch_c, /* Media */ - opus_fft_alloc_arm_neon /* Neon with NE10 library support */ + opus_fft_alloc_arm_neon, /* Neon with NE10 library support */ + opus_fft_alloc_arm_neon /* DOTPROD with NE10 library support */ }; void (*const OPUS_FFT_FREE_ARCH_IMPL[OPUS_ARCHMASK+1])(kiss_fft_state *st) = { opus_fft_free_arch_c, /* ARMv4 */ opus_fft_free_arch_c, /* EDSP */ opus_fft_free_arch_c, /* Media */ - opus_fft_free_arm_neon /* Neon with NE10 */ + opus_fft_free_arm_neon, /* Neon with NE10 */ + opus_fft_free_arm_neon /* DOTPROD with NE10 */ }; # endif /* CUSTOM_MODES */ @@ -118,7 +125,8 @@ void (*const OPUS_FFT[OPUS_ARCHMASK+1])(const kiss_fft_state *cfg, opus_fft_c, /* ARMv4 */ opus_fft_c, /* EDSP */ opus_fft_c, /* Media */ - opus_fft_neon /* Neon with NE10 */ + opus_fft_neon, /* Neon with NE10 */ + opus_fft_neon /* DOTPROD with NE10 */ }; void (*const OPUS_IFFT[OPUS_ARCHMASK+1])(const kiss_fft_state *cfg, @@ -127,7 +135,8 @@ void (*const OPUS_IFFT[OPUS_ARCHMASK+1])(const kiss_fft_state *cfg, opus_ifft_c, /* ARMv4 */ opus_ifft_c, /* EDSP */ opus_ifft_c, /* Media */ - opus_ifft_neon /* Neon with NE10 */ + opus_ifft_neon, /* Neon with NE10 */ + opus_ifft_neon /* DOTPROD with NE10 */ }; void (*const CLT_MDCT_FORWARD_IMPL[OPUS_ARCHMASK+1])(const mdct_lookup *l, @@ -139,7 +148,8 @@ void (*const CLT_MDCT_FORWARD_IMPL[OPUS_ARCHMASK+1])(const mdct_lookup *l, clt_mdct_forward_c, /* ARMv4 */ clt_mdct_forward_c, /* EDSP */ clt_mdct_forward_c, /* Media */ - clt_mdct_forward_neon /* Neon with NE10 */ + clt_mdct_forward_neon, /* Neon with NE10 */ + clt_mdct_forward_neon /* DOTPROD with NE10 */ }; void (*const CLT_MDCT_BACKWARD_IMPL[OPUS_ARCHMASK+1])(const mdct_lookup *l, @@ -151,7 +161,8 @@ void (*const CLT_MDCT_BACKWARD_IMPL[OPUS_ARCHMASK+1])(const mdct_lookup *l, clt_mdct_backward_c, /* ARMv4 */ clt_mdct_backward_c, /* EDSP */ clt_mdct_backward_c, /* Media */ - clt_mdct_backward_neon /* Neon with NE10 */ + clt_mdct_backward_neon, /* Neon with NE10 */ + clt_mdct_backward_neon /* DOTPROD with NE10 */ }; # endif /* HAVE_ARM_NE10 */ diff --git a/media/libopus/celt/arm/armcpu.c b/media/libopus/celt/arm/armcpu.c index c7d16e6d61..06a53435b8 100644 --- a/media/libopus/celt/arm/armcpu.c +++ b/media/libopus/celt/arm/armcpu.c @@ -43,6 +43,7 @@ #define OPUS_CPU_ARM_EDSP_FLAG (1< +#include + +opus_uint32 opus_cpu_capabilities(void) +{ + opus_uint32 flags = 0; + +#if defined(OPUS_ARM_MAY_HAVE_DOTPROD) + size_t size = sizeof(uint32_t); + uint32_t value = 0; + if (!sysctlbyname("hw.optional.arm.FEAT_DotProd", &value, &size, NULL, 0) && value) + { + flags |= OPUS_CPU_ARM_DOTPROD_FLAG; + } +#endif + +#if defined(OPUS_ARM_PRESUME_AARCH64_NEON_INTR) + flags |= OPUS_CPU_ARM_EDSP_FLAG | OPUS_CPU_ARM_MEDIA_FLAG | OPUS_CPU_ARM_NEON_FLAG; +# if defined(OPUS_ARM_PRESUME_DOTPROD) + flags |= OPUS_CPU_ARM_DOTPROD_FLAG; +# endif +#endif + return flags; +} + #else /* The feature registers which can tell us what the processor supports are * accessible in priveleged modes only, so we can't have a general user-space @@ -180,7 +223,13 @@ static int opus_select_arch_impl(void) } arch++; - celt_assert(arch == OPUS_ARCH_ARM_NEON); + if(!(flags & OPUS_CPU_ARM_DOTPROD_FLAG)) { + celt_assert(arch == OPUS_ARCH_ARM_NEON); + return arch; + } + arch++; + + celt_assert(arch == OPUS_ARCH_ARM_DOTPROD); return arch; } diff --git a/media/libopus/celt/arm/armcpu.h b/media/libopus/celt/arm/armcpu.h index 820262ff5f..6d5803d81a 100644 --- a/media/libopus/celt/arm/armcpu.h +++ b/media/libopus/celt/arm/armcpu.h @@ -46,6 +46,12 @@ # define MAY_HAVE_NEON(name) MAY_HAVE_MEDIA(name) # endif +# if defined(OPUS_ARM_MAY_HAVE_DOTPROD) +# define MAY_HAVE_DOTPROD(name) name ## _dotprod +# else +# define MAY_HAVE_DOTPROD(name) MAY_HAVE_NEON(name) +# endif + # if defined(OPUS_ARM_PRESUME_EDSP) # define PRESUME_EDSP(name) name ## _edsp # else @@ -64,6 +70,12 @@ # define PRESUME_NEON(name) PRESUME_MEDIA(name) # endif +# if defined(OPUS_ARM_PRESUME_DOTPROD) +# define PRESUME_DOTPROD(name) name ## _dotprod +# else +# define PRESUME_DOTPROD(name) PRESUME_NEON(name) +# endif + # if defined(OPUS_HAVE_RTCD) int opus_select_arch(void); @@ -71,6 +83,7 @@ int opus_select_arch(void); #define OPUS_ARCH_ARM_EDSP (1) #define OPUS_ARCH_ARM_MEDIA (2) #define OPUS_ARCH_ARM_NEON (3) +#define OPUS_ARCH_ARM_DOTPROD (4) # endif diff --git a/media/libopus/celt/arm/celt_neon_intr.c b/media/libopus/celt/arm/celt_neon_intr.c index effda769d0..250f836218 100644 --- a/media/libopus/celt/arm/celt_neon_intr.c +++ b/media/libopus/celt/arm/celt_neon_intr.c @@ -38,6 +38,8 @@ #include "../pitch.h" #if defined(FIXED_POINT) +#include + void xcorr_kernel_neon_fixed(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[4], int len) { int j; @@ -47,7 +49,10 @@ void xcorr_kernel_neon_fixed(const opus_val16 * x, const opus_val16 * y, opus_va int16x4_t y0 = vld1_s16(y); y += 4; - for (j = 0; j + 8 <= len; j += 8) + /* This loop loads one y value more than we actually need. + Therefore we have to stop as soon as there are 8 or fewer samples left + (instead of 7), to avoid reading past the end of the array. */ + for (j = 0; j + 8 < len; j += 8) { /* Load x[0...7] */ int16x8_t xx = vld1q_s16(x); @@ -80,23 +85,79 @@ void xcorr_kernel_neon_fixed(const opus_val16 * x, const opus_val16 * y, opus_va x += 8; y += 8; } - - for (; j < len; j++) - { - int16x4_t x0 = vld1_dup_s16(x); /* load next x */ + if (j + 4 < len) { + /* Load x[0...3] */ + int16x4_t x0 = vld1_s16(x); + /* Load y[4...7] */ + int16x4_t y4 = vld1_s16(y); + int32x4_t a0 = vmlal_lane_s16(a, y0, x0, 0); + int16x4_t y1 = vext_s16(y0, y4, 1); + int32x4_t a1 = vmlal_lane_s16(a0, y1, x0, 1); + int16x4_t y2 = vext_s16(y0, y4, 2); + int32x4_t a2 = vmlal_lane_s16(a1, y2, x0, 2); + int16x4_t y3 = vext_s16(y0, y4, 3); + int32x4_t a3 = vmlal_lane_s16(a2, y3, x0, 3); + y0 = y4; + a = a3; + x += 4; + y += 4; + j += 4; + } + if (j + 2 < len) { + /* Load x[0...1] */ + int16x4x2_t xx = vld2_dup_s16(x); + int16x4_t x0 = xx.val[0]; + int16x4_t x1 = xx.val[1]; + /* Load y[4...5]. + We would like to use vld1_dup_s32(), but casting the pointer would + break strict aliasing rules and potentially have alignment issues. + Fortunately the compiler seems capable of translating this memcpy() + and vdup_n_s32() into the equivalent vld1_dup_s32().*/ + int32_t yy; + memcpy(&yy, y, sizeof(yy)); + int16x4_t y4 = vreinterpret_s16_s32(vdup_n_s32(yy)); int32x4_t a0 = vmlal_s16(a, y0, x0); - - int16x4_t y4 = vld1_dup_s16(y); /* load next y */ - y0 = vext_s16(y0, y4, 1); + int16x4_t y1 = vext_s16(y0, y4, 1); + /* Replace bottom copy of {y[5], y[4]} in y4 with {y[3], y[2]} from y0, + using VSRI instead of VEXT, since it's a data-processing + instruction. */ + y0 = vreinterpret_s16_s64(vsri_n_s64(vreinterpret_s64_s16(y4), + vreinterpret_s64_s16(y0), 32)); + int32x4_t a1 = vmlal_s16(a0, y1, x1); + a = a1; + x += 2; + y += 2; + j += 2; + } + if (j + 1 < len) { + /* Load next x. */ + int16x4_t x0 = vld1_dup_s16(x); + int32x4_t a0 = vmlal_s16(a, y0, x0); + /* Load last y. */ + int16x4_t y4 = vld1_dup_s16(y); + y0 = vreinterpret_s16_s64(vsri_n_s64(vreinterpret_s64_s16(y4), + vreinterpret_s64_s16(y0), 16)); a = a0; x++; - y++; } - - vst1q_s32(sum, a); + /* Load last x. */ + int16x4_t x0 = vld1_dup_s16(x); + int32x4_t a0 = vmlal_s16(a, y0, x0); + vst1q_s32(sum, a0); } #else + +#if defined(__ARM_FEATURE_FMA) && defined(__ARM_ARCH_ISA_A64) +/* If we can, force the compiler to use an FMA instruction rather than break + * vmlaq_f32() into fmul/fadd. */ +#ifdef vmlaq_lane_f32 +#undef vmlaq_lane_f32 +#endif +#define vmlaq_lane_f32(a,b,c,lane) vfmaq_lane_f32(a,b,c,lane) +#endif + + /* * Function: xcorr_kernel_neon_float * --------------------------------- diff --git a/media/libopus/celt/arm/pitch_neon_intr.c b/media/libopus/celt/arm/pitch_neon_intr.c index 35cc46e2c2..43885f528c 100644 --- a/media/libopus/celt/arm/pitch_neon_intr.c +++ b/media/libopus/celt/arm/pitch_neon_intr.c @@ -130,6 +130,13 @@ void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus /* ========================================================================== */ +#ifdef __ARM_FEATURE_FMA +/* If we can, force the compiler to use an FMA instruction rather than break + vmlaq_f32() into fmul/fadd. */ +#define vmlaq_f32(a,b,c) vfmaq_f32(a,b,c) +#endif + + #ifdef OPUS_CHECK_ASM /* This part of code simulates floating-point NEON operations. */ diff --git a/media/libopus/celt/celt.h b/media/libopus/celt/celt.h index 24b6b2b520..2f501951d5 100644 --- a/media/libopus/celt/celt.h +++ b/media/libopus/celt/celt.h @@ -42,6 +42,10 @@ #include "entdec.h" #include "arch.h" +#ifdef ENABLE_DEEP_PLC +#include "lpcnet.h" +#endif + #ifdef __cplusplus extern "C" { #endif @@ -149,6 +153,13 @@ int celt_decoder_get_size(int channels); int celt_decoder_init(CELTDecoder *st, opus_int32 sampling_rate, int channels); +int celt_decode_with_ec_dred(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data, + int len, opus_val16 * OPUS_RESTRICT pcm, int frame_size, ec_dec *dec, int accum +#ifdef ENABLE_DEEP_PLC + ,LPCNetPLCState *lpcnet +#endif + ); + int celt_decode_with_ec(OpusCustomDecoder * OPUS_RESTRICT st, const unsigned char *data, int len, opus_val16 * OPUS_RESTRICT pcm, int frame_size, ec_dec *dec, int accum); @@ -225,23 +236,13 @@ void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N, opus_val16 g0, opus_val16 g1, int tapset0, int tapset1, const opus_val16 *window, int overlap, int arch); -#ifdef NON_STATIC_COMB_FILTER_CONST_C -void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N, - opus_val16 g10, opus_val16 g11, opus_val16 g12); -#endif - -#ifndef OVERRIDE_COMB_FILTER_CONST -# define comb_filter_const(y, x, T, N, g10, g11, g12, arch) \ - ((void)(arch),comb_filter_const_c(y, x, T, N, g10, g11, g12)) -#endif - void init_caps(const CELTMode *m,int *cap,int LM,int C); #ifdef RESYNTH -void deemphasis(celt_sig *in[], opus_val16 *pcm, int N, int C, int downsample, const opus_val16 *coef, celt_sig *mem); +void deemphasis(celt_sig *in[], opus_val16 *pcm, int N, int C, int downsample, const opus_val16 *coef, celt_sig *mem, int accum); void celt_synthesis(const CELTMode *mode, celt_norm *X, celt_sig * out_syn[], opus_val16 *oldBandE, int start, int effEnd, int C, int CC, int isTransient, - int LM, int downsample, int silence); + int LM, int downsample, int silence, int arch); #endif #ifdef __cplusplus diff --git a/media/libopus/celt/celt_decoder.c b/media/libopus/celt/celt_decoder.c index 883dae15d2..743c2031bc 100644 --- a/media/libopus/celt/celt_decoder.c +++ b/media/libopus/celt/celt_decoder.c @@ -51,6 +51,11 @@ #include "celt_lpc.h" #include "vq.h" +#ifdef ENABLE_DEEP_PLC +#include "lpcnet.h" +#include "lpcnet_private.h" +#endif + /* The maximum pitch lag to allow in the pitch-based PLC. It's possible to save CPU time in the PLC pitch search by making this smaller than MAX_PERIOD. The current value corresponds to a pitch of 66.67 Hz. */ @@ -59,9 +64,6 @@ pitch of 480 Hz. */ #define PLC_PITCH_LAG_MIN (100) -#if defined(SMALL_FOOTPRINT) && defined(FIXED_POINT) -#define NORM_ALIASING_HACK -#endif /**********************************************************************/ /* */ /* DECODER */ @@ -69,6 +71,9 @@ /**********************************************************************/ #define DECODE_BUFFER_SIZE 2048 +#define PLC_UPDATE_FRAMES 4 +#define PLC_UPDATE_SAMPLES (PLC_UPDATE_FRAMES*FRAME_SIZE) + /** Decoder state @brief Decoder state */ @@ -82,6 +87,7 @@ struct OpusCustomDecoder { int start, end; int signalling; int disable_inv; + int complexity; int arch; /* Everything beyond this point gets cleared on a reset */ @@ -98,11 +104,18 @@ struct OpusCustomDecoder { opus_val16 postfilter_gain_old; int postfilter_tapset; int postfilter_tapset_old; + int prefilter_and_fold; celt_sig preemph_memD[2]; +#ifdef ENABLE_DEEP_PLC + opus_int16 plc_pcm[PLC_UPDATE_SAMPLES]; + int plc_fill; + float plc_preemphasis_mem; +#endif + celt_sig _decode_mem[1]; /* Size = channels*(DECODE_BUFFER_SIZE+mode->overlap) */ - /* opus_val16 lpc[], Size = channels*LPC_ORDER */ + /* opus_val16 lpc[], Size = channels*CELT_LPC_ORDER */ /* opus_val16 oldEBands[], Size = 2*mode->nbEBands */ /* opus_val16 oldLogE[], Size = 2*mode->nbEBands */ /* opus_val16 oldLogE2[], Size = 2*mode->nbEBands */ @@ -157,7 +170,7 @@ OPUS_CUSTOM_NOSTATIC int opus_custom_decoder_get_size(const CELTMode *mode, int { int size = sizeof(struct CELTDecoder) + (channels*(DECODE_BUFFER_SIZE+mode->overlap)-1)*sizeof(celt_sig) - + channels*LPC_ORDER*sizeof(opus_val16) + + channels*CELT_LPC_ORDER*sizeof(opus_val16) + 4*2*mode->nbEBands*sizeof(opus_val16); return size; } @@ -499,7 +512,100 @@ static int celt_plc_pitch_search(celt_sig *decode_mem[2], int C, int arch) return pitch_index; } -static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) +static void prefilter_and_fold(CELTDecoder * OPUS_RESTRICT st, int N) +{ + int c; + int CC; + int i; + int overlap; + celt_sig *decode_mem[2]; + const OpusCustomMode *mode; + VARDECL(opus_val32, etmp); + mode = st->mode; + overlap = st->overlap; + CC = st->channels; + ALLOC(etmp, overlap, opus_val32); + c=0; do { + decode_mem[c] = st->_decode_mem + c*(DECODE_BUFFER_SIZE+overlap); + } while (++cpostfilter_period_old, st->postfilter_period, overlap, + -st->postfilter_gain_old, -st->postfilter_gain, + st->postfilter_tapset_old, st->postfilter_tapset, NULL, 0, st->arch); + + /* Simulate TDAC on the concealed audio so that it blends with the + MDCT of the next frame. */ + for (i=0;iwindow[i], etmp[overlap-1-i]) + + MULT16_32_Q15(mode->window[overlap-i-1], etmp[i]); + } + } while (++cfec_read_pos; + tmp_fec_skip = lpcnet->fec_skip; + for (i=0;ifec_read_pos = tmp_read_post; + lpcnet->fec_skip = tmp_fec_skip; +} +#endif + +static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM +#ifdef ENABLE_DEEP_PLC + ,LPCNetPLCState *lpcnet +#endif + ) { int c; int i; @@ -527,22 +633,22 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) out_syn[c] = decode_mem[c]+DECODE_BUFFER_SIZE-N; } while (++c_decode_mem+(DECODE_BUFFER_SIZE+overlap)*C); - oldBandE = lpc+C*LPC_ORDER; + oldBandE = lpc+C*CELT_LPC_ORDER; oldLogE = oldBandE + 2*nbEBands; oldLogE2 = oldLogE + 2*nbEBands; backgroundLogE = oldLogE2 + 2*nbEBands; loss_duration = st->loss_duration; start = st->start; +#ifdef ENABLE_DEEP_PLC + noise_based = start != 0 || (lpcnet->fec_fill_pos == 0 && (st->skip_plc || loss_duration >= 80)); +#else noise_based = loss_duration >= 40 || start != 0 || st->skip_plc; +#endif if (noise_based) { /* Noise-based PLC/CNG */ -#ifdef NORM_ALIASING_HACK - celt_norm *X; -#else VARDECL(celt_norm, X); -#endif opus_uint32 seed; int end; int effEnd; @@ -550,18 +656,16 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) end = st->end; effEnd = IMAX(start, IMIN(end, mode->effEBands)); -#ifdef NORM_ALIASING_HACK - /* This is an ugly hack that breaks aliasing rules and would be easily broken, - but it saves almost 4kB of stack. */ - X = (celt_norm*)(out_syn[C-1]+overlap/2); -#else ALLOC(X, C*N, celt_norm); /**< Interleaved normalised MDCTs */ -#endif c=0; do { OPUS_MOVE(decode_mem[c], decode_mem[c]+N, - DECODE_BUFFER_SIZE-N+(overlap>>1)); + DECODE_BUFFER_SIZE-N+overlap); } while (++cprefilter_and_fold) { + prefilter_and_fold(st, N); + } + /* Energy decay */ decay = loss_duration==0 ? QCONST16(1.5f, DB_SHIFT) : QCONST16(.5f, DB_SHIFT); c=0; do @@ -590,6 +694,9 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) st->rng = seed; celt_synthesis(mode, X, out_syn, oldBandE, start, effEnd, C, C, 0, LM, st->downsample, 0, st->arch); + st->prefilter_and_fold = 0; + /* Skip regular PLC until we get two consecutive packets. */ + st->skip_plc = 1; } else { int exc_length; /* Pitch-based PLC */ @@ -597,12 +704,14 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) opus_val16 *exc; opus_val16 fade = Q15ONE; int pitch_index; - VARDECL(opus_val32, etmp); VARDECL(opus_val16, _exc); VARDECL(opus_val16, fir_tmp); if (loss_duration == 0) { +#ifdef ENABLE_DEEP_PLC + if (lpcnet->loaded) update_plc_state(lpcnet, decode_mem, &st->plc_preemphasis_mem, C); +#endif st->last_pitch_index = pitch_index = celt_plc_pitch_search(decode_mem, C, st->arch); } else { pitch_index = st->last_pitch_index; @@ -613,10 +722,9 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) decaying signal, but we can't get more than MAX_PERIOD. */ exc_length = IMIN(2*pitch_index, MAX_PERIOD); - ALLOC(etmp, overlap, opus_val32); - ALLOC(_exc, MAX_PERIOD+LPC_ORDER, opus_val16); + ALLOC(_exc, MAX_PERIOD+CELT_LPC_ORDER, opus_val16); ALLOC(fir_tmp, exc_length, opus_val16); - exc = _exc+LPC_ORDER; + exc = _exc+CELT_LPC_ORDER; window = mode->window; c=0; do { opus_val16 decay; @@ -628,16 +736,16 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) int j; buf = decode_mem[c]; - for (i=0;iarch); + CELT_LPC_ORDER, MAX_PERIOD, st->arch); /* Add a noise floor of -40 dB. */ #ifdef FIXED_POINT ac[0] += SHR32(ac[0],13); @@ -645,7 +753,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) ac[0] *= 1.0001f; #endif /* Use lag windowing to stabilize the Levinson-Durbin recursion. */ - for (i=1;i<=LPC_ORDER;i++) + for (i=1;i<=CELT_LPC_ORDER;i++) { /*ac[i] *= exp(-.5*(2*M_PI*.002*i)*(2*M_PI*.002*i));*/ #ifdef FIXED_POINT @@ -654,7 +762,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) ac[i] -= ac[i]*(0.008f*0.008f)*i*i; #endif } - _celt_lpc(lpc+c*LPC_ORDER, ac, LPC_ORDER); + _celt_lpc(lpc+c*CELT_LPC_ORDER, ac, CELT_LPC_ORDER); #ifdef FIXED_POINT /* For fixed-point, apply bandwidth expansion until we can guarantee that no overflow can happen in the IIR filter. This means: @@ -662,13 +770,13 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) while (1) { opus_val16 tmp=Q15ONE; opus_val32 sum=QCONST16(1., SIG_SHIFT); - for (i=0;iarch); + celt_fir(exc+MAX_PERIOD-exc_length, lpc+c*CELT_LPC_ORDER, + fir_tmp, exc_length, CELT_LPC_ORDER, st->arch); OPUS_COPY(exc+MAX_PERIOD-exc_length, fir_tmp, exc_length); } @@ -737,15 +845,15 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) S1 += SHR32(MULT16_16(tmp, tmp), 10); } { - opus_val16 lpc_mem[LPC_ORDER]; + opus_val16 lpc_mem[CELT_LPC_ORDER]; /* Copy the last decoded samples (prior to the overlap region) to synthesis filter memory so we can have a continuous signal. */ - for (i=0;iarch); #ifdef FIXED_POINT for (i=0; i < extrapolation_len; i++) @@ -792,23 +900,65 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) } } - /* Apply the pre-filter to the MDCT overlap for the next frame because - the post-filter will be re-applied in the decoder after the MDCT - overlap. */ - comb_filter(etmp, buf+DECODE_BUFFER_SIZE, - st->postfilter_period, st->postfilter_period, overlap, - -st->postfilter_gain, -st->postfilter_gain, - st->postfilter_tapset, st->postfilter_tapset, NULL, 0, st->arch); - - /* Simulate TDAC on the concealed audio so that it blends with the - MDCT of the next frame. */ - for (i=0;iloaded && (st->complexity >= 5 || lpcnet->fec_fill_pos > 0)) { + float overlap_mem; + int samples_needed16k; + celt_sig *buf; + VARDECL(float, buf_copy); + buf = decode_mem[0]; + ALLOC(buf_copy, C*overlap, float); + c=0; do { + OPUS_COPY(buf_copy+c*overlap, &decode_mem[c][DECODE_BUFFER_SIZE-N], overlap); + } while (++cplc_fill = 0; + } + while (st->plc_fill < samples_needed16k) { + lpcnet_plc_conceal(lpcnet, &st->plc_pcm[st->plc_fill]); + st->plc_fill += FRAME_SIZE; + } + /* Resample to 48 kHz. */ + for (i=0;i<(N+overlap)/3;i++) { + int j; + float sum; + for (sum=0, j=0;j<17;j++) sum += 3*st->plc_pcm[i+j]*sinc_filter[3*j]; + buf[DECODE_BUFFER_SIZE-N+3*i] = sum; + for (sum=0, j=0;j<16;j++) sum += 3*st->plc_pcm[i+j+1]*sinc_filter[3*j+2]; + buf[DECODE_BUFFER_SIZE-N+3*i+1] = sum; + for (sum=0, j=0;j<16;j++) sum += 3*st->plc_pcm[i+j+1]*sinc_filter[3*j+1]; + buf[DECODE_BUFFER_SIZE-N+3*i+2] = sum; + } + OPUS_MOVE(st->plc_pcm, &st->plc_pcm[N/3], st->plc_fill-N/3); + st->plc_fill -= N/3; + for (i=0;iplc_preemphasis_mem; + st->plc_preemphasis_mem = tmp; + } + overlap_mem = st->plc_preemphasis_mem; + for (i=0;iprefilter_and_fold = 1; } /* Saturate to soemthing large to avoid wrap-around. */ @@ -817,18 +967,18 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) RESTORE_STACK; } -int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data, - int len, opus_val16 * OPUS_RESTRICT pcm, int frame_size, ec_dec *dec, int accum) +int celt_decode_with_ec_dred(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data, + int len, opus_val16 * OPUS_RESTRICT pcm, int frame_size, ec_dec *dec, int accum +#ifdef ENABLE_DEEP_PLC + ,LPCNetPLCState *lpcnet +#endif + ) { int c, i, N; int spread_decision; opus_int32 bits; ec_dec _dec; -#ifdef NORM_ALIASING_HACK - celt_norm *X; -#else VARDECL(celt_norm, X); -#endif VARDECL(int, fine_quant); VARDECL(int, pulses); VARDECL(int, cap); @@ -881,7 +1031,7 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat frame_size *= st->downsample; lpc = (opus_val16*)(st->_decode_mem+(DECODE_BUFFER_SIZE+overlap)*CC); - oldBandE = lpc+CC*LPC_ORDER; + oldBandE = lpc+CC*CELT_LPC_ORDER; oldLogE = oldBandE + 2*nbEBands; oldLogE2 = oldLogE + 2*nbEBands; backgroundLogE = oldLogE2 + 2*nbEBands; @@ -935,15 +1085,25 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat if (data == NULL || len<=1) { - celt_decode_lost(st, N, LM); + celt_decode_lost(st, N, LM +#ifdef ENABLE_DEEP_PLC + , lpcnet +#endif + ); deemphasis(out_syn, pcm, N, CC, st->downsample, mode->preemph, st->preemph_memD, accum); RESTORE_STACK; return frame_size/st->downsample; } +#ifdef ENABLE_DEEP_PLC + else { + /* FIXME: This is a bit of a hack just to make sure opus_decode_native() knows we're no longer in PLC. */ + if (lpcnet) lpcnet->blend = 0; + } +#endif /* Check if there are at least two packets received consecutively before * turning on the pitch-based PLC */ - st->skip_plc = st->loss_duration != 0; + if (st->loss_duration == 0) st->skip_plc = 0; if (dec == NULL) { @@ -1006,6 +1166,36 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat /* Decode the global flags (first symbols in the stream) */ intra_ener = tell+3<=total_bits ? ec_dec_bit_logp(dec, 3) : 0; + /* If recovering from packet loss, make sure we make the energy prediction safe to reduce the + risk of getting loud artifacts. */ + if (!intra_ener && st->loss_duration != 0) { + c=0; do + { + opus_val16 safety = 0; + int missing = IMIN(10, st->loss_duration>>LM); + if (LM==0) safety = QCONST16(1.5f,DB_SHIFT); + else if (LM==1) safety = QCONST16(.5f,DB_SHIFT); + for (i=start;iprefilter_and_fold) { + prefilter_and_fold(st, N); + } celt_synthesis(mode, X, out_syn, oldBandE, start, effEnd, C, CC, isTransient, LM, st->downsample, silence, st->arch); @@ -1173,6 +1359,7 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat deemphasis(out_syn, pcm, N, CC, st->downsample, mode->preemph, st->preemph_memD, accum); st->loss_duration = 0; + st->prefilter_and_fold = 0; RESTORE_STACK; if (ec_tell(dec) > 8*len) return OPUS_INTERNAL_ERROR; @@ -1181,6 +1368,15 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat return frame_size/st->downsample; } +int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data, + int len, opus_val16 * OPUS_RESTRICT pcm, int frame_size, ec_dec *dec, int accum) +{ + return celt_decode_with_ec_dred(st, data, len, pcm, frame_size, dec, accum +#ifdef ENABLE_DEEP_PLC + , NULL +#endif + ); +} #ifdef CUSTOM_MODES @@ -1254,6 +1450,26 @@ int opus_custom_decoder_ctl(CELTDecoder * OPUS_RESTRICT st, int request, ...) va_start(ap, request); switch (request) { + case OPUS_SET_COMPLEXITY_REQUEST: + { + opus_int32 value = va_arg(ap, opus_int32); + if(value<0 || value>10) + { + goto bad_arg; + } + st->complexity = value; + } + break; + case OPUS_GET_COMPLEXITY_REQUEST: + { + opus_int32 *value = va_arg(ap, opus_int32*); + if (!value) + { + goto bad_arg; + } + *value = st->complexity; + } + break; case CELT_SET_START_BAND_REQUEST: { opus_int32 value = va_arg(ap, opus_int32); @@ -1300,7 +1516,7 @@ int opus_custom_decoder_ctl(CELTDecoder * OPUS_RESTRICT st, int request, ...) int i; opus_val16 *lpc, *oldBandE, *oldLogE, *oldLogE2; lpc = (opus_val16*)(st->_decode_mem+(DECODE_BUFFER_SIZE+st->overlap)*st->channels); - oldBandE = lpc+st->channels*LPC_ORDER; + oldBandE = lpc+st->channels*CELT_LPC_ORDER; oldLogE = oldBandE + 2*st->mode->nbEBands; oldLogE2 = oldLogE + 2*st->mode->nbEBands; OPUS_CLEAR((char*)&st->DECODER_RESET_START, diff --git a/media/libopus/celt/celt_encoder.c b/media/libopus/celt/celt_encoder.c index 637d442cf7..7f32a801c6 100644 --- a/media/libopus/celt/celt_encoder.c +++ b/media/libopus/celt/celt_encoder.c @@ -281,6 +281,9 @@ static int transient_analysis(const opus_val32 * OPUS_RESTRICT in, int len, int /* High-pass filter: (1 - 2*z^-1 + z^-2) / (1 - z^-1 + .5*z^-2) */ for (i=0;i 50 && LM>=1 && !lfe) + /* Make sure that dynamic allocation can't make us bust the budget. + We enable the feature starting at 24 kb/s for 20-ms frames + and 96 kb/s for 2.5 ms frames. */ + if (effectiveBytes >= (30 + 5*LM) && !lfe) { int last=0; c=0;do @@ -1042,30 +1057,38 @@ static opus_val16 dynalloc_analysis(const opus_val16 *bandLogE, const opus_val16 opus_val16 offset; opus_val16 tmp; opus_val16 *f; + OPUS_COPY(bandLogE3, &bandLogE2[c*nbEBands], end); + if (LM==0) { + /* For 2.5 ms frames, the first 8 bands have just one bin, so the + energy is highly unreliable (high variance). For that reason, + we take the max with the previous energy so that at least 2 bins + are getting used. */ + for (i=0;i bandLogE2[c*nbEBands+i-1]+QCONST16(.5f,DB_SHIFT)) + if (bandLogE3[i] > bandLogE3[i-1]+QCONST16(.5f,DB_SHIFT)) last=i; - f[i] = MIN16(f[i-1]+QCONST16(1.5f,DB_SHIFT), bandLogE2[c*nbEBands+i]); + f[i] = MIN16(f[i-1]+QCONST16(1.5f,DB_SHIFT), bandLogE3[i]); } for (i=last-1;i>=0;i--) - f[i] = MIN16(f[i], MIN16(f[i+1]+QCONST16(2.f,DB_SHIFT), bandLogE2[c*nbEBands+i])); + f[i] = MIN16(f[i], MIN16(f[i+1]+QCONST16(2.f,DB_SHIFT), bandLogE3[i])); /* Combine with a median filter to avoid dynalloc triggering unnecessarily. The "offset" value controls how conservative we are -- a higher offset reduces the impact of the median filter and makes dynalloc use more bits. */ offset = QCONST16(1.f, DB_SHIFT); for (i=2;ibitrate*frame_size; if (tell>1) - tmp += tell; + tmp += tell*mode->Fs; if (st->bitrate!=OPUS_BITRATE_MAX) + { nbCompressedBytes = IMAX(2, IMIN(nbCompressedBytes, (tmp+4*mode->Fs)/(8*mode->Fs)-!!st->signalling)); + ec_enc_shrink(enc, nbCompressedBytes); + } effectiveBytes = nbCompressedBytes - nbFilledBytes; } equiv_rate = ((opus_int32)nbCompressedBytes*8*50 << (3-LM)) - (40*C+20)*((400>>LM) - 50); @@ -1882,7 +1908,7 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm, ALLOC(importance, nbEBands, int); ALLOC(spread_weight, nbEBands, int); - maxDepth = dynalloc_analysis(bandLogE, bandLogE2, nbEBands, start, end, C, offsets, + maxDepth = dynalloc_analysis(bandLogE, bandLogE2, oldBandE, nbEBands, start, end, C, offsets, st->lsb_depth, mode->logN, isTransient, st->vbr, st->constrained_vbr, eBands, LM, effectiveBytes, &tot_boost, st->lfe, surround_dynalloc, &st->analysis, importance, spread_weight); @@ -2246,7 +2272,7 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm, if (anti_collapse_on) { anti_collapse(mode, X, collapse_masks, LM, C, N, - start, end, oldBandE, oldLogE, oldLogE2, pulses, st->rng); + start, end, oldBandE, oldLogE, oldLogE2, pulses, st->rng, st->arch); } c=0; do { @@ -2265,15 +2291,15 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm, st->prefilter_period_old=IMAX(st->prefilter_period_old, COMBFILTER_MINPERIOD); comb_filter(out_mem[c], out_mem[c], st->prefilter_period_old, st->prefilter_period, mode->shortMdctSize, st->prefilter_gain_old, st->prefilter_gain, st->prefilter_tapset_old, st->prefilter_tapset, - mode->window, overlap); + mode->window, overlap, st->arch); if (LM!=0) comb_filter(out_mem[c]+mode->shortMdctSize, out_mem[c]+mode->shortMdctSize, st->prefilter_period, pitch_index, N-mode->shortMdctSize, st->prefilter_gain, gain1, st->prefilter_tapset, prefilter_tapset, - mode->window, overlap); + mode->window, overlap, st->arch); } while (++cupsample, mode->preemph, st->preemph_memD); + deemphasis(out_mem, (opus_val16*)pcm, N, CC, st->upsample, mode->preemph, st->preemph_memD, 0); st->prefilter_period_old = st->prefilter_period; st->prefilter_gain_old = st->prefilter_gain; st->prefilter_tapset_old = st->prefilter_tapset; diff --git a/media/libopus/celt/celt_lpc.c b/media/libopus/celt/celt_lpc.c index f91721bcab..fabca65cb3 100644 --- a/media/libopus/celt/celt_lpc.c +++ b/media/libopus/celt/celt_lpc.c @@ -44,7 +44,7 @@ int p opus_val32 r; opus_val32 error = ac[0]; #ifdef FIXED_POINT - opus_val32 lpc[LPC_ORDER]; + opus_val32 lpc[CELT_LPC_ORDER]; #else float *lpc = _lpc; #endif @@ -158,7 +158,17 @@ void celt_fir_c( sum[1] = SHL32(EXTEND32(x[i+1]), SIG_SHIFT); sum[2] = SHL32(EXTEND32(x[i+2]), SIG_SHIFT); sum[3] = SHL32(EXTEND32(x[i+3]), SIG_SHIFT); - xcorr_kernel(rnum, x+i-ord, sum, ord, arch); +#if defined(OPUS_CHECK_ASM) && defined(FIXED_POINT) + { + opus_val32 sum_c[4]; + memcpy(sum_c, sum, sizeof(sum_c)); + xcorr_kernel_c(rnum, x+i-ord, sum_c, ord); +#endif + xcorr_kernel(rnum, x+i-ord, sum, ord, arch); +#if defined(OPUS_CHECK_ASM) && defined(FIXED_POINT) + celt_assert(memcmp(sum, sum_c, sizeof(sum)) == 0); + } +#endif y[i ] = SROUND16(sum[0], SIG_SHIFT); y[i+1] = SROUND16(sum[1], SIG_SHIFT); y[i+2] = SROUND16(sum[2], SIG_SHIFT); @@ -222,8 +232,17 @@ void celt_iir(const opus_val32 *_x, sum[1]=_x[i+1]; sum[2]=_x[i+2]; sum[3]=_x[i+3]; - xcorr_kernel(rden, y+i, sum, ord, arch); - +#if defined(OPUS_CHECK_ASM) && defined(FIXED_POINT) + { + opus_val32 sum_c[4]; + memcpy(sum_c, sum, sizeof(sum_c)); + xcorr_kernel_c(rden, y+i, sum_c, ord); +#endif + xcorr_kernel(rden, y+i, sum, ord, arch); +#if defined(OPUS_CHECK_ASM) && defined(FIXED_POINT) + celt_assert(memcmp(sum, sum_c, sizeof(sum)) == 0); + } +#endif /* Patch up the result to compensate for the fact that this is an IIR */ y[i+ord ] = -SROUND16(sum[0],SIG_SHIFT); _y[i ] = sum[0]; diff --git a/media/libopus/celt/celt_lpc.h b/media/libopus/celt/celt_lpc.h index a4c5fd6ea5..97dee82f02 100644 --- a/media/libopus/celt/celt_lpc.h +++ b/media/libopus/celt/celt_lpc.h @@ -35,7 +35,7 @@ #include "x86/celt_lpc_sse.h" #endif -#define LPC_ORDER 24 +#define CELT_LPC_ORDER 24 void _celt_lpc(opus_val16 *_lpc, const opus_val32 *ac, int p); diff --git a/media/libopus/celt/cpu_support.h b/media/libopus/celt/cpu_support.h index 7b5c56ca90..9f13d8aecf 100644 --- a/media/libopus/celt/cpu_support.h +++ b/media/libopus/celt/cpu_support.h @@ -35,19 +35,20 @@ (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)) #include "arm/armcpu.h" -/* We currently support 4 ARM variants: +/* We currently support 5 ARM variants: * arch[0] -> ARMv4 * arch[1] -> ARMv5E * arch[2] -> ARMv6 * arch[3] -> NEON + * arch[4] -> NEON+DOTPROD */ -#define OPUS_ARCHMASK 3 +#define OPUS_ARCHMASK 7 #elif defined(OPUS_HAVE_RTCD) && \ ((defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \ (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \ (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \ - (defined(OPUS_X86_MAY_HAVE_AVX) && !defined(OPUS_X86_PRESUME_AVX))) + (defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(OPUS_X86_PRESUME_AVX2))) #include "x86/x86cpu.h" /* We currently support 5 x86 variants: diff --git a/media/libopus/celt/entdec.c b/media/libopus/celt/entdec.c index 0b3433ed8b..027aa24bca 100644 --- a/media/libopus/celt/entdec.c +++ b/media/libopus/celt/entdec.c @@ -195,6 +195,27 @@ int ec_dec_icdf(ec_dec *_this,const unsigned char *_icdf,unsigned _ftb){ return ret; } +int ec_dec_icdf16(ec_dec *_this,const opus_uint16 *_icdf,unsigned _ftb){ + opus_uint32 r; + opus_uint32 d; + opus_uint32 s; + opus_uint32 t; + int ret; + s=_this->rng; + d=_this->val; + r=s>>_ftb; + ret=-1; + do{ + t=s; + s=IMUL32(r,_icdf[++ret]); + } + while(dval=d-s; + _this->rng=t-s; + ec_dec_normalize(_this); + return ret; +} + opus_uint32 ec_dec_uint(ec_dec *_this,opus_uint32 _ft){ unsigned ft; unsigned s; diff --git a/media/libopus/celt/entdec.h b/media/libopus/celt/entdec.h index 025fc1870d..c81f26fdb2 100644 --- a/media/libopus/celt/entdec.h +++ b/media/libopus/celt/entdec.h @@ -81,6 +81,16 @@ int ec_dec_bit_logp(ec_dec *_this,unsigned _logp); Return: The decoded symbol s.*/ int ec_dec_icdf(ec_dec *_this,const unsigned char *_icdf,unsigned _ftb); +/*Decodes a symbol given an "inverse" CDF table. + No call to ec_dec_update() is necessary after this call. + _icdf: The "inverse" CDF, such that symbol s falls in the range + [s>0?ft-_icdf[s-1]:0,ft-_icdf[s]), where ft=1<<_ftb. + The values must be monotonically non-increasing, and the last value + must be 0. + _ftb: The number of bits of precision in the cumulative distribution. + Return: The decoded symbol s.*/ +int ec_dec_icdf16(ec_dec *_this,const opus_uint16 *_icdf,unsigned _ftb); + /*Extracts a raw unsigned integer with a non-power-of-2 range from the stream. The bits must have been encoded with ec_enc_uint(). No call to ec_dec_update() is necessary after this call. diff --git a/media/libopus/celt/entenc.c b/media/libopus/celt/entenc.c index f1750d25b8..69c6f835d0 100644 --- a/media/libopus/celt/entenc.c +++ b/media/libopus/celt/entenc.c @@ -172,6 +172,17 @@ void ec_enc_icdf(ec_enc *_this,int _s,const unsigned char *_icdf,unsigned _ftb){ ec_enc_normalize(_this); } +void ec_enc_icdf16(ec_enc *_this,int _s,const opus_uint16 *_icdf,unsigned _ftb){ + opus_uint32 r; + r=_this->rng>>_ftb; + if(_s>0){ + _this->val+=_this->rng-IMUL32(r,_icdf[_s-1]); + _this->rng=IMUL32(r,_icdf[_s-1]-_icdf[_s]); + } + else _this->rng-=IMUL32(r,_icdf[_s]); + ec_enc_normalize(_this); +} + void ec_enc_uint(ec_enc *_this,opus_uint32 _fl,opus_uint32 _ft){ unsigned ft; unsigned fl; diff --git a/media/libopus/celt/entenc.h b/media/libopus/celt/entenc.h index f502eaf662..010874bbc1 100644 --- a/media/libopus/celt/entenc.h +++ b/media/libopus/celt/entenc.h @@ -64,6 +64,15 @@ void ec_enc_bit_logp(ec_enc *_this,int _val,unsigned _logp); _ftb: The number of bits of precision in the cumulative distribution.*/ void ec_enc_icdf(ec_enc *_this,int _s,const unsigned char *_icdf,unsigned _ftb); +/*Encodes a symbol given an "inverse" CDF table. + _s: The index of the symbol to encode. + _icdf: The "inverse" CDF, such that symbol _s falls in the range + [_s>0?ft-_icdf[_s-1]:0,ft-_icdf[_s]), where ft=1<<_ftb. + The values must be monotonically non-increasing, and the last value + must be 0. + _ftb: The number of bits of precision in the cumulative distribution.*/ +void ec_enc_icdf16(ec_enc *_this,int _s,const opus_uint16 *_icdf,unsigned _ftb); + /*Encodes a raw unsigned integer in the stream. _fl: The integer to encode. _ft: The number of integers that can be encoded (one more than the max). diff --git a/media/libopus/celt/laplace.c b/media/libopus/celt/laplace.c index a7bca874b6..2180966662 100644 --- a/media/libopus/celt/laplace.c +++ b/media/libopus/celt/laplace.c @@ -132,3 +132,104 @@ int ec_laplace_decode(ec_dec *dec, unsigned fs, int decay) ec_dec_update(dec, fl, IMIN(fl+fs,32768), 32768); return val; } + +void ec_laplace_encode_p0(ec_enc *enc, int value, opus_uint16 p0, opus_uint16 decay) +{ + int s; + opus_uint16 sign_icdf[3]; + sign_icdf[0] = 32768-p0; + sign_icdf[1] = sign_icdf[0]/2; + sign_icdf[2] = 0; + s = value == 0 ? 0 : (value > 0 ? 1 : 2); + ec_enc_icdf16(enc, s, sign_icdf, 15); + value = abs(value); + if (value) + { + int i; + opus_uint16 icdf[8]; + icdf[0] = IMAX(7, decay); + for (i=1;i<7;i++) + { + icdf[i] = IMAX(7-i, (icdf[i-1] * (opus_int32)decay) >> 15); + } + icdf[7] = 0; + value--; + do { + ec_enc_icdf16(enc, IMIN(value, 7), icdf, 15); + value -= 7; + } while (value >= 0); + } +} + +int ec_laplace_decode_p0(ec_dec *dec, opus_uint16 p0, opus_uint16 decay) +{ + int s; + int value; + opus_uint16 sign_icdf[3]; + sign_icdf[0] = 32768-p0; + sign_icdf[1] = sign_icdf[0]/2; + sign_icdf[2] = 0; + s = ec_dec_icdf16(dec, sign_icdf, 15); + if (s==2) s = -1; + if (s != 0) + { + int i; + int v; + opus_uint16 icdf[8]; + icdf[0] = IMAX(7, decay); + for (i=1;i<7;i++) + { + icdf[i] = IMAX(7-i, (icdf[i-1] * (opus_int32)decay) >> 15); + } + icdf[7] = 0; + value = 1; + do { + v = ec_dec_icdf16(dec, icdf, 15); + value += v; + } while (v == 7); + return s*value; + } else return 0; +} + +#if 0 + +#include +#define NB_VALS 10 +#define DATA_SIZE 10000 +int main() { + ec_enc enc; + ec_dec dec; + unsigned char *ptr; + int i; + int decay, p0; + int val[NB_VALS] = {6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + /*for (i=0;i #include -/** Opus wrapper for malloc(). To do your own dynamic allocation, all you need to do is replace this function and opus_free */ +/** Opus wrapper for malloc(). To do your own dynamic allocation replace this function, opus_realloc, and opus_free */ #ifndef OVERRIDE_OPUS_ALLOC static OPUS_INLINE void *opus_alloc (size_t size) { @@ -49,7 +49,15 @@ static OPUS_INLINE void *opus_alloc (size_t size) } #endif -/** Same as celt_alloc(), except that the area is only needed inside a CELT call (might cause problem with wideband though) */ +#ifndef OVERRIDE_OPUS_REALLOC +static OPUS_INLINE void *opus_realloc (void *ptr, size_t size) +{ + return realloc(ptr, size); +} +#endif + +/** Used only for non-threadsafe pseudostack. + If desired, this can always return the same area of memory rather than allocating a new one every time. */ #ifndef OVERRIDE_OPUS_ALLOC_SCRATCH static OPUS_INLINE void *opus_alloc_scratch (size_t size) { @@ -58,7 +66,7 @@ static OPUS_INLINE void *opus_alloc_scratch (size_t size) } #endif -/** Opus wrapper for free(). To do your own dynamic allocation, all you need to do is replace this function and opus_alloc */ +/** Opus wrapper for free(). To do your own dynamic allocation replace this function, opus_realloc, and opus_free */ #ifndef OVERRIDE_OPUS_FREE static OPUS_INLINE void opus_free (void *ptr) { diff --git a/media/libopus/celt/pitch.c b/media/libopus/celt/pitch.c index 7998db4164..e33c60a3bf 100644 --- a/media/libopus/celt/pitch.c +++ b/media/libopus/celt/pitch.c @@ -262,7 +262,16 @@ celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y, for (i=0;i +#include "x86cpu.h" +#include "pitch.h" + +#if defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(FIXED_POINT) + +/* Like the "regular" xcorr_kernel(), but computes 8 results at a time. */ +static void xcorr_kernel_avx(const float *x, const float *y, float sum[8], int len) +{ + __m256 xsum0, xsum1, xsum2, xsum3, xsum4, xsum5, xsum6, xsum7; + xsum7 = xsum6 = xsum5 = xsum4 = xsum3 = xsum2 = xsum1 = xsum0 = _mm256_setzero_ps(); + int i; + __m256 x0; + /* Compute 8 inner products using partial sums. */ + for (i=0;i0); + (void)arch; + for (i=0;iHW_SSE = (info[3] & (1 << 25)) != 0; cpu_feature->HW_SSE2 = (info[3] & (1 << 26)) != 0; cpu_feature->HW_SSE41 = (info[2] & (1 << 19)) != 0; - cpu_feature->HW_AVX = (info[2] & (1 << 28)) != 0; + cpu_feature->HW_AVX2 = (info[2] & (1 << 28)) != 0 && (info[2] & (1 << 12)) != 0; + if (cpu_feature->HW_AVX2 && nIds >= 7) { + cpuid(info, 7); + cpu_feature->HW_AVX2 = cpu_feature->HW_AVX2 && (info[1] & (1 << 5)) != 0; + } else { + cpu_feature->HW_AVX2 = 0; + } } else { cpu_feature->HW_SSE = 0; cpu_feature->HW_SSE2 = 0; cpu_feature->HW_SSE41 = 0; - cpu_feature->HW_AVX = 0; + cpu_feature->HW_AVX2 = 0; } } @@ -157,7 +163,7 @@ static int opus_select_arch_impl(void) } arch++; - if (!cpu_feature.HW_AVX) + if (!cpu_feature.HW_AVX2) { return arch; } diff --git a/media/libopus/celt/x86/x86cpu.h b/media/libopus/celt/x86/x86cpu.h index 04e80489b1..8ae9be8d8f 100644 --- a/media/libopus/celt/x86/x86cpu.h +++ b/media/libopus/celt/x86/x86cpu.h @@ -46,28 +46,53 @@ # define MAY_HAVE_SSE4_1(name) name ## _c # endif -# if defined(OPUS_X86_MAY_HAVE_AVX) -# define MAY_HAVE_AVX(name) name ## _avx +# if defined(OPUS_X86_MAY_HAVE_AVX2) +# define MAY_HAVE_AVX2(name) name ## _avx2 # else -# define MAY_HAVE_AVX(name) name ## _c +# define MAY_HAVE_AVX2(name) name ## _c # endif -# if defined(OPUS_HAVE_RTCD) +# if defined(OPUS_HAVE_RTCD) && \ + ((defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \ + (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \ + (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \ + (defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(OPUS_X86_PRESUME_AVX2))) int opus_select_arch(void); # endif +# if defined(OPUS_X86_MAY_HAVE_SSE2) +# include "opus_defines.h" + /*MOVD should not impose any alignment restrictions, but the C standard does, and UBSan will report errors if we actually make unaligned accesses. Use this to work around those restrictions (which should hopefully all get - optimized to a single MOVD instruction).*/ -#define OP_LOADU_EPI32(x) \ - (int)((*(unsigned char *)(x) | *((unsigned char *)(x) + 1) << 8U |\ - *((unsigned char *)(x) + 2) << 16U | (opus_uint32)*((unsigned char *)(x) + 3) << 24U)) + optimized to a single MOVD instruction). + GCC implemented _mm_loadu_si32() since GCC 11; HOWEVER, there is a bug! + https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99754 */ +# if !defined(_MSC_VER) && !OPUS_GNUC_PREREQ(11,3) && !(defined(__clang__) && (__clang_major__ >= 8)) +# include +# include + +# ifdef _mm_loadu_si32 +# undef _mm_loadu_si32 +# endif +# define _mm_loadu_si32 WORKAROUND_mm_loadu_si32 +static inline __m128i WORKAROUND_mm_loadu_si32(void const* mem_addr) { + int val; + memcpy(&val, mem_addr, sizeof(val)); + return _mm_cvtsi32_si128(val); +} +# elif defined(_MSC_VER) + /* MSVC needs this for _mm_loadu_si32 */ +# include +# endif -#define OP_CVTEPI8_EPI32_M32(x) \ - (_mm_cvtepi8_epi32(_mm_cvtsi32_si128(OP_LOADU_EPI32(x)))) +# define OP_CVTEPI8_EPI32_M32(x) \ + (_mm_cvtepi8_epi32(_mm_loadu_si32(x))) -#define OP_CVTEPI16_EPI32_M64(x) \ - (_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(x)))) +# define OP_CVTEPI16_EPI32_M64(x) \ + (_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(void*)(x)))) + +# endif #endif -- cgit v1.2.3