diff options
Diffstat (limited to 'media/libopus/celt')
34 files changed, 1014 insertions, 183 deletions
diff --git a/media/libopus/celt/arm/arm_celt_map.c b/media/libopus/celt/arm/arm_celt_map.c index ca988b66f5..cbaea49579 100644 --- a/media/libopus/celt/arm/arm_celt_map.c +++ b/media/libopus/celt/arm/arm_celt_map.c @@ -40,7 +40,8 @@ opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, c celt_inner_prod_c, /* ARMv4 */ celt_inner_prod_c, /* EDSP */ celt_inner_prod_c, /* Media */ - celt_inner_prod_neon /* NEON */ + celt_inner_prod_neon,/* NEON */ + celt_inner_prod_neon /* DOTPROD */ }; void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, @@ -48,7 +49,8 @@ void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const o dual_inner_prod_c, /* ARMv4 */ dual_inner_prod_c, /* EDSP */ dual_inner_prod_c, /* Media */ - dual_inner_prod_neon /* NEON */ + dual_inner_prod_neon,/* NEON */ + dual_inner_prod_neon /* DOTPROD */ }; # endif @@ -61,7 +63,8 @@ opus_val32 (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *, celt_pitch_xcorr_c, /* ARMv4 */ MAY_HAVE_EDSP(celt_pitch_xcorr), /* EDSP */ MAY_HAVE_MEDIA(celt_pitch_xcorr), /* Media */ - MAY_HAVE_NEON(celt_pitch_xcorr) /* NEON */ + MAY_HAVE_NEON(celt_pitch_xcorr), /* NEON */ + MAY_HAVE_NEON(celt_pitch_xcorr) /* DOTPROD */ }; # endif @@ -72,7 +75,8 @@ void (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *, celt_pitch_xcorr_c, /* ARMv4 */ celt_pitch_xcorr_c, /* EDSP */ celt_pitch_xcorr_c, /* Media */ - celt_pitch_xcorr_float_neon /* Neon */ + celt_pitch_xcorr_float_neon, /* Neon */ + celt_pitch_xcorr_float_neon /* DOTPROD */ }; # endif # endif /* FIXED_POINT */ @@ -90,6 +94,7 @@ void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])( xcorr_kernel_c, /* EDSP */ xcorr_kernel_c, /* Media */ xcorr_kernel_neon_fixed, /* Neon */ + xcorr_kernel_neon_fixed /* DOTPROD */ }; #endif @@ -101,14 +106,16 @@ int (*const OPUS_FFT_ALLOC_ARCH_IMPL[OPUS_ARCHMASK+1])(kiss_fft_state *st) = { opus_fft_alloc_arch_c, /* ARMv4 */ opus_fft_alloc_arch_c, /* EDSP */ opus_fft_alloc_arch_c, /* Media */ - opus_fft_alloc_arm_neon /* Neon with NE10 library support */ + opus_fft_alloc_arm_neon, /* Neon with NE10 library support */ + opus_fft_alloc_arm_neon /* DOTPROD with NE10 library support */ }; void (*const OPUS_FFT_FREE_ARCH_IMPL[OPUS_ARCHMASK+1])(kiss_fft_state *st) = { opus_fft_free_arch_c, /* ARMv4 */ opus_fft_free_arch_c, /* EDSP */ opus_fft_free_arch_c, /* Media */ - opus_fft_free_arm_neon /* Neon with NE10 */ + opus_fft_free_arm_neon, /* Neon with NE10 */ + opus_fft_free_arm_neon /* DOTPROD with NE10 */ }; # endif /* CUSTOM_MODES */ @@ -118,7 +125,8 @@ void (*const OPUS_FFT[OPUS_ARCHMASK+1])(const kiss_fft_state *cfg, opus_fft_c, /* ARMv4 */ opus_fft_c, /* EDSP */ opus_fft_c, /* Media */ - opus_fft_neon /* Neon with NE10 */ + opus_fft_neon, /* Neon with NE10 */ + opus_fft_neon /* DOTPROD with NE10 */ }; void (*const OPUS_IFFT[OPUS_ARCHMASK+1])(const kiss_fft_state *cfg, @@ -127,7 +135,8 @@ void (*const OPUS_IFFT[OPUS_ARCHMASK+1])(const kiss_fft_state *cfg, opus_ifft_c, /* ARMv4 */ opus_ifft_c, /* EDSP */ opus_ifft_c, /* Media */ - opus_ifft_neon /* Neon with NE10 */ + opus_ifft_neon, /* Neon with NE10 */ + opus_ifft_neon /* DOTPROD with NE10 */ }; void (*const CLT_MDCT_FORWARD_IMPL[OPUS_ARCHMASK+1])(const mdct_lookup *l, @@ -139,7 +148,8 @@ void (*const CLT_MDCT_FORWARD_IMPL[OPUS_ARCHMASK+1])(const mdct_lookup *l, clt_mdct_forward_c, /* ARMv4 */ clt_mdct_forward_c, /* EDSP */ clt_mdct_forward_c, /* Media */ - clt_mdct_forward_neon /* Neon with NE10 */ + clt_mdct_forward_neon, /* Neon with NE10 */ + clt_mdct_forward_neon /* DOTPROD with NE10 */ }; void (*const CLT_MDCT_BACKWARD_IMPL[OPUS_ARCHMASK+1])(const mdct_lookup *l, @@ -151,7 +161,8 @@ void (*const CLT_MDCT_BACKWARD_IMPL[OPUS_ARCHMASK+1])(const mdct_lookup *l, clt_mdct_backward_c, /* ARMv4 */ clt_mdct_backward_c, /* EDSP */ clt_mdct_backward_c, /* Media */ - clt_mdct_backward_neon /* Neon with NE10 */ + clt_mdct_backward_neon, /* Neon with NE10 */ + clt_mdct_backward_neon /* DOTPROD with NE10 */ }; # endif /* HAVE_ARM_NE10 */ diff --git a/media/libopus/celt/arm/armcpu.c b/media/libopus/celt/arm/armcpu.c index c7d16e6d61..06a53435b8 100644 --- a/media/libopus/celt/arm/armcpu.c +++ b/media/libopus/celt/arm/armcpu.c @@ -43,6 +43,7 @@ #define OPUS_CPU_ARM_EDSP_FLAG (1<<OPUS_ARCH_ARM_EDSP) #define OPUS_CPU_ARM_MEDIA_FLAG (1<<OPUS_ARCH_ARM_MEDIA) #define OPUS_CPU_ARM_NEON_FLAG (1<<OPUS_ARCH_ARM_NEON) +#define OPUS_CPU_ARM_DOTPROD_FLAG (1<<OPUS_ARCH_ARM_DOTPROD) #if defined(_MSC_VER) /*For GetExceptionCode() and EXCEPTION_ILLEGAL_INSTRUCTION.*/ @@ -126,6 +127,14 @@ opus_uint32 opus_cpu_capabilities(void) p = strstr(buf, " neon"); if(p != NULL && (p[5] == ' ' || p[5] == '\n')) flags |= OPUS_CPU_ARM_NEON_FLAG; + p = strstr(buf, " asimd"); + if(p != NULL && (p[6] == ' ' || p[6] == '\n')) + flags |= OPUS_CPU_ARM_NEON_FLAG | OPUS_CPU_ARM_MEDIA_FLAG | OPUS_CPU_ARM_EDSP_FLAG; +# endif +# if defined(OPUS_ARM_MAY_HAVE_DOTPROD) + p = strstr(buf, " asimddp"); + if(p != NULL && (p[8] == ' ' || p[8] == '\n')) + flags |= OPUS_CPU_ARM_DOTPROD_FLAG; # endif } # endif @@ -144,10 +153,44 @@ opus_uint32 opus_cpu_capabilities(void) # endif } +#if defined(OPUS_ARM_PRESUME_AARCH64_NEON_INTR) + flags |= OPUS_CPU_ARM_EDSP_FLAG | OPUS_CPU_ARM_MEDIA_FLAG | OPUS_CPU_ARM_NEON_FLAG; +# if defined(OPUS_ARM_PRESUME_DOTPROD) + flags |= OPUS_CPU_ARM_DOTPROD_FLAG; +# endif +#endif + fclose(cpuinfo); } return flags; } + +#elif defined(__APPLE__) +#include <sys/types.h> +#include <sys/sysctl.h> + +opus_uint32 opus_cpu_capabilities(void) +{ + opus_uint32 flags = 0; + +#if defined(OPUS_ARM_MAY_HAVE_DOTPROD) + size_t size = sizeof(uint32_t); + uint32_t value = 0; + if (!sysctlbyname("hw.optional.arm.FEAT_DotProd", &value, &size, NULL, 0) && value) + { + flags |= OPUS_CPU_ARM_DOTPROD_FLAG; + } +#endif + +#if defined(OPUS_ARM_PRESUME_AARCH64_NEON_INTR) + flags |= OPUS_CPU_ARM_EDSP_FLAG | OPUS_CPU_ARM_MEDIA_FLAG | OPUS_CPU_ARM_NEON_FLAG; +# if defined(OPUS_ARM_PRESUME_DOTPROD) + flags |= OPUS_CPU_ARM_DOTPROD_FLAG; +# endif +#endif + return flags; +} + #else /* The feature registers which can tell us what the processor supports are * accessible in priveleged modes only, so we can't have a general user-space @@ -180,7 +223,13 @@ static int opus_select_arch_impl(void) } arch++; - celt_assert(arch == OPUS_ARCH_ARM_NEON); + if(!(flags & OPUS_CPU_ARM_DOTPROD_FLAG)) { + celt_assert(arch == OPUS_ARCH_ARM_NEON); + return arch; + } + arch++; + + celt_assert(arch == OPUS_ARCH_ARM_DOTPROD); return arch; } diff --git a/media/libopus/celt/arm/armcpu.h b/media/libopus/celt/arm/armcpu.h index 820262ff5f..6d5803d81a 100644 --- a/media/libopus/celt/arm/armcpu.h +++ b/media/libopus/celt/arm/armcpu.h @@ -46,6 +46,12 @@ # define MAY_HAVE_NEON(name) MAY_HAVE_MEDIA(name) # endif +# if defined(OPUS_ARM_MAY_HAVE_DOTPROD) +# define MAY_HAVE_DOTPROD(name) name ## _dotprod +# else +# define MAY_HAVE_DOTPROD(name) MAY_HAVE_NEON(name) +# endif + # if defined(OPUS_ARM_PRESUME_EDSP) # define PRESUME_EDSP(name) name ## _edsp # else @@ -64,6 +70,12 @@ # define PRESUME_NEON(name) PRESUME_MEDIA(name) # endif +# if defined(OPUS_ARM_PRESUME_DOTPROD) +# define PRESUME_DOTPROD(name) name ## _dotprod +# else +# define PRESUME_DOTPROD(name) PRESUME_NEON(name) +# endif + # if defined(OPUS_HAVE_RTCD) int opus_select_arch(void); @@ -71,6 +83,7 @@ int opus_select_arch(void); #define OPUS_ARCH_ARM_EDSP (1) #define OPUS_ARCH_ARM_MEDIA (2) #define OPUS_ARCH_ARM_NEON (3) +#define OPUS_ARCH_ARM_DOTPROD (4) # endif diff --git a/media/libopus/celt/arm/celt_neon_intr.c b/media/libopus/celt/arm/celt_neon_intr.c index effda769d0..250f836218 100644 --- a/media/libopus/celt/arm/celt_neon_intr.c +++ b/media/libopus/celt/arm/celt_neon_intr.c @@ -38,6 +38,8 @@ #include "../pitch.h" #if defined(FIXED_POINT) +#include <string.h> + void xcorr_kernel_neon_fixed(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[4], int len) { int j; @@ -47,7 +49,10 @@ void xcorr_kernel_neon_fixed(const opus_val16 * x, const opus_val16 * y, opus_va int16x4_t y0 = vld1_s16(y); y += 4; - for (j = 0; j + 8 <= len; j += 8) + /* This loop loads one y value more than we actually need. + Therefore we have to stop as soon as there are 8 or fewer samples left + (instead of 7), to avoid reading past the end of the array. */ + for (j = 0; j + 8 < len; j += 8) { /* Load x[0...7] */ int16x8_t xx = vld1q_s16(x); @@ -80,23 +85,79 @@ void xcorr_kernel_neon_fixed(const opus_val16 * x, const opus_val16 * y, opus_va x += 8; y += 8; } - - for (; j < len; j++) - { - int16x4_t x0 = vld1_dup_s16(x); /* load next x */ + if (j + 4 < len) { + /* Load x[0...3] */ + int16x4_t x0 = vld1_s16(x); + /* Load y[4...7] */ + int16x4_t y4 = vld1_s16(y); + int32x4_t a0 = vmlal_lane_s16(a, y0, x0, 0); + int16x4_t y1 = vext_s16(y0, y4, 1); + int32x4_t a1 = vmlal_lane_s16(a0, y1, x0, 1); + int16x4_t y2 = vext_s16(y0, y4, 2); + int32x4_t a2 = vmlal_lane_s16(a1, y2, x0, 2); + int16x4_t y3 = vext_s16(y0, y4, 3); + int32x4_t a3 = vmlal_lane_s16(a2, y3, x0, 3); + y0 = y4; + a = a3; + x += 4; + y += 4; + j += 4; + } + if (j + 2 < len) { + /* Load x[0...1] */ + int16x4x2_t xx = vld2_dup_s16(x); + int16x4_t x0 = xx.val[0]; + int16x4_t x1 = xx.val[1]; + /* Load y[4...5]. + We would like to use vld1_dup_s32(), but casting the pointer would + break strict aliasing rules and potentially have alignment issues. + Fortunately the compiler seems capable of translating this memcpy() + and vdup_n_s32() into the equivalent vld1_dup_s32().*/ + int32_t yy; + memcpy(&yy, y, sizeof(yy)); + int16x4_t y4 = vreinterpret_s16_s32(vdup_n_s32(yy)); int32x4_t a0 = vmlal_s16(a, y0, x0); - - int16x4_t y4 = vld1_dup_s16(y); /* load next y */ - y0 = vext_s16(y0, y4, 1); + int16x4_t y1 = vext_s16(y0, y4, 1); + /* Replace bottom copy of {y[5], y[4]} in y4 with {y[3], y[2]} from y0, + using VSRI instead of VEXT, since it's a data-processing + instruction. */ + y0 = vreinterpret_s16_s64(vsri_n_s64(vreinterpret_s64_s16(y4), + vreinterpret_s64_s16(y0), 32)); + int32x4_t a1 = vmlal_s16(a0, y1, x1); + a = a1; + x += 2; + y += 2; + j += 2; + } + if (j + 1 < len) { + /* Load next x. */ + int16x4_t x0 = vld1_dup_s16(x); + int32x4_t a0 = vmlal_s16(a, y0, x0); + /* Load last y. */ + int16x4_t y4 = vld1_dup_s16(y); + y0 = vreinterpret_s16_s64(vsri_n_s64(vreinterpret_s64_s16(y4), + vreinterpret_s64_s16(y0), 16)); a = a0; x++; - y++; } - - vst1q_s32(sum, a); + /* Load last x. */ + int16x4_t x0 = vld1_dup_s16(x); + int32x4_t a0 = vmlal_s16(a, y0, x0); + vst1q_s32(sum, a0); } #else + +#if defined(__ARM_FEATURE_FMA) && defined(__ARM_ARCH_ISA_A64) +/* If we can, force the compiler to use an FMA instruction rather than break + * vmlaq_f32() into fmul/fadd. */ +#ifdef vmlaq_lane_f32 +#undef vmlaq_lane_f32 +#endif +#define vmlaq_lane_f32(a,b,c,lane) vfmaq_lane_f32(a,b,c,lane) +#endif + + /* * Function: xcorr_kernel_neon_float * --------------------------------- diff --git a/media/libopus/celt/arm/pitch_neon_intr.c b/media/libopus/celt/arm/pitch_neon_intr.c index 35cc46e2c2..43885f528c 100644 --- a/media/libopus/celt/arm/pitch_neon_intr.c +++ b/media/libopus/celt/arm/pitch_neon_intr.c @@ -130,6 +130,13 @@ void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus /* ========================================================================== */ +#ifdef __ARM_FEATURE_FMA +/* If we can, force the compiler to use an FMA instruction rather than break + vmlaq_f32() into fmul/fadd. */ +#define vmlaq_f32(a,b,c) vfmaq_f32(a,b,c) +#endif + + #ifdef OPUS_CHECK_ASM /* This part of code simulates floating-point NEON operations. */ diff --git a/media/libopus/celt/celt.h b/media/libopus/celt/celt.h index 24b6b2b520..2f501951d5 100644 --- a/media/libopus/celt/celt.h +++ b/media/libopus/celt/celt.h @@ -42,6 +42,10 @@ #include "entdec.h" #include "arch.h" +#ifdef ENABLE_DEEP_PLC +#include "lpcnet.h" +#endif + #ifdef __cplusplus extern "C" { #endif @@ -149,6 +153,13 @@ int celt_decoder_get_size(int channels); int celt_decoder_init(CELTDecoder *st, opus_int32 sampling_rate, int channels); +int celt_decode_with_ec_dred(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data, + int len, opus_val16 * OPUS_RESTRICT pcm, int frame_size, ec_dec *dec, int accum +#ifdef ENABLE_DEEP_PLC + ,LPCNetPLCState *lpcnet +#endif + ); + int celt_decode_with_ec(OpusCustomDecoder * OPUS_RESTRICT st, const unsigned char *data, int len, opus_val16 * OPUS_RESTRICT pcm, int frame_size, ec_dec *dec, int accum); @@ -225,23 +236,13 @@ void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N, opus_val16 g0, opus_val16 g1, int tapset0, int tapset1, const opus_val16 *window, int overlap, int arch); -#ifdef NON_STATIC_COMB_FILTER_CONST_C -void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N, - opus_val16 g10, opus_val16 g11, opus_val16 g12); -#endif - -#ifndef OVERRIDE_COMB_FILTER_CONST -# define comb_filter_const(y, x, T, N, g10, g11, g12, arch) \ - ((void)(arch),comb_filter_const_c(y, x, T, N, g10, g11, g12)) -#endif - void init_caps(const CELTMode *m,int *cap,int LM,int C); #ifdef RESYNTH -void deemphasis(celt_sig *in[], opus_val16 *pcm, int N, int C, int downsample, const opus_val16 *coef, celt_sig *mem); +void deemphasis(celt_sig *in[], opus_val16 *pcm, int N, int C, int downsample, const opus_val16 *coef, celt_sig *mem, int accum); void celt_synthesis(const CELTMode *mode, celt_norm *X, celt_sig * out_syn[], opus_val16 *oldBandE, int start, int effEnd, int C, int CC, int isTransient, - int LM, int downsample, int silence); + int LM, int downsample, int silence, int arch); #endif #ifdef __cplusplus diff --git a/media/libopus/celt/celt_decoder.c b/media/libopus/celt/celt_decoder.c index 883dae15d2..743c2031bc 100644 --- a/media/libopus/celt/celt_decoder.c +++ b/media/libopus/celt/celt_decoder.c @@ -51,6 +51,11 @@ #include "celt_lpc.h" #include "vq.h" +#ifdef ENABLE_DEEP_PLC +#include "lpcnet.h" +#include "lpcnet_private.h" +#endif + /* The maximum pitch lag to allow in the pitch-based PLC. It's possible to save CPU time in the PLC pitch search by making this smaller than MAX_PERIOD. The current value corresponds to a pitch of 66.67 Hz. */ @@ -59,9 +64,6 @@ pitch of 480 Hz. */ #define PLC_PITCH_LAG_MIN (100) -#if defined(SMALL_FOOTPRINT) && defined(FIXED_POINT) -#define NORM_ALIASING_HACK -#endif /**********************************************************************/ /* */ /* DECODER */ @@ -69,6 +71,9 @@ /**********************************************************************/ #define DECODE_BUFFER_SIZE 2048 +#define PLC_UPDATE_FRAMES 4 +#define PLC_UPDATE_SAMPLES (PLC_UPDATE_FRAMES*FRAME_SIZE) + /** Decoder state @brief Decoder state */ @@ -82,6 +87,7 @@ struct OpusCustomDecoder { int start, end; int signalling; int disable_inv; + int complexity; int arch; /* Everything beyond this point gets cleared on a reset */ @@ -98,11 +104,18 @@ struct OpusCustomDecoder { opus_val16 postfilter_gain_old; int postfilter_tapset; int postfilter_tapset_old; + int prefilter_and_fold; celt_sig preemph_memD[2]; +#ifdef ENABLE_DEEP_PLC + opus_int16 plc_pcm[PLC_UPDATE_SAMPLES]; + int plc_fill; + float plc_preemphasis_mem; +#endif + celt_sig _decode_mem[1]; /* Size = channels*(DECODE_BUFFER_SIZE+mode->overlap) */ - /* opus_val16 lpc[], Size = channels*LPC_ORDER */ + /* opus_val16 lpc[], Size = channels*CELT_LPC_ORDER */ /* opus_val16 oldEBands[], Size = 2*mode->nbEBands */ /* opus_val16 oldLogE[], Size = 2*mode->nbEBands */ /* opus_val16 oldLogE2[], Size = 2*mode->nbEBands */ @@ -157,7 +170,7 @@ OPUS_CUSTOM_NOSTATIC int opus_custom_decoder_get_size(const CELTMode *mode, int { int size = sizeof(struct CELTDecoder) + (channels*(DECODE_BUFFER_SIZE+mode->overlap)-1)*sizeof(celt_sig) - + channels*LPC_ORDER*sizeof(opus_val16) + + channels*CELT_LPC_ORDER*sizeof(opus_val16) + 4*2*mode->nbEBands*sizeof(opus_val16); return size; } @@ -499,7 +512,100 @@ static int celt_plc_pitch_search(celt_sig *decode_mem[2], int C, int arch) return pitch_index; } -static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) +static void prefilter_and_fold(CELTDecoder * OPUS_RESTRICT st, int N) +{ + int c; + int CC; + int i; + int overlap; + celt_sig *decode_mem[2]; + const OpusCustomMode *mode; + VARDECL(opus_val32, etmp); + mode = st->mode; + overlap = st->overlap; + CC = st->channels; + ALLOC(etmp, overlap, opus_val32); + c=0; do { + decode_mem[c] = st->_decode_mem + c*(DECODE_BUFFER_SIZE+overlap); + } while (++c<CC); + + c=0; do { + /* Apply the pre-filter to the MDCT overlap for the next frame because + the post-filter will be re-applied in the decoder after the MDCT + overlap. */ + comb_filter(etmp, decode_mem[c]+DECODE_BUFFER_SIZE-N, + st->postfilter_period_old, st->postfilter_period, overlap, + -st->postfilter_gain_old, -st->postfilter_gain, + st->postfilter_tapset_old, st->postfilter_tapset, NULL, 0, st->arch); + + /* Simulate TDAC on the concealed audio so that it blends with the + MDCT of the next frame. */ + for (i=0;i<overlap/2;i++) + { + decode_mem[c][DECODE_BUFFER_SIZE-N+i] = + MULT16_32_Q15(mode->window[i], etmp[overlap-1-i]) + + MULT16_32_Q15(mode->window[overlap-i-1], etmp[i]); + } + } while (++c<CC); +} + +#ifdef ENABLE_DEEP_PLC + +#define SINC_ORDER 48 +/* h=cos(pi/2*abs(sin([-24:24]/48*pi*23./24)).^2); + b=sinc([-24:24]/3*1.02).*h; + b=b/sum(b); */ +static const float sinc_filter[SINC_ORDER+1] = { + 4.2931e-05f, -0.000190293f, -0.000816132f, -0.000637162f, 0.00141662f, 0.00354764f, 0.00184368f, -0.00428274f, + -0.00856105f, -0.0034003f, 0.00930201f, 0.0159616f, 0.00489785f, -0.0169649f, -0.0259484f, -0.00596856f, + 0.0286551f, 0.0405872f, 0.00649994f, -0.0509284f, -0.0716655f, -0.00665212f, 0.134336f, 0.278927f, + 0.339995f, 0.278927f, 0.134336f, -0.00665212f, -0.0716655f, -0.0509284f, 0.00649994f, 0.0405872f, + 0.0286551f, -0.00596856f, -0.0259484f, -0.0169649f, 0.00489785f, 0.0159616f, 0.00930201f, -0.0034003f, + -0.00856105f, -0.00428274f, 0.00184368f, 0.00354764f, 0.00141662f, -0.000637162f, -0.000816132f, -0.000190293f, + 4.2931e-05f +}; + +void update_plc_state(LPCNetPLCState *lpcnet, celt_sig *decode_mem[2], float *plc_preemphasis_mem, int CC) +{ + int i; + int tmp_read_post, tmp_fec_skip; + int offset; + celt_sig buf48k[DECODE_BUFFER_SIZE]; + opus_int16 buf16k[PLC_UPDATE_SAMPLES]; + if (CC == 1) OPUS_COPY(buf48k, decode_mem[0], DECODE_BUFFER_SIZE); + else { + for (i=0;i<DECODE_BUFFER_SIZE;i++) { + buf48k[i] = .5*(decode_mem[0][i] + decode_mem[1][i]); + } + } + /* Down-sample the last 40 ms. */ + for (i=1;i<DECODE_BUFFER_SIZE;i++) buf48k[i] += PREEMPHASIS*buf48k[i-1]; + *plc_preemphasis_mem = buf48k[DECODE_BUFFER_SIZE-1]; + offset = DECODE_BUFFER_SIZE-SINC_ORDER-1 - 3*(PLC_UPDATE_SAMPLES-1); + celt_assert(3*(PLC_UPDATE_SAMPLES-1) + SINC_ORDER + offset == DECODE_BUFFER_SIZE-1); + for (i=0;i<PLC_UPDATE_SAMPLES;i++) { + int j; + float sum = 0; + for (j=0;j<SINC_ORDER+1;j++) { + sum += buf48k[3*i + j + offset]*sinc_filter[j]; + } + buf16k[i] = float2int(MIN32(32767.f, MAX32(-32767.f, sum))); + } + tmp_read_post = lpcnet->fec_read_pos; + tmp_fec_skip = lpcnet->fec_skip; + for (i=0;i<PLC_UPDATE_FRAMES;i++) { + lpcnet_plc_update(lpcnet, &buf16k[FRAME_SIZE*i]); + } + lpcnet->fec_read_pos = tmp_read_post; + lpcnet->fec_skip = tmp_fec_skip; +} +#endif + +static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM +#ifdef ENABLE_DEEP_PLC + ,LPCNetPLCState *lpcnet +#endif + ) { int c; int i; @@ -527,22 +633,22 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) out_syn[c] = decode_mem[c]+DECODE_BUFFER_SIZE-N; } while (++c<C); lpc = (opus_val16*)(st->_decode_mem+(DECODE_BUFFER_SIZE+overlap)*C); - oldBandE = lpc+C*LPC_ORDER; + oldBandE = lpc+C*CELT_LPC_ORDER; oldLogE = oldBandE + 2*nbEBands; oldLogE2 = oldLogE + 2*nbEBands; backgroundLogE = oldLogE2 + 2*nbEBands; loss_duration = st->loss_duration; start = st->start; +#ifdef ENABLE_DEEP_PLC + noise_based = start != 0 || (lpcnet->fec_fill_pos == 0 && (st->skip_plc || loss_duration >= 80)); +#else noise_based = loss_duration >= 40 || start != 0 || st->skip_plc; +#endif if (noise_based) { /* Noise-based PLC/CNG */ -#ifdef NORM_ALIASING_HACK - celt_norm *X; -#else VARDECL(celt_norm, X); -#endif opus_uint32 seed; int end; int effEnd; @@ -550,18 +656,16 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) end = st->end; effEnd = IMAX(start, IMIN(end, mode->effEBands)); -#ifdef NORM_ALIASING_HACK - /* This is an ugly hack that breaks aliasing rules and would be easily broken, - but it saves almost 4kB of stack. */ - X = (celt_norm*)(out_syn[C-1]+overlap/2); -#else ALLOC(X, C*N, celt_norm); /**< Interleaved normalised MDCTs */ -#endif c=0; do { OPUS_MOVE(decode_mem[c], decode_mem[c]+N, - DECODE_BUFFER_SIZE-N+(overlap>>1)); + DECODE_BUFFER_SIZE-N+overlap); } while (++c<C); + if (st->prefilter_and_fold) { + prefilter_and_fold(st, N); + } + /* Energy decay */ decay = loss_duration==0 ? QCONST16(1.5f, DB_SHIFT) : QCONST16(.5f, DB_SHIFT); c=0; do @@ -590,6 +694,9 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) st->rng = seed; celt_synthesis(mode, X, out_syn, oldBandE, start, effEnd, C, C, 0, LM, st->downsample, 0, st->arch); + st->prefilter_and_fold = 0; + /* Skip regular PLC until we get two consecutive packets. */ + st->skip_plc = 1; } else { int exc_length; /* Pitch-based PLC */ @@ -597,12 +704,14 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) opus_val16 *exc; opus_val16 fade = Q15ONE; int pitch_index; - VARDECL(opus_val32, etmp); VARDECL(opus_val16, _exc); VARDECL(opus_val16, fir_tmp); if (loss_duration == 0) { +#ifdef ENABLE_DEEP_PLC + if (lpcnet->loaded) update_plc_state(lpcnet, decode_mem, &st->plc_preemphasis_mem, C); +#endif st->last_pitch_index = pitch_index = celt_plc_pitch_search(decode_mem, C, st->arch); } else { pitch_index = st->last_pitch_index; @@ -613,10 +722,9 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) decaying signal, but we can't get more than MAX_PERIOD. */ exc_length = IMIN(2*pitch_index, MAX_PERIOD); - ALLOC(etmp, overlap, opus_val32); - ALLOC(_exc, MAX_PERIOD+LPC_ORDER, opus_val16); + ALLOC(_exc, MAX_PERIOD+CELT_LPC_ORDER, opus_val16); ALLOC(fir_tmp, exc_length, opus_val16); - exc = _exc+LPC_ORDER; + exc = _exc+CELT_LPC_ORDER; window = mode->window; c=0; do { opus_val16 decay; @@ -628,16 +736,16 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) int j; buf = decode_mem[c]; - for (i=0;i<MAX_PERIOD+LPC_ORDER;i++) - exc[i-LPC_ORDER] = SROUND16(buf[DECODE_BUFFER_SIZE-MAX_PERIOD-LPC_ORDER+i], SIG_SHIFT); + for (i=0;i<MAX_PERIOD+CELT_LPC_ORDER;i++) + exc[i-CELT_LPC_ORDER] = SROUND16(buf[DECODE_BUFFER_SIZE-MAX_PERIOD-CELT_LPC_ORDER+i], SIG_SHIFT); if (loss_duration == 0) { - opus_val32 ac[LPC_ORDER+1]; + opus_val32 ac[CELT_LPC_ORDER+1]; /* Compute LPC coefficients for the last MAX_PERIOD samples before the first loss so we can work in the excitation-filter domain. */ _celt_autocorr(exc, ac, window, overlap, - LPC_ORDER, MAX_PERIOD, st->arch); + CELT_LPC_ORDER, MAX_PERIOD, st->arch); /* Add a noise floor of -40 dB. */ #ifdef FIXED_POINT ac[0] += SHR32(ac[0],13); @@ -645,7 +753,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) ac[0] *= 1.0001f; #endif /* Use lag windowing to stabilize the Levinson-Durbin recursion. */ - for (i=1;i<=LPC_ORDER;i++) + for (i=1;i<=CELT_LPC_ORDER;i++) { /*ac[i] *= exp(-.5*(2*M_PI*.002*i)*(2*M_PI*.002*i));*/ #ifdef FIXED_POINT @@ -654,7 +762,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) ac[i] -= ac[i]*(0.008f*0.008f)*i*i; #endif } - _celt_lpc(lpc+c*LPC_ORDER, ac, LPC_ORDER); + _celt_lpc(lpc+c*CELT_LPC_ORDER, ac, CELT_LPC_ORDER); #ifdef FIXED_POINT /* For fixed-point, apply bandwidth expansion until we can guarantee that no overflow can happen in the IIR filter. This means: @@ -662,13 +770,13 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) while (1) { opus_val16 tmp=Q15ONE; opus_val32 sum=QCONST16(1., SIG_SHIFT); - for (i=0;i<LPC_ORDER;i++) - sum += ABS16(lpc[c*LPC_ORDER+i]); + for (i=0;i<CELT_LPC_ORDER;i++) + sum += ABS16(lpc[c*CELT_LPC_ORDER+i]); if (sum < 65535) break; - for (i=0;i<LPC_ORDER;i++) + for (i=0;i<CELT_LPC_ORDER;i++) { tmp = MULT16_16_Q15(QCONST16(.99f,15), tmp); - lpc[c*LPC_ORDER+i] = MULT16_16_Q15(lpc[c*LPC_ORDER+i], tmp); + lpc[c*CELT_LPC_ORDER+i] = MULT16_16_Q15(lpc[c*CELT_LPC_ORDER+i], tmp); } } #endif @@ -678,8 +786,8 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) { /* Compute the excitation for exc_length samples before the loss. We need the copy because celt_fir() cannot filter in-place. */ - celt_fir(exc+MAX_PERIOD-exc_length, lpc+c*LPC_ORDER, - fir_tmp, exc_length, LPC_ORDER, st->arch); + celt_fir(exc+MAX_PERIOD-exc_length, lpc+c*CELT_LPC_ORDER, + fir_tmp, exc_length, CELT_LPC_ORDER, st->arch); OPUS_COPY(exc+MAX_PERIOD-exc_length, fir_tmp, exc_length); } @@ -737,15 +845,15 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) S1 += SHR32(MULT16_16(tmp, tmp), 10); } { - opus_val16 lpc_mem[LPC_ORDER]; + opus_val16 lpc_mem[CELT_LPC_ORDER]; /* Copy the last decoded samples (prior to the overlap region) to synthesis filter memory so we can have a continuous signal. */ - for (i=0;i<LPC_ORDER;i++) + for (i=0;i<CELT_LPC_ORDER;i++) lpc_mem[i] = SROUND16(buf[DECODE_BUFFER_SIZE-N-1-i], SIG_SHIFT); /* Apply the synthesis filter to convert the excitation back into the signal domain. */ - celt_iir(buf+DECODE_BUFFER_SIZE-N, lpc+c*LPC_ORDER, - buf+DECODE_BUFFER_SIZE-N, extrapolation_len, LPC_ORDER, + celt_iir(buf+DECODE_BUFFER_SIZE-N, lpc+c*CELT_LPC_ORDER, + buf+DECODE_BUFFER_SIZE-N, extrapolation_len, CELT_LPC_ORDER, lpc_mem, st->arch); #ifdef FIXED_POINT for (i=0; i < extrapolation_len; i++) @@ -792,23 +900,65 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) } } - /* Apply the pre-filter to the MDCT overlap for the next frame because - the post-filter will be re-applied in the decoder after the MDCT - overlap. */ - comb_filter(etmp, buf+DECODE_BUFFER_SIZE, - st->postfilter_period, st->postfilter_period, overlap, - -st->postfilter_gain, -st->postfilter_gain, - st->postfilter_tapset, st->postfilter_tapset, NULL, 0, st->arch); - - /* Simulate TDAC on the concealed audio so that it blends with the - MDCT of the next frame. */ - for (i=0;i<overlap/2;i++) - { - buf[DECODE_BUFFER_SIZE+i] = - MULT16_32_Q15(window[i], etmp[overlap-1-i]) - + MULT16_32_Q15(window[overlap-i-1], etmp[i]); - } } while (++c<C); + +#ifdef ENABLE_DEEP_PLC + if (lpcnet->loaded && (st->complexity >= 5 || lpcnet->fec_fill_pos > 0)) { + float overlap_mem; + int samples_needed16k; + celt_sig *buf; + VARDECL(float, buf_copy); + buf = decode_mem[0]; + ALLOC(buf_copy, C*overlap, float); + c=0; do { + OPUS_COPY(buf_copy+c*overlap, &decode_mem[c][DECODE_BUFFER_SIZE-N], overlap); + } while (++c<C); + + /* Need enough samples from the PLC to cover the frame size, resampling delay, + and the overlap at the end. */ + samples_needed16k = (N+SINC_ORDER+overlap)/3; + if (loss_duration == 0) { + st->plc_fill = 0; + } + while (st->plc_fill < samples_needed16k) { + lpcnet_plc_conceal(lpcnet, &st->plc_pcm[st->plc_fill]); + st->plc_fill += FRAME_SIZE; + } + /* Resample to 48 kHz. */ + for (i=0;i<(N+overlap)/3;i++) { + int j; + float sum; + for (sum=0, j=0;j<17;j++) sum += 3*st->plc_pcm[i+j]*sinc_filter[3*j]; + buf[DECODE_BUFFER_SIZE-N+3*i] = sum; + for (sum=0, j=0;j<16;j++) sum += 3*st->plc_pcm[i+j+1]*sinc_filter[3*j+2]; + buf[DECODE_BUFFER_SIZE-N+3*i+1] = sum; + for (sum=0, j=0;j<16;j++) sum += 3*st->plc_pcm[i+j+1]*sinc_filter[3*j+1]; + buf[DECODE_BUFFER_SIZE-N+3*i+2] = sum; + } + OPUS_MOVE(st->plc_pcm, &st->plc_pcm[N/3], st->plc_fill-N/3); + st->plc_fill -= N/3; + for (i=0;i<N;i++) { + float tmp = buf[DECODE_BUFFER_SIZE-N+i]; + buf[DECODE_BUFFER_SIZE-N+i] -= PREEMPHASIS*st->plc_preemphasis_mem; + st->plc_preemphasis_mem = tmp; + } + overlap_mem = st->plc_preemphasis_mem; + for (i=0;i<overlap;i++) { + float tmp = buf[DECODE_BUFFER_SIZE+i]; + buf[DECODE_BUFFER_SIZE+i] -= PREEMPHASIS*overlap_mem; + overlap_mem = tmp; + } + /* For now, we just do mono PLC. */ + if (C==2) OPUS_COPY(decode_mem[1], decode_mem[0], DECODE_BUFFER_SIZE+overlap); + c=0; do { + /* Cross-fade with 48-kHz non-neural PLC for the first 2.5 ms to avoid a discontinuity. */ + if (loss_duration == 0) { + for (i=0;i<overlap;i++) decode_mem[c][DECODE_BUFFER_SIZE-N+i] = (1-window[i])*buf_copy[c*overlap+i] + (window[i])*decode_mem[c][DECODE_BUFFER_SIZE-N+i]; + } + } while (++c<C); + } +#endif + st->prefilter_and_fold = 1; } /* Saturate to soemthing large to avoid wrap-around. */ @@ -817,18 +967,18 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) RESTORE_STACK; } -int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data, - int len, opus_val16 * OPUS_RESTRICT pcm, int frame_size, ec_dec *dec, int accum) +int celt_decode_with_ec_dred(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data, + int len, opus_val16 * OPUS_RESTRICT pcm, int frame_size, ec_dec *dec, int accum +#ifdef ENABLE_DEEP_PLC + ,LPCNetPLCState *lpcnet +#endif + ) { int c, i, N; int spread_decision; opus_int32 bits; ec_dec _dec; -#ifdef NORM_ALIASING_HACK - celt_norm *X; -#else VARDECL(celt_norm, X); -#endif VARDECL(int, fine_quant); VARDECL(int, pulses); VARDECL(int, cap); @@ -881,7 +1031,7 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat frame_size *= st->downsample; lpc = (opus_val16*)(st->_decode_mem+(DECODE_BUFFER_SIZE+overlap)*CC); - oldBandE = lpc+CC*LPC_ORDER; + oldBandE = lpc+CC*CELT_LPC_ORDER; oldLogE = oldBandE + 2*nbEBands; oldLogE2 = oldLogE + 2*nbEBands; backgroundLogE = oldLogE2 + 2*nbEBands; @@ -935,15 +1085,25 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat if (data == NULL || len<=1) { - celt_decode_lost(st, N, LM); + celt_decode_lost(st, N, LM +#ifdef ENABLE_DEEP_PLC + , lpcnet +#endif + ); deemphasis(out_syn, pcm, N, CC, st->downsample, mode->preemph, st->preemph_memD, accum); RESTORE_STACK; return frame_size/st->downsample; } +#ifdef ENABLE_DEEP_PLC + else { + /* FIXME: This is a bit of a hack just to make sure opus_decode_native() knows we're no longer in PLC. */ + if (lpcnet) lpcnet->blend = 0; + } +#endif /* Check if there are at least two packets received consecutively before * turning on the pitch-based PLC */ - st->skip_plc = st->loss_duration != 0; + if (st->loss_duration == 0) st->skip_plc = 0; if (dec == NULL) { @@ -1006,6 +1166,36 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat /* Decode the global flags (first symbols in the stream) */ intra_ener = tell+3<=total_bits ? ec_dec_bit_logp(dec, 3) : 0; + /* If recovering from packet loss, make sure we make the energy prediction safe to reduce the + risk of getting loud artifacts. */ + if (!intra_ener && st->loss_duration != 0) { + c=0; do + { + opus_val16 safety = 0; + int missing = IMIN(10, st->loss_duration>>LM); + if (LM==0) safety = QCONST16(1.5f,DB_SHIFT); + else if (LM==1) safety = QCONST16(.5f,DB_SHIFT); + for (i=start;i<end;i++) + { + if (oldBandE[c*nbEBands+i] < MAX16(oldLogE[c*nbEBands+i], oldLogE2[c*nbEBands+i])) { + /* If energy is going down already, continue the trend. */ + opus_val32 slope; + opus_val32 E0, E1, E2; + E0 = oldBandE[c*nbEBands+i]; + E1 = oldLogE[c*nbEBands+i]; + E2 = oldLogE2[c*nbEBands+i]; + slope = MAX32(E1 - E0, HALF32(E2 - E0)); + E0 -= MAX32(0, (1+missing)*slope); + oldBandE[c*nbEBands+i] = MAX32(-QCONST16(20.f,DB_SHIFT), E0); + } else { + /* Otherwise take the min of the last frames. */ + oldBandE[c*nbEBands+i] = MIN16(MIN16(oldBandE[c*nbEBands+i], oldLogE[c*nbEBands+i]), oldLogE2[c*nbEBands+i]); + } + /* Shorter frames have more natural fluctuations -- play it safe. */ + oldBandE[c*nbEBands+i] -= safety; + } + } while (++c<2); + } /* Get band energies */ unquant_coarse_energy(mode, start, end, oldBandE, intra_ener, dec, C, LM); @@ -1073,19 +1263,13 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat unquant_fine_energy(mode, start, end, oldBandE, fine_quant, dec, C); c=0; do { - OPUS_MOVE(decode_mem[c], decode_mem[c]+N, DECODE_BUFFER_SIZE-N+overlap/2); + OPUS_MOVE(decode_mem[c], decode_mem[c]+N, DECODE_BUFFER_SIZE-N+overlap); } while (++c<CC); /* Decode fixed codebook */ ALLOC(collapse_masks, C*nbEBands, unsigned char); -#ifdef NORM_ALIASING_HACK - /* This is an ugly hack that breaks aliasing rules and would be easily broken, - but it saves almost 4kB of stack. */ - X = (celt_norm*)(out_syn[CC-1]+overlap/2); -#else ALLOC(X, C*N, celt_norm); /**< Interleaved normalised MDCTs */ -#endif quant_all_bands(0, mode, start, end, X, C==2 ? X+N : NULL, collapse_masks, NULL, pulses, shortBlocks, spread_decision, dual_stereo, intensity, tf_res, @@ -1109,7 +1293,9 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat for (i=0;i<C*nbEBands;i++) oldBandE[i] = -QCONST16(28.f,DB_SHIFT); } - + if (st->prefilter_and_fold) { + prefilter_and_fold(st, N); + } celt_synthesis(mode, X, out_syn, oldBandE, start, effEnd, C, CC, isTransient, LM, st->downsample, silence, st->arch); @@ -1173,6 +1359,7 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat deemphasis(out_syn, pcm, N, CC, st->downsample, mode->preemph, st->preemph_memD, accum); st->loss_duration = 0; + st->prefilter_and_fold = 0; RESTORE_STACK; if (ec_tell(dec) > 8*len) return OPUS_INTERNAL_ERROR; @@ -1181,6 +1368,15 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat return frame_size/st->downsample; } +int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data, + int len, opus_val16 * OPUS_RESTRICT pcm, int frame_size, ec_dec *dec, int accum) +{ + return celt_decode_with_ec_dred(st, data, len, pcm, frame_size, dec, accum +#ifdef ENABLE_DEEP_PLC + , NULL +#endif + ); +} #ifdef CUSTOM_MODES @@ -1254,6 +1450,26 @@ int opus_custom_decoder_ctl(CELTDecoder * OPUS_RESTRICT st, int request, ...) va_start(ap, request); switch (request) { + case OPUS_SET_COMPLEXITY_REQUEST: + { + opus_int32 value = va_arg(ap, opus_int32); + if(value<0 || value>10) + { + goto bad_arg; + } + st->complexity = value; + } + break; + case OPUS_GET_COMPLEXITY_REQUEST: + { + opus_int32 *value = va_arg(ap, opus_int32*); + if (!value) + { + goto bad_arg; + } + *value = st->complexity; + } + break; case CELT_SET_START_BAND_REQUEST: { opus_int32 value = va_arg(ap, opus_int32); @@ -1300,7 +1516,7 @@ int opus_custom_decoder_ctl(CELTDecoder * OPUS_RESTRICT st, int request, ...) int i; opus_val16 *lpc, *oldBandE, *oldLogE, *oldLogE2; lpc = (opus_val16*)(st->_decode_mem+(DECODE_BUFFER_SIZE+st->overlap)*st->channels); - oldBandE = lpc+st->channels*LPC_ORDER; + oldBandE = lpc+st->channels*CELT_LPC_ORDER; oldLogE = oldBandE + 2*st->mode->nbEBands; oldLogE2 = oldLogE + 2*st->mode->nbEBands; OPUS_CLEAR((char*)&st->DECODER_RESET_START, diff --git a/media/libopus/celt/celt_encoder.c b/media/libopus/celt/celt_encoder.c index 637d442cf7..7f32a801c6 100644 --- a/media/libopus/celt/celt_encoder.c +++ b/media/libopus/celt/celt_encoder.c @@ -281,6 +281,9 @@ static int transient_analysis(const opus_val32 * OPUS_RESTRICT in, int len, int /* High-pass filter: (1 - 2*z^-1 + z^-2) / (1 - z^-1 + .5*z^-2) */ for (i=0;i<len;i++) { +#ifndef FIXED_POINT + float mem00; +#endif opus_val32 x,y; x = SHR32(in[i+c*len],SIG_SHIFT); y = ADD32(mem0, x); @@ -288,8 +291,13 @@ static int transient_analysis(const opus_val32 * OPUS_RESTRICT in, int len, int mem0 = mem1 + y - SHL32(x,1); mem1 = x - SHR32(y,1); #else + /* Original code: mem0 = mem1 + y - 2*x; mem1 = x - .5f*y; + Modified code to shorten dependency chains: */ + mem00=mem0; + mem0 = mem0 - x + .5f*mem1; + mem1 = x - mem00; #endif tmp[i] = SROUND16(y, 2); /*printf("%f ", tmp[i]);*/ @@ -322,10 +330,11 @@ static int transient_analysis(const opus_val32 * OPUS_RESTRICT in, int len, int #ifdef FIXED_POINT /* FIXME: Use PSHR16() instead */ tmp[i] = mem0 + PSHR32(x2-mem0,forward_shift); + mem0 = tmp[i]; #else - tmp[i] = mem0 + MULT16_16_P15(forward_decay,x2-mem0); + mem0 = x2 + (1.f-forward_decay)*mem0; + tmp[i] = forward_decay*mem0; #endif - mem0 = tmp[i]; } mem0=0; @@ -337,11 +346,13 @@ static int transient_analysis(const opus_val32 * OPUS_RESTRICT in, int len, int #ifdef FIXED_POINT /* FIXME: Use PSHR16() instead */ tmp[i] = mem0 + PSHR32(tmp[i]-mem0,3); -#else - tmp[i] = mem0 + MULT16_16_P15(QCONST16(0.125f,15),tmp[i]-mem0); -#endif mem0 = tmp[i]; maxE = MAX16(maxE, mem0); +#else + mem0 = tmp[i] + 0.875f*mem0; + tmp[i] = 0.125f*mem0; + maxE = MAX16(maxE, 0.125f*mem0); +#endif } /*for (i=0;i<len2;i++)printf("%f ", tmp[i]/mean);printf("\n");*/ @@ -967,7 +978,7 @@ static opus_val16 median_of_3(const opus_val16 *x) return t0; } -static opus_val16 dynalloc_analysis(const opus_val16 *bandLogE, const opus_val16 *bandLogE2, +static opus_val16 dynalloc_analysis(const opus_val16 *bandLogE, const opus_val16 *bandLogE2, const opus_val16 *oldBandE, int nbEBands, int start, int end, int C, int *offsets, int lsb_depth, const opus_int16 *logN, int isTransient, int vbr, int constrained_vbr, const opus_int16 *eBands, int LM, int effectiveBytes, opus_int32 *tot_boost_, int lfe, opus_val16 *surround_dynalloc, @@ -978,9 +989,11 @@ static opus_val16 dynalloc_analysis(const opus_val16 *bandLogE, const opus_val16 opus_val16 maxDepth; VARDECL(opus_val16, follower); VARDECL(opus_val16, noise_floor); + VARDECL(opus_val16, bandLogE3); SAVE_STACK; ALLOC(follower, C*nbEBands, opus_val16); ALLOC(noise_floor, C*nbEBands, opus_val16); + ALLOC(bandLogE3, nbEBands, opus_val16); OPUS_CLEAR(offsets, nbEBands); /* Dynamic allocation code */ maxDepth=-QCONST16(31.9f, DB_SHIFT); @@ -1033,8 +1046,10 @@ static opus_val16 dynalloc_analysis(const opus_val16 *bandLogE, const opus_val16 printf("%d ", spread_weight[i]); printf("\n");*/ } - /* Make sure that dynamic allocation can't make us bust the budget */ - if (effectiveBytes > 50 && LM>=1 && !lfe) + /* Make sure that dynamic allocation can't make us bust the budget. + We enable the feature starting at 24 kb/s for 20-ms frames + and 96 kb/s for 2.5 ms frames. */ + if (effectiveBytes >= (30 + 5*LM) && !lfe) { int last=0; c=0;do @@ -1042,30 +1057,38 @@ static opus_val16 dynalloc_analysis(const opus_val16 *bandLogE, const opus_val16 opus_val16 offset; opus_val16 tmp; opus_val16 *f; + OPUS_COPY(bandLogE3, &bandLogE2[c*nbEBands], end); + if (LM==0) { + /* For 2.5 ms frames, the first 8 bands have just one bin, so the + energy is highly unreliable (high variance). For that reason, + we take the max with the previous energy so that at least 2 bins + are getting used. */ + for (i=0;i<IMIN(8,end);i++) bandLogE3[i] = MAX16(bandLogE2[c*nbEBands+i], oldBandE[c*nbEBands+i]); + } f = &follower[c*nbEBands]; - f[0] = bandLogE2[c*nbEBands]; + f[0] = bandLogE3[0]; for (i=1;i<end;i++) { /* The last band to be at least 3 dB higher than the previous one is the last we'll consider. Otherwise, we run into problems on bandlimited signals. */ - if (bandLogE2[c*nbEBands+i] > bandLogE2[c*nbEBands+i-1]+QCONST16(.5f,DB_SHIFT)) + if (bandLogE3[i] > bandLogE3[i-1]+QCONST16(.5f,DB_SHIFT)) last=i; - f[i] = MIN16(f[i-1]+QCONST16(1.5f,DB_SHIFT), bandLogE2[c*nbEBands+i]); + f[i] = MIN16(f[i-1]+QCONST16(1.5f,DB_SHIFT), bandLogE3[i]); } for (i=last-1;i>=0;i--) - f[i] = MIN16(f[i], MIN16(f[i+1]+QCONST16(2.f,DB_SHIFT), bandLogE2[c*nbEBands+i])); + f[i] = MIN16(f[i], MIN16(f[i+1]+QCONST16(2.f,DB_SHIFT), bandLogE3[i])); /* Combine with a median filter to avoid dynalloc triggering unnecessarily. The "offset" value controls how conservative we are -- a higher offset reduces the impact of the median filter and makes dynalloc use more bits. */ offset = QCONST16(1.f, DB_SHIFT); for (i=2;i<end-2;i++) - f[i] = MAX16(f[i], median_of_5(&bandLogE2[c*nbEBands+i-2])-offset); - tmp = median_of_3(&bandLogE2[c*nbEBands])-offset; + f[i] = MAX16(f[i], median_of_5(&bandLogE3[i-2])-offset); + tmp = median_of_3(&bandLogE3[0])-offset; f[0] = MAX16(f[0], tmp); f[1] = MAX16(f[1], tmp); - tmp = median_of_3(&bandLogE2[c*nbEBands+end-3])-offset; + tmp = median_of_3(&bandLogE3[end-3])-offset; f[end-2] = MAX16(f[end-2], tmp); f[end-1] = MAX16(f[end-1], tmp); @@ -1565,10 +1588,13 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm, vbr_rate = 0; tmp = st->bitrate*frame_size; if (tell>1) - tmp += tell; + tmp += tell*mode->Fs; if (st->bitrate!=OPUS_BITRATE_MAX) + { nbCompressedBytes = IMAX(2, IMIN(nbCompressedBytes, (tmp+4*mode->Fs)/(8*mode->Fs)-!!st->signalling)); + ec_enc_shrink(enc, nbCompressedBytes); + } effectiveBytes = nbCompressedBytes - nbFilledBytes; } equiv_rate = ((opus_int32)nbCompressedBytes*8*50 << (3-LM)) - (40*C+20)*((400>>LM) - 50); @@ -1882,7 +1908,7 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm, ALLOC(importance, nbEBands, int); ALLOC(spread_weight, nbEBands, int); - maxDepth = dynalloc_analysis(bandLogE, bandLogE2, nbEBands, start, end, C, offsets, + maxDepth = dynalloc_analysis(bandLogE, bandLogE2, oldBandE, nbEBands, start, end, C, offsets, st->lsb_depth, mode->logN, isTransient, st->vbr, st->constrained_vbr, eBands, LM, effectiveBytes, &tot_boost, st->lfe, surround_dynalloc, &st->analysis, importance, spread_weight); @@ -2246,7 +2272,7 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm, if (anti_collapse_on) { anti_collapse(mode, X, collapse_masks, LM, C, N, - start, end, oldBandE, oldLogE, oldLogE2, pulses, st->rng); + start, end, oldBandE, oldLogE, oldLogE2, pulses, st->rng, st->arch); } c=0; do { @@ -2265,15 +2291,15 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm, st->prefilter_period_old=IMAX(st->prefilter_period_old, COMBFILTER_MINPERIOD); comb_filter(out_mem[c], out_mem[c], st->prefilter_period_old, st->prefilter_period, mode->shortMdctSize, st->prefilter_gain_old, st->prefilter_gain, st->prefilter_tapset_old, st->prefilter_tapset, - mode->window, overlap); + mode->window, overlap, st->arch); if (LM!=0) comb_filter(out_mem[c]+mode->shortMdctSize, out_mem[c]+mode->shortMdctSize, st->prefilter_period, pitch_index, N-mode->shortMdctSize, st->prefilter_gain, gain1, st->prefilter_tapset, prefilter_tapset, - mode->window, overlap); + mode->window, overlap, st->arch); } while (++c<CC); /* We reuse freq[] as scratch space for the de-emphasis */ - deemphasis(out_mem, (opus_val16*)pcm, N, CC, st->upsample, mode->preemph, st->preemph_memD); + deemphasis(out_mem, (opus_val16*)pcm, N, CC, st->upsample, mode->preemph, st->preemph_memD, 0); st->prefilter_period_old = st->prefilter_period; st->prefilter_gain_old = st->prefilter_gain; st->prefilter_tapset_old = st->prefilter_tapset; diff --git a/media/libopus/celt/celt_lpc.c b/media/libopus/celt/celt_lpc.c index f91721bcab..fabca65cb3 100644 --- a/media/libopus/celt/celt_lpc.c +++ b/media/libopus/celt/celt_lpc.c @@ -44,7 +44,7 @@ int p opus_val32 r; opus_val32 error = ac[0]; #ifdef FIXED_POINT - opus_val32 lpc[LPC_ORDER]; + opus_val32 lpc[CELT_LPC_ORDER]; #else float *lpc = _lpc; #endif @@ -158,7 +158,17 @@ void celt_fir_c( sum[1] = SHL32(EXTEND32(x[i+1]), SIG_SHIFT); sum[2] = SHL32(EXTEND32(x[i+2]), SIG_SHIFT); sum[3] = SHL32(EXTEND32(x[i+3]), SIG_SHIFT); - xcorr_kernel(rnum, x+i-ord, sum, ord, arch); +#if defined(OPUS_CHECK_ASM) && defined(FIXED_POINT) + { + opus_val32 sum_c[4]; + memcpy(sum_c, sum, sizeof(sum_c)); + xcorr_kernel_c(rnum, x+i-ord, sum_c, ord); +#endif + xcorr_kernel(rnum, x+i-ord, sum, ord, arch); +#if defined(OPUS_CHECK_ASM) && defined(FIXED_POINT) + celt_assert(memcmp(sum, sum_c, sizeof(sum)) == 0); + } +#endif y[i ] = SROUND16(sum[0], SIG_SHIFT); y[i+1] = SROUND16(sum[1], SIG_SHIFT); y[i+2] = SROUND16(sum[2], SIG_SHIFT); @@ -222,8 +232,17 @@ void celt_iir(const opus_val32 *_x, sum[1]=_x[i+1]; sum[2]=_x[i+2]; sum[3]=_x[i+3]; - xcorr_kernel(rden, y+i, sum, ord, arch); - +#if defined(OPUS_CHECK_ASM) && defined(FIXED_POINT) + { + opus_val32 sum_c[4]; + memcpy(sum_c, sum, sizeof(sum_c)); + xcorr_kernel_c(rden, y+i, sum_c, ord); +#endif + xcorr_kernel(rden, y+i, sum, ord, arch); +#if defined(OPUS_CHECK_ASM) && defined(FIXED_POINT) + celt_assert(memcmp(sum, sum_c, sizeof(sum)) == 0); + } +#endif /* Patch up the result to compensate for the fact that this is an IIR */ y[i+ord ] = -SROUND16(sum[0],SIG_SHIFT); _y[i ] = sum[0]; diff --git a/media/libopus/celt/celt_lpc.h b/media/libopus/celt/celt_lpc.h index a4c5fd6ea5..97dee82f02 100644 --- a/media/libopus/celt/celt_lpc.h +++ b/media/libopus/celt/celt_lpc.h @@ -35,7 +35,7 @@ #include "x86/celt_lpc_sse.h" #endif -#define LPC_ORDER 24 +#define CELT_LPC_ORDER 24 void _celt_lpc(opus_val16 *_lpc, const opus_val32 *ac, int p); diff --git a/media/libopus/celt/cpu_support.h b/media/libopus/celt/cpu_support.h index 7b5c56ca90..9f13d8aecf 100644 --- a/media/libopus/celt/cpu_support.h +++ b/media/libopus/celt/cpu_support.h @@ -35,19 +35,20 @@ (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)) #include "arm/armcpu.h" -/* We currently support 4 ARM variants: +/* We currently support 5 ARM variants: * arch[0] -> ARMv4 * arch[1] -> ARMv5E * arch[2] -> ARMv6 * arch[3] -> NEON + * arch[4] -> NEON+DOTPROD */ -#define OPUS_ARCHMASK 3 +#define OPUS_ARCHMASK 7 #elif defined(OPUS_HAVE_RTCD) && \ ((defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \ (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \ (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \ - (defined(OPUS_X86_MAY_HAVE_AVX) && !defined(OPUS_X86_PRESUME_AVX))) + (defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(OPUS_X86_PRESUME_AVX2))) #include "x86/x86cpu.h" /* We currently support 5 x86 variants: diff --git a/media/libopus/celt/entdec.c b/media/libopus/celt/entdec.c index 0b3433ed8b..027aa24bca 100644 --- a/media/libopus/celt/entdec.c +++ b/media/libopus/celt/entdec.c @@ -195,6 +195,27 @@ int ec_dec_icdf(ec_dec *_this,const unsigned char *_icdf,unsigned _ftb){ return ret; } +int ec_dec_icdf16(ec_dec *_this,const opus_uint16 *_icdf,unsigned _ftb){ + opus_uint32 r; + opus_uint32 d; + opus_uint32 s; + opus_uint32 t; + int ret; + s=_this->rng; + d=_this->val; + r=s>>_ftb; + ret=-1; + do{ + t=s; + s=IMUL32(r,_icdf[++ret]); + } + while(d<s); + _this->val=d-s; + _this->rng=t-s; + ec_dec_normalize(_this); + return ret; +} + opus_uint32 ec_dec_uint(ec_dec *_this,opus_uint32 _ft){ unsigned ft; unsigned s; diff --git a/media/libopus/celt/entdec.h b/media/libopus/celt/entdec.h index 025fc1870d..c81f26fdb2 100644 --- a/media/libopus/celt/entdec.h +++ b/media/libopus/celt/entdec.h @@ -81,6 +81,16 @@ int ec_dec_bit_logp(ec_dec *_this,unsigned _logp); Return: The decoded symbol s.*/ int ec_dec_icdf(ec_dec *_this,const unsigned char *_icdf,unsigned _ftb); +/*Decodes a symbol given an "inverse" CDF table. + No call to ec_dec_update() is necessary after this call. + _icdf: The "inverse" CDF, such that symbol s falls in the range + [s>0?ft-_icdf[s-1]:0,ft-_icdf[s]), where ft=1<<_ftb. + The values must be monotonically non-increasing, and the last value + must be 0. + _ftb: The number of bits of precision in the cumulative distribution. + Return: The decoded symbol s.*/ +int ec_dec_icdf16(ec_dec *_this,const opus_uint16 *_icdf,unsigned _ftb); + /*Extracts a raw unsigned integer with a non-power-of-2 range from the stream. The bits must have been encoded with ec_enc_uint(). No call to ec_dec_update() is necessary after this call. diff --git a/media/libopus/celt/entenc.c b/media/libopus/celt/entenc.c index f1750d25b8..69c6f835d0 100644 --- a/media/libopus/celt/entenc.c +++ b/media/libopus/celt/entenc.c @@ -172,6 +172,17 @@ void ec_enc_icdf(ec_enc *_this,int _s,const unsigned char *_icdf,unsigned _ftb){ ec_enc_normalize(_this); } +void ec_enc_icdf16(ec_enc *_this,int _s,const opus_uint16 *_icdf,unsigned _ftb){ + opus_uint32 r; + r=_this->rng>>_ftb; + if(_s>0){ + _this->val+=_this->rng-IMUL32(r,_icdf[_s-1]); + _this->rng=IMUL32(r,_icdf[_s-1]-_icdf[_s]); + } + else _this->rng-=IMUL32(r,_icdf[_s]); + ec_enc_normalize(_this); +} + void ec_enc_uint(ec_enc *_this,opus_uint32 _fl,opus_uint32 _ft){ unsigned ft; unsigned fl; diff --git a/media/libopus/celt/entenc.h b/media/libopus/celt/entenc.h index f502eaf662..010874bbc1 100644 --- a/media/libopus/celt/entenc.h +++ b/media/libopus/celt/entenc.h @@ -64,6 +64,15 @@ void ec_enc_bit_logp(ec_enc *_this,int _val,unsigned _logp); _ftb: The number of bits of precision in the cumulative distribution.*/ void ec_enc_icdf(ec_enc *_this,int _s,const unsigned char *_icdf,unsigned _ftb); +/*Encodes a symbol given an "inverse" CDF table. + _s: The index of the symbol to encode. + _icdf: The "inverse" CDF, such that symbol _s falls in the range + [_s>0?ft-_icdf[_s-1]:0,ft-_icdf[_s]), where ft=1<<_ftb. + The values must be monotonically non-increasing, and the last value + must be 0. + _ftb: The number of bits of precision in the cumulative distribution.*/ +void ec_enc_icdf16(ec_enc *_this,int _s,const opus_uint16 *_icdf,unsigned _ftb); + /*Encodes a raw unsigned integer in the stream. _fl: The integer to encode. _ft: The number of integers that can be encoded (one more than the max). diff --git a/media/libopus/celt/laplace.c b/media/libopus/celt/laplace.c index a7bca874b6..2180966662 100644 --- a/media/libopus/celt/laplace.c +++ b/media/libopus/celt/laplace.c @@ -132,3 +132,104 @@ int ec_laplace_decode(ec_dec *dec, unsigned fs, int decay) ec_dec_update(dec, fl, IMIN(fl+fs,32768), 32768); return val; } + +void ec_laplace_encode_p0(ec_enc *enc, int value, opus_uint16 p0, opus_uint16 decay) +{ + int s; + opus_uint16 sign_icdf[3]; + sign_icdf[0] = 32768-p0; + sign_icdf[1] = sign_icdf[0]/2; + sign_icdf[2] = 0; + s = value == 0 ? 0 : (value > 0 ? 1 : 2); + ec_enc_icdf16(enc, s, sign_icdf, 15); + value = abs(value); + if (value) + { + int i; + opus_uint16 icdf[8]; + icdf[0] = IMAX(7, decay); + for (i=1;i<7;i++) + { + icdf[i] = IMAX(7-i, (icdf[i-1] * (opus_int32)decay) >> 15); + } + icdf[7] = 0; + value--; + do { + ec_enc_icdf16(enc, IMIN(value, 7), icdf, 15); + value -= 7; + } while (value >= 0); + } +} + +int ec_laplace_decode_p0(ec_dec *dec, opus_uint16 p0, opus_uint16 decay) +{ + int s; + int value; + opus_uint16 sign_icdf[3]; + sign_icdf[0] = 32768-p0; + sign_icdf[1] = sign_icdf[0]/2; + sign_icdf[2] = 0; + s = ec_dec_icdf16(dec, sign_icdf, 15); + if (s==2) s = -1; + if (s != 0) + { + int i; + int v; + opus_uint16 icdf[8]; + icdf[0] = IMAX(7, decay); + for (i=1;i<7;i++) + { + icdf[i] = IMAX(7-i, (icdf[i-1] * (opus_int32)decay) >> 15); + } + icdf[7] = 0; + value = 1; + do { + v = ec_dec_icdf16(dec, icdf, 15); + value += v; + } while (v == 7); + return s*value; + } else return 0; +} + +#if 0 + +#include <stdio.h> +#define NB_VALS 10 +#define DATA_SIZE 10000 +int main() { + ec_enc enc; + ec_dec dec; + unsigned char *ptr; + int i; + int decay, p0; + int val[NB_VALS] = {6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + /*for (i=0;i<NB_VALS;i++) { + val[i] = -log(rand()/(float)RAND_MAX); + if (rand()%2) val[i] = -val[i]; + }*/ + p0 = 16000; + decay = 16000; + ptr = (unsigned char *)malloc(DATA_SIZE); + ec_enc_init(&enc,ptr,DATA_SIZE); + for (i=0;i<NB_VALS;i++) { + printf("%d ", val[i]); + } + printf("\n"); + for (i=0;i<NB_VALS;i++) { + ec_laplace_encode_p0(&enc, val[i], p0, decay); + } + + ec_enc_done(&enc); + + ec_dec_init(&dec,ec_get_buffer(&enc),ec_range_bytes(&enc)); + + for (i=0;i<NB_VALS;i++) { + val[i] = ec_laplace_decode_p0(&dec, p0, decay); + } + for (i=0;i<NB_VALS;i++) { + printf("%d ", val[i]); + } + printf("\n"); +} + +#endif diff --git a/media/libopus/celt/laplace.h b/media/libopus/celt/laplace.h index 46c14b5da5..8010ad9755 100644 --- a/media/libopus/celt/laplace.h +++ b/media/libopus/celt/laplace.h @@ -26,6 +26,9 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#ifndef LAPLACE_H +#define LAPLACE_H + #include "entenc.h" #include "entdec.h" @@ -46,3 +49,9 @@ void ec_laplace_encode(ec_enc *enc, int *value, unsigned fs, int decay); @return Value decoded */ int ec_laplace_decode(ec_dec *dec, unsigned fs, int decay); + + +int ec_laplace_decode_p0(ec_dec *dec, opus_uint16 p0, opus_uint16 decay); +void ec_laplace_encode_p0(ec_enc *enc, int value, opus_uint16 p0, opus_uint16 decay); + +#endif diff --git a/media/libopus/celt/mathops.h b/media/libopus/celt/mathops.h index 478ac9187c..e2eece2937 100644 --- a/media/libopus/celt/mathops.h +++ b/media/libopus/celt/mathops.h @@ -230,6 +230,12 @@ static OPUS_INLINE opus_val32 celt_exp2_frac(opus_val16 x) frac = SHL16(x, 4); return ADD16(D0, MULT16_16_Q15(frac, ADD16(D1, MULT16_16_Q15(frac, ADD16(D2 , MULT16_16_Q15(D3,frac)))))); } + +#undef D0 +#undef D1 +#undef D2 +#undef D3 + /** Base-2 exponential approximation (2^x). (Q10 input, Q16 output) */ static OPUS_INLINE opus_val32 celt_exp2(opus_val16 x) { diff --git a/media/libopus/celt/mips/celt_mipsr1.h b/media/libopus/celt/mips/celt_mipsr1.h index c332fe0471..d1b25c204d 100644 --- a/media/libopus/celt/mips/celt_mipsr1.h +++ b/media/libopus/celt/mips/celt_mipsr1.h @@ -27,8 +27,8 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef __CELT_MIPSR1_H__ -#define __CELT_MIPSR1_H__ +#ifndef CELT_MIPSR1_H__ +#define CELT_MIPSR1_H__ #ifdef HAVE_CONFIG_H #include "config.h" @@ -149,4 +149,4 @@ void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N, } } -#endif /* __CELT_MIPSR1_H__ */ +#endif /* CELT_MIPSR1_H__ */ diff --git a/media/libopus/celt/mips/mdct_mipsr1.h b/media/libopus/celt/mips/mdct_mipsr1.h index 2934dab776..7456c181a5 100644 --- a/media/libopus/celt/mips/mdct_mipsr1.h +++ b/media/libopus/celt/mips/mdct_mipsr1.h @@ -38,8 +38,8 @@ MDCT implementation in FFMPEG, but has differences in signs, ordering and scaling in many places. */ -#ifndef __MDCT_MIPSR1_H__ -#define __MDCT_MIPSR1_H__ +#ifndef MDCT_MIPSR1_H__ +#define MDCT_MIPSR1_H__ #ifndef SKIP_CONFIG_H #ifdef HAVE_CONFIG_H @@ -285,4 +285,4 @@ void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scala } } } -#endif /* __MDCT_MIPSR1_H__ */ +#endif /* MDCT_MIPSR1_H__ */ diff --git a/media/libopus/celt/mips/vq_mipsr1.h b/media/libopus/celt/mips/vq_mipsr1.h index f26a33e755..1621c5624f 100644 --- a/media/libopus/celt/mips/vq_mipsr1.h +++ b/media/libopus/celt/mips/vq_mipsr1.h @@ -26,8 +26,8 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef __VQ_MIPSR1_H__ -#define __VQ_MIPSR1_H__ +#ifndef VQ_MIPSR1_H__ +#define VQ_MIPSR1_H__ #ifdef HAVE_CONFIG_H #include "config.h" @@ -113,4 +113,4 @@ void renormalise_vector(celt_norm *X, int N, opus_val16 gain, int arch) /*return celt_sqrt(E);*/ } -#endif /* __VQ_MIPSR1_H__ */ +#endif /* VQ_MIPSR1_H__ */ diff --git a/media/libopus/celt/os_support.h b/media/libopus/celt/os_support.h index 009bf861da..7d2d378116 100644 --- a/media/libopus/celt/os_support.h +++ b/media/libopus/celt/os_support.h @@ -41,7 +41,7 @@ #include <string.h> #include <stdlib.h> -/** Opus wrapper for malloc(). To do your own dynamic allocation, all you need to do is replace this function and opus_free */ +/** Opus wrapper for malloc(). To do your own dynamic allocation replace this function, opus_realloc, and opus_free */ #ifndef OVERRIDE_OPUS_ALLOC static OPUS_INLINE void *opus_alloc (size_t size) { @@ -49,7 +49,15 @@ static OPUS_INLINE void *opus_alloc (size_t size) } #endif -/** Same as celt_alloc(), except that the area is only needed inside a CELT call (might cause problem with wideband though) */ +#ifndef OVERRIDE_OPUS_REALLOC +static OPUS_INLINE void *opus_realloc (void *ptr, size_t size) +{ + return realloc(ptr, size); +} +#endif + +/** Used only for non-threadsafe pseudostack. + If desired, this can always return the same area of memory rather than allocating a new one every time. */ #ifndef OVERRIDE_OPUS_ALLOC_SCRATCH static OPUS_INLINE void *opus_alloc_scratch (size_t size) { @@ -58,7 +66,7 @@ static OPUS_INLINE void *opus_alloc_scratch (size_t size) } #endif -/** Opus wrapper for free(). To do your own dynamic allocation, all you need to do is replace this function and opus_alloc */ +/** Opus wrapper for free(). To do your own dynamic allocation replace this function, opus_realloc, and opus_free */ #ifndef OVERRIDE_OPUS_FREE static OPUS_INLINE void opus_free (void *ptr) { diff --git a/media/libopus/celt/pitch.c b/media/libopus/celt/pitch.c index 7998db4164..e33c60a3bf 100644 --- a/media/libopus/celt/pitch.c +++ b/media/libopus/celt/pitch.c @@ -262,7 +262,16 @@ celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y, for (i=0;i<max_pitch-3;i+=4) { opus_val32 sum[4]={0,0,0,0}; - xcorr_kernel(_x, _y+i, sum, len, arch); +#if defined(OPUS_CHECK_ASM) && defined(FIXED_POINT) + { + opus_val32 sum_c[4]={0,0,0,0}; + xcorr_kernel_c(_x, _y+i, sum_c, len); +#endif + xcorr_kernel(_x, _y+i, sum, len, arch); +#if defined(OPUS_CHECK_ASM) && defined(FIXED_POINT) + celt_assert(memcmp(sum, sum_c, sizeof(sum)) == 0); + } +#endif xcorr[i]=sum[0]; xcorr[i+1]=sum[1]; xcorr[i+2]=sum[2]; diff --git a/media/libopus/celt/pitch.h b/media/libopus/celt/pitch.h index e425f56aea..dd0e2bebd2 100644 --- a/media/libopus/celt/pitch.h +++ b/media/libopus/celt/pitch.h @@ -189,4 +189,15 @@ celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y, # define celt_pitch_xcorr celt_pitch_xcorr_c #endif +#ifdef NON_STATIC_COMB_FILTER_CONST_C +void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N, + opus_val16 g10, opus_val16 g11, opus_val16 g12); +#endif + +#ifndef OVERRIDE_COMB_FILTER_CONST +# define comb_filter_const(y, x, T, N, g10, g11, g12, arch) \ + ((void)(arch),comb_filter_const_c(y, x, T, N, g10, g11, g12)) +#endif + + #endif diff --git a/media/libopus/celt/stack_alloc.h b/media/libopus/celt/stack_alloc.h index ae40e2a165..e2739bdf66 100644 --- a/media/libopus/celt/stack_alloc.h +++ b/media/libopus/celt/stack_alloc.h @@ -141,7 +141,7 @@ extern char *global_stack_top; #else #define ALIGN(stack, size) ((stack) += ((size) - (long)(stack)) & ((size) - 1)) -#define PUSH(stack, size, type) (ALIGN((stack),sizeof(type)/sizeof(char)),(stack)+=(size)*(sizeof(type)/sizeof(char)),(type*)((stack)-(size)*(sizeof(type)/sizeof(char)))) +#define PUSH(stack, size, type) (ALIGN((stack),sizeof(type)/(sizeof(char))),(stack)+=(size)*(sizeof(type)/(sizeof(char))),(type*)((stack)-(size)*(sizeof(type)/(sizeof(char))))) #if 0 /* Set this to 1 to instrument pseudostack usage */ #define RESTORE_STACK (printf("%ld %s:%d\n", global_stack-scratch_ptr, __FILE__, __LINE__),global_stack = _saved_stack) #else diff --git a/media/libopus/celt/x86/celt_lpc_sse4_1.c b/media/libopus/celt/x86/celt_lpc_sse4_1.c index 5478568849..daf59d245a 100644 --- a/media/libopus/celt/x86/celt_lpc_sse4_1.c +++ b/media/libopus/celt/x86/celt_lpc_sse4_1.c @@ -64,9 +64,16 @@ void celt_fir_sse4_1(const opus_val16 *x, { opus_val32 sums[4] = {0}; __m128i vecSum, vecX; - - xcorr_kernel(rnum, x+i-ord, sums, ord, arch); - +#if defined(OPUS_CHECK_ASM) + { + opus_val32 sums_c[4] = {0}; + xcorr_kernel_c(rnum, x+i-ord, sums_c, ord); +#endif + xcorr_kernel(rnum, x+i-ord, sums, ord, arch); +#if defined(OPUS_CHECK_ASM) + celt_assert(memcmp(sums, sums_c, sizeof(sums)) == 0); + } +#endif vecSum = _mm_loadu_si128((__m128i *)sums); vecSum = _mm_add_epi32(vecSum, vecNoA); vecSum = _mm_srai_epi32(vecSum, SIG_SHIFT); diff --git a/media/libopus/celt/x86/pitch_avx.c b/media/libopus/celt/x86/pitch_avx.c new file mode 100644 index 0000000000..f731762d84 --- /dev/null +++ b/media/libopus/celt/x86/pitch_avx.c @@ -0,0 +1,101 @@ +/* Copyright (c) 2023 Amazon */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + + +#include <immintrin.h> +#include "x86cpu.h" +#include "pitch.h" + +#if defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(FIXED_POINT) + +/* Like the "regular" xcorr_kernel(), but computes 8 results at a time. */ +static void xcorr_kernel_avx(const float *x, const float *y, float sum[8], int len) +{ + __m256 xsum0, xsum1, xsum2, xsum3, xsum4, xsum5, xsum6, xsum7; + xsum7 = xsum6 = xsum5 = xsum4 = xsum3 = xsum2 = xsum1 = xsum0 = _mm256_setzero_ps(); + int i; + __m256 x0; + /* Compute 8 inner products using partial sums. */ + for (i=0;i<len-7;i+=8) + { + x0 = _mm256_loadu_ps(x+i); + xsum0 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i ), xsum0); + xsum1 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+1), xsum1); + xsum2 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+2), xsum2); + xsum3 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+3), xsum3); + xsum4 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+4), xsum4); + xsum5 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+5), xsum5); + xsum6 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+6), xsum6); + xsum7 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+7), xsum7); + } + if (i != len) { + static const int mask[15] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0}; + __m256i m; + m = _mm256_loadu_si256((__m256i*)(void*)(mask + 7+i-len)); + x0 = _mm256_maskload_ps(x+i, m); + xsum0 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i , m), xsum0); + xsum1 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+1, m), xsum1); + xsum2 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+2, m), xsum2); + xsum3 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+3, m), xsum3); + xsum4 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+4, m), xsum4); + xsum5 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+5, m), xsum5); + xsum6 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+6, m), xsum6); + xsum7 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+7, m), xsum7); + } + /* 8 horizontal adds. */ + /* Compute [0 4] [1 5] [2 6] [3 7] */ + xsum0 = _mm256_add_ps(_mm256_permute2f128_ps(xsum0, xsum4, 2<<4), _mm256_permute2f128_ps(xsum0, xsum4, 1 | (3<<4))); + xsum1 = _mm256_add_ps(_mm256_permute2f128_ps(xsum1, xsum5, 2<<4), _mm256_permute2f128_ps(xsum1, xsum5, 1 | (3<<4))); + xsum2 = _mm256_add_ps(_mm256_permute2f128_ps(xsum2, xsum6, 2<<4), _mm256_permute2f128_ps(xsum2, xsum6, 1 | (3<<4))); + xsum3 = _mm256_add_ps(_mm256_permute2f128_ps(xsum3, xsum7, 2<<4), _mm256_permute2f128_ps(xsum3, xsum7, 1 | (3<<4))); + /* Compute [0 1 4 5] [2 3 6 7] */ + xsum0 = _mm256_hadd_ps(xsum0, xsum1); + xsum1 = _mm256_hadd_ps(xsum2, xsum3); + /* Compute [0 1 2 3 4 5 6 7] */ + xsum0 = _mm256_hadd_ps(xsum0, xsum1); + _mm256_storeu_ps(sum, xsum0); +} + +void celt_pitch_xcorr_avx2(const float *_x, const float *_y, float *xcorr, int len, int max_pitch, int arch) +{ + int i; + celt_assert(max_pitch>0); + (void)arch; + for (i=0;i<max_pitch-7;i+=8) + { + xcorr_kernel_avx(_x, _y+i, &xcorr[i], len); + } + for (;i<max_pitch;i++) + { + xcorr[i] = celt_inner_prod(_x, _y+i, len, arch); + } +} + +#endif diff --git a/media/libopus/celt/x86/pitch_sse.h b/media/libopus/celt/x86/pitch_sse.h index 964aef50db..127581f3e1 100644 --- a/media/libopus/celt/x86/pitch_sse.h +++ b/media/libopus/celt/x86/pitch_sse.h @@ -131,12 +131,6 @@ extern opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])( #if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT) -#define OVERRIDE_DUAL_INNER_PROD -#define OVERRIDE_COMB_FILTER_CONST - -#undef dual_inner_prod -#undef comb_filter_const - void dual_inner_prod_sse(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, @@ -154,13 +148,17 @@ void comb_filter_const_sse(opus_val32 *y, #if defined(OPUS_X86_PRESUME_SSE) +#define OVERRIDE_DUAL_INNER_PROD +#define OVERRIDE_COMB_FILTER_CONST # define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) \ ((void)(arch),dual_inner_prod_sse(x, y01, y02, N, xy1, xy2)) # define comb_filter_const(y, x, T, N, g10, g11, g12, arch) \ ((void)(arch),comb_filter_const_sse(y, x, T, N, g10, g11, g12)) -#else +#elif defined(OPUS_HAVE_RTCD) +#define OVERRIDE_DUAL_INNER_PROD +#define OVERRIDE_COMB_FILTER_CONST extern void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])( const opus_val16 *x, const opus_val16 *y01, @@ -187,6 +185,32 @@ extern void (*const COMB_FILTER_CONST_IMPL[OPUS_ARCHMASK + 1])( #define NON_STATIC_COMB_FILTER_CONST_C #endif -#endif + +void celt_pitch_xcorr_avx2(const float *_x, const float *_y, float *xcorr, int len, int max_pitch, int arch); + +#if defined(OPUS_X86_PRESUME_AVX2) + +#define OVERRIDE_PITCH_XCORR +# define celt_pitch_xcorr celt_pitch_xcorr_avx2 + +#elif defined(OPUS_HAVE_RTCD) && defined(OPUS_X86_MAY_HAVE_AVX2) + +#define OVERRIDE_PITCH_XCORR +extern void (*const PITCH_XCORR_IMPL[OPUS_ARCHMASK + 1])( + const float *_x, + const float *_y, + float *xcorr, + int len, + int max_pitch, + int arch + ); + +#define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \ + ((*PITCH_XCORR_IMPL[(arch) & OPUS_ARCHMASK])(_x, _y, xcorr, len, max_pitch, arch)) + + +#endif /* OPUS_X86_PRESUME_AVX2 && !OPUS_HAVE_RTCD */ + +#endif /* OPUS_X86_MAY_HAVE_SSE && !FIXED_POINT */ #endif diff --git a/media/libopus/celt/x86/vq_sse.h b/media/libopus/celt/x86/vq_sse.h index b4efe8f249..444503b630 100644 --- a/media/libopus/celt/x86/vq_sse.h +++ b/media/libopus/celt/x86/vq_sse.h @@ -28,16 +28,18 @@ #define VQ_SSE_H #if defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(FIXED_POINT) -#define OVERRIDE_OP_PVQ_SEARCH opus_val16 op_pvq_search_sse2(celt_norm *_X, int *iy, int K, int N, int arch); #if defined(OPUS_X86_PRESUME_SSE2) + +#define OVERRIDE_OP_PVQ_SEARCH #define op_pvq_search(x, iy, K, N, arch) \ (op_pvq_search_sse2(x, iy, K, N, arch)) -#else +#elif defined(OPUS_HAVE_RTCD) +#define OVERRIDE_OP_PVQ_SEARCH extern opus_val16 (*const OP_PVQ_SEARCH_IMPL[OPUS_ARCHMASK + 1])( celt_norm *_X, int *iy, int K, int N, int arch); diff --git a/media/libopus/celt/x86/vq_sse2.c b/media/libopus/celt/x86/vq_sse2.c index 775042860d..4c4ebf8e2d 100644 --- a/media/libopus/celt/x86/vq_sse2.c +++ b/media/libopus/celt/x86/vq_sse2.c @@ -75,7 +75,7 @@ opus_val16 op_pvq_search_sse2(celt_norm *_X, int *iy, int K, int N, int arch) sums = _mm_add_ps(sums, x4); /* Clear y and iy in case we don't do the projection. */ _mm_storeu_ps(&y[j], _mm_setzero_ps()); - _mm_storeu_si128((__m128i*)&iy[j], _mm_setzero_si128()); + _mm_storeu_si128((__m128i*)(void*)&iy[j], _mm_setzero_si128()); _mm_storeu_ps(&X[j], x4); _mm_storeu_ps(&signy[j], s4); } @@ -116,7 +116,7 @@ opus_val16 op_pvq_search_sse2(celt_norm *_X, int *iy, int K, int N, int arch) rx4 = _mm_mul_ps(x4, rcp4); iy4 = _mm_cvttps_epi32(rx4); pulses_sum = _mm_add_epi32(pulses_sum, iy4); - _mm_storeu_si128((__m128i*)&iy[j], iy4); + _mm_storeu_si128((__m128i*)(void*)&iy[j], iy4); y4 = _mm_cvtepi32_ps(iy4); xy4 = _mm_add_ps(xy4, _mm_mul_ps(x4, y4)); yy4 = _mm_add_ps(yy4, _mm_mul_ps(y4, y4)); @@ -205,10 +205,10 @@ opus_val16 op_pvq_search_sse2(celt_norm *_X, int *iy, int K, int N, int arch) { __m128i y4; __m128i s4; - y4 = _mm_loadu_si128((__m128i*)&iy[j]); + y4 = _mm_loadu_si128((__m128i*)(void*)&iy[j]); s4 = _mm_castps_si128(_mm_loadu_ps(&signy[j])); y4 = _mm_xor_si128(_mm_add_epi32(y4, s4), s4); - _mm_storeu_si128((__m128i*)&iy[j], y4); + _mm_storeu_si128((__m128i*)(void*)&iy[j], y4); } RESTORE_STACK; return yy; diff --git a/media/libopus/celt/x86/x86_arch_macros.h b/media/libopus/celt/x86/x86_arch_macros.h new file mode 100644 index 0000000000..975b443e93 --- /dev/null +++ b/media/libopus/celt/x86/x86_arch_macros.h @@ -0,0 +1,47 @@ +/* Copyright (c) 2023 Amazon */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR + CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef _MSC_VER + +# ifdef OPUS_X86_MAY_HAVE_SSE +# ifndef __SSE__ +# define __SSE__ +# endif +# endif + +# ifdef OPUS_X86_MAY_HAVE_SSE2 +# ifndef __SSE2__ +# define __SSE2__ +# endif +# endif + +# ifdef OPUS_X86_MAY_HAVE_SSE4_1 +# ifndef __SSE4_1__ +# define __SSE4_1__ +# endif +# endif + +#endif diff --git a/media/libopus/celt/x86/x86_celt_map.c b/media/libopus/celt/x86/x86_celt_map.c index d39d88edec..ba8eafe6ad 100644 --- a/media/libopus/celt/x86/x86_celt_map.c +++ b/media/libopus/celt/x86/x86_celt_map.c @@ -90,6 +90,26 @@ opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])( # else +#if defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(OPUS_X86_PRESUME_AVX2) + +void (*const PITCH_XCORR_IMPL[OPUS_ARCHMASK + 1])( + const float *_x, + const float *_y, + float *xcorr, + int len, + int max_pitch, + int arch +) = { + celt_pitch_xcorr_c, /* non-sse */ + celt_pitch_xcorr_c, + celt_pitch_xcorr_c, + celt_pitch_xcorr_c, + MAY_HAVE_AVX2(celt_pitch_xcorr) +}; + +#endif + + #if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE) void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])( diff --git a/media/libopus/celt/x86/x86cpu.c b/media/libopus/celt/x86/x86cpu.c index 6a1914dee7..2e7c32aeec 100644 --- a/media/libopus/celt/x86/x86cpu.c +++ b/media/libopus/celt/x86/x86cpu.c @@ -39,7 +39,7 @@ ((defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \ (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \ (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \ - (defined(OPUS_X86_MAY_HAVE_AVX) && !defined(OPUS_X86_PRESUME_AVX))) + (defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(OPUS_X86_PRESUME_AVX2))) #if defined(_MSC_VER) @@ -105,7 +105,7 @@ typedef struct CPU_Feature{ int HW_SSE2; int HW_SSE41; /* SIMD: 256-bit */ - int HW_AVX; + int HW_AVX2; } CPU_Feature; static void opus_cpu_feature_check(CPU_Feature *cpu_feature) @@ -121,13 +121,19 @@ static void opus_cpu_feature_check(CPU_Feature *cpu_feature) cpu_feature->HW_SSE = (info[3] & (1 << 25)) != 0; cpu_feature->HW_SSE2 = (info[3] & (1 << 26)) != 0; cpu_feature->HW_SSE41 = (info[2] & (1 << 19)) != 0; - cpu_feature->HW_AVX = (info[2] & (1 << 28)) != 0; + cpu_feature->HW_AVX2 = (info[2] & (1 << 28)) != 0 && (info[2] & (1 << 12)) != 0; + if (cpu_feature->HW_AVX2 && nIds >= 7) { + cpuid(info, 7); + cpu_feature->HW_AVX2 = cpu_feature->HW_AVX2 && (info[1] & (1 << 5)) != 0; + } else { + cpu_feature->HW_AVX2 = 0; + } } else { cpu_feature->HW_SSE = 0; cpu_feature->HW_SSE2 = 0; cpu_feature->HW_SSE41 = 0; - cpu_feature->HW_AVX = 0; + cpu_feature->HW_AVX2 = 0; } } @@ -157,7 +163,7 @@ static int opus_select_arch_impl(void) } arch++; - if (!cpu_feature.HW_AVX) + if (!cpu_feature.HW_AVX2) { return arch; } diff --git a/media/libopus/celt/x86/x86cpu.h b/media/libopus/celt/x86/x86cpu.h index 04e80489b1..8ae9be8d8f 100644 --- a/media/libopus/celt/x86/x86cpu.h +++ b/media/libopus/celt/x86/x86cpu.h @@ -46,28 +46,53 @@ # define MAY_HAVE_SSE4_1(name) name ## _c # endif -# if defined(OPUS_X86_MAY_HAVE_AVX) -# define MAY_HAVE_AVX(name) name ## _avx +# if defined(OPUS_X86_MAY_HAVE_AVX2) +# define MAY_HAVE_AVX2(name) name ## _avx2 # else -# define MAY_HAVE_AVX(name) name ## _c +# define MAY_HAVE_AVX2(name) name ## _c # endif -# if defined(OPUS_HAVE_RTCD) +# if defined(OPUS_HAVE_RTCD) && \ + ((defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \ + (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \ + (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \ + (defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(OPUS_X86_PRESUME_AVX2))) int opus_select_arch(void); # endif +# if defined(OPUS_X86_MAY_HAVE_SSE2) +# include "opus_defines.h" + /*MOVD should not impose any alignment restrictions, but the C standard does, and UBSan will report errors if we actually make unaligned accesses. Use this to work around those restrictions (which should hopefully all get - optimized to a single MOVD instruction).*/ -#define OP_LOADU_EPI32(x) \ - (int)((*(unsigned char *)(x) | *((unsigned char *)(x) + 1) << 8U |\ - *((unsigned char *)(x) + 2) << 16U | (opus_uint32)*((unsigned char *)(x) + 3) << 24U)) + optimized to a single MOVD instruction). + GCC implemented _mm_loadu_si32() since GCC 11; HOWEVER, there is a bug! + https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99754 */ +# if !defined(_MSC_VER) && !OPUS_GNUC_PREREQ(11,3) && !(defined(__clang__) && (__clang_major__ >= 8)) +# include <string.h> +# include <emmintrin.h> + +# ifdef _mm_loadu_si32 +# undef _mm_loadu_si32 +# endif +# define _mm_loadu_si32 WORKAROUND_mm_loadu_si32 +static inline __m128i WORKAROUND_mm_loadu_si32(void const* mem_addr) { + int val; + memcpy(&val, mem_addr, sizeof(val)); + return _mm_cvtsi32_si128(val); +} +# elif defined(_MSC_VER) + /* MSVC needs this for _mm_loadu_si32 */ +# include <immintrin.h> +# endif -#define OP_CVTEPI8_EPI32_M32(x) \ - (_mm_cvtepi8_epi32(_mm_cvtsi32_si128(OP_LOADU_EPI32(x)))) +# define OP_CVTEPI8_EPI32_M32(x) \ + (_mm_cvtepi8_epi32(_mm_loadu_si32(x))) -#define OP_CVTEPI16_EPI32_M64(x) \ - (_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(x)))) +# define OP_CVTEPI16_EPI32_M64(x) \ + (_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(void*)(x)))) + +# endif #endif |