diff options
Diffstat (limited to 'media/libopus/silk')
42 files changed, 1659 insertions, 133 deletions
diff --git a/media/libopus/silk/API.h b/media/libopus/silk/API.h index 4d90ff9aa3..878965c73a 100644 --- a/media/libopus/silk/API.h +++ b/media/libopus/silk/API.h @@ -34,6 +34,10 @@ POSSIBILITY OF SUCH DAMAGE. #include "entenc.h" #include "entdec.h" +#ifdef ENABLE_DEEP_PLC +#include "lpcnet_private.h" +#endif + #ifdef __cplusplus extern "C" { @@ -88,6 +92,16 @@ opus_int silk_Encode( /* O Returns error co /* Decoder functions */ /****************************************/ + +/***********************************************/ +/* Load OSCE models from external data pointer */ +/***********************************************/ +opus_int silk_LoadOSCEModels( + void *decState, /* O I/O State */ + const unsigned char *data, /* I pointer to binary blob */ + int len /* I length of binary blob data */ +); + /***********************************************/ /* Get size in bytes of the Silk decoder state */ /***********************************************/ @@ -96,8 +110,12 @@ opus_int silk_Get_Decoder_Size( /* O Returns error co ); /*************************/ -/* Init or Reset decoder */ +/* Init and Reset decoder */ /*************************/ +opus_int silk_ResetDecoder( /* O Returns error code */ + void *decState /* I/O State */ +); + opus_int silk_InitDecoder( /* O Returns error code */ void *decState /* I/O State */ ); @@ -113,6 +131,9 @@ opus_int silk_Decode( /* O Returns error co ec_dec *psRangeDec, /* I/O Compressor data structure */ opus_int16 *samplesOut, /* O Decoded output speech vector */ opus_int32 *nSamplesOut, /* O Number of samples decoded */ +#ifdef ENABLE_DEEP_PLC + LPCNetPLCState *lpcnet, +#endif int arch /* I Run-time architecture */ ); diff --git a/media/libopus/silk/NSQ.c b/media/libopus/silk/NSQ.c index c99ec5bce5..1caa829bbe 100644 --- a/media/libopus/silk/NSQ.c +++ b/media/libopus/silk/NSQ.c @@ -80,7 +80,7 @@ void silk_NSQ_c SideInfoIndices *psIndices, /* I/O Quantization Indices */ const opus_int16 x16[], /* I Input */ opus_int8 pulses[], /* O Quantized pulse signal */ - const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ + const opus_int16 *PredCoef_Q12, /* I Short term prediction coefs */ const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ diff --git a/media/libopus/silk/NSQ_del_dec.c b/media/libopus/silk/NSQ_del_dec.c index 77f72cec3a..e8dadf1591 100644 --- a/media/libopus/silk/NSQ_del_dec.c +++ b/media/libopus/silk/NSQ_del_dec.c @@ -120,7 +120,7 @@ void silk_NSQ_del_dec_c( SideInfoIndices *psIndices, /* I/O Quantization Indices */ const opus_int16 x16[], /* I Input */ opus_int8 pulses[], /* O Quantized pulse signal */ - const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ + const opus_int16 *PredCoef_Q12, /* I Short term prediction coefs */ const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ diff --git a/media/libopus/silk/PLC.c b/media/libopus/silk/PLC.c index 4667440db2..b35bf750a0 100644 --- a/media/libopus/silk/PLC.c +++ b/media/libopus/silk/PLC.c @@ -33,6 +33,10 @@ POSSIBILITY OF SUCH DAMAGE. #include "stack_alloc.h" #include "PLC.h" +#ifdef ENABLE_DEEP_PLC +#include "lpcnet.h" +#endif + #define NB_ATT 2 static const opus_int16 HARM_ATT_Q15[NB_ATT] = { 32440, 31130 }; /* 0.99, 0.95 */ static const opus_int16 PLC_RAND_ATTENUATE_V_Q15[NB_ATT] = { 31130, 26214 }; /* 0.95, 0.8 */ @@ -47,6 +51,9 @@ static OPUS_INLINE void silk_PLC_conceal( silk_decoder_state *psDec, /* I/O Decoder state */ silk_decoder_control *psDecCtrl, /* I/O Decoder control */ opus_int16 frame[], /* O LPC residual signal */ +#ifdef ENABLE_DEEP_PLC + LPCNetPLCState *lpcnet, +#endif int arch /* I Run-time architecture */ ); @@ -67,6 +74,9 @@ void silk_PLC( silk_decoder_control *psDecCtrl, /* I/O Decoder control */ opus_int16 frame[], /* I/O signal */ opus_int lost, /* I Loss flag */ +#ifdef ENABLE_DEEP_PLC + LPCNetPLCState *lpcnet, +#endif int arch /* I Run-time architecture */ ) { @@ -80,7 +90,11 @@ void silk_PLC( /****************************/ /* Generate Signal */ /****************************/ - silk_PLC_conceal( psDec, psDecCtrl, frame, arch ); + silk_PLC_conceal( psDec, psDecCtrl, frame, +#ifdef ENABLE_DEEP_PLC + lpcnet, +#endif + arch ); psDec->lossCnt++; } else { @@ -88,6 +102,14 @@ void silk_PLC( /* Update state */ /****************************/ silk_PLC_update( psDec, psDecCtrl ); +#ifdef ENABLE_DEEP_PLC + if ( lpcnet != NULL && psDec->sPLC.fs_kHz == 16 ) { + int k; + for( k = 0; k < psDec->nb_subfr; k += 2 ) { + lpcnet_plc_update( lpcnet, frame + k * psDec->subfr_length ); + } + } +#endif } } @@ -195,6 +217,9 @@ static OPUS_INLINE void silk_PLC_conceal( silk_decoder_state *psDec, /* I/O Decoder state */ silk_decoder_control *psDecCtrl, /* I/O Decoder control */ opus_int16 frame[], /* O LPC residual signal */ +#ifdef ENABLE_DEEP_PLC + LPCNetPLCState *lpcnet, +#endif int arch /* I Run-time architecture */ ) { @@ -371,6 +396,24 @@ static OPUS_INLINE void silk_PLC_conceal( /* Scale with Gain */ frame[ i ] = (opus_int16)silk_SAT16( silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( sLPC_Q14_ptr[ MAX_LPC_ORDER + i ], prevGain_Q10[ 1 ] ), 8 ) ) ); } +#ifdef ENABLE_DEEP_PLC + if ( lpcnet != NULL && lpcnet->loaded && psDec->sPLC.fs_kHz == 16 ) { + int run_deep_plc = psDec->sPLC.enable_deep_plc || lpcnet->fec_fill_pos != 0; + if( run_deep_plc ) { + for( k = 0; k < psDec->nb_subfr; k += 2 ) { + lpcnet_plc_conceal( lpcnet, frame + k * psDec->subfr_length ); + } + /* We *should* be able to copy only from psDec->frame_length-MAX_LPC_ORDER, i.e. the last MAX_LPC_ORDER samples. */ + for( i = 0; i < psDec->frame_length; i++ ) { + sLPC_Q14_ptr[ MAX_LPC_ORDER + i ] = (int)floor(.5 + frame[ i ] * (float)(1 << 24) / prevGain_Q10[ 1 ] ); + } + } else { + for( k = 0; k < psDec->nb_subfr; k += 2 ) { + lpcnet_plc_update( lpcnet, frame + k * psDec->subfr_length ); + } + } + } +#endif /* Save LPC state */ silk_memcpy( psDec->sLPC_Q14_buf, &sLPC_Q14_ptr[ psDec->frame_length ], MAX_LPC_ORDER * sizeof( opus_int32 ) ); @@ -431,12 +474,16 @@ void silk_PLC_glue_frames( slope_Q16 = silk_DIV32_16( ( (opus_int32)1 << 16 ) - gain_Q16, length ); /* Make slope 4x steeper to avoid missing onsets after DTX */ slope_Q16 = silk_LSHIFT( slope_Q16, 2 ); - - for( i = 0; i < length; i++ ) { - frame[ i ] = silk_SMULWB( gain_Q16, frame[ i ] ); - gain_Q16 += slope_Q16; - if( gain_Q16 > (opus_int32)1 << 16 ) { - break; +#ifdef ENABLE_DEEP_PLC + if ( psDec->sPLC.fs_kHz != 16 ) +#endif + { + for( i = 0; i < length; i++ ) { + frame[ i ] = silk_SMULWB( gain_Q16, frame[ i ] ); + gain_Q16 += slope_Q16; + if( gain_Q16 > (opus_int32)1 << 16 ) { + break; + } } } } diff --git a/media/libopus/silk/PLC.h b/media/libopus/silk/PLC.h index 6438f51633..1bebb78638 100644 --- a/media/libopus/silk/PLC.h +++ b/media/libopus/silk/PLC.h @@ -49,6 +49,9 @@ void silk_PLC( silk_decoder_control *psDecCtrl, /* I/O Decoder control */ opus_int16 frame[], /* I/O signal */ opus_int lost, /* I Loss flag */ +#ifdef ENABLE_DEEP_PLC + LPCNetPLCState *lpcnet, +#endif int arch /* I Run-time architecture */ ); diff --git a/media/libopus/silk/arm/NSQ_del_dec_arm.h b/media/libopus/silk/arm/NSQ_del_dec_arm.h index 9e76e16927..0c4fcfccb4 100644 --- a/media/libopus/silk/arm/NSQ_del_dec_arm.h +++ b/media/libopus/silk/arm/NSQ_del_dec_arm.h @@ -34,7 +34,7 @@ POSSIBILITY OF SUCH DAMAGE. void silk_NSQ_del_dec_neon( const silk_encoder_state *psEncC, silk_nsq_state *NSQ, SideInfoIndices *psIndices, const opus_int16 x16[], opus_int8 pulses[], - const opus_int16 PredCoef_Q12[2 * MAX_LPC_ORDER], + const opus_int16 *PredCoef_Q12, const opus_int16 LTPCoef_Q14[LTP_ORDER * MAX_NB_SUBFR], const opus_int16 AR_Q13[MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER], const opus_int HarmShapeGain_Q14[MAX_NB_SUBFR], @@ -65,7 +65,7 @@ void silk_NSQ_del_dec_neon( extern void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])( const silk_encoder_state *psEncC, silk_nsq_state *NSQ, SideInfoIndices *psIndices, const opus_int16 x16[], opus_int8 pulses[], - const opus_int16 PredCoef_Q12[2 * MAX_LPC_ORDER], + const opus_int16 *PredCoef_Q12, const opus_int16 LTPCoef_Q14[LTP_ORDER * MAX_NB_SUBFR], const opus_int16 AR_Q13[MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER], const opus_int HarmShapeGain_Q14[MAX_NB_SUBFR], diff --git a/media/libopus/silk/arm/NSQ_del_dec_neon_intr.c b/media/libopus/silk/arm/NSQ_del_dec_neon_intr.c index 212410f362..668dde6dc4 100644 --- a/media/libopus/silk/arm/NSQ_del_dec_neon_intr.c +++ b/media/libopus/silk/arm/NSQ_del_dec_neon_intr.c @@ -35,6 +35,7 @@ POSSIBILITY OF SUCH DAMAGE. #endif #include "main.h" #include "stack_alloc.h" +#include "os_support.h" /* NEON intrinsics optimization now can only parallelize up to 4 delay decision states. */ /* If there are more states, C function is called, and this optimization must be expanded. */ @@ -220,7 +221,7 @@ void silk_NSQ_del_dec_neon( SideInfoIndices *psIndices, /* I/O Quantization Indices */ const opus_int16 x16[], /* I Input */ opus_int8 pulses[], /* O Quantized pulse signal */ - const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ + const opus_int16 *PredCoef_Q12, /* I Short term prediction coefs */ const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ @@ -279,6 +280,7 @@ void silk_NSQ_del_dec_neon( /* Initialize delayed decision states */ ALLOC( psDelDec, 1, NSQ_del_decs_struct ); + OPUS_CLEAR(psDelDec, 1); /* Only RandState and RD_Q10 need to be initialized to 0. */ silk_memset( psDelDec->RandState, 0, sizeof( psDelDec->RandState ) ); vst1q_s32( psDelDec->RD_Q10, vdupq_n_s32( 0 ) ); @@ -587,6 +589,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_neon( silk_assert( nStatesDelayedDecision > 0 ); silk_assert( ( shapingLPCOrder & 1 ) == 0 ); /* check that order is even */ ALLOC( psSampleState, 2, NSQ_samples_struct ); + OPUS_CLEAR(psSampleState, 2); shp_lag_ptr = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ]; pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ]; @@ -711,23 +714,26 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_neon( const int rdo_offset = Lambda_Q10/2 - 512; const uint16x4_t greaterThanRdo = vcgt_s16( q1_Q10_s16x4, vdup_n_s16( rdo_offset ) ); const uint16x4_t lessThanMinusRdo = vclt_s16( q1_Q10_s16x4, vdup_n_s16( -rdo_offset ) ); + int16x4_t signed_offset = vbsl_s16( greaterThanRdo, vdup_n_s16( -rdo_offset ), vdup_n_s16( 0 ) ); + signed_offset = vbsl_s16( lessThanMinusRdo, vdup_n_s16( rdo_offset ), signed_offset ); /* If Lambda_Q10 > 32767, then q1_Q0, q1_Q10 and q2_Q10 must change to 32-bit. */ silk_assert( Lambda_Q10 <= 32767 ); q1_Q0_s16x4 = vreinterpret_s16_u16( vclt_s16( q1_Q10_s16x4, vdup_n_s16( 0 ) ) ); - q1_Q0_s16x4 = vbsl_s16( greaterThanRdo, vsub_s16( q1_Q10_s16x4, vdup_n_s16( rdo_offset ) ), q1_Q0_s16x4 ); - q1_Q0_s16x4 = vbsl_s16( lessThanMinusRdo, vadd_s16( q1_Q10_s16x4, vdup_n_s16( rdo_offset ) ), q1_Q0_s16x4 ); + q1_Q0_s16x4 = vbsl_s16(vorr_u16(greaterThanRdo, lessThanMinusRdo), vadd_s16( q1_Q10_s16x4 , signed_offset), q1_Q0_s16x4); q1_Q0_s16x4 = vshr_n_s16( q1_Q0_s16x4, 10 ); } { const uint16x4_t equal0_u16x4 = vceq_s16( q1_Q0_s16x4, vdup_n_s16( 0 ) ); const uint16x4_t equalMinus1_u16x4 = vceq_s16( q1_Q0_s16x4, vdup_n_s16( -1 ) ); const uint16x4_t lessThanMinus1_u16x4 = vclt_s16( q1_Q0_s16x4, vdup_n_s16( -1 ) ); - int16x4_t tmp1_s16x4, tmp2_s16x4; + int16x4_t tmp1_s16x4, tmp2_s16x4, tmp_summand_s16x4; q1_Q10_s16x4 = vshl_n_s16( q1_Q0_s16x4, 10 ); - tmp1_s16x4 = vadd_s16( q1_Q10_s16x4, vdup_n_s16( offset_Q10 - QUANT_LEVEL_ADJUST_Q10 ) ); - q1_Q10_s16x4 = vadd_s16( q1_Q10_s16x4, vdup_n_s16( offset_Q10 + QUANT_LEVEL_ADJUST_Q10 ) ); + tmp_summand_s16x4 = vand_s16( vreinterpret_s16_u16(vcge_s16(q1_Q0_s16x4, vdup_n_s16(0))), vdup_n_s16( offset_Q10 - QUANT_LEVEL_ADJUST_Q10 ) ); + tmp1_s16x4 = vadd_s16( q1_Q10_s16x4, tmp_summand_s16x4 ); + tmp_summand_s16x4 = vbsl_s16( lessThanMinus1_u16x4, vdup_n_s16( offset_Q10 + QUANT_LEVEL_ADJUST_Q10 ), vdup_n_s16(0) ); + q1_Q10_s16x4 = vadd_s16( q1_Q10_s16x4, tmp_summand_s16x4); q1_Q10_s16x4 = vbsl_s16( lessThanMinus1_u16x4, q1_Q10_s16x4, tmp1_s16x4 ); q1_Q10_s16x4 = vbsl_s16( equal0_u16x4, vdup_n_s16( offset_Q10 ), q1_Q10_s16x4 ); q1_Q10_s16x4 = vbsl_s16( equalMinus1_u16x4, vdup_n_s16( offset_Q10 - ( 1024 - QUANT_LEVEL_ADJUST_Q10 ) ), q1_Q10_s16x4 ); @@ -818,6 +824,13 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_neon( } } + /* clear unused part of RD_Q10 to avoid overflows */ + if( nStatesDelayedDecision < NEON_MAX_DEL_DEC_STATES ) + { + OPUS_CLEAR(psSampleState[0].RD_Q10 + nStatesDelayedDecision, NEON_MAX_DEL_DEC_STATES - nStatesDelayedDecision); + OPUS_CLEAR(psSampleState[1].RD_Q10 + nStatesDelayedDecision, NEON_MAX_DEL_DEC_STATES - nStatesDelayedDecision); + } + /* Increase RD values of expired states */ { uint32x4_t t_u32x4; @@ -896,7 +909,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_neon( vst1q_s32( psDelDec->Pred_Q15[ *smpl_buf_idx ], vshlq_n_s32( vld1q_s32( psSampleState[ 0 ].LPC_exc_Q14 ), 1 ) ); vst1q_s32( psDelDec->Shape_Q14[ *smpl_buf_idx ], vld1q_s32( psSampleState[ 0 ].sLTP_shp_Q14 ) ); tmp1_s32x4 = vrshrq_n_s32( tmp1_s32x4, 10 ); - tmp1_s32x4 = vaddq_s32( vld1q_s32( psDelDec->Seed ), tmp1_s32x4 ); + tmp1_s32x4 = vreinterpretq_s32_u32( vaddq_u32( vreinterpretq_u32_s32( + vld1q_s32( psDelDec->Seed ) ), vreinterpretq_u32_s32( tmp1_s32x4 ) ) ); vst1q_s32( psDelDec->Seed, tmp1_s32x4 ); vst1q_s32( psDelDec->RandState[ *smpl_buf_idx ], tmp1_s32x4 ); vst1q_s32( psDelDec->RD_Q10, vld1q_s32( psSampleState[ 0 ].RD_Q10 ) ); diff --git a/media/libopus/silk/arm/NSQ_neon.h b/media/libopus/silk/arm/NSQ_neon.h index b31d9442d6..f03d8ddd98 100644 --- a/media/libopus/silk/arm/NSQ_neon.h +++ b/media/libopus/silk/arm/NSQ_neon.h @@ -73,7 +73,7 @@ static OPUS_INLINE void silk_short_prediction_create_arch_coef_neon(opus_int32 * #elif defined(OPUS_HAVE_RTCD) && defined(OPUS_ARM_MAY_HAVE_NEON_INTR) #define silk_short_prediction_create_arch_coef(out, in, order) \ - do { if (arch == OPUS_ARCH_ARM_NEON) { silk_short_prediction_create_arch_coef_neon(out, in, order); } } while (0) + do { if (arch >= OPUS_ARCH_ARM_NEON) { silk_short_prediction_create_arch_coef_neon(out, in, order); } } while (0) #endif @@ -95,7 +95,7 @@ opus_int32 silk_NSQ_noise_shape_feedback_loop_neon(const opus_int32 *data0, opus (coef vs. coefRev) so can't use the usual IMPL table implementation */ #undef silk_noise_shape_quantizer_short_prediction #define silk_noise_shape_quantizer_short_prediction(in, coef, coefRev, order, arch) \ - (arch == OPUS_ARCH_ARM_NEON ? \ + (arch >= OPUS_ARCH_ARM_NEON ? \ silk_noise_shape_quantizer_short_prediction_neon(in, coefRev, order) : \ silk_noise_shape_quantizer_short_prediction_c(in, coef, order)) diff --git a/media/libopus/silk/arm/arm_silk_map.c b/media/libopus/silk/arm/arm_silk_map.c index 0b9bfec2ca..a91f79b59f 100644 --- a/media/libopus/silk/arm/arm_silk_map.c +++ b/media/libopus/silk/arm/arm_silk_map.c @@ -49,6 +49,7 @@ void (*const SILK_BIQUAD_ALT_STRIDE2_IMPL[OPUS_ARCHMASK + 1])( silk_biquad_alt_stride2_c, /* EDSP */ silk_biquad_alt_stride2_c, /* Media */ silk_biquad_alt_stride2_neon, /* Neon */ + silk_biquad_alt_stride2_neon, /* dotprod */ }; opus_int32 (*const SILK_LPC_INVERSE_PRED_GAIN_IMPL[OPUS_ARCHMASK + 1])( /* O Returns inverse prediction gain in energy domain, Q30 */ @@ -59,6 +60,7 @@ opus_int32 (*const SILK_LPC_INVERSE_PRED_GAIN_IMPL[OPUS_ARCHMASK + 1])( /* O R silk_LPC_inverse_pred_gain_c, /* EDSP */ silk_LPC_inverse_pred_gain_c, /* Media */ silk_LPC_inverse_pred_gain_neon, /* Neon */ + silk_LPC_inverse_pred_gain_neon, /* dotprod */ }; void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])( @@ -67,7 +69,7 @@ void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])( SideInfoIndices *psIndices, /* I/O Quantization Indices */ const opus_int16 x16[], /* I Input */ opus_int8 pulses[], /* O Quantized pulse signal */ - const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ + const opus_int16 *PredCoef_Q12, /* I Short term prediction coefs */ const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ @@ -82,6 +84,7 @@ void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])( silk_NSQ_del_dec_c, /* EDSP */ silk_NSQ_del_dec_c, /* Media */ silk_NSQ_del_dec_neon, /* Neon */ + silk_NSQ_del_dec_neon, /* dotprod */ }; /*There is no table for silk_noise_shape_quantizer_short_prediction because the @@ -97,6 +100,7 @@ opus_int32 silk_NSQ_noise_shape_feedback_loop_c, /* EDSP */ silk_NSQ_noise_shape_feedback_loop_c, /* Media */ silk_NSQ_noise_shape_feedback_loop_neon, /* NEON */ + silk_NSQ_noise_shape_feedback_loop_neon, /* dotprod */ }; # endif @@ -116,6 +120,7 @@ void (*const SILK_WARPED_AUTOCORRELATION_FIX_IMPL[OPUS_ARCHMASK + 1])( silk_warped_autocorrelation_FIX_c, /* EDSP */ silk_warped_autocorrelation_FIX_c, /* Media */ silk_warped_autocorrelation_FIX_neon, /* Neon */ + silk_warped_autocorrelation_FIX_neon, /* dotprod */ }; # endif diff --git a/media/libopus/silk/control.h b/media/libopus/silk/control.h index b76ec33cd6..f5633e624e 100644 --- a/media/libopus/silk/control.h +++ b/media/libopus/silk/control.h @@ -77,6 +77,9 @@ typedef struct { /* I: Flag to enable in-band Forward Error Correction (FEC); 0/1 */ opus_int useInBandFEC; + /* I: Flag to enable in-band Deep REDundancy (DRED); 0/1 */ + opus_int useDRED; + /* I: Flag to actually code in-band Forward Error Correction (FEC) in the current packet; 0/1 */ opus_int LBRR_coded; @@ -141,6 +144,14 @@ typedef struct { /* O: Pitch lag of previous frame (0 if unvoiced), measured in samples at 48 kHz */ opus_int prevPitchLag; + + /* I: Enable Deep PLC */ + opus_int enable_deep_plc; + +#ifdef ENABLE_OSCE + /* I: OSCE method */ + opus_int osce_method; +#endif } silk_DecControlStruct; #ifdef __cplusplus diff --git a/media/libopus/silk/dec_API.c b/media/libopus/silk/dec_API.c index 7d5ca7fb9f..c1091d13ed 100644 --- a/media/libopus/silk/dec_API.c +++ b/media/libopus/silk/dec_API.c @@ -33,6 +33,11 @@ POSSIBILITY OF SUCH DAMAGE. #include "stack_alloc.h" #include "os_support.h" +#ifdef ENABLE_OSCE +#include "osce.h" +#include "osce_structs.h" +#endif + /************************/ /* Decoder Super Struct */ /************************/ @@ -42,12 +47,33 @@ typedef struct { opus_int nChannelsAPI; opus_int nChannelsInternal; opus_int prev_decode_only_middle; +#ifdef ENABLE_OSCE + OSCEModel osce_model; +#endif } silk_decoder; /*********************/ /* Decoder functions */ /*********************/ + + +opus_int silk_LoadOSCEModels(void *decState, const unsigned char *data, int len) +{ +#ifdef ENABLE_OSCE + opus_int ret = SILK_NO_ERROR; + + ret = osce_load_models(&((silk_decoder *)decState)->osce_model, data, len); + ((silk_decoder *)decState)->osce_model.loaded = (ret == 0); + return ret; +#else + (void) decState; + (void) data; + (void) len; + return SILK_NO_ERROR; +#endif +} + opus_int silk_Get_Decoder_Size( /* O Returns error code */ opus_int *decSizeBytes /* O Number of bytes in SILK decoder state */ ) @@ -60,12 +86,37 @@ opus_int silk_Get_Decoder_Size( /* O Returns error co } /* Reset decoder state */ +opus_int silk_ResetDecoder( /* O Returns error code */ + void *decState /* I/O State */ +) +{ + opus_int n, ret = SILK_NO_ERROR; + silk_decoder_state *channel_state = ((silk_decoder *)decState)->channel_state; + + for( n = 0; n < DECODER_NUM_CHANNELS; n++ ) { + ret = silk_reset_decoder( &channel_state[ n ] ); + } + silk_memset(&((silk_decoder *)decState)->sStereo, 0, sizeof(((silk_decoder *)decState)->sStereo)); + /* Not strictly needed, but it's cleaner that way */ + ((silk_decoder *)decState)->prev_decode_only_middle = 0; + + return ret; +} + + opus_int silk_InitDecoder( /* O Returns error code */ void *decState /* I/O State */ ) { opus_int n, ret = SILK_NO_ERROR; silk_decoder_state *channel_state = ((silk_decoder *)decState)->channel_state; +#ifdef ENABLE_OSCE + ((silk_decoder *)decState)->osce_model.loaded = 0; +#endif +#ifndef USE_WEIGHTS_FILE + /* load osce models */ + silk_LoadOSCEModels(decState, NULL, 0); +#endif for( n = 0; n < DECODER_NUM_CHANNELS; n++ ) { ret = silk_init_decoder( &channel_state[ n ] ); @@ -86,6 +137,9 @@ opus_int silk_Decode( /* O Returns error co ec_dec *psRangeDec, /* I/O Compressor data structure */ opus_int16 *samplesOut, /* O Decoded output speech vector */ opus_int32 *nSamplesOut, /* O Number of samples decoded */ +#ifdef ENABLE_DEEP_PLC + LPCNetPLCState *lpcnet, +#endif int arch /* I Run-time architecture */ ) { @@ -278,6 +332,7 @@ opus_int silk_Decode( /* O Returns error co has_side = !psDec->prev_decode_only_middle || (decControl->nChannelsInternal == 2 && lostFlag == FLAG_DECODE_LBRR && channel_state[1].LBRR_flags[ channel_state[1].nFramesDecoded ] == 1 ); } + channel_state[ 0 ].sPLC.enable_deep_plc = decControl->enable_deep_plc; /* Call decoder for one frame */ for( n = 0; n < decControl->nChannelsInternal; n++ ) { if( n == 0 || has_side ) { @@ -297,7 +352,19 @@ opus_int silk_Decode( /* O Returns error co } else { condCoding = CODE_CONDITIONALLY; } - ret += silk_decode_frame( &channel_state[ n ], psRangeDec, &samplesOut1_tmp[ n ][ 2 ], &nSamplesOutDec, lostFlag, condCoding, arch); +#ifdef ENABLE_OSCE + if ( channel_state[n].osce.method != decControl->osce_method ) { + osce_reset( &channel_state[n].osce, decControl->osce_method ); + } +#endif + ret += silk_decode_frame( &channel_state[ n ], psRangeDec, &samplesOut1_tmp[ n ][ 2 ], &nSamplesOutDec, lostFlag, condCoding, +#ifdef ENABLE_DEEP_PLC + n == 0 ? lpcnet : NULL, +#endif +#ifdef ENABLE_OSCE + &psDec->osce_model, +#endif + arch); } else { silk_memset( &samplesOut1_tmp[ n ][ 2 ], 0, nSamplesOutDec * sizeof( opus_int16 ) ); } diff --git a/media/libopus/silk/decode_frame.c b/media/libopus/silk/decode_frame.c index 4f36f854c2..9bc4ca2b0e 100644 --- a/media/libopus/silk/decode_frame.c +++ b/media/libopus/silk/decode_frame.c @@ -33,6 +33,10 @@ POSSIBILITY OF SUCH DAMAGE. #include "stack_alloc.h" #include "PLC.h" +#ifdef ENABLE_OSCE +#include "osce.h" +#endif + /****************/ /* Decode frame */ /****************/ @@ -43,6 +47,12 @@ opus_int silk_decode_frame( opus_int32 *pN, /* O Pointer to size of output frame */ opus_int lostFlag, /* I 0: no loss, 1 loss, 2 decode fec */ opus_int condCoding, /* I The type of conditional coding to use */ +#ifdef ENABLE_DEEP_PLC + LPCNetPLCState *lpcnet, +#endif +#ifdef ENABLE_OSCE + OSCEModel *osce_model, +#endif int arch /* I Run-time architecture */ ) { @@ -61,6 +71,10 @@ opus_int silk_decode_frame( ( lostFlag == FLAG_DECODE_LBRR && psDec->LBRR_flags[ psDec->nFramesDecoded ] == 1 ) ) { VARDECL( opus_int16, pulses ); +#ifdef ENABLE_OSCE + opus_int32 ec_start; + ec_start = ec_tell(psRangeDec); +#endif ALLOC( pulses, (L + SHELL_CODEC_FRAME_LENGTH - 1) & ~(SHELL_CODEC_FRAME_LENGTH - 1), opus_int16 ); /*********************************************/ @@ -84,10 +98,29 @@ opus_int silk_decode_frame( /********************************************************/ silk_decode_core( psDec, psDecCtrl, pOut, pulses, arch ); + /*************************/ + /* Update output buffer. */ + /*************************/ + celt_assert( psDec->ltp_mem_length >= psDec->frame_length ); + mv_len = psDec->ltp_mem_length - psDec->frame_length; + silk_memmove( psDec->outBuf, &psDec->outBuf[ psDec->frame_length ], mv_len * sizeof(opus_int16) ); + silk_memcpy( &psDec->outBuf[ mv_len ], pOut, psDec->frame_length * sizeof( opus_int16 ) ); + +#ifdef ENABLE_OSCE + /********************************************************/ + /* Run SILK enhancer */ + /********************************************************/ + osce_enhance_frame( osce_model, psDec, psDecCtrl, pOut, ec_tell(psRangeDec) - ec_start, arch ); +#endif + /********************************************************/ /* Update PLC state */ /********************************************************/ - silk_PLC( psDec, psDecCtrl, pOut, 0, arch ); + silk_PLC( psDec, psDecCtrl, pOut, 0, +#ifdef ENABLE_DEEP_PLC + lpcnet, +#endif + arch ); psDec->lossCnt = 0; psDec->prevSignalType = psDec->indices.signalType; @@ -97,16 +130,23 @@ opus_int silk_decode_frame( psDec->first_frame_after_reset = 0; } else { /* Handle packet loss by extrapolation */ - silk_PLC( psDec, psDecCtrl, pOut, 1, arch ); - } + silk_PLC( psDec, psDecCtrl, pOut, 1, +#ifdef ENABLE_DEEP_PLC + lpcnet, +#endif + arch ); - /*************************/ - /* Update output buffer. */ - /*************************/ - celt_assert( psDec->ltp_mem_length >= psDec->frame_length ); - mv_len = psDec->ltp_mem_length - psDec->frame_length; - silk_memmove( psDec->outBuf, &psDec->outBuf[ psDec->frame_length ], mv_len * sizeof(opus_int16) ); - silk_memcpy( &psDec->outBuf[ mv_len ], pOut, psDec->frame_length * sizeof( opus_int16 ) ); +#ifdef ENABLE_OSCE + osce_reset( &psDec->osce, psDec->osce.method ); +#endif + /*************************/ + /* Update output buffer. */ + /*************************/ + celt_assert( psDec->ltp_mem_length >= psDec->frame_length ); + mv_len = psDec->ltp_mem_length - psDec->frame_length; + silk_memmove( psDec->outBuf, &psDec->outBuf[ psDec->frame_length ], mv_len * sizeof(opus_int16) ); + silk_memcpy( &psDec->outBuf[ mv_len ], pOut, psDec->frame_length * sizeof( opus_int16 ) ); + } /************************************************/ /* Comfort noise generation / estimation */ diff --git a/media/libopus/silk/enc_API.c b/media/libopus/silk/enc_API.c index 548e07364d..369caddd98 100644 --- a/media/libopus/silk/enc_API.c +++ b/media/libopus/silk/enc_API.c @@ -41,6 +41,10 @@ POSSIBILITY OF SUCH DAMAGE. #include "main_FLP.h" #endif +#ifdef ENABLE_DRED +#include "dred_encoder.h" +#endif + /***************************************/ /* Read control structure from encoder */ /***************************************/ diff --git a/media/libopus/silk/fixed/encode_frame_FIX.c b/media/libopus/silk/fixed/encode_frame_FIX.c index a02bf87dbb..7c83360ba3 100644 --- a/media/libopus/silk/fixed/encode_frame_FIX.c +++ b/media/libopus/silk/fixed/encode_frame_FIX.c @@ -105,8 +105,11 @@ opus_int silk_encode_frame_FIX( opus_int gain_lock[ MAX_NB_SUBFR ] = {0}; opus_int16 best_gain_mult[ MAX_NB_SUBFR ]; opus_int best_sum[ MAX_NB_SUBFR ]; + opus_int bits_margin; SAVE_STACK; + /* For CBR, 5 bits below budget is close enough. For VBR, allow up to 25% below the cap if we initially busted the budget. */ + bits_margin = useCBR ? 5 : maxBits/4; /* This is totally unnecessary but many compilers (including gcc) are too dumb to realise it */ LastGainIndex_copy2 = nBits_lower = nBits_upper = gainMult_lower = gainMult_upper = 0; @@ -282,7 +285,7 @@ opus_int silk_encode_frame_FIX( gainMult_upper = gainMult_Q8; gainsID_upper = gainsID; } - } else if( nBits < maxBits - 5 ) { + } else if( nBits < maxBits - bits_margin ) { found_lower = 1; nBits_lower = nBits; gainMult_lower = gainMult_Q8; @@ -296,7 +299,7 @@ opus_int silk_encode_frame_FIX( LastGainIndex_copy2 = psEnc->sShape.LastGainIndex; } } else { - /* Within 5 bits of budget: close enough */ + /* Close enough */ break; } @@ -318,17 +321,10 @@ opus_int silk_encode_frame_FIX( if( ( found_lower & found_upper ) == 0 ) { /* Adjust gain according to high-rate rate/distortion curve */ if( nBits > maxBits ) { - if (gainMult_Q8 < 16384) { - gainMult_Q8 *= 2; - } else { - gainMult_Q8 = 32767; - } + gainMult_Q8 = silk_min_32( 1024, gainMult_Q8*3/2 ); } else { - opus_int32 gain_factor_Q16; - gain_factor_Q16 = silk_log2lin( silk_LSHIFT( nBits - maxBits, 7 ) / psEnc->sCmn.frame_length + SILK_FIX_CONST( 16, 7 ) ); - gainMult_Q8 = silk_SMULWB( gain_factor_Q16, gainMult_Q8 ); + gainMult_Q8 = silk_max_32( 64, gainMult_Q8*4/5 ); } - } else { /* Adjust gain by interpolating */ gainMult_Q8 = gainMult_lower + silk_DIV32_16( silk_MUL( gainMult_upper - gainMult_lower, maxBits - nBits_lower ), nBits_upper - nBits_lower ); diff --git a/media/libopus/silk/float/SigProc_FLP.h b/media/libopus/silk/float/SigProc_FLP.h index 953de8b09e..ff9281b852 100644 --- a/media/libopus/silk/float/SigProc_FLP.h +++ b/media/libopus/silk/float/SigProc_FLP.h @@ -30,6 +30,7 @@ POSSIBILITY OF SUCH DAMAGE. #include "SigProc_FIX.h" #include "float_cast.h" +#include "main.h" #include <math.h> #ifdef __cplusplus @@ -73,7 +74,8 @@ void silk_autocorrelation_FLP( silk_float *results, /* O result (length correlationCount) */ const silk_float *inputData, /* I input data to correlate */ opus_int inputDataSize, /* I length of input */ - opus_int correlationCount /* I number of correlation taps to compute */ + opus_int correlationCount, /* I number of correlation taps to compute */ + int arch ); opus_int silk_pitch_analysis_core_FLP( /* O Voicing estimate: 0 voiced, 1 unvoiced */ @@ -105,7 +107,8 @@ silk_float silk_burg_modified_FLP( /* O returns residual energy const silk_float minInvGain, /* I minimum inverse prediction gain */ const opus_int subfr_length, /* I input signal subframe length (incl. D preceding samples) */ const opus_int nb_subfr, /* I number of subframes stacked in x */ - const opus_int D /* I order */ + const opus_int D, /* I order */ + int arch ); /* multiply a vector by a constant */ @@ -124,12 +127,17 @@ void silk_scale_copy_vector_FLP( ); /* inner product of two silk_float arrays, with result as double */ -double silk_inner_product_FLP( +double silk_inner_product_FLP_c( const silk_float *data1, const silk_float *data2, opus_int dataSize ); +#ifndef OVERRIDE_inner_product_FLP +#define silk_inner_product_FLP(data1, data2, dataSize, arch) ((void)arch,silk_inner_product_FLP_c(data1, data2, dataSize)) +#endif + + /* sum of squares of a silk_float array, with result as double */ double silk_energy_FLP( const silk_float *data, diff --git a/media/libopus/silk/float/autocorrelation_FLP.c b/media/libopus/silk/float/autocorrelation_FLP.c index 8b8a9e659a..4253b26ebc 100644 --- a/media/libopus/silk/float/autocorrelation_FLP.c +++ b/media/libopus/silk/float/autocorrelation_FLP.c @@ -37,7 +37,8 @@ void silk_autocorrelation_FLP( silk_float *results, /* O result (length correlationCount) */ const silk_float *inputData, /* I input data to correlate */ opus_int inputDataSize, /* I length of input */ - opus_int correlationCount /* I number of correlation taps to compute */ + opus_int correlationCount, /* I number of correlation taps to compute */ + int arch ) { opus_int i; @@ -47,6 +48,6 @@ void silk_autocorrelation_FLP( } for( i = 0; i < correlationCount; i++ ) { - results[ i ] = (silk_float)silk_inner_product_FLP( inputData, inputData + i, inputDataSize - i ); + results[ i ] = (silk_float)silk_inner_product_FLP( inputData, inputData + i, inputDataSize - i, arch ); } } diff --git a/media/libopus/silk/float/burg_modified_FLP.c b/media/libopus/silk/float/burg_modified_FLP.c index 756b76a35b..f5bef5ddbe 100644 --- a/media/libopus/silk/float/burg_modified_FLP.c +++ b/media/libopus/silk/float/burg_modified_FLP.c @@ -42,7 +42,8 @@ silk_float silk_burg_modified_FLP( /* O returns residual energy const silk_float minInvGain, /* I minimum inverse prediction gain */ const opus_int subfr_length, /* I input signal subframe length (incl. D preceding samples) */ const opus_int nb_subfr, /* I number of subframes stacked in x */ - const opus_int D /* I order */ + const opus_int D, /* I order */ + int arch ) { opus_int k, n, s, reached_max_gain; @@ -60,7 +61,7 @@ silk_float silk_burg_modified_FLP( /* O returns residual energy for( s = 0; s < nb_subfr; s++ ) { x_ptr = x + s * subfr_length; for( n = 1; n < D + 1; n++ ) { - C_first_row[ n - 1 ] += silk_inner_product_FLP( x_ptr, x_ptr + n, subfr_length - n ); + C_first_row[ n - 1 ] += silk_inner_product_FLP( x_ptr, x_ptr + n, subfr_length - n, arch ); } } silk_memcpy( C_last_row, C_first_row, SILK_MAX_ORDER_LPC * sizeof( double ) ); diff --git a/media/libopus/silk/float/corrMatrix_FLP.c b/media/libopus/silk/float/corrMatrix_FLP.c index eae6a1cfca..eef6e8aa79 100644 --- a/media/libopus/silk/float/corrMatrix_FLP.c +++ b/media/libopus/silk/float/corrMatrix_FLP.c @@ -41,7 +41,8 @@ void silk_corrVector_FLP( const silk_float *t, /* I Target vector [L] */ const opus_int L, /* I Length of vecors */ const opus_int Order, /* I Max lag for correlation */ - silk_float *Xt /* O X'*t correlation vector [order] */ + silk_float *Xt, /* O X'*t correlation vector [order] */ + int arch ) { opus_int lag; @@ -50,7 +51,7 @@ void silk_corrVector_FLP( ptr1 = &x[ Order - 1 ]; /* Points to first sample of column 0 of X: X[:,0] */ for( lag = 0; lag < Order; lag++ ) { /* Calculate X[:,lag]'*t */ - Xt[ lag ] = (silk_float)silk_inner_product_FLP( ptr1, t, L ); + Xt[ lag ] = (silk_float)silk_inner_product_FLP( ptr1, t, L, arch ); ptr1--; /* Next column of X */ } } @@ -60,7 +61,8 @@ void silk_corrMatrix_FLP( const silk_float *x, /* I x vector [ L+order-1 ] used to create X */ const opus_int L, /* I Length of vectors */ const opus_int Order, /* I Max lag for correlation */ - silk_float *XX /* O X'*X correlation matrix [order x order] */ + silk_float *XX, /* O X'*X correlation matrix [order x order] */ + int arch ) { opus_int j, lag; @@ -79,7 +81,7 @@ void silk_corrMatrix_FLP( ptr2 = &x[ Order - 2 ]; /* First sample of column 1 of X */ for( lag = 1; lag < Order; lag++ ) { /* Calculate X[:,0]'*X[:,lag] */ - energy = silk_inner_product_FLP( ptr1, ptr2, L ); + energy = silk_inner_product_FLP( ptr1, ptr2, L, arch ); matrix_ptr( XX, lag, 0, Order ) = ( silk_float )energy; matrix_ptr( XX, 0, lag, Order ) = ( silk_float )energy; /* Calculate X[:,j]'*X[:,j + lag] */ diff --git a/media/libopus/silk/float/encode_frame_FLP.c b/media/libopus/silk/float/encode_frame_FLP.c index b029c3f5ca..8a327c5626 100644 --- a/media/libopus/silk/float/encode_frame_FLP.c +++ b/media/libopus/silk/float/encode_frame_FLP.c @@ -107,7 +107,10 @@ opus_int silk_encode_frame_FLP( opus_int gain_lock[ MAX_NB_SUBFR ] = {0}; opus_int16 best_gain_mult[ MAX_NB_SUBFR ]; opus_int best_sum[ MAX_NB_SUBFR ]; + opus_int bits_margin; + /* For CBR, 5 bits below budget is close enough. For VBR, allow up to 25% below the cap if we initially busted the budget. */ + bits_margin = useCBR ? 5 : maxBits/4; /* This is totally unnecessary but many compilers (including gcc) are too dumb to realise it */ LastGainIndex_copy2 = nBits_lower = nBits_upper = gainMult_lower = gainMult_upper = 0; @@ -270,7 +273,7 @@ opus_int silk_encode_frame_FLP( gainMult_upper = gainMult_Q8; gainsID_upper = gainsID; } - } else if( nBits < maxBits - 5 ) { + } else if( nBits < maxBits - bits_margin ) { found_lower = 1; nBits_lower = nBits; gainMult_lower = gainMult_Q8; @@ -284,7 +287,7 @@ opus_int silk_encode_frame_FLP( LastGainIndex_copy2 = psEnc->sShape.LastGainIndex; } } else { - /* Within 5 bits of budget: close enough */ + /* Close enough */ break; } @@ -306,15 +309,9 @@ opus_int silk_encode_frame_FLP( if( ( found_lower & found_upper ) == 0 ) { /* Adjust gain according to high-rate rate/distortion curve */ if( nBits > maxBits ) { - if (gainMult_Q8 < 16384) { - gainMult_Q8 *= 2; - } else { - gainMult_Q8 = 32767; - } + gainMult_Q8 = silk_min_32( 1024, gainMult_Q8*3/2 ); } else { - opus_int32 gain_factor_Q16; - gain_factor_Q16 = silk_log2lin( silk_LSHIFT( nBits - maxBits, 7 ) / psEnc->sCmn.frame_length + SILK_FIX_CONST( 16, 7 ) ); - gainMult_Q8 = silk_SMULWB( gain_factor_Q16, gainMult_Q8 ); + gainMult_Q8 = silk_max_32( 64, gainMult_Q8*4/5 ); } } else { /* Adjust gain by interpolating */ diff --git a/media/libopus/silk/float/find_LPC_FLP.c b/media/libopus/silk/float/find_LPC_FLP.c index fa3ffe7f8b..6ccd711dc3 100644 --- a/media/libopus/silk/float/find_LPC_FLP.c +++ b/media/libopus/silk/float/find_LPC_FLP.c @@ -38,7 +38,8 @@ void silk_find_LPC_FLP( silk_encoder_state *psEncC, /* I/O Encoder state */ opus_int16 NLSF_Q15[], /* O NLSFs */ const silk_float x[], /* I Input signal */ - const silk_float minInvGain /* I Inverse of max prediction gain */ + const silk_float minInvGain, /* I Inverse of max prediction gain */ + int arch ) { opus_int k, subfr_length; @@ -56,12 +57,12 @@ void silk_find_LPC_FLP( psEncC->indices.NLSFInterpCoef_Q2 = 4; /* Burg AR analysis for the full frame */ - res_nrg = silk_burg_modified_FLP( a, x, minInvGain, subfr_length, psEncC->nb_subfr, psEncC->predictLPCOrder ); + res_nrg = silk_burg_modified_FLP( a, x, minInvGain, subfr_length, psEncC->nb_subfr, psEncC->predictLPCOrder, arch ); if( psEncC->useInterpolatedNLSFs && !psEncC->first_frame_after_reset && psEncC->nb_subfr == MAX_NB_SUBFR ) { /* Optimal solution for last 10 ms; subtract residual energy here, as that's easier than */ /* adding it to the residual energy of the first 10 ms in each iteration of the search below */ - res_nrg -= silk_burg_modified_FLP( a_tmp, x + ( MAX_NB_SUBFR / 2 ) * subfr_length, minInvGain, subfr_length, MAX_NB_SUBFR / 2, psEncC->predictLPCOrder ); + res_nrg -= silk_burg_modified_FLP( a_tmp, x + ( MAX_NB_SUBFR / 2 ) * subfr_length, minInvGain, subfr_length, MAX_NB_SUBFR / 2, psEncC->predictLPCOrder, arch ); /* Convert to NLSFs */ silk_A2NLSF_FLP( NLSF_Q15, a_tmp, psEncC->predictLPCOrder ); diff --git a/media/libopus/silk/float/find_LTP_FLP.c b/media/libopus/silk/float/find_LTP_FLP.c index f97064930e..90aeeac0b7 100644 --- a/media/libopus/silk/float/find_LTP_FLP.c +++ b/media/libopus/silk/float/find_LTP_FLP.c @@ -38,7 +38,8 @@ void silk_find_LTP_FLP( const silk_float r_ptr[], /* I LPC residual */ const opus_int lag[ MAX_NB_SUBFR ], /* I LTP lags */ const opus_int subfr_length, /* I Subframe length */ - const opus_int nb_subfr /* I number of subframes */ + const opus_int nb_subfr, /* I number of subframes */ + int arch ) { opus_int k; @@ -50,8 +51,8 @@ void silk_find_LTP_FLP( XX_ptr = XX; for( k = 0; k < nb_subfr; k++ ) { lag_ptr = r_ptr - ( lag[ k ] + LTP_ORDER / 2 ); - silk_corrMatrix_FLP( lag_ptr, subfr_length, LTP_ORDER, XX_ptr ); - silk_corrVector_FLP( lag_ptr, r_ptr, subfr_length, LTP_ORDER, xX_ptr ); + silk_corrMatrix_FLP( lag_ptr, subfr_length, LTP_ORDER, XX_ptr, arch ); + silk_corrVector_FLP( lag_ptr, r_ptr, subfr_length, LTP_ORDER, xX_ptr, arch ); xx = ( silk_float )silk_energy_FLP( r_ptr, subfr_length + LTP_ORDER ); temp = 1.0f / silk_max( xx, LTP_CORR_INV_MAX * 0.5f * ( XX_ptr[ 0 ] + XX_ptr[ 24 ] ) + 1.0f ); silk_scale_vector_FLP( XX_ptr, temp, LTP_ORDER * LTP_ORDER ); diff --git a/media/libopus/silk/float/find_pitch_lags_FLP.c b/media/libopus/silk/float/find_pitch_lags_FLP.c index dedbcd2836..1f6bd5991c 100644 --- a/media/libopus/silk/float/find_pitch_lags_FLP.c +++ b/media/libopus/silk/float/find_pitch_lags_FLP.c @@ -82,7 +82,7 @@ void silk_find_pitch_lags_FLP( silk_apply_sine_window_FLP( Wsig_ptr, x_buf_ptr, 2, psEnc->sCmn.la_pitch ); /* Calculate autocorrelation sequence */ - silk_autocorrelation_FLP( auto_corr, Wsig, psEnc->sCmn.pitch_LPC_win_length, psEnc->sCmn.pitchEstimationLPCOrder + 1 ); + silk_autocorrelation_FLP( auto_corr, Wsig, psEnc->sCmn.pitch_LPC_win_length, psEnc->sCmn.pitchEstimationLPCOrder + 1, arch ); /* Add white noise, as a fraction of the energy */ auto_corr[ 0 ] += auto_corr[ 0 ] * FIND_PITCH_WHITE_NOISE_FRACTION + 1; diff --git a/media/libopus/silk/float/find_pred_coefs_FLP.c b/media/libopus/silk/float/find_pred_coefs_FLP.c index 6f79078893..f3c54cf474 100644 --- a/media/libopus/silk/float/find_pred_coefs_FLP.c +++ b/media/libopus/silk/float/find_pred_coefs_FLP.c @@ -63,7 +63,7 @@ void silk_find_pred_coefs_FLP( celt_assert( psEnc->sCmn.ltp_mem_length - psEnc->sCmn.predictLPCOrder >= psEncCtrl->pitchL[ 0 ] + LTP_ORDER / 2 ); /* LTP analysis */ - silk_find_LTP_FLP( XXLTP, xXLTP, res_pitch, psEncCtrl->pitchL, psEnc->sCmn.subfr_length, psEnc->sCmn.nb_subfr ); + silk_find_LTP_FLP( XXLTP, xXLTP, res_pitch, psEncCtrl->pitchL, psEnc->sCmn.subfr_length, psEnc->sCmn.nb_subfr, psEnc->sCmn.arch ); /* Quantize LTP gain parameters */ silk_quant_LTP_gains_FLP( psEncCtrl->LTPCoef, psEnc->sCmn.indices.LTPIndex, &psEnc->sCmn.indices.PERIndex, @@ -102,7 +102,7 @@ void silk_find_pred_coefs_FLP( } /* LPC_in_pre contains the LTP-filtered input for voiced, and the unfiltered input for unvoiced */ - silk_find_LPC_FLP( &psEnc->sCmn, NLSF_Q15, LPC_in_pre, minInvGain ); + silk_find_LPC_FLP( &psEnc->sCmn, NLSF_Q15, LPC_in_pre, minInvGain, psEnc->sCmn.arch ); /* Quantize LSFs */ silk_process_NLSFs_FLP( &psEnc->sCmn, psEncCtrl->PredCoef, NLSF_Q15, psEnc->sCmn.prev_NLSFq_Q15 ); diff --git a/media/libopus/silk/float/inner_product_FLP.c b/media/libopus/silk/float/inner_product_FLP.c index cdd39d24ce..88b160ab40 100644 --- a/media/libopus/silk/float/inner_product_FLP.c +++ b/media/libopus/silk/float/inner_product_FLP.c @@ -32,7 +32,7 @@ POSSIBILITY OF SUCH DAMAGE. #include "SigProc_FLP.h" /* inner product of two silk_float arrays, with result as double */ -double silk_inner_product_FLP( +double silk_inner_product_FLP_c( const silk_float *data1, const silk_float *data2, opus_int dataSize diff --git a/media/libopus/silk/float/main_FLP.h b/media/libopus/silk/float/main_FLP.h index 5dc0ccf4a4..2e4435cc68 100644 --- a/media/libopus/silk/float/main_FLP.h +++ b/media/libopus/silk/float/main_FLP.h @@ -138,7 +138,8 @@ void silk_find_LPC_FLP( silk_encoder_state *psEncC, /* I/O Encoder state */ opus_int16 NLSF_Q15[], /* O NLSFs */ const silk_float x[], /* I Input signal */ - const silk_float minInvGain /* I Prediction gain from LTP (dB) */ + const silk_float minInvGain, /* I Prediction gain from LTP (dB) */ + int arch ); /* LTP analysis */ @@ -148,7 +149,8 @@ void silk_find_LTP_FLP( const silk_float r_ptr[], /* I LPC residual */ const opus_int lag[ MAX_NB_SUBFR ], /* I LTP lags */ const opus_int subfr_length, /* I Subframe length */ - const opus_int nb_subfr /* I number of subframes */ + const opus_int nb_subfr, /* I number of subframes */ + int arch ); void silk_LTP_analysis_filter_FLP( @@ -221,7 +223,8 @@ void silk_corrMatrix_FLP( const silk_float *x, /* I x vector [ L+order-1 ] used to create X */ const opus_int L, /* I Length of vectors */ const opus_int Order, /* I Max lag for correlation */ - silk_float *XX /* O X'*X correlation matrix [order x order] */ + silk_float *XX, /* O X'*X correlation matrix [order x order] */ + int arch ); /* Calculates correlation vector X'*t */ @@ -230,7 +233,8 @@ void silk_corrVector_FLP( const silk_float *t, /* I Target vector [L] */ const opus_int L, /* I Length of vecors */ const opus_int Order, /* I Max lag for correlation */ - silk_float *Xt /* O X'*t correlation vector [order] */ + silk_float *Xt, /* O X'*t correlation vector [order] */ + int arch ); /* Apply sine window to signal vector. */ diff --git a/media/libopus/silk/float/noise_shape_analysis_FLP.c b/media/libopus/silk/float/noise_shape_analysis_FLP.c index cb3d8a50b7..0b5ea95218 100644 --- a/media/libopus/silk/float/noise_shape_analysis_FLP.c +++ b/media/libopus/silk/float/noise_shape_analysis_FLP.c @@ -255,7 +255,7 @@ void silk_noise_shape_analysis_FLP( psEnc->sCmn.shapeWinLength, psEnc->sCmn.shapingLPCOrder ); } else { /* Calculate regular auto correlation */ - silk_autocorrelation_FLP( auto_corr, x_windowed, psEnc->sCmn.shapeWinLength, psEnc->sCmn.shapingLPCOrder + 1 ); + silk_autocorrelation_FLP( auto_corr, x_windowed, psEnc->sCmn.shapeWinLength, psEnc->sCmn.shapingLPCOrder + 1, psEnc->sCmn.arch ); } /* Add white noise, as a fraction of energy */ diff --git a/media/libopus/silk/float/pitch_analysis_core_FLP.c b/media/libopus/silk/float/pitch_analysis_core_FLP.c index f351bc3718..0530a8831a 100644 --- a/media/libopus/silk/float/pitch_analysis_core_FLP.c +++ b/media/libopus/silk/float/pitch_analysis_core_FLP.c @@ -291,7 +291,7 @@ opus_int silk_pitch_analysis_core_FLP( /* O Voicing estimate: 0 voiced, for( j = 0; j < length_d_comp; j++ ) { d = d_comp[ j ]; basis_ptr = target_ptr - d; - cross_corr = silk_inner_product_FLP( basis_ptr, target_ptr, sf_length_8kHz ); + cross_corr = silk_inner_product_FLP( basis_ptr, target_ptr, sf_length_8kHz, arch ); if( cross_corr > 0.0f ) { energy = silk_energy_FLP( basis_ptr, sf_length_8kHz ); C[ k ][ d ] = (silk_float)( 2 * cross_corr / ( energy + energy_tmp ) ); diff --git a/media/libopus/silk/float/warped_autocorrelation_FLP.c b/media/libopus/silk/float/warped_autocorrelation_FLP.c index 09186e73d4..116dab923f 100644 --- a/media/libopus/silk/float/warped_autocorrelation_FLP.c +++ b/media/libopus/silk/float/warped_autocorrelation_FLP.c @@ -54,11 +54,13 @@ void silk_warped_autocorrelation_FLP( /* Loop over allpass sections */ for( i = 0; i < order; i += 2 ) { /* Output of allpass section */ - tmp2 = state[ i ] + warping * ( state[ i + 1 ] - tmp1 ); + /* We voluntarily use two multiples instead of factoring the expression to + reduce the length of the dependency chain (tmp1->tmp2->tmp1... ). */ + tmp2 = state[ i ] + warping * state[ i + 1 ] - warping * tmp1; state[ i ] = tmp1; C[ i ] += state[ 0 ] * tmp1; /* Output of allpass section */ - tmp1 = state[ i + 1 ] + warping * ( state[ i + 2 ] - tmp2 ); + tmp1 = state[ i + 1 ] + warping * state[ i + 2 ] - warping * tmp2; state[ i + 1 ] = tmp2; C[ i + 1 ] += state[ 0 ] * tmp2; } diff --git a/media/libopus/silk/float/x86/inner_product_FLP_avx2.c b/media/libopus/silk/float/x86/inner_product_FLP_avx2.c new file mode 100644 index 0000000000..4a2daaf595 --- /dev/null +++ b/media/libopus/silk/float/x86/inner_product_FLP_avx2.c @@ -0,0 +1,85 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. + 2023 Amazon +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "SigProc_FLP.h" +#include <immintrin.h> + + +/* inner product of two silk_float arrays, with result as double */ +double silk_inner_product_FLP_avx2( + const silk_float *data1, + const silk_float *data2, + opus_int dataSize +) +{ + opus_int i; + __m256d accum1, accum2; + double result; + + /* 4x unrolled loop */ + result = 0.0; + accum1 = accum2 = _mm256_setzero_pd(); + for( i = 0; i < dataSize - 7; i += 8 ) { + __m128 x1f, x2f; + __m256d x1d, x2d; + x1f = _mm_loadu_ps( &data1[ i ] ); + x2f = _mm_loadu_ps( &data2[ i ] ); + x1d = _mm256_cvtps_pd( x1f ); + x2d = _mm256_cvtps_pd( x2f ); + accum1 = _mm256_fmadd_pd( x1d, x2d, accum1 ); + x1f = _mm_loadu_ps( &data1[ i + 4 ] ); + x2f = _mm_loadu_ps( &data2[ i + 4 ] ); + x1d = _mm256_cvtps_pd( x1f ); + x2d = _mm256_cvtps_pd( x2f ); + accum2 = _mm256_fmadd_pd( x1d, x2d, accum2 ); + } + for( ; i < dataSize - 3; i += 4 ) { + __m128 x1f, x2f; + __m256d x1d, x2d; + x1f = _mm_loadu_ps( &data1[ i ] ); + x2f = _mm_loadu_ps( &data2[ i ] ); + x1d = _mm256_cvtps_pd( x1f ); + x2d = _mm256_cvtps_pd( x2f ); + accum1 = _mm256_fmadd_pd( x1d, x2d, accum1 ); + } + accum1 = _mm256_add_pd(accum1, accum2); + accum1 = _mm256_add_pd(accum1, _mm256_permute2f128_pd(accum1, accum1, 1)); + accum1 = _mm256_hadd_pd(accum1,accum1); + result = _mm256_cvtsd_f64(accum1); + + /* add any remaining products */ + for( ; i < dataSize; i++ ) { + result += data1[ i ] * (double)data2[ i ]; + } + + return result; +} diff --git a/media/libopus/silk/init_decoder.c b/media/libopus/silk/init_decoder.c index 16c03dcd1c..01bc4b7a12 100644 --- a/media/libopus/silk/init_decoder.c +++ b/media/libopus/silk/init_decoder.c @@ -31,15 +31,21 @@ POSSIBILITY OF SUCH DAMAGE. #include "main.h" +#ifdef ENABLE_OSCE +#include "osce.h" +#endif + +#include "structs.h" + /************************/ -/* Init Decoder State */ +/* Reset Decoder State */ /************************/ -opus_int silk_init_decoder( +opus_int silk_reset_decoder( silk_decoder_state *psDec /* I/O Decoder state pointer */ ) { /* Clear the entire encoder state, except anything copied */ - silk_memset( psDec, 0, sizeof( silk_decoder_state ) ); + silk_memset( &psDec->SILK_DECODER_STATE_RESET_START, 0, sizeof( silk_decoder_state ) - ((char*) &psDec->SILK_DECODER_STATE_RESET_START - (char*)psDec) ); /* Used to deactivate LSF interpolation */ psDec->first_frame_after_reset = 1; @@ -52,6 +58,27 @@ opus_int silk_init_decoder( /* Reset PLC state */ silk_PLC_Reset( psDec ); +#ifdef ENABLE_OSCE + /* Reset OSCE state and method */ + osce_reset(&psDec->osce, OSCE_DEFAULT_METHOD); +#endif + + return 0; +} + + +/************************/ +/* Init Decoder State */ +/************************/ +opus_int silk_init_decoder( + silk_decoder_state *psDec /* I/O Decoder state pointer */ +) +{ + /* Clear the entire encoder state, except anything copied */ + silk_memset( psDec, 0, sizeof( silk_decoder_state ) ); + + silk_reset_decoder( psDec ); + return(0); } diff --git a/media/libopus/silk/init_encoder.c b/media/libopus/silk/init_encoder.c index 65995c33fa..10d41287fe 100644 --- a/media/libopus/silk/init_encoder.c +++ b/media/libopus/silk/init_encoder.c @@ -36,6 +36,10 @@ POSSIBILITY OF SUCH DAMAGE. #include "tuning_parameters.h" #include "cpu_support.h" +#ifdef ENABLE_DRED +#include "dred_encoder.h" +#endif + /*********************************/ /* Initialize Silk Encoder state */ /*********************************/ diff --git a/media/libopus/silk/main.h b/media/libopus/silk/main.h index a5f568758f..cd576d8cc1 100644 --- a/media/libopus/silk/main.h +++ b/media/libopus/silk/main.h @@ -252,7 +252,7 @@ void silk_NSQ_c( SideInfoIndices *psIndices, /* I/O Quantization Indices */ const opus_int16 x16[], /* I Input */ opus_int8 pulses[], /* O Quantized pulse signal */ - const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ + const opus_int16 *PredCoef_Q12, /* I Short term prediction coefs */ const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ @@ -278,7 +278,7 @@ void silk_NSQ_del_dec_c( SideInfoIndices *psIndices, /* I/O Quantization Indices */ const opus_int16 x16[], /* I Input */ opus_int8 pulses[], /* O Quantized pulse signal */ - const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ + const opus_int16 *PredCoef_Q12, /* I Short term prediction coefs */ const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ @@ -389,6 +389,10 @@ void silk_NLSF_decode( /****************************************************/ /* Decoder Functions */ /****************************************************/ +opus_int silk_reset_decoder( + silk_decoder_state *psDec /* I/O Decoder state pointer */ +); + opus_int silk_init_decoder( silk_decoder_state *psDec /* I/O Decoder state pointer */ ); @@ -410,6 +414,12 @@ opus_int silk_decode_frame( opus_int32 *pN, /* O Pointer to size of output frame */ opus_int lostFlag, /* I 0: no loss, 1 loss, 2 decode fec */ opus_int condCoding, /* I The type of conditional coding to use */ +#ifdef ENABLE_DEEP_PLC + LPCNetPLCState *lpcnet, +#endif +#ifdef ENABLE_OSCE + OSCEModel *osce_model, +#endif int arch /* I Run-time architecture */ ); diff --git a/media/libopus/silk/mips/NSQ_del_dec_mipsr1.h b/media/libopus/silk/mips/NSQ_del_dec_mipsr1.h index cd70713a8f..85bfb637ef 100644 --- a/media/libopus/silk/mips/NSQ_del_dec_mipsr1.h +++ b/media/libopus/silk/mips/NSQ_del_dec_mipsr1.h @@ -25,8 +25,8 @@ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ***********************************************************************/ -#ifndef __NSQ_DEL_DEC_MIPSR1_H__ -#define __NSQ_DEL_DEC_MIPSR1_H__ +#ifndef NSQ_DEL_DEC_MIPSR1_H__ +#define NSQ_DEL_DEC_MIPSR1_H__ #ifdef HAVE_CONFIG_H #include "config.h" @@ -407,4 +407,4 @@ static inline void silk_noise_shape_quantizer_del_dec( } } -#endif /* __NSQ_DEL_DEC_MIPSR1_H__ */ +#endif /* NSQ_DEL_DEC_MIPSR1_H__ */ diff --git a/media/libopus/silk/mips/macros_mipsr1.h b/media/libopus/silk/mips/macros_mipsr1.h index 12ed981a6e..af408802c3 100644 --- a/media/libopus/silk/mips/macros_mipsr1.h +++ b/media/libopus/silk/mips/macros_mipsr1.h @@ -26,8 +26,8 @@ POSSIBILITY OF SUCH DAMAGE. ***********************************************************************/ -#ifndef __SILK_MACROS_MIPSR1_H__ -#define __SILK_MACROS_MIPSR1_H__ +#ifndef SILK_MACROS_MIPSR1_H__ +#define SILK_MACROS_MIPSR1_H__ #define mips_clz(x) __builtin_clz(x) @@ -89,4 +89,4 @@ static inline opus_int32 silk_CLZ32(opus_int32 in32) return re32; } -#endif /* __SILK_MACROS_MIPSR1_H__ */ +#endif /* SILK_MACROS_MIPSR1_H__ */ diff --git a/media/libopus/silk/structs.h b/media/libopus/silk/structs.h index 3380c757b2..38243be1ea 100644 --- a/media/libopus/silk/structs.h +++ b/media/libopus/silk/structs.h @@ -34,6 +34,21 @@ POSSIBILITY OF SUCH DAMAGE. #include "entenc.h" #include "entdec.h" +#ifdef ENABLE_DEEP_PLC +#include "lpcnet.h" +#include "lpcnet_private.h" +#endif + +#ifdef ENABLE_DRED +#include "dred_encoder.h" +#include "dred_decoder.h" +#endif + +#ifdef ENABLE_OSCE +#include "osce_config.h" +#include "osce_structs.h" +#endif + #ifdef __cplusplus extern "C" { @@ -228,6 +243,14 @@ typedef struct { } silk_encoder_state; +#ifdef ENABLE_OSCE +typedef struct { + OSCEFeatureState features; + OSCEState state; + int method; +} silk_OSCE_struct; +#endif + /* Struct for Packet Loss Concealment */ typedef struct { opus_int32 pitchL_Q8; /* Pitch lag to use for voiced concealment */ @@ -243,6 +266,7 @@ typedef struct { opus_int fs_kHz; opus_int nb_subfr; opus_int subfr_length; + opus_int enable_deep_plc; } silk_PLC_struct; /* Struct for CNG */ @@ -259,6 +283,10 @@ typedef struct { /* Decoder state */ /********************************/ typedef struct { +#ifdef ENABLE_OSCE + silk_OSCE_struct osce; +#endif +#define SILK_DECODER_STATE_RESET_START prev_gain_Q16 opus_int32 prev_gain_Q16; opus_int32 exc_Q14[ MAX_FRAME_LENGTH ]; opus_int32 sLPC_Q14_buf[ MAX_LPC_ORDER ]; diff --git a/media/libopus/silk/x86/NSQ_del_dec_avx2.c b/media/libopus/silk/x86/NSQ_del_dec_avx2.c new file mode 100644 index 0000000000..43485871a4 --- /dev/null +++ b/media/libopus/silk/x86/NSQ_del_dec_avx2.c @@ -0,0 +1,1075 @@ +/*********************************************************************** +Copyright (c) 2021 Google Inc. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#ifdef OPUS_CHECK_ASM +#include <string.h> +#endif + +#include "opus_defines.h" +#include <immintrin.h> + +#include "main.h" +#include "stack_alloc.h" +#include "NSQ.h" +#include "celt/x86/x86cpu.h" + +/* Returns TRUE if all assumptions met */ +static OPUS_INLINE int verify_assumptions(const silk_encoder_state *psEncC) +{ + /* This optimization is based on these assumptions */ + /* These assumptions are fundamental and hence assert are */ + /* used. Should any assert triggers, we have to re-visit */ + /* all related code to make sure it still functions the */ + /* same as the C implementation. */ + silk_assert(MAX_DEL_DEC_STATES <= 4 && + MAX_FRAME_LENGTH % 4 == 0 && + MAX_SUB_FRAME_LENGTH % 4 == 0 && + LTP_MEM_LENGTH_MS % 4 == 0 ); + silk_assert(psEncC->fs_kHz == 8 || + psEncC->fs_kHz == 12 || + psEncC->fs_kHz == 16 ); + silk_assert(psEncC->nb_subfr <= MAX_NB_SUBFR && + psEncC->nb_subfr > 0 ); + silk_assert(psEncC->nStatesDelayedDecision <= MAX_DEL_DEC_STATES && + psEncC->nStatesDelayedDecision > 0 ); + silk_assert(psEncC->ltp_mem_length == psEncC->fs_kHz * LTP_MEM_LENGTH_MS); + + /* Regressions were observed on certain AMD Zen CPUs when */ + /* nStatesDelayedDecision is 1 or 2. Ideally we should detect */ + /* these CPUs and enable this optimization on others; however, */ + /* there is no good way to do so under current OPUS framework. */ + return psEncC->nStatesDelayedDecision == 3 || + psEncC->nStatesDelayedDecision == 4; +} + +/* Intrinsics not defined on MSVC */ +#ifdef _MSC_VER +#include <Intsafe.h> +#define __m128i_u __m128i +static inline int __builtin_sadd_overflow(opus_int32 a, opus_int32 b, opus_int32* res) +{ + *res = a+b; + return (*res ^ a) & (*res ^ b) & 0x80000000; +} +static inline int __builtin_ctz(unsigned int x) +{ + DWORD res = 0; + return _BitScanForward(&res, x) ? res : 32; +} +#endif + +static OPUS_INLINE __m128i silk_cvtepi64_epi32_high(__m256i num) +{ + return _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(num, _mm256_set_epi32(0, 0, 0, 0, 7, 5, 3, 1))); +} + +static OPUS_INLINE opus_int16 silk_sat16(opus_int32 num) +{ + num = num > silk_int16_MAX ? silk_int16_MAX : num; + num = num < silk_int16_MIN ? silk_int16_MIN : num; + return num; +} + +static OPUS_INLINE opus_int32 silk_sar_round_32(opus_int32 a, int bits) +{ + silk_assert(bits > 0 && bits < 31); + a += 1 << (bits-1); + return a >> bits; +} + +static OPUS_INLINE opus_int64 silk_sar_round_smulww(opus_int32 a, opus_int32 b, int bits) +{ + silk_assert(bits > 0 && bits < 63); +#ifdef OPUS_CHECK_ASM + return silk_RSHIFT_ROUND(silk_SMULWW(a, b), bits); +#else + /* This code is more correct, but it won't overflow like the C code in some rare cases. */ + silk_assert(bits > 0 && bits < 63); + opus_int64 t = ((opus_int64)a) * ((opus_int64)b); + bits += 16; + t += 1ull << (bits-1); + return t >> bits; +#endif +} + +static OPUS_INLINE opus_int32 silk_add_sat32(opus_int32 a, opus_int32 b) +{ + opus_int32 sum; + if (__builtin_sadd_overflow(a, b, &sum)) + { + return a >= 0 ? silk_int32_MAX : silk_int32_MIN; + } + return sum; +} + +static OPUS_INLINE __m128i silk_mm_srai_round_epi32(__m128i a, int bits) +{ + silk_assert(bits > 0 && bits < 31); + return _mm_srai_epi32(_mm_add_epi32(a, _mm_set1_epi32(1 << (bits - 1))), bits); +} + +/* add/subtract with output saturated */ +static OPUS_INLINE __m128i silk_mm_add_sat_epi32(__m128i a, __m128i b) +{ + __m128i r = _mm_add_epi32(a, b); + __m128i OF = _mm_and_si128(_mm_xor_si128(a, r), _mm_xor_si128(b, r)); /* OF = (sum ^ a) & (sum ^ b) */ + __m128i SAT = _mm_add_epi32(_mm_srli_epi32(a, 31), _mm_set1_epi32(0x7FFFFFFF)); /* SAT = (a >> 31) + 0x7FFFFFFF */ + return _mm_blendv_epi8(r, SAT, _mm_srai_epi32(OF, 31)); +} +static OPUS_INLINE __m128i silk_mm_sub_sat_epi32(__m128i a, __m128i b) +{ + __m128i r = _mm_sub_epi32(a, b); + __m128i OF = _mm_andnot_si128(_mm_xor_si128(b, r), _mm_xor_si128(a, r)); /* OF = (sum ^ a) & (sum ^ ~b) = (sum ^ a) & ~(sum ^ b) */ + __m128i SAT = _mm_add_epi32(_mm_srli_epi32(a, 31), _mm_set1_epi32(0x7FFFFFFF)); /* SAT = (a >> 31) + 0x7FFFFFFF */ + return _mm_blendv_epi8(r, SAT, _mm_srai_epi32(OF, 31)); +} +static OPUS_INLINE __m256i silk_mm256_sub_sat_epi32(__m256i a, __m256i b) +{ + __m256i r = _mm256_sub_epi32(a, b); + __m256i OF = _mm256_andnot_si256(_mm256_xor_si256(b, r), _mm256_xor_si256(a, r)); /* OF = (sum ^ a) & (sum ^ ~b) = (sum ^ a) & ~(sum ^ b) */ + __m256i SAT = _mm256_add_epi32(_mm256_srli_epi32(a, 31), _mm256_set1_epi32(0x7FFFFFFF)); /* SAT = (a >> 31) + 0x7FFFFFFF */ + return _mm256_blendv_epi8(r, SAT, _mm256_srai_epi32(OF, 31)); +} + +static OPUS_INLINE __m128i silk_mm_limit_epi32(__m128i num, opus_int32 limit1, opus_int32 limit2) +{ + opus_int32 lo = limit1 < limit2 ? limit1 : limit2; + opus_int32 hi = limit1 > limit2 ? limit1 : limit2; + + num = _mm_min_epi32(num, _mm_set1_epi32(hi)); + num = _mm_max_epi32(num, _mm_set1_epi32(lo)); + return num; +} + +/* cond < 0 ? -num : num */ +static OPUS_INLINE __m128i silk_mm_sign_epi32(__m128i num, __m128i cond) +{ + return _mm_sign_epi32(num, _mm_or_si128(cond, _mm_set1_epi32(1))); +} +static OPUS_INLINE __m256i silk_mm256_sign_epi32(__m256i num, __m256i cond) +{ + return _mm256_sign_epi32(num, _mm256_or_si256(cond, _mm256_set1_epi32(1))); +} + +/* (a32 * b32) >> 16 */ +static OPUS_INLINE __m128i silk_mm_smulww_epi32(__m128i a, opus_int32 b) +{ + return silk_cvtepi64_epi32_high(_mm256_slli_epi64(_mm256_mul_epi32(_mm256_cvtepi32_epi64(a), _mm256_set1_epi32(b)), 16)); +} + +/* (a32 * (opus_int32)((opus_int16)(b32))) >> 16 output have to be 32bit int */ +static OPUS_INLINE __m128i silk_mm_smulwb_epi32(__m128i a, opus_int32 b) +{ + return silk_cvtepi64_epi32_high(_mm256_mul_epi32(_mm256_cvtepi32_epi64(a), _mm256_set1_epi32(silk_LSHIFT(b, 16)))); +} + +/* (opus_int32)((opus_int16)(a3))) * (opus_int32)((opus_int16)(b32)) output have to be 32bit int */ +static OPUS_INLINE __m256i silk_mm256_smulbb_epi32(__m256i a, __m256i b) +{ + const char FF = (char)0xFF; + __m256i msk = _mm256_set_epi8( + FF, FF, FF, FF, FF, FF, FF, FF, 13, 12, 9, 8, 5, 4, 1, 0, + FF, FF, FF, FF, FF, FF, FF, FF, 13, 12, 9, 8, 5, 4, 1, 0); + __m256i lo = _mm256_mullo_epi16(a, b); + __m256i hi = _mm256_mulhi_epi16(a, b); + lo = _mm256_shuffle_epi8(lo, msk); + hi = _mm256_shuffle_epi8(hi, msk); + return _mm256_unpacklo_epi16(lo, hi); +} + +static OPUS_INLINE __m256i silk_mm256_reverse_epi32(__m256i v) +{ + v = _mm256_shuffle_epi32(v, 0x1B); + v = _mm256_permute4x64_epi64(v, 0x4E); + return v; +} + +static OPUS_INLINE opus_int32 silk_mm256_hsum_epi32(__m256i v) +{ + __m128i sum = _mm_add_epi32(_mm256_extracti128_si256(v, 1), _mm256_extracti128_si256(v, 0)); + sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E)); + sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); + return _mm_cvtsi128_si32(sum); +} + +static OPUS_INLINE __m128i silk_mm_hmin_epi32(__m128i num) +{ + num = _mm_min_epi32(num, _mm_shuffle_epi32(num, 0x4E)); /* 0123 -> 2301 */ + num = _mm_min_epi32(num, _mm_shuffle_epi32(num, 0xB1)); /* 0123 -> 1032 */ + return num; +} + +static OPUS_INLINE __m128i silk_mm_hmax_epi32(__m128i num) +{ + num = _mm_max_epi32(num, _mm_shuffle_epi32(num, 0x4E)); /* 0123 -> 2310 */ + num = _mm_max_epi32(num, _mm_shuffle_epi32(num, 0xB1)); /* 0123 -> 1032 */ + return num; +} + +static OPUS_INLINE __m128i silk_mm_mask_hmin_epi32(__m128i num, __m128i mask) +{ + num = _mm_blendv_epi8(num, _mm_set1_epi32(silk_int32_MAX), mask); + return silk_mm_hmin_epi32(num); +} + +static OPUS_INLINE __m128i silk_mm_mask_hmax_epi32(__m128i num, __m128i mask) +{ + num = _mm_blendv_epi8(num, _mm_set1_epi32(silk_int32_MIN), mask); + return silk_mm_hmax_epi32(num); +} + +static OPUS_INLINE __m128i silk_mm256_rand_epi32(__m128i seed) +{ + seed = _mm_mullo_epi32(seed, _mm_set1_epi32(RAND_MULTIPLIER)); + seed = _mm_add_epi32(seed, _mm_set1_epi32(RAND_INCREMENT)); + return seed; +} + +static OPUS_INLINE opus_int32 silk_index_of_first_equal_epi32(__m128i a, __m128i b) +{ + unsigned int mask = _mm_movemask_epi8(_mm_cmpeq_epi32(a, b)) & 0x1111; + silk_assert(mask != 0); + return __builtin_ctz(mask) >> 2; +} + +static __m128i silk_index_to_selector(opus_int32 index) +{ + silk_assert(index < 4); + index <<= 2; + return _mm_set_epi8( + index + 3, index + 2, index + 1, index + 0, + index + 3, index + 2, index + 1, index + 0, + index + 3, index + 2, index + 1, index + 0, + index + 3, index + 2, index + 1, index + 0); +} + +static opus_int32 silk_select_winner(__m128i num, __m128i selector) +{ + return _mm_cvtsi128_si32(_mm_shuffle_epi8(num, selector)); +} + +typedef struct +{ + __m128i RandState; + __m128i Q_Q10; + __m128i Xq_Q14; + __m128i Pred_Q15; + __m128i Shape_Q14; +} NSQ_del_dec_sample_struct; + +typedef struct +{ + __m128i sLPC_Q14[MAX_SUB_FRAME_LENGTH + NSQ_LPC_BUF_LENGTH]; + __m128i LF_AR_Q14; + __m128i Seed; + __m128i SeedInit; + __m128i RD_Q10; + __m128i Diff_Q14; + __m128i sAR2_Q14[MAX_SHAPE_LPC_ORDER]; + NSQ_del_dec_sample_struct Samples[DECISION_DELAY]; +} NSQ_del_dec_struct; + +static OPUS_INLINE void silk_nsq_del_dec_scale_states_avx2( + const silk_encoder_state *psEncC, /* I Encoder State */ + silk_nsq_state *NSQ, /* I/O NSQ state */ + NSQ_del_dec_struct *psDelDec, /* I/O Delayed decision states */ + const opus_int16 x16[], /* I Input */ + opus_int32 x_sc_Q10[MAX_SUB_FRAME_LENGTH], /* O Input scaled with 1/Gain in Q10 */ + const opus_int16 sLTP[], /* I Re-whitened LTP state in Q0 */ + opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */ + opus_int subfr, /* I Subframe number */ + const opus_int LTP_scale_Q14, /* I LTP state scaling */ + const opus_int32 Gains_Q16[MAX_NB_SUBFR], /* I */ + const opus_int pitchL[MAX_NB_SUBFR], /* I Pitch lag */ + const opus_int signal_type, /* I Signal type */ + const opus_int decisionDelay /* I Decision delay */ +); + +/*******************************************/ +/* LPC analysis filter */ +/* NB! State is kept internally and the */ +/* filter always starts with zero state */ +/* first d output samples are set to zero */ +/*******************************************/ +static OPUS_INLINE void silk_LPC_analysis_filter_avx2( + opus_int16 *out, /* O Output signal */ + const opus_int16 *in, /* I Input signal */ + const opus_int16 *B, /* I MA prediction coefficients, Q12 [order] */ + const opus_int32 len, /* I Signal length */ + const opus_int32 order /* I Filter order */ +); + +/******************************************/ +/* Noise shape quantizer for one subframe */ +/******************************************/ +static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_avx2( + silk_nsq_state *NSQ, /* I/O NSQ state */ + NSQ_del_dec_struct psDelDec[], /* I/O Delayed decision states */ + opus_int signalType, /* I Signal type */ + const opus_int32 x_Q10[], /* I */ + opus_int8 pulses[], /* O */ + opus_int16 xq[], /* O */ + opus_int32 sLTP_Q15[], /* I/O LTP filter state */ + opus_int32 delayedGain_Q10[DECISION_DELAY], /* I/O Gain delay buffer */ + const opus_int16 a_Q12[], /* I Short term prediction coefs */ + const opus_int16 b_Q14[], /* I Long term prediction coefs */ + const opus_int16 AR_shp_Q13[], /* I Noise shaping coefs */ + opus_int lag, /* I Pitch lag */ + opus_int32 HarmShapeFIRPacked_Q14, /* I */ + opus_int Tilt_Q14, /* I Spectral tilt */ + opus_int32 LF_shp_Q14, /* I */ + opus_int32 Gain_Q16, /* I */ + opus_int Lambda_Q10, /* I */ + opus_int offset_Q10, /* I */ + opus_int length, /* I Input length */ + opus_int subfr, /* I Subframe number */ + opus_int shapingLPCOrder, /* I Shaping LPC filter order */ + opus_int predictLPCOrder, /* I Prediction filter order */ + opus_int warping_Q16, /* I */ + __m128i MaskDelDec, /* I Mask of states in decision tree */ + opus_int *smpl_buf_idx, /* I/O Index to newest samples in buffers */ + opus_int decisionDelay /* I */ +); + +void silk_NSQ_del_dec_avx2( + const silk_encoder_state *psEncC, /* I Encoder State */ + silk_nsq_state *NSQ, /* I/O NSQ state */ + SideInfoIndices *psIndices, /* I/O Quantization Indices */ + const opus_int16 x16[], /* I Input */ + opus_int8 pulses[], /* O Quantized pulse signal */ + const opus_int16 *PredCoef_Q12, /* I Short term prediction coefs */ + const opus_int16 LTPCoef_Q14[LTP_ORDER * MAX_NB_SUBFR], /* I Long term prediction coefs */ + const opus_int16 AR_Q13[MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER], /* I Noise shaping coefs */ + const opus_int HarmShapeGain_Q14[MAX_NB_SUBFR], /* I Long term shaping coefs */ + const opus_int Tilt_Q14[MAX_NB_SUBFR], /* I Spectral tilt */ + const opus_int32 LF_shp_Q14[MAX_NB_SUBFR], /* I Low frequency shaping coefs */ + const opus_int32 Gains_Q16[MAX_NB_SUBFR], /* I Quantization step sizes */ + const opus_int32 pitchL[MAX_NB_SUBFR], /* I Pitch lags */ + const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ + const opus_int LTP_scale_Q14 /* I LTP state scaling */ +) +{ +#ifdef OPUS_CHECK_ASM + silk_nsq_state NSQ_c; + SideInfoIndices psIndices_c; + opus_int8 pulses_c[MAX_FRAME_LENGTH]; + const opus_int8 *const pulses_a = pulses; + + silk_memcpy(&NSQ_c, NSQ, sizeof(NSQ_c)); + silk_memcpy(&psIndices_c, psIndices, sizeof(psIndices_c)); + silk_memcpy(pulses_c, pulses, sizeof(pulses_c)); + silk_NSQ_del_dec_c(psEncC, &NSQ_c, &psIndices_c, x16, pulses_c, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, + pitchL, Lambda_Q10, LTP_scale_Q14); +#endif + + if (!verify_assumptions(psEncC)) + { + silk_NSQ_del_dec_c(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14); + return; + } + + opus_int i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr; + opus_int last_smple_idx, smpl_buf_idx, decisionDelay; + const opus_int16 *A_Q12, *B_Q14, *AR_shp_Q13; + opus_int16 *pxq; + VARDECL(opus_int32, sLTP_Q15); + VARDECL(opus_int16, sLTP); + opus_int32 HarmShapeFIRPacked_Q14; + opus_int offset_Q10; + opus_int32 Gain_Q10; + opus_int32 x_sc_Q10[MAX_SUB_FRAME_LENGTH]; + opus_int32 delayedGain_Q10[DECISION_DELAY]; + NSQ_del_dec_struct psDelDec = {0}; + NSQ_del_dec_sample_struct *psSample; + __m128i RDmin_Q10, MaskDelDec, Winner_selector; + SAVE_STACK; + + MaskDelDec = _mm_cvtepi8_epi32(_mm_cvtsi32_si128(0xFFFFFF00ul << ((psEncC->nStatesDelayedDecision - 1) << 3))); + + /* Set unvoiced lag to the previous one, overwrite later for voiced */ + lag = NSQ->lagPrev; + + silk_assert(NSQ->prev_gain_Q16 != 0); + psDelDec.Seed = _mm_and_si128( + _mm_add_epi32(_mm_set_epi32(3, 2, 1, 0), _mm_set1_epi32(psIndices->Seed)), + _mm_set1_epi32(3)); + psDelDec.SeedInit = psDelDec.Seed; + psDelDec.RD_Q10 = _mm_setzero_si128(); + psDelDec.LF_AR_Q14 = _mm_set1_epi32(NSQ->sLF_AR_shp_Q14); + psDelDec.Diff_Q14 = _mm_set1_epi32(NSQ->sDiff_shp_Q14); + psDelDec.Samples[0].Shape_Q14 = _mm_set1_epi32(NSQ->sLTP_shp_Q14[psEncC->ltp_mem_length - 1]); + for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++) + { + psDelDec.sLPC_Q14[i] = _mm_set1_epi32(NSQ->sLPC_Q14[i]); + } + for (i = 0; i < MAX_SHAPE_LPC_ORDER; i++) + { + psDelDec.sAR2_Q14[i] = _mm_set1_epi32(NSQ->sAR2_Q14[i]); + } + + offset_Q10 = silk_Quantization_Offsets_Q10[psIndices->signalType >> 1][psIndices->quantOffsetType]; + smpl_buf_idx = 0; /* index of oldest samples */ + + decisionDelay = silk_min_int(DECISION_DELAY, psEncC->subfr_length); + + /* For voiced frames limit the decision delay to lower than the pitch lag */ + if (psIndices->signalType == TYPE_VOICED) + { + for (k = 0; k < psEncC->nb_subfr; k++) + { + decisionDelay = silk_min_int(decisionDelay, pitchL[k] - LTP_ORDER / 2 - 1); + } + } + else + { + if (lag > 0) + { + decisionDelay = silk_min_int(decisionDelay, lag - LTP_ORDER / 2 - 1); + } + } + + if (psIndices->NLSFInterpCoef_Q2 == 4) + { + LSF_interpolation_flag = 0; + } + else + { + LSF_interpolation_flag = 1; + } + + ALLOC(sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32); + ALLOC(sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16); + /* Set up pointers to start of sub frame */ + pxq = &NSQ->xq[psEncC->ltp_mem_length]; + NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length; + NSQ->sLTP_buf_idx = psEncC->ltp_mem_length; + subfr = 0; + for (k = 0; k < psEncC->nb_subfr; k++) + { + A_Q12 = &PredCoef_Q12[((k >> 1) | (1 ^ LSF_interpolation_flag)) * MAX_LPC_ORDER]; + B_Q14 = <PCoef_Q14[k * LTP_ORDER]; + AR_shp_Q13 = &AR_Q13[k * MAX_SHAPE_LPC_ORDER]; + + /* Noise shape parameters */ + silk_assert(HarmShapeGain_Q14[k] >= 0); + HarmShapeFIRPacked_Q14 = silk_RSHIFT( HarmShapeGain_Q14[ k ], 2 ); + HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q14[ k ], 1 ), 16 ); + + NSQ->rewhite_flag = 0; + if (psIndices->signalType == TYPE_VOICED) + { + /* Voiced */ + lag = pitchL[k]; + + /* Re-whitening */ + if ((k & (3 ^ (LSF_interpolation_flag << 1))) == 0) + { + if (k == 2) + { + /* RESET DELAYED DECISIONS */ + /* Find winner */ + RDmin_Q10 = silk_mm_mask_hmin_epi32(psDelDec.RD_Q10, MaskDelDec); + Winner_ind = silk_index_of_first_equal_epi32(RDmin_Q10, psDelDec.RD_Q10); + Winner_selector = silk_index_to_selector(Winner_ind); + psDelDec.RD_Q10 = _mm_add_epi32( + psDelDec.RD_Q10, + _mm_blendv_epi8( + _mm_set1_epi32(silk_int32_MAX >> 4), + _mm_setzero_si128(), + _mm_cvtepi8_epi32(_mm_cvtsi32_si128(0xFFU << (unsigned)(Winner_ind << 3))))); + + /* Copy final part of signals from winner state to output and long-term filter states */ + last_smple_idx = smpl_buf_idx + decisionDelay; + for (i = 0; i < decisionDelay; i++) + { + last_smple_idx = (last_smple_idx + DECISION_DELAY - 1) % DECISION_DELAY; + psSample = &psDelDec.Samples[last_smple_idx]; + pulses[i - decisionDelay] = + (opus_int8)silk_sar_round_32(silk_select_winner(psSample->Q_Q10, Winner_selector), 10); + pxq[i - decisionDelay] = + silk_sat16((opus_int32)silk_sar_round_smulww(silk_select_winner(psSample->Xq_Q14, Winner_selector), Gains_Q16[1], 14)); + NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - decisionDelay + i] = + silk_select_winner(psSample->Shape_Q14, Winner_selector); + } + + subfr = 0; + } + + /* Rewhiten with new A coefs */ + start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2; + silk_assert(start_idx > 0); + + silk_LPC_analysis_filter_avx2(&sLTP[start_idx], &NSQ->xq[start_idx + k * psEncC->subfr_length], + A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder); + + NSQ->sLTP_buf_idx = psEncC->ltp_mem_length; + NSQ->rewhite_flag = 1; + } + } + + silk_nsq_del_dec_scale_states_avx2(psEncC, NSQ, &psDelDec, x16, x_sc_Q10, sLTP, sLTP_Q15, k, + LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType, decisionDelay); + + silk_noise_shape_quantizer_del_dec_avx2(NSQ, &psDelDec, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15, + delayedGain_Q10, A_Q12, B_Q14, AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[k], LF_shp_Q14[k], + Gains_Q16[k], Lambda_Q10, offset_Q10, psEncC->subfr_length, subfr++, psEncC->shapingLPCOrder, + psEncC->predictLPCOrder, psEncC->warping_Q16, MaskDelDec, &smpl_buf_idx, decisionDelay); + + x16 += psEncC->subfr_length; + pulses += psEncC->subfr_length; + pxq += psEncC->subfr_length; + } + + /* Find winner */ + RDmin_Q10 = silk_mm_mask_hmin_epi32(psDelDec.RD_Q10, MaskDelDec); + Winner_selector = silk_index_to_selector(silk_index_of_first_equal_epi32(RDmin_Q10, psDelDec.RD_Q10)); + + /* Copy final part of signals from winner state to output and long-term filter states */ + psIndices->Seed = silk_select_winner(psDelDec.SeedInit, Winner_selector); + last_smple_idx = smpl_buf_idx + decisionDelay; + Gain_Q10 = Gains_Q16[psEncC->nb_subfr - 1] >> 6; + for (i = 0; i < decisionDelay; i++) + { + last_smple_idx = (last_smple_idx + DECISION_DELAY - 1) % DECISION_DELAY; + psSample = &psDelDec.Samples[last_smple_idx]; + + pulses[i - decisionDelay] = + (opus_int8)silk_sar_round_32(silk_select_winner(psSample->Q_Q10, Winner_selector), 10); + pxq[i - decisionDelay] = + silk_sat16((opus_int32)silk_sar_round_smulww(silk_select_winner(psSample->Xq_Q14, Winner_selector), Gain_Q10, 8)); + NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - decisionDelay + i] = + silk_select_winner(psSample->Shape_Q14, Winner_selector); + } + for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++) + { + NSQ->sLPC_Q14[i] = silk_select_winner(psDelDec.sLPC_Q14[i], Winner_selector); + } + for (i = 0; i < MAX_SHAPE_LPC_ORDER; i++) + { + NSQ->sAR2_Q14[i] = silk_select_winner(psDelDec.sAR2_Q14[i], Winner_selector); + } + + /* Update states */ + NSQ->sLF_AR_shp_Q14 = silk_select_winner(psDelDec.LF_AR_Q14, Winner_selector); + NSQ->sDiff_shp_Q14 = silk_select_winner(psDelDec.Diff_Q14, Winner_selector); + NSQ->lagPrev = pitchL[psEncC->nb_subfr - 1]; + + /* Save quantized speech signal */ + silk_memmove(NSQ->xq, &NSQ->xq[psEncC->frame_length], psEncC->ltp_mem_length * sizeof(opus_int16)); + silk_memmove(NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[psEncC->frame_length], psEncC->ltp_mem_length * sizeof(opus_int32)); + +#ifdef OPUS_CHECK_ASM + silk_assert(!memcmp(&NSQ_c, NSQ, sizeof(NSQ_c))); + silk_assert(!memcmp(&psIndices_c, psIndices, sizeof(psIndices_c))); + silk_assert(!memcmp(pulses_c, pulses_a, sizeof(pulses_c))); +#endif + + RESTORE_STACK; +} + +static OPUS_INLINE __m128i silk_noise_shape_quantizer_short_prediction_x4(const __m128i *buf32, const opus_int16 *coef16, opus_int order) +{ + __m256i out; + silk_assert(order == 10 || order == 16); + + /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */ + out = _mm256_set1_epi32(order >> 1); + out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-0]), _mm256_set1_epi32(silk_LSHIFT(coef16[0], 16)))); /* High DWORD */ + out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-1]), _mm256_set1_epi32(silk_LSHIFT(coef16[1], 16)))); /* High DWORD */ + out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-2]), _mm256_set1_epi32(silk_LSHIFT(coef16[2], 16)))); /* High DWORD */ + out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-3]), _mm256_set1_epi32(silk_LSHIFT(coef16[3], 16)))); /* High DWORD */ + out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-4]), _mm256_set1_epi32(silk_LSHIFT(coef16[4], 16)))); /* High DWORD */ + out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-5]), _mm256_set1_epi32(silk_LSHIFT(coef16[5], 16)))); /* High DWORD */ + out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-6]), _mm256_set1_epi32(silk_LSHIFT(coef16[6], 16)))); /* High DWORD */ + out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-7]), _mm256_set1_epi32(silk_LSHIFT(coef16[7], 16)))); /* High DWORD */ + out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-8]), _mm256_set1_epi32(silk_LSHIFT(coef16[8], 16)))); /* High DWORD */ + out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-9]), _mm256_set1_epi32(silk_LSHIFT(coef16[9], 16)))); /* High DWORD */ + + if (order == 16) + { + out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-10]), _mm256_set1_epi32(silk_LSHIFT(coef16[10], 16)))); /* High DWORD */ + out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-11]), _mm256_set1_epi32(silk_LSHIFT(coef16[11], 16)))); /* High DWORD */ + out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-12]), _mm256_set1_epi32(silk_LSHIFT(coef16[12], 16)))); /* High DWORD */ + out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-13]), _mm256_set1_epi32(silk_LSHIFT(coef16[13], 16)))); /* High DWORD */ + out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-14]), _mm256_set1_epi32(silk_LSHIFT(coef16[14], 16)))); /* High DWORD */ + out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-15]), _mm256_set1_epi32(silk_LSHIFT(coef16[15], 16)))); /* High DWORD */ + } + return silk_cvtepi64_epi32_high(out); +} + +/******************************************/ +/* Noise shape quantizer for one subframe */ +/******************************************/ +static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_avx2( + silk_nsq_state *NSQ, /* I/O NSQ state */ + NSQ_del_dec_struct *psDelDec, /* I/O Delayed decision states */ + opus_int signalType, /* I Signal type */ + const opus_int32 x_Q10[], /* I */ + opus_int8 pulses[], /* O */ + opus_int16 xq[], /* O */ + opus_int32 sLTP_Q15[], /* I/O LTP filter state */ + opus_int32 delayedGain_Q10[DECISION_DELAY], /* I/O Gain delay buffer */ + const opus_int16 a_Q12[], /* I Short term prediction coefs */ + const opus_int16 b_Q14[], /* I Long term prediction coefs */ + const opus_int16 AR_shp_Q13[], /* I Noise shaping coefs */ + opus_int lag, /* I Pitch lag */ + opus_int32 HarmShapeFIRPacked_Q14, /* I */ + opus_int Tilt_Q14, /* I Spectral tilt */ + opus_int32 LF_shp_Q14, /* I */ + opus_int32 Gain_Q16, /* I */ + opus_int Lambda_Q10, /* I */ + opus_int offset_Q10, /* I */ + opus_int length, /* I Input length */ + opus_int subfr, /* I Subframe number */ + opus_int shapingLPCOrder, /* I Shaping LPC filter order */ + opus_int predictLPCOrder, /* I Prediction filter order */ + opus_int warping_Q16, /* I */ + __m128i MaskDelDec, /* I Mask of states in decision tree */ + opus_int *smpl_buf_idx, /* I/O Index to newest samples in buffers */ + opus_int decisionDelay /* I */ +) +{ + int i; + opus_int32 *shp_lag_ptr = &NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2]; + opus_int32 *pred_lag_ptr = &sLTP_Q15[NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2]; + opus_int32 Gain_Q10 = Gain_Q16 >> 6; + + for (i = 0; i < length; i++) + { + /* Perform common calculations used in all states */ + /* NSQ_sample_struct */ + /* Low 128 bits => 1st set */ + /* High 128 bits => 2nd set */ + int j; + __m256i SS_Q_Q10; + __m256i SS_RD_Q10; + __m256i SS_xq_Q14; + __m256i SS_LF_AR_Q14; + __m256i SS_Diff_Q14; + __m256i SS_sLTP_shp_Q14; + __m256i SS_LPC_exc_Q14; + __m256i exc_Q14; + __m256i q_Q10, rr_Q10, rd_Q10; + __m256i mask; + __m128i LPC_pred_Q14, n_AR_Q14; + __m128i RDmin_Q10, RDmax_Q10; + __m128i n_LF_Q14; + __m128i r_Q10, q1_Q0, q1_Q10, q2_Q10; + __m128i Winner_rand_state, Winner_selector; + __m128i tmp0, tmp1; + NSQ_del_dec_sample_struct *psLastSample, *psSample; + opus_int32 RDmin_ind, RDmax_ind, last_smple_idx; + opus_int32 LTP_pred_Q14, n_LTP_Q14; + + /* Long-term prediction */ + if (signalType == TYPE_VOICED) + { + /* Unrolled loop */ + /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */ + LTP_pred_Q14 = 2; + LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-0], b_Q14[0]); + LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-1], b_Q14[1]); + LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-2], b_Q14[2]); + LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-3], b_Q14[3]); + LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-4], b_Q14[4]); + LTP_pred_Q14 = silk_LSHIFT(LTP_pred_Q14, 1); /* Q13 -> Q14 */ + pred_lag_ptr++; + } + else + { + LTP_pred_Q14 = 0; + } + + /* Long-term shaping */ + if (lag > 0) + { + /* Symmetric, packed FIR coefficients */ + n_LTP_Q14 = silk_add_sat32(shp_lag_ptr[0], shp_lag_ptr[-2]); + n_LTP_Q14 = silk_SMULWB(n_LTP_Q14, HarmShapeFIRPacked_Q14); + n_LTP_Q14 = n_LTP_Q14 + silk_SMULWT(shp_lag_ptr[-1], HarmShapeFIRPacked_Q14); + n_LTP_Q14 = LTP_pred_Q14 - (silk_LSHIFT(n_LTP_Q14, 2)); /* Q12 -> Q14 */ + shp_lag_ptr++; + } + else + { + n_LTP_Q14 = 0; + } + + /* BEGIN Updating Delayed Decision States */ + + /* Generate dither */ + psDelDec->Seed = silk_mm256_rand_epi32(psDelDec->Seed); + + /* Short-term prediction */ + LPC_pred_Q14 = silk_noise_shape_quantizer_short_prediction_x4(&psDelDec->sLPC_Q14[NSQ_LPC_BUF_LENGTH - 1 + i], a_Q12, predictLPCOrder); + LPC_pred_Q14 = _mm_slli_epi32(LPC_pred_Q14, 4); /* Q10 -> Q14 */ + + /* Noise shape feedback */ + silk_assert(shapingLPCOrder > 0); + silk_assert((shapingLPCOrder & 1) == 0); /* check that order is even */ + /* Output of lowpass section */ + tmp0 = _mm_add_epi32(psDelDec->Diff_Q14, silk_mm_smulwb_epi32(psDelDec->sAR2_Q14[0], warping_Q16)); + n_AR_Q14 = _mm_set1_epi32(shapingLPCOrder >> 1); + for (j = 0; j < shapingLPCOrder - 1; j++) + { + /* Output of allpass section */ + tmp1 = psDelDec->sAR2_Q14[j]; + psDelDec->sAR2_Q14[j] = tmp0; + n_AR_Q14 = _mm_add_epi32(n_AR_Q14, silk_mm_smulwb_epi32(tmp0, AR_shp_Q13[j])); + tmp0 = _mm_add_epi32(tmp1, silk_mm_smulwb_epi32(_mm_sub_epi32(psDelDec->sAR2_Q14[j + 1], tmp0), warping_Q16)); + } + psDelDec->sAR2_Q14[shapingLPCOrder - 1] = tmp0; + n_AR_Q14 = _mm_add_epi32(n_AR_Q14, silk_mm_smulwb_epi32(tmp0, AR_shp_Q13[shapingLPCOrder - 1])); + + n_AR_Q14 = _mm_slli_epi32(n_AR_Q14, 1); /* Q11 -> Q12 */ + n_AR_Q14 = _mm_add_epi32(n_AR_Q14, silk_mm_smulwb_epi32(psDelDec->LF_AR_Q14, Tilt_Q14)); /* Q12 */ + n_AR_Q14 = _mm_slli_epi32(n_AR_Q14, 2); /* Q12 -> Q14 */ + + tmp0 = silk_mm_smulwb_epi32(psDelDec->Samples[*smpl_buf_idx].Shape_Q14, LF_shp_Q14); /* Q12 */ + tmp1 = silk_mm_smulwb_epi32(psDelDec->LF_AR_Q14, LF_shp_Q14 >> 16); /* Q12 */ + n_LF_Q14 = _mm_add_epi32(tmp0, tmp1); /* Q12 */ + n_LF_Q14 = _mm_slli_epi32(n_LF_Q14, 2); /* Q12 -> Q14 */ + + /* Input minus prediction plus noise feedback */ + /* r = x[ i ] - LTP_pred - LPC_pred + n_AR + n_Tilt + n_LF + n_LTP */ + tmp0 = silk_mm_add_sat_epi32(n_AR_Q14, n_LF_Q14); /* Q14 */ + tmp1 = _mm_add_epi32(_mm_set1_epi32(n_LTP_Q14), LPC_pred_Q14); /* Q13 */ + tmp0 = silk_mm_sub_sat_epi32(tmp1, tmp0); /* Q13 */ + tmp0 = silk_mm_srai_round_epi32(tmp0, 4); /* Q10 */ + + r_Q10 = _mm_sub_epi32(_mm_set1_epi32(x_Q10[i]), tmp0); /* residual error Q10 */ + + /* Flip sign depending on dither */ + r_Q10 = silk_mm_sign_epi32(r_Q10, psDelDec->Seed); + r_Q10 = silk_mm_limit_epi32(r_Q10, -(31 << 10), 30 << 10); + + /* Find two quantization level candidates and measure their rate-distortion */ + q1_Q10 = _mm_sub_epi32(r_Q10, _mm_set1_epi32(offset_Q10)); + q1_Q0 = _mm_srai_epi32(q1_Q10, 10); + if (Lambda_Q10 > 2048) + { + /* For aggressive RDO, the bias becomes more than one pulse. */ + tmp0 = _mm_sub_epi32(_mm_abs_epi32(q1_Q10), _mm_set1_epi32(Lambda_Q10 / 2 - 512)); /* rdo_offset */ + q1_Q0 = _mm_srai_epi32(q1_Q10, 31); + tmp1 = _mm_cmpgt_epi32(tmp0, _mm_setzero_si128()); + tmp0 = _mm_srai_epi32(silk_mm_sign_epi32(tmp0, q1_Q10), 10); + q1_Q0 = _mm_blendv_epi8(q1_Q0, tmp0, tmp1); + } + + tmp0 = _mm_sign_epi32(_mm_set1_epi32(QUANT_LEVEL_ADJUST_Q10), q1_Q0); + q1_Q10 = _mm_sub_epi32(_mm_slli_epi32(q1_Q0, 10), tmp0); + q1_Q10 = _mm_add_epi32(q1_Q10, _mm_set1_epi32(offset_Q10)); + + /* check if q1_Q0 is 0 or -1 */ + tmp0 = _mm_add_epi32(_mm_srli_epi32(q1_Q0, 31), q1_Q0); + tmp1 = _mm_cmpeq_epi32(tmp0, _mm_setzero_si128()); + tmp0 = _mm_blendv_epi8(_mm_set1_epi32(1024), _mm_set1_epi32(1024 - QUANT_LEVEL_ADJUST_Q10), tmp1); + q2_Q10 = _mm_add_epi32(q1_Q10, tmp0); + q_Q10 = _mm256_set_m128i(q2_Q10, q1_Q10); + + rr_Q10 = _mm256_sub_epi32(_mm256_broadcastsi128_si256(r_Q10), q_Q10); + rd_Q10 = _mm256_abs_epi32(q_Q10); + rr_Q10 = silk_mm256_smulbb_epi32(rr_Q10, rr_Q10); + rd_Q10 = silk_mm256_smulbb_epi32(rd_Q10, _mm256_set1_epi32(Lambda_Q10)); + rd_Q10 = _mm256_add_epi32(rd_Q10, rr_Q10); + rd_Q10 = _mm256_srai_epi32(rd_Q10, 10); + + mask = _mm256_broadcastsi128_si256(_mm_cmplt_epi32(_mm256_extracti128_si256(rd_Q10, 0), _mm256_extracti128_si256(rd_Q10, 1))); + SS_RD_Q10 = _mm256_add_epi32( + _mm256_broadcastsi128_si256(psDelDec->RD_Q10), + _mm256_blendv_epi8( + _mm256_permute2x128_si256(rd_Q10, rd_Q10, 0x1), + rd_Q10, + mask)); + SS_Q_Q10 = _mm256_blendv_epi8( + _mm256_permute2x128_si256(q_Q10, q_Q10, 0x1), + q_Q10, + mask); + + /* Update states for best and second best quantization */ + + /* Quantized excitation */ + exc_Q14 = silk_mm256_sign_epi32(_mm256_slli_epi32(SS_Q_Q10, 4), _mm256_broadcastsi128_si256(psDelDec->Seed)); + + /* Add predictions */ + exc_Q14 = _mm256_add_epi32(exc_Q14, _mm256_set1_epi32(LTP_pred_Q14)); + SS_LPC_exc_Q14 = _mm256_slli_epi32(exc_Q14, 1); + SS_xq_Q14 = _mm256_add_epi32(exc_Q14, _mm256_broadcastsi128_si256(LPC_pred_Q14)); + + /* Update states */ + SS_Diff_Q14 = _mm256_sub_epi32(SS_xq_Q14, _mm256_set1_epi32(silk_LSHIFT(x_Q10[i], 4))); + SS_LF_AR_Q14 = _mm256_sub_epi32(SS_Diff_Q14, _mm256_broadcastsi128_si256(n_AR_Q14)); + SS_sLTP_shp_Q14 = silk_mm256_sub_sat_epi32(SS_LF_AR_Q14, _mm256_broadcastsi128_si256(n_LF_Q14)); + + /* END Updating Delayed Decision States */ + + *smpl_buf_idx = (*smpl_buf_idx + DECISION_DELAY - 1) % DECISION_DELAY; + last_smple_idx = (*smpl_buf_idx + decisionDelay) % DECISION_DELAY; + psLastSample = &psDelDec->Samples[last_smple_idx]; + + /* Find winner */ + RDmin_Q10 = silk_mm_mask_hmin_epi32(_mm256_castsi256_si128(SS_RD_Q10), MaskDelDec); + Winner_selector = silk_index_to_selector(silk_index_of_first_equal_epi32(RDmin_Q10, _mm256_castsi256_si128(SS_RD_Q10))); + + /* Increase RD values of expired states */ + Winner_rand_state = _mm_shuffle_epi8(psLastSample->RandState, Winner_selector); + + SS_RD_Q10 = _mm256_blendv_epi8( + _mm256_add_epi32(SS_RD_Q10, _mm256_set1_epi32(silk_int32_MAX >> 4)), + SS_RD_Q10, + _mm256_broadcastsi128_si256(_mm_cmpeq_epi32(psLastSample->RandState, Winner_rand_state))); + + /* find worst in first set */ + RDmax_Q10 = silk_mm_mask_hmax_epi32(_mm256_extracti128_si256(SS_RD_Q10, 0), MaskDelDec); + /* find best in second set */ + RDmin_Q10 = silk_mm_mask_hmin_epi32(_mm256_extracti128_si256(SS_RD_Q10, 1), MaskDelDec); + + /* Replace a state if best from second set outperforms worst in first set */ + tmp0 = _mm_cmplt_epi32(RDmin_Q10, RDmax_Q10); + if (!_mm_test_all_zeros(tmp0, tmp0)) + { + int t; + RDmax_ind = silk_index_of_first_equal_epi32(RDmax_Q10, _mm256_extracti128_si256(SS_RD_Q10, 0)); + RDmin_ind = silk_index_of_first_equal_epi32(RDmin_Q10, _mm256_extracti128_si256(SS_RD_Q10, 1)); + tmp1 = _mm_cvtepi8_epi32(_mm_cvtsi32_si128(0xFFU << (unsigned)(RDmax_ind << 3))); + tmp0 = _mm_blendv_epi8( + _mm_set_epi8(0xF, 0xE, 0xD, 0xC, 0xB, 0xA, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0), + silk_index_to_selector(RDmin_ind), + tmp1); + for (t = i; t < MAX_SUB_FRAME_LENGTH + NSQ_LPC_BUF_LENGTH; t++) + { + psDelDec->sLPC_Q14[t] = _mm_shuffle_epi8(psDelDec->sLPC_Q14[t], tmp0); + } + psDelDec->Seed = _mm_shuffle_epi8(psDelDec->Seed, tmp0); + psDelDec->SeedInit = _mm_shuffle_epi8(psDelDec->SeedInit, tmp0); + for (t = 0; t < MAX_SHAPE_LPC_ORDER; t++) + { + psDelDec->sAR2_Q14[t] = _mm_shuffle_epi8(psDelDec->sAR2_Q14[t], tmp0); + } + for (t = 0; t < DECISION_DELAY; t++) + { + psDelDec->Samples[t].RandState = _mm_shuffle_epi8(psDelDec->Samples[t].RandState, tmp0); + psDelDec->Samples[t].Q_Q10 = _mm_shuffle_epi8(psDelDec->Samples[t].Q_Q10, tmp0); + psDelDec->Samples[t].Xq_Q14 = _mm_shuffle_epi8(psDelDec->Samples[t].Xq_Q14, tmp0); + psDelDec->Samples[t].Pred_Q15 = _mm_shuffle_epi8(psDelDec->Samples[t].Pred_Q15, tmp0); + psDelDec->Samples[t].Shape_Q14 = _mm_shuffle_epi8(psDelDec->Samples[t].Shape_Q14, tmp0); + } + mask = _mm256_castsi128_si256(_mm_blendv_epi8(_mm_set_epi32(0x3, 0x2, 0x1, 0x0), _mm_set1_epi32(RDmin_ind + 4), tmp1)); + SS_Q_Q10 = _mm256_permutevar8x32_epi32(SS_Q_Q10, mask); + SS_RD_Q10 = _mm256_permutevar8x32_epi32(SS_RD_Q10, mask); + SS_xq_Q14 = _mm256_permutevar8x32_epi32(SS_xq_Q14, mask); + SS_LF_AR_Q14 = _mm256_permutevar8x32_epi32(SS_LF_AR_Q14, mask); + SS_Diff_Q14 = _mm256_permutevar8x32_epi32(SS_Diff_Q14, mask); + SS_sLTP_shp_Q14 = _mm256_permutevar8x32_epi32(SS_sLTP_shp_Q14, mask); + SS_LPC_exc_Q14 = _mm256_permutevar8x32_epi32(SS_LPC_exc_Q14, mask); + } + + /* Write samples from winner to output and long-term filter states */ + if (subfr > 0 || i >= decisionDelay) + { + pulses[i - decisionDelay] = + (opus_int8)silk_sar_round_32(silk_select_winner(psLastSample->Q_Q10, Winner_selector), 10); + xq[i - decisionDelay] = + silk_sat16((opus_int32)silk_sar_round_smulww(silk_select_winner(psLastSample->Xq_Q14, Winner_selector), delayedGain_Q10[last_smple_idx], 8)); + NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - decisionDelay] = + silk_select_winner(psLastSample->Shape_Q14, Winner_selector); + sLTP_Q15[NSQ->sLTP_buf_idx - decisionDelay] = + silk_select_winner(psLastSample->Pred_Q15, Winner_selector); + } + NSQ->sLTP_shp_buf_idx++; + NSQ->sLTP_buf_idx++; + + /* Update states */ + psSample = &psDelDec->Samples[*smpl_buf_idx]; + psDelDec->Seed = _mm_add_epi32(psDelDec->Seed, silk_mm_srai_round_epi32(_mm256_castsi256_si128(SS_Q_Q10), 10)); + psDelDec->LF_AR_Q14 = _mm256_castsi256_si128(SS_LF_AR_Q14); + psDelDec->Diff_Q14 = _mm256_castsi256_si128(SS_Diff_Q14); + psDelDec->sLPC_Q14[i + NSQ_LPC_BUF_LENGTH] = _mm256_castsi256_si128(SS_xq_Q14); + psDelDec->RD_Q10 = _mm256_castsi256_si128(SS_RD_Q10); + psSample->Xq_Q14 = _mm256_castsi256_si128(SS_xq_Q14); + psSample->Q_Q10 = _mm256_castsi256_si128(SS_Q_Q10); + psSample->Pred_Q15 = _mm256_castsi256_si128(SS_LPC_exc_Q14); + psSample->Shape_Q14 = _mm256_castsi256_si128(SS_sLTP_shp_Q14); + psSample->RandState = psDelDec->Seed; + delayedGain_Q10[*smpl_buf_idx] = Gain_Q10; + } + /* Update LPC states */ + for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++) + { + psDelDec->sLPC_Q14[i] = (&psDelDec->sLPC_Q14[length])[i]; + } +} + +static OPUS_INLINE void silk_nsq_del_dec_scale_states_avx2( + const silk_encoder_state *psEncC, /* I Encoder State */ + silk_nsq_state *NSQ, /* I/O NSQ state */ + NSQ_del_dec_struct *psDelDec, /* I/O Delayed decision states */ + const opus_int16 x16[], /* I Input */ + opus_int32 x_sc_Q10[MAX_SUB_FRAME_LENGTH], /* O Input scaled with 1/Gain in Q10 */ + const opus_int16 sLTP[], /* I Re-whitened LTP state in Q0 */ + opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */ + opus_int subfr, /* I Subframe number */ + const opus_int LTP_scale_Q14, /* I LTP state scaling */ + const opus_int32 Gains_Q16[MAX_NB_SUBFR], /* I */ + const opus_int pitchL[MAX_NB_SUBFR], /* I Pitch lag */ + const opus_int signal_type, /* I Signal type */ + const opus_int decisionDelay /* I Decision delay */ +) +{ + int i; + opus_int lag; + opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q26; + NSQ_del_dec_sample_struct *psSample; + + lag = pitchL[subfr]; + inv_gain_Q31 = silk_INVERSE32_varQ(silk_max(Gains_Q16[subfr], 1), 47); + silk_assert(inv_gain_Q31 != 0); + + /* Scale input */ + inv_gain_Q26 = silk_sar_round_32(inv_gain_Q31, 5); + for (i = 0; i < psEncC->subfr_length; i+=4) + { + __m256i x = _mm256_cvtepi16_epi64(_mm_loadu_si64(&x16[i])); + x = _mm256_slli_epi64(_mm256_mul_epi32(x, _mm256_set1_epi32(inv_gain_Q26)), 16); + _mm_storeu_si128((__m128i_u*)&x_sc_Q10[i], silk_cvtepi64_epi32_high(x)); + } + + /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */ + if (NSQ->rewhite_flag) + { + if (subfr == 0) + { + /* Do LTP downscaling */ + inv_gain_Q31 = silk_LSHIFT(silk_SMULWB(inv_gain_Q31, LTP_scale_Q14), 2); + } + for (i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx; i++) + { + silk_assert(i < MAX_FRAME_LENGTH); + sLTP_Q15[i] = silk_SMULWB(inv_gain_Q31, sLTP[i]); + } + } + + /* Adjust for changing gain */ + if (Gains_Q16[subfr] != NSQ->prev_gain_Q16) + { + gain_adj_Q16 = silk_DIV32_varQ(NSQ->prev_gain_Q16, Gains_Q16[subfr], 16); + + /* Scale long-term shaping state */ + for (i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx; i+=4) + { + __m128i_u* p = (__m128i_u*)&NSQ->sLTP_shp_Q14[i]; + *p = silk_mm_smulww_epi32(*p, gain_adj_Q16); + } + + /* Scale long-term prediction state */ + if (signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0) + { + for (i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx - decisionDelay; i++) + { + sLTP_Q15[i] = ((opus_int64)sLTP_Q15[i]) * ((opus_int64)gain_adj_Q16) >> 16; + } + } + + /* Scale scalar states */ + psDelDec->LF_AR_Q14 = silk_mm_smulww_epi32(psDelDec->LF_AR_Q14, gain_adj_Q16); + psDelDec->Diff_Q14 = silk_mm_smulww_epi32(psDelDec->Diff_Q14, gain_adj_Q16); + + /* Scale short-term prediction and shaping states */ + for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++) + { + psDelDec->sLPC_Q14[i] = silk_mm_smulww_epi32(psDelDec->sLPC_Q14[i], gain_adj_Q16); + } + for (i = 0; i < DECISION_DELAY; i++) + { + psSample = &psDelDec->Samples[i]; + psSample->Pred_Q15 = silk_mm_smulww_epi32(psSample->Pred_Q15, gain_adj_Q16); + psSample->Shape_Q14 = silk_mm_smulww_epi32(psSample->Shape_Q14, gain_adj_Q16); + } + for (i = 0; i < MAX_SHAPE_LPC_ORDER; i++) + { + psDelDec->sAR2_Q14[i] = silk_mm_smulww_epi32(psDelDec->sAR2_Q14[i], gain_adj_Q16); + } + + /* Save inverse gain */ + NSQ->prev_gain_Q16 = Gains_Q16[subfr]; + } +} + +static OPUS_INLINE void silk_LPC_analysis_filter_avx2( + opus_int16 *out, /* O Output signal */ + const opus_int16 *in, /* I Input signal */ + const opus_int16 *B, /* I MA prediction coefficients, Q12 [order] */ + const opus_int32 len, /* I Signal length */ + const opus_int32 order /* I Filter order */ +) +{ + int i; + opus_int32 out32_Q12, out32; + silk_assert(order == 10 || order == 16); + + for(i = order; i < len; i++ ) + { + const opus_int16 *in_ptr = &in[ i ]; + /* Allowing wrap around so that two wraps can cancel each other. The rare + cases where the result wraps around can only be triggered by invalid streams*/ + + __m256i in_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i_u*)&in_ptr[-8])); + __m256i B_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i_u*)& B[0])); + __m256i sum = _mm256_mullo_epi32(in_v, silk_mm256_reverse_epi32(B_v)); + if (order > 10) + { + in_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i_u*)&in_ptr[-16])); + B_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i_u*)&B [8])); + B_v = silk_mm256_reverse_epi32(B_v); + } + else + { + in_v = _mm256_cvtepi16_epi32(_mm_loadu_si32(&in_ptr[-10])); + B_v = _mm256_cvtepi16_epi32(_mm_loadu_si32(&B [8])); + B_v = _mm256_shuffle_epi32(B_v, 0x01); + } + sum = _mm256_add_epi32(sum, _mm256_mullo_epi32(in_v, B_v)); + + out32_Q12 = silk_mm256_hsum_epi32(sum); + + /* Subtract prediction */ + out32_Q12 = silk_SUB32_ovflw( silk_LSHIFT( (opus_int32)*in_ptr, 12 ), out32_Q12 ); + + /* Scale to Q0 */ + out32 = silk_sar_round_32(out32_Q12, 12); + + /* Saturate output */ + out[ i ] = silk_sat16(out32); + } + + /* Set first d output samples to zero */ + silk_memset( out, 0, order * sizeof( opus_int16 ) ); +} diff --git a/media/libopus/silk/x86/NSQ_del_dec_sse4_1.c b/media/libopus/silk/x86/NSQ_del_dec_sse4_1.c index 3521cae703..5937682d2a 100644 --- a/media/libopus/silk/x86/NSQ_del_dec_sse4_1.c +++ b/media/libopus/silk/x86/NSQ_del_dec_sse4_1.c @@ -119,7 +119,7 @@ void silk_NSQ_del_dec_sse4_1( SideInfoIndices *psIndices, /* I/O Quantization Indices */ const opus_int16 x16[], /* I Input */ opus_int8 pulses[], /* O Quantized pulse signal */ - const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ + const opus_int16 *PredCoef_Q12, /* I Short term prediction coefs */ const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ @@ -428,7 +428,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1( LTP_pred_Q14 = 2; { __m128i tmpa, tmpb, pred_lag_ptr_tmp; - pred_lag_ptr_tmp = _mm_loadu_si128( (__m128i *)(&pred_lag_ptr[ -3 ] ) ); + pred_lag_ptr_tmp = _mm_loadu_si128( (__m128i *)(void*)(&pred_lag_ptr[ -3 ] ) ); pred_lag_ptr_tmp = _mm_shuffle_epi32( pred_lag_ptr_tmp, 0x1B ); tmpa = _mm_mul_epi32( pred_lag_ptr_tmp, b_Q12_0123 ); tmpa = _mm_srli_si128( tmpa, 2 ); @@ -483,7 +483,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1( tmpb = _mm_setzero_si128(); /* step 1 */ - psLPC_Q14_tmp = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -3 ] ) ); /* -3, -2 , -1, 0 */ + psLPC_Q14_tmp = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[ -3 ] ) ); /* -3, -2 , -1, 0 */ psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B ); /* 0, -1, -2, -3 */ tmpa = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_0123 ); /* 0, -1, -2, -3 * 0123 -> 0*0, 2*-2 */ @@ -497,7 +497,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1( tmpb = _mm_add_epi32( tmpb, psLPC_Q14_tmp ); /* step 2 */ - psLPC_Q14_tmp = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -7 ] ) ); + psLPC_Q14_tmp = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[ -7 ] ) ); psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B ); tmpa = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_4567 ); tmpa = _mm_srli_epi64( tmpa, 16 ); @@ -512,7 +512,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1( if ( opus_likely( predictLPCOrder == 16 ) ) { /* step 3 */ - psLPC_Q14_tmp = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -11 ] ) ); + psLPC_Q14_tmp = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[ -11 ] ) ); psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B ); tmpa = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_89AB ); tmpa = _mm_srli_epi64( tmpa, 16 ); @@ -525,7 +525,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1( tmpb = _mm_add_epi32( tmpb, psLPC_Q14_tmp ); /* step 4 */ - psLPC_Q14_tmp = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -15 ] ) ); + psLPC_Q14_tmp = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[ -15 ] ) ); psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B ); tmpa = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_CDEF ); tmpa = _mm_srli_epi64( tmpa, 16 ); @@ -830,7 +830,7 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1( xmm_x16_x2x0 = _mm_blend_epi16( xmm_x16_x2x0, xmm_x16_x3x1, 0xCC ); - _mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ] ) ), xmm_x16_x2x0 ); + _mm_storeu_si128( (__m128i *)(void*)(&(x_sc_Q10[ i ] ) ), xmm_x16_x2x0 ); } for( ; i < psEncC->subfr_length; i++ ) { @@ -862,7 +862,7 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1( for( i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx - 3; i += 4 ) { - xmm_sLTP_shp_Q14_x2x0 = _mm_loadu_si128( (__m128i *)(&(NSQ->sLTP_shp_Q14[ i ] ) ) ); + xmm_sLTP_shp_Q14_x2x0 = _mm_loadu_si128( (__m128i *)(void*)(&(NSQ->sLTP_shp_Q14[ i ] ) ) ); /* equal shift right 4 bytes*/ xmm_sLTP_shp_Q14_x3x1 = _mm_shuffle_epi32( xmm_sLTP_shp_Q14_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) ); @@ -874,7 +874,7 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1( xmm_sLTP_shp_Q14_x2x0 = _mm_blend_epi16( xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1, 0xCC ); - _mm_storeu_si128( (__m128i *)(&(NSQ->sLTP_shp_Q14[ i ] ) ), xmm_sLTP_shp_Q14_x2x0 ); + _mm_storeu_si128( (__m128i *)(void*)(&(NSQ->sLTP_shp_Q14[ i ] ) ), xmm_sLTP_shp_Q14_x2x0 ); } for( ; i < NSQ->sLTP_shp_buf_idx; i++ ) { diff --git a/media/libopus/silk/x86/NSQ_sse4_1.c b/media/libopus/silk/x86/NSQ_sse4_1.c index d5ae1d3b1c..3c9aca7ba1 100644 --- a/media/libopus/silk/x86/NSQ_sse4_1.c +++ b/media/libopus/silk/x86/NSQ_sse4_1.c @@ -77,7 +77,7 @@ void silk_NSQ_sse4_1( SideInfoIndices *psIndices, /* I/O Quantization Indices */ const opus_int16 x16[], /* I Input */ opus_int8 pulses[], /* O Quantized pulse signal */ - const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ + const opus_int16 *PredCoef_Q12, /* I Short term prediction coefs */ const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ @@ -338,21 +338,21 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1( xmm_one = _mm_set_epi8( 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 ); /* load a_Q12[0] - a_Q12[7] */ - a_Q12_01234567 = _mm_loadu_si128( (__m128i *)(&a_Q12[ 0 ] ) ); + a_Q12_01234567 = _mm_loadu_si128( (__m128i *)(void*)(&a_Q12[ 0 ] ) ); /* load a_Q12[ 8 ] - a_Q12[ 15 ] */ - a_Q12_89ABCDEF = _mm_loadu_si128( (__m128i *)(&a_Q12[ 8 ] ) ); + a_Q12_89ABCDEF = _mm_loadu_si128( (__m128i *)(void*)(&a_Q12[ 8 ] ) ); a_Q12_01234567 = _mm_shuffle_epi8( a_Q12_01234567, xmm_one ); a_Q12_89ABCDEF = _mm_shuffle_epi8( a_Q12_89ABCDEF, xmm_one ); /* load AR_shp_Q13 */ - AR_shp_Q13_76543210 = _mm_loadu_si128( (__m128i *)(&AR_shp_Q13[0] ) ); + AR_shp_Q13_76543210 = _mm_loadu_si128( (__m128i *)(void*)(&AR_shp_Q13[0] ) ); /* load psLPC_Q14 */ xmm_one = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 ); - xmm_tempa = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[-16]) ); - xmm_tempb = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[-12]) ); + xmm_tempa = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[-16]) ); + xmm_tempb = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[-12]) ); xmm_tempa = _mm_shuffle_epi8( xmm_tempa, xmm_one ); xmm_tempb = _mm_shuffle_epi8( xmm_tempb, xmm_one ); @@ -360,8 +360,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1( psLPC_Q14_hi_89ABCDEF = _mm_unpackhi_epi64( xmm_tempa, xmm_tempb ); psLPC_Q14_lo_89ABCDEF = _mm_unpacklo_epi64( xmm_tempa, xmm_tempb ); - xmm_tempa = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -8 ]) ); - xmm_tempb = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -4 ]) ); + xmm_tempa = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[ -8 ]) ); + xmm_tempb = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[ -4 ]) ); xmm_tempa = _mm_shuffle_epi8( xmm_tempa, xmm_one ); xmm_tempb = _mm_shuffle_epi8( xmm_tempb, xmm_one ); @@ -370,8 +370,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1( psLPC_Q14_lo_01234567 = _mm_unpacklo_epi64( xmm_tempa, xmm_tempb ); /* load sAR2_Q14 */ - xmm_tempa = _mm_loadu_si128( (__m128i *)(&(NSQ->sAR2_Q14[ 0 ]) ) ); - xmm_tempb = _mm_loadu_si128( (__m128i *)(&(NSQ->sAR2_Q14[ 4 ]) ) ); + xmm_tempa = _mm_loadu_si128( (__m128i *)(void*)(&(NSQ->sAR2_Q14[ 0 ]) ) ); + xmm_tempb = _mm_loadu_si128( (__m128i *)(void*)(&(NSQ->sAR2_Q14[ 4 ]) ) ); xmm_tempa = _mm_shuffle_epi8( xmm_tempa, xmm_one ); xmm_tempb = _mm_shuffle_epi8( xmm_tempb, xmm_one ); @@ -443,7 +443,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1( b_Q14_0123 = _mm_shuffle_epi32( b_Q14_3210, 0x1B ); /* loaded: [0] [-1] [-2] [-3] */ - pred_lag_ptr_0123 = _mm_loadu_si128( (__m128i *)(&pred_lag_ptr[ -3 ] ) ); + pred_lag_ptr_0123 = _mm_loadu_si128( (__m128i *)(void*)(&pred_lag_ptr[ -3 ] ) ); /* shuffle to [-3] [-2] [-1] [0] and to new xmm */ xmm_tempa = _mm_shuffle_epi32( pred_lag_ptr_0123, 0x1B ); /*64-bit multiply, a[2] * b[-2], a[0] * b[0] */ @@ -595,8 +595,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1( /* write back sAR2_Q14 */ xmm_tempa = _mm_unpackhi_epi16( sAR2_Q14_lo_76543210, sAR2_Q14_hi_76543210 ); xmm_tempb = _mm_unpacklo_epi16( sAR2_Q14_lo_76543210, sAR2_Q14_hi_76543210 ); - _mm_storeu_si128( (__m128i *)(&NSQ->sAR2_Q14[ 4 ]), xmm_tempa ); - _mm_storeu_si128( (__m128i *)(&NSQ->sAR2_Q14[ 0 ]), xmm_tempb ); + _mm_storeu_si128( (__m128i *)(void*)(&NSQ->sAR2_Q14[ 4 ]), xmm_tempa ); + _mm_storeu_si128( (__m128i *)(void*)(&NSQ->sAR2_Q14[ 0 ]), xmm_tempb ); /* xq[ i ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( psLPC_Q14[ i ], Gain_Q10 ), 8 ) ); */ { @@ -612,8 +612,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1( /* process xq */ for (i = 0; i < length - 7; i += 8) { - xmm_xq_Q14_3210 = _mm_loadu_si128( (__m128i *)(&(psLPC_Q14[ i + 0 ] ) ) ); - xmm_xq_Q14_7654 = _mm_loadu_si128( (__m128i *)(&(psLPC_Q14[ i + 4 ] ) ) ); + xmm_xq_Q14_3210 = _mm_loadu_si128( (__m128i *)(void*)(&(psLPC_Q14[ i + 0 ] ) ) ); + xmm_xq_Q14_7654 = _mm_loadu_si128( (__m128i *)(void*)(&(psLPC_Q14[ i + 4 ] ) ) ); /* equal shift right 4 bytes*/ xmm_xq_Q14_x3x1 = _mm_shuffle_epi32( xmm_xq_Q14_3210, _MM_SHUFFLE( 0, 3, 2, 1 ) ); @@ -644,7 +644,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1( xmm_xq_Q14_3210 = _mm_packs_epi32( xmm_xq_Q14_3210, xmm_xq_Q14_7654 ); /* save to xq */ - _mm_storeu_si128( (__m128i *)(&xq[ i ] ), xmm_xq_Q14_3210 ); + _mm_storeu_si128( (__m128i *)(void*)(&xq[ i ] ), xmm_xq_Q14_3210 ); } } for ( ; i < length; i++) @@ -698,7 +698,7 @@ static OPUS_INLINE void silk_nsq_scale_states_sse4_1( xmm_x16_x2x0 = _mm_blend_epi16( xmm_x16_x2x0, xmm_x16_x3x1, 0xCC ); - _mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ] ) ), xmm_x16_x2x0 ); + _mm_storeu_si128( (__m128i *)(void*)(&(x_sc_Q10[ i ] ) ), xmm_x16_x2x0 ); } for( ; i < psEncC->subfr_length; i++ ) { @@ -729,7 +729,7 @@ static OPUS_INLINE void silk_nsq_scale_states_sse4_1( for( i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx - 3; i += 4 ) { - xmm_sLTP_shp_Q14_x2x0 = _mm_loadu_si128( (__m128i *)(&(NSQ->sLTP_shp_Q14[ i ] ) ) ); + xmm_sLTP_shp_Q14_x2x0 = _mm_loadu_si128( (__m128i *)(void*)(&(NSQ->sLTP_shp_Q14[ i ] ) ) ); /* equal shift right 4 bytes*/ xmm_sLTP_shp_Q14_x3x1 = _mm_shuffle_epi32( xmm_sLTP_shp_Q14_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) ); @@ -741,7 +741,7 @@ static OPUS_INLINE void silk_nsq_scale_states_sse4_1( xmm_sLTP_shp_Q14_x2x0 = _mm_blend_epi16( xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1, 0xCC ); - _mm_storeu_si128( (__m128i *)(&(NSQ->sLTP_shp_Q14[ i ] ) ), xmm_sLTP_shp_Q14_x2x0 ); + _mm_storeu_si128( (__m128i *)(void*)(&(NSQ->sLTP_shp_Q14[ i ] ) ), xmm_sLTP_shp_Q14_x2x0 ); } for( ; i < NSQ->sLTP_shp_buf_idx; i++ ) { diff --git a/media/libopus/silk/x86/VAD_sse4_1.c b/media/libopus/silk/x86/VAD_sse4_1.c index e7eaf9714a..9e06bc79d0 100644 --- a/media/libopus/silk/x86/VAD_sse4_1.c +++ b/media/libopus/silk/x86/VAD_sse4_1.c @@ -144,7 +144,7 @@ opus_int silk_VAD_GetSA_Q8_sse4_1( /* O Return value, 0 if s for( i = 0; i < dec_subframe_length - 7; i += 8 ) { - xmm_X = _mm_loadu_si128( (__m128i *)&(X[ X_offset[ b ] + i + dec_subframe_offset ] ) ); + xmm_X = _mm_loadu_si128( (__m128i *)(void*)&(X[ X_offset[ b ] + i + dec_subframe_offset ] ) ); xmm_X = _mm_srai_epi16( xmm_X, 3 ); xmm_X = _mm_madd_epi16( xmm_X, xmm_X ); xmm_acc = _mm_add_epi32( xmm_acc, xmm_X ); diff --git a/media/libopus/silk/x86/VQ_WMat_EC_sse4_1.c b/media/libopus/silk/x86/VQ_WMat_EC_sse4_1.c index 2c7d18d05e..df4626b60a 100644 --- a/media/libopus/silk/x86/VQ_WMat_EC_sse4_1.c +++ b/media/libopus/silk/x86/VQ_WMat_EC_sse4_1.c @@ -65,7 +65,7 @@ void silk_VQ_WMat_EC_sse4_1( neg_xX_Q24[ 3 ] = -silk_LSHIFT32( xX_Q17[ 3 ], 7 ); neg_xX_Q24[ 4 ] = -silk_LSHIFT32( xX_Q17[ 4 ], 7 ); - v_XX_31_Q17 = _mm_loadu_si128( (__m128i *)(&XX_Q17[ 1 ] ) ); + v_XX_31_Q17 = _mm_loadu_si128( (__m128i *)(void*)(&XX_Q17[ 1 ] ) ); v_XX_42_Q17 = _mm_shuffle_epi32( v_XX_31_Q17, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* Loop over codebook */ diff --git a/media/libopus/silk/x86/main_sse.h b/media/libopus/silk/x86/main_sse.h index a01d7f6c75..b254d53e7a 100644 --- a/media/libopus/silk/x86/main_sse.h +++ b/media/libopus/silk/x86/main_sse.h @@ -88,7 +88,7 @@ void silk_NSQ_sse4_1( SideInfoIndices *psIndices, /* I/O Quantization Indices */ const opus_int16 x16[], /* I Input */ opus_int8 pulses[], /* O Quantized pulse signal */ - const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ + const opus_int16 *PredCoef_Q12, /* I Short term prediction coefs */ const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ @@ -116,7 +116,7 @@ extern void (*const SILK_NSQ_IMPL[OPUS_ARCHMASK + 1])( SideInfoIndices *psIndices, /* I/O Quantization Indices */ const opus_int16 x16[], /* I Input */ opus_int8 pulses[], /* O Quantized pulse signal */ - const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ + const opus_int16 *PredCoef_Q12, /* I Short term prediction coefs */ const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ @@ -142,7 +142,7 @@ void silk_NSQ_del_dec_sse4_1( SideInfoIndices *psIndices, /* I/O Quantization Indices */ const opus_int16 x16[], /* I Input */ opus_int8 pulses[], /* O Quantized pulse signal */ - const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ + const opus_int16 *PredCoef_Q12, /* I Short term prediction coefs */ const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ @@ -154,7 +154,33 @@ void silk_NSQ_del_dec_sse4_1( const opus_int LTP_scale_Q14 /* I LTP state scaling */ ); -# if defined OPUS_X86_PRESUME_SSE4_1 +void silk_NSQ_del_dec_avx2( + const silk_encoder_state *psEncC, /* I Encoder State */ + silk_nsq_state *NSQ, /* I/O NSQ state */ + SideInfoIndices *psIndices, /* I/O Quantization Indices */ + const opus_int16 x16[], /* I Input */ + opus_int8 pulses[], /* O Quantized pulse signal */ + const opus_int16 *PredCoef_Q12, /* I Short term prediction coefs */ + const opus_int16 LTPCoef_Q14[LTP_ORDER * MAX_NB_SUBFR], /* I Long term prediction coefs */ + const opus_int16 AR_Q13[MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER], /* I Noise shaping coefs */ + const opus_int HarmShapeGain_Q14[MAX_NB_SUBFR], /* I Long term shaping coefs */ + const opus_int Tilt_Q14[MAX_NB_SUBFR], /* I Spectral tilt */ + const opus_int32 LF_shp_Q14[MAX_NB_SUBFR], /* I Low frequency shaping coefs */ + const opus_int32 Gains_Q16[MAX_NB_SUBFR], /* I Quantization step sizes */ + const opus_int32 pitchL[MAX_NB_SUBFR], /* I Pitch lags */ + const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ + const opus_int LTP_scale_Q14 /* I LTP state scaling */ +); + +# if defined (OPUS_X86_PRESUME_AVX2) + +# define OVERRIDE_silk_NSQ_del_dec +# define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \ + HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \ + ((void)(arch),silk_NSQ_del_dec_avx2(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \ + HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14)) + +# elif defined (OPUS_X86_PRESUME_SSE4_1) && !defined(OPUS_X86_MAY_HAVE_AVX2) # define OVERRIDE_silk_NSQ_del_dec # define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \ @@ -170,7 +196,7 @@ extern void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])( SideInfoIndices *psIndices, /* I/O Quantization Indices */ const opus_int16 x16[], /* I Input */ opus_int8 pulses[], /* O Quantized pulse signal */ - const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ + const opus_int16 *PredCoef_Q12, /* I Short term prediction coefs */ const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ @@ -243,5 +269,31 @@ extern opus_int (*const SILK_VAD_GETSA_Q8_IMPL[OPUS_ARCHMASK + 1])( # endif +#ifndef FIXED_POINT +double silk_inner_product_FLP_avx2( + const silk_float *data1, + const silk_float *data2, + opus_int dataSize +); + +#if defined (OPUS_X86_PRESUME_AVX2) + +#define OVERRIDE_inner_product_FLP +#define silk_inner_product_FLP(data1, data2, dataSize, arch) ((void)arch,silk_inner_product_FLP_avx2(data1, data2, dataSize)) + +#elif defined(OPUS_HAVE_RTCD) && defined(OPUS_X86_MAY_HAVE_AVX2) + +#define OVERRIDE_inner_product_FLP +extern double (*const SILK_INNER_PRODUCT_FLP_IMPL[OPUS_ARCHMASK + 1])( + const silk_float *data1, + const silk_float *data2, + opus_int dataSize +); + +#define silk_inner_product_FLP(data1, data2, dataSize, arch) ((void)arch,(*SILK_INNER_PRODUCT_FLP_IMPL[(arch) & OPUS_ARCHMASK])(data1, data2, dataSize)) + +#endif +#endif + # endif #endif diff --git a/media/libopus/silk/x86/x86_silk_map.c b/media/libopus/silk/x86/x86_silk_map.c index 70f60078cf..39ad75276c 100644 --- a/media/libopus/silk/x86/x86_silk_map.c +++ b/media/libopus/silk/x86/x86_silk_map.c @@ -32,10 +32,13 @@ #include "celt/x86/x86cpu.h" #include "structs.h" #include "SigProc_FIX.h" +#ifndef FIXED_POINT +#include "SigProc_FLP.h" +#endif #include "pitch.h" #include "main.h" -#if defined(OPUS_HAVE_RTCD) && !defined(OPUS_X86_PRESUME_SSE4_1) +#if defined(OPUS_HAVE_RTCD) && !defined(OPUS_X86_PRESUME_AVX2) #if defined(FIXED_POINT) @@ -72,7 +75,7 @@ void (*const SILK_NSQ_IMPL[ OPUS_ARCHMASK + 1 ] )( SideInfoIndices *psIndices, /* I/O Quantization Indices */ const opus_int16 x16[], /* I Input */ opus_int8 pulses[], /* O Quantized pulse signal */ - const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ + const opus_int16 *PredCoef_Q12, /* I Short term prediction coefs */ const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ @@ -117,7 +120,7 @@ void (*const SILK_NSQ_DEL_DEC_IMPL[ OPUS_ARCHMASK + 1 ] )( SideInfoIndices *psIndices, /* I/O Quantization Indices */ const opus_int16 x16[], /* I Input */ opus_int8 pulses[], /* O Quantized pulse signal */ - const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ + const opus_int16 *PredCoef_Q12, /* I Short term prediction coefs */ const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ @@ -132,7 +135,7 @@ void (*const SILK_NSQ_DEL_DEC_IMPL[ OPUS_ARCHMASK + 1 ] )( silk_NSQ_del_dec_c, silk_NSQ_del_dec_c, MAY_HAVE_SSE4_1( silk_NSQ_del_dec ), /* sse4.1 */ - MAY_HAVE_SSE4_1( silk_NSQ_del_dec ) /* avx */ + MAY_HAVE_AVX2( silk_NSQ_del_dec ) /* avx */ }; #if defined(FIXED_POINT) @@ -156,4 +159,21 @@ void (*const SILK_BURG_MODIFIED_IMPL[ OPUS_ARCHMASK + 1 ] )( }; #endif + +#ifndef FIXED_POINT + +double (*const SILK_INNER_PRODUCT_FLP_IMPL[ OPUS_ARCHMASK + 1 ] )( + const silk_float *data1, + const silk_float *data2, + opus_int dataSize +) = { + silk_inner_product_FLP_c, /* non-sse */ + silk_inner_product_FLP_c, + silk_inner_product_FLP_c, + silk_inner_product_FLP_c, /* sse4.1 */ + MAY_HAVE_AVX2( silk_inner_product_FLP ) /* avx */ +}; + +#endif + #endif |