From fbaf0bb26397aa498eb9156f06d5a6fe34dd7dd8 Mon Sep 17 00:00:00 2001
From: Daniel Baumann <daniel.baumann@progress-linux.org>
Date: Fri, 19 Apr 2024 03:14:29 +0200
Subject: Merging upstream version 125.0.1.

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
---
 media/libopus/silk/API.h                           |   23 +-
 media/libopus/silk/NSQ.c                           |    2 +-
 media/libopus/silk/NSQ_del_dec.c                   |    2 +-
 media/libopus/silk/PLC.c                           |   61 +-
 media/libopus/silk/PLC.h                           |    3 +
 media/libopus/silk/arm/NSQ_del_dec_arm.h           |    4 +-
 media/libopus/silk/arm/NSQ_del_dec_neon_intr.c     |   28 +-
 media/libopus/silk/arm/NSQ_neon.h                  |    4 +-
 media/libopus/silk/arm/arm_silk_map.c              |    7 +-
 media/libopus/silk/control.h                       |   11 +
 media/libopus/silk/dec_API.c                       |   69 +-
 media/libopus/silk/decode_frame.c                  |   60 +-
 media/libopus/silk/enc_API.c                       |    4 +
 media/libopus/silk/fixed/encode_frame_FIX.c        |   18 +-
 media/libopus/silk/float/SigProc_FLP.h             |   14 +-
 media/libopus/silk/float/autocorrelation_FLP.c     |    5 +-
 media/libopus/silk/float/burg_modified_FLP.c       |    5 +-
 media/libopus/silk/float/corrMatrix_FLP.c          |   10 +-
 media/libopus/silk/float/encode_frame_FLP.c        |   17 +-
 media/libopus/silk/float/find_LPC_FLP.c            |    7 +-
 media/libopus/silk/float/find_LTP_FLP.c            |    7 +-
 media/libopus/silk/float/find_pitch_lags_FLP.c     |    2 +-
 media/libopus/silk/float/find_pred_coefs_FLP.c     |    4 +-
 media/libopus/silk/float/inner_product_FLP.c       |    2 +-
 media/libopus/silk/float/main_FLP.h                |   12 +-
 .../libopus/silk/float/noise_shape_analysis_FLP.c  |    2 +-
 media/libopus/silk/float/pitch_analysis_core_FLP.c |    2 +-
 .../silk/float/warped_autocorrelation_FLP.c        |    6 +-
 .../silk/float/x86/inner_product_FLP_avx2.c        |   85 ++
 media/libopus/silk/init_decoder.c                  |   33 +-
 media/libopus/silk/init_encoder.c                  |    4 +
 media/libopus/silk/main.h                          |   14 +-
 media/libopus/silk/mips/NSQ_del_dec_mipsr1.h       |    6 +-
 media/libopus/silk/mips/macros_mipsr1.h            |    6 +-
 media/libopus/silk/structs.h                       |   28 +
 media/libopus/silk/x86/NSQ_del_dec_avx2.c          | 1075 ++++++++++++++++++++
 media/libopus/silk/x86/NSQ_del_dec_sse4_1.c        |   18 +-
 media/libopus/silk/x86/NSQ_sse4_1.c                |   38 +-
 media/libopus/silk/x86/VAD_sse4_1.c                |    2 +-
 media/libopus/silk/x86/VQ_WMat_EC_sse4_1.c         |    2 +-
 media/libopus/silk/x86/main_sse.h                  |   62 +-
 media/libopus/silk/x86/x86_silk_map.c              |   28 +-
 42 files changed, 1659 insertions(+), 133 deletions(-)
 create mode 100644 media/libopus/silk/float/x86/inner_product_FLP_avx2.c
 create mode 100644 media/libopus/silk/x86/NSQ_del_dec_avx2.c

(limited to 'media/libopus/silk')

diff --git a/media/libopus/silk/API.h b/media/libopus/silk/API.h
index 4d90ff9aa3..878965c73a 100644
--- a/media/libopus/silk/API.h
+++ b/media/libopus/silk/API.h
@@ -34,6 +34,10 @@ POSSIBILITY OF SUCH DAMAGE.
 #include "entenc.h"
 #include "entdec.h"
 
+#ifdef ENABLE_DEEP_PLC
+#include "lpcnet_private.h"
+#endif
+
 #ifdef __cplusplus
 extern "C"
 {
@@ -88,6 +92,16 @@ opus_int silk_Encode(                                   /* O    Returns error co
 /* Decoder functions                    */
 /****************************************/
 
+
+/***********************************************/
+/* Load OSCE models from external data pointer */
+/***********************************************/
+opus_int silk_LoadOSCEModels(
+    void *decState,                                     /* O    I/O State                                       */
+    const unsigned char *data,                          /* I    pointer to binary blob                          */
+    int len                                             /* I    length of binary blob data                      */
+);
+
 /***********************************************/
 /* Get size in bytes of the Silk decoder state */
 /***********************************************/
@@ -96,8 +110,12 @@ opus_int silk_Get_Decoder_Size(                         /* O    Returns error co
 );
 
 /*************************/
-/* Init or Reset decoder */
+/* Init and Reset decoder */
 /*************************/
+opus_int silk_ResetDecoder(                              /* O    Returns error code                              */
+    void                            *decState            /* I/O  State                                           */
+);
+
 opus_int silk_InitDecoder(                              /* O    Returns error code                              */
     void                            *decState           /* I/O  State                                           */
 );
@@ -113,6 +131,9 @@ opus_int silk_Decode(                                   /* O    Returns error co
     ec_dec                          *psRangeDec,        /* I/O  Compressor data structure                       */
     opus_int16                      *samplesOut,        /* O    Decoded output speech vector                    */
     opus_int32                      *nSamplesOut,       /* O    Number of samples decoded                       */
+#ifdef ENABLE_DEEP_PLC
+    LPCNetPLCState                  *lpcnet,
+#endif
     int                             arch                /* I    Run-time architecture                           */
 );
 
diff --git a/media/libopus/silk/NSQ.c b/media/libopus/silk/NSQ.c
index c99ec5bce5..1caa829bbe 100644
--- a/media/libopus/silk/NSQ.c
+++ b/media/libopus/silk/NSQ.c
@@ -80,7 +80,7 @@ void silk_NSQ_c
     SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */
     const opus_int16            x16[],                                        /* I    Input                           */
     opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */
-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],            /* I    Short term prediction coefs     */
+    const opus_int16            *PredCoef_Q12,                                /* I    Short term prediction coefs     */
     const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */
     const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */
     const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */
diff --git a/media/libopus/silk/NSQ_del_dec.c b/media/libopus/silk/NSQ_del_dec.c
index 77f72cec3a..e8dadf1591 100644
--- a/media/libopus/silk/NSQ_del_dec.c
+++ b/media/libopus/silk/NSQ_del_dec.c
@@ -120,7 +120,7 @@ void silk_NSQ_del_dec_c(
     SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */
     const opus_int16            x16[],                                        /* I    Input                           */
     opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */
-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],            /* I    Short term prediction coefs     */
+    const opus_int16            *PredCoef_Q12,                                /* I    Short term prediction coefs     */
     const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */
     const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */
     const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */
diff --git a/media/libopus/silk/PLC.c b/media/libopus/silk/PLC.c
index 4667440db2..b35bf750a0 100644
--- a/media/libopus/silk/PLC.c
+++ b/media/libopus/silk/PLC.c
@@ -33,6 +33,10 @@ POSSIBILITY OF SUCH DAMAGE.
 #include "stack_alloc.h"
 #include "PLC.h"
 
+#ifdef ENABLE_DEEP_PLC
+#include "lpcnet.h"
+#endif
+
 #define NB_ATT 2
 static const opus_int16 HARM_ATT_Q15[NB_ATT]              = { 32440, 31130 }; /* 0.99, 0.95 */
 static const opus_int16 PLC_RAND_ATTENUATE_V_Q15[NB_ATT]  = { 31130, 26214 }; /* 0.95, 0.8 */
@@ -47,6 +51,9 @@ static OPUS_INLINE void silk_PLC_conceal(
     silk_decoder_state                  *psDec,             /* I/O Decoder state        */
     silk_decoder_control                *psDecCtrl,         /* I/O Decoder control      */
     opus_int16                          frame[],            /* O LPC residual signal    */
+#ifdef ENABLE_DEEP_PLC
+    LPCNetPLCState                      *lpcnet,
+#endif
     int                                 arch                /* I  Run-time architecture */
 );
 
@@ -67,6 +74,9 @@ void silk_PLC(
     silk_decoder_control                *psDecCtrl,         /* I/O Decoder control      */
     opus_int16                          frame[],            /* I/O  signal              */
     opus_int                            lost,               /* I Loss flag              */
+#ifdef ENABLE_DEEP_PLC
+    LPCNetPLCState                      *lpcnet,
+#endif
     int                                 arch                /* I Run-time architecture  */
 )
 {
@@ -80,7 +90,11 @@ void silk_PLC(
         /****************************/
         /* Generate Signal          */
         /****************************/
-        silk_PLC_conceal( psDec, psDecCtrl, frame, arch );
+        silk_PLC_conceal( psDec, psDecCtrl, frame,
+#ifdef ENABLE_DEEP_PLC
+            lpcnet,
+#endif
+            arch );
 
         psDec->lossCnt++;
     } else {
@@ -88,6 +102,14 @@ void silk_PLC(
         /* Update state             */
         /****************************/
         silk_PLC_update( psDec, psDecCtrl );
+#ifdef ENABLE_DEEP_PLC
+        if ( lpcnet != NULL && psDec->sPLC.fs_kHz == 16 ) {
+            int k;
+            for( k = 0; k < psDec->nb_subfr; k += 2 ) {
+                lpcnet_plc_update( lpcnet, frame + k * psDec->subfr_length );
+            }
+        }
+#endif
     }
 }
 
@@ -195,6 +217,9 @@ static OPUS_INLINE void silk_PLC_conceal(
     silk_decoder_state                  *psDec,             /* I/O Decoder state        */
     silk_decoder_control                *psDecCtrl,         /* I/O Decoder control      */
     opus_int16                          frame[],            /* O LPC residual signal    */
+#ifdef ENABLE_DEEP_PLC
+    LPCNetPLCState                      *lpcnet,
+#endif
     int                                 arch                /* I Run-time architecture  */
 )
 {
@@ -371,6 +396,24 @@ static OPUS_INLINE void silk_PLC_conceal(
         /* Scale with Gain */
         frame[ i ] = (opus_int16)silk_SAT16( silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( sLPC_Q14_ptr[ MAX_LPC_ORDER + i ], prevGain_Q10[ 1 ] ), 8 ) ) );
     }
+#ifdef ENABLE_DEEP_PLC
+    if ( lpcnet != NULL && lpcnet->loaded && psDec->sPLC.fs_kHz == 16 ) {
+        int run_deep_plc = psDec->sPLC.enable_deep_plc || lpcnet->fec_fill_pos != 0;
+        if( run_deep_plc ) {
+            for( k = 0; k < psDec->nb_subfr; k += 2 ) {
+                lpcnet_plc_conceal( lpcnet, frame + k * psDec->subfr_length );
+            }
+            /* We *should* be able to copy only from psDec->frame_length-MAX_LPC_ORDER, i.e. the last MAX_LPC_ORDER samples. */
+            for( i = 0; i < psDec->frame_length; i++ ) {
+                sLPC_Q14_ptr[ MAX_LPC_ORDER + i ] = (int)floor(.5 + frame[ i ] * (float)(1 << 24) / prevGain_Q10[ 1 ] );
+            }
+        } else {
+          for( k = 0; k < psDec->nb_subfr; k += 2 ) {
+              lpcnet_plc_update( lpcnet, frame + k * psDec->subfr_length );
+          }
+        }
+    }
+#endif
 
     /* Save LPC state */
     silk_memcpy( psDec->sLPC_Q14_buf, &sLPC_Q14_ptr[ psDec->frame_length ], MAX_LPC_ORDER * sizeof( opus_int32 ) );
@@ -431,12 +474,16 @@ void silk_PLC_glue_frames(
                 slope_Q16 = silk_DIV32_16( ( (opus_int32)1 << 16 ) - gain_Q16, length );
                 /* Make slope 4x steeper to avoid missing onsets after DTX */
                 slope_Q16 = silk_LSHIFT( slope_Q16, 2 );
-
-                for( i = 0; i < length; i++ ) {
-                    frame[ i ] = silk_SMULWB( gain_Q16, frame[ i ] );
-                    gain_Q16 += slope_Q16;
-                    if( gain_Q16 > (opus_int32)1 << 16 ) {
-                        break;
+#ifdef ENABLE_DEEP_PLC
+                if ( psDec->sPLC.fs_kHz != 16 )
+#endif
+                {
+                    for( i = 0; i < length; i++ ) {
+                        frame[ i ] = silk_SMULWB( gain_Q16, frame[ i ] );
+                        gain_Q16 += slope_Q16;
+                        if( gain_Q16 > (opus_int32)1 << 16 ) {
+                            break;
+                        }
                     }
                 }
             }
diff --git a/media/libopus/silk/PLC.h b/media/libopus/silk/PLC.h
index 6438f51633..1bebb78638 100644
--- a/media/libopus/silk/PLC.h
+++ b/media/libopus/silk/PLC.h
@@ -49,6 +49,9 @@ void silk_PLC(
     silk_decoder_control                *psDecCtrl,         /* I/O Decoder control      */
     opus_int16                          frame[],            /* I/O  signal              */
     opus_int                            lost,               /* I Loss flag              */
+#ifdef ENABLE_DEEP_PLC
+    LPCNetPLCState                      *lpcnet,
+#endif
     int                                 arch                /* I Run-time architecture  */
 );
 
diff --git a/media/libopus/silk/arm/NSQ_del_dec_arm.h b/media/libopus/silk/arm/NSQ_del_dec_arm.h
index 9e76e16927..0c4fcfccb4 100644
--- a/media/libopus/silk/arm/NSQ_del_dec_arm.h
+++ b/media/libopus/silk/arm/NSQ_del_dec_arm.h
@@ -34,7 +34,7 @@ POSSIBILITY OF SUCH DAMAGE.
 void silk_NSQ_del_dec_neon(
     const silk_encoder_state *psEncC, silk_nsq_state *NSQ,
     SideInfoIndices *psIndices, const opus_int16 x16[], opus_int8 pulses[],
-    const opus_int16 PredCoef_Q12[2 * MAX_LPC_ORDER],
+    const opus_int16 *PredCoef_Q12,
     const opus_int16 LTPCoef_Q14[LTP_ORDER * MAX_NB_SUBFR],
     const opus_int16 AR_Q13[MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER],
     const opus_int HarmShapeGain_Q14[MAX_NB_SUBFR],
@@ -65,7 +65,7 @@ void silk_NSQ_del_dec_neon(
 extern void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])(
     const silk_encoder_state *psEncC, silk_nsq_state *NSQ,
     SideInfoIndices *psIndices, const opus_int16 x16[], opus_int8 pulses[],
-    const opus_int16 PredCoef_Q12[2 * MAX_LPC_ORDER],
+    const opus_int16 *PredCoef_Q12,
     const opus_int16 LTPCoef_Q14[LTP_ORDER * MAX_NB_SUBFR],
     const opus_int16 AR_Q13[MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER],
     const opus_int HarmShapeGain_Q14[MAX_NB_SUBFR],
diff --git a/media/libopus/silk/arm/NSQ_del_dec_neon_intr.c b/media/libopus/silk/arm/NSQ_del_dec_neon_intr.c
index 212410f362..668dde6dc4 100644
--- a/media/libopus/silk/arm/NSQ_del_dec_neon_intr.c
+++ b/media/libopus/silk/arm/NSQ_del_dec_neon_intr.c
@@ -35,6 +35,7 @@ POSSIBILITY OF SUCH DAMAGE.
 #endif
 #include "main.h"
 #include "stack_alloc.h"
+#include "os_support.h"
 
 /* NEON intrinsics optimization now can only parallelize up to 4 delay decision states.    */
 /* If there are more states, C function is called, and this optimization must be expanded. */
@@ -220,7 +221,7 @@ void silk_NSQ_del_dec_neon(
     SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
     const opus_int16            x16[],                                      /* I    Input                           */
     opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
+    const opus_int16            *PredCoef_Q12,                              /* I    Short term prediction coefs     */
     const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
     const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs              */
     const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
@@ -279,6 +280,7 @@ void silk_NSQ_del_dec_neon(
 
         /* Initialize delayed decision states */
         ALLOC( psDelDec, 1, NSQ_del_decs_struct );
+        OPUS_CLEAR(psDelDec, 1);
         /* Only RandState and RD_Q10 need to be initialized to 0. */
         silk_memset( psDelDec->RandState, 0, sizeof( psDelDec->RandState ) );
         vst1q_s32( psDelDec->RD_Q10, vdupq_n_s32( 0 ) );
@@ -587,6 +589,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_neon(
     silk_assert( nStatesDelayedDecision > 0 );
     silk_assert( ( shapingLPCOrder & 1 ) == 0 );   /* check that order is even */
     ALLOC( psSampleState, 2, NSQ_samples_struct );
+    OPUS_CLEAR(psSampleState, 2);
 
     shp_lag_ptr  = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];
     pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
@@ -711,23 +714,26 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_neon(
                 const int rdo_offset = Lambda_Q10/2 - 512;
                 const uint16x4_t greaterThanRdo = vcgt_s16( q1_Q10_s16x4, vdup_n_s16( rdo_offset ) );
                 const uint16x4_t lessThanMinusRdo = vclt_s16( q1_Q10_s16x4, vdup_n_s16( -rdo_offset ) );
+                int16x4_t signed_offset = vbsl_s16( greaterThanRdo, vdup_n_s16( -rdo_offset ), vdup_n_s16( 0 ) );
+                signed_offset = vbsl_s16( lessThanMinusRdo, vdup_n_s16( rdo_offset ), signed_offset );
                 /* If Lambda_Q10 > 32767, then q1_Q0, q1_Q10 and q2_Q10 must change to 32-bit. */
                 silk_assert( Lambda_Q10 <= 32767 );
 
                 q1_Q0_s16x4 = vreinterpret_s16_u16( vclt_s16( q1_Q10_s16x4, vdup_n_s16( 0 ) ) );
-                q1_Q0_s16x4 = vbsl_s16( greaterThanRdo, vsub_s16( q1_Q10_s16x4, vdup_n_s16( rdo_offset ) ), q1_Q0_s16x4 );
-                q1_Q0_s16x4 = vbsl_s16( lessThanMinusRdo, vadd_s16( q1_Q10_s16x4, vdup_n_s16( rdo_offset ) ), q1_Q0_s16x4 );
+                q1_Q0_s16x4 = vbsl_s16(vorr_u16(greaterThanRdo, lessThanMinusRdo), vadd_s16( q1_Q10_s16x4 , signed_offset), q1_Q0_s16x4);
                 q1_Q0_s16x4 = vshr_n_s16( q1_Q0_s16x4, 10 );
             }
             {
                 const uint16x4_t equal0_u16x4 = vceq_s16( q1_Q0_s16x4, vdup_n_s16( 0 ) );
                 const uint16x4_t equalMinus1_u16x4 = vceq_s16( q1_Q0_s16x4, vdup_n_s16( -1 ) );
                 const uint16x4_t lessThanMinus1_u16x4 = vclt_s16( q1_Q0_s16x4, vdup_n_s16( -1 ) );
-                int16x4_t tmp1_s16x4, tmp2_s16x4;
+                int16x4_t tmp1_s16x4, tmp2_s16x4, tmp_summand_s16x4;
 
                 q1_Q10_s16x4 = vshl_n_s16( q1_Q0_s16x4, 10 );
-                tmp1_s16x4 = vadd_s16( q1_Q10_s16x4, vdup_n_s16( offset_Q10 - QUANT_LEVEL_ADJUST_Q10 ) );
-                q1_Q10_s16x4 = vadd_s16( q1_Q10_s16x4, vdup_n_s16( offset_Q10 + QUANT_LEVEL_ADJUST_Q10 ) );
+                tmp_summand_s16x4 = vand_s16( vreinterpret_s16_u16(vcge_s16(q1_Q0_s16x4, vdup_n_s16(0))), vdup_n_s16( offset_Q10 - QUANT_LEVEL_ADJUST_Q10 ) );
+                tmp1_s16x4 = vadd_s16( q1_Q10_s16x4, tmp_summand_s16x4 );
+                tmp_summand_s16x4 = vbsl_s16( lessThanMinus1_u16x4, vdup_n_s16( offset_Q10 + QUANT_LEVEL_ADJUST_Q10 ), vdup_n_s16(0) );
+                q1_Q10_s16x4 = vadd_s16( q1_Q10_s16x4,  tmp_summand_s16x4);
                 q1_Q10_s16x4 = vbsl_s16( lessThanMinus1_u16x4, q1_Q10_s16x4, tmp1_s16x4 );
                 q1_Q10_s16x4 = vbsl_s16( equal0_u16x4, vdup_n_s16( offset_Q10 ), q1_Q10_s16x4 );
                 q1_Q10_s16x4 = vbsl_s16( equalMinus1_u16x4, vdup_n_s16( offset_Q10 - ( 1024 - QUANT_LEVEL_ADJUST_Q10 ) ), q1_Q10_s16x4 );
@@ -818,6 +824,13 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_neon(
             }
         }
 
+        /* clear unused part of RD_Q10 to avoid overflows */
+        if( nStatesDelayedDecision < NEON_MAX_DEL_DEC_STATES )
+        {
+            OPUS_CLEAR(psSampleState[0].RD_Q10 + nStatesDelayedDecision, NEON_MAX_DEL_DEC_STATES - nStatesDelayedDecision);
+            OPUS_CLEAR(psSampleState[1].RD_Q10 + nStatesDelayedDecision, NEON_MAX_DEL_DEC_STATES - nStatesDelayedDecision);
+        }
+
         /* Increase RD values of expired states */
         {
             uint32x4_t t_u32x4;
@@ -896,7 +909,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_neon(
         vst1q_s32( psDelDec->Pred_Q15[ *smpl_buf_idx ], vshlq_n_s32( vld1q_s32( psSampleState[ 0 ].LPC_exc_Q14 ), 1 ) );
         vst1q_s32( psDelDec->Shape_Q14[ *smpl_buf_idx ], vld1q_s32( psSampleState[ 0 ].sLTP_shp_Q14 ) );
         tmp1_s32x4 = vrshrq_n_s32( tmp1_s32x4, 10 );
-        tmp1_s32x4 = vaddq_s32( vld1q_s32( psDelDec->Seed ), tmp1_s32x4 );
+        tmp1_s32x4 = vreinterpretq_s32_u32( vaddq_u32( vreinterpretq_u32_s32(
+            vld1q_s32( psDelDec->Seed ) ), vreinterpretq_u32_s32( tmp1_s32x4 ) ) );
         vst1q_s32( psDelDec->Seed, tmp1_s32x4 );
         vst1q_s32( psDelDec->RandState[ *smpl_buf_idx ], tmp1_s32x4 );
         vst1q_s32( psDelDec->RD_Q10, vld1q_s32( psSampleState[ 0 ].RD_Q10 ) );
diff --git a/media/libopus/silk/arm/NSQ_neon.h b/media/libopus/silk/arm/NSQ_neon.h
index b31d9442d6..f03d8ddd98 100644
--- a/media/libopus/silk/arm/NSQ_neon.h
+++ b/media/libopus/silk/arm/NSQ_neon.h
@@ -73,7 +73,7 @@ static OPUS_INLINE void silk_short_prediction_create_arch_coef_neon(opus_int32 *
 #elif defined(OPUS_HAVE_RTCD) && defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
 
 #define silk_short_prediction_create_arch_coef(out, in, order) \
-    do { if (arch == OPUS_ARCH_ARM_NEON) { silk_short_prediction_create_arch_coef_neon(out, in, order); } } while (0)
+    do { if (arch >= OPUS_ARCH_ARM_NEON) { silk_short_prediction_create_arch_coef_neon(out, in, order); } } while (0)
 
 #endif
 
@@ -95,7 +95,7 @@ opus_int32 silk_NSQ_noise_shape_feedback_loop_neon(const opus_int32 *data0, opus
    (coef vs. coefRev) so can't use the usual IMPL table implementation */
 #undef silk_noise_shape_quantizer_short_prediction
 #define silk_noise_shape_quantizer_short_prediction(in, coef, coefRev, order, arch)  \
-    (arch == OPUS_ARCH_ARM_NEON ? \
+    (arch >= OPUS_ARCH_ARM_NEON ? \
         silk_noise_shape_quantizer_short_prediction_neon(in, coefRev, order) : \
         silk_noise_shape_quantizer_short_prediction_c(in, coef, order))
 
diff --git a/media/libopus/silk/arm/arm_silk_map.c b/media/libopus/silk/arm/arm_silk_map.c
index 0b9bfec2ca..a91f79b59f 100644
--- a/media/libopus/silk/arm/arm_silk_map.c
+++ b/media/libopus/silk/arm/arm_silk_map.c
@@ -49,6 +49,7 @@ void (*const SILK_BIQUAD_ALT_STRIDE2_IMPL[OPUS_ARCHMASK + 1])(
       silk_biquad_alt_stride2_c,    /* EDSP */
       silk_biquad_alt_stride2_c,    /* Media */
       silk_biquad_alt_stride2_neon, /* Neon */
+      silk_biquad_alt_stride2_neon, /* dotprod */
 };
 
 opus_int32 (*const SILK_LPC_INVERSE_PRED_GAIN_IMPL[OPUS_ARCHMASK + 1])( /* O   Returns inverse prediction gain in energy domain, Q30        */
@@ -59,6 +60,7 @@ opus_int32 (*const SILK_LPC_INVERSE_PRED_GAIN_IMPL[OPUS_ARCHMASK + 1])( /* O   R
       silk_LPC_inverse_pred_gain_c,    /* EDSP */
       silk_LPC_inverse_pred_gain_c,    /* Media */
       silk_LPC_inverse_pred_gain_neon, /* Neon */
+      silk_LPC_inverse_pred_gain_neon, /* dotprod */
 };
 
 void  (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])(
@@ -67,7 +69,7 @@ void  (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])(
         SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
         const opus_int16            x16[],                                      /* I    Input                           */
         opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
-        const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
+        const opus_int16            *PredCoef_Q12,                              /* I    Short term prediction coefs     */
         const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
         const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs              */
         const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
@@ -82,6 +84,7 @@ void  (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])(
       silk_NSQ_del_dec_c,    /* EDSP */
       silk_NSQ_del_dec_c,    /* Media */
       silk_NSQ_del_dec_neon, /* Neon */
+      silk_NSQ_del_dec_neon, /* dotprod */
 };
 
 /*There is no table for silk_noise_shape_quantizer_short_prediction because the
@@ -97,6 +100,7 @@ opus_int32
   silk_NSQ_noise_shape_feedback_loop_c,    /* EDSP */
   silk_NSQ_noise_shape_feedback_loop_c,    /* Media */
   silk_NSQ_noise_shape_feedback_loop_neon, /* NEON */
+  silk_NSQ_noise_shape_feedback_loop_neon, /* dotprod */
 };
 
 # endif
@@ -116,6 +120,7 @@ void (*const SILK_WARPED_AUTOCORRELATION_FIX_IMPL[OPUS_ARCHMASK + 1])(
       silk_warped_autocorrelation_FIX_c,    /* EDSP */
       silk_warped_autocorrelation_FIX_c,    /* Media */
       silk_warped_autocorrelation_FIX_neon, /* Neon */
+      silk_warped_autocorrelation_FIX_neon, /* dotprod */
 };
 
 # endif
diff --git a/media/libopus/silk/control.h b/media/libopus/silk/control.h
index b76ec33cd6..f5633e624e 100644
--- a/media/libopus/silk/control.h
+++ b/media/libopus/silk/control.h
@@ -77,6 +77,9 @@ typedef struct {
     /* I:   Flag to enable in-band Forward Error Correction (FEC); 0/1                      */
     opus_int useInBandFEC;
 
+    /* I:   Flag to enable in-band Deep REDundancy (DRED); 0/1                              */
+    opus_int useDRED;
+
     /* I:   Flag to actually code in-band Forward Error Correction (FEC) in the current packet; 0/1 */
     opus_int LBRR_coded;
 
@@ -141,6 +144,14 @@ typedef struct {
 
     /* O:   Pitch lag of previous frame (0 if unvoiced), measured in samples at 48 kHz      */
     opus_int prevPitchLag;
+
+    /* I:   Enable Deep PLC                                                                 */
+    opus_int enable_deep_plc;
+
+#ifdef ENABLE_OSCE
+    /* I: OSCE method */
+    opus_int osce_method;
+#endif
 } silk_DecControlStruct;
 
 #ifdef __cplusplus
diff --git a/media/libopus/silk/dec_API.c b/media/libopus/silk/dec_API.c
index 7d5ca7fb9f..c1091d13ed 100644
--- a/media/libopus/silk/dec_API.c
+++ b/media/libopus/silk/dec_API.c
@@ -33,6 +33,11 @@ POSSIBILITY OF SUCH DAMAGE.
 #include "stack_alloc.h"
 #include "os_support.h"
 
+#ifdef ENABLE_OSCE
+#include "osce.h"
+#include "osce_structs.h"
+#endif
+
 /************************/
 /* Decoder Super Struct */
 /************************/
@@ -42,12 +47,33 @@ typedef struct {
     opus_int                         nChannelsAPI;
     opus_int                         nChannelsInternal;
     opus_int                         prev_decode_only_middle;
+#ifdef ENABLE_OSCE
+    OSCEModel                        osce_model;
+#endif
 } silk_decoder;
 
 /*********************/
 /* Decoder functions */
 /*********************/
 
+
+
+opus_int silk_LoadOSCEModels(void *decState, const unsigned char *data, int len)
+{
+#ifdef ENABLE_OSCE
+    opus_int ret = SILK_NO_ERROR;
+
+    ret = osce_load_models(&((silk_decoder *)decState)->osce_model, data, len);
+    ((silk_decoder *)decState)->osce_model.loaded = (ret == 0);
+    return ret;
+#else
+    (void) decState;
+    (void) data;
+    (void) len;
+    return SILK_NO_ERROR;
+#endif
+}
+
 opus_int silk_Get_Decoder_Size(                         /* O    Returns error code                              */
     opus_int                        *decSizeBytes       /* O    Number of bytes in SILK decoder state           */
 )
@@ -60,12 +86,37 @@ opus_int silk_Get_Decoder_Size(                         /* O    Returns error co
 }
 
 /* Reset decoder state */
+opus_int silk_ResetDecoder(                              /* O    Returns error code                              */
+    void                            *decState           /* I/O  State                                           */
+)
+{
+    opus_int n, ret = SILK_NO_ERROR;
+    silk_decoder_state *channel_state = ((silk_decoder *)decState)->channel_state;
+
+    for( n = 0; n < DECODER_NUM_CHANNELS; n++ ) {
+        ret  = silk_reset_decoder( &channel_state[ n ] );
+    }
+    silk_memset(&((silk_decoder *)decState)->sStereo, 0, sizeof(((silk_decoder *)decState)->sStereo));
+    /* Not strictly needed, but it's cleaner that way */
+    ((silk_decoder *)decState)->prev_decode_only_middle = 0;
+
+    return ret;
+}
+
+
 opus_int silk_InitDecoder(                              /* O    Returns error code                              */
     void                            *decState           /* I/O  State                                           */
 )
 {
     opus_int n, ret = SILK_NO_ERROR;
     silk_decoder_state *channel_state = ((silk_decoder *)decState)->channel_state;
+#ifdef ENABLE_OSCE
+    ((silk_decoder *)decState)->osce_model.loaded = 0;
+#endif
+#ifndef USE_WEIGHTS_FILE
+    /* load osce models */
+    silk_LoadOSCEModels(decState, NULL, 0);
+#endif
 
     for( n = 0; n < DECODER_NUM_CHANNELS; n++ ) {
         ret  = silk_init_decoder( &channel_state[ n ] );
@@ -86,6 +137,9 @@ opus_int silk_Decode(                                   /* O    Returns error co
     ec_dec                          *psRangeDec,        /* I/O  Compressor data structure                       */
     opus_int16                      *samplesOut,        /* O    Decoded output speech vector                    */
     opus_int32                      *nSamplesOut,       /* O    Number of samples decoded                       */
+#ifdef ENABLE_DEEP_PLC
+    LPCNetPLCState                  *lpcnet,
+#endif
     int                             arch                /* I    Run-time architecture                           */
 )
 {
@@ -278,6 +332,7 @@ opus_int silk_Decode(                                   /* O    Returns error co
         has_side = !psDec->prev_decode_only_middle
               || (decControl->nChannelsInternal == 2 && lostFlag == FLAG_DECODE_LBRR && channel_state[1].LBRR_flags[ channel_state[1].nFramesDecoded ] == 1 );
     }
+    channel_state[ 0 ].sPLC.enable_deep_plc = decControl->enable_deep_plc;
     /* Call decoder for one frame */
     for( n = 0; n < decControl->nChannelsInternal; n++ ) {
         if( n == 0 || has_side ) {
@@ -297,7 +352,19 @@ opus_int silk_Decode(                                   /* O    Returns error co
             } else {
                 condCoding = CODE_CONDITIONALLY;
             }
-            ret += silk_decode_frame( &channel_state[ n ], psRangeDec, &samplesOut1_tmp[ n ][ 2 ], &nSamplesOutDec, lostFlag, condCoding, arch);
+#ifdef ENABLE_OSCE
+            if ( channel_state[n].osce.method != decControl->osce_method ) {
+                osce_reset( &channel_state[n].osce, decControl->osce_method );
+            }
+#endif
+            ret += silk_decode_frame( &channel_state[ n ], psRangeDec, &samplesOut1_tmp[ n ][ 2 ], &nSamplesOutDec, lostFlag, condCoding,
+#ifdef ENABLE_DEEP_PLC
+                n == 0 ? lpcnet : NULL,
+#endif
+#ifdef ENABLE_OSCE
+                &psDec->osce_model,
+#endif
+                arch);
         } else {
             silk_memset( &samplesOut1_tmp[ n ][ 2 ], 0, nSamplesOutDec * sizeof( opus_int16 ) );
         }
diff --git a/media/libopus/silk/decode_frame.c b/media/libopus/silk/decode_frame.c
index 4f36f854c2..9bc4ca2b0e 100644
--- a/media/libopus/silk/decode_frame.c
+++ b/media/libopus/silk/decode_frame.c
@@ -33,6 +33,10 @@ POSSIBILITY OF SUCH DAMAGE.
 #include "stack_alloc.h"
 #include "PLC.h"
 
+#ifdef ENABLE_OSCE
+#include "osce.h"
+#endif
+
 /****************/
 /* Decode frame */
 /****************/
@@ -43,6 +47,12 @@ opus_int silk_decode_frame(
     opus_int32                  *pN,                            /* O    Pointer to size of output frame             */
     opus_int                    lostFlag,                       /* I    0: no loss, 1 loss, 2 decode fec            */
     opus_int                    condCoding,                     /* I    The type of conditional coding to use       */
+#ifdef ENABLE_DEEP_PLC
+    LPCNetPLCState              *lpcnet,
+#endif
+#ifdef ENABLE_OSCE
+    OSCEModel                   *osce_model,
+#endif
     int                         arch                            /* I    Run-time architecture                       */
 )
 {
@@ -61,6 +71,10 @@ opus_int silk_decode_frame(
         ( lostFlag == FLAG_DECODE_LBRR && psDec->LBRR_flags[ psDec->nFramesDecoded ] == 1 ) )
     {
         VARDECL( opus_int16, pulses );
+#ifdef ENABLE_OSCE
+        opus_int32  ec_start;
+        ec_start = ec_tell(psRangeDec);
+#endif
         ALLOC( pulses, (L + SHELL_CODEC_FRAME_LENGTH - 1) &
                        ~(SHELL_CODEC_FRAME_LENGTH - 1), opus_int16 );
         /*********************************************/
@@ -84,10 +98,29 @@ opus_int silk_decode_frame(
         /********************************************************/
         silk_decode_core( psDec, psDecCtrl, pOut, pulses, arch );
 
+        /*************************/
+        /* Update output buffer. */
+        /*************************/
+        celt_assert( psDec->ltp_mem_length >= psDec->frame_length );
+        mv_len = psDec->ltp_mem_length - psDec->frame_length;
+        silk_memmove( psDec->outBuf, &psDec->outBuf[ psDec->frame_length ], mv_len * sizeof(opus_int16) );
+        silk_memcpy( &psDec->outBuf[ mv_len ], pOut, psDec->frame_length * sizeof( opus_int16 ) );
+
+#ifdef ENABLE_OSCE
+        /********************************************************/
+        /* Run SILK enhancer                                    */
+        /********************************************************/
+        osce_enhance_frame( osce_model, psDec, psDecCtrl, pOut, ec_tell(psRangeDec) - ec_start, arch );
+#endif
+
         /********************************************************/
         /* Update PLC state                                     */
         /********************************************************/
-        silk_PLC( psDec, psDecCtrl, pOut, 0, arch );
+        silk_PLC( psDec, psDecCtrl, pOut, 0,
+#ifdef ENABLE_DEEP_PLC
+            lpcnet,
+#endif
+            arch );
 
         psDec->lossCnt = 0;
         psDec->prevSignalType = psDec->indices.signalType;
@@ -97,16 +130,23 @@ opus_int silk_decode_frame(
         psDec->first_frame_after_reset = 0;
     } else {
         /* Handle packet loss by extrapolation */
-        silk_PLC( psDec, psDecCtrl, pOut, 1, arch );
-    }
+        silk_PLC( psDec, psDecCtrl, pOut, 1,
+#ifdef ENABLE_DEEP_PLC
+            lpcnet,
+#endif
+            arch );
 
-    /*************************/
-    /* Update output buffer. */
-    /*************************/
-    celt_assert( psDec->ltp_mem_length >= psDec->frame_length );
-    mv_len = psDec->ltp_mem_length - psDec->frame_length;
-    silk_memmove( psDec->outBuf, &psDec->outBuf[ psDec->frame_length ], mv_len * sizeof(opus_int16) );
-    silk_memcpy( &psDec->outBuf[ mv_len ], pOut, psDec->frame_length * sizeof( opus_int16 ) );
+#ifdef ENABLE_OSCE
+        osce_reset( &psDec->osce, psDec->osce.method );
+#endif
+        /*************************/
+        /* Update output buffer. */
+        /*************************/
+        celt_assert( psDec->ltp_mem_length >= psDec->frame_length );
+        mv_len = psDec->ltp_mem_length - psDec->frame_length;
+        silk_memmove( psDec->outBuf, &psDec->outBuf[ psDec->frame_length ], mv_len * sizeof(opus_int16) );
+        silk_memcpy( &psDec->outBuf[ mv_len ], pOut, psDec->frame_length * sizeof( opus_int16 ) );
+    }
 
     /************************************************/
     /* Comfort noise generation / estimation        */
diff --git a/media/libopus/silk/enc_API.c b/media/libopus/silk/enc_API.c
index 548e07364d..369caddd98 100644
--- a/media/libopus/silk/enc_API.c
+++ b/media/libopus/silk/enc_API.c
@@ -41,6 +41,10 @@ POSSIBILITY OF SUCH DAMAGE.
 #include "main_FLP.h"
 #endif
 
+#ifdef ENABLE_DRED
+#include "dred_encoder.h"
+#endif
+
 /***************************************/
 /* Read control structure from encoder */
 /***************************************/
diff --git a/media/libopus/silk/fixed/encode_frame_FIX.c b/media/libopus/silk/fixed/encode_frame_FIX.c
index a02bf87dbb..7c83360ba3 100644
--- a/media/libopus/silk/fixed/encode_frame_FIX.c
+++ b/media/libopus/silk/fixed/encode_frame_FIX.c
@@ -105,8 +105,11 @@ opus_int silk_encode_frame_FIX(
     opus_int     gain_lock[ MAX_NB_SUBFR ] = {0};
     opus_int16   best_gain_mult[ MAX_NB_SUBFR ];
     opus_int     best_sum[ MAX_NB_SUBFR ];
+    opus_int     bits_margin;
     SAVE_STACK;
 
+    /* For CBR, 5 bits below budget is close enough. For VBR, allow up to 25% below the cap if we initially busted the budget. */
+    bits_margin = useCBR ? 5 : maxBits/4;
     /* This is totally unnecessary but many compilers (including gcc) are too dumb to realise it */
     LastGainIndex_copy2 = nBits_lower = nBits_upper = gainMult_lower = gainMult_upper = 0;
 
@@ -282,7 +285,7 @@ opus_int silk_encode_frame_FIX(
                     gainMult_upper = gainMult_Q8;
                     gainsID_upper = gainsID;
                 }
-            } else if( nBits < maxBits - 5 ) {
+            } else if( nBits < maxBits - bits_margin ) {
                 found_lower = 1;
                 nBits_lower = nBits;
                 gainMult_lower = gainMult_Q8;
@@ -296,7 +299,7 @@ opus_int silk_encode_frame_FIX(
                     LastGainIndex_copy2 = psEnc->sShape.LastGainIndex;
                 }
             } else {
-                /* Within 5 bits of budget: close enough */
+                /* Close enough */
                 break;
             }
 
@@ -318,17 +321,10 @@ opus_int silk_encode_frame_FIX(
             if( ( found_lower & found_upper ) == 0 ) {
                 /* Adjust gain according to high-rate rate/distortion curve */
                 if( nBits > maxBits ) {
-                    if (gainMult_Q8 < 16384) {
-                        gainMult_Q8 *= 2;
-                    } else {
-                        gainMult_Q8 = 32767;
-                    }
+                    gainMult_Q8 = silk_min_32( 1024, gainMult_Q8*3/2 );
                 } else {
-                    opus_int32 gain_factor_Q16;
-                    gain_factor_Q16 = silk_log2lin( silk_LSHIFT( nBits - maxBits, 7 ) / psEnc->sCmn.frame_length + SILK_FIX_CONST( 16, 7 ) );
-                    gainMult_Q8 = silk_SMULWB( gain_factor_Q16, gainMult_Q8 );
+                    gainMult_Q8 = silk_max_32( 64, gainMult_Q8*4/5 );
                 }
-
             } else {
                 /* Adjust gain by interpolating */
                 gainMult_Q8 = gainMult_lower + silk_DIV32_16( silk_MUL( gainMult_upper - gainMult_lower, maxBits - nBits_lower ), nBits_upper - nBits_lower );
diff --git a/media/libopus/silk/float/SigProc_FLP.h b/media/libopus/silk/float/SigProc_FLP.h
index 953de8b09e..ff9281b852 100644
--- a/media/libopus/silk/float/SigProc_FLP.h
+++ b/media/libopus/silk/float/SigProc_FLP.h
@@ -30,6 +30,7 @@ POSSIBILITY OF SUCH DAMAGE.
 
 #include "SigProc_FIX.h"
 #include "float_cast.h"
+#include "main.h"
 #include <math.h>
 
 #ifdef  __cplusplus
@@ -73,7 +74,8 @@ void silk_autocorrelation_FLP(
     silk_float          *results,           /* O    result (length correlationCount)                            */
     const silk_float    *inputData,         /* I    input data to correlate                                     */
     opus_int            inputDataSize,      /* I    length of input                                             */
-    opus_int            correlationCount    /* I    number of correlation taps to compute                       */
+    opus_int            correlationCount,    /* I    number of correlation taps to compute                       */
+    int                 arch
 );
 
 opus_int silk_pitch_analysis_core_FLP(      /* O    Voicing estimate: 0 voiced, 1 unvoiced                      */
@@ -105,7 +107,8 @@ silk_float silk_burg_modified_FLP(          /* O    returns residual energy
     const silk_float    minInvGain,         /* I    minimum inverse prediction gain                             */
     const opus_int      subfr_length,       /* I    input signal subframe length (incl. D preceding samples)    */
     const opus_int      nb_subfr,           /* I    number of subframes stacked in x                            */
-    const opus_int      D                   /* I    order                                                       */
+    const opus_int      D,                  /* I    order                                                       */
+    int                 arch
 );
 
 /* multiply a vector by a constant */
@@ -124,12 +127,17 @@ void silk_scale_copy_vector_FLP(
 );
 
 /* inner product of two silk_float arrays, with result as double */
-double silk_inner_product_FLP(
+double silk_inner_product_FLP_c(
     const silk_float    *data1,
     const silk_float    *data2,
     opus_int            dataSize
 );
 
+#ifndef OVERRIDE_inner_product_FLP
+#define silk_inner_product_FLP(data1, data2, dataSize, arch) ((void)arch,silk_inner_product_FLP_c(data1, data2, dataSize))
+#endif
+
+
 /* sum of squares of a silk_float array, with result as double */
 double silk_energy_FLP(
     const silk_float    *data,
diff --git a/media/libopus/silk/float/autocorrelation_FLP.c b/media/libopus/silk/float/autocorrelation_FLP.c
index 8b8a9e659a..4253b26ebc 100644
--- a/media/libopus/silk/float/autocorrelation_FLP.c
+++ b/media/libopus/silk/float/autocorrelation_FLP.c
@@ -37,7 +37,8 @@ void silk_autocorrelation_FLP(
     silk_float          *results,           /* O    result (length correlationCount)                            */
     const silk_float    *inputData,         /* I    input data to correlate                                     */
     opus_int            inputDataSize,      /* I    length of input                                             */
-    opus_int            correlationCount    /* I    number of correlation taps to compute                       */
+    opus_int            correlationCount,    /* I    number of correlation taps to compute                       */
+    int                 arch
 )
 {
     opus_int i;
@@ -47,6 +48,6 @@ void silk_autocorrelation_FLP(
     }
 
     for( i = 0; i < correlationCount; i++ ) {
-        results[ i ] =  (silk_float)silk_inner_product_FLP( inputData, inputData + i, inputDataSize - i );
+        results[ i ] =  (silk_float)silk_inner_product_FLP( inputData, inputData + i, inputDataSize - i, arch );
     }
 }
diff --git a/media/libopus/silk/float/burg_modified_FLP.c b/media/libopus/silk/float/burg_modified_FLP.c
index 756b76a35b..f5bef5ddbe 100644
--- a/media/libopus/silk/float/burg_modified_FLP.c
+++ b/media/libopus/silk/float/burg_modified_FLP.c
@@ -42,7 +42,8 @@ silk_float silk_burg_modified_FLP(          /* O    returns residual energy
     const silk_float    minInvGain,         /* I    minimum inverse prediction gain                             */
     const opus_int      subfr_length,       /* I    input signal subframe length (incl. D preceding samples)    */
     const opus_int      nb_subfr,           /* I    number of subframes stacked in x                            */
-    const opus_int      D                   /* I    order                                                       */
+    const opus_int      D,                  /* I    order                                                       */
+    int                 arch
 )
 {
     opus_int         k, n, s, reached_max_gain;
@@ -60,7 +61,7 @@ silk_float silk_burg_modified_FLP(          /* O    returns residual energy
     for( s = 0; s < nb_subfr; s++ ) {
         x_ptr = x + s * subfr_length;
         for( n = 1; n < D + 1; n++ ) {
-            C_first_row[ n - 1 ] += silk_inner_product_FLP( x_ptr, x_ptr + n, subfr_length - n );
+            C_first_row[ n - 1 ] += silk_inner_product_FLP( x_ptr, x_ptr + n, subfr_length - n, arch );
         }
     }
     silk_memcpy( C_last_row, C_first_row, SILK_MAX_ORDER_LPC * sizeof( double ) );
diff --git a/media/libopus/silk/float/corrMatrix_FLP.c b/media/libopus/silk/float/corrMatrix_FLP.c
index eae6a1cfca..eef6e8aa79 100644
--- a/media/libopus/silk/float/corrMatrix_FLP.c
+++ b/media/libopus/silk/float/corrMatrix_FLP.c
@@ -41,7 +41,8 @@ void silk_corrVector_FLP(
     const silk_float                *t,                                 /* I    Target vector [L]                           */
     const opus_int                  L,                                  /* I    Length of vecors                            */
     const opus_int                  Order,                              /* I    Max lag for correlation                     */
-    silk_float                      *Xt                                 /* O    X'*t correlation vector [order]             */
+    silk_float                      *Xt,                                /* O    X'*t correlation vector [order]             */
+    int                             arch
 )
 {
     opus_int lag;
@@ -50,7 +51,7 @@ void silk_corrVector_FLP(
     ptr1 = &x[ Order - 1 ];                     /* Points to first sample of column 0 of X: X[:,0] */
     for( lag = 0; lag < Order; lag++ ) {
         /* Calculate X[:,lag]'*t */
-        Xt[ lag ] = (silk_float)silk_inner_product_FLP( ptr1, t, L );
+        Xt[ lag ] = (silk_float)silk_inner_product_FLP( ptr1, t, L, arch );
         ptr1--;                                 /* Next column of X */
     }
 }
@@ -60,7 +61,8 @@ void silk_corrMatrix_FLP(
     const silk_float                *x,                                 /* I    x vector [ L+order-1 ] used to create X     */
     const opus_int                  L,                                  /* I    Length of vectors                           */
     const opus_int                  Order,                              /* I    Max lag for correlation                     */
-    silk_float                      *XX                                 /* O    X'*X correlation matrix [order x order]     */
+    silk_float                      *XX,                                /* O    X'*X correlation matrix [order x order]     */
+    int                             arch
 )
 {
     opus_int j, lag;
@@ -79,7 +81,7 @@ void silk_corrMatrix_FLP(
     ptr2 = &x[ Order - 2 ];                     /* First sample of column 1 of X */
     for( lag = 1; lag < Order; lag++ ) {
         /* Calculate X[:,0]'*X[:,lag] */
-        energy = silk_inner_product_FLP( ptr1, ptr2, L );
+        energy = silk_inner_product_FLP( ptr1, ptr2, L, arch );
         matrix_ptr( XX, lag, 0, Order ) = ( silk_float )energy;
         matrix_ptr( XX, 0, lag, Order ) = ( silk_float )energy;
         /* Calculate X[:,j]'*X[:,j + lag] */
diff --git a/media/libopus/silk/float/encode_frame_FLP.c b/media/libopus/silk/float/encode_frame_FLP.c
index b029c3f5ca..8a327c5626 100644
--- a/media/libopus/silk/float/encode_frame_FLP.c
+++ b/media/libopus/silk/float/encode_frame_FLP.c
@@ -107,7 +107,10 @@ opus_int silk_encode_frame_FLP(
     opus_int     gain_lock[ MAX_NB_SUBFR ] = {0};
     opus_int16   best_gain_mult[ MAX_NB_SUBFR ];
     opus_int     best_sum[ MAX_NB_SUBFR ];
+    opus_int     bits_margin;
 
+    /* For CBR, 5 bits below budget is close enough. For VBR, allow up to 25% below the cap if we initially busted the budget. */
+    bits_margin = useCBR ? 5 : maxBits/4;
     /* This is totally unnecessary but many compilers (including gcc) are too dumb to realise it */
     LastGainIndex_copy2 = nBits_lower = nBits_upper = gainMult_lower = gainMult_upper = 0;
 
@@ -270,7 +273,7 @@ opus_int silk_encode_frame_FLP(
                     gainMult_upper = gainMult_Q8;
                     gainsID_upper = gainsID;
                 }
-            } else if( nBits < maxBits - 5 ) {
+            } else if( nBits < maxBits - bits_margin ) {
                 found_lower = 1;
                 nBits_lower = nBits;
                 gainMult_lower = gainMult_Q8;
@@ -284,7 +287,7 @@ opus_int silk_encode_frame_FLP(
                     LastGainIndex_copy2 = psEnc->sShape.LastGainIndex;
                 }
             } else {
-                /* Within 5 bits of budget: close enough */
+                /* Close enough */
                 break;
             }
 
@@ -306,15 +309,9 @@ opus_int silk_encode_frame_FLP(
             if( ( found_lower & found_upper ) == 0 ) {
                 /* Adjust gain according to high-rate rate/distortion curve */
                 if( nBits > maxBits ) {
-                    if (gainMult_Q8 < 16384) {
-                        gainMult_Q8 *= 2;
-                    } else {
-                        gainMult_Q8 = 32767;
-                    }
+                    gainMult_Q8 = silk_min_32( 1024, gainMult_Q8*3/2 );
                 } else {
-                    opus_int32 gain_factor_Q16;
-                    gain_factor_Q16 = silk_log2lin( silk_LSHIFT( nBits - maxBits, 7 ) / psEnc->sCmn.frame_length + SILK_FIX_CONST( 16, 7 ) );
-                    gainMult_Q8 = silk_SMULWB( gain_factor_Q16, gainMult_Q8 );
+                    gainMult_Q8 = silk_max_32( 64, gainMult_Q8*4/5 );
                 }
             } else {
                 /* Adjust gain by interpolating */
diff --git a/media/libopus/silk/float/find_LPC_FLP.c b/media/libopus/silk/float/find_LPC_FLP.c
index fa3ffe7f8b..6ccd711dc3 100644
--- a/media/libopus/silk/float/find_LPC_FLP.c
+++ b/media/libopus/silk/float/find_LPC_FLP.c
@@ -38,7 +38,8 @@ void silk_find_LPC_FLP(
     silk_encoder_state              *psEncC,                            /* I/O  Encoder state                               */
     opus_int16                      NLSF_Q15[],                         /* O    NLSFs                                       */
     const silk_float                x[],                                /* I    Input signal                                */
-    const silk_float                minInvGain                          /* I    Inverse of max prediction gain              */
+    const silk_float                minInvGain,                         /* I    Inverse of max prediction gain              */
+    int                             arch
 )
 {
     opus_int    k, subfr_length;
@@ -56,12 +57,12 @@ void silk_find_LPC_FLP(
     psEncC->indices.NLSFInterpCoef_Q2 = 4;
 
     /* Burg AR analysis for the full frame */
-    res_nrg = silk_burg_modified_FLP( a, x, minInvGain, subfr_length, psEncC->nb_subfr, psEncC->predictLPCOrder );
+    res_nrg = silk_burg_modified_FLP( a, x, minInvGain, subfr_length, psEncC->nb_subfr, psEncC->predictLPCOrder, arch );
 
     if( psEncC->useInterpolatedNLSFs && !psEncC->first_frame_after_reset && psEncC->nb_subfr == MAX_NB_SUBFR ) {
         /* Optimal solution for last 10 ms; subtract residual energy here, as that's easier than        */
         /* adding it to the residual energy of the first 10 ms in each iteration of the search below    */
-        res_nrg -= silk_burg_modified_FLP( a_tmp, x + ( MAX_NB_SUBFR / 2 ) * subfr_length, minInvGain, subfr_length, MAX_NB_SUBFR / 2, psEncC->predictLPCOrder );
+        res_nrg -= silk_burg_modified_FLP( a_tmp, x + ( MAX_NB_SUBFR / 2 ) * subfr_length, minInvGain, subfr_length, MAX_NB_SUBFR / 2, psEncC->predictLPCOrder, arch );
 
         /* Convert to NLSFs */
         silk_A2NLSF_FLP( NLSF_Q15, a_tmp, psEncC->predictLPCOrder );
diff --git a/media/libopus/silk/float/find_LTP_FLP.c b/media/libopus/silk/float/find_LTP_FLP.c
index f97064930e..90aeeac0b7 100644
--- a/media/libopus/silk/float/find_LTP_FLP.c
+++ b/media/libopus/silk/float/find_LTP_FLP.c
@@ -38,7 +38,8 @@ void silk_find_LTP_FLP(
     const silk_float                r_ptr[],                            /* I    LPC residual                                */
     const opus_int                  lag[ MAX_NB_SUBFR ],                /* I    LTP lags                                    */
     const opus_int                  subfr_length,                       /* I    Subframe length                             */
-    const opus_int                  nb_subfr                            /* I    number of subframes                         */
+    const opus_int                  nb_subfr,                           /* I    number of subframes                         */
+    int                             arch
 )
 {
     opus_int   k;
@@ -50,8 +51,8 @@ void silk_find_LTP_FLP(
     XX_ptr = XX;
     for( k = 0; k < nb_subfr; k++ ) {
         lag_ptr = r_ptr - ( lag[ k ] + LTP_ORDER / 2 );
-        silk_corrMatrix_FLP( lag_ptr, subfr_length, LTP_ORDER, XX_ptr );
-        silk_corrVector_FLP( lag_ptr, r_ptr, subfr_length, LTP_ORDER, xX_ptr );
+        silk_corrMatrix_FLP( lag_ptr, subfr_length, LTP_ORDER, XX_ptr, arch );
+        silk_corrVector_FLP( lag_ptr, r_ptr, subfr_length, LTP_ORDER, xX_ptr, arch );
         xx = ( silk_float )silk_energy_FLP( r_ptr, subfr_length + LTP_ORDER );
         temp = 1.0f / silk_max( xx, LTP_CORR_INV_MAX * 0.5f * ( XX_ptr[ 0 ] + XX_ptr[ 24 ] ) + 1.0f );
         silk_scale_vector_FLP( XX_ptr, temp, LTP_ORDER * LTP_ORDER );
diff --git a/media/libopus/silk/float/find_pitch_lags_FLP.c b/media/libopus/silk/float/find_pitch_lags_FLP.c
index dedbcd2836..1f6bd5991c 100644
--- a/media/libopus/silk/float/find_pitch_lags_FLP.c
+++ b/media/libopus/silk/float/find_pitch_lags_FLP.c
@@ -82,7 +82,7 @@ void silk_find_pitch_lags_FLP(
     silk_apply_sine_window_FLP( Wsig_ptr, x_buf_ptr, 2, psEnc->sCmn.la_pitch );
 
     /* Calculate autocorrelation sequence */
-    silk_autocorrelation_FLP( auto_corr, Wsig, psEnc->sCmn.pitch_LPC_win_length, psEnc->sCmn.pitchEstimationLPCOrder + 1 );
+    silk_autocorrelation_FLP( auto_corr, Wsig, psEnc->sCmn.pitch_LPC_win_length, psEnc->sCmn.pitchEstimationLPCOrder + 1, arch );
 
     /* Add white noise, as a fraction of the energy */
     auto_corr[ 0 ] += auto_corr[ 0 ] * FIND_PITCH_WHITE_NOISE_FRACTION + 1;
diff --git a/media/libopus/silk/float/find_pred_coefs_FLP.c b/media/libopus/silk/float/find_pred_coefs_FLP.c
index 6f79078893..f3c54cf474 100644
--- a/media/libopus/silk/float/find_pred_coefs_FLP.c
+++ b/media/libopus/silk/float/find_pred_coefs_FLP.c
@@ -63,7 +63,7 @@ void silk_find_pred_coefs_FLP(
         celt_assert( psEnc->sCmn.ltp_mem_length - psEnc->sCmn.predictLPCOrder >= psEncCtrl->pitchL[ 0 ] + LTP_ORDER / 2 );
 
         /* LTP analysis */
-        silk_find_LTP_FLP( XXLTP, xXLTP, res_pitch, psEncCtrl->pitchL, psEnc->sCmn.subfr_length, psEnc->sCmn.nb_subfr );
+        silk_find_LTP_FLP( XXLTP, xXLTP, res_pitch, psEncCtrl->pitchL, psEnc->sCmn.subfr_length, psEnc->sCmn.nb_subfr, psEnc->sCmn.arch );
 
         /* Quantize LTP gain parameters */
         silk_quant_LTP_gains_FLP( psEncCtrl->LTPCoef, psEnc->sCmn.indices.LTPIndex, &psEnc->sCmn.indices.PERIndex,
@@ -102,7 +102,7 @@ void silk_find_pred_coefs_FLP(
     }
 
     /* LPC_in_pre contains the LTP-filtered input for voiced, and the unfiltered input for unvoiced */
-    silk_find_LPC_FLP( &psEnc->sCmn, NLSF_Q15, LPC_in_pre, minInvGain );
+    silk_find_LPC_FLP( &psEnc->sCmn, NLSF_Q15, LPC_in_pre, minInvGain, psEnc->sCmn.arch );
 
     /* Quantize LSFs */
     silk_process_NLSFs_FLP( &psEnc->sCmn, psEncCtrl->PredCoef, NLSF_Q15, psEnc->sCmn.prev_NLSFq_Q15 );
diff --git a/media/libopus/silk/float/inner_product_FLP.c b/media/libopus/silk/float/inner_product_FLP.c
index cdd39d24ce..88b160ab40 100644
--- a/media/libopus/silk/float/inner_product_FLP.c
+++ b/media/libopus/silk/float/inner_product_FLP.c
@@ -32,7 +32,7 @@ POSSIBILITY OF SUCH DAMAGE.
 #include "SigProc_FLP.h"
 
 /* inner product of two silk_float arrays, with result as double */
-double silk_inner_product_FLP(
+double silk_inner_product_FLP_c(
     const silk_float    *data1,
     const silk_float    *data2,
     opus_int            dataSize
diff --git a/media/libopus/silk/float/main_FLP.h b/media/libopus/silk/float/main_FLP.h
index 5dc0ccf4a4..2e4435cc68 100644
--- a/media/libopus/silk/float/main_FLP.h
+++ b/media/libopus/silk/float/main_FLP.h
@@ -138,7 +138,8 @@ void silk_find_LPC_FLP(
     silk_encoder_state              *psEncC,                            /* I/O  Encoder state                               */
     opus_int16                      NLSF_Q15[],                         /* O    NLSFs                                       */
     const silk_float                x[],                                /* I    Input signal                                */
-    const silk_float                minInvGain                          /* I    Prediction gain from LTP (dB)               */
+    const silk_float                minInvGain,                         /* I    Prediction gain from LTP (dB)               */
+    int                             arch
 );
 
 /* LTP analysis */
@@ -148,7 +149,8 @@ void silk_find_LTP_FLP(
     const silk_float                r_ptr[],                            /* I    LPC residual                                */
     const opus_int                  lag[  MAX_NB_SUBFR ],               /* I    LTP lags                                    */
     const opus_int                  subfr_length,                       /* I    Subframe length                             */
-    const opus_int                  nb_subfr                            /* I    number of subframes                         */
+    const opus_int                  nb_subfr,                           /* I    number of subframes                         */
+    int                             arch
 );
 
 void silk_LTP_analysis_filter_FLP(
@@ -221,7 +223,8 @@ void silk_corrMatrix_FLP(
     const silk_float                *x,                                 /* I    x vector [ L+order-1 ] used to create X     */
     const opus_int                  L,                                  /* I    Length of vectors                           */
     const opus_int                  Order,                              /* I    Max lag for correlation                     */
-    silk_float                      *XX                                 /* O    X'*X correlation matrix [order x order]     */
+    silk_float                      *XX,                                /* O    X'*X correlation matrix [order x order]     */
+    int                             arch
 );
 
 /* Calculates correlation vector X'*t */
@@ -230,7 +233,8 @@ void silk_corrVector_FLP(
     const silk_float                *t,                                 /* I    Target vector [L]                           */
     const opus_int                  L,                                  /* I    Length of vecors                            */
     const opus_int                  Order,                              /* I    Max lag for correlation                     */
-    silk_float                      *Xt                                 /* O    X'*t correlation vector [order]             */
+    silk_float                      *Xt,                                /* O    X'*t correlation vector [order]             */
+    int                             arch
 );
 
 /* Apply sine window to signal vector.  */
diff --git a/media/libopus/silk/float/noise_shape_analysis_FLP.c b/media/libopus/silk/float/noise_shape_analysis_FLP.c
index cb3d8a50b7..0b5ea95218 100644
--- a/media/libopus/silk/float/noise_shape_analysis_FLP.c
+++ b/media/libopus/silk/float/noise_shape_analysis_FLP.c
@@ -255,7 +255,7 @@ void silk_noise_shape_analysis_FLP(
                 psEnc->sCmn.shapeWinLength, psEnc->sCmn.shapingLPCOrder );
         } else {
             /* Calculate regular auto correlation */
-            silk_autocorrelation_FLP( auto_corr, x_windowed, psEnc->sCmn.shapeWinLength, psEnc->sCmn.shapingLPCOrder + 1 );
+            silk_autocorrelation_FLP( auto_corr, x_windowed, psEnc->sCmn.shapeWinLength, psEnc->sCmn.shapingLPCOrder + 1, psEnc->sCmn.arch );
         }
 
         /* Add white noise, as a fraction of energy */
diff --git a/media/libopus/silk/float/pitch_analysis_core_FLP.c b/media/libopus/silk/float/pitch_analysis_core_FLP.c
index f351bc3718..0530a8831a 100644
--- a/media/libopus/silk/float/pitch_analysis_core_FLP.c
+++ b/media/libopus/silk/float/pitch_analysis_core_FLP.c
@@ -291,7 +291,7 @@ opus_int silk_pitch_analysis_core_FLP(      /* O    Voicing estimate: 0 voiced,
         for( j = 0; j < length_d_comp; j++ ) {
             d = d_comp[ j ];
             basis_ptr = target_ptr - d;
-            cross_corr = silk_inner_product_FLP( basis_ptr, target_ptr, sf_length_8kHz );
+            cross_corr = silk_inner_product_FLP( basis_ptr, target_ptr, sf_length_8kHz, arch );
             if( cross_corr > 0.0f ) {
                 energy = silk_energy_FLP( basis_ptr, sf_length_8kHz );
                 C[ k ][ d ] = (silk_float)( 2 * cross_corr / ( energy + energy_tmp ) );
diff --git a/media/libopus/silk/float/warped_autocorrelation_FLP.c b/media/libopus/silk/float/warped_autocorrelation_FLP.c
index 09186e73d4..116dab923f 100644
--- a/media/libopus/silk/float/warped_autocorrelation_FLP.c
+++ b/media/libopus/silk/float/warped_autocorrelation_FLP.c
@@ -54,11 +54,13 @@ void silk_warped_autocorrelation_FLP(
         /* Loop over allpass sections */
         for( i = 0; i < order; i += 2 ) {
             /* Output of allpass section */
-            tmp2 = state[ i ] + warping * ( state[ i + 1 ] - tmp1 );
+            /* We voluntarily use two multiples instead of factoring the expression to
+               reduce the length of the dependency chain (tmp1->tmp2->tmp1... ). */
+            tmp2 = state[ i ] + warping * state[ i + 1 ] - warping * tmp1;
             state[ i ] = tmp1;
             C[ i ] += state[ 0 ] * tmp1;
             /* Output of allpass section */
-            tmp1 = state[ i + 1 ] + warping * ( state[ i + 2 ] - tmp2 );
+            tmp1 = state[ i + 1 ] + warping * state[ i + 2 ] - warping * tmp2;
             state[ i + 1 ] = tmp2;
             C[ i + 1 ] += state[ 0 ] * tmp2;
         }
diff --git a/media/libopus/silk/float/x86/inner_product_FLP_avx2.c b/media/libopus/silk/float/x86/inner_product_FLP_avx2.c
new file mode 100644
index 0000000000..4a2daaf595
--- /dev/null
+++ b/media/libopus/silk/float/x86/inner_product_FLP_avx2.c
@@ -0,0 +1,85 @@
+/***********************************************************************
+Copyright (c) 2006-2011, Skype Limited. All rights reserved.
+              2023 Amazon
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "SigProc_FLP.h"
+#include <immintrin.h>
+
+
+/* inner product of two silk_float arrays, with result as double */
+double silk_inner_product_FLP_avx2(
+    const silk_float    *data1,
+    const silk_float    *data2,
+    opus_int            dataSize
+)
+{
+    opus_int i;
+    __m256d accum1, accum2;
+    double   result;
+
+    /* 4x unrolled loop */
+    result = 0.0;
+    accum1 = accum2 = _mm256_setzero_pd();
+    for( i = 0; i < dataSize - 7; i += 8 ) {
+        __m128  x1f, x2f;
+        __m256d x1d, x2d;
+        x1f = _mm_loadu_ps( &data1[ i ] );
+        x2f = _mm_loadu_ps( &data2[ i ] );
+        x1d = _mm256_cvtps_pd( x1f );
+        x2d = _mm256_cvtps_pd( x2f );
+        accum1 = _mm256_fmadd_pd( x1d, x2d, accum1 );
+        x1f = _mm_loadu_ps( &data1[ i + 4 ] );
+        x2f = _mm_loadu_ps( &data2[ i + 4 ] );
+        x1d = _mm256_cvtps_pd( x1f );
+        x2d = _mm256_cvtps_pd( x2f );
+        accum2 = _mm256_fmadd_pd( x1d, x2d, accum2 );
+    }
+    for( ; i < dataSize - 3; i += 4 ) {
+        __m128  x1f, x2f;
+        __m256d x1d, x2d;
+        x1f = _mm_loadu_ps( &data1[ i ] );
+        x2f = _mm_loadu_ps( &data2[ i ] );
+        x1d = _mm256_cvtps_pd( x1f );
+        x2d = _mm256_cvtps_pd( x2f );
+        accum1 = _mm256_fmadd_pd( x1d, x2d, accum1 );
+    }
+    accum1 = _mm256_add_pd(accum1, accum2);
+    accum1 = _mm256_add_pd(accum1, _mm256_permute2f128_pd(accum1, accum1, 1));
+    accum1 = _mm256_hadd_pd(accum1,accum1);
+    result = _mm256_cvtsd_f64(accum1);
+
+    /* add any remaining products */
+    for( ; i < dataSize; i++ ) {
+        result += data1[ i ] * (double)data2[ i ];
+    }
+
+    return result;
+}
diff --git a/media/libopus/silk/init_decoder.c b/media/libopus/silk/init_decoder.c
index 16c03dcd1c..01bc4b7a12 100644
--- a/media/libopus/silk/init_decoder.c
+++ b/media/libopus/silk/init_decoder.c
@@ -31,15 +31,21 @@ POSSIBILITY OF SUCH DAMAGE.
 
 #include "main.h"
 
+#ifdef ENABLE_OSCE
+#include "osce.h"
+#endif
+
+#include "structs.h"
+
 /************************/
-/* Init Decoder State   */
+/* Reset Decoder State  */
 /************************/
-opus_int silk_init_decoder(
+opus_int silk_reset_decoder(
     silk_decoder_state          *psDec                          /* I/O  Decoder state pointer                       */
 )
 {
     /* Clear the entire encoder state, except anything copied */
-    silk_memset( psDec, 0, sizeof( silk_decoder_state ) );
+    silk_memset( &psDec->SILK_DECODER_STATE_RESET_START, 0, sizeof( silk_decoder_state ) - ((char*) &psDec->SILK_DECODER_STATE_RESET_START - (char*)psDec) );
 
     /* Used to deactivate LSF interpolation */
     psDec->first_frame_after_reset = 1;
@@ -52,6 +58,27 @@ opus_int silk_init_decoder(
     /* Reset PLC state */
     silk_PLC_Reset( psDec );
 
+#ifdef ENABLE_OSCE
+    /* Reset OSCE state and method */
+    osce_reset(&psDec->osce, OSCE_DEFAULT_METHOD);
+#endif
+
+    return 0;
+}
+
+
+/************************/
+/* Init Decoder State   */
+/************************/
+opus_int silk_init_decoder(
+    silk_decoder_state          *psDec                          /* I/O  Decoder state pointer                       */
+)
+{
+    /* Clear the entire encoder state, except anything copied */
+    silk_memset( psDec, 0, sizeof( silk_decoder_state ) );
+
+    silk_reset_decoder( psDec );
+
     return(0);
 }
 
diff --git a/media/libopus/silk/init_encoder.c b/media/libopus/silk/init_encoder.c
index 65995c33fa..10d41287fe 100644
--- a/media/libopus/silk/init_encoder.c
+++ b/media/libopus/silk/init_encoder.c
@@ -36,6 +36,10 @@ POSSIBILITY OF SUCH DAMAGE.
 #include "tuning_parameters.h"
 #include "cpu_support.h"
 
+#ifdef ENABLE_DRED
+#include "dred_encoder.h"
+#endif
+
 /*********************************/
 /* Initialize Silk Encoder state */
 /*********************************/
diff --git a/media/libopus/silk/main.h b/media/libopus/silk/main.h
index a5f568758f..cd576d8cc1 100644
--- a/media/libopus/silk/main.h
+++ b/media/libopus/silk/main.h
@@ -252,7 +252,7 @@ void silk_NSQ_c(
     SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */
     const opus_int16            x16[],                                        /* I    Input                           */
     opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */
-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],            /* I    Short term prediction coefs     */
+    const opus_int16            *PredCoef_Q12,                                /* I    Short term prediction coefs     */
     const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */
     const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */
     const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */
@@ -278,7 +278,7 @@ void silk_NSQ_del_dec_c(
     SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */
     const opus_int16            x16[],                                        /* I    Input                           */
     opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */
-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],            /* I    Short term prediction coefs     */
+    const opus_int16            *PredCoef_Q12,                                /* I    Short term prediction coefs     */
     const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */
     const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */
     const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */
@@ -389,6 +389,10 @@ void silk_NLSF_decode(
 /****************************************************/
 /* Decoder Functions                                */
 /****************************************************/
+opus_int silk_reset_decoder(
+    silk_decoder_state          *psDec                          /* I/O  Decoder state pointer                       */
+);
+
 opus_int silk_init_decoder(
     silk_decoder_state          *psDec                          /* I/O  Decoder state pointer                       */
 );
@@ -410,6 +414,12 @@ opus_int silk_decode_frame(
     opus_int32                  *pN,                            /* O    Pointer to size of output frame             */
     opus_int                    lostFlag,                       /* I    0: no loss, 1 loss, 2 decode fec            */
     opus_int                    condCoding,                     /* I    The type of conditional coding to use       */
+#ifdef ENABLE_DEEP_PLC
+    LPCNetPLCState              *lpcnet,
+#endif
+#ifdef ENABLE_OSCE
+    OSCEModel                   *osce_model,
+#endif
     int                         arch                            /* I    Run-time architecture                       */
 );
 
diff --git a/media/libopus/silk/mips/NSQ_del_dec_mipsr1.h b/media/libopus/silk/mips/NSQ_del_dec_mipsr1.h
index cd70713a8f..85bfb637ef 100644
--- a/media/libopus/silk/mips/NSQ_del_dec_mipsr1.h
+++ b/media/libopus/silk/mips/NSQ_del_dec_mipsr1.h
@@ -25,8 +25,8 @@ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
 ***********************************************************************/
 
-#ifndef __NSQ_DEL_DEC_MIPSR1_H__
-#define __NSQ_DEL_DEC_MIPSR1_H__
+#ifndef NSQ_DEL_DEC_MIPSR1_H__
+#define NSQ_DEL_DEC_MIPSR1_H__
 
 #ifdef HAVE_CONFIG_H
 #include "config.h"
@@ -407,4 +407,4 @@ static inline void silk_noise_shape_quantizer_del_dec(
     }
 }
 
-#endif /* __NSQ_DEL_DEC_MIPSR1_H__ */
+#endif /* NSQ_DEL_DEC_MIPSR1_H__ */
diff --git a/media/libopus/silk/mips/macros_mipsr1.h b/media/libopus/silk/mips/macros_mipsr1.h
index 12ed981a6e..af408802c3 100644
--- a/media/libopus/silk/mips/macros_mipsr1.h
+++ b/media/libopus/silk/mips/macros_mipsr1.h
@@ -26,8 +26,8 @@ POSSIBILITY OF SUCH DAMAGE.
 ***********************************************************************/
 
 
-#ifndef __SILK_MACROS_MIPSR1_H__
-#define __SILK_MACROS_MIPSR1_H__
+#ifndef SILK_MACROS_MIPSR1_H__
+#define SILK_MACROS_MIPSR1_H__
 
 #define mips_clz(x) __builtin_clz(x)
 
@@ -89,4 +89,4 @@ static inline opus_int32 silk_CLZ32(opus_int32 in32)
     return re32;
 }
 
-#endif /* __SILK_MACROS_MIPSR1_H__ */
+#endif /* SILK_MACROS_MIPSR1_H__ */
diff --git a/media/libopus/silk/structs.h b/media/libopus/silk/structs.h
index 3380c757b2..38243be1ea 100644
--- a/media/libopus/silk/structs.h
+++ b/media/libopus/silk/structs.h
@@ -34,6 +34,21 @@ POSSIBILITY OF SUCH DAMAGE.
 #include "entenc.h"
 #include "entdec.h"
 
+#ifdef ENABLE_DEEP_PLC
+#include "lpcnet.h"
+#include "lpcnet_private.h"
+#endif
+
+#ifdef ENABLE_DRED
+#include "dred_encoder.h"
+#include "dred_decoder.h"
+#endif
+
+#ifdef ENABLE_OSCE
+#include "osce_config.h"
+#include "osce_structs.h"
+#endif
+
 #ifdef __cplusplus
 extern "C"
 {
@@ -228,6 +243,14 @@ typedef struct {
 } silk_encoder_state;
 
 
+#ifdef ENABLE_OSCE
+typedef struct {
+    OSCEFeatureState features;
+    OSCEState state;
+    int method;
+} silk_OSCE_struct;
+#endif
+
 /* Struct for Packet Loss Concealment */
 typedef struct {
     opus_int32                  pitchL_Q8;                          /* Pitch lag to use for voiced concealment                          */
@@ -243,6 +266,7 @@ typedef struct {
     opus_int                    fs_kHz;
     opus_int                    nb_subfr;
     opus_int                    subfr_length;
+    opus_int                    enable_deep_plc;
 } silk_PLC_struct;
 
 /* Struct for CNG */
@@ -259,6 +283,10 @@ typedef struct {
 /* Decoder state                */
 /********************************/
 typedef struct {
+#ifdef ENABLE_OSCE
+    silk_OSCE_struct            osce;
+#endif
+#define SILK_DECODER_STATE_RESET_START prev_gain_Q16
     opus_int32                  prev_gain_Q16;
     opus_int32                  exc_Q14[ MAX_FRAME_LENGTH ];
     opus_int32                  sLPC_Q14_buf[ MAX_LPC_ORDER ];
diff --git a/media/libopus/silk/x86/NSQ_del_dec_avx2.c b/media/libopus/silk/x86/NSQ_del_dec_avx2.c
new file mode 100644
index 0000000000..43485871a4
--- /dev/null
+++ b/media/libopus/silk/x86/NSQ_del_dec_avx2.c
@@ -0,0 +1,1075 @@
+/***********************************************************************
+Copyright (c) 2021 Google Inc.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#ifdef OPUS_CHECK_ASM
+#include <string.h>
+#endif
+
+#include "opus_defines.h"
+#include <immintrin.h>
+
+#include "main.h"
+#include "stack_alloc.h"
+#include "NSQ.h"
+#include "celt/x86/x86cpu.h"
+
+/* Returns TRUE if all assumptions met */
+static OPUS_INLINE int verify_assumptions(const silk_encoder_state *psEncC)
+{
+    /* This optimization is based on these assumptions        */
+    /* These assumptions are fundamental and hence assert are */
+    /* used. Should any assert triggers, we have to re-visit  */
+    /* all related code to make sure it still functions the   */
+    /* same as the C implementation.                          */
+    silk_assert(MAX_DEL_DEC_STATES  <= 4      &&
+                MAX_FRAME_LENGTH     % 4 == 0 &&
+                MAX_SUB_FRAME_LENGTH % 4 == 0 &&
+                LTP_MEM_LENGTH_MS    % 4 == 0 );
+    silk_assert(psEncC->fs_kHz ==  8 ||
+                psEncC->fs_kHz == 12 ||
+                psEncC->fs_kHz == 16 );
+    silk_assert(psEncC->nb_subfr <= MAX_NB_SUBFR &&
+                psEncC->nb_subfr > 0             );
+    silk_assert(psEncC->nStatesDelayedDecision <= MAX_DEL_DEC_STATES &&
+                psEncC->nStatesDelayedDecision > 0                   );
+    silk_assert(psEncC->ltp_mem_length == psEncC->fs_kHz * LTP_MEM_LENGTH_MS);
+
+    /* Regressions were observed on certain AMD Zen CPUs when      */
+    /* nStatesDelayedDecision is 1 or 2. Ideally we should detect  */
+    /* these CPUs and enable this optimization on others; however, */
+    /* there is no good way to do so under current OPUS framework. */
+    return psEncC->nStatesDelayedDecision == 3 ||
+           psEncC->nStatesDelayedDecision == 4;
+}
+
+/* Intrinsics not defined on MSVC */
+#ifdef _MSC_VER
+#include <Intsafe.h>
+#define __m128i_u __m128i
+static inline int __builtin_sadd_overflow(opus_int32 a, opus_int32 b, opus_int32* res)
+{
+    *res = a+b;
+    return (*res ^ a) & (*res ^ b) & 0x80000000;
+}
+static inline int __builtin_ctz(unsigned int x)
+{
+    DWORD res = 0;
+    return _BitScanForward(&res, x) ? res : 32;
+}
+#endif
+
+static OPUS_INLINE __m128i silk_cvtepi64_epi32_high(__m256i num)
+{
+    return _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(num, _mm256_set_epi32(0, 0, 0, 0, 7, 5, 3, 1)));
+}
+
+static OPUS_INLINE opus_int16 silk_sat16(opus_int32 num)
+{
+    num = num > silk_int16_MAX ? silk_int16_MAX : num;
+    num = num < silk_int16_MIN ? silk_int16_MIN : num;
+    return num;
+}
+
+static OPUS_INLINE opus_int32 silk_sar_round_32(opus_int32 a, int bits)
+{
+    silk_assert(bits > 0 && bits < 31);
+    a += 1 << (bits-1);
+    return a >> bits;
+}
+
+static OPUS_INLINE opus_int64 silk_sar_round_smulww(opus_int32 a, opus_int32 b, int bits)
+{
+    silk_assert(bits > 0 && bits < 63);
+#ifdef OPUS_CHECK_ASM
+    return silk_RSHIFT_ROUND(silk_SMULWW(a, b), bits);
+#else
+    /* This code is more correct, but it won't overflow like the C code in some rare cases. */
+    silk_assert(bits > 0 && bits < 63);
+    opus_int64 t = ((opus_int64)a) * ((opus_int64)b);
+    bits += 16;
+    t += 1ull << (bits-1);
+    return t >> bits;
+#endif
+}
+
+static OPUS_INLINE opus_int32 silk_add_sat32(opus_int32 a, opus_int32 b)
+{
+    opus_int32 sum;
+    if (__builtin_sadd_overflow(a, b, &sum))
+    {
+        return a >= 0 ? silk_int32_MAX : silk_int32_MIN;
+    }
+    return sum;
+}
+
+static OPUS_INLINE __m128i silk_mm_srai_round_epi32(__m128i a, int bits)
+{
+    silk_assert(bits > 0 && bits < 31);
+    return _mm_srai_epi32(_mm_add_epi32(a, _mm_set1_epi32(1 << (bits - 1))), bits);
+}
+
+/* add/subtract with output saturated */
+static OPUS_INLINE __m128i silk_mm_add_sat_epi32(__m128i a, __m128i b)
+{
+    __m128i r = _mm_add_epi32(a, b);
+    __m128i OF = _mm_and_si128(_mm_xor_si128(a, r), _mm_xor_si128(b, r));           /* OF = (sum ^ a) & (sum ^ b)   */
+    __m128i SAT = _mm_add_epi32(_mm_srli_epi32(a, 31), _mm_set1_epi32(0x7FFFFFFF)); /* SAT = (a >> 31) + 0x7FFFFFFF */
+    return _mm_blendv_epi8(r, SAT, _mm_srai_epi32(OF, 31));
+}
+static OPUS_INLINE __m128i silk_mm_sub_sat_epi32(__m128i a, __m128i b)
+{
+    __m128i r = _mm_sub_epi32(a, b);
+    __m128i OF = _mm_andnot_si128(_mm_xor_si128(b, r), _mm_xor_si128(a, r));        /* OF = (sum ^ a) & (sum ^ ~b) = (sum ^ a) & ~(sum ^ b) */
+    __m128i SAT = _mm_add_epi32(_mm_srli_epi32(a, 31), _mm_set1_epi32(0x7FFFFFFF)); /* SAT = (a >> 31) + 0x7FFFFFFF                         */
+    return _mm_blendv_epi8(r, SAT, _mm_srai_epi32(OF, 31));
+}
+static OPUS_INLINE __m256i silk_mm256_sub_sat_epi32(__m256i a, __m256i b)
+{
+    __m256i r = _mm256_sub_epi32(a, b);
+    __m256i OF = _mm256_andnot_si256(_mm256_xor_si256(b, r), _mm256_xor_si256(a, r));        /* OF = (sum ^ a) & (sum ^ ~b) = (sum ^ a) & ~(sum ^ b) */
+    __m256i SAT = _mm256_add_epi32(_mm256_srli_epi32(a, 31), _mm256_set1_epi32(0x7FFFFFFF)); /* SAT = (a >> 31) + 0x7FFFFFFF                         */
+    return _mm256_blendv_epi8(r, SAT, _mm256_srai_epi32(OF, 31));
+}
+
+static OPUS_INLINE __m128i silk_mm_limit_epi32(__m128i num, opus_int32 limit1, opus_int32 limit2)
+{
+    opus_int32 lo = limit1 < limit2 ? limit1 : limit2;
+    opus_int32 hi = limit1 > limit2 ? limit1 : limit2;
+
+    num = _mm_min_epi32(num, _mm_set1_epi32(hi));
+    num = _mm_max_epi32(num, _mm_set1_epi32(lo));
+    return num;
+}
+
+/* cond < 0 ? -num : num */
+static OPUS_INLINE __m128i silk_mm_sign_epi32(__m128i num, __m128i cond)
+{
+    return _mm_sign_epi32(num, _mm_or_si128(cond, _mm_set1_epi32(1)));
+}
+static OPUS_INLINE __m256i silk_mm256_sign_epi32(__m256i num, __m256i cond)
+{
+    return _mm256_sign_epi32(num, _mm256_or_si256(cond, _mm256_set1_epi32(1)));
+}
+
+/* (a32 * b32) >> 16 */
+static OPUS_INLINE __m128i silk_mm_smulww_epi32(__m128i a, opus_int32 b)
+{
+    return silk_cvtepi64_epi32_high(_mm256_slli_epi64(_mm256_mul_epi32(_mm256_cvtepi32_epi64(a), _mm256_set1_epi32(b)), 16));
+}
+
+/* (a32 * (opus_int32)((opus_int16)(b32))) >> 16 output have to be 32bit int */
+static OPUS_INLINE __m128i silk_mm_smulwb_epi32(__m128i a, opus_int32 b)
+{
+    return silk_cvtepi64_epi32_high(_mm256_mul_epi32(_mm256_cvtepi32_epi64(a), _mm256_set1_epi32(silk_LSHIFT(b, 16))));
+}
+
+/* (opus_int32)((opus_int16)(a3))) * (opus_int32)((opus_int16)(b32)) output have to be 32bit int */
+static OPUS_INLINE __m256i silk_mm256_smulbb_epi32(__m256i a, __m256i b)
+{
+    const char FF = (char)0xFF;
+    __m256i msk = _mm256_set_epi8(
+        FF, FF, FF, FF, FF, FF, FF, FF, 13, 12, 9, 8, 5, 4, 1, 0,
+        FF, FF, FF, FF, FF, FF, FF, FF, 13, 12, 9, 8, 5, 4, 1, 0);
+    __m256i lo = _mm256_mullo_epi16(a, b);
+    __m256i hi = _mm256_mulhi_epi16(a, b);
+    lo = _mm256_shuffle_epi8(lo, msk);
+    hi = _mm256_shuffle_epi8(hi, msk);
+    return _mm256_unpacklo_epi16(lo, hi);
+}
+
+static OPUS_INLINE __m256i silk_mm256_reverse_epi32(__m256i v)
+{
+    v = _mm256_shuffle_epi32(v, 0x1B);
+    v = _mm256_permute4x64_epi64(v, 0x4E);
+    return v;
+}
+
+static OPUS_INLINE opus_int32 silk_mm256_hsum_epi32(__m256i v)
+{
+    __m128i sum = _mm_add_epi32(_mm256_extracti128_si256(v, 1), _mm256_extracti128_si256(v, 0));
+    sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E));
+    sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1));
+    return _mm_cvtsi128_si32(sum);
+}
+
+static OPUS_INLINE __m128i silk_mm_hmin_epi32(__m128i num)
+{
+    num = _mm_min_epi32(num, _mm_shuffle_epi32(num, 0x4E)); /* 0123 -> 2301 */
+    num = _mm_min_epi32(num, _mm_shuffle_epi32(num, 0xB1)); /* 0123 -> 1032 */
+    return num;
+}
+
+static OPUS_INLINE __m128i silk_mm_hmax_epi32(__m128i num)
+{
+    num = _mm_max_epi32(num, _mm_shuffle_epi32(num, 0x4E)); /* 0123 -> 2310 */
+    num = _mm_max_epi32(num, _mm_shuffle_epi32(num, 0xB1)); /* 0123 -> 1032 */
+    return num;
+}
+
+static OPUS_INLINE __m128i silk_mm_mask_hmin_epi32(__m128i num, __m128i mask)
+{
+    num = _mm_blendv_epi8(num, _mm_set1_epi32(silk_int32_MAX), mask);
+    return silk_mm_hmin_epi32(num);
+}
+
+static OPUS_INLINE __m128i silk_mm_mask_hmax_epi32(__m128i num, __m128i mask)
+{
+    num = _mm_blendv_epi8(num, _mm_set1_epi32(silk_int32_MIN), mask);
+    return silk_mm_hmax_epi32(num);
+}
+
+static OPUS_INLINE __m128i silk_mm256_rand_epi32(__m128i seed)
+{
+    seed = _mm_mullo_epi32(seed, _mm_set1_epi32(RAND_MULTIPLIER));
+    seed = _mm_add_epi32(seed, _mm_set1_epi32(RAND_INCREMENT));
+    return seed;
+}
+
+static OPUS_INLINE opus_int32 silk_index_of_first_equal_epi32(__m128i a, __m128i b)
+{
+    unsigned int mask = _mm_movemask_epi8(_mm_cmpeq_epi32(a, b)) & 0x1111;
+    silk_assert(mask != 0);
+    return __builtin_ctz(mask) >> 2;
+}
+
+static __m128i silk_index_to_selector(opus_int32 index)
+{
+    silk_assert(index < 4);
+    index <<= 2;
+    return _mm_set_epi8(
+        index + 3, index + 2, index + 1, index + 0,
+        index + 3, index + 2, index + 1, index + 0,
+        index + 3, index + 2, index + 1, index + 0,
+        index + 3, index + 2, index + 1, index + 0);
+}
+
+static opus_int32 silk_select_winner(__m128i num, __m128i selector)
+{
+    return _mm_cvtsi128_si32(_mm_shuffle_epi8(num, selector));
+}
+
+typedef struct
+{
+    __m128i RandState;
+    __m128i Q_Q10;
+    __m128i Xq_Q14;
+    __m128i Pred_Q15;
+    __m128i Shape_Q14;
+} NSQ_del_dec_sample_struct;
+
+typedef struct
+{
+    __m128i sLPC_Q14[MAX_SUB_FRAME_LENGTH + NSQ_LPC_BUF_LENGTH];
+    __m128i LF_AR_Q14;
+    __m128i Seed;
+    __m128i SeedInit;
+    __m128i RD_Q10;
+    __m128i Diff_Q14;
+    __m128i sAR2_Q14[MAX_SHAPE_LPC_ORDER];
+    NSQ_del_dec_sample_struct Samples[DECISION_DELAY];
+} NSQ_del_dec_struct;
+
+static OPUS_INLINE void silk_nsq_del_dec_scale_states_avx2(
+    const silk_encoder_state *psEncC,          /* I    Encoder State                   */
+    silk_nsq_state *NSQ,                       /* I/O  NSQ state                       */
+    NSQ_del_dec_struct *psDelDec,              /* I/O  Delayed decision states         */
+    const opus_int16 x16[],                    /* I    Input                           */
+    opus_int32 x_sc_Q10[MAX_SUB_FRAME_LENGTH], /* O    Input scaled with 1/Gain in Q10 */
+    const opus_int16 sLTP[],                   /* I    Re-whitened LTP state in Q0     */
+    opus_int32 sLTP_Q15[],                     /* O    LTP state matching scaled input */
+    opus_int subfr,                            /* I    Subframe number                 */
+    const opus_int LTP_scale_Q14,              /* I    LTP state scaling               */
+    const opus_int32 Gains_Q16[MAX_NB_SUBFR],  /* I                                    */
+    const opus_int pitchL[MAX_NB_SUBFR],       /* I    Pitch lag                       */
+    const opus_int signal_type,                /* I    Signal type                     */
+    const opus_int decisionDelay               /* I    Decision delay                  */
+);
+
+/*******************************************/
+/* LPC analysis filter                     */
+/* NB! State is kept internally and the    */
+/* filter always starts with zero state    */
+/* first d output samples are set to zero  */
+/*******************************************/
+static OPUS_INLINE void silk_LPC_analysis_filter_avx2(
+    opus_int16                  *out,               /* O    Output signal                           */
+    const opus_int16            *in,                /* I    Input signal                            */
+    const opus_int16            *B,                 /* I    MA prediction coefficients, Q12 [order] */
+    const opus_int32            len,                /* I    Signal length                           */
+    const opus_int32            order               /* I    Filter order                            */
+);
+
+/******************************************/
+/* Noise shape quantizer for one subframe */
+/******************************************/
+static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_avx2(
+    silk_nsq_state *NSQ,                        /* I/O  NSQ state                          */
+    NSQ_del_dec_struct psDelDec[],              /* I/O  Delayed decision states            */
+    opus_int signalType,                        /* I    Signal type                        */
+    const opus_int32 x_Q10[],                   /* I                                       */
+    opus_int8 pulses[],                         /* O                                       */
+    opus_int16 xq[],                            /* O                                       */
+    opus_int32 sLTP_Q15[],                      /* I/O  LTP filter state                   */
+    opus_int32 delayedGain_Q10[DECISION_DELAY], /* I/O  Gain delay buffer                  */
+    const opus_int16 a_Q12[],                   /* I    Short term prediction coefs        */
+    const opus_int16 b_Q14[],                   /* I    Long term prediction coefs         */
+    const opus_int16 AR_shp_Q13[],              /* I    Noise shaping coefs                */
+    opus_int lag,                               /* I    Pitch lag                          */
+    opus_int32 HarmShapeFIRPacked_Q14,          /* I                                       */
+    opus_int Tilt_Q14,                          /* I    Spectral tilt                      */
+    opus_int32 LF_shp_Q14,                      /* I                                       */
+    opus_int32 Gain_Q16,                        /* I                                       */
+    opus_int Lambda_Q10,                        /* I                                       */
+    opus_int offset_Q10,                        /* I                                       */
+    opus_int length,                            /* I    Input length                       */
+    opus_int subfr,                             /* I    Subframe number                    */
+    opus_int shapingLPCOrder,                   /* I    Shaping LPC filter order           */
+    opus_int predictLPCOrder,                   /* I    Prediction filter order            */
+    opus_int warping_Q16,                       /* I                                       */
+    __m128i MaskDelDec,                         /* I    Mask of states in decision tree    */
+    opus_int *smpl_buf_idx,                     /* I/O  Index to newest samples in buffers */
+    opus_int decisionDelay                      /* I                                       */
+);
+
+void silk_NSQ_del_dec_avx2(
+    const silk_encoder_state *psEncC,                            /* I    Encoder State               */
+    silk_nsq_state *NSQ,                                         /* I/O  NSQ state                   */
+    SideInfoIndices *psIndices,                                  /* I/O  Quantization Indices        */
+    const opus_int16 x16[],                                      /* I    Input                       */
+    opus_int8 pulses[],                                          /* O    Quantized pulse signal      */
+    const opus_int16 *PredCoef_Q12,                              /* I    Short term prediction coefs */
+    const opus_int16 LTPCoef_Q14[LTP_ORDER * MAX_NB_SUBFR],      /* I    Long term prediction coefs  */
+    const opus_int16 AR_Q13[MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER], /* I    Noise shaping coefs         */
+    const opus_int HarmShapeGain_Q14[MAX_NB_SUBFR],              /* I    Long term shaping coefs     */
+    const opus_int Tilt_Q14[MAX_NB_SUBFR],                       /* I    Spectral tilt               */
+    const opus_int32 LF_shp_Q14[MAX_NB_SUBFR],                   /* I    Low frequency shaping coefs */
+    const opus_int32 Gains_Q16[MAX_NB_SUBFR],                    /* I    Quantization step sizes     */
+    const opus_int32 pitchL[MAX_NB_SUBFR],                       /* I    Pitch lags                  */
+    const opus_int Lambda_Q10,                                   /* I    Rate/distortion tradeoff    */
+    const opus_int LTP_scale_Q14                                 /* I    LTP state scaling           */
+)
+{
+#ifdef OPUS_CHECK_ASM
+    silk_nsq_state NSQ_c;
+    SideInfoIndices psIndices_c;
+    opus_int8 pulses_c[MAX_FRAME_LENGTH];
+    const opus_int8 *const pulses_a = pulses;
+
+    silk_memcpy(&NSQ_c, NSQ, sizeof(NSQ_c));
+    silk_memcpy(&psIndices_c, psIndices, sizeof(psIndices_c));
+    silk_memcpy(pulses_c, pulses, sizeof(pulses_c));
+    silk_NSQ_del_dec_c(psEncC, &NSQ_c, &psIndices_c, x16, pulses_c, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16,
+                       pitchL, Lambda_Q10, LTP_scale_Q14);
+#endif
+
+    if (!verify_assumptions(psEncC))
+    {
+        silk_NSQ_del_dec_c(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14);
+        return;
+    }
+
+    opus_int i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr;
+    opus_int last_smple_idx, smpl_buf_idx, decisionDelay;
+    const opus_int16 *A_Q12, *B_Q14, *AR_shp_Q13;
+    opus_int16 *pxq;
+    VARDECL(opus_int32, sLTP_Q15);
+    VARDECL(opus_int16, sLTP);
+    opus_int32 HarmShapeFIRPacked_Q14;
+    opus_int offset_Q10;
+    opus_int32 Gain_Q10;
+    opus_int32 x_sc_Q10[MAX_SUB_FRAME_LENGTH];
+    opus_int32 delayedGain_Q10[DECISION_DELAY];
+    NSQ_del_dec_struct psDelDec = {0};
+    NSQ_del_dec_sample_struct *psSample;
+    __m128i RDmin_Q10, MaskDelDec, Winner_selector;
+    SAVE_STACK;
+
+    MaskDelDec = _mm_cvtepi8_epi32(_mm_cvtsi32_si128(0xFFFFFF00ul << ((psEncC->nStatesDelayedDecision - 1) << 3)));
+
+    /* Set unvoiced lag to the previous one, overwrite later for voiced */
+    lag = NSQ->lagPrev;
+
+    silk_assert(NSQ->prev_gain_Q16 != 0);
+    psDelDec.Seed = _mm_and_si128(
+        _mm_add_epi32(_mm_set_epi32(3, 2, 1, 0), _mm_set1_epi32(psIndices->Seed)),
+        _mm_set1_epi32(3));
+    psDelDec.SeedInit = psDelDec.Seed;
+    psDelDec.RD_Q10 = _mm_setzero_si128();
+    psDelDec.LF_AR_Q14 = _mm_set1_epi32(NSQ->sLF_AR_shp_Q14);
+    psDelDec.Diff_Q14 = _mm_set1_epi32(NSQ->sDiff_shp_Q14);
+    psDelDec.Samples[0].Shape_Q14 = _mm_set1_epi32(NSQ->sLTP_shp_Q14[psEncC->ltp_mem_length - 1]);
+    for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++)
+    {
+        psDelDec.sLPC_Q14[i] = _mm_set1_epi32(NSQ->sLPC_Q14[i]);
+    }
+    for (i = 0; i < MAX_SHAPE_LPC_ORDER; i++)
+    {
+        psDelDec.sAR2_Q14[i] = _mm_set1_epi32(NSQ->sAR2_Q14[i]);
+    }
+
+    offset_Q10 = silk_Quantization_Offsets_Q10[psIndices->signalType >> 1][psIndices->quantOffsetType];
+    smpl_buf_idx = 0; /* index of oldest samples */
+
+    decisionDelay = silk_min_int(DECISION_DELAY, psEncC->subfr_length);
+
+    /* For voiced frames limit the decision delay to lower than the pitch lag */
+    if (psIndices->signalType == TYPE_VOICED)
+    {
+        for (k = 0; k < psEncC->nb_subfr; k++)
+        {
+            decisionDelay = silk_min_int(decisionDelay, pitchL[k] - LTP_ORDER / 2 - 1);
+        }
+    }
+    else
+    {
+        if (lag > 0)
+        {
+            decisionDelay = silk_min_int(decisionDelay, lag - LTP_ORDER / 2 - 1);
+        }
+    }
+
+    if (psIndices->NLSFInterpCoef_Q2 == 4)
+    {
+        LSF_interpolation_flag = 0;
+    }
+    else
+    {
+        LSF_interpolation_flag = 1;
+    }
+
+    ALLOC(sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32);
+    ALLOC(sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16);
+    /* Set up pointers to start of sub frame */
+    pxq = &NSQ->xq[psEncC->ltp_mem_length];
+    NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length;
+    NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
+    subfr = 0;
+    for (k = 0; k < psEncC->nb_subfr; k++)
+    {
+        A_Q12 = &PredCoef_Q12[((k >> 1) | (1 ^ LSF_interpolation_flag)) * MAX_LPC_ORDER];
+        B_Q14 = &LTPCoef_Q14[k * LTP_ORDER];
+        AR_shp_Q13 = &AR_Q13[k * MAX_SHAPE_LPC_ORDER];
+
+        /* Noise shape parameters */
+        silk_assert(HarmShapeGain_Q14[k] >= 0);
+        HarmShapeFIRPacked_Q14  =                          silk_RSHIFT( HarmShapeGain_Q14[ k ], 2 );
+        HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q14[ k ], 1 ), 16 );
+
+        NSQ->rewhite_flag = 0;
+        if (psIndices->signalType == TYPE_VOICED)
+        {
+            /* Voiced */
+            lag = pitchL[k];
+
+            /* Re-whitening */
+            if ((k & (3 ^ (LSF_interpolation_flag << 1))) == 0)
+            {
+                if (k == 2)
+                {
+                    /* RESET DELAYED DECISIONS */
+                    /* Find winner */
+                    RDmin_Q10 = silk_mm_mask_hmin_epi32(psDelDec.RD_Q10, MaskDelDec);
+                    Winner_ind = silk_index_of_first_equal_epi32(RDmin_Q10, psDelDec.RD_Q10);
+                    Winner_selector = silk_index_to_selector(Winner_ind);
+                    psDelDec.RD_Q10 = _mm_add_epi32(
+                        psDelDec.RD_Q10,
+                        _mm_blendv_epi8(
+                            _mm_set1_epi32(silk_int32_MAX >> 4),
+                            _mm_setzero_si128(),
+                            _mm_cvtepi8_epi32(_mm_cvtsi32_si128(0xFFU << (unsigned)(Winner_ind << 3)))));
+
+                    /* Copy final part of signals from winner state to output and long-term filter states */
+                    last_smple_idx = smpl_buf_idx + decisionDelay;
+                    for (i = 0; i < decisionDelay; i++)
+                    {
+                        last_smple_idx = (last_smple_idx + DECISION_DELAY - 1) % DECISION_DELAY;
+                        psSample = &psDelDec.Samples[last_smple_idx];
+                        pulses[i - decisionDelay] =
+                            (opus_int8)silk_sar_round_32(silk_select_winner(psSample->Q_Q10, Winner_selector), 10);
+                        pxq[i - decisionDelay] =
+                            silk_sat16((opus_int32)silk_sar_round_smulww(silk_select_winner(psSample->Xq_Q14, Winner_selector), Gains_Q16[1], 14));
+                        NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - decisionDelay + i] =
+                            silk_select_winner(psSample->Shape_Q14, Winner_selector);
+                    }
+
+                    subfr = 0;
+                }
+
+                /* Rewhiten with new A coefs */
+                start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2;
+                silk_assert(start_idx > 0);
+
+                silk_LPC_analysis_filter_avx2(&sLTP[start_idx], &NSQ->xq[start_idx + k * psEncC->subfr_length],
+                                              A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder);
+
+                NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
+                NSQ->rewhite_flag = 1;
+            }
+        }
+
+        silk_nsq_del_dec_scale_states_avx2(psEncC, NSQ, &psDelDec, x16, x_sc_Q10, sLTP, sLTP_Q15, k,
+                                           LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType, decisionDelay);
+
+        silk_noise_shape_quantizer_del_dec_avx2(NSQ, &psDelDec, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15,
+                                                delayedGain_Q10, A_Q12, B_Q14, AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[k], LF_shp_Q14[k],
+                                                Gains_Q16[k], Lambda_Q10, offset_Q10, psEncC->subfr_length, subfr++, psEncC->shapingLPCOrder,
+                                                psEncC->predictLPCOrder, psEncC->warping_Q16, MaskDelDec, &smpl_buf_idx, decisionDelay);
+
+        x16 += psEncC->subfr_length;
+        pulses += psEncC->subfr_length;
+        pxq += psEncC->subfr_length;
+    }
+
+    /* Find winner */
+    RDmin_Q10 = silk_mm_mask_hmin_epi32(psDelDec.RD_Q10, MaskDelDec);
+    Winner_selector = silk_index_to_selector(silk_index_of_first_equal_epi32(RDmin_Q10, psDelDec.RD_Q10));
+
+    /* Copy final part of signals from winner state to output and long-term filter states */
+    psIndices->Seed = silk_select_winner(psDelDec.SeedInit, Winner_selector);
+    last_smple_idx = smpl_buf_idx + decisionDelay;
+    Gain_Q10 = Gains_Q16[psEncC->nb_subfr - 1] >> 6;
+    for (i = 0; i < decisionDelay; i++)
+    {
+        last_smple_idx = (last_smple_idx + DECISION_DELAY - 1) % DECISION_DELAY;
+        psSample = &psDelDec.Samples[last_smple_idx];
+
+        pulses[i - decisionDelay] =
+            (opus_int8)silk_sar_round_32(silk_select_winner(psSample->Q_Q10, Winner_selector), 10);
+        pxq[i - decisionDelay] =
+            silk_sat16((opus_int32)silk_sar_round_smulww(silk_select_winner(psSample->Xq_Q14, Winner_selector), Gain_Q10, 8));
+        NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - decisionDelay + i] =
+            silk_select_winner(psSample->Shape_Q14, Winner_selector);
+    }
+    for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++)
+    {
+        NSQ->sLPC_Q14[i] = silk_select_winner(psDelDec.sLPC_Q14[i], Winner_selector);
+    }
+    for (i = 0; i < MAX_SHAPE_LPC_ORDER; i++)
+    {
+        NSQ->sAR2_Q14[i] = silk_select_winner(psDelDec.sAR2_Q14[i], Winner_selector);
+    }
+
+    /* Update states */
+    NSQ->sLF_AR_shp_Q14 = silk_select_winner(psDelDec.LF_AR_Q14, Winner_selector);
+    NSQ->sDiff_shp_Q14 = silk_select_winner(psDelDec.Diff_Q14, Winner_selector);
+    NSQ->lagPrev = pitchL[psEncC->nb_subfr - 1];
+
+    /* Save quantized speech signal */
+    silk_memmove(NSQ->xq, &NSQ->xq[psEncC->frame_length], psEncC->ltp_mem_length * sizeof(opus_int16));
+    silk_memmove(NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[psEncC->frame_length], psEncC->ltp_mem_length * sizeof(opus_int32));
+
+#ifdef OPUS_CHECK_ASM
+    silk_assert(!memcmp(&NSQ_c, NSQ, sizeof(NSQ_c)));
+    silk_assert(!memcmp(&psIndices_c, psIndices, sizeof(psIndices_c)));
+    silk_assert(!memcmp(pulses_c, pulses_a, sizeof(pulses_c)));
+#endif
+
+    RESTORE_STACK;
+}
+
+static OPUS_INLINE __m128i silk_noise_shape_quantizer_short_prediction_x4(const __m128i *buf32, const opus_int16 *coef16, opus_int order)
+{
+    __m256i out;
+    silk_assert(order == 10 || order == 16);
+
+    /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
+    out = _mm256_set1_epi32(order >> 1);
+    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-0]), _mm256_set1_epi32(silk_LSHIFT(coef16[0], 16)))); /* High DWORD */
+    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-1]), _mm256_set1_epi32(silk_LSHIFT(coef16[1], 16)))); /* High DWORD */
+    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-2]), _mm256_set1_epi32(silk_LSHIFT(coef16[2], 16)))); /* High DWORD */
+    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-3]), _mm256_set1_epi32(silk_LSHIFT(coef16[3], 16)))); /* High DWORD */
+    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-4]), _mm256_set1_epi32(silk_LSHIFT(coef16[4], 16)))); /* High DWORD */
+    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-5]), _mm256_set1_epi32(silk_LSHIFT(coef16[5], 16)))); /* High DWORD */
+    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-6]), _mm256_set1_epi32(silk_LSHIFT(coef16[6], 16)))); /* High DWORD */
+    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-7]), _mm256_set1_epi32(silk_LSHIFT(coef16[7], 16)))); /* High DWORD */
+    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-8]), _mm256_set1_epi32(silk_LSHIFT(coef16[8], 16)))); /* High DWORD */
+    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-9]), _mm256_set1_epi32(silk_LSHIFT(coef16[9], 16)))); /* High DWORD */
+
+    if (order == 16)
+    {
+        out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-10]), _mm256_set1_epi32(silk_LSHIFT(coef16[10], 16)))); /* High DWORD */
+        out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-11]), _mm256_set1_epi32(silk_LSHIFT(coef16[11], 16)))); /* High DWORD */
+        out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-12]), _mm256_set1_epi32(silk_LSHIFT(coef16[12], 16)))); /* High DWORD */
+        out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-13]), _mm256_set1_epi32(silk_LSHIFT(coef16[13], 16)))); /* High DWORD */
+        out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-14]), _mm256_set1_epi32(silk_LSHIFT(coef16[14], 16)))); /* High DWORD */
+        out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-15]), _mm256_set1_epi32(silk_LSHIFT(coef16[15], 16)))); /* High DWORD */
+    }
+    return silk_cvtepi64_epi32_high(out);
+}
+
+/******************************************/
+/* Noise shape quantizer for one subframe */
+/******************************************/
+static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_avx2(
+    silk_nsq_state *NSQ,                        /* I/O  NSQ state                          */
+    NSQ_del_dec_struct *psDelDec,               /* I/O  Delayed decision states            */
+    opus_int signalType,                        /* I    Signal type                        */
+    const opus_int32 x_Q10[],                   /* I                                       */
+    opus_int8 pulses[],                         /* O                                       */
+    opus_int16 xq[],                            /* O                                       */
+    opus_int32 sLTP_Q15[],                      /* I/O  LTP filter state                   */
+    opus_int32 delayedGain_Q10[DECISION_DELAY], /* I/O  Gain delay buffer                  */
+    const opus_int16 a_Q12[],                   /* I    Short term prediction coefs        */
+    const opus_int16 b_Q14[],                   /* I    Long term prediction coefs         */
+    const opus_int16 AR_shp_Q13[],              /* I    Noise shaping coefs                */
+    opus_int lag,                               /* I    Pitch lag                          */
+    opus_int32 HarmShapeFIRPacked_Q14,          /* I                                       */
+    opus_int Tilt_Q14,                          /* I    Spectral tilt                      */
+    opus_int32 LF_shp_Q14,                      /* I                                       */
+    opus_int32 Gain_Q16,                        /* I                                       */
+    opus_int Lambda_Q10,                        /* I                                       */
+    opus_int offset_Q10,                        /* I                                       */
+    opus_int length,                            /* I    Input length                       */
+    opus_int subfr,                             /* I    Subframe number                    */
+    opus_int shapingLPCOrder,                   /* I    Shaping LPC filter order           */
+    opus_int predictLPCOrder,                   /* I    Prediction filter order            */
+    opus_int warping_Q16,                       /* I                                       */
+    __m128i MaskDelDec,                         /* I    Mask of states in decision tree    */
+    opus_int *smpl_buf_idx,                     /* I/O  Index to newest samples in buffers */
+    opus_int decisionDelay                      /* I                                       */
+)
+{
+    int i;
+    opus_int32 *shp_lag_ptr = &NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2];
+    opus_int32 *pred_lag_ptr = &sLTP_Q15[NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2];
+    opus_int32 Gain_Q10 = Gain_Q16 >> 6;
+
+    for (i = 0; i < length; i++)
+    {
+        /* Perform common calculations used in all states */
+        /* NSQ_sample_struct */
+        /* Low  128 bits => 1st set */
+        /* High 128 bits => 2nd set */
+        int j;
+        __m256i SS_Q_Q10;
+        __m256i SS_RD_Q10;
+        __m256i SS_xq_Q14;
+        __m256i SS_LF_AR_Q14;
+        __m256i SS_Diff_Q14;
+        __m256i SS_sLTP_shp_Q14;
+        __m256i SS_LPC_exc_Q14;
+        __m256i exc_Q14;
+        __m256i q_Q10, rr_Q10, rd_Q10;
+        __m256i mask;
+        __m128i LPC_pred_Q14, n_AR_Q14;
+        __m128i RDmin_Q10, RDmax_Q10;
+        __m128i n_LF_Q14;
+        __m128i r_Q10, q1_Q0, q1_Q10, q2_Q10;
+        __m128i Winner_rand_state, Winner_selector;
+        __m128i tmp0, tmp1;
+        NSQ_del_dec_sample_struct *psLastSample, *psSample;
+        opus_int32 RDmin_ind, RDmax_ind, last_smple_idx;
+        opus_int32 LTP_pred_Q14, n_LTP_Q14;
+
+        /* Long-term prediction */
+        if (signalType == TYPE_VOICED)
+        {
+            /* Unrolled loop */
+            /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
+            LTP_pred_Q14 = 2;
+            LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-0], b_Q14[0]);
+            LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-1], b_Q14[1]);
+            LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-2], b_Q14[2]);
+            LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-3], b_Q14[3]);
+            LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-4], b_Q14[4]);
+            LTP_pred_Q14 = silk_LSHIFT(LTP_pred_Q14, 1); /* Q13 -> Q14 */
+            pred_lag_ptr++;
+        }
+        else
+        {
+            LTP_pred_Q14 = 0;
+        }
+
+        /* Long-term shaping */
+        if (lag > 0)
+        {
+            /* Symmetric, packed FIR coefficients */
+            n_LTP_Q14 = silk_add_sat32(shp_lag_ptr[0], shp_lag_ptr[-2]);
+            n_LTP_Q14 = silk_SMULWB(n_LTP_Q14, HarmShapeFIRPacked_Q14);
+            n_LTP_Q14 = n_LTP_Q14 + silk_SMULWT(shp_lag_ptr[-1], HarmShapeFIRPacked_Q14);
+            n_LTP_Q14 = LTP_pred_Q14 - (silk_LSHIFT(n_LTP_Q14, 2)); /* Q12 -> Q14 */
+            shp_lag_ptr++;
+        }
+        else
+        {
+            n_LTP_Q14 = 0;
+        }
+
+        /* BEGIN Updating Delayed Decision States */
+
+        /* Generate dither */
+        psDelDec->Seed = silk_mm256_rand_epi32(psDelDec->Seed);
+
+        /* Short-term prediction */
+        LPC_pred_Q14 = silk_noise_shape_quantizer_short_prediction_x4(&psDelDec->sLPC_Q14[NSQ_LPC_BUF_LENGTH - 1 + i], a_Q12, predictLPCOrder);
+        LPC_pred_Q14 = _mm_slli_epi32(LPC_pred_Q14, 4); /* Q10 -> Q14 */
+
+        /* Noise shape feedback */
+        silk_assert(shapingLPCOrder > 0);
+        silk_assert((shapingLPCOrder & 1) == 0); /* check that order is even */
+        /* Output of lowpass section */
+        tmp0 = _mm_add_epi32(psDelDec->Diff_Q14, silk_mm_smulwb_epi32(psDelDec->sAR2_Q14[0], warping_Q16));
+        n_AR_Q14 = _mm_set1_epi32(shapingLPCOrder >> 1);
+        for (j = 0; j < shapingLPCOrder - 1; j++)
+        {
+            /* Output of allpass section */
+            tmp1 = psDelDec->sAR2_Q14[j];
+            psDelDec->sAR2_Q14[j] = tmp0;
+            n_AR_Q14 = _mm_add_epi32(n_AR_Q14, silk_mm_smulwb_epi32(tmp0, AR_shp_Q13[j]));
+            tmp0 = _mm_add_epi32(tmp1, silk_mm_smulwb_epi32(_mm_sub_epi32(psDelDec->sAR2_Q14[j + 1], tmp0), warping_Q16));
+        }
+        psDelDec->sAR2_Q14[shapingLPCOrder - 1] = tmp0;
+        n_AR_Q14 = _mm_add_epi32(n_AR_Q14, silk_mm_smulwb_epi32(tmp0, AR_shp_Q13[shapingLPCOrder - 1]));
+
+        n_AR_Q14 = _mm_slli_epi32(n_AR_Q14, 1);                                                  /* Q11 -> Q12 */
+        n_AR_Q14 = _mm_add_epi32(n_AR_Q14, silk_mm_smulwb_epi32(psDelDec->LF_AR_Q14, Tilt_Q14)); /* Q12 */
+        n_AR_Q14 = _mm_slli_epi32(n_AR_Q14, 2);                                                  /* Q12 -> Q14 */
+
+        tmp0 = silk_mm_smulwb_epi32(psDelDec->Samples[*smpl_buf_idx].Shape_Q14, LF_shp_Q14); /* Q12 */
+        tmp1 = silk_mm_smulwb_epi32(psDelDec->LF_AR_Q14, LF_shp_Q14 >> 16);                  /* Q12 */
+        n_LF_Q14 = _mm_add_epi32(tmp0, tmp1);                                                /* Q12 */
+        n_LF_Q14 = _mm_slli_epi32(n_LF_Q14, 2);                                              /* Q12 -> Q14 */
+
+        /* Input minus prediction plus noise feedback                       */
+        /* r = x[ i ] - LTP_pred - LPC_pred + n_AR + n_Tilt + n_LF + n_LTP  */
+        tmp0 = silk_mm_add_sat_epi32(n_AR_Q14, n_LF_Q14);              /* Q14 */
+        tmp1 = _mm_add_epi32(_mm_set1_epi32(n_LTP_Q14), LPC_pred_Q14); /* Q13 */
+        tmp0 = silk_mm_sub_sat_epi32(tmp1, tmp0);                      /* Q13 */
+        tmp0 = silk_mm_srai_round_epi32(tmp0, 4);                      /* Q10 */
+
+        r_Q10 = _mm_sub_epi32(_mm_set1_epi32(x_Q10[i]), tmp0); /* residual error Q10 */
+
+        /* Flip sign depending on dither */
+        r_Q10 = silk_mm_sign_epi32(r_Q10, psDelDec->Seed);
+        r_Q10 = silk_mm_limit_epi32(r_Q10, -(31 << 10), 30 << 10);
+
+        /* Find two quantization level candidates and measure their rate-distortion */
+        q1_Q10 = _mm_sub_epi32(r_Q10, _mm_set1_epi32(offset_Q10));
+        q1_Q0 = _mm_srai_epi32(q1_Q10, 10);
+        if (Lambda_Q10 > 2048)
+        {
+            /* For aggressive RDO, the bias becomes more than one pulse. */
+            tmp0 = _mm_sub_epi32(_mm_abs_epi32(q1_Q10), _mm_set1_epi32(Lambda_Q10 / 2 - 512)); /* rdo_offset */
+            q1_Q0 = _mm_srai_epi32(q1_Q10, 31);
+            tmp1 = _mm_cmpgt_epi32(tmp0, _mm_setzero_si128());
+            tmp0 = _mm_srai_epi32(silk_mm_sign_epi32(tmp0, q1_Q10), 10);
+            q1_Q0 = _mm_blendv_epi8(q1_Q0, tmp0, tmp1);
+        }
+
+        tmp0 = _mm_sign_epi32(_mm_set1_epi32(QUANT_LEVEL_ADJUST_Q10), q1_Q0);
+        q1_Q10 = _mm_sub_epi32(_mm_slli_epi32(q1_Q0, 10), tmp0);
+        q1_Q10 = _mm_add_epi32(q1_Q10, _mm_set1_epi32(offset_Q10));
+
+        /* check if q1_Q0 is 0 or -1 */
+        tmp0 = _mm_add_epi32(_mm_srli_epi32(q1_Q0, 31), q1_Q0);
+        tmp1 = _mm_cmpeq_epi32(tmp0, _mm_setzero_si128());
+        tmp0 = _mm_blendv_epi8(_mm_set1_epi32(1024), _mm_set1_epi32(1024 - QUANT_LEVEL_ADJUST_Q10), tmp1);
+        q2_Q10 = _mm_add_epi32(q1_Q10, tmp0);
+        q_Q10 = _mm256_set_m128i(q2_Q10, q1_Q10);
+
+        rr_Q10 = _mm256_sub_epi32(_mm256_broadcastsi128_si256(r_Q10), q_Q10);
+        rd_Q10 = _mm256_abs_epi32(q_Q10);
+        rr_Q10 = silk_mm256_smulbb_epi32(rr_Q10, rr_Q10);
+        rd_Q10 = silk_mm256_smulbb_epi32(rd_Q10, _mm256_set1_epi32(Lambda_Q10));
+        rd_Q10 = _mm256_add_epi32(rd_Q10, rr_Q10);
+        rd_Q10 = _mm256_srai_epi32(rd_Q10, 10);
+
+        mask = _mm256_broadcastsi128_si256(_mm_cmplt_epi32(_mm256_extracti128_si256(rd_Q10, 0), _mm256_extracti128_si256(rd_Q10, 1)));
+        SS_RD_Q10 = _mm256_add_epi32(
+            _mm256_broadcastsi128_si256(psDelDec->RD_Q10),
+            _mm256_blendv_epi8(
+                _mm256_permute2x128_si256(rd_Q10, rd_Q10, 0x1),
+                rd_Q10,
+                mask));
+        SS_Q_Q10 = _mm256_blendv_epi8(
+            _mm256_permute2x128_si256(q_Q10, q_Q10, 0x1),
+            q_Q10,
+            mask);
+
+        /* Update states for best and second best quantization */
+
+        /* Quantized excitation */
+        exc_Q14 = silk_mm256_sign_epi32(_mm256_slli_epi32(SS_Q_Q10, 4), _mm256_broadcastsi128_si256(psDelDec->Seed));
+
+        /* Add predictions */
+        exc_Q14 = _mm256_add_epi32(exc_Q14, _mm256_set1_epi32(LTP_pred_Q14));
+        SS_LPC_exc_Q14 = _mm256_slli_epi32(exc_Q14, 1);
+        SS_xq_Q14 = _mm256_add_epi32(exc_Q14, _mm256_broadcastsi128_si256(LPC_pred_Q14));
+
+        /* Update states */
+        SS_Diff_Q14 = _mm256_sub_epi32(SS_xq_Q14, _mm256_set1_epi32(silk_LSHIFT(x_Q10[i], 4)));
+        SS_LF_AR_Q14 = _mm256_sub_epi32(SS_Diff_Q14, _mm256_broadcastsi128_si256(n_AR_Q14));
+        SS_sLTP_shp_Q14 = silk_mm256_sub_sat_epi32(SS_LF_AR_Q14, _mm256_broadcastsi128_si256(n_LF_Q14));
+
+        /* END Updating Delayed Decision States */
+
+        *smpl_buf_idx = (*smpl_buf_idx + DECISION_DELAY - 1) % DECISION_DELAY;
+        last_smple_idx = (*smpl_buf_idx + decisionDelay) % DECISION_DELAY;
+        psLastSample = &psDelDec->Samples[last_smple_idx];
+
+        /* Find winner */
+        RDmin_Q10 = silk_mm_mask_hmin_epi32(_mm256_castsi256_si128(SS_RD_Q10), MaskDelDec);
+        Winner_selector = silk_index_to_selector(silk_index_of_first_equal_epi32(RDmin_Q10, _mm256_castsi256_si128(SS_RD_Q10)));
+
+        /* Increase RD values of expired states */
+        Winner_rand_state = _mm_shuffle_epi8(psLastSample->RandState, Winner_selector);
+
+        SS_RD_Q10 = _mm256_blendv_epi8(
+            _mm256_add_epi32(SS_RD_Q10, _mm256_set1_epi32(silk_int32_MAX >> 4)),
+            SS_RD_Q10,
+            _mm256_broadcastsi128_si256(_mm_cmpeq_epi32(psLastSample->RandState, Winner_rand_state)));
+
+        /* find worst in first set */
+        RDmax_Q10 = silk_mm_mask_hmax_epi32(_mm256_extracti128_si256(SS_RD_Q10, 0), MaskDelDec);
+        /* find best in second set */
+        RDmin_Q10 = silk_mm_mask_hmin_epi32(_mm256_extracti128_si256(SS_RD_Q10, 1), MaskDelDec);
+
+        /* Replace a state if best from second set outperforms worst in first set */
+        tmp0 = _mm_cmplt_epi32(RDmin_Q10, RDmax_Q10);
+        if (!_mm_test_all_zeros(tmp0, tmp0))
+        {
+            int t;
+            RDmax_ind = silk_index_of_first_equal_epi32(RDmax_Q10, _mm256_extracti128_si256(SS_RD_Q10, 0));
+            RDmin_ind = silk_index_of_first_equal_epi32(RDmin_Q10, _mm256_extracti128_si256(SS_RD_Q10, 1));
+            tmp1 = _mm_cvtepi8_epi32(_mm_cvtsi32_si128(0xFFU << (unsigned)(RDmax_ind << 3)));
+            tmp0 = _mm_blendv_epi8(
+                _mm_set_epi8(0xF, 0xE, 0xD, 0xC, 0xB, 0xA, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0),
+                silk_index_to_selector(RDmin_ind),
+                tmp1);
+            for (t = i; t < MAX_SUB_FRAME_LENGTH + NSQ_LPC_BUF_LENGTH; t++)
+            {
+                psDelDec->sLPC_Q14[t] = _mm_shuffle_epi8(psDelDec->sLPC_Q14[t], tmp0);
+            }
+            psDelDec->Seed = _mm_shuffle_epi8(psDelDec->Seed, tmp0);
+            psDelDec->SeedInit = _mm_shuffle_epi8(psDelDec->SeedInit, tmp0);
+            for (t = 0; t < MAX_SHAPE_LPC_ORDER; t++)
+            {
+                psDelDec->sAR2_Q14[t] = _mm_shuffle_epi8(psDelDec->sAR2_Q14[t], tmp0);
+            }
+            for (t = 0; t < DECISION_DELAY; t++)
+            {
+                psDelDec->Samples[t].RandState = _mm_shuffle_epi8(psDelDec->Samples[t].RandState, tmp0);
+                psDelDec->Samples[t].Q_Q10 = _mm_shuffle_epi8(psDelDec->Samples[t].Q_Q10, tmp0);
+                psDelDec->Samples[t].Xq_Q14 = _mm_shuffle_epi8(psDelDec->Samples[t].Xq_Q14, tmp0);
+                psDelDec->Samples[t].Pred_Q15 = _mm_shuffle_epi8(psDelDec->Samples[t].Pred_Q15, tmp0);
+                psDelDec->Samples[t].Shape_Q14 = _mm_shuffle_epi8(psDelDec->Samples[t].Shape_Q14, tmp0);
+            }
+            mask = _mm256_castsi128_si256(_mm_blendv_epi8(_mm_set_epi32(0x3, 0x2, 0x1, 0x0), _mm_set1_epi32(RDmin_ind + 4), tmp1));
+            SS_Q_Q10 = _mm256_permutevar8x32_epi32(SS_Q_Q10, mask);
+            SS_RD_Q10 = _mm256_permutevar8x32_epi32(SS_RD_Q10, mask);
+            SS_xq_Q14 = _mm256_permutevar8x32_epi32(SS_xq_Q14, mask);
+            SS_LF_AR_Q14 = _mm256_permutevar8x32_epi32(SS_LF_AR_Q14, mask);
+            SS_Diff_Q14 = _mm256_permutevar8x32_epi32(SS_Diff_Q14, mask);
+            SS_sLTP_shp_Q14 = _mm256_permutevar8x32_epi32(SS_sLTP_shp_Q14, mask);
+            SS_LPC_exc_Q14 = _mm256_permutevar8x32_epi32(SS_LPC_exc_Q14, mask);
+        }
+
+        /* Write samples from winner to output and long-term filter states */
+        if (subfr > 0 || i >= decisionDelay)
+        {
+            pulses[i - decisionDelay] =
+                (opus_int8)silk_sar_round_32(silk_select_winner(psLastSample->Q_Q10, Winner_selector), 10);
+            xq[i - decisionDelay] =
+                silk_sat16((opus_int32)silk_sar_round_smulww(silk_select_winner(psLastSample->Xq_Q14, Winner_selector), delayedGain_Q10[last_smple_idx], 8));
+            NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - decisionDelay] =
+                silk_select_winner(psLastSample->Shape_Q14, Winner_selector);
+            sLTP_Q15[NSQ->sLTP_buf_idx - decisionDelay] =
+                silk_select_winner(psLastSample->Pred_Q15, Winner_selector);
+        }
+        NSQ->sLTP_shp_buf_idx++;
+        NSQ->sLTP_buf_idx++;
+
+        /* Update states */
+        psSample = &psDelDec->Samples[*smpl_buf_idx];
+        psDelDec->Seed = _mm_add_epi32(psDelDec->Seed, silk_mm_srai_round_epi32(_mm256_castsi256_si128(SS_Q_Q10), 10));
+        psDelDec->LF_AR_Q14 = _mm256_castsi256_si128(SS_LF_AR_Q14);
+        psDelDec->Diff_Q14 = _mm256_castsi256_si128(SS_Diff_Q14);
+        psDelDec->sLPC_Q14[i + NSQ_LPC_BUF_LENGTH] = _mm256_castsi256_si128(SS_xq_Q14);
+        psDelDec->RD_Q10 = _mm256_castsi256_si128(SS_RD_Q10);
+        psSample->Xq_Q14 = _mm256_castsi256_si128(SS_xq_Q14);
+        psSample->Q_Q10 = _mm256_castsi256_si128(SS_Q_Q10);
+        psSample->Pred_Q15 = _mm256_castsi256_si128(SS_LPC_exc_Q14);
+        psSample->Shape_Q14 = _mm256_castsi256_si128(SS_sLTP_shp_Q14);
+        psSample->RandState = psDelDec->Seed;
+        delayedGain_Q10[*smpl_buf_idx] = Gain_Q10;
+    }
+    /* Update LPC states */
+    for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++)
+    {
+        psDelDec->sLPC_Q14[i] = (&psDelDec->sLPC_Q14[length])[i];
+    }
+}
+
+static OPUS_INLINE void silk_nsq_del_dec_scale_states_avx2(
+    const silk_encoder_state *psEncC,          /* I    Encoder State                   */
+    silk_nsq_state *NSQ,                       /* I/O  NSQ state                       */
+    NSQ_del_dec_struct *psDelDec,              /* I/O  Delayed decision states         */
+    const opus_int16 x16[],                    /* I    Input                           */
+    opus_int32 x_sc_Q10[MAX_SUB_FRAME_LENGTH], /* O    Input scaled with 1/Gain in Q10 */
+    const opus_int16 sLTP[],                   /* I    Re-whitened LTP state in Q0     */
+    opus_int32 sLTP_Q15[],                     /* O    LTP state matching scaled input */
+    opus_int subfr,                            /* I    Subframe number                 */
+    const opus_int LTP_scale_Q14,              /* I    LTP state scaling               */
+    const opus_int32 Gains_Q16[MAX_NB_SUBFR],  /* I                                    */
+    const opus_int pitchL[MAX_NB_SUBFR],       /* I    Pitch lag                       */
+    const opus_int signal_type,                /* I    Signal type                     */
+    const opus_int decisionDelay               /* I    Decision delay                  */
+)
+{
+    int i;
+    opus_int lag;
+    opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q26;
+    NSQ_del_dec_sample_struct *psSample;
+
+    lag = pitchL[subfr];
+    inv_gain_Q31 = silk_INVERSE32_varQ(silk_max(Gains_Q16[subfr], 1), 47);
+    silk_assert(inv_gain_Q31 != 0);
+
+    /* Scale input */
+    inv_gain_Q26 = silk_sar_round_32(inv_gain_Q31, 5);
+    for (i = 0; i < psEncC->subfr_length; i+=4)
+    {
+        __m256i x = _mm256_cvtepi16_epi64(_mm_loadu_si64(&x16[i]));
+        x = _mm256_slli_epi64(_mm256_mul_epi32(x, _mm256_set1_epi32(inv_gain_Q26)), 16);
+        _mm_storeu_si128((__m128i_u*)&x_sc_Q10[i], silk_cvtepi64_epi32_high(x));
+    }
+
+    /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */
+    if (NSQ->rewhite_flag)
+    {
+        if (subfr == 0)
+        {
+            /* Do LTP downscaling */
+            inv_gain_Q31 = silk_LSHIFT(silk_SMULWB(inv_gain_Q31, LTP_scale_Q14), 2);
+        }
+        for (i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx; i++)
+        {
+            silk_assert(i < MAX_FRAME_LENGTH);
+            sLTP_Q15[i] = silk_SMULWB(inv_gain_Q31, sLTP[i]);
+        }
+    }
+
+    /* Adjust for changing gain */
+    if (Gains_Q16[subfr] != NSQ->prev_gain_Q16)
+    {
+        gain_adj_Q16 = silk_DIV32_varQ(NSQ->prev_gain_Q16, Gains_Q16[subfr], 16);
+
+        /* Scale long-term shaping state */
+        for (i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx; i+=4)
+        {
+            __m128i_u* p = (__m128i_u*)&NSQ->sLTP_shp_Q14[i];
+            *p = silk_mm_smulww_epi32(*p, gain_adj_Q16);
+        }
+
+        /* Scale long-term prediction state */
+        if (signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0)
+        {
+            for (i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx - decisionDelay; i++)
+            {
+                sLTP_Q15[i] = ((opus_int64)sLTP_Q15[i]) * ((opus_int64)gain_adj_Q16) >> 16;
+            }
+        }
+
+        /* Scale scalar states */
+        psDelDec->LF_AR_Q14 = silk_mm_smulww_epi32(psDelDec->LF_AR_Q14, gain_adj_Q16);
+        psDelDec->Diff_Q14 = silk_mm_smulww_epi32(psDelDec->Diff_Q14, gain_adj_Q16);
+
+        /* Scale short-term prediction and shaping states */
+        for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++)
+        {
+            psDelDec->sLPC_Q14[i] = silk_mm_smulww_epi32(psDelDec->sLPC_Q14[i], gain_adj_Q16);
+        }
+        for (i = 0; i < DECISION_DELAY; i++)
+        {
+            psSample = &psDelDec->Samples[i];
+            psSample->Pred_Q15 = silk_mm_smulww_epi32(psSample->Pred_Q15, gain_adj_Q16);
+            psSample->Shape_Q14 = silk_mm_smulww_epi32(psSample->Shape_Q14, gain_adj_Q16);
+        }
+        for (i = 0; i < MAX_SHAPE_LPC_ORDER; i++)
+        {
+            psDelDec->sAR2_Q14[i] = silk_mm_smulww_epi32(psDelDec->sAR2_Q14[i], gain_adj_Q16);
+        }
+
+        /* Save inverse gain */
+        NSQ->prev_gain_Q16 = Gains_Q16[subfr];
+    }
+}
+
+static OPUS_INLINE void silk_LPC_analysis_filter_avx2(
+    opus_int16                  *out,               /* O    Output signal                           */
+    const opus_int16            *in,                /* I    Input signal                            */
+    const opus_int16            *B,                 /* I    MA prediction coefficients, Q12 [order] */
+    const opus_int32            len,                /* I    Signal length                           */
+    const opus_int32            order               /* I    Filter order                            */
+)
+{
+    int i;
+    opus_int32       out32_Q12, out32;
+    silk_assert(order == 10 || order == 16);
+
+    for(i = order; i < len; i++ )
+    {
+        const opus_int16 *in_ptr = &in[ i ];
+        /* Allowing wrap around so that two wraps can cancel each other. The rare
+           cases where the result wraps around can only be triggered by invalid streams*/
+
+        __m256i in_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i_u*)&in_ptr[-8]));
+        __m256i B_v  = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i_u*)&      B[0]));
+        __m256i sum = _mm256_mullo_epi32(in_v, silk_mm256_reverse_epi32(B_v));
+        if (order > 10)
+        {
+            in_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i_u*)&in_ptr[-16]));
+            B_v  = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i_u*)&B       [8]));
+            B_v  = silk_mm256_reverse_epi32(B_v);
+        }
+        else
+        {
+            in_v = _mm256_cvtepi16_epi32(_mm_loadu_si32(&in_ptr[-10]));
+            B_v  = _mm256_cvtepi16_epi32(_mm_loadu_si32(&B       [8]));
+            B_v  = _mm256_shuffle_epi32(B_v, 0x01);
+        }
+        sum = _mm256_add_epi32(sum, _mm256_mullo_epi32(in_v, B_v));
+
+        out32_Q12 = silk_mm256_hsum_epi32(sum);
+
+        /* Subtract prediction */
+        out32_Q12 = silk_SUB32_ovflw( silk_LSHIFT( (opus_int32)*in_ptr, 12 ), out32_Q12 );
+
+        /* Scale to Q0 */
+        out32 = silk_sar_round_32(out32_Q12, 12);
+
+        /* Saturate output */
+        out[ i ] = silk_sat16(out32);
+    }
+
+    /* Set first d output samples to zero */
+    silk_memset( out, 0, order * sizeof( opus_int16 ) );
+}
diff --git a/media/libopus/silk/x86/NSQ_del_dec_sse4_1.c b/media/libopus/silk/x86/NSQ_del_dec_sse4_1.c
index 3521cae703..5937682d2a 100644
--- a/media/libopus/silk/x86/NSQ_del_dec_sse4_1.c
+++ b/media/libopus/silk/x86/NSQ_del_dec_sse4_1.c
@@ -119,7 +119,7 @@ void silk_NSQ_del_dec_sse4_1(
     SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */
     const opus_int16            x16[],                                        /* I    Input                           */
     opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */
-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],            /* I    Short term prediction coefs     */
+    const opus_int16            *PredCoef_Q12,                                /* I    Short term prediction coefs     */
     const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */
     const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */
     const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */
@@ -428,7 +428,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
             LTP_pred_Q14 = 2;
             {
                 __m128i tmpa, tmpb, pred_lag_ptr_tmp;
-                pred_lag_ptr_tmp    = _mm_loadu_si128( (__m128i *)(&pred_lag_ptr[ -3 ] ) );
+                pred_lag_ptr_tmp    = _mm_loadu_si128( (__m128i *)(void*)(&pred_lag_ptr[ -3 ] ) );
                 pred_lag_ptr_tmp    = _mm_shuffle_epi32( pred_lag_ptr_tmp, 0x1B );
                 tmpa                = _mm_mul_epi32( pred_lag_ptr_tmp, b_Q12_0123 );
                 tmpa                = _mm_srli_si128( tmpa, 2 );
@@ -483,7 +483,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
                 tmpb = _mm_setzero_si128();
 
                 /* step 1 */
-                psLPC_Q14_tmp   = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -3 ] ) ); /* -3, -2 , -1, 0 */
+                psLPC_Q14_tmp   = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[ -3 ] ) ); /* -3, -2 , -1, 0 */
                 psLPC_Q14_tmp   = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B );      /* 0, -1, -2, -3 */
                 tmpa            = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_0123 );    /* 0, -1, -2, -3 * 0123 -> 0*0, 2*-2 */
 
@@ -497,7 +497,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
                 tmpb            = _mm_add_epi32( tmpb, psLPC_Q14_tmp );
 
                 /* step 2 */
-                psLPC_Q14_tmp   = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -7 ] ) );
+                psLPC_Q14_tmp   = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[ -7 ] ) );
                 psLPC_Q14_tmp   = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B );
                 tmpa            = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_4567 );
                 tmpa            = _mm_srli_epi64( tmpa, 16 );
@@ -512,7 +512,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
                 if ( opus_likely( predictLPCOrder == 16 ) )
                 {
                     /* step 3 */
-                    psLPC_Q14_tmp   = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -11 ] ) );
+                    psLPC_Q14_tmp   = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[ -11 ] ) );
                     psLPC_Q14_tmp   = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B );
                     tmpa            = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_89AB );
                     tmpa            = _mm_srli_epi64( tmpa, 16 );
@@ -525,7 +525,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
                     tmpb            = _mm_add_epi32( tmpb, psLPC_Q14_tmp );
 
                     /* step 4 */
-                    psLPC_Q14_tmp   = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -15 ] ) );
+                    psLPC_Q14_tmp   = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[ -15 ] ) );
                     psLPC_Q14_tmp   = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B );
                     tmpa            = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_CDEF );
                     tmpa            = _mm_srli_epi64( tmpa, 16 );
@@ -830,7 +830,7 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
 
         xmm_x16_x2x0 = _mm_blend_epi16( xmm_x16_x2x0, xmm_x16_x3x1, 0xCC );
 
-        _mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ] ) ), xmm_x16_x2x0 );
+        _mm_storeu_si128( (__m128i *)(void*)(&(x_sc_Q10[ i ] ) ), xmm_x16_x2x0 );
     }
 
     for( ; i < psEncC->subfr_length; i++ ) {
@@ -862,7 +862,7 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
 
             for( i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx - 3; i += 4 )
             {
-                xmm_sLTP_shp_Q14_x2x0 = _mm_loadu_si128( (__m128i *)(&(NSQ->sLTP_shp_Q14[ i ] ) ) );
+                xmm_sLTP_shp_Q14_x2x0 = _mm_loadu_si128( (__m128i *)(void*)(&(NSQ->sLTP_shp_Q14[ i ] ) ) );
                 /* equal shift right 4 bytes*/
                 xmm_sLTP_shp_Q14_x3x1 = _mm_shuffle_epi32( xmm_sLTP_shp_Q14_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
 
@@ -874,7 +874,7 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
 
                 xmm_sLTP_shp_Q14_x2x0 = _mm_blend_epi16( xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1, 0xCC );
 
-                _mm_storeu_si128( (__m128i *)(&(NSQ->sLTP_shp_Q14[ i ] ) ), xmm_sLTP_shp_Q14_x2x0 );
+                _mm_storeu_si128( (__m128i *)(void*)(&(NSQ->sLTP_shp_Q14[ i ] ) ), xmm_sLTP_shp_Q14_x2x0 );
             }
 
             for( ; i < NSQ->sLTP_shp_buf_idx; i++ ) {
diff --git a/media/libopus/silk/x86/NSQ_sse4_1.c b/media/libopus/silk/x86/NSQ_sse4_1.c
index d5ae1d3b1c..3c9aca7ba1 100644
--- a/media/libopus/silk/x86/NSQ_sse4_1.c
+++ b/media/libopus/silk/x86/NSQ_sse4_1.c
@@ -77,7 +77,7 @@ void silk_NSQ_sse4_1(
     SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */
     const opus_int16            x16[],                                        /* I    Input                           */
     opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */
-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],            /* I    Short term prediction coefs     */
+    const opus_int16            *PredCoef_Q12,                                /* I    Short term prediction coefs     */
     const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */
     const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */
     const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */
@@ -338,21 +338,21 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
     xmm_one = _mm_set_epi8( 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 );
 
     /* load a_Q12[0] - a_Q12[7] */
-    a_Q12_01234567 = _mm_loadu_si128( (__m128i *)(&a_Q12[ 0 ] ) );
+    a_Q12_01234567 = _mm_loadu_si128( (__m128i *)(void*)(&a_Q12[ 0 ] ) );
     /* load a_Q12[ 8 ] - a_Q12[ 15 ] */
-    a_Q12_89ABCDEF = _mm_loadu_si128( (__m128i *)(&a_Q12[ 8 ] ) );
+    a_Q12_89ABCDEF = _mm_loadu_si128( (__m128i *)(void*)(&a_Q12[ 8 ] ) );
 
     a_Q12_01234567 = _mm_shuffle_epi8( a_Q12_01234567, xmm_one );
     a_Q12_89ABCDEF = _mm_shuffle_epi8( a_Q12_89ABCDEF, xmm_one );
 
     /* load AR_shp_Q13 */
-    AR_shp_Q13_76543210 = _mm_loadu_si128( (__m128i *)(&AR_shp_Q13[0] ) );
+    AR_shp_Q13_76543210 = _mm_loadu_si128( (__m128i *)(void*)(&AR_shp_Q13[0] ) );
 
     /* load psLPC_Q14 */
     xmm_one = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 );
 
-    xmm_tempa = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[-16]) );
-    xmm_tempb = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[-12]) );
+    xmm_tempa = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[-16]) );
+    xmm_tempb = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[-12]) );
 
     xmm_tempa = _mm_shuffle_epi8( xmm_tempa, xmm_one );
     xmm_tempb = _mm_shuffle_epi8( xmm_tempb, xmm_one );
@@ -360,8 +360,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
     psLPC_Q14_hi_89ABCDEF = _mm_unpackhi_epi64( xmm_tempa, xmm_tempb );
     psLPC_Q14_lo_89ABCDEF = _mm_unpacklo_epi64( xmm_tempa, xmm_tempb );
 
-    xmm_tempa = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -8 ]) );
-    xmm_tempb = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -4 ]) );
+    xmm_tempa = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[ -8 ]) );
+    xmm_tempb = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[ -4 ]) );
 
     xmm_tempa = _mm_shuffle_epi8( xmm_tempa, xmm_one );
     xmm_tempb = _mm_shuffle_epi8( xmm_tempb, xmm_one );
@@ -370,8 +370,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
     psLPC_Q14_lo_01234567 = _mm_unpacklo_epi64( xmm_tempa, xmm_tempb );
 
     /* load sAR2_Q14 */
-    xmm_tempa = _mm_loadu_si128( (__m128i *)(&(NSQ->sAR2_Q14[ 0 ]) ) );
-    xmm_tempb = _mm_loadu_si128( (__m128i *)(&(NSQ->sAR2_Q14[ 4 ]) ) );
+    xmm_tempa = _mm_loadu_si128( (__m128i *)(void*)(&(NSQ->sAR2_Q14[ 0 ]) ) );
+    xmm_tempb = _mm_loadu_si128( (__m128i *)(void*)(&(NSQ->sAR2_Q14[ 4 ]) ) );
 
     xmm_tempa = _mm_shuffle_epi8( xmm_tempa, xmm_one );
     xmm_tempb = _mm_shuffle_epi8( xmm_tempb, xmm_one );
@@ -443,7 +443,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
                 b_Q14_0123 = _mm_shuffle_epi32( b_Q14_3210, 0x1B );
 
                 /* loaded: [0] [-1] [-2] [-3] */
-                pred_lag_ptr_0123 = _mm_loadu_si128( (__m128i *)(&pred_lag_ptr[ -3 ] ) );
+                pred_lag_ptr_0123 = _mm_loadu_si128( (__m128i *)(void*)(&pred_lag_ptr[ -3 ] ) );
                 /* shuffle to [-3] [-2] [-1] [0] and to new xmm */
                 xmm_tempa = _mm_shuffle_epi32( pred_lag_ptr_0123, 0x1B );
                 /*64-bit multiply, a[2] * b[-2], a[0] * b[0] */
@@ -595,8 +595,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
     /* write back sAR2_Q14 */
     xmm_tempa = _mm_unpackhi_epi16( sAR2_Q14_lo_76543210, sAR2_Q14_hi_76543210 );
     xmm_tempb = _mm_unpacklo_epi16( sAR2_Q14_lo_76543210, sAR2_Q14_hi_76543210 );
-    _mm_storeu_si128( (__m128i *)(&NSQ->sAR2_Q14[ 4 ]), xmm_tempa );
-    _mm_storeu_si128( (__m128i *)(&NSQ->sAR2_Q14[ 0 ]), xmm_tempb );
+    _mm_storeu_si128( (__m128i *)(void*)(&NSQ->sAR2_Q14[ 4 ]), xmm_tempa );
+    _mm_storeu_si128( (__m128i *)(void*)(&NSQ->sAR2_Q14[ 0 ]), xmm_tempb );
 
     /* xq[ i ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( psLPC_Q14[ i ], Gain_Q10 ), 8 ) ); */
     {
@@ -612,8 +612,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
         /* process xq */
         for (i = 0; i < length - 7; i += 8)
         {
-            xmm_xq_Q14_3210 = _mm_loadu_si128( (__m128i *)(&(psLPC_Q14[ i + 0 ] ) ) );
-            xmm_xq_Q14_7654 = _mm_loadu_si128( (__m128i *)(&(psLPC_Q14[ i + 4 ] ) ) );
+            xmm_xq_Q14_3210 = _mm_loadu_si128( (__m128i *)(void*)(&(psLPC_Q14[ i + 0 ] ) ) );
+            xmm_xq_Q14_7654 = _mm_loadu_si128( (__m128i *)(void*)(&(psLPC_Q14[ i + 4 ] ) ) );
 
             /* equal shift right 4 bytes*/
             xmm_xq_Q14_x3x1 = _mm_shuffle_epi32( xmm_xq_Q14_3210, _MM_SHUFFLE( 0, 3, 2, 1 ) );
@@ -644,7 +644,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
             xmm_xq_Q14_3210 = _mm_packs_epi32( xmm_xq_Q14_3210, xmm_xq_Q14_7654 );
 
             /* save to xq */
-            _mm_storeu_si128( (__m128i *)(&xq[ i ] ), xmm_xq_Q14_3210 );
+            _mm_storeu_si128( (__m128i *)(void*)(&xq[ i ] ), xmm_xq_Q14_3210 );
         }
     }
     for ( ; i < length; i++)
@@ -698,7 +698,7 @@ static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
 
         xmm_x16_x2x0 = _mm_blend_epi16( xmm_x16_x2x0, xmm_x16_x3x1, 0xCC );
 
-        _mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ] ) ), xmm_x16_x2x0 );
+        _mm_storeu_si128( (__m128i *)(void*)(&(x_sc_Q10[ i ] ) ), xmm_x16_x2x0 );
     }
 
     for( ; i < psEncC->subfr_length; i++ ) {
@@ -729,7 +729,7 @@ static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
 
         for( i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx - 3; i += 4 )
         {
-            xmm_sLTP_shp_Q14_x2x0 = _mm_loadu_si128( (__m128i *)(&(NSQ->sLTP_shp_Q14[ i ] ) ) );
+            xmm_sLTP_shp_Q14_x2x0 = _mm_loadu_si128( (__m128i *)(void*)(&(NSQ->sLTP_shp_Q14[ i ] ) ) );
             /* equal shift right 4 bytes*/
             xmm_sLTP_shp_Q14_x3x1 = _mm_shuffle_epi32( xmm_sLTP_shp_Q14_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
 
@@ -741,7 +741,7 @@ static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
 
             xmm_sLTP_shp_Q14_x2x0 = _mm_blend_epi16( xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1, 0xCC );
 
-            _mm_storeu_si128( (__m128i *)(&(NSQ->sLTP_shp_Q14[ i ] ) ), xmm_sLTP_shp_Q14_x2x0 );
+            _mm_storeu_si128( (__m128i *)(void*)(&(NSQ->sLTP_shp_Q14[ i ] ) ), xmm_sLTP_shp_Q14_x2x0 );
         }
 
         for( ; i < NSQ->sLTP_shp_buf_idx; i++ ) {
diff --git a/media/libopus/silk/x86/VAD_sse4_1.c b/media/libopus/silk/x86/VAD_sse4_1.c
index e7eaf9714a..9e06bc79d0 100644
--- a/media/libopus/silk/x86/VAD_sse4_1.c
+++ b/media/libopus/silk/x86/VAD_sse4_1.c
@@ -144,7 +144,7 @@ opus_int silk_VAD_GetSA_Q8_sse4_1(                  /* O    Return value, 0 if s
 
             for( i = 0; i < dec_subframe_length - 7; i += 8 )
             {
-                xmm_X   = _mm_loadu_si128( (__m128i *)&(X[ X_offset[ b ] + i + dec_subframe_offset ] ) );
+                xmm_X   = _mm_loadu_si128( (__m128i *)(void*)&(X[ X_offset[ b ] + i + dec_subframe_offset ] ) );
                 xmm_X   = _mm_srai_epi16( xmm_X, 3 );
                 xmm_X   = _mm_madd_epi16( xmm_X, xmm_X );
                 xmm_acc = _mm_add_epi32( xmm_acc, xmm_X );
diff --git a/media/libopus/silk/x86/VQ_WMat_EC_sse4_1.c b/media/libopus/silk/x86/VQ_WMat_EC_sse4_1.c
index 2c7d18d05e..df4626b60a 100644
--- a/media/libopus/silk/x86/VQ_WMat_EC_sse4_1.c
+++ b/media/libopus/silk/x86/VQ_WMat_EC_sse4_1.c
@@ -65,7 +65,7 @@ void silk_VQ_WMat_EC_sse4_1(
     neg_xX_Q24[ 3 ] = -silk_LSHIFT32( xX_Q17[ 3 ], 7 );
     neg_xX_Q24[ 4 ] = -silk_LSHIFT32( xX_Q17[ 4 ], 7 );
 
-    v_XX_31_Q17 = _mm_loadu_si128( (__m128i *)(&XX_Q17[ 1 ] ) );
+    v_XX_31_Q17 = _mm_loadu_si128( (__m128i *)(void*)(&XX_Q17[ 1 ] ) );
     v_XX_42_Q17 = _mm_shuffle_epi32( v_XX_31_Q17, _MM_SHUFFLE( 0, 3, 2, 1 ) );
 
     /* Loop over codebook */
diff --git a/media/libopus/silk/x86/main_sse.h b/media/libopus/silk/x86/main_sse.h
index a01d7f6c75..b254d53e7a 100644
--- a/media/libopus/silk/x86/main_sse.h
+++ b/media/libopus/silk/x86/main_sse.h
@@ -88,7 +88,7 @@ void silk_NSQ_sse4_1(
     SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */
     const opus_int16            x16[],                                        /* I    Input                           */
     opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */
-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],            /* I    Short term prediction coefs     */
+    const opus_int16            *PredCoef_Q12,                                /* I    Short term prediction coefs     */
     const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */
     const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */
     const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */
@@ -116,7 +116,7 @@ extern void (*const SILK_NSQ_IMPL[OPUS_ARCHMASK + 1])(
     SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */
     const opus_int16            x16[],                                        /* I    Input                           */
     opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */
-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],            /* I    Short term prediction coefs     */
+    const opus_int16            *PredCoef_Q12,                                /* I    Short term prediction coefs     */
     const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */
     const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */
     const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */
@@ -142,7 +142,7 @@ void silk_NSQ_del_dec_sse4_1(
     SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */
     const opus_int16            x16[],                                        /* I    Input                           */
     opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */
-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],            /* I    Short term prediction coefs     */
+    const opus_int16            *PredCoef_Q12,                                /* I    Short term prediction coefs     */
     const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */
     const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */
     const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */
@@ -154,7 +154,33 @@ void silk_NSQ_del_dec_sse4_1(
     const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */
 );
 
-#  if defined OPUS_X86_PRESUME_SSE4_1
+void silk_NSQ_del_dec_avx2(
+    const silk_encoder_state *psEncC,                            /* I    Encoder State               */
+    silk_nsq_state *NSQ,                                         /* I/O  NSQ state                   */
+    SideInfoIndices *psIndices,                                  /* I/O  Quantization Indices        */
+    const opus_int16 x16[],                                      /* I    Input                       */
+    opus_int8 pulses[],                                          /* O    Quantized pulse signal      */
+    const opus_int16 *PredCoef_Q12,                              /* I    Short term prediction coefs */
+    const opus_int16 LTPCoef_Q14[LTP_ORDER * MAX_NB_SUBFR],      /* I    Long term prediction coefs  */
+    const opus_int16 AR_Q13[MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER], /* I    Noise shaping coefs         */
+    const opus_int HarmShapeGain_Q14[MAX_NB_SUBFR],              /* I    Long term shaping coefs     */
+    const opus_int Tilt_Q14[MAX_NB_SUBFR],                       /* I    Spectral tilt               */
+    const opus_int32 LF_shp_Q14[MAX_NB_SUBFR],                   /* I    Low frequency shaping coefs */
+    const opus_int32 Gains_Q16[MAX_NB_SUBFR],                    /* I    Quantization step sizes     */
+    const opus_int32 pitchL[MAX_NB_SUBFR],                       /* I    Pitch lags                  */
+    const opus_int Lambda_Q10,                                   /* I    Rate/distortion tradeoff    */
+    const opus_int LTP_scale_Q14                                 /* I    LTP state scaling           */
+);
+
+#  if defined (OPUS_X86_PRESUME_AVX2)
+
+#   define OVERRIDE_silk_NSQ_del_dec
+#   define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
+                            HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
+    ((void)(arch),silk_NSQ_del_dec_avx2(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
+                           HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))
+
+#  elif defined (OPUS_X86_PRESUME_SSE4_1) && !defined(OPUS_X86_MAY_HAVE_AVX2)
 
 #   define OVERRIDE_silk_NSQ_del_dec
 #   define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
@@ -170,7 +196,7 @@ extern void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])(
     SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */
     const opus_int16            x16[],                                        /* I    Input                           */
     opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */
-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],            /* I    Short term prediction coefs     */
+    const opus_int16            *PredCoef_Q12,                                /* I    Short term prediction coefs     */
     const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */
     const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */
     const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */
@@ -243,5 +269,31 @@ extern opus_int (*const SILK_VAD_GETSA_Q8_IMPL[OPUS_ARCHMASK + 1])(
 
 #  endif
 
+#ifndef FIXED_POINT
+double silk_inner_product_FLP_avx2(
+    const silk_float    *data1,
+    const silk_float    *data2,
+    opus_int            dataSize
+);
+
+#if defined (OPUS_X86_PRESUME_AVX2)
+
+#define OVERRIDE_inner_product_FLP
+#define silk_inner_product_FLP(data1, data2, dataSize, arch) ((void)arch,silk_inner_product_FLP_avx2(data1, data2, dataSize))
+
+#elif defined(OPUS_HAVE_RTCD) && defined(OPUS_X86_MAY_HAVE_AVX2)
+
+#define OVERRIDE_inner_product_FLP
+extern double (*const SILK_INNER_PRODUCT_FLP_IMPL[OPUS_ARCHMASK + 1])(
+    const silk_float    *data1,
+    const silk_float    *data2,
+    opus_int            dataSize
+);
+
+#define silk_inner_product_FLP(data1, data2, dataSize, arch) ((void)arch,(*SILK_INNER_PRODUCT_FLP_IMPL[(arch) & OPUS_ARCHMASK])(data1, data2, dataSize))
+
+#endif
+#endif
+
 # endif
 #endif
diff --git a/media/libopus/silk/x86/x86_silk_map.c b/media/libopus/silk/x86/x86_silk_map.c
index 70f60078cf..39ad75276c 100644
--- a/media/libopus/silk/x86/x86_silk_map.c
+++ b/media/libopus/silk/x86/x86_silk_map.c
@@ -32,10 +32,13 @@
 #include "celt/x86/x86cpu.h"
 #include "structs.h"
 #include "SigProc_FIX.h"
+#ifndef FIXED_POINT
+#include "SigProc_FLP.h"
+#endif
 #include "pitch.h"
 #include "main.h"
 
-#if defined(OPUS_HAVE_RTCD) && !defined(OPUS_X86_PRESUME_SSE4_1)
+#if defined(OPUS_HAVE_RTCD) && !defined(OPUS_X86_PRESUME_AVX2)
 
 #if defined(FIXED_POINT)
 
@@ -72,7 +75,7 @@ void (*const SILK_NSQ_IMPL[ OPUS_ARCHMASK + 1 ] )(
     SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */
     const opus_int16            x16[],                                        /* I    Input                           */
     opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */
-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],            /* I    Short term prediction coefs     */
+    const opus_int16            *PredCoef_Q12,                                /* I    Short term prediction coefs     */
     const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */
     const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */
     const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */
@@ -117,7 +120,7 @@ void (*const SILK_NSQ_DEL_DEC_IMPL[ OPUS_ARCHMASK + 1 ] )(
     SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */
     const opus_int16            x16[],                                        /* I    Input                           */
     opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */
-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],            /* I    Short term prediction coefs     */
+    const opus_int16            *PredCoef_Q12,                                /* I    Short term prediction coefs     */
     const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */
     const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */
     const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */
@@ -132,7 +135,7 @@ void (*const SILK_NSQ_DEL_DEC_IMPL[ OPUS_ARCHMASK + 1 ] )(
   silk_NSQ_del_dec_c,
   silk_NSQ_del_dec_c,
   MAY_HAVE_SSE4_1( silk_NSQ_del_dec ), /* sse4.1 */
-  MAY_HAVE_SSE4_1( silk_NSQ_del_dec )  /* avx */
+  MAY_HAVE_AVX2( silk_NSQ_del_dec )  /* avx */
 };
 
 #if defined(FIXED_POINT)
@@ -156,4 +159,21 @@ void (*const SILK_BURG_MODIFIED_IMPL[ OPUS_ARCHMASK + 1 ] )(
 };
 
 #endif
+
+#ifndef FIXED_POINT
+
+double (*const SILK_INNER_PRODUCT_FLP_IMPL[ OPUS_ARCHMASK + 1 ] )(
+    const silk_float    *data1,
+    const silk_float    *data2,
+    opus_int            dataSize
+) = {
+  silk_inner_product_FLP_c,                  /* non-sse */
+  silk_inner_product_FLP_c,
+  silk_inner_product_FLP_c,
+  silk_inner_product_FLP_c, /* sse4.1 */
+  MAY_HAVE_AVX2( silk_inner_product_FLP )  /* avx */
+};
+
+#endif
+
 #endif
-- 
cgit v1.2.3