From fbaf0bb26397aa498eb9156f06d5a6fe34dd7dd8 Mon Sep 17 00:00:00 2001
From: Daniel Baumann <daniel.baumann@progress-linux.org>
Date: Fri, 19 Apr 2024 03:14:29 +0200
Subject: Merging upstream version 125.0.1.

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
---
 media/libopus/silk/float/SigProc_FLP.h             | 14 +++-
 media/libopus/silk/float/autocorrelation_FLP.c     |  5 +-
 media/libopus/silk/float/burg_modified_FLP.c       |  5 +-
 media/libopus/silk/float/corrMatrix_FLP.c          | 10 ++-
 media/libopus/silk/float/encode_frame_FLP.c        | 17 ++---
 media/libopus/silk/float/find_LPC_FLP.c            |  7 +-
 media/libopus/silk/float/find_LTP_FLP.c            |  7 +-
 media/libopus/silk/float/find_pitch_lags_FLP.c     |  2 +-
 media/libopus/silk/float/find_pred_coefs_FLP.c     |  4 +-
 media/libopus/silk/float/inner_product_FLP.c       |  2 +-
 media/libopus/silk/float/main_FLP.h                | 12 ++-
 .../libopus/silk/float/noise_shape_analysis_FLP.c  |  2 +-
 media/libopus/silk/float/pitch_analysis_core_FLP.c |  2 +-
 .../silk/float/warped_autocorrelation_FLP.c        |  6 +-
 .../silk/float/x86/inner_product_FLP_avx2.c        | 85 ++++++++++++++++++++++
 15 files changed, 141 insertions(+), 39 deletions(-)
 create mode 100644 media/libopus/silk/float/x86/inner_product_FLP_avx2.c

(limited to 'media/libopus/silk/float')

diff --git a/media/libopus/silk/float/SigProc_FLP.h b/media/libopus/silk/float/SigProc_FLP.h
index 953de8b09e..ff9281b852 100644
--- a/media/libopus/silk/float/SigProc_FLP.h
+++ b/media/libopus/silk/float/SigProc_FLP.h
@@ -30,6 +30,7 @@ POSSIBILITY OF SUCH DAMAGE.
 
 #include "SigProc_FIX.h"
 #include "float_cast.h"
+#include "main.h"
 #include <math.h>
 
 #ifdef  __cplusplus
@@ -73,7 +74,8 @@ void silk_autocorrelation_FLP(
     silk_float          *results,           /* O    result (length correlationCount)                            */
     const silk_float    *inputData,         /* I    input data to correlate                                     */
     opus_int            inputDataSize,      /* I    length of input                                             */
-    opus_int            correlationCount    /* I    number of correlation taps to compute                       */
+    opus_int            correlationCount,    /* I    number of correlation taps to compute                       */
+    int                 arch
 );
 
 opus_int silk_pitch_analysis_core_FLP(      /* O    Voicing estimate: 0 voiced, 1 unvoiced                      */
@@ -105,7 +107,8 @@ silk_float silk_burg_modified_FLP(          /* O    returns residual energy
     const silk_float    minInvGain,         /* I    minimum inverse prediction gain                             */
     const opus_int      subfr_length,       /* I    input signal subframe length (incl. D preceding samples)    */
     const opus_int      nb_subfr,           /* I    number of subframes stacked in x                            */
-    const opus_int      D                   /* I    order                                                       */
+    const opus_int      D,                  /* I    order                                                       */
+    int                 arch
 );
 
 /* multiply a vector by a constant */
@@ -124,12 +127,17 @@ void silk_scale_copy_vector_FLP(
 );
 
 /* inner product of two silk_float arrays, with result as double */
-double silk_inner_product_FLP(
+double silk_inner_product_FLP_c(
     const silk_float    *data1,
     const silk_float    *data2,
     opus_int            dataSize
 );
 
+#ifndef OVERRIDE_inner_product_FLP
+#define silk_inner_product_FLP(data1, data2, dataSize, arch) ((void)arch,silk_inner_product_FLP_c(data1, data2, dataSize))
+#endif
+
+
 /* sum of squares of a silk_float array, with result as double */
 double silk_energy_FLP(
     const silk_float    *data,
diff --git a/media/libopus/silk/float/autocorrelation_FLP.c b/media/libopus/silk/float/autocorrelation_FLP.c
index 8b8a9e659a..4253b26ebc 100644
--- a/media/libopus/silk/float/autocorrelation_FLP.c
+++ b/media/libopus/silk/float/autocorrelation_FLP.c
@@ -37,7 +37,8 @@ void silk_autocorrelation_FLP(
     silk_float          *results,           /* O    result (length correlationCount)                            */
     const silk_float    *inputData,         /* I    input data to correlate                                     */
     opus_int            inputDataSize,      /* I    length of input                                             */
-    opus_int            correlationCount    /* I    number of correlation taps to compute                       */
+    opus_int            correlationCount,    /* I    number of correlation taps to compute                       */
+    int                 arch
 )
 {
     opus_int i;
@@ -47,6 +48,6 @@ void silk_autocorrelation_FLP(
     }
 
     for( i = 0; i < correlationCount; i++ ) {
-        results[ i ] =  (silk_float)silk_inner_product_FLP( inputData, inputData + i, inputDataSize - i );
+        results[ i ] =  (silk_float)silk_inner_product_FLP( inputData, inputData + i, inputDataSize - i, arch );
     }
 }
diff --git a/media/libopus/silk/float/burg_modified_FLP.c b/media/libopus/silk/float/burg_modified_FLP.c
index 756b76a35b..f5bef5ddbe 100644
--- a/media/libopus/silk/float/burg_modified_FLP.c
+++ b/media/libopus/silk/float/burg_modified_FLP.c
@@ -42,7 +42,8 @@ silk_float silk_burg_modified_FLP(          /* O    returns residual energy
     const silk_float    minInvGain,         /* I    minimum inverse prediction gain                             */
     const opus_int      subfr_length,       /* I    input signal subframe length (incl. D preceding samples)    */
     const opus_int      nb_subfr,           /* I    number of subframes stacked in x                            */
-    const opus_int      D                   /* I    order                                                       */
+    const opus_int      D,                  /* I    order                                                       */
+    int                 arch
 )
 {
     opus_int         k, n, s, reached_max_gain;
@@ -60,7 +61,7 @@ silk_float silk_burg_modified_FLP(          /* O    returns residual energy
     for( s = 0; s < nb_subfr; s++ ) {
         x_ptr = x + s * subfr_length;
         for( n = 1; n < D + 1; n++ ) {
-            C_first_row[ n - 1 ] += silk_inner_product_FLP( x_ptr, x_ptr + n, subfr_length - n );
+            C_first_row[ n - 1 ] += silk_inner_product_FLP( x_ptr, x_ptr + n, subfr_length - n, arch );
         }
     }
     silk_memcpy( C_last_row, C_first_row, SILK_MAX_ORDER_LPC * sizeof( double ) );
diff --git a/media/libopus/silk/float/corrMatrix_FLP.c b/media/libopus/silk/float/corrMatrix_FLP.c
index eae6a1cfca..eef6e8aa79 100644
--- a/media/libopus/silk/float/corrMatrix_FLP.c
+++ b/media/libopus/silk/float/corrMatrix_FLP.c
@@ -41,7 +41,8 @@ void silk_corrVector_FLP(
     const silk_float                *t,                                 /* I    Target vector [L]                           */
     const opus_int                  L,                                  /* I    Length of vecors                            */
     const opus_int                  Order,                              /* I    Max lag for correlation                     */
-    silk_float                      *Xt                                 /* O    X'*t correlation vector [order]             */
+    silk_float                      *Xt,                                /* O    X'*t correlation vector [order]             */
+    int                             arch
 )
 {
     opus_int lag;
@@ -50,7 +51,7 @@ void silk_corrVector_FLP(
     ptr1 = &x[ Order - 1 ];                     /* Points to first sample of column 0 of X: X[:,0] */
     for( lag = 0; lag < Order; lag++ ) {
         /* Calculate X[:,lag]'*t */
-        Xt[ lag ] = (silk_float)silk_inner_product_FLP( ptr1, t, L );
+        Xt[ lag ] = (silk_float)silk_inner_product_FLP( ptr1, t, L, arch );
         ptr1--;                                 /* Next column of X */
     }
 }
@@ -60,7 +61,8 @@ void silk_corrMatrix_FLP(
     const silk_float                *x,                                 /* I    x vector [ L+order-1 ] used to create X     */
     const opus_int                  L,                                  /* I    Length of vectors                           */
     const opus_int                  Order,                              /* I    Max lag for correlation                     */
-    silk_float                      *XX                                 /* O    X'*X correlation matrix [order x order]     */
+    silk_float                      *XX,                                /* O    X'*X correlation matrix [order x order]     */
+    int                             arch
 )
 {
     opus_int j, lag;
@@ -79,7 +81,7 @@ void silk_corrMatrix_FLP(
     ptr2 = &x[ Order - 2 ];                     /* First sample of column 1 of X */
     for( lag = 1; lag < Order; lag++ ) {
         /* Calculate X[:,0]'*X[:,lag] */
-        energy = silk_inner_product_FLP( ptr1, ptr2, L );
+        energy = silk_inner_product_FLP( ptr1, ptr2, L, arch );
         matrix_ptr( XX, lag, 0, Order ) = ( silk_float )energy;
         matrix_ptr( XX, 0, lag, Order ) = ( silk_float )energy;
         /* Calculate X[:,j]'*X[:,j + lag] */
diff --git a/media/libopus/silk/float/encode_frame_FLP.c b/media/libopus/silk/float/encode_frame_FLP.c
index b029c3f5ca..8a327c5626 100644
--- a/media/libopus/silk/float/encode_frame_FLP.c
+++ b/media/libopus/silk/float/encode_frame_FLP.c
@@ -107,7 +107,10 @@ opus_int silk_encode_frame_FLP(
     opus_int     gain_lock[ MAX_NB_SUBFR ] = {0};
     opus_int16   best_gain_mult[ MAX_NB_SUBFR ];
     opus_int     best_sum[ MAX_NB_SUBFR ];
+    opus_int     bits_margin;
 
+    /* For CBR, 5 bits below budget is close enough. For VBR, allow up to 25% below the cap if we initially busted the budget. */
+    bits_margin = useCBR ? 5 : maxBits/4;
     /* This is totally unnecessary but many compilers (including gcc) are too dumb to realise it */
     LastGainIndex_copy2 = nBits_lower = nBits_upper = gainMult_lower = gainMult_upper = 0;
 
@@ -270,7 +273,7 @@ opus_int silk_encode_frame_FLP(
                     gainMult_upper = gainMult_Q8;
                     gainsID_upper = gainsID;
                 }
-            } else if( nBits < maxBits - 5 ) {
+            } else if( nBits < maxBits - bits_margin ) {
                 found_lower = 1;
                 nBits_lower = nBits;
                 gainMult_lower = gainMult_Q8;
@@ -284,7 +287,7 @@ opus_int silk_encode_frame_FLP(
                     LastGainIndex_copy2 = psEnc->sShape.LastGainIndex;
                 }
             } else {
-                /* Within 5 bits of budget: close enough */
+                /* Close enough */
                 break;
             }
 
@@ -306,15 +309,9 @@ opus_int silk_encode_frame_FLP(
             if( ( found_lower & found_upper ) == 0 ) {
                 /* Adjust gain according to high-rate rate/distortion curve */
                 if( nBits > maxBits ) {
-                    if (gainMult_Q8 < 16384) {
-                        gainMult_Q8 *= 2;
-                    } else {
-                        gainMult_Q8 = 32767;
-                    }
+                    gainMult_Q8 = silk_min_32( 1024, gainMult_Q8*3/2 );
                 } else {
-                    opus_int32 gain_factor_Q16;
-                    gain_factor_Q16 = silk_log2lin( silk_LSHIFT( nBits - maxBits, 7 ) / psEnc->sCmn.frame_length + SILK_FIX_CONST( 16, 7 ) );
-                    gainMult_Q8 = silk_SMULWB( gain_factor_Q16, gainMult_Q8 );
+                    gainMult_Q8 = silk_max_32( 64, gainMult_Q8*4/5 );
                 }
             } else {
                 /* Adjust gain by interpolating */
diff --git a/media/libopus/silk/float/find_LPC_FLP.c b/media/libopus/silk/float/find_LPC_FLP.c
index fa3ffe7f8b..6ccd711dc3 100644
--- a/media/libopus/silk/float/find_LPC_FLP.c
+++ b/media/libopus/silk/float/find_LPC_FLP.c
@@ -38,7 +38,8 @@ void silk_find_LPC_FLP(
     silk_encoder_state              *psEncC,                            /* I/O  Encoder state                               */
     opus_int16                      NLSF_Q15[],                         /* O    NLSFs                                       */
     const silk_float                x[],                                /* I    Input signal                                */
-    const silk_float                minInvGain                          /* I    Inverse of max prediction gain              */
+    const silk_float                minInvGain,                         /* I    Inverse of max prediction gain              */
+    int                             arch
 )
 {
     opus_int    k, subfr_length;
@@ -56,12 +57,12 @@ void silk_find_LPC_FLP(
     psEncC->indices.NLSFInterpCoef_Q2 = 4;
 
     /* Burg AR analysis for the full frame */
-    res_nrg = silk_burg_modified_FLP( a, x, minInvGain, subfr_length, psEncC->nb_subfr, psEncC->predictLPCOrder );
+    res_nrg = silk_burg_modified_FLP( a, x, minInvGain, subfr_length, psEncC->nb_subfr, psEncC->predictLPCOrder, arch );
 
     if( psEncC->useInterpolatedNLSFs && !psEncC->first_frame_after_reset && psEncC->nb_subfr == MAX_NB_SUBFR ) {
         /* Optimal solution for last 10 ms; subtract residual energy here, as that's easier than        */
         /* adding it to the residual energy of the first 10 ms in each iteration of the search below    */
-        res_nrg -= silk_burg_modified_FLP( a_tmp, x + ( MAX_NB_SUBFR / 2 ) * subfr_length, minInvGain, subfr_length, MAX_NB_SUBFR / 2, psEncC->predictLPCOrder );
+        res_nrg -= silk_burg_modified_FLP( a_tmp, x + ( MAX_NB_SUBFR / 2 ) * subfr_length, minInvGain, subfr_length, MAX_NB_SUBFR / 2, psEncC->predictLPCOrder, arch );
 
         /* Convert to NLSFs */
         silk_A2NLSF_FLP( NLSF_Q15, a_tmp, psEncC->predictLPCOrder );
diff --git a/media/libopus/silk/float/find_LTP_FLP.c b/media/libopus/silk/float/find_LTP_FLP.c
index f97064930e..90aeeac0b7 100644
--- a/media/libopus/silk/float/find_LTP_FLP.c
+++ b/media/libopus/silk/float/find_LTP_FLP.c
@@ -38,7 +38,8 @@ void silk_find_LTP_FLP(
     const silk_float                r_ptr[],                            /* I    LPC residual                                */
     const opus_int                  lag[ MAX_NB_SUBFR ],                /* I    LTP lags                                    */
     const opus_int                  subfr_length,                       /* I    Subframe length                             */
-    const opus_int                  nb_subfr                            /* I    number of subframes                         */
+    const opus_int                  nb_subfr,                           /* I    number of subframes                         */
+    int                             arch
 )
 {
     opus_int   k;
@@ -50,8 +51,8 @@ void silk_find_LTP_FLP(
     XX_ptr = XX;
     for( k = 0; k < nb_subfr; k++ ) {
         lag_ptr = r_ptr - ( lag[ k ] + LTP_ORDER / 2 );
-        silk_corrMatrix_FLP( lag_ptr, subfr_length, LTP_ORDER, XX_ptr );
-        silk_corrVector_FLP( lag_ptr, r_ptr, subfr_length, LTP_ORDER, xX_ptr );
+        silk_corrMatrix_FLP( lag_ptr, subfr_length, LTP_ORDER, XX_ptr, arch );
+        silk_corrVector_FLP( lag_ptr, r_ptr, subfr_length, LTP_ORDER, xX_ptr, arch );
         xx = ( silk_float )silk_energy_FLP( r_ptr, subfr_length + LTP_ORDER );
         temp = 1.0f / silk_max( xx, LTP_CORR_INV_MAX * 0.5f * ( XX_ptr[ 0 ] + XX_ptr[ 24 ] ) + 1.0f );
         silk_scale_vector_FLP( XX_ptr, temp, LTP_ORDER * LTP_ORDER );
diff --git a/media/libopus/silk/float/find_pitch_lags_FLP.c b/media/libopus/silk/float/find_pitch_lags_FLP.c
index dedbcd2836..1f6bd5991c 100644
--- a/media/libopus/silk/float/find_pitch_lags_FLP.c
+++ b/media/libopus/silk/float/find_pitch_lags_FLP.c
@@ -82,7 +82,7 @@ void silk_find_pitch_lags_FLP(
     silk_apply_sine_window_FLP( Wsig_ptr, x_buf_ptr, 2, psEnc->sCmn.la_pitch );
 
     /* Calculate autocorrelation sequence */
-    silk_autocorrelation_FLP( auto_corr, Wsig, psEnc->sCmn.pitch_LPC_win_length, psEnc->sCmn.pitchEstimationLPCOrder + 1 );
+    silk_autocorrelation_FLP( auto_corr, Wsig, psEnc->sCmn.pitch_LPC_win_length, psEnc->sCmn.pitchEstimationLPCOrder + 1, arch );
 
     /* Add white noise, as a fraction of the energy */
     auto_corr[ 0 ] += auto_corr[ 0 ] * FIND_PITCH_WHITE_NOISE_FRACTION + 1;
diff --git a/media/libopus/silk/float/find_pred_coefs_FLP.c b/media/libopus/silk/float/find_pred_coefs_FLP.c
index 6f79078893..f3c54cf474 100644
--- a/media/libopus/silk/float/find_pred_coefs_FLP.c
+++ b/media/libopus/silk/float/find_pred_coefs_FLP.c
@@ -63,7 +63,7 @@ void silk_find_pred_coefs_FLP(
         celt_assert( psEnc->sCmn.ltp_mem_length - psEnc->sCmn.predictLPCOrder >= psEncCtrl->pitchL[ 0 ] + LTP_ORDER / 2 );
 
         /* LTP analysis */
-        silk_find_LTP_FLP( XXLTP, xXLTP, res_pitch, psEncCtrl->pitchL, psEnc->sCmn.subfr_length, psEnc->sCmn.nb_subfr );
+        silk_find_LTP_FLP( XXLTP, xXLTP, res_pitch, psEncCtrl->pitchL, psEnc->sCmn.subfr_length, psEnc->sCmn.nb_subfr, psEnc->sCmn.arch );
 
         /* Quantize LTP gain parameters */
         silk_quant_LTP_gains_FLP( psEncCtrl->LTPCoef, psEnc->sCmn.indices.LTPIndex, &psEnc->sCmn.indices.PERIndex,
@@ -102,7 +102,7 @@ void silk_find_pred_coefs_FLP(
     }
 
     /* LPC_in_pre contains the LTP-filtered input for voiced, and the unfiltered input for unvoiced */
-    silk_find_LPC_FLP( &psEnc->sCmn, NLSF_Q15, LPC_in_pre, minInvGain );
+    silk_find_LPC_FLP( &psEnc->sCmn, NLSF_Q15, LPC_in_pre, minInvGain, psEnc->sCmn.arch );
 
     /* Quantize LSFs */
     silk_process_NLSFs_FLP( &psEnc->sCmn, psEncCtrl->PredCoef, NLSF_Q15, psEnc->sCmn.prev_NLSFq_Q15 );
diff --git a/media/libopus/silk/float/inner_product_FLP.c b/media/libopus/silk/float/inner_product_FLP.c
index cdd39d24ce..88b160ab40 100644
--- a/media/libopus/silk/float/inner_product_FLP.c
+++ b/media/libopus/silk/float/inner_product_FLP.c
@@ -32,7 +32,7 @@ POSSIBILITY OF SUCH DAMAGE.
 #include "SigProc_FLP.h"
 
 /* inner product of two silk_float arrays, with result as double */
-double silk_inner_product_FLP(
+double silk_inner_product_FLP_c(
     const silk_float    *data1,
     const silk_float    *data2,
     opus_int            dataSize
diff --git a/media/libopus/silk/float/main_FLP.h b/media/libopus/silk/float/main_FLP.h
index 5dc0ccf4a4..2e4435cc68 100644
--- a/media/libopus/silk/float/main_FLP.h
+++ b/media/libopus/silk/float/main_FLP.h
@@ -138,7 +138,8 @@ void silk_find_LPC_FLP(
     silk_encoder_state              *psEncC,                            /* I/O  Encoder state                               */
     opus_int16                      NLSF_Q15[],                         /* O    NLSFs                                       */
     const silk_float                x[],                                /* I    Input signal                                */
-    const silk_float                minInvGain                          /* I    Prediction gain from LTP (dB)               */
+    const silk_float                minInvGain,                         /* I    Inverse of max prediction gain              */
+    int                             arch
 );
 
 /* LTP analysis */
@@ -148,7 +149,8 @@ void silk_find_LTP_FLP(
     const silk_float                r_ptr[],                            /* I    LPC residual                                */
     const opus_int                  lag[  MAX_NB_SUBFR ],               /* I    LTP lags                                    */
     const opus_int                  subfr_length,                       /* I    Subframe length                             */
-    const opus_int                  nb_subfr                            /* I    number of subframes                         */
+    const opus_int                  nb_subfr,                           /* I    number of subframes                         */
+    int                             arch
 );
 
 void silk_LTP_analysis_filter_FLP(
@@ -221,7 +223,8 @@ void silk_corrMatrix_FLP(
     const silk_float                *x,                                 /* I    x vector [ L+order-1 ] used to create X     */
     const opus_int                  L,                                  /* I    Length of vectors                           */
     const opus_int                  Order,                              /* I    Max lag for correlation                     */
-    silk_float                      *XX                                 /* O    X'*X correlation matrix [order x order]     */
+    silk_float                      *XX,                                /* O    X'*X correlation matrix [order x order]     */
+    int                             arch
 );
 
 /* Calculates correlation vector X'*t */
@@ -230,7 +233,8 @@ void silk_corrVector_FLP(
     const silk_float                *t,                                 /* I    Target vector [L]                           */
     const opus_int                  L,                                  /* I    Length of vecors                            */
     const opus_int                  Order,                              /* I    Max lag for correlation                     */
-    silk_float                      *Xt                                 /* O    X'*t correlation vector [order]             */
+    silk_float                      *Xt,                                /* O    X'*t correlation vector [order]             */
+    int                             arch
 );
 
 /* Apply sine window to signal vector.  */
diff --git a/media/libopus/silk/float/noise_shape_analysis_FLP.c b/media/libopus/silk/float/noise_shape_analysis_FLP.c
index cb3d8a50b7..0b5ea95218 100644
--- a/media/libopus/silk/float/noise_shape_analysis_FLP.c
+++ b/media/libopus/silk/float/noise_shape_analysis_FLP.c
@@ -255,7 +255,7 @@ void silk_noise_shape_analysis_FLP(
                 psEnc->sCmn.shapeWinLength, psEnc->sCmn.shapingLPCOrder );
         } else {
             /* Calculate regular auto correlation */
-            silk_autocorrelation_FLP( auto_corr, x_windowed, psEnc->sCmn.shapeWinLength, psEnc->sCmn.shapingLPCOrder + 1 );
+            silk_autocorrelation_FLP( auto_corr, x_windowed, psEnc->sCmn.shapeWinLength, psEnc->sCmn.shapingLPCOrder + 1, psEnc->sCmn.arch );
         }
 
         /* Add white noise, as a fraction of energy */
diff --git a/media/libopus/silk/float/pitch_analysis_core_FLP.c b/media/libopus/silk/float/pitch_analysis_core_FLP.c
index f351bc3718..0530a8831a 100644
--- a/media/libopus/silk/float/pitch_analysis_core_FLP.c
+++ b/media/libopus/silk/float/pitch_analysis_core_FLP.c
@@ -291,7 +291,7 @@ opus_int silk_pitch_analysis_core_FLP(      /* O    Voicing estimate: 0 voiced,
         for( j = 0; j < length_d_comp; j++ ) {
             d = d_comp[ j ];
             basis_ptr = target_ptr - d;
-            cross_corr = silk_inner_product_FLP( basis_ptr, target_ptr, sf_length_8kHz );
+            cross_corr = silk_inner_product_FLP( basis_ptr, target_ptr, sf_length_8kHz, arch );
             if( cross_corr > 0.0f ) {
                 energy = silk_energy_FLP( basis_ptr, sf_length_8kHz );
                 C[ k ][ d ] = (silk_float)( 2 * cross_corr / ( energy + energy_tmp ) );
diff --git a/media/libopus/silk/float/warped_autocorrelation_FLP.c b/media/libopus/silk/float/warped_autocorrelation_FLP.c
index 09186e73d4..116dab923f 100644
--- a/media/libopus/silk/float/warped_autocorrelation_FLP.c
+++ b/media/libopus/silk/float/warped_autocorrelation_FLP.c
@@ -54,11 +54,13 @@ void silk_warped_autocorrelation_FLP(
         /* Loop over allpass sections */
         for( i = 0; i < order; i += 2 ) {
             /* Output of allpass section */
-            tmp2 = state[ i ] + warping * ( state[ i + 1 ] - tmp1 );
+            /* We deliberately use two multiplications instead of factoring the expression to
+               reduce the length of the dependency chain (tmp1->tmp2->tmp1... ). */
+            tmp2 = state[ i ] + warping * state[ i + 1 ] - warping * tmp1;
             state[ i ] = tmp1;
             C[ i ] += state[ 0 ] * tmp1;
             /* Output of allpass section */
-            tmp1 = state[ i + 1 ] + warping * ( state[ i + 2 ] - tmp2 );
+            tmp1 = state[ i + 1 ] + warping * state[ i + 2 ] - warping * tmp2;
             state[ i + 1 ] = tmp2;
             C[ i + 1 ] += state[ 0 ] * tmp2;
         }
diff --git a/media/libopus/silk/float/x86/inner_product_FLP_avx2.c b/media/libopus/silk/float/x86/inner_product_FLP_avx2.c
new file mode 100644
index 0000000000..4a2daaf595
--- /dev/null
+++ b/media/libopus/silk/float/x86/inner_product_FLP_avx2.c
@@ -0,0 +1,85 @@
+/***********************************************************************
+Copyright (c) 2006-2011, Skype Limited. All rights reserved.
+              2023 Amazon
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "SigProc_FLP.h"
+#include <immintrin.h>
+
+
+/* Inner product of two silk_float arrays using 256-bit SIMD intrinsics; products are formed and summed as doubles */
+double silk_inner_product_FLP_avx2(         /* O    sum of data1[ i ] * data2[ i ] for i in [ 0, dataSize )     */
+    const silk_float    *data1,             /* I    first input vector (read with unaligned loads)              */
+    const silk_float    *data2,             /* I    second input vector (read with unaligned loads)             */
+    opus_int            dataSize            /* I    number of elements read from each vector                    */
+)
+{
+    opus_int i;
+    __m256d accum1, accum2;                 /* two independent 4-lane double accumulators */
+    double   result;
+
+    /* Main loop consumes 8 elements per iteration: two groups of 4, each widened to double and fed to its own accumulator */
+    result = 0.0;                           /* overwritten by the horizontal sum below before it is ever read */
+    accum1 = accum2 = _mm256_setzero_pd();
+    for( i = 0; i < dataSize - 7; i += 8 ) {
+        __m128  x1f, x2f;
+        __m256d x1d, x2d;
+        x1f = _mm_loadu_ps( &data1[ i ] );                  /* elements i .. i+3, no alignment requirement */
+        x2f = _mm_loadu_ps( &data2[ i ] );
+        x1d = _mm256_cvtps_pd( x1f );                       /* widen 4 single-precision values to 4 doubles */
+        x2d = _mm256_cvtps_pd( x2f );
+        accum1 = _mm256_fmadd_pd( x1d, x2d, accum1 );       /* NOTE(review): FMA intrinsic; presumably only dispatched on CPUs with FMA support - confirm */
+        x1f = _mm_loadu_ps( &data1[ i + 4 ] );              /* elements i+4 .. i+7 go to the second accumulator */
+        x2f = _mm_loadu_ps( &data2[ i + 4 ] );
+        x1d = _mm256_cvtps_pd( x1f );
+        x2d = _mm256_cvtps_pd( x2f );
+        accum2 = _mm256_fmadd_pd( x1d, x2d, accum2 );
+    }
+    for( ; i < dataSize - 3; i += 4 ) {                     /* fewer than 8 elements remain, so this runs at most once */
+        __m128  x1f, x2f;
+        __m256d x1d, x2d;
+        x1f = _mm_loadu_ps( &data1[ i ] );
+        x2f = _mm_loadu_ps( &data2[ i ] );
+        x1d = _mm256_cvtps_pd( x1f );
+        x2d = _mm256_cvtps_pd( x2f );
+        accum1 = _mm256_fmadd_pd( x1d, x2d, accum1 );
+    }
+    accum1 = _mm256_add_pd(accum1, accum2);                                         /* merge the two partial accumulators lane by lane */
+    accum1 = _mm256_add_pd(accum1, _mm256_permute2f128_pd(accum1, accum1, 1));      /* add the swapped 128-bit halves: lanes 0,1 now hold a0+a2, a1+a3 */
+    accum1 = _mm256_hadd_pd(accum1,accum1);                                         /* add adjacent lanes: lane 0 now holds the sum of all four */
+    result = _mm256_cvtsd_f64(accum1);                                              /* extract lane 0 as the scalar result */
+
+    /* Scalar tail: add the products of the elements not covered by the 4-wide loops above */
+    for( ; i < dataSize; i++ ) {
+        result += data1[ i ] * (double)data2[ i ];          /* cast makes the multiply happen in double */
+    }
+
+    return result;
+}
-- 
cgit v1.2.3