From fbaf0bb26397aa498eb9156f06d5a6fe34dd7dd8 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Fri, 19 Apr 2024 03:14:29 +0200 Subject: Merging upstream version 125.0.1. Signed-off-by: Daniel Baumann --- media/libopus/silk/float/SigProc_FLP.h | 14 +++- media/libopus/silk/float/autocorrelation_FLP.c | 5 +- media/libopus/silk/float/burg_modified_FLP.c | 5 +- media/libopus/silk/float/corrMatrix_FLP.c | 10 ++- media/libopus/silk/float/encode_frame_FLP.c | 17 ++--- media/libopus/silk/float/find_LPC_FLP.c | 7 +- media/libopus/silk/float/find_LTP_FLP.c | 7 +- media/libopus/silk/float/find_pitch_lags_FLP.c | 2 +- media/libopus/silk/float/find_pred_coefs_FLP.c | 4 +- media/libopus/silk/float/inner_product_FLP.c | 2 +- media/libopus/silk/float/main_FLP.h | 12 ++- .../libopus/silk/float/noise_shape_analysis_FLP.c | 2 +- media/libopus/silk/float/pitch_analysis_core_FLP.c | 2 +- .../silk/float/warped_autocorrelation_FLP.c | 6 +- .../silk/float/x86/inner_product_FLP_avx2.c | 85 ++++++++++++++++++++++ 15 files changed, 141 insertions(+), 39 deletions(-) create mode 100644 media/libopus/silk/float/x86/inner_product_FLP_avx2.c (limited to 'media/libopus/silk/float') diff --git a/media/libopus/silk/float/SigProc_FLP.h b/media/libopus/silk/float/SigProc_FLP.h index 953de8b09e..ff9281b852 100644 --- a/media/libopus/silk/float/SigProc_FLP.h +++ b/media/libopus/silk/float/SigProc_FLP.h @@ -30,6 +30,7 @@ POSSIBILITY OF SUCH DAMAGE. #include "SigProc_FIX.h" #include "float_cast.h" +#include "main.h" #include #ifdef __cplusplus @@ -73,7 +74,8 @@ void silk_autocorrelation_FLP( silk_float *results, /* O result (length correlationCount) */ const silk_float *inputData, /* I input data to correlate */ opus_int inputDataSize, /* I length of input */ - opus_int correlationCount /* I number of correlation taps to compute */ + opus_int correlationCount, /* I number of correlation taps to compute */ + int arch ); opus_int silk_pitch_analysis_core_FLP( /* O Voicing estimate: 0 voiced, 1 unvoiced */ @@ -105,7 +107,8 @@ silk_float silk_burg_modified_FLP( /* O returns residual energy const silk_float minInvGain, /* I minimum inverse prediction gain */ const opus_int subfr_length, /* I input signal subframe length (incl. D preceding samples) */ const opus_int nb_subfr, /* I number of subframes stacked in x */ - const opus_int D /* I order */ + const opus_int D, /* I order */ + int arch ); /* multiply a vector by a constant */ @@ -124,12 +127,17 @@ void silk_scale_copy_vector_FLP( ); /* inner product of two silk_float arrays, with result as double */ -double silk_inner_product_FLP( +double silk_inner_product_FLP_c( const silk_float *data1, const silk_float *data2, opus_int dataSize ); +#ifndef OVERRIDE_inner_product_FLP +#define silk_inner_product_FLP(data1, data2, dataSize, arch) ((void)arch,silk_inner_product_FLP_c(data1, data2, dataSize)) +#endif + + /* sum of squares of a silk_float array, with result as double */ double silk_energy_FLP( const silk_float *data, diff --git a/media/libopus/silk/float/autocorrelation_FLP.c b/media/libopus/silk/float/autocorrelation_FLP.c index 8b8a9e659a..4253b26ebc 100644 --- a/media/libopus/silk/float/autocorrelation_FLP.c +++ b/media/libopus/silk/float/autocorrelation_FLP.c @@ -37,7 +37,8 @@ void silk_autocorrelation_FLP( silk_float *results, /* O result (length correlationCount) */ const silk_float *inputData, /* I input data to correlate */ opus_int inputDataSize, /* I length of input */ - opus_int correlationCount /* I number of correlation taps to compute */ + opus_int correlationCount, /* I number of correlation taps to compute */ + int arch ) { opus_int i; @@ -47,6 +48,6 @@ void silk_autocorrelation_FLP( } for( i = 0; i < correlationCount; i++ ) { - results[ i ] = (silk_float)silk_inner_product_FLP( inputData, inputData + i, inputDataSize - i ); + results[ i ] = (silk_float)silk_inner_product_FLP( inputData, inputData + i, inputDataSize - i, arch ); } } diff --git a/media/libopus/silk/float/burg_modified_FLP.c b/media/libopus/silk/float/burg_modified_FLP.c index 756b76a35b..f5bef5ddbe 100644 --- a/media/libopus/silk/float/burg_modified_FLP.c +++ b/media/libopus/silk/float/burg_modified_FLP.c @@ -42,7 +42,8 @@ silk_float silk_burg_modified_FLP( /* O returns residual energy const silk_float minInvGain, /* I minimum inverse prediction gain */ const opus_int subfr_length, /* I input signal subframe length (incl. D preceding samples) */ const opus_int nb_subfr, /* I number of subframes stacked in x */ - const opus_int D /* I order */ + const opus_int D, /* I order */ + int arch ) { opus_int k, n, s, reached_max_gain; @@ -60,7 +61,7 @@ silk_float silk_burg_modified_FLP( /* O returns residual energy for( s = 0; s < nb_subfr; s++ ) { x_ptr = x + s * subfr_length; for( n = 1; n < D + 1; n++ ) { - C_first_row[ n - 1 ] += silk_inner_product_FLP( x_ptr, x_ptr + n, subfr_length - n ); + C_first_row[ n - 1 ] += silk_inner_product_FLP( x_ptr, x_ptr + n, subfr_length - n, arch ); } } silk_memcpy( C_last_row, C_first_row, SILK_MAX_ORDER_LPC * sizeof( double ) ); diff --git a/media/libopus/silk/float/corrMatrix_FLP.c b/media/libopus/silk/float/corrMatrix_FLP.c index eae6a1cfca..eef6e8aa79 100644 --- a/media/libopus/silk/float/corrMatrix_FLP.c +++ b/media/libopus/silk/float/corrMatrix_FLP.c @@ -41,7 +41,8 @@ void silk_corrVector_FLP( const silk_float *t, /* I Target vector [L] */ const opus_int L, /* I Length of vecors */ const opus_int Order, /* I Max lag for correlation */ - silk_float *Xt /* O X'*t correlation vector [order] */ + silk_float *Xt, /* O X'*t correlation vector [order] */ + int arch ) { opus_int lag; @@ -50,7 +51,7 @@ void silk_corrVector_FLP( ptr1 = &x[ Order - 1 ]; /* Points to first sample of column 0 of X: X[:,0] */ for( lag = 0; lag < Order; lag++ ) { /* Calculate X[:,lag]'*t */ - Xt[ lag ] = (silk_float)silk_inner_product_FLP( ptr1, t, L ); + Xt[ lag ] = (silk_float)silk_inner_product_FLP( ptr1, t, L, arch ); ptr1--; /* Next column of X */ } } @@ -60,7 +61,8 @@ void silk_corrMatrix_FLP( const silk_float *x, /* I x vector [ L+order-1 ] used to create X */ const opus_int L, /* I Length of vectors */ const opus_int Order, /* I Max lag for correlation */ - silk_float *XX /* O X'*X correlation matrix [order x order] */ + silk_float *XX, /* O X'*X correlation matrix [order x order] */ + int arch ) { opus_int j, lag; @@ -79,7 +81,7 @@ void silk_corrMatrix_FLP( ptr2 = &x[ Order - 2 ]; /* First sample of column 1 of X */ for( lag = 1; lag < Order; lag++ ) { /* Calculate X[:,0]'*X[:,lag] */ - energy = silk_inner_product_FLP( ptr1, ptr2, L ); + energy = silk_inner_product_FLP( ptr1, ptr2, L, arch ); matrix_ptr( XX, lag, 0, Order ) = ( silk_float )energy; matrix_ptr( XX, 0, lag, Order ) = ( silk_float )energy; /* Calculate X[:,j]'*X[:,j + lag] */ diff --git a/media/libopus/silk/float/encode_frame_FLP.c b/media/libopus/silk/float/encode_frame_FLP.c index b029c3f5ca..8a327c5626 100644 --- a/media/libopus/silk/float/encode_frame_FLP.c +++ b/media/libopus/silk/float/encode_frame_FLP.c @@ -107,7 +107,10 @@ opus_int silk_encode_frame_FLP( opus_int gain_lock[ MAX_NB_SUBFR ] = {0}; opus_int16 best_gain_mult[ MAX_NB_SUBFR ]; opus_int best_sum[ MAX_NB_SUBFR ]; + opus_int bits_margin; + /* For CBR, 5 bits below budget is close enough. For VBR, allow up to 25% below the cap if we initially busted the budget. */ + bits_margin = useCBR ? 5 : maxBits/4; /* This is totally unnecessary but many compilers (including gcc) are too dumb to realise it */ LastGainIndex_copy2 = nBits_lower = nBits_upper = gainMult_lower = gainMult_upper = 0; @@ -270,7 +273,7 @@ opus_int silk_encode_frame_FLP( gainMult_upper = gainMult_Q8; gainsID_upper = gainsID; } - } else if( nBits < maxBits - 5 ) { + } else if( nBits < maxBits - bits_margin ) { found_lower = 1; nBits_lower = nBits; gainMult_lower = gainMult_Q8; @@ -284,7 +287,7 @@ opus_int silk_encode_frame_FLP( LastGainIndex_copy2 = psEnc->sShape.LastGainIndex; } } else { - /* Within 5 bits of budget: close enough */ + /* Close enough */ break; } @@ -306,15 +309,9 @@ opus_int silk_encode_frame_FLP( if( ( found_lower & found_upper ) == 0 ) { /* Adjust gain according to high-rate rate/distortion curve */ if( nBits > maxBits ) { - if (gainMult_Q8 < 16384) { - gainMult_Q8 *= 2; - } else { - gainMult_Q8 = 32767; - } + gainMult_Q8 = silk_min_32( 1024, gainMult_Q8*3/2 ); } else { - opus_int32 gain_factor_Q16; - gain_factor_Q16 = silk_log2lin( silk_LSHIFT( nBits - maxBits, 7 ) / psEnc->sCmn.frame_length + SILK_FIX_CONST( 16, 7 ) ); - gainMult_Q8 = silk_SMULWB( gain_factor_Q16, gainMult_Q8 ); + gainMult_Q8 = silk_max_32( 64, gainMult_Q8*4/5 ); } } else { /* Adjust gain by interpolating */ diff --git a/media/libopus/silk/float/find_LPC_FLP.c b/media/libopus/silk/float/find_LPC_FLP.c index fa3ffe7f8b..6ccd711dc3 100644 --- a/media/libopus/silk/float/find_LPC_FLP.c +++ b/media/libopus/silk/float/find_LPC_FLP.c @@ -38,7 +38,8 @@ void silk_find_LPC_FLP( silk_encoder_state *psEncC, /* I/O Encoder state */ opus_int16 NLSF_Q15[], /* O NLSFs */ const silk_float x[], /* I Input signal */ - const silk_float minInvGain /* I Inverse of max prediction gain */ + const silk_float minInvGain, /* I Inverse of max prediction gain */ + int arch ) { opus_int k, subfr_length; @@ -56,12 +57,12 @@ void silk_find_LPC_FLP( psEncC->indices.NLSFInterpCoef_Q2 = 4; /* Burg AR analysis for the full frame */ - res_nrg = silk_burg_modified_FLP( a, x, minInvGain, subfr_length, psEncC->nb_subfr, psEncC->predictLPCOrder ); + res_nrg = silk_burg_modified_FLP( a, x, minInvGain, subfr_length, psEncC->nb_subfr, psEncC->predictLPCOrder, arch ); if( psEncC->useInterpolatedNLSFs && !psEncC->first_frame_after_reset && psEncC->nb_subfr == MAX_NB_SUBFR ) { /* Optimal solution for last 10 ms; subtract residual energy here, as that's easier than */ /* adding it to the residual energy of the first 10 ms in each iteration of the search below */ - res_nrg -= silk_burg_modified_FLP( a_tmp, x + ( MAX_NB_SUBFR / 2 ) * subfr_length, minInvGain, subfr_length, MAX_NB_SUBFR / 2, psEncC->predictLPCOrder ); + res_nrg -= silk_burg_modified_FLP( a_tmp, x + ( MAX_NB_SUBFR / 2 ) * subfr_length, minInvGain, subfr_length, MAX_NB_SUBFR / 2, psEncC->predictLPCOrder, arch ); /* Convert to NLSFs */ silk_A2NLSF_FLP( NLSF_Q15, a_tmp, psEncC->predictLPCOrder ); diff --git a/media/libopus/silk/float/find_LTP_FLP.c b/media/libopus/silk/float/find_LTP_FLP.c index f97064930e..90aeeac0b7 100644 --- a/media/libopus/silk/float/find_LTP_FLP.c +++ b/media/libopus/silk/float/find_LTP_FLP.c @@ -38,7 +38,8 @@ void silk_find_LTP_FLP( const silk_float r_ptr[], /* I LPC residual */ const opus_int lag[ MAX_NB_SUBFR ], /* I LTP lags */ const opus_int subfr_length, /* I Subframe length */ - const opus_int nb_subfr /* I number of subframes */ + const opus_int nb_subfr, /* I number of subframes */ + int arch ) { opus_int k; @@ -50,8 +51,8 @@ void silk_find_LTP_FLP( XX_ptr = XX; for( k = 0; k < nb_subfr; k++ ) { lag_ptr = r_ptr - ( lag[ k ] + LTP_ORDER / 2 ); - silk_corrMatrix_FLP( lag_ptr, subfr_length, LTP_ORDER, XX_ptr ); - silk_corrVector_FLP( lag_ptr, r_ptr, subfr_length, LTP_ORDER, xX_ptr ); + silk_corrMatrix_FLP( lag_ptr, subfr_length, LTP_ORDER, XX_ptr, arch ); + silk_corrVector_FLP( lag_ptr, r_ptr, subfr_length, LTP_ORDER, xX_ptr, arch ); xx = ( silk_float )silk_energy_FLP( r_ptr, subfr_length + LTP_ORDER ); temp = 1.0f / silk_max( xx, LTP_CORR_INV_MAX * 0.5f * ( XX_ptr[ 0 ] + XX_ptr[ 24 ] ) + 1.0f ); silk_scale_vector_FLP( XX_ptr, temp, LTP_ORDER * LTP_ORDER ); diff --git a/media/libopus/silk/float/find_pitch_lags_FLP.c b/media/libopus/silk/float/find_pitch_lags_FLP.c index dedbcd2836..1f6bd5991c 100644 --- a/media/libopus/silk/float/find_pitch_lags_FLP.c +++ b/media/libopus/silk/float/find_pitch_lags_FLP.c @@ -82,7 +82,7 @@ void silk_find_pitch_lags_FLP( silk_apply_sine_window_FLP( Wsig_ptr, x_buf_ptr, 2, psEnc->sCmn.la_pitch ); /* Calculate autocorrelation sequence */ - silk_autocorrelation_FLP( auto_corr, Wsig, psEnc->sCmn.pitch_LPC_win_length, psEnc->sCmn.pitchEstimationLPCOrder + 1 ); + silk_autocorrelation_FLP( auto_corr, Wsig, psEnc->sCmn.pitch_LPC_win_length, psEnc->sCmn.pitchEstimationLPCOrder + 1, arch ); /* Add white noise, as a fraction of the energy */ auto_corr[ 0 ] += auto_corr[ 0 ] * FIND_PITCH_WHITE_NOISE_FRACTION + 1; diff --git a/media/libopus/silk/float/find_pred_coefs_FLP.c b/media/libopus/silk/float/find_pred_coefs_FLP.c index 6f79078893..f3c54cf474 100644 --- a/media/libopus/silk/float/find_pred_coefs_FLP.c +++ b/media/libopus/silk/float/find_pred_coefs_FLP.c @@ -63,7 +63,7 @@ void silk_find_pred_coefs_FLP( celt_assert( psEnc->sCmn.ltp_mem_length - psEnc->sCmn.predictLPCOrder >= psEncCtrl->pitchL[ 0 ] + LTP_ORDER / 2 ); /* LTP analysis */ - silk_find_LTP_FLP( XXLTP, xXLTP, res_pitch, psEncCtrl->pitchL, psEnc->sCmn.subfr_length, psEnc->sCmn.nb_subfr ); + silk_find_LTP_FLP( XXLTP, xXLTP, res_pitch, psEncCtrl->pitchL, psEnc->sCmn.subfr_length, psEnc->sCmn.nb_subfr, psEnc->sCmn.arch ); /* Quantize LTP gain parameters */ silk_quant_LTP_gains_FLP( psEncCtrl->LTPCoef, psEnc->sCmn.indices.LTPIndex, &psEnc->sCmn.indices.PERIndex, @@ -102,7 +102,7 @@ void silk_find_pred_coefs_FLP( } /* LPC_in_pre contains the LTP-filtered input for voiced, and the unfiltered input for unvoiced */ - silk_find_LPC_FLP( &psEnc->sCmn, NLSF_Q15, LPC_in_pre, minInvGain ); + silk_find_LPC_FLP( &psEnc->sCmn, NLSF_Q15, LPC_in_pre, minInvGain, psEnc->sCmn.arch ); /* Quantize LSFs */ silk_process_NLSFs_FLP( &psEnc->sCmn, psEncCtrl->PredCoef, NLSF_Q15, psEnc->sCmn.prev_NLSFq_Q15 ); diff --git a/media/libopus/silk/float/inner_product_FLP.c b/media/libopus/silk/float/inner_product_FLP.c index cdd39d24ce..88b160ab40 100644 --- a/media/libopus/silk/float/inner_product_FLP.c +++ b/media/libopus/silk/float/inner_product_FLP.c @@ -32,7 +32,7 @@ POSSIBILITY OF SUCH DAMAGE. #include "SigProc_FLP.h" /* inner product of two silk_float arrays, with result as double */ -double silk_inner_product_FLP( +double silk_inner_product_FLP_c( const silk_float *data1, const silk_float *data2, opus_int dataSize diff --git a/media/libopus/silk/float/main_FLP.h b/media/libopus/silk/float/main_FLP.h index 5dc0ccf4a4..2e4435cc68 100644 --- a/media/libopus/silk/float/main_FLP.h +++ b/media/libopus/silk/float/main_FLP.h @@ -138,7 +138,8 @@ void silk_find_LPC_FLP( silk_encoder_state *psEncC, /* I/O Encoder state */ opus_int16 NLSF_Q15[], /* O NLSFs */ const silk_float x[], /* I Input signal */ - const silk_float minInvGain /* I Prediction gain from LTP (dB) */ + const silk_float minInvGain, /* I Prediction gain from LTP (dB) */ + int arch ); /* LTP analysis */ @@ -148,7 +149,8 @@ void silk_find_LTP_FLP( const silk_float r_ptr[], /* I LPC residual */ const opus_int lag[ MAX_NB_SUBFR ], /* I LTP lags */ const opus_int subfr_length, /* I Subframe length */ - const opus_int nb_subfr /* I number of subframes */ + const opus_int nb_subfr, /* I number of subframes */ + int arch ); void silk_LTP_analysis_filter_FLP( @@ -221,7 +223,8 @@ void silk_corrMatrix_FLP( const silk_float *x, /* I x vector [ L+order-1 ] used to create X */ const opus_int L, /* I Length of vectors */ const opus_int Order, /* I Max lag for correlation */ - silk_float *XX /* O X'*X correlation matrix [order x order] */ + silk_float *XX, /* O X'*X correlation matrix [order x order] */ + int arch ); /* Calculates correlation vector X'*t */ @@ -230,7 +233,8 @@ void silk_corrVector_FLP( const silk_float *t, /* I Target vector [L] */ const opus_int L, /* I Length of vecors */ const opus_int Order, /* I Max lag for correlation */ - silk_float *Xt /* O X'*t correlation vector [order] */ + silk_float *Xt, /* O X'*t correlation vector [order] */ + int arch ); /* Apply sine window to signal vector. */ diff --git a/media/libopus/silk/float/noise_shape_analysis_FLP.c b/media/libopus/silk/float/noise_shape_analysis_FLP.c index cb3d8a50b7..0b5ea95218 100644 --- a/media/libopus/silk/float/noise_shape_analysis_FLP.c +++ b/media/libopus/silk/float/noise_shape_analysis_FLP.c @@ -255,7 +255,7 @@ void silk_noise_shape_analysis_FLP( psEnc->sCmn.shapeWinLength, psEnc->sCmn.shapingLPCOrder ); } else { /* Calculate regular auto correlation */ - silk_autocorrelation_FLP( auto_corr, x_windowed, psEnc->sCmn.shapeWinLength, psEnc->sCmn.shapingLPCOrder + 1 ); + silk_autocorrelation_FLP( auto_corr, x_windowed, psEnc->sCmn.shapeWinLength, psEnc->sCmn.shapingLPCOrder + 1, psEnc->sCmn.arch ); } /* Add white noise, as a fraction of energy */ diff --git a/media/libopus/silk/float/pitch_analysis_core_FLP.c b/media/libopus/silk/float/pitch_analysis_core_FLP.c index f351bc3718..0530a8831a 100644 --- a/media/libopus/silk/float/pitch_analysis_core_FLP.c +++ b/media/libopus/silk/float/pitch_analysis_core_FLP.c @@ -291,7 +291,7 @@ opus_int silk_pitch_analysis_core_FLP( /* O Voicing estimate: 0 voiced, for( j = 0; j < length_d_comp; j++ ) { d = d_comp[ j ]; basis_ptr = target_ptr - d; - cross_corr = silk_inner_product_FLP( basis_ptr, target_ptr, sf_length_8kHz ); + cross_corr = silk_inner_product_FLP( basis_ptr, target_ptr, sf_length_8kHz, arch ); if( cross_corr > 0.0f ) { energy = silk_energy_FLP( basis_ptr, sf_length_8kHz ); C[ k ][ d ] = (silk_float)( 2 * cross_corr / ( energy + energy_tmp ) ); diff --git a/media/libopus/silk/float/warped_autocorrelation_FLP.c b/media/libopus/silk/float/warped_autocorrelation_FLP.c index 09186e73d4..116dab923f 100644 --- a/media/libopus/silk/float/warped_autocorrelation_FLP.c +++ b/media/libopus/silk/float/warped_autocorrelation_FLP.c @@ -54,11 +54,13 @@ void silk_warped_autocorrelation_FLP( /* Loop over allpass sections */ for( i = 0; i < order; i += 2 ) { /* Output of allpass section */ - tmp2 = state[ i ] + warping * ( state[ i + 1 ] - tmp1 ); + /* We voluntarily use two multiples instead of factoring the expression to + reduce the length of the dependency chain (tmp1->tmp2->tmp1... ). */ + tmp2 = state[ i ] + warping * state[ i + 1 ] - warping * tmp1; state[ i ] = tmp1; C[ i ] += state[ 0 ] * tmp1; /* Output of allpass section */ - tmp1 = state[ i + 1 ] + warping * ( state[ i + 2 ] - tmp2 ); + tmp1 = state[ i + 1 ] + warping * state[ i + 2 ] - warping * tmp2; state[ i + 1 ] = tmp2; C[ i + 1 ] += state[ 0 ] * tmp2; } diff --git a/media/libopus/silk/float/x86/inner_product_FLP_avx2.c b/media/libopus/silk/float/x86/inner_product_FLP_avx2.c new file mode 100644 index 0000000000..4a2daaf595 --- /dev/null +++ b/media/libopus/silk/float/x86/inner_product_FLP_avx2.c @@ -0,0 +1,85 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. + 2023 Amazon +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "SigProc_FLP.h" +#include + + +/* inner product of two silk_float arrays, with result as double */ +double silk_inner_product_FLP_avx2( + const silk_float *data1, + const silk_float *data2, + opus_int dataSize +) +{ + opus_int i; + __m256d accum1, accum2; + double result; + + /* 4x unrolled loop */ + result = 0.0; + accum1 = accum2 = _mm256_setzero_pd(); + for( i = 0; i < dataSize - 7; i += 8 ) { + __m128 x1f, x2f; + __m256d x1d, x2d; + x1f = _mm_loadu_ps( &data1[ i ] ); + x2f = _mm_loadu_ps( &data2[ i ] ); + x1d = _mm256_cvtps_pd( x1f ); + x2d = _mm256_cvtps_pd( x2f ); + accum1 = _mm256_fmadd_pd( x1d, x2d, accum1 ); + x1f = _mm_loadu_ps( &data1[ i + 4 ] ); + x2f = _mm_loadu_ps( &data2[ i + 4 ] ); + x1d = _mm256_cvtps_pd( x1f ); + x2d = _mm256_cvtps_pd( x2f ); + accum2 = _mm256_fmadd_pd( x1d, x2d, accum2 ); + } + for( ; i < dataSize - 3; i += 4 ) { + __m128 x1f, x2f; + __m256d x1d, x2d; + x1f = _mm_loadu_ps( &data1[ i ] ); + x2f = _mm_loadu_ps( &data2[ i ] ); + x1d = _mm256_cvtps_pd( x1f ); + x2d = _mm256_cvtps_pd( x2f ); + accum1 = _mm256_fmadd_pd( x1d, x2d, accum1 ); + } + accum1 = _mm256_add_pd(accum1, accum2); + accum1 = _mm256_add_pd(accum1, _mm256_permute2f128_pd(accum1, accum1, 1)); + accum1 = _mm256_hadd_pd(accum1,accum1); + result = _mm256_cvtsd_f64(accum1); + + /* add any remaining products */ + for( ; i < dataSize; i++ ) { + result += data1[ i ] * (double)data2[ i ]; + } + + return result; +} -- cgit v1.2.3