author     Daniel Baumann <daniel.baumann@progress-linux.org>   2024-04-19 01:13:27 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>   2024-04-19 01:13:27 +0000
commit     40a355a42d4a9444dc753c04c6608dade2f06a23 (patch)
tree       871fc667d2de662f171103ce5ec067014ef85e61 /media/libopus/celt/x86
parent     Adding upstream version 124.0.1. (diff)
download   firefox-adbda400be353e676059e335c3c0aaf99e719475.tar.xz
           firefox-adbda400be353e676059e335c3c0aaf99e719475.zip
Adding upstream version 125.0.1. (tag: upstream/125.0.1)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'media/libopus/celt/x86')
-rw-r--r--  media/libopus/celt/x86/celt_lpc_sse4_1.c  |  13
-rw-r--r--  media/libopus/celt/x86/pitch_avx.c        | 101
-rw-r--r--  media/libopus/celt/x86/pitch_sse.h        |  40
-rw-r--r--  media/libopus/celt/x86/vq_sse.h           |   6
-rw-r--r--  media/libopus/celt/x86/vq_sse2.c          |   8
-rw-r--r--  media/libopus/celt/x86/x86_arch_macros.h  |  47
-rw-r--r--  media/libopus/celt/x86/x86_celt_map.c     |  20
-rw-r--r--  media/libopus/celt/x86/x86cpu.c           |  16
-rw-r--r--  media/libopus/celt/x86/x86cpu.h           |  49
9 files changed, 266 insertions, 34 deletions
diff --git a/media/libopus/celt/x86/celt_lpc_sse4_1.c b/media/libopus/celt/x86/celt_lpc_sse4_1.c
index 5478568849..daf59d245a 100644
--- a/media/libopus/celt/x86/celt_lpc_sse4_1.c
+++ b/media/libopus/celt/x86/celt_lpc_sse4_1.c
@@ -64,9 +64,16 @@ void celt_fir_sse4_1(const opus_val16 *x,
    {
       opus_val32 sums[4] = {0};
       __m128i vecSum, vecX;
-
-      xcorr_kernel(rnum, x+i-ord, sums, ord, arch);
-
+#if defined(OPUS_CHECK_ASM)
+      {
+         opus_val32 sums_c[4] = {0};
+         xcorr_kernel_c(rnum, x+i-ord, sums_c, ord);
+#endif
+         xcorr_kernel(rnum, x+i-ord, sums, ord, arch);
+#if defined(OPUS_CHECK_ASM)
+         celt_assert(memcmp(sums, sums_c, sizeof(sums)) == 0);
+      }
+#endif
       vecSum = _mm_loadu_si128((__m128i *)sums);
       vecSum = _mm_add_epi32(vecSum, vecNoA);
       vecSum = _mm_srai_epi32(vecSum, SIG_SHIFT);
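The new OPUS_CHECK_ASM block runs the portable xcorr_kernel_c() alongside whatever SIMD kernel RTCD selected and asserts that both produce bit-identical sums. A minimal, self-contained sketch of this cross-checking pattern, using toy kernels with hypothetical names (not the Opus functions):

    #include <assert.h>
    #include <string.h>

    /* Reference kernel: four interleaved running sums (toy example). */
    static void kernel_c(const int *in, int out[4], int len)
    {
        int i;
        for (i = 0; i < len; i++)
            out[i & 3] += in[i];
    }

    /* Stand-in for an optimized kernel; must match kernel_c() bit for bit. */
    static void kernel_opt(const int *in, int out[4], int len)
    {
        kernel_c(in, out, len);
    }

    /* Debug wrapper in the style of the OPUS_CHECK_ASM guard above. */
    static void kernel_checked(const int *in, int out[4], int len)
    {
        int out_c[4] = {0};
        kernel_c(in, out_c, len);   /* reference result */
        kernel_opt(in, out, len);   /* optimized result */
        assert(memcmp(out, out_c, sizeof(out_c)) == 0);
    }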
diff --git a/media/libopus/celt/x86/pitch_avx.c b/media/libopus/celt/x86/pitch_avx.c
new file mode 100644
index 0000000000..f731762d84
--- /dev/null
+++ b/media/libopus/celt/x86/pitch_avx.c
@@ -0,0 +1,101 @@
+/* Copyright (c) 2023 Amazon */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+
+#include <immintrin.h>
+#include "x86cpu.h"
+#include "pitch.h"
+
+#if defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(FIXED_POINT)
+
+/* Like the "regular" xcorr_kernel(), but computes 8 results at a time. */
+static void xcorr_kernel_avx(const float *x, const float *y, float sum[8], int len)
+{
+   __m256 xsum0, xsum1, xsum2, xsum3, xsum4, xsum5, xsum6, xsum7;
+   xsum7 = xsum6 = xsum5 = xsum4 = xsum3 = xsum2 = xsum1 = xsum0 = _mm256_setzero_ps();
+   int i;
+   __m256 x0;
+   /* Compute 8 inner products using partial sums. */
+   for (i=0;i<len-7;i+=8)
+   {
+      x0 = _mm256_loadu_ps(x+i);
+      xsum0 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i  ), xsum0);
+      xsum1 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+1), xsum1);
+      xsum2 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+2), xsum2);
+      xsum3 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+3), xsum3);
+      xsum4 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+4), xsum4);
+      xsum5 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+5), xsum5);
+      xsum6 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+6), xsum6);
+      xsum7 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+7), xsum7);
+   }
+   if (i != len) {
+      static const int mask[15] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0};
+      __m256i m;
+      m = _mm256_loadu_si256((__m256i*)(void*)(mask + 7+i-len));
+      x0 = _mm256_maskload_ps(x+i, m);
+      xsum0 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i  , m), xsum0);
+      xsum1 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+1, m), xsum1);
+      xsum2 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+2, m), xsum2);
+      xsum3 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+3, m), xsum3);
+      xsum4 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+4, m), xsum4);
+      xsum5 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+5, m), xsum5);
+      xsum6 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+6, m), xsum6);
+      xsum7 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+7, m), xsum7);
+   }
+   /* 8 horizontal adds. */
+   /* Compute [0 4] [1 5] [2 6] [3 7] */
+   xsum0 = _mm256_add_ps(_mm256_permute2f128_ps(xsum0, xsum4, 2<<4), _mm256_permute2f128_ps(xsum0, xsum4, 1 | (3<<4)));
+   xsum1 = _mm256_add_ps(_mm256_permute2f128_ps(xsum1, xsum5, 2<<4), _mm256_permute2f128_ps(xsum1, xsum5, 1 | (3<<4)));
+   xsum2 = _mm256_add_ps(_mm256_permute2f128_ps(xsum2, xsum6, 2<<4), _mm256_permute2f128_ps(xsum2, xsum6, 1 | (3<<4)));
+   xsum3 = _mm256_add_ps(_mm256_permute2f128_ps(xsum3, xsum7, 2<<4), _mm256_permute2f128_ps(xsum3, xsum7, 1 | (3<<4)));
+   /* Compute [0 1 4 5] [2 3 6 7] */
+   xsum0 = _mm256_hadd_ps(xsum0, xsum1);
+   xsum1 = _mm256_hadd_ps(xsum2, xsum3);
+   /* Compute [0 1 2 3 4 5 6 7] */
+   xsum0 = _mm256_hadd_ps(xsum0, xsum1);
+   _mm256_storeu_ps(sum, xsum0);
+}
+
+void celt_pitch_xcorr_avx2(const float *_x, const float *_y, float *xcorr, int len, int max_pitch, int arch)
+{
+   int i;
+   celt_assert(max_pitch>0);
+   (void)arch;
+   for (i=0;i<max_pitch-7;i+=8)
+   {
+      xcorr_kernel_avx(_x, _y+i, &xcorr[i], len);
+   }
+   for (;i<max_pitch;i++)
+   {
+      xcorr[i] = celt_inner_prod(_x, _y+i, len, arch);
+   }
+}
+
+#endif
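For reference, xcorr_kernel_avx() above computes eight lagged inner products at once; the masked loads in the tail block zero out lanes past len, so the final partial iteration contributes nothing spurious. A scalar model of what the kernel computes (a sketch for clarity, not part of the patch):

    /* Scalar model of xcorr_kernel_avx(): sum[k] is the inner product of
       x[0..len-1] with y shifted by k samples, for lags k = 0..7. */
    static void xcorr_kernel_scalar(const float *x, const float *y,
                                    float sum[8], int len)
    {
        int i, k;
        for (k = 0; k < 8; k++)
            sum[k] = 0.0f;
        for (i = 0; i < len; i++)
            for (k = 0; k < 8; k++)
                sum[k] += x[i] * y[i + k];
    }

celt_pitch_xcorr_avx2() then calls the kernel once per block of eight lags and falls back to celt_inner_prod() for any remaining lags.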
diff --git a/media/libopus/celt/x86/pitch_sse.h b/media/libopus/celt/x86/pitch_sse.h
index 964aef50db..127581f3e1 100644
--- a/media/libopus/celt/x86/pitch_sse.h
+++ b/media/libopus/celt/x86/pitch_sse.h
@@ -131,12 +131,6 @@ extern opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
 
 #if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)
 
-#define OVERRIDE_DUAL_INNER_PROD
-#define OVERRIDE_COMB_FILTER_CONST
-
-#undef dual_inner_prod
-#undef comb_filter_const
-
 void dual_inner_prod_sse(const opus_val16 *x,
     const opus_val16 *y01,
     const opus_val16 *y02,
@@ -154,13 +148,17 @@ void comb_filter_const_sse(opus_val32 *y,
 
 #if defined(OPUS_X86_PRESUME_SSE)
 
+#define OVERRIDE_DUAL_INNER_PROD
+#define OVERRIDE_COMB_FILTER_CONST
 # define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) \
  ((void)(arch),dual_inner_prod_sse(x, y01, y02, N, xy1, xy2))
 
 # define comb_filter_const(y, x, T, N, g10, g11, g12, arch) \
  ((void)(arch),comb_filter_const_sse(y, x, T, N, g10, g11, g12))
 
-#else
+#elif defined(OPUS_HAVE_RTCD)
 
+#define OVERRIDE_DUAL_INNER_PROD
+#define OVERRIDE_COMB_FILTER_CONST
 extern void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
     const opus_val16 *x,
     const opus_val16 *y01,
@@ -187,6 +185,32 @@ extern void (*const COMB_FILTER_CONST_IMPL[OPUS_ARCHMASK + 1])(
 
 #define NON_STATIC_COMB_FILTER_CONST_C
 
 #endif
-#endif
+
+void celt_pitch_xcorr_avx2(const float *_x, const float *_y, float *xcorr, int len, int max_pitch, int arch);
+
+#if defined(OPUS_X86_PRESUME_AVX2)
+
+#define OVERRIDE_PITCH_XCORR
+# define celt_pitch_xcorr celt_pitch_xcorr_avx2
+
+#elif defined(OPUS_HAVE_RTCD) && defined(OPUS_X86_MAY_HAVE_AVX2)
+
+#define OVERRIDE_PITCH_XCORR
+extern void (*const PITCH_XCORR_IMPL[OPUS_ARCHMASK + 1])(
+    const float *_x,
+    const float *_y,
+    float *xcorr,
+    int len,
+    int max_pitch,
+    int arch
+    );
+
+#define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
+    ((*PITCH_XCORR_IMPL[(arch) & OPUS_ARCHMASK])(_x, _y, xcorr, len, max_pitch, arch))
+
+
+#endif /* OPUS_X86_PRESUME_AVX2 && !OPUS_HAVE_RTCD */
+
+#endif /* OPUS_X86_MAY_HAVE_SSE && !FIXED_POINT */
 
 #endif
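PITCH_XCORR_IMPL follows Opus's run-time CPU dispatch (RTCD) convention: when the target CPU cannot be presumed at build time, calls go through a per-architecture table of function pointers indexed by arch & OPUS_ARCHMASK. A stripped-down sketch of the mechanism, using hypothetical names and a simplified four-level arch scale:

    typedef void (*xcorr_fn)(const float *x, const float *y, float *out, int len);

    /* Hypothetical implementations; only the dispatch shape matters here. */
    static void xcorr_c(const float *x, const float *y, float *out, int len)
    { int i; for (i = 0; i < len; i++) out[i] = x[i] * y[i]; }
    static void xcorr_avx2(const float *x, const float *y, float *out, int len)
    { xcorr_c(x, y, out, len); /* pretend this is the AVX2 version */ }

    /* Hypothetical arch scale; a selector like opus_select_arch()
       would return one level at startup. */
    #define MY_ARCHMASK 3   /* levels 0..3, AVX2 only at level 3 */

    static xcorr_fn const XCORR_IMPL[MY_ARCHMASK + 1] = {
        xcorr_c,    /* level 0: portable C */
        xcorr_c,    /* level 1: SSE builds keep the C pitch xcorr */
        xcorr_c,    /* level 2: SSE4.1 likewise */
        xcorr_avx2  /* level 3: AVX2 */
    };

    /* The macro hides the table lookup behind a plain call. */
    #define xcorr(x, y, out, len, arch) \
        ((*XCORR_IMPL[(arch) & MY_ARCHMASK])(x, y, out, len))

The OVERRIDE_* defines also move inside the PRESUME/RTCD branches here, so a build with neither a presumed CPU nor RTCD falls back cleanly to the C implementations.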
diff --git a/media/libopus/celt/x86/vq_sse.h b/media/libopus/celt/x86/vq_sse.h
index b4efe8f249..444503b630 100644
--- a/media/libopus/celt/x86/vq_sse.h
+++ b/media/libopus/celt/x86/vq_sse.h
@@ -28,16 +28,18 @@
 #define VQ_SSE_H
 
 #if defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(FIXED_POINT)
-#define OVERRIDE_OP_PVQ_SEARCH
 
 opus_val16 op_pvq_search_sse2(celt_norm *_X, int *iy, int K, int N, int arch);
 
 #if defined(OPUS_X86_PRESUME_SSE2)
+
+#define OVERRIDE_OP_PVQ_SEARCH
 #define op_pvq_search(x, iy, K, N, arch) \
     (op_pvq_search_sse2(x, iy, K, N, arch))
 
-#else
+#elif defined(OPUS_HAVE_RTCD)
 
+#define OVERRIDE_OP_PVQ_SEARCH
 extern opus_val16 (*const OP_PVQ_SEARCH_IMPL[OPUS_ARCHMASK + 1])(
     celt_norm *_X, int *iy, int K, int N, int arch);
diff --git a/media/libopus/celt/x86/vq_sse2.c b/media/libopus/celt/x86/vq_sse2.c
index 775042860d..4c4ebf8e2d 100644
--- a/media/libopus/celt/x86/vq_sse2.c
+++ b/media/libopus/celt/x86/vq_sse2.c
@@ -75,7 +75,7 @@ opus_val16 op_pvq_search_sse2(celt_norm *_X, int *iy, int K, int N, int arch)
       sums = _mm_add_ps(sums, x4);
       /* Clear y and iy in case we don't do the projection. */
       _mm_storeu_ps(&y[j], _mm_setzero_ps());
-      _mm_storeu_si128((__m128i*)&iy[j], _mm_setzero_si128());
+      _mm_storeu_si128((__m128i*)(void*)&iy[j], _mm_setzero_si128());
       _mm_storeu_ps(&X[j], x4);
       _mm_storeu_ps(&signy[j], s4);
    }
@@ -116,7 +116,7 @@ opus_val16 op_pvq_search_sse2(celt_norm *_X, int *iy, int K, int N, int arch)
       rx4 = _mm_mul_ps(x4, rcp4);
       iy4 = _mm_cvttps_epi32(rx4);
       pulses_sum = _mm_add_epi32(pulses_sum, iy4);
-      _mm_storeu_si128((__m128i*)&iy[j], iy4);
+      _mm_storeu_si128((__m128i*)(void*)&iy[j], iy4);
       y4 = _mm_cvtepi32_ps(iy4);
       xy4 = _mm_add_ps(xy4, _mm_mul_ps(x4, y4));
       yy4 = _mm_add_ps(yy4, _mm_mul_ps(y4, y4));
@@ -205,10 +205,10 @@ opus_val16 op_pvq_search_sse2(celt_norm *_X, int *iy, int K, int N, int arch)
    {
       __m128i y4;
      	__m128i s4;
-      y4 = _mm_loadu_si128((__m128i*)&iy[j]);
+      y4 = _mm_loadu_si128((__m128i*)(void*)&iy[j]);
       s4 = _mm_castps_si128(_mm_loadu_ps(&signy[j]));
       y4 = _mm_xor_si128(_mm_add_epi32(y4, s4), s4);
-      _mm_storeu_si128((__m128i*)&iy[j], y4);
+      _mm_storeu_si128((__m128i*)(void*)&iy[j], y4);
    }
    RESTORE_STACK;
    return yy;
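The vq_sse2.c changes are cosmetic to the generated code: inserting (void*) between int* and __m128i* silences cast-alignment diagnostics (e.g. -Wcast-align) without changing behavior, since _mm_loadu_si128/_mm_storeu_si128 carry no alignment requirement anyway. The pattern in isolation (a minimal sketch):

    #include <emmintrin.h>

    /* Unaligned 4 x int load/store; the (void*) hop keeps -Wcast-align quiet. */
    static __m128i load4i(const int *p)
    {
        return _mm_loadu_si128((const __m128i *)(const void *)p);
    }

    static void store4i(int *p, __m128i v)
    {
        _mm_storeu_si128((__m128i *)(void *)p, v);
    }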
diff --git a/media/libopus/celt/x86/x86_arch_macros.h b/media/libopus/celt/x86/x86_arch_macros.h
new file mode 100644
index 0000000000..975b443e93
--- /dev/null
+++ b/media/libopus/celt/x86/x86_arch_macros.h
@@ -0,0 +1,47 @@
+/* Copyright (c) 2023 Amazon */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef _MSC_VER
+
+# ifdef OPUS_X86_MAY_HAVE_SSE
+#  ifndef __SSE__
+#   define __SSE__
+#  endif
+# endif
+
+# ifdef OPUS_X86_MAY_HAVE_SSE2
+#  ifndef __SSE2__
+#   define __SSE2__
+#  endif
+# endif
+
+# ifdef OPUS_X86_MAY_HAVE_SSE4_1
+#  ifndef __SSE4_1__
+#   define __SSE4_1__
+#  endif
+# endif
+
+#endif
diff --git a/media/libopus/celt/x86/x86_celt_map.c b/media/libopus/celt/x86/x86_celt_map.c
index d39d88edec..ba8eafe6ad 100644
--- a/media/libopus/celt/x86/x86_celt_map.c
+++ b/media/libopus/celt/x86/x86_celt_map.c
@@ -90,6 +90,26 @@ opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
 
 # else
 
+#if defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(OPUS_X86_PRESUME_AVX2)
+
+void (*const PITCH_XCORR_IMPL[OPUS_ARCHMASK + 1])(
+    const float *_x,
+    const float *_y,
+    float *xcorr,
+    int len,
+    int max_pitch,
+    int arch
+) = {
+  celt_pitch_xcorr_c,                /* non-sse */
+  celt_pitch_xcorr_c,
+  celt_pitch_xcorr_c,
+  celt_pitch_xcorr_c,
+  MAY_HAVE_AVX2(celt_pitch_xcorr)
+};
+
+#endif
+
+
 #if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)
 
 void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
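The new x86_arch_macros.h lets MSVC builds reuse the GCC-style __SSE__/__SSE2__/__SSE4_1__ guards by defining them from the OPUS_X86_MAY_HAVE_* configuration, since MSVC does not define them itself. The MAY_HAVE_AVX2() entry in the dispatch table above is plain token pasting, defined in x86cpu.h (below): it appends either _avx2 or _c, so the last slot resolves at compile time. The expansion in isolation:

    /* From x86cpu.h below: pick the suffix at compile time. */
    #if defined(OPUS_X86_MAY_HAVE_AVX2)
    # define MAY_HAVE_AVX2(name) name ## _avx2
    #else
    # define MAY_HAVE_AVX2(name) name ## _c
    #endif

    /* MAY_HAVE_AVX2(celt_pitch_xcorr) therefore expands to
       celt_pitch_xcorr_avx2 or celt_pitch_xcorr_c. */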
diff --git a/media/libopus/celt/x86/x86cpu.c b/media/libopus/celt/x86/x86cpu.c
index 6a1914dee7..2e7c32aeec 100644
--- a/media/libopus/celt/x86/x86cpu.c
+++ b/media/libopus/celt/x86/x86cpu.c
@@ -39,7 +39,7 @@
   ((defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \
   (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \
   (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \
-  (defined(OPUS_X86_MAY_HAVE_AVX) && !defined(OPUS_X86_PRESUME_AVX)))
+  (defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(OPUS_X86_PRESUME_AVX2)))
 
 #if defined(_MSC_VER)
 
@@ -105,7 +105,7 @@ typedef struct CPU_Feature{
     int HW_SSE2;
     int HW_SSE41;
     /* SIMD: 256-bit */
-    int HW_AVX;
+    int HW_AVX2;
 } CPU_Feature;
 
 static void opus_cpu_feature_check(CPU_Feature *cpu_feature)
@@ -121,13 +121,19 @@ static void opus_cpu_feature_check(CPU_Feature *cpu_feature)
         cpu_feature->HW_SSE = (info[3] & (1 << 25)) != 0;
         cpu_feature->HW_SSE2 = (info[3] & (1 << 26)) != 0;
         cpu_feature->HW_SSE41 = (info[2] & (1 << 19)) != 0;
-        cpu_feature->HW_AVX = (info[2] & (1 << 28)) != 0;
+        cpu_feature->HW_AVX2 = (info[2] & (1 << 28)) != 0 && (info[2] & (1 << 12)) != 0;
+        if (cpu_feature->HW_AVX2 && nIds >= 7) {
+            cpuid(info, 7);
+            cpu_feature->HW_AVX2 = cpu_feature->HW_AVX2 && (info[1] & (1 << 5)) != 0;
+        } else {
+            cpu_feature->HW_AVX2 = 0;
+        }
     }
     else {
         cpu_feature->HW_SSE = 0;
         cpu_feature->HW_SSE2 = 0;
         cpu_feature->HW_SSE41 = 0;
-        cpu_feature->HW_AVX = 0;
+        cpu_feature->HW_AVX2 = 0;
     }
 }
 
@@ -157,7 +163,7 @@ static int opus_select_arch_impl(void)
     }
     arch++;
 
-    if (!cpu_feature.HW_AVX)
+    if (!cpu_feature.HW_AVX2)
     {
         return arch;
     }
diff --git a/media/libopus/celt/x86/x86cpu.h b/media/libopus/celt/x86/x86cpu.h
index 04e80489b1..8ae9be8d8f 100644
--- a/media/libopus/celt/x86/x86cpu.h
+++ b/media/libopus/celt/x86/x86cpu.h
@@ -46,28 +46,53 @@
 #  define MAY_HAVE_SSE4_1(name) name ## _c
 # endif
 
-# if defined(OPUS_X86_MAY_HAVE_AVX)
-#  define MAY_HAVE_AVX(name) name ## _avx
+# if defined(OPUS_X86_MAY_HAVE_AVX2)
+#  define MAY_HAVE_AVX2(name) name ## _avx2
 # else
-#  define MAY_HAVE_AVX(name) name ## _c
+#  define MAY_HAVE_AVX2(name) name ## _c
 # endif
 
-# if defined(OPUS_HAVE_RTCD)
+# if defined(OPUS_HAVE_RTCD) && \
+  ((defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \
+  (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \
+  (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \
+  (defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(OPUS_X86_PRESUME_AVX2)))
 int opus_select_arch(void);
 # endif
 
+# if defined(OPUS_X86_MAY_HAVE_SSE2)
+#  include "opus_defines.h"
+
 /*MOVD should not impose any alignment restrictions, but the C standard does,
    and UBSan will report errors if we actually make unaligned accesses.
    Use this to work around those restrictions (which should hopefully all get
-   optimized to a single MOVD instruction).*/
-#define OP_LOADU_EPI32(x) \
- (int)((*(unsigned char *)(x) | *((unsigned char *)(x) + 1) << 8U | \
- *((unsigned char *)(x) + 2) << 16U | (opus_uint32)*((unsigned char *)(x) + 3) << 24U))
+   optimized to a single MOVD instruction).
+   GCC implemented _mm_loadu_si32() since GCC 11; HOWEVER, there is a bug!
+   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99754 */
+# if !defined(_MSC_VER) && !OPUS_GNUC_PREREQ(11,3) && !(defined(__clang__) && (__clang_major__ >= 8))
+#  include <string.h>
+#  include <emmintrin.h>
+
+#  ifdef _mm_loadu_si32
+#   undef _mm_loadu_si32
+#  endif
+#  define _mm_loadu_si32 WORKAROUND_mm_loadu_si32
+static inline __m128i WORKAROUND_mm_loadu_si32(void const* mem_addr) {
+   int val;
+   memcpy(&val, mem_addr, sizeof(val));
+   return _mm_cvtsi32_si128(val);
+}
+# elif defined(_MSC_VER)
+   /* MSVC needs this for _mm_loadu_si32 */
+#  include <immintrin.h>
+# endif
 
-#define OP_CVTEPI8_EPI32_M32(x) \
- (_mm_cvtepi8_epi32(_mm_cvtsi32_si128(OP_LOADU_EPI32(x))))
+#  define OP_CVTEPI8_EPI32_M32(x) \
+ (_mm_cvtepi8_epi32(_mm_loadu_si32(x)))
 
-#define OP_CVTEPI16_EPI32_M64(x) \
- (_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(x))))
+#  define OP_CVTEPI16_EPI32_M64(x) \
+ (_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(void*)(x))))
+
+# endif
 
 #endif
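The detection logic now gates HW_AVX2 on three CPUID bits: AVX (leaf 1, ECX bit 28), FMA (leaf 1, ECX bit 12), and AVX2 (leaf 7, EBX bit 5). The new kernel uses fused multiply-add, so FMA must be present alongside AVX2. The x86cpu.h hunk also retires the byte-by-byte OP_LOADU_EPI32 in favor of _mm_loadu_si32, with a memcpy-based fallback where the intrinsic is missing or buggy (GCC before 11.3, per the linked bug). On GCC or clang an equivalent feature probe could be written with compiler builtins; a sketch of that alternative, not what the patch uses (it sticks to raw cpuid so the check also works under MSVC):

    #include <stdio.h>

    /* Equivalent AVX2+FMA probe using GCC/clang builtins. */
    static int have_avx2_fma(void)
    {
    #if defined(__GNUC__) || defined(__clang__)
        __builtin_cpu_init();
        return __builtin_cpu_supports("avx2") && __builtin_cpu_supports("fma");
    #else
        return 0;   /* other compilers: fall back to a cpuid path */
    #endif
    }

    int main(void)
    {
        printf("AVX2+FMA: %s\n", have_avx2_fma() ? "yes" : "no");
        return 0;
    }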