summaryrefslogtreecommitdiffstats
path: root/media/libopus/celt/x86
diff options
context:
space:
mode:
Diffstat (limited to 'media/libopus/celt/x86')
-rw-r--r--media/libopus/celt/x86/celt_lpc_sse4_1.c13
-rw-r--r--media/libopus/celt/x86/pitch_avx.c101
-rw-r--r--media/libopus/celt/x86/pitch_sse.h40
-rw-r--r--media/libopus/celt/x86/vq_sse.h6
-rw-r--r--media/libopus/celt/x86/vq_sse2.c8
-rw-r--r--media/libopus/celt/x86/x86_arch_macros.h47
-rw-r--r--media/libopus/celt/x86/x86_celt_map.c20
-rw-r--r--media/libopus/celt/x86/x86cpu.c16
-rw-r--r--media/libopus/celt/x86/x86cpu.h49
9 files changed, 266 insertions, 34 deletions
diff --git a/media/libopus/celt/x86/celt_lpc_sse4_1.c b/media/libopus/celt/x86/celt_lpc_sse4_1.c
index 5478568849..daf59d245a 100644
--- a/media/libopus/celt/x86/celt_lpc_sse4_1.c
+++ b/media/libopus/celt/x86/celt_lpc_sse4_1.c
@@ -64,9 +64,16 @@ void celt_fir_sse4_1(const opus_val16 *x,
{
opus_val32 sums[4] = {0};
__m128i vecSum, vecX;
-
- xcorr_kernel(rnum, x+i-ord, sums, ord, arch);
-
+#if defined(OPUS_CHECK_ASM)
+ {
+ opus_val32 sums_c[4] = {0};
+ xcorr_kernel_c(rnum, x+i-ord, sums_c, ord);
+#endif
+ xcorr_kernel(rnum, x+i-ord, sums, ord, arch);
+#if defined(OPUS_CHECK_ASM)
+ celt_assert(memcmp(sums, sums_c, sizeof(sums)) == 0);
+ }
+#endif
vecSum = _mm_loadu_si128((__m128i *)sums);
vecSum = _mm_add_epi32(vecSum, vecNoA);
vecSum = _mm_srai_epi32(vecSum, SIG_SHIFT);
diff --git a/media/libopus/celt/x86/pitch_avx.c b/media/libopus/celt/x86/pitch_avx.c
new file mode 100644
index 0000000000..f731762d84
--- /dev/null
+++ b/media/libopus/celt/x86/pitch_avx.c
@@ -0,0 +1,101 @@
+/* Copyright (c) 2023 Amazon */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+
+#include <immintrin.h>
+#include "x86cpu.h"
+#include "pitch.h"
+
+#if defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(FIXED_POINT)
+
+/* Like the "regular" xcorr_kernel(), but computes 8 results at a time. */
+static void xcorr_kernel_avx(const float *x, const float *y, float sum[8], int len)
+{
+ __m256 xsum0, xsum1, xsum2, xsum3, xsum4, xsum5, xsum6, xsum7;
+ xsum7 = xsum6 = xsum5 = xsum4 = xsum3 = xsum2 = xsum1 = xsum0 = _mm256_setzero_ps();
+ int i;
+ __m256 x0;
+ /* Compute 8 inner products using partial sums. */
+ for (i=0;i<len-7;i+=8)
+ {
+ x0 = _mm256_loadu_ps(x+i);
+ xsum0 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i ), xsum0);
+ xsum1 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+1), xsum1);
+ xsum2 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+2), xsum2);
+ xsum3 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+3), xsum3);
+ xsum4 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+4), xsum4);
+ xsum5 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+5), xsum5);
+ xsum6 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+6), xsum6);
+ xsum7 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+7), xsum7);
+ }
+ if (i != len) {
+ static const int mask[15] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0};
+ __m256i m;
+ m = _mm256_loadu_si256((__m256i*)(void*)(mask + 7+i-len));
+ x0 = _mm256_maskload_ps(x+i, m);
+ xsum0 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i , m), xsum0);
+ xsum1 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+1, m), xsum1);
+ xsum2 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+2, m), xsum2);
+ xsum3 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+3, m), xsum3);
+ xsum4 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+4, m), xsum4);
+ xsum5 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+5, m), xsum5);
+ xsum6 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+6, m), xsum6);
+ xsum7 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+7, m), xsum7);
+ }
+ /* 8 horizontal adds. */
+ /* Compute [0 4] [1 5] [2 6] [3 7] */
+ xsum0 = _mm256_add_ps(_mm256_permute2f128_ps(xsum0, xsum4, 2<<4), _mm256_permute2f128_ps(xsum0, xsum4, 1 | (3<<4)));
+ xsum1 = _mm256_add_ps(_mm256_permute2f128_ps(xsum1, xsum5, 2<<4), _mm256_permute2f128_ps(xsum1, xsum5, 1 | (3<<4)));
+ xsum2 = _mm256_add_ps(_mm256_permute2f128_ps(xsum2, xsum6, 2<<4), _mm256_permute2f128_ps(xsum2, xsum6, 1 | (3<<4)));
+ xsum3 = _mm256_add_ps(_mm256_permute2f128_ps(xsum3, xsum7, 2<<4), _mm256_permute2f128_ps(xsum3, xsum7, 1 | (3<<4)));
+ /* Compute [0 1 4 5] [2 3 6 7] */
+ xsum0 = _mm256_hadd_ps(xsum0, xsum1);
+ xsum1 = _mm256_hadd_ps(xsum2, xsum3);
+ /* Compute [0 1 2 3 4 5 6 7] */
+ xsum0 = _mm256_hadd_ps(xsum0, xsum1);
+ _mm256_storeu_ps(sum, xsum0);
+}
+
+void celt_pitch_xcorr_avx2(const float *_x, const float *_y, float *xcorr, int len, int max_pitch, int arch)
+{
+ int i;
+ celt_assert(max_pitch>0);
+ (void)arch;
+ for (i=0;i<max_pitch-7;i+=8)
+ {
+ xcorr_kernel_avx(_x, _y+i, &xcorr[i], len);
+ }
+ for (;i<max_pitch;i++)
+ {
+ xcorr[i] = celt_inner_prod(_x, _y+i, len, arch);
+ }
+}
+
+#endif
diff --git a/media/libopus/celt/x86/pitch_sse.h b/media/libopus/celt/x86/pitch_sse.h
index 964aef50db..127581f3e1 100644
--- a/media/libopus/celt/x86/pitch_sse.h
+++ b/media/libopus/celt/x86/pitch_sse.h
@@ -131,12 +131,6 @@ extern opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)
-#define OVERRIDE_DUAL_INNER_PROD
-#define OVERRIDE_COMB_FILTER_CONST
-
-#undef dual_inner_prod
-#undef comb_filter_const
-
void dual_inner_prod_sse(const opus_val16 *x,
const opus_val16 *y01,
const opus_val16 *y02,
@@ -154,13 +148,17 @@ void comb_filter_const_sse(opus_val32 *y,
#if defined(OPUS_X86_PRESUME_SSE)
+#define OVERRIDE_DUAL_INNER_PROD
+#define OVERRIDE_COMB_FILTER_CONST
# define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) \
((void)(arch),dual_inner_prod_sse(x, y01, y02, N, xy1, xy2))
# define comb_filter_const(y, x, T, N, g10, g11, g12, arch) \
((void)(arch),comb_filter_const_sse(y, x, T, N, g10, g11, g12))
-#else
+#elif defined(OPUS_HAVE_RTCD)
+#define OVERRIDE_DUAL_INNER_PROD
+#define OVERRIDE_COMB_FILTER_CONST
extern void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
const opus_val16 *x,
const opus_val16 *y01,
@@ -187,6 +185,32 @@ extern void (*const COMB_FILTER_CONST_IMPL[OPUS_ARCHMASK + 1])(
#define NON_STATIC_COMB_FILTER_CONST_C
#endif
-#endif
+
+void celt_pitch_xcorr_avx2(const float *_x, const float *_y, float *xcorr, int len, int max_pitch, int arch);
+
+#if defined(OPUS_X86_PRESUME_AVX2)
+
+#define OVERRIDE_PITCH_XCORR
+# define celt_pitch_xcorr celt_pitch_xcorr_avx2
+
+#elif defined(OPUS_HAVE_RTCD) && defined(OPUS_X86_MAY_HAVE_AVX2)
+
+#define OVERRIDE_PITCH_XCORR
+extern void (*const PITCH_XCORR_IMPL[OPUS_ARCHMASK + 1])(
+ const float *_x,
+ const float *_y,
+ float *xcorr,
+ int len,
+ int max_pitch,
+ int arch
+ );
+
+#define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
+ ((*PITCH_XCORR_IMPL[(arch) & OPUS_ARCHMASK])(_x, _y, xcorr, len, max_pitch, arch))
+
+
+#endif /* OPUS_X86_PRESUME_AVX2 && !OPUS_HAVE_RTCD */
+
+#endif /* OPUS_X86_MAY_HAVE_SSE && !FIXED_POINT */
#endif
diff --git a/media/libopus/celt/x86/vq_sse.h b/media/libopus/celt/x86/vq_sse.h
index b4efe8f249..444503b630 100644
--- a/media/libopus/celt/x86/vq_sse.h
+++ b/media/libopus/celt/x86/vq_sse.h
@@ -28,16 +28,18 @@
#define VQ_SSE_H
#if defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(FIXED_POINT)
-#define OVERRIDE_OP_PVQ_SEARCH
opus_val16 op_pvq_search_sse2(celt_norm *_X, int *iy, int K, int N, int arch);
#if defined(OPUS_X86_PRESUME_SSE2)
+
+#define OVERRIDE_OP_PVQ_SEARCH
#define op_pvq_search(x, iy, K, N, arch) \
(op_pvq_search_sse2(x, iy, K, N, arch))
-#else
+#elif defined(OPUS_HAVE_RTCD)
+#define OVERRIDE_OP_PVQ_SEARCH
extern opus_val16 (*const OP_PVQ_SEARCH_IMPL[OPUS_ARCHMASK + 1])(
celt_norm *_X, int *iy, int K, int N, int arch);
diff --git a/media/libopus/celt/x86/vq_sse2.c b/media/libopus/celt/x86/vq_sse2.c
index 775042860d..4c4ebf8e2d 100644
--- a/media/libopus/celt/x86/vq_sse2.c
+++ b/media/libopus/celt/x86/vq_sse2.c
@@ -75,7 +75,7 @@ opus_val16 op_pvq_search_sse2(celt_norm *_X, int *iy, int K, int N, int arch)
sums = _mm_add_ps(sums, x4);
/* Clear y and iy in case we don't do the projection. */
_mm_storeu_ps(&y[j], _mm_setzero_ps());
- _mm_storeu_si128((__m128i*)&iy[j], _mm_setzero_si128());
+ _mm_storeu_si128((__m128i*)(void*)&iy[j], _mm_setzero_si128());
_mm_storeu_ps(&X[j], x4);
_mm_storeu_ps(&signy[j], s4);
}
@@ -116,7 +116,7 @@ opus_val16 op_pvq_search_sse2(celt_norm *_X, int *iy, int K, int N, int arch)
rx4 = _mm_mul_ps(x4, rcp4);
iy4 = _mm_cvttps_epi32(rx4);
pulses_sum = _mm_add_epi32(pulses_sum, iy4);
- _mm_storeu_si128((__m128i*)&iy[j], iy4);
+ _mm_storeu_si128((__m128i*)(void*)&iy[j], iy4);
y4 = _mm_cvtepi32_ps(iy4);
xy4 = _mm_add_ps(xy4, _mm_mul_ps(x4, y4));
yy4 = _mm_add_ps(yy4, _mm_mul_ps(y4, y4));
@@ -205,10 +205,10 @@ opus_val16 op_pvq_search_sse2(celt_norm *_X, int *iy, int K, int N, int arch)
{
__m128i y4;
__m128i s4;
- y4 = _mm_loadu_si128((__m128i*)&iy[j]);
+ y4 = _mm_loadu_si128((__m128i*)(void*)&iy[j]);
s4 = _mm_castps_si128(_mm_loadu_ps(&signy[j]));
y4 = _mm_xor_si128(_mm_add_epi32(y4, s4), s4);
- _mm_storeu_si128((__m128i*)&iy[j], y4);
+ _mm_storeu_si128((__m128i*)(void*)&iy[j], y4);
}
RESTORE_STACK;
return yy;
diff --git a/media/libopus/celt/x86/x86_arch_macros.h b/media/libopus/celt/x86/x86_arch_macros.h
new file mode 100644
index 0000000000..975b443e93
--- /dev/null
+++ b/media/libopus/celt/x86/x86_arch_macros.h
@@ -0,0 +1,47 @@
+/* Copyright (c) 2023 Amazon */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef _MSC_VER
+
+# ifdef OPUS_X86_MAY_HAVE_SSE
+# ifndef __SSE__
+# define __SSE__
+# endif
+# endif
+
+# ifdef OPUS_X86_MAY_HAVE_SSE2
+# ifndef __SSE2__
+# define __SSE2__
+# endif
+# endif
+
+# ifdef OPUS_X86_MAY_HAVE_SSE4_1
+# ifndef __SSE4_1__
+# define __SSE4_1__
+# endif
+# endif
+
+#endif
diff --git a/media/libopus/celt/x86/x86_celt_map.c b/media/libopus/celt/x86/x86_celt_map.c
index d39d88edec..ba8eafe6ad 100644
--- a/media/libopus/celt/x86/x86_celt_map.c
+++ b/media/libopus/celt/x86/x86_celt_map.c
@@ -90,6 +90,26 @@ opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
# else
+#if defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(OPUS_X86_PRESUME_AVX2)
+
+void (*const PITCH_XCORR_IMPL[OPUS_ARCHMASK + 1])(
+ const float *_x,
+ const float *_y,
+ float *xcorr,
+ int len,
+ int max_pitch,
+ int arch
+) = {
+ celt_pitch_xcorr_c, /* non-sse */
+ celt_pitch_xcorr_c,
+ celt_pitch_xcorr_c,
+ celt_pitch_xcorr_c,
+ MAY_HAVE_AVX2(celt_pitch_xcorr)
+};
+
+#endif
+
+
#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)
void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
diff --git a/media/libopus/celt/x86/x86cpu.c b/media/libopus/celt/x86/x86cpu.c
index 6a1914dee7..2e7c32aeec 100644
--- a/media/libopus/celt/x86/x86cpu.c
+++ b/media/libopus/celt/x86/x86cpu.c
@@ -39,7 +39,7 @@
((defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \
(defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \
(defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \
- (defined(OPUS_X86_MAY_HAVE_AVX) && !defined(OPUS_X86_PRESUME_AVX)))
+ (defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(OPUS_X86_PRESUME_AVX2)))
#if defined(_MSC_VER)
@@ -105,7 +105,7 @@ typedef struct CPU_Feature{
int HW_SSE2;
int HW_SSE41;
/* SIMD: 256-bit */
- int HW_AVX;
+ int HW_AVX2;
} CPU_Feature;
static void opus_cpu_feature_check(CPU_Feature *cpu_feature)
@@ -121,13 +121,19 @@ static void opus_cpu_feature_check(CPU_Feature *cpu_feature)
cpu_feature->HW_SSE = (info[3] & (1 << 25)) != 0;
cpu_feature->HW_SSE2 = (info[3] & (1 << 26)) != 0;
cpu_feature->HW_SSE41 = (info[2] & (1 << 19)) != 0;
- cpu_feature->HW_AVX = (info[2] & (1 << 28)) != 0;
+ cpu_feature->HW_AVX2 = (info[2] & (1 << 28)) != 0 && (info[2] & (1 << 12)) != 0;
+ if (cpu_feature->HW_AVX2 && nIds >= 7) {
+ cpuid(info, 7);
+ cpu_feature->HW_AVX2 = cpu_feature->HW_AVX2 && (info[1] & (1 << 5)) != 0;
+ } else {
+ cpu_feature->HW_AVX2 = 0;
+ }
}
else {
cpu_feature->HW_SSE = 0;
cpu_feature->HW_SSE2 = 0;
cpu_feature->HW_SSE41 = 0;
- cpu_feature->HW_AVX = 0;
+ cpu_feature->HW_AVX2 = 0;
}
}
@@ -157,7 +163,7 @@ static int opus_select_arch_impl(void)
}
arch++;
- if (!cpu_feature.HW_AVX)
+ if (!cpu_feature.HW_AVX2)
{
return arch;
}
diff --git a/media/libopus/celt/x86/x86cpu.h b/media/libopus/celt/x86/x86cpu.h
index 04e80489b1..8ae9be8d8f 100644
--- a/media/libopus/celt/x86/x86cpu.h
+++ b/media/libopus/celt/x86/x86cpu.h
@@ -46,28 +46,53 @@
# define MAY_HAVE_SSE4_1(name) name ## _c
# endif
-# if defined(OPUS_X86_MAY_HAVE_AVX)
-# define MAY_HAVE_AVX(name) name ## _avx
+# if defined(OPUS_X86_MAY_HAVE_AVX2)
+# define MAY_HAVE_AVX2(name) name ## _avx2
# else
-# define MAY_HAVE_AVX(name) name ## _c
+# define MAY_HAVE_AVX2(name) name ## _c
# endif
-# if defined(OPUS_HAVE_RTCD)
+# if defined(OPUS_HAVE_RTCD) && \
+ ((defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \
+ (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \
+ (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \
+ (defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(OPUS_X86_PRESUME_AVX2)))
int opus_select_arch(void);
# endif
+# if defined(OPUS_X86_MAY_HAVE_SSE2)
+# include "opus_defines.h"
+
/*MOVD should not impose any alignment restrictions, but the C standard does,
and UBSan will report errors if we actually make unaligned accesses.
Use this to work around those restrictions (which should hopefully all get
- optimized to a single MOVD instruction).*/
-#define OP_LOADU_EPI32(x) \
- (int)((*(unsigned char *)(x) | *((unsigned char *)(x) + 1) << 8U |\
- *((unsigned char *)(x) + 2) << 16U | (opus_uint32)*((unsigned char *)(x) + 3) << 24U))
+ optimized to a single MOVD instruction).
+ GCC implemented _mm_loadu_si32() since GCC 11; HOWEVER, there is a bug!
+ https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99754 */
+# if !defined(_MSC_VER) && !OPUS_GNUC_PREREQ(11,3) && !(defined(__clang__) && (__clang_major__ >= 8))
+# include <string.h>
+# include <emmintrin.h>
+
+# ifdef _mm_loadu_si32
+# undef _mm_loadu_si32
+# endif
+# define _mm_loadu_si32 WORKAROUND_mm_loadu_si32
+static inline __m128i WORKAROUND_mm_loadu_si32(void const* mem_addr) {
+ int val;
+ memcpy(&val, mem_addr, sizeof(val));
+ return _mm_cvtsi32_si128(val);
+}
+# elif defined(_MSC_VER)
+ /* MSVC needs this for _mm_loadu_si32 */
+# include <immintrin.h>
+# endif
-#define OP_CVTEPI8_EPI32_M32(x) \
- (_mm_cvtepi8_epi32(_mm_cvtsi32_si128(OP_LOADU_EPI32(x))))
+# define OP_CVTEPI8_EPI32_M32(x) \
+ (_mm_cvtepi8_epi32(_mm_loadu_si32(x)))
-#define OP_CVTEPI16_EPI32_M64(x) \
- (_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(x))))
+# define OP_CVTEPI16_EPI32_M64(x) \
+ (_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(void*)(x))))
+
+# endif
#endif