9 files changed, 266 insertions, 34 deletions
diff --git a/media/libopus/celt/x86/celt_lpc_sse4_1.c b/media/libopus/celt/x86/celt_lpc_sse4_1.c
index 5478568849..daf59d245a 100644
--- a/media/libopus/celt/x86/celt_lpc_sse4_1.c
+++ b/media/libopus/celt/x86/celt_lpc_sse4_1.c
@@ -64,9 +64,16 @@ void celt_fir_sse4_1(const opus_val16 *x,
    {
       opus_val32 sums[4] = {0};
       __m128i vecSum, vecX;
-
-      xcorr_kernel(rnum, x+i-ord, sums, ord, arch);
-
+#if defined(OPUS_CHECK_ASM)
+      {
+         opus_val32 sums_c[4] = {0};
+         xcorr_kernel_c(rnum, x+i-ord, sums_c, ord);
+#endif
+         xcorr_kernel(rnum, x+i-ord, sums, ord, arch);
+#if defined(OPUS_CHECK_ASM)
+         celt_assert(memcmp(sums, sums_c, sizeof(sums)) == 0);
+      }
+#endif
       vecSum = _mm_loadu_si128((__m128i *)sums);
       vecSum = _mm_add_epi32(vecSum, vecNoA);
       vecSum = _mm_srai_epi32(vecSum, SIG_SHIFT);
diff --git a/media/libopus/celt/x86/pitch_avx.c b/media/libopus/celt/x86/pitch_avx.c
new file mode 100644
index 0000000000..f731762d84
--- /dev/null
+++ b/media/libopus/celt/x86/pitch_avx.c
@@ -0,0 +1,101 @@
+/* Copyright (c) 2023 Amazon */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+
+#include <immintrin.h>
+#include "x86cpu.h"
+#include "pitch.h"
+
+#if defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(FIXED_POINT)
+
+/* Like the "regular" xcorr_kernel(), but computes 8 results at a time. */
+static void xcorr_kernel_avx(const float *x, const float *y, float sum[8], int len)
+{
+    __m256 xsum0, xsum1, xsum2, xsum3, xsum4, xsum5, xsum6, xsum7;
+    xsum7 = xsum6 = xsum5 = xsum4 = xsum3 = xsum2 = xsum1 = xsum0 = _mm256_setzero_ps();
+    int i;
+    __m256 x0;
+    /* Compute 8 inner products using partial sums. */
+    for (i=0;i<len-7;i+=8)
+    {
+        x0 = _mm256_loadu_ps(x+i);
+        xsum0 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i  ), xsum0);
+        xsum1 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+1), xsum1);
+        xsum2 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+2), xsum2);
+        xsum3 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+3), xsum3);
+        xsum4 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+4), xsum4);
+        xsum5 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+5), xsum5);
+        xsum6 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+6), xsum6);
+        xsum7 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+7), xsum7);
+    }
+    if (i != len) {
+        static const int mask[15] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0};
+        __m256i m;
+        m = _mm256_loadu_si256((__m256i*)(void*)(mask + 7+i-len));
+        x0 = _mm256_maskload_ps(x+i, m);
+        xsum0 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i  , m), xsum0);
+        xsum1 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+1, m), xsum1);
+        xsum2 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+2, m), xsum2);
+        xsum3 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+3, m), xsum3);
+        xsum4 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+4, m), xsum4);
+        xsum5 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+5, m), xsum5);
+        xsum6 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+6, m), xsum6);
+        xsum7 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+7, m), xsum7);
+    }
+    /* 8 horizontal adds. */
+    /* Compute [0 4] [1 5] [2 6] [3 7] */
+    xsum0 = _mm256_add_ps(_mm256_permute2f128_ps(xsum0, xsum4, 2<<4), _mm256_permute2f128_ps(xsum0, xsum4, 1 | (3<<4)));
+    xsum1 = _mm256_add_ps(_mm256_permute2f128_ps(xsum1, xsum5, 2<<4), _mm256_permute2f128_ps(xsum1, xsum5, 1 | (3<<4)));
+    xsum2 = _mm256_add_ps(_mm256_permute2f128_ps(xsum2, xsum6, 2<<4), _mm256_permute2f128_ps(xsum2, xsum6, 1 | (3<<4)));
+    xsum3 = _mm256_add_ps(_mm256_permute2f128_ps(xsum3, xsum7, 2<<4), _mm256_permute2f128_ps(xsum3, xsum7, 1 | (3<<4)));
+    /* Compute [0 1 4 5] [2 3 6 7] */
+    xsum0 = _mm256_hadd_ps(xsum0, xsum1);
+    xsum1 = _mm256_hadd_ps(xsum2, xsum3);
+    /* Compute [0 1 2 3 4 5 6 7] */
+    xsum0 = _mm256_hadd_ps(xsum0, xsum1);
+    _mm256_storeu_ps(sum, xsum0);
+}
+
+void celt_pitch_xcorr_avx2(const float *_x, const float *_y, float *xcorr, int len, int max_pitch, int arch)
+{
+   int i;
+   celt_assert(max_pitch>0);
+   (void)arch;
+   for (i=0;i<max_pitch-7;i+=8)
+   {
+      xcorr_kernel_avx(_x, _y+i, &xcorr[i], len);
+   }
+   for (;i<max_pitch;i++)
+   {
+      xcorr[i] = celt_inner_prod(_x, _y+i, len, arch);
+   }
+}
+
+#endif
diff --git a/media/libopus/celt/x86/pitch_sse.h b/media/libopus/celt/x86/pitch_sse.h
index 964aef50db..127581f3e1 100644
--- a/media/libopus/celt/x86/pitch_sse.h
+++ b/media/libopus/celt/x86/pitch_sse.h
@@ -131,12 +131,6 @@ extern opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
 
 #if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)
 
-#define OVERRIDE_DUAL_INNER_PROD
-#define OVERRIDE_COMB_FILTER_CONST
-
-#undef dual_inner_prod
-#undef comb_filter_const
-
 void dual_inner_prod_sse(const opus_val16 *x,
     const opus_val16 *y01,
     const opus_val16 *y02,
@@ -154,13 +148,17 @@ void comb_filter_const_sse(opus_val32 *y,
 
 
 #if defined(OPUS_X86_PRESUME_SSE)
+#define OVERRIDE_DUAL_INNER_PROD
+#define OVERRIDE_COMB_FILTER_CONST
 # define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) \
     ((void)(arch),dual_inner_prod_sse(x, y01, y02, N, xy1, xy2))
 
 # define comb_filter_const(y, x, T, N, g10, g11, g12, arch) \
     ((void)(arch),comb_filter_const_sse(y, x, T, N, g10, g11, g12))
-#else
+#elif defined(OPUS_HAVE_RTCD)
 
+#define OVERRIDE_DUAL_INNER_PROD
+#define OVERRIDE_COMB_FILTER_CONST
 extern void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
               const opus_val16 *x,
               const opus_val16 *y01,
@@ -187,6 +185,32 @@ extern void (*const COMB_FILTER_CONST_IMPL[OPUS_ARCHMASK + 1])(
 #define NON_STATIC_COMB_FILTER_CONST_C
 
 #endif
-#endif
+
+void celt_pitch_xcorr_avx2(const float *_x, const float *_y, float *xcorr, int len, int max_pitch, int arch);
+
+#if defined(OPUS_X86_PRESUME_AVX2)
+
+#define OVERRIDE_PITCH_XCORR
+# define celt_pitch_xcorr celt_pitch_xcorr_avx2
+
+#elif defined(OPUS_HAVE_RTCD) && defined(OPUS_X86_MAY_HAVE_AVX2)
+
+#define OVERRIDE_PITCH_XCORR
+extern void (*const PITCH_XCORR_IMPL[OPUS_ARCHMASK + 1])(
+              const float *_x,
+              const float *_y,
+              float *xcorr,
+              int len,
+              int max_pitch,
+              int arch
+              );
+
+#define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
+    ((*PITCH_XCORR_IMPL[(arch) & OPUS_ARCHMASK])(_x, _y, xcorr, len, max_pitch, arch))
+
+
+#endif /* OPUS_X86_PRESUME_AVX2 && !OPUS_HAVE_RTCD */
+
+#endif /* OPUS_X86_MAY_HAVE_SSE && !FIXED_POINT */
 
 #endif
diff --git a/media/libopus/celt/x86/vq_sse.h b/media/libopus/celt/x86/vq_sse.h
index b4efe8f249..444503b630 100644
--- a/media/libopus/celt/x86/vq_sse.h
+++ b/media/libopus/celt/x86/vq_sse.h
@@ -28,16 +28,18 @@
 #define VQ_SSE_H
 
 #if defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(FIXED_POINT)
-#define OVERRIDE_OP_PVQ_SEARCH
 
 opus_val16 op_pvq_search_sse2(celt_norm *_X, int *iy, int K, int N, int arch);
 
 #if defined(OPUS_X86_PRESUME_SSE2)
+
+#define OVERRIDE_OP_PVQ_SEARCH
 #define op_pvq_search(x, iy, K, N, arch) \
     (op_pvq_search_sse2(x, iy, K, N, arch))
 
-#else
+#elif defined(OPUS_HAVE_RTCD)
 
+#define OVERRIDE_OP_PVQ_SEARCH
 extern opus_val16 (*const OP_PVQ_SEARCH_IMPL[OPUS_ARCHMASK + 1])(
       celt_norm *_X, int *iy, int K, int N, int arch);
 
diff --git a/media/libopus/celt/x86/vq_sse2.c b/media/libopus/celt/x86/vq_sse2.c
index 775042860d..4c4ebf8e2d 100644
--- a/media/libopus/celt/x86/vq_sse2.c
+++ b/media/libopus/celt/x86/vq_sse2.c
@@ -75,7 +75,7 @@ opus_val16 op_pvq_search_sse2(celt_norm *_X, int *iy, int K, int N, int arch)
       sums = _mm_add_ps(sums, x4);
       /* Clear y and iy in case we don't do the projection. */
       _mm_storeu_ps(&y[j], _mm_setzero_ps());
-      _mm_storeu_si128((__m128i*)&iy[j], _mm_setzero_si128());
+      _mm_storeu_si128((__m128i*)(void*)&iy[j], _mm_setzero_si128());
       _mm_storeu_ps(&X[j], x4);
       _mm_storeu_ps(&signy[j], s4);
    }
@@ -116,7 +116,7 @@ opus_val16 op_pvq_search_sse2(celt_norm *_X, int *iy, int K, int N, int arch)
          rx4 = _mm_mul_ps(x4, rcp4);
          iy4 = _mm_cvttps_epi32(rx4);
          pulses_sum = _mm_add_epi32(pulses_sum, iy4);
-         _mm_storeu_si128((__m128i*)&iy[j], iy4);
+         _mm_storeu_si128((__m128i*)(void*)&iy[j], iy4);
          y4 = _mm_cvtepi32_ps(iy4);
          xy4 = _mm_add_ps(xy4, _mm_mul_ps(x4, y4));
          yy4 = _mm_add_ps(yy4, _mm_mul_ps(y4, y4));
@@ -205,10 +205,10 @@ opus_val16 op_pvq_search_sse2(celt_norm *_X, int *iy, int K, int N, int arch)
    {
       __m128i y4;
       __m128i s4;
-      y4 = _mm_loadu_si128((__m128i*)&iy[j]);
+      y4 = _mm_loadu_si128((__m128i*)(void*)&iy[j]);
       s4 = _mm_castps_si128(_mm_loadu_ps(&signy[j]));
       y4 = _mm_xor_si128(_mm_add_epi32(y4, s4), s4);
-      _mm_storeu_si128((__m128i*)&iy[j], y4);
+      _mm_storeu_si128((__m128i*)(void*)&iy[j], y4);
    }
    RESTORE_STACK;
    return yy;
diff --git a/media/libopus/celt/x86/x86_arch_macros.h b/media/libopus/celt/x86/x86_arch_macros.h
new file mode 100644
index 0000000000..975b443e93
--- /dev/null
+++ b/media/libopus/celt/x86/x86_arch_macros.h
@@ -0,0 +1,47 @@
+/* Copyright (c) 2023 Amazon */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef _MSC_VER
+
+# ifdef OPUS_X86_MAY_HAVE_SSE
+#  ifndef __SSE__
+#   define __SSE__
+#  endif
+# endif
+
+# ifdef OPUS_X86_MAY_HAVE_SSE2
+#  ifndef __SSE2__
+#   define __SSE2__
+#  endif
+# endif
+
+# ifdef OPUS_X86_MAY_HAVE_SSE4_1
+#  ifndef __SSE4_1__
+#   define __SSE4_1__
+#  endif
+# endif
+
+#endif
diff --git a/media/libopus/celt/x86/x86_celt_map.c b/media/libopus/celt/x86/x86_celt_map.c
index d39d88edec..ba8eafe6ad 100644
--- a/media/libopus/celt/x86/x86_celt_map.c
+++ b/media/libopus/celt/x86/x86_celt_map.c
@@ -90,6 +90,26 @@ opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
 
 # else
 
+#if defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(OPUS_X86_PRESUME_AVX2)
+
+void (*const PITCH_XCORR_IMPL[OPUS_ARCHMASK + 1])(
+         const float *_x,
+         const float *_y,
+         float *xcorr,
+         int len,
+         int max_pitch,
+         int arch
+) = {
+  celt_pitch_xcorr_c,                /* non-sse */
+  celt_pitch_xcorr_c,
+  celt_pitch_xcorr_c,
+  celt_pitch_xcorr_c,
+  MAY_HAVE_AVX2(celt_pitch_xcorr)
+};
+
+#endif
+
+
 #if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)
 
 void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
diff --git a/media/libopus/celt/x86/x86cpu.c b/media/libopus/celt/x86/x86cpu.c
index 6a1914dee7..2e7c32aeec 100644
--- a/media/libopus/celt/x86/x86cpu.c
+++ b/media/libopus/celt/x86/x86cpu.c
@@ -39,7 +39,7 @@
   ((defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \
   (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \
   (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \
-  (defined(OPUS_X86_MAY_HAVE_AVX) && !defined(OPUS_X86_PRESUME_AVX)))
+  (defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(OPUS_X86_PRESUME_AVX2)))
 
 #if defined(_MSC_VER)
 
@@ -105,7 +105,7 @@ typedef struct CPU_Feature{
     int HW_SSE2;
     int HW_SSE41;
     /*  SIMD: 256-bit */
-    int HW_AVX;
+    int HW_AVX2;
 } CPU_Feature;
 
 static void opus_cpu_feature_check(CPU_Feature *cpu_feature)
@@ -121,13 +121,19 @@ static void opus_cpu_feature_check(CPU_Feature *cpu_feature)
         cpu_feature->HW_SSE = (info[3] & (1 << 25)) != 0;
         cpu_feature->HW_SSE2 = (info[3] & (1 << 26)) != 0;
         cpu_feature->HW_SSE41 = (info[2] & (1 << 19)) != 0;
-        cpu_feature->HW_AVX = (info[2] & (1 << 28)) != 0;
+        cpu_feature->HW_AVX2 = (info[2] & (1 << 28)) != 0 && (info[2] & (1 << 12)) != 0;
+        if (cpu_feature->HW_AVX2 && nIds >= 7) {
+            cpuid(info, 7);
+            cpu_feature->HW_AVX2 = cpu_feature->HW_AVX2 && (info[1] & (1 << 5)) != 0;
+        } else {
+            cpu_feature->HW_AVX2 = 0;
+        }
     }
     else {
         cpu_feature->HW_SSE = 0;
         cpu_feature->HW_SSE2 = 0;
         cpu_feature->HW_SSE41 = 0;
-        cpu_feature->HW_AVX = 0;
+        cpu_feature->HW_AVX2 = 0;
     }
 }
 
@@ -157,7 +163,7 @@ static int opus_select_arch_impl(void)
     }
     arch++;
 
-    if (!cpu_feature.HW_AVX)
+    if (!cpu_feature.HW_AVX2)
     {
         return arch;
     }
diff --git a/media/libopus/celt/x86/x86cpu.h b/media/libopus/celt/x86/x86cpu.h
index 04e80489b1..8ae9be8d8f 100644
--- a/media/libopus/celt/x86/x86cpu.h
+++ b/media/libopus/celt/x86/x86cpu.h
@@ -46,28 +46,53 @@
 #  define MAY_HAVE_SSE4_1(name) name ## _c
 # endif
 
-# if defined(OPUS_X86_MAY_HAVE_AVX)
-#  define MAY_HAVE_AVX(name) name ## _avx
+# if defined(OPUS_X86_MAY_HAVE_AVX2)
+#  define MAY_HAVE_AVX2(name) name ## _avx2
 # else
-#  define MAY_HAVE_AVX(name) name ## _c
+#  define MAY_HAVE_AVX2(name) name ## _c
 # endif
 
-# if defined(OPUS_HAVE_RTCD)
+# if defined(OPUS_HAVE_RTCD) && \
+  ((defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \
+  (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \
+  (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \
+  (defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(OPUS_X86_PRESUME_AVX2)))
 int opus_select_arch(void);
 # endif
 
+# if defined(OPUS_X86_MAY_HAVE_SSE2)
+#  include "opus_defines.h"
+
 /*MOVD should not impose any alignment restrictions, but the C standard does,
    and UBSan will report errors if we actually make unaligned accesses.
   Use this to work around those restrictions (which should hopefully all get
-   optimized to a single MOVD instruction).*/
-#define OP_LOADU_EPI32(x) \
-  (int)((*(unsigned char *)(x) | *((unsigned char *)(x) + 1) << 8U |\
-   *((unsigned char *)(x) + 2) << 16U | (opus_uint32)*((unsigned char *)(x) + 3) << 24U))
+   optimized to a single MOVD instruction).
+  GCC implemented _mm_loadu_si32() since GCC 11; HOWEVER, there is a bug!
+  https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99754 */
+#  if !defined(_MSC_VER) && !OPUS_GNUC_PREREQ(11,3) && !(defined(__clang__) && (__clang_major__ >= 8))
+#   include <string.h>
+#   include <emmintrin.h>
+
+#   ifdef _mm_loadu_si32
+#    undef _mm_loadu_si32
+#   endif
+#   define _mm_loadu_si32 WORKAROUND_mm_loadu_si32
+static inline __m128i WORKAROUND_mm_loadu_si32(void const* mem_addr) {
+  int val;
+  memcpy(&val, mem_addr, sizeof(val));
+  return _mm_cvtsi32_si128(val);
+}
+#  elif defined(_MSC_VER)
+    /* MSVC needs this for _mm_loadu_si32 */
+#   include <immintrin.h>
+#  endif
 
-#define OP_CVTEPI8_EPI32_M32(x) \
- (_mm_cvtepi8_epi32(_mm_cvtsi32_si128(OP_LOADU_EPI32(x))))
+#  define OP_CVTEPI8_EPI32_M32(x) \
+ (_mm_cvtepi8_epi32(_mm_loadu_si32(x)))
 
-#define OP_CVTEPI16_EPI32_M64(x) \
- (_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(x))))
+#  define OP_CVTEPI16_EPI32_M64(x) \
+ (_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(void*)(x))))
+
+# endif
 
 #endif