diff options
Diffstat (limited to 'test cases/common/147 simd')
-rw-r--r-- | test cases/common/147 simd/fallback.c | 8 | ||||
-rw-r--r-- | test cases/common/147 simd/include/simdheader.h | 3 | ||||
-rw-r--r-- | test cases/common/147 simd/meson.build | 43 | ||||
-rw-r--r-- | test cases/common/147 simd/simd_avx.c | 49 | ||||
-rw-r--r-- | test cases/common/147 simd/simd_avx2.c | 42 | ||||
-rw-r--r-- | test cases/common/147 simd/simd_mmx.c | 67 | ||||
-rw-r--r-- | test cases/common/147 simd/simd_neon.c | 20 | ||||
-rw-r--r-- | test cases/common/147 simd/simd_sse.c | 29 | ||||
-rw-r--r-- | test cases/common/147 simd/simd_sse2.c | 36 | ||||
-rw-r--r-- | test cases/common/147 simd/simd_sse3.c | 38 | ||||
-rw-r--r-- | test cases/common/147 simd/simd_sse41.c | 40 | ||||
-rw-r--r-- | test cases/common/147 simd/simd_sse42.c | 43 | ||||
-rw-r--r-- | test cases/common/147 simd/simd_ssse3.c | 48 | ||||
-rw-r--r-- | test cases/common/147 simd/simdchecker.c | 143 | ||||
-rw-r--r-- | test cases/common/147 simd/simdfuncs.h | 75 |
15 files changed, 684 insertions, 0 deletions
diff --git a/test cases/common/147 simd/fallback.c b/test cases/common/147 simd/fallback.c new file mode 100644 index 0000000..ab435f4 --- /dev/null +++ b/test cases/common/147 simd/fallback.c @@ -0,0 +1,8 @@ +#include<simdfuncs.h> + +void increment_fallback(float arr[4]) { + int i; + for(i=0; i<4; i++) { + arr[i]++; + } +} diff --git a/test cases/common/147 simd/include/simdheader.h b/test cases/common/147 simd/include/simdheader.h new file mode 100644 index 0000000..6515e41 --- /dev/null +++ b/test cases/common/147 simd/include/simdheader.h @@ -0,0 +1,3 @@ +#pragma once + +#define I_CAN_HAZ_SIMD diff --git a/test cases/common/147 simd/meson.build b/test cases/common/147 simd/meson.build new file mode 100644 index 0000000..101fdb8 --- /dev/null +++ b/test cases/common/147 simd/meson.build @@ -0,0 +1,43 @@ +project('simd', 'c') + +simd = import('unstable-simd') + +cc = meson.get_compiler('c') + +cdata = configuration_data() + +if not meson.is_cross_build() and host_machine.cpu_family() == 'arm' and cc.get_id() == 'clang' + message('Adding -march=armv7 because assuming that this build happens on Raspbian.') + message('Its Clang seems to be misconfigured and does not support NEON by default.') + add_project_arguments('-march=armv7', language : 'c') +endif + +if cc.get_id() == 'msvc' and cc.version().version_compare('<17') + error('MESON_SKIP_TEST VS2010 produces broken binaries on x86.') +endif + +# FIXME add [a, b] = function() +rval = simd.check('mysimds', + mmx : 'simd_mmx.c', + sse : 'simd_sse.c', + sse2 : 'simd_sse2.c', + sse3 : 'simd_sse3.c', + ssse3 : 'simd_ssse3.c', + sse41 : 'simd_sse41.c', + sse42 : 'simd_sse42.c', + avx : 'simd_avx.c', + avx2 : 'simd_avx2.c', + neon : 'simd_neon.c', + compiler : cc, + include_directories : include_directories('include')) + +simdlibs = rval[0] +cdata.merge_from(rval[1]) + +configure_file(output : 'simdconfig.h', + configuration : cdata) + +p = executable('simdtest', 'simdchecker.c', 'fallback.c', + link_with : simdlibs) + +test('simdtest', p) diff --git a/test cases/common/147 simd/simd_avx.c b/test cases/common/147 simd/simd_avx.c new file mode 100644 index 0000000..5f45a4e --- /dev/null +++ b/test cases/common/147 simd/simd_avx.c @@ -0,0 +1,49 @@ +#include<simdheader.h> + +#ifndef I_CAN_HAZ_SIMD +#error The correct internal header was not used +#endif + +#include<simdconfig.h> +#include<simdfuncs.h> +#include<stdint.h> + +#ifdef _MSC_VER +#include<intrin.h> +int avx_available(void) { + return 1; +} +#else +#include<immintrin.h> +#include<cpuid.h> + +#ifdef __APPLE__ +/* + * Apple ships a broken __builtin_cpu_supports and + * some machines in the CI farm seem to be too + * old to have AVX so just always return 0 here. + */ +int avx_available(void) { return 0; } +#else + +int avx_available(void) { + return __builtin_cpu_supports("avx"); +} +#endif +#endif + +void increment_avx(float arr[4]) { + double darr[4]; + darr[0] = arr[0]; + darr[1] = arr[1]; + darr[2] = arr[2]; + darr[3] = arr[3]; + __m256d val = _mm256_loadu_pd(darr); + __m256d one = _mm256_set1_pd(1.0); + __m256d result = _mm256_add_pd(val, one); + _mm256_storeu_pd(darr, result); + arr[0] = (float)darr[0]; + arr[1] = (float)darr[1]; + arr[2] = (float)darr[2]; + arr[3] = (float)darr[3]; +} diff --git a/test cases/common/147 simd/simd_avx2.c b/test cases/common/147 simd/simd_avx2.c new file mode 100644 index 0000000..c79819b --- /dev/null +++ b/test cases/common/147 simd/simd_avx2.c @@ -0,0 +1,42 @@ +#include<simdconfig.h> +#include<simdfuncs.h> +#include<stdint.h> + +/* + * FIXME add proper runtime detection for VS. + */ + +#ifdef _MSC_VER +#include<intrin.h> +int avx2_available(void) { + return 0; +} +#else +#include<immintrin.h> +#include<cpuid.h> + +#if defined(__APPLE__) +int avx2_available(void) { return 0; } +#else +int avx2_available(void) { + return __builtin_cpu_supports("avx2"); +} +#endif +#endif + +void increment_avx2(float arr[4]) { + double darr[4]; + darr[0] = arr[0]; + darr[1] = arr[1]; + darr[2] = arr[2]; + darr[3] = arr[3]; + __m256d val = _mm256_loadu_pd(darr); + __m256d one = _mm256_set1_pd(1.0); + __m256d result = _mm256_add_pd(val, one); + _mm256_storeu_pd(darr, result); + one = _mm256_permute4x64_pd(one, 66); /* A no-op, just here to use AVX2. */ + arr[0] = (float)darr[0]; + arr[1] = (float)darr[1]; + arr[2] = (float)darr[2]; + arr[3] = (float)darr[3]; +} diff --git a/test cases/common/147 simd/simd_mmx.c b/test cases/common/147 simd/simd_mmx.c new file mode 100644 index 0000000..7605442 --- /dev/null +++ b/test cases/common/147 simd/simd_mmx.c @@ -0,0 +1,67 @@ +#include<simdconfig.h> +#include<simdfuncs.h> + +#include<stdint.h> + +#ifdef _MSC_VER +#include<intrin.h> +int mmx_available(void) { + return 1; +} +/* Contrary to MSDN documentation, MMX intrinsics + * just plain don't work. + */ +void increment_mmx(float arr[4]) { + arr[0]++; + arr[1]++; + arr[2]++; + arr[3]++; +} +#elif defined(__MINGW32__) +int mmx_available(void) { + return 1; +} +/* MinGW does not seem to ship with MMX or it is broken. + */ +void increment_mmx(float arr[4]) { + arr[0]++; + arr[1]++; + arr[2]++; + arr[3]++; +} +#else +#include<mmintrin.h> +#include<cpuid.h> + +#if defined(__APPLE__) +int mmx_available(void) { return 1; } +#else +int mmx_available(void) { + return __builtin_cpu_supports("mmx"); +} +#endif +void increment_mmx(float arr[4]) { + /* Super ugly but we know that values in arr are always small + * enough to fit in int16; + */ + int i; + __m64 packed = _mm_set_pi16(arr[3], arr[2], arr[1], arr[0]); + __m64 incr = _mm_set1_pi16(1); + __m64 result = _mm_add_pi16(packed, incr); + /* Should be + * int64_t unpacker = _m_to_int64(result); + * but it does not exist on 32 bit platforms for some reason. + */ + int64_t unpacker = (int64_t)(result); + _mm_empty(); + for(i=0; i<4; i++) { + /* This fails on GCC 8 when optimizations are enabled. + * Disable it. Patches welcome to fix this. + arr[i] = (float)(unpacker & ((1<<16)-1)); + unpacker >>= 16; + */ + arr[i] += 1.0f; + } +} + +#endif diff --git a/test cases/common/147 simd/simd_neon.c b/test cases/common/147 simd/simd_neon.c new file mode 100644 index 0000000..2834b30 --- /dev/null +++ b/test cases/common/147 simd/simd_neon.c @@ -0,0 +1,20 @@ +#include<simdconfig.h> +#include<simdfuncs.h> + +#include<arm_neon.h> +#include<stdint.h> + +int neon_available(void) { + return 1; /* Incorrect, but I don't know how to check this properly. */ +} + +void increment_neon(float arr[4]) { + float32x2_t a1, a2, one; + a1 = vld1_f32(arr); + a2 = vld1_f32(&arr[2]); + one = vdup_n_f32(1.0); + a1 = vadd_f32(a1, one); + a2 = vadd_f32(a2, one); + vst1_f32(arr, a1); + vst1_f32(&arr[2], a2); +} diff --git a/test cases/common/147 simd/simd_sse.c b/test cases/common/147 simd/simd_sse.c new file mode 100644 index 0000000..6014e0c --- /dev/null +++ b/test cases/common/147 simd/simd_sse.c @@ -0,0 +1,29 @@ +#include<simdconfig.h> +#include<simdfuncs.h> + +#ifdef _MSC_VER +#include<intrin.h> +int sse_available(void) { + return 1; +} +#else + +#include<xmmintrin.h> +#include<cpuid.h> +#include<stdint.h> + +#if defined(__APPLE__) +int sse_available(void) { return 1; } +#else +int sse_available(void) { + return __builtin_cpu_supports("sse"); +} +#endif +#endif + +void increment_sse(float arr[4]) { + __m128 val = _mm_load_ps(arr); + __m128 one = _mm_set_ps1(1.0); + __m128 result = _mm_add_ps(val, one); + _mm_storeu_ps(arr, result); +} diff --git a/test cases/common/147 simd/simd_sse2.c b/test cases/common/147 simd/simd_sse2.c new file mode 100644 index 0000000..3a57a5b --- /dev/null +++ b/test cases/common/147 simd/simd_sse2.c @@ -0,0 +1,36 @@ +#include<simdconfig.h> +#include<simdfuncs.h> +#include<emmintrin.h> + +#ifdef _MSC_VER +int sse2_available(void) { + return 1; +} + +#else +#include<cpuid.h> +#include<stdint.h> + +#if defined(__APPLE__) +int sse2_available(void) { return 1; } +#else +int sse2_available(void) { + return __builtin_cpu_supports("sse2"); +} +#endif +#endif + +void increment_sse2(float arr[4]) { + ALIGN_16 double darr[4]; + __m128d val1 = _mm_set_pd(arr[0], arr[1]); + __m128d val2 = _mm_set_pd(arr[2], arr[3]); + __m128d one = _mm_set_pd(1.0, 1.0); + __m128d result = _mm_add_pd(val1, one); + _mm_store_pd(darr, result); + result = _mm_add_pd(val2, one); + _mm_store_pd(&darr[2], result); + arr[0] = (float)darr[1]; + arr[1] = (float)darr[0]; + arr[2] = (float)darr[3]; + arr[3] = (float)darr[2]; +} diff --git a/test cases/common/147 simd/simd_sse3.c b/test cases/common/147 simd/simd_sse3.c new file mode 100644 index 0000000..29a35e6 --- /dev/null +++ b/test cases/common/147 simd/simd_sse3.c @@ -0,0 +1,38 @@ +#include<simdconfig.h> +#include<simdfuncs.h> + +#ifdef _MSC_VER +#include<intrin.h> +int sse3_available(void) { + return 1; +} +#else + +#include<pmmintrin.h> +#include<cpuid.h> +#include<stdint.h> + +#if defined(__APPLE__) +int sse3_available(void) { return 1; } +#else +int sse3_available(void) { + return __builtin_cpu_supports("sse3"); +} +#endif +#endif + +void increment_sse3(float arr[4]) { + ALIGN_16 double darr[4]; + __m128d val1 = _mm_set_pd(arr[0], arr[1]); + __m128d val2 = _mm_set_pd(arr[2], arr[3]); + __m128d one = _mm_set_pd(1.0, 1.0); + __m128d result = _mm_add_pd(val1, one); + _mm_store_pd(darr, result); + result = _mm_add_pd(val2, one); + _mm_store_pd(&darr[2], result); + result = _mm_hadd_pd(val1, val2); /* This does nothing. Only here so we use an SSE3 instruction. */ + arr[0] = (float)darr[1]; + arr[1] = (float)darr[0]; + arr[2] = (float)darr[3]; + arr[3] = (float)darr[2]; +} diff --git a/test cases/common/147 simd/simd_sse41.c b/test cases/common/147 simd/simd_sse41.c new file mode 100644 index 0000000..29f2555 --- /dev/null +++ b/test cases/common/147 simd/simd_sse41.c @@ -0,0 +1,40 @@ +#include<simdconfig.h> +#include<simdfuncs.h> + +#include<stdint.h> + +#ifdef _MSC_VER +#include<intrin.h> + +int sse41_available(void) { + return 1; +} + +#else +#include<smmintrin.h> +#include<cpuid.h> + +#if defined(__APPLE__) +int sse41_available(void) { return 1; } +#else +int sse41_available(void) { + return __builtin_cpu_supports("sse4.1"); +} +#endif +#endif + +void increment_sse41(float arr[4]) { + ALIGN_16 double darr[4]; + __m128d val1 = _mm_set_pd(arr[0], arr[1]); + __m128d val2 = _mm_set_pd(arr[2], arr[3]); + __m128d one = _mm_set_pd(1.0, 1.0); + __m128d result = _mm_add_pd(val1, one); + result = _mm_ceil_pd(result); /* A no-op, only here to use a SSE4.1 intrinsic. */ + _mm_store_pd(darr, result); + result = _mm_add_pd(val2, one); + _mm_store_pd(&darr[2], result); + arr[0] = (float)darr[1]; + arr[1] = (float)darr[0]; + arr[2] = (float)darr[3]; + arr[3] = (float)darr[2]; +} diff --git a/test cases/common/147 simd/simd_sse42.c b/test cases/common/147 simd/simd_sse42.c new file mode 100644 index 0000000..f1564e2 --- /dev/null +++ b/test cases/common/147 simd/simd_sse42.c @@ -0,0 +1,43 @@ +#include<simdconfig.h> +#include<simdfuncs.h> +#include<stdint.h> + +#ifdef _MSC_VER +#include<intrin.h> + +int sse42_available(void) { + return 1; +} + +#else + +#include<nmmintrin.h> +#include<cpuid.h> + +#ifdef __APPLE__ +int sse42_available(void) { + return 1; +} +#else +int sse42_available(void) { + return __builtin_cpu_supports("sse4.2"); +} +#endif + +#endif + +void increment_sse42(float arr[4]) { + ALIGN_16 double darr[4]; + __m128d val1 = _mm_set_pd(arr[0], arr[1]); + __m128d val2 = _mm_set_pd(arr[2], arr[3]); + __m128d one = _mm_set_pd(1.0, 1.0); + __m128d result = _mm_add_pd(val1, one); + _mm_store_pd(darr, result); + result = _mm_add_pd(val2, one); + _mm_store_pd(&darr[2], result); + _mm_crc32_u32(42, 99); /* A no-op, only here to use an SSE4.2 instruction. */ + arr[0] = (float)darr[1]; + arr[1] = (float)darr[0]; + arr[2] = (float)darr[3]; + arr[3] = (float)darr[2]; +} diff --git a/test cases/common/147 simd/simd_ssse3.c b/test cases/common/147 simd/simd_ssse3.c new file mode 100644 index 0000000..fa557f4 --- /dev/null +++ b/test cases/common/147 simd/simd_ssse3.c @@ -0,0 +1,48 @@ +#include<simdconfig.h> +#include<simdfuncs.h> + +#include<emmintrin.h> +#include<tmmintrin.h> + +#ifdef _MSC_VER +#include<intrin.h> + +int ssse3_available(void) { + return 1; +} + +#else + +#include<cpuid.h> +#include<stdint.h> + +int ssse3_available(void) { +#ifdef __APPLE__ + return 1; +#elif defined(__clang__) + /* https://github.com/numpy/numpy/issues/8130 */ + return __builtin_cpu_supports("sse4.1"); +#else + return __builtin_cpu_supports("ssse3"); +#endif +} + +#endif + +void increment_ssse3(float arr[4]) { + ALIGN_16 double darr[4]; + __m128d val1 = _mm_set_pd(arr[0], arr[1]); + __m128d val2 = _mm_set_pd(arr[2], arr[3]); + __m128d one = _mm_set_pd(1.0, 1.0); + __m128d result = _mm_add_pd(val1, one); + __m128i tmp1, tmp2; + tmp1 = tmp2 = _mm_set1_epi16(0); + _mm_store_pd(darr, result); + result = _mm_add_pd(val2, one); + _mm_store_pd(&darr[2], result); + tmp1 = _mm_hadd_epi32(tmp1, tmp2); /* This does nothing. Only here so we use an SSSE3 instruction. */ + arr[0] = (float)darr[1]; + arr[1] = (float)darr[0]; + arr[2] = (float)darr[3]; + arr[3] = (float)darr[2]; +} diff --git a/test cases/common/147 simd/simdchecker.c b/test cases/common/147 simd/simdchecker.c new file mode 100644 index 0000000..c7a0a97 --- /dev/null +++ b/test cases/common/147 simd/simdchecker.c @@ -0,0 +1,143 @@ +#include<simdfuncs.h> +#include<stdio.h> +#include<string.h> + +typedef void (*simd_func)(float*); + +int check_simd_implementation(float *four, + const float *four_initial, + const char *simd_type, + const float *expected, + simd_func fptr, + const int blocksize) { + int rv = 0; + memcpy(four, four_initial, blocksize*sizeof(float)); + printf("Using %s.\n", simd_type); + fptr(four); + for(int i=0; i<blocksize; i++) { + if(four[i] != expected[i]) { + printf("Increment function failed, got %f expected %f.\n", four[i], expected[i]); + rv = 1; + } + } + return rv; +} + +int main(void) { + static const float four_initial[4] = {2.0, 3.0, 4.0, 5.0}; + ALIGN_16 float four[4]; + const float expected[4] = {3.0, 4.0, 5.0, 6.0}; + int r=0; + const int blocksize = 4; + +/* + * Test all implementations that the current CPU supports. + */ +#if HAVE_NEON + if(neon_available()) { + r += check_simd_implementation(four, + four_initial, + "NEON", + expected, + increment_neon, + blocksize); + } +#endif +#if HAVE_AVX2 + if(avx2_available()) { + r += check_simd_implementation(four, + four_initial, + "AVX2", + expected, + increment_avx2, + blocksize); + } +#endif +#if HAVE_AVX + if(avx_available()) { + r += check_simd_implementation(four, + four_initial, + "AVC", + expected, + increment_avx, + blocksize); + } +#endif +#if HAVE_SSE42 + if(sse42_available()) { + r += check_simd_implementation(four, + four_initial, + "SSR42", + expected, + increment_sse42, + blocksize); + } +#endif +#if HAVE_SSE41 + if(sse41_available()) { + r += check_simd_implementation(four, + four_initial, + "SSE41", + expected, + increment_sse41, + blocksize); + } +#endif +#if HAVE_SSSE3 + if(ssse3_available()) { + r += check_simd_implementation(four, + four_initial, + "SSSE3", + expected, + increment_ssse3, + blocksize); + } +#endif +#if HAVE_SSE3 + if(sse3_available()) { + r += check_simd_implementation(four, + four_initial, + "SSE3", + expected, + increment_sse3, + blocksize); + } +#endif +#if HAVE_SSE2 + if(sse2_available()) { + r += check_simd_implementation(four, + four_initial, + "SSE2", + expected, + increment_sse2, + blocksize); + } +#endif +#if HAVE_SSE + if(sse_available()) { + r += check_simd_implementation(four, + four_initial, + "SSE", + expected, + increment_sse, + blocksize); + } +#endif +#if HAVE_MMX + if(mmx_available()) { + r += check_simd_implementation(four, + four_initial, + "MMX", + expected, + increment_mmx, + blocksize); + } +#endif + r += check_simd_implementation(four, + four_initial, + "fallback", + expected, + increment_fallback, + blocksize); + return r; +} diff --git a/test cases/common/147 simd/simdfuncs.h b/test cases/common/147 simd/simdfuncs.h new file mode 100644 index 0000000..d820f25 --- /dev/null +++ b/test cases/common/147 simd/simdfuncs.h @@ -0,0 +1,75 @@ +#pragma once + +#include<simdconfig.h> + +#ifdef _MSC_VER +#define ALIGN_16 __declspec(align(16)) +#else +#include<stdalign.h> +#define ALIGN_16 alignas(16) +#endif + + +/* Yes, I do know that arr[4] decays into a pointer + * as a function argument. Don't do this in real code + * but for this test it is ok. + */ + +void increment_fallback(float arr[4]); + +#if HAVE_MMX +int mmx_available(void); +void increment_mmx(float arr[4]); +#endif + +#if HAVE_SSE +int sse_available(void); +void increment_sse(float arr[4]); +#endif + +#if HAVE_SSE2 +int sse2_available(void); +void increment_sse2(float arr[4]); +#endif + +#if HAVE_SSE3 +int sse3_available(void); +void increment_sse3(float arr[4]); +#endif + +#if HAVE_SSSE3 +int ssse3_available(void); +void increment_ssse3(float arr[4]); +#endif + +#if HAVE_SSE41 +int sse41_available(void); +void increment_sse41(float arr[4]); +#endif + +#if HAVE_SSE42 +int sse42_available(void); +void increment_sse42(float arr[4]); +#endif + +#if HAVE_AVX +int avx_available(void); +void increment_avx(float arr[4]); +#endif + +#if HAVE_AVX2 +int avx2_available(void); +void increment_avx2(float arr[4]); +#endif + +#if HAVE_NEON +int neon_available(void); +void increment_neon(float arr[4]); +#endif + +#if HAVE_ALTIVEC +int altivec_available(void); +void increment_altivec(float arr[4]); +#endif + +/* And so on. */ |