Diffstat (limited to 'ml/dlib/dlib/simd')
-rw-r--r--   ml/dlib/dlib/simd/simd4f.h     | 685
-rw-r--r--   ml/dlib/dlib/simd/simd4i.h     | 566
-rw-r--r--   ml/dlib/dlib/simd/simd8f.h     | 402
-rw-r--r--   ml/dlib/dlib/simd/simd8i.h     | 339
-rw-r--r--   ml/dlib/dlib/simd/simd_check.h | 177
5 files changed, 2169 insertions, 0 deletions
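
The five headers added below wrap SSE/AVX, VSX, and NEON intrinsics behind small fixed-width vector classes (simd4f, simd4i, simd8f, simd8i) with a plain scalar fallback, plus compile-time and runtime instruction-set detection in simd_check.h. As a minimal, hypothetical sketch of how the 4-wide float wrapper is meant to be used (the include path and the muladd4 helper are illustrative and not part of this change):

    #include <dlib/simd/simd4f.h>   // adjust to wherever the dlib tree sits on the include path

    // Illustrative helper: out[i] = a[i]*b[i] + c[i], processed four floats at a time.
    // Assumes n is a multiple of 4; load()/store() use the unaligned forms, so the
    // buffers do not have to be 16-byte aligned.
    void muladd4(const float* a, const float* b, const float* c, float* out, long n)
    {
        for (long i = 0; i < n; i += 4)
        {
            dlib::simd4f va, vb, vc;
            va.load(a + i);
            vb.load(b + i);
            vc.load(c + i);
            dlib::simd4f r = va*vb + vc;
            r.store(out + i);
        }
    }

The same loop compiles unchanged whether DLIB_HAVE_SSE2, DLIB_HAVE_VSX, DLIB_HAVE_NEON, or none of them is defined; only the implementation behind the operators changes.
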
diff --git a/ml/dlib/dlib/simd/simd4f.h b/ml/dlib/dlib/simd/simd4f.h new file mode 100644 index 000000000..2bfadd23f --- /dev/null +++ b/ml/dlib/dlib/simd/simd4f.h @@ -0,0 +1,685 @@ +// Copyright (C) 2013 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_sIMD4F_Hh_ +#define DLIB_sIMD4F_Hh_ + +#include "simd_check.h" +#include "simd4i.h" +#include <cmath> +#include <iostream> + +namespace dlib +{ + +#ifdef DLIB_HAVE_SSE2 + class simd4f + { + public: + typedef float type; + + inline simd4f() {} + inline simd4f(float f) { x = _mm_set1_ps(f); } + inline simd4f(float r0, float r1, float r2, float r3) { x = _mm_setr_ps(r0,r1,r2,r3); } + inline simd4f(const __m128& val):x(val) {} + inline simd4f(const simd4i& val):x(_mm_cvtepi32_ps(val)) {} + + inline simd4f& operator=(const simd4i& val) + { + x = simd4f(val); + return *this; + } + + inline simd4f& operator=(const float& val) + { + x = simd4f(val); + return *this; + } + + inline simd4f& operator=(const __m128& val) + { + x = val; + return *this; + } + + inline operator __m128() const { return x; } + + // truncate to 32bit integers + inline operator __m128i() const { return _mm_cvttps_epi32(x); } + + inline void load_aligned(const type* ptr) { x = _mm_load_ps(ptr); } + inline void store_aligned(type* ptr) const { _mm_store_ps(ptr, x); } + inline void load(const type* ptr) { x = _mm_loadu_ps(ptr); } + inline void store(type* ptr) const { _mm_storeu_ps(ptr, x); } + + inline unsigned int size() const { return 4; } + inline float operator[](unsigned int idx) const + { + float temp[4]; + store(temp); + return temp[idx]; + } + + private: + __m128 x; + }; + + class simd4f_bool + { + public: + typedef float type; + + inline simd4f_bool() {} + inline simd4f_bool(const __m128& val):x(val) {} + + inline simd4f_bool& operator=(const __m128& val) + { + x = val; + return *this; + } + + inline operator __m128() const { return x; } + + + private: + __m128 x; + }; + +#elif defined(DLIB_HAVE_VSX) + + class simd4f + { + typedef union { + vector float v; + float x[4]; + } v4f; + + v4f x; + + public: + inline simd4f() : x{0,0,0,0} {} + inline simd4f(const simd4f& v) : x(v.x) { } + inline simd4f(const vector float& v) : x{v} { } + + inline simd4f(const simd4i& v) { + x.x[0]=v[0]; x.x[1]=v[1]; x.x[2]=v[2]; x.x[3]=v[3]; + } + + + inline simd4f(float f) : x{f,f,f,f} { } + inline simd4f(float r0, float r1, float r2, float r3) + : x{r0,r1,r2,r3} { } + + inline simd4f& operator=(const simd4f& v) { x = v.x; return *this; } + inline simd4f& operator=(const float& v) { *this = simd4f(v); return *this; } + + inline vector float operator() () const { return x.v; } + inline float operator[](unsigned int idx) const { return x.x[idx]; } + + inline void load_aligned(const float* ptr) { x.v = vec_ld(0, ptr); } + inline void store_aligned(float* ptr) const { vec_st(x.v, 0, ptr); } + inline void load(const float* ptr) { x.v = vec_vsx_ld(0, ptr); } + inline void store(float* ptr) const { vec_vsx_st(x.v, 0, ptr); } + + + // truncate to 32bit integers + inline operator simd4i::rawarray() const + { + simd4i::rawarray temp; + temp.v.x[0] = x.x[0]; + temp.v.x[1] = x.x[1]; + temp.v.x[2] = x.x[2]; + temp.v.x[3] = x.x[3]; + return temp; + } + }; + + typedef simd4i simd4f_bool; + +#elif defined(DLIB_HAVE_NEON) + + class simd4f + { + public: + typedef float type; + + inline simd4f() {} + inline simd4f(float f) { x = vdupq_n_f32(f); } + inline simd4f(float r0, float r1, float r2, float r3) + { + float __attribute__ ((aligned 
(16))) data[4] = { r0, r1, r2, r3 }; + x = vld1q_f32(data); + } + inline simd4f(const float32x4_t& val):x(val) {} + inline simd4f(const simd4i& val):x(vcvtq_f32_s32(val)) {} + + inline simd4f& operator=(const simd4i& val) + { + x = simd4f(val); + return *this; + } + + inline simd4f& operator=(const float& val) + { + x = simd4f(val); + return *this; + } + + inline simd4f& operator=(const float32x4_t& val) + { + x = val; + return *this; + } + + inline operator float32x4_t() const { return x; } + + // truncate to 32bit integers + inline operator int32x4_t() const { return vcvtq_s32_f32(x); } + + inline void load_aligned(const type* ptr) { x = vld1q_f32(ptr); } + inline void store_aligned(type* ptr) const { vst1q_f32(ptr, x); } + inline void load(const type* ptr) { x = vld1q_f32(ptr); } + inline void store(type* ptr) const { vst1q_f32(ptr, x); } + + inline unsigned int size() const { return 4; } + inline float operator[](unsigned int idx) const + { + float temp[4]; + store(temp); + return temp[idx]; + } + + private: + float32x4_t x; + }; + + + typedef simd4i simd4f_bool; +#else + class simd4f + { + public: + typedef float type; + + inline simd4f() {} + inline simd4f(float f) { x[0]=f; x[1]=f; x[2]=f; x[3]=f; } + inline simd4f(float r0, float r1, float r2, float r3) { x[0]=r0; x[1]=r1; x[2]=r2; x[3]=r3;} + inline simd4f(const simd4i& val) { x[0]=val[0]; x[1]=val[1]; x[2]=val[2]; x[3]=val[3];} + + // truncate to 32bit integers + inline operator simd4i::rawarray() const + { + simd4i::rawarray temp; + temp.a[0] = (int32)x[0]; + temp.a[1] = (int32)x[1]; + temp.a[2] = (int32)x[2]; + temp.a[3] = (int32)x[3]; + return temp; + } + + inline simd4f& operator=(const float& val) + { + *this = simd4f(val); + return *this; + } + + inline simd4f& operator=(const simd4i& val) + { + x[0] = val[0]; + x[1] = val[1]; + x[2] = val[2]; + x[3] = val[3]; + return *this; + } + + + inline void load_aligned(const type* ptr) + { + x[0] = ptr[0]; + x[1] = ptr[1]; + x[2] = ptr[2]; + x[3] = ptr[3]; + } + + inline void store_aligned(type* ptr) const + { + ptr[0] = x[0]; + ptr[1] = x[1]; + ptr[2] = x[2]; + ptr[3] = x[3]; + } + + inline void load(const type* ptr) + { + x[0] = ptr[0]; + x[1] = ptr[1]; + x[2] = ptr[2]; + x[3] = ptr[3]; + } + + inline void store(type* ptr) const + { + ptr[0] = x[0]; + ptr[1] = x[1]; + ptr[2] = x[2]; + ptr[3] = x[3]; + } + + inline unsigned int size() const { return 4; } + inline float operator[](unsigned int idx) const { return x[idx]; } + + private: + float x[4]; + }; + + + class simd4f_bool + { + public: + typedef float type; + + inline simd4f_bool() {} + inline simd4f_bool(bool r0, bool r1, bool r2, bool r3) { x[0]=r0; x[1]=r1; x[2]=r2; x[3]=r3;} + + inline bool operator[](unsigned int idx) const { return x[idx]; } + private: + bool x[4]; + }; + +#endif + +// ---------------------------------------------------------------------------------------- + + inline std::ostream& operator<<(std::ostream& out, const simd4f& item) + { + float temp[4]; + item.store(temp); + out << "(" << temp[0] << ", " << temp[1] << ", " << temp[2] << ", " << temp[3] << ")"; + return out; + } + +// ---------------------------------------------------------------------------------------- + + inline simd4f operator+ (const simd4f& lhs, const simd4f& rhs) + { +#ifdef DLIB_HAVE_SSE2 + return _mm_add_ps(lhs, rhs); +#elif defined(DLIB_HAVE_VSX) + return vec_add(lhs(), rhs()); +#elif defined(DLIB_HAVE_NEON) + return vaddq_f32(lhs, rhs); +#else + return simd4f(lhs[0]+rhs[0], + lhs[1]+rhs[1], + lhs[2]+rhs[2], + lhs[3]+rhs[3]); 
+#endif + } + inline simd4f& operator+= (simd4f& lhs, const simd4f& rhs) + { lhs = lhs + rhs; return lhs; } + +// ---------------------------------------------------------------------------------------- + + inline simd4f operator- (const simd4f& lhs, const simd4f& rhs) + { +#ifdef DLIB_HAVE_SSE2 + return _mm_sub_ps(lhs, rhs); +#elif defined(DLIB_HAVE_VSX) + return vec_sub(lhs(), rhs()); +#elif defined(DLIB_HAVE_NEON) + return vsubq_f32(lhs, rhs); +#else + return simd4f(lhs[0]-rhs[0], + lhs[1]-rhs[1], + lhs[2]-rhs[2], + lhs[3]-rhs[3]); +#endif + } + inline simd4f& operator-= (simd4f& lhs, const simd4f& rhs) + { lhs = lhs - rhs; return lhs; } + +// ---------------------------------------------------------------------------------------- + + inline simd4f operator* (const simd4f& lhs, const simd4f& rhs) + { +#ifdef DLIB_HAVE_SSE2 + return _mm_mul_ps(lhs, rhs); +#elif defined(DLIB_HAVE_VSX) + return vec_mul(lhs(), rhs()); +#elif defined(DLIB_HAVE_NEON) + return vmulq_f32(lhs, rhs); +#else + return simd4f(lhs[0]*rhs[0], + lhs[1]*rhs[1], + lhs[2]*rhs[2], + lhs[3]*rhs[3]); +#endif + } + inline simd4f& operator*= (simd4f& lhs, const simd4f& rhs) + { lhs = lhs * rhs; return lhs; } + +// ---------------------------------------------------------------------------------------- + + inline simd4f operator/ (const simd4f& lhs, const simd4f& rhs) + { +#ifdef DLIB_HAVE_SSE2 + return _mm_div_ps(lhs, rhs); +#elif defined(DLIB_HAVE_VSX) + return vec_div(lhs(), rhs()); +#elif defined(DLIB_HAVE_NEON) + float32x4_t reciprocal = vrecpeq_f32(rhs); + reciprocal = vmulq_f32(vrecpsq_f32(rhs, reciprocal), reciprocal); + reciprocal = vmulq_f32(vrecpsq_f32(rhs, reciprocal), reciprocal); + float32x4_t result = vmulq_f32(lhs,reciprocal); + return result; +#else + return simd4f(lhs[0]/rhs[0], + lhs[1]/rhs[1], + lhs[2]/rhs[2], + lhs[3]/rhs[3]); +#endif + } + inline simd4f& operator/= (simd4f& lhs, const simd4f& rhs) + { lhs = lhs / rhs; return lhs; } + +// ---------------------------------------------------------------------------------------- + + inline simd4f_bool operator== (const simd4f& lhs, const simd4f& rhs) + { +#ifdef DLIB_HAVE_SSE2 + return _mm_cmpeq_ps(lhs, rhs); +#elif defined(DLIB_HAVE_VSX) + return vec_cmpeq(lhs(), rhs()); +#elif defined(DLIB_HAVE_NEON) + return (int32x4_t)vceqq_f32(lhs, rhs); +#else + return simd4f_bool(lhs[0]==rhs[0], + lhs[1]==rhs[1], + lhs[2]==rhs[2], + lhs[3]==rhs[3]); +#endif + } + +// ---------------------------------------------------------------------------------------- + + inline simd4f_bool operator!= (const simd4f& lhs, const simd4f& rhs) + { +#ifdef DLIB_HAVE_SSE2 + return _mm_cmpneq_ps(lhs, rhs); +#elif defined(DLIB_HAVE_VSX) || defined(DLIB_HAVE_NEON) + return ~(lhs==rhs); // simd4f_bool is simd4i typedef, can use ~ +#else + return simd4f_bool(lhs[0]!=rhs[0], + lhs[1]!=rhs[1], + lhs[2]!=rhs[2], + lhs[3]!=rhs[3]); +#endif + } + +// ---------------------------------------------------------------------------------------- + + inline simd4f_bool operator< (const simd4f& lhs, const simd4f& rhs) + { +#ifdef DLIB_HAVE_SSE2 + return _mm_cmplt_ps(lhs, rhs); +#elif defined(DLIB_HAVE_VSX) + return vec_cmplt(lhs(), rhs()); +#elif defined(DLIB_HAVE_NEON) + return (int32x4_t)vcltq_f32(lhs, rhs); +#else + return simd4f_bool(lhs[0]<rhs[0], + lhs[1]<rhs[1], + lhs[2]<rhs[2], + lhs[3]<rhs[3]); +#endif + } + +// ---------------------------------------------------------------------------------------- + + inline simd4f_bool operator> (const simd4f& lhs, const simd4f& rhs) + { + return rhs < lhs; + } + 
+// ---------------------------------------------------------------------------------------- + + inline simd4f_bool operator<= (const simd4f& lhs, const simd4f& rhs) + { +#ifdef DLIB_HAVE_SSE2 + return _mm_cmple_ps(lhs, rhs); +#elif defined(DLIB_HAVE_VSX) + return vec_cmple(lhs(), rhs()); +#elif defined(DLIB_HAVE_NEON) + return (int32x4_t)vcleq_f32(lhs, rhs); +#else + return simd4f_bool(lhs[0]<=rhs[0], + lhs[1]<=rhs[1], + lhs[2]<=rhs[2], + lhs[3]<=rhs[3]); +#endif + } + +// ---------------------------------------------------------------------------------------- + + inline simd4f_bool operator>= (const simd4f& lhs, const simd4f& rhs) + { + return rhs <= lhs; + } + +// ---------------------------------------------------------------------------------------- + + inline simd4f min (const simd4f& lhs, const simd4f& rhs) + { +#ifdef DLIB_HAVE_SSE2 + return _mm_min_ps(lhs, rhs); +#elif defined(DLIB_HAVE_VSX) + return vec_min(lhs(), rhs()); +#elif defined(DLIB_HAVE_NEON) + return vminq_f32(lhs, rhs); +#else + return simd4f(std::min(lhs[0],rhs[0]), + std::min(lhs[1],rhs[1]), + std::min(lhs[2],rhs[2]), + std::min(lhs[3],rhs[3])); +#endif + } + +// ---------------------------------------------------------------------------------------- + + inline simd4f max (const simd4f& lhs, const simd4f& rhs) + { +#ifdef DLIB_HAVE_SSE2 + return _mm_max_ps(lhs, rhs); +#elif defined(DLIB_HAVE_VSX) + return vec_max(lhs(), rhs()); +#elif defined(DLIB_HAVE_NEON) + return vmaxq_f32(lhs, rhs); +#else + return simd4f(std::max(lhs[0],rhs[0]), + std::max(lhs[1],rhs[1]), + std::max(lhs[2],rhs[2]), + std::max(lhs[3],rhs[3])); +#endif + } + +// ---------------------------------------------------------------------------------------- + + inline simd4f reciprocal (const simd4f& item) + { +#ifdef DLIB_HAVE_SSE2 + return _mm_rcp_ps(item); +#elif defined(DLIB_HAVE_VSX) + return vec_re(item()); +#elif defined(DLIB_HAVE_NEON) + float32x4_t estimate = vrecpeq_f32(item); + estimate = vmulq_f32(vrecpsq_f32(estimate , item), estimate ); + estimate = vmulq_f32(vrecpsq_f32(estimate , item), estimate ); + return estimate ; +#else + return simd4f(1.0f/item[0], + 1.0f/item[1], + 1.0f/item[2], + 1.0f/item[3]); +#endif + } + +// ---------------------------------------------------------------------------------------- + + inline simd4f reciprocal_sqrt (const simd4f& item) + { +#ifdef DLIB_HAVE_SSE2 + return _mm_rsqrt_ps(item); +#elif defined(DLIB_HAVE_VSX) + return vec_rsqrt(item()); +#elif defined(DLIB_HAVE_NEON) + float32x4_t estimate = vrsqrteq_f32(item); + simd4f estimate2 = vmulq_f32(estimate, item); + estimate = vmulq_f32(estimate, vrsqrtsq_f32(estimate2, estimate)); + return estimate; +#else + return simd4f(1.0f/std::sqrt(item[0]), + 1.0f/std::sqrt(item[1]), + 1.0f/std::sqrt(item[2]), + 1.0f/std::sqrt(item[3])); +#endif + } + +// ---------------------------------------------------------------------------------------- + + inline float dot(const simd4f& lhs, const simd4f& rhs); + inline float sum(const simd4f& item) + { +#ifdef DLIB_HAVE_SSE41 + return dot(simd4f(1), item); +#elif defined(DLIB_HAVE_SSE3) + simd4f temp = _mm_hadd_ps(item,item); + return _mm_cvtss_f32(_mm_hadd_ps(temp,temp)); +#elif defined(DLIB_HAVE_SSE2) && (!defined(_MSC_VER) || _MSC_VER!=1400) + simd4f temp = _mm_add_ps(item,_mm_movehl_ps(item,item)); + simd4f temp2 = _mm_shuffle_ps(temp,temp,1); + return _mm_cvtss_f32(_mm_add_ss(temp,temp2)); +#elif defined(DLIB_HAVE_NEON) + float32x2_t r = vadd_f32(vget_high_f32(item), vget_low_f32(item)); + return 
vget_lane_f32(vpadd_f32(r, r), 0); +#else + return item[0]+item[1]+item[2]+item[3]; +#endif + } + +// ---------------------------------------------------------------------------------------- + + inline float dot(const simd4f& lhs, const simd4f& rhs) + { +#ifdef DLIB_HAVE_SSE41 + return _mm_cvtss_f32(_mm_dp_ps(lhs, rhs, 0xff)); +#else + return sum(lhs*rhs); +#endif + } + +// ---------------------------------------------------------------------------------------- + + inline simd4f sqrt(const simd4f& item) + { +#ifdef DLIB_HAVE_SSE2 + return _mm_sqrt_ps(item); +#elif defined(DLIB_HAVE_VSX) + return vec_sqrt(item()); +#elif defined(DLIB_HAVE_NEON) + float32x4_t q_step_0 = vrsqrteq_f32(item); + float32x4_t q_step_parm0 = vmulq_f32(item, q_step_0); + float32x4_t q_step_result0 = vrsqrtsq_f32(q_step_parm0, q_step_0); + float32x4_t q_step_1 = vmulq_f32(q_step_0, q_step_result0); + float32x4_t q_step_parm1 = vmulq_f32(item, q_step_1); + float32x4_t q_step_result1 = vrsqrtsq_f32(q_step_parm1, q_step_1); + float32x4_t q_step_2 = vmulq_f32(q_step_1, q_step_result1); + float32x4_t res3 = vmulq_f32(item, q_step_2); + + // normalize sqrt(0)=0 + uint32x4_t zcomp = vceqq_f32(vdupq_n_f32(0), item); + float32x4_t rcorr = vbslq_f32(zcomp, item, res3); + return rcorr; +#else + return simd4f(std::sqrt(item[0]), + std::sqrt(item[1]), + std::sqrt(item[2]), + std::sqrt(item[3])); +#endif + } + +// ---------------------------------------------------------------------------------------- + + inline simd4f ceil(const simd4f& item) + { +#ifdef DLIB_HAVE_SSE41 + return _mm_ceil_ps(item); +#elif defined(DLIB_HAVE_SSE2) || defined(DLIB_HAVE_NEON) + float temp[4]; + item.store(temp); + temp[0] = std::ceil(temp[0]); + temp[1] = std::ceil(temp[1]); + temp[2] = std::ceil(temp[2]); + temp[3] = std::ceil(temp[3]); + simd4f temp2; + temp2.load(temp); + return temp2; +#elif defined(DLIB_HAVE_VSX) + return vec_ceil(item()); +#else + return simd4f(std::ceil(item[0]), + std::ceil(item[1]), + std::ceil(item[2]), + std::ceil(item[3])); +#endif + } + +// ---------------------------------------------------------------------------------------- + + inline simd4f floor(const simd4f& item) + { +#ifdef DLIB_HAVE_SSE41 + return _mm_floor_ps(item); +#elif defined(DLIB_HAVE_SSE2) || defined(DLIB_HAVE_NEON) + float temp[4]; + item.store(temp); + temp[0] = std::floor(temp[0]); + temp[1] = std::floor(temp[1]); + temp[2] = std::floor(temp[2]); + temp[3] = std::floor(temp[3]); + simd4f temp2; + temp2.load(temp); + return temp2; +#elif defined(DLIB_HAVE_VSX) + return vec_floor(item()); +#else + return simd4f(std::floor(item[0]), + std::floor(item[1]), + std::floor(item[2]), + std::floor(item[3])); +#endif + } + +// ---------------------------------------------------------------------------------------- + + // perform cmp ? 
a : b + inline simd4f select(const simd4f_bool& cmp, const simd4f& a, const simd4f& b) + { +#ifdef DLIB_HAVE_SSE41 + return _mm_blendv_ps(b,a,cmp); +#elif defined(DLIB_HAVE_SSE2) + return _mm_or_ps(_mm_and_ps(cmp,a) , _mm_andnot_ps(cmp,b)); +#elif defined(DLIB_HAVE_NEON) + return vbslq_f32(cmp, a, b); +#else + return simd4f(cmp[0]?a[0]:b[0], + cmp[1]?a[1]:b[1], + cmp[2]?a[2]:b[2], + cmp[3]?a[3]:b[3]); +#endif + } + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_sIMD4F_Hh_ + diff --git a/ml/dlib/dlib/simd/simd4i.h b/ml/dlib/dlib/simd/simd4i.h new file mode 100644 index 000000000..ea33f14a8 --- /dev/null +++ b/ml/dlib/dlib/simd/simd4i.h @@ -0,0 +1,566 @@ +// Copyright (C) 2013 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_sIMD4I_Hh_ +#define DLIB_sIMD4I_Hh_ + +#include "simd_check.h" +#include "../uintn.h" + +namespace dlib +{ + +#ifdef DLIB_HAVE_SSE2 + class simd4i + { + public: + typedef int32 type; + + inline simd4i() {} + inline simd4i(int32 f) { x = _mm_set1_epi32(f); } + inline simd4i(int32 r0, int32 r1, int32 r2, int32 r3) { x = _mm_setr_epi32(r0,r1,r2,r3); } + inline simd4i(const __m128i& val):x(val) {} + + inline simd4i& operator=(const __m128i& val) + { + x = val; + return *this; + } + + inline operator __m128i() const { return x; } + + inline void load_aligned(const type* ptr) { x = _mm_load_si128((const __m128i*)ptr); } + inline void store_aligned(type* ptr) const { _mm_store_si128((__m128i*)ptr, x); } + inline void load(const type* ptr) { x = _mm_loadu_si128((const __m128i*)ptr); } + inline void store(type* ptr) const { _mm_storeu_si128((__m128i*)ptr, x); } + + inline unsigned int size() const { return 4; } + inline int32 operator[](unsigned int idx) const + { + int32 temp[4]; + store(temp); + return temp[idx]; + } + + private: + __m128i x; + }; + +#elif defined(DLIB_HAVE_VSX) + + class simd4i + { + typedef union { + vector signed int v; + vector bool int b; + signed int x[4]; + } v4i; + + v4i x; + + public: + inline simd4i() : x{0,0,0,0} { } + inline simd4i(const simd4i& v) : x(v.x) { } + inline simd4i(const vector int& v) : x{v} { } + inline simd4i(const vector bool int& b) { x.b=b; } + + inline simd4i(int32 f) : x{f,f,f,f} { } + inline simd4i(int32 r0, int32 r1, int32 r2, int32 r3) + : x{r0,r1,r2,r3} { } + + inline simd4i& operator=(const simd4i& v) { x = v.x; return *this; } + inline simd4i& operator=(const int32& v) { *this = simd4i(v); return *this; } + + inline vector signed int operator() () const { return x.v; } + inline int32 operator[](unsigned int idx) const { return x.x[idx]; } + + inline vector bool int to_bool() const { return x.b; } + + // intrinsics now seem to use xxpermdi automatically now + inline void load_aligned(const int32* ptr) { x.v = vec_ld(0, ptr); } + inline void store_aligned(int32* ptr) const { vec_st(x.v, 0, ptr); } + inline void load(const int32* ptr) { x.v = vec_vsx_ld(0, ptr); } + inline void store(int32* ptr) const { vec_vsx_st(x.v, 0, ptr); } + + + struct rawarray + { + v4i v; + }; + inline simd4i(const rawarray& a) : x{a.v} { } + + }; + +#elif defined(DLIB_HAVE_NEON) + + class simd4i + { + public: + typedef int32 type; + + inline simd4i() {} + inline simd4i(int32 f) { x = vdupq_n_s32(f); } + inline simd4i(int32 r0, int32 r1, int32 r2, int32 r3) + { + int32 __attribute__((aligned(16))) data[4] = { r0, r1, r2, r3 }; + x = vld1q_s32(data); + } + inline simd4i(const int32x4_t& val):x(val) {} + + inline 
simd4i& operator=(const int32x4_t& val) + { + x = val; + return *this; + } + + inline operator int32x4_t() const { return x; } + inline operator uint32x4_t() const { return (uint32x4_t)x; } + + inline void load_aligned(const type* ptr) { x = vld1q_s32(ptr); } + inline void store_aligned(type* ptr) const { vst1q_s32(ptr, x); } + inline void load(const type* ptr) { x = vld1q_s32(ptr); } + inline void store(type* ptr) const { vst1q_s32(ptr, x); } + + inline unsigned int size() const { return 4; } + inline int32 operator[](unsigned int idx) const + { + int32 temp[4]; + store(temp); + return temp[idx]; + } + + private: + int32x4_t x; + }; + +#else + + class simd4i + { + public: + typedef int32 type; + + inline simd4i() {} + inline simd4i(int32 f) { x[0]=f; x[1]=f; x[2]=f; x[3]=f; } + inline simd4i(int32 r0, int32 r1, int32 r2, int32 r3) { x[0]=r0; x[1]=r1; x[2]=r2; x[3]=r3;} + + struct rawarray + { + int32 a[4]; + }; + inline simd4i(const rawarray& a) { x[0]=a.a[0]; x[1]=a.a[1]; x[2]=a.a[2]; x[3]=a.a[3]; } + + inline void load_aligned(const type* ptr) + { + x[0] = ptr[0]; + x[1] = ptr[1]; + x[2] = ptr[2]; + x[3] = ptr[3]; + } + + inline void store_aligned(type* ptr) const + { + ptr[0] = x[0]; + ptr[1] = x[1]; + ptr[2] = x[2]; + ptr[3] = x[3]; + } + + inline void load(const type* ptr) + { + x[0] = ptr[0]; + x[1] = ptr[1]; + x[2] = ptr[2]; + x[3] = ptr[3]; + } + + inline void store(type* ptr) const + { + ptr[0] = x[0]; + ptr[1] = x[1]; + ptr[2] = x[2]; + ptr[3] = x[3]; + } + + inline unsigned int size() const { return 4; } + inline int32 operator[](unsigned int idx) const { return x[idx]; } + + private: + int32 x[4]; + }; +#endif + +// ---------------------------------------------------------------------------------------- + + inline std::ostream& operator<<(std::ostream& out, const simd4i& item) + { + int32 temp[4]; + item.store(temp); + out << "(" << temp[0] << ", " << temp[1] << ", " << temp[2] << ", " << temp[3] << ")"; + return out; + } + +// ---------------------------------------------------------------------------------------- + + inline simd4i operator+ (const simd4i& lhs, const simd4i& rhs) + { +#ifdef DLIB_HAVE_SSE2 + return _mm_add_epi32(lhs, rhs); +#elif defined(DLIB_HAVE_VSX) + return vec_add(lhs(), rhs()); +#elif defined(DLIB_HAVE_NEON) + return vaddq_s32(lhs, rhs); +#else + return simd4i(lhs[0]+rhs[0], + lhs[1]+rhs[1], + lhs[2]+rhs[2], + lhs[3]+rhs[3]); +#endif + } + inline simd4i& operator+= (simd4i& lhs, const simd4i& rhs) + { return lhs = lhs + rhs; return lhs;} + +// ---------------------------------------------------------------------------------------- + + inline simd4i operator- (const simd4i& lhs, const simd4i& rhs) + { +#ifdef DLIB_HAVE_SSE2 + return _mm_sub_epi32(lhs, rhs); +#elif defined(DLIB_HAVE_VSX) + return vec_sub(lhs(), rhs()); +#elif defined(DLIB_HAVE_NEON) + return vsubq_s32(lhs, rhs); +#else + return simd4i(lhs[0]-rhs[0], + lhs[1]-rhs[1], + lhs[2]-rhs[2], + lhs[3]-rhs[3]); +#endif + } + inline simd4i& operator-= (simd4i& lhs, const simd4i& rhs) + { return lhs = lhs - rhs; return lhs;} + +// ---------------------------------------------------------------------------------------- + + inline simd4i operator* (const simd4i& lhs, const simd4i& rhs) + { +#ifdef DLIB_HAVE_SSE41 + return _mm_mullo_epi32(lhs, rhs); +#elif defined(DLIB_HAVE_SSE2) + int32 _lhs[4]; lhs.store(_lhs); + int32 _rhs[4]; rhs.store(_rhs); + return simd4i(_lhs[0]*_rhs[0], + _lhs[1]*_rhs[1], + _lhs[2]*_rhs[2], + _lhs[3]*_rhs[3]); +#elif defined(DLIB_HAVE_VSX) + vector int a = lhs(), b = rhs(); + 
asm("vmuluwm %0, %0, %1\n\t" : "+&v" (a) : "v" (b) ); + return simd4i(a); +#elif defined(DLIB_HAVE_NEON) + return vmulq_s32(lhs, rhs); +#else + return simd4i(lhs[0]*rhs[0], + lhs[1]*rhs[1], + lhs[2]*rhs[2], + lhs[3]*rhs[3]); +#endif + } + inline simd4i& operator*= (simd4i& lhs, const simd4i& rhs) + { return lhs = lhs * rhs; return lhs;} + +// ---------------------------------------------------------------------------------------- + + inline simd4i operator& (const simd4i& lhs, const simd4i& rhs) + { +#ifdef DLIB_HAVE_SSE2 + return _mm_and_si128(lhs, rhs); +#elif defined(DLIB_HAVE_VSX) + return vec_and(lhs(), rhs()); +#elif defined(DLIB_HAVE_NEON) + return vandq_s32(lhs, rhs); +#else + return simd4i(lhs[0]&rhs[0], + lhs[1]&rhs[1], + lhs[2]&rhs[2], + lhs[3]&rhs[3]); +#endif + } + inline simd4i& operator&= (simd4i& lhs, const simd4i& rhs) + { return lhs = lhs & rhs; return lhs;} + +// ---------------------------------------------------------------------------------------- + + inline simd4i operator| (const simd4i& lhs, const simd4i& rhs) + { +#ifdef DLIB_HAVE_SSE2 + return _mm_or_si128(lhs, rhs); +#elif defined(DLIB_HAVE_VSX) + return vec_or(lhs(), rhs()); +#elif defined(DLIB_HAVE_NEON) + return vorrq_s32(lhs, rhs); +#else + return simd4i(lhs[0]|rhs[0], + lhs[1]|rhs[1], + lhs[2]|rhs[2], + lhs[3]|rhs[3]); +#endif + } + inline simd4i& operator|= (simd4i& lhs, const simd4i& rhs) + { return lhs = lhs | rhs; return lhs;} + +// ---------------------------------------------------------------------------------------- + + inline simd4i operator^ (const simd4i& lhs, const simd4i& rhs) + { +#ifdef DLIB_HAVE_SSE2 + return _mm_xor_si128(lhs, rhs); +#elif defined(DLIB_HAVE_VSX) + return vec_xor(lhs(), rhs()); +#elif defined(DLIB_HAVE_NEON) + return veorq_s32(lhs, rhs); +#else + return simd4i(lhs[0]^rhs[0], + lhs[1]^rhs[1], + lhs[2]^rhs[2], + lhs[3]^rhs[3]); +#endif + } + inline simd4i& operator^= (simd4i& lhs, const simd4i& rhs) + { return lhs = lhs ^ rhs; return lhs;} + +// ---------------------------------------------------------------------------------------- + + inline simd4i operator~ (const simd4i& lhs) + { +#ifdef DLIB_HAVE_SSE2 + return _mm_xor_si128(lhs, _mm_set1_epi32(0xFFFFFFFF)); +#elif defined(DLIB_HAVE_VSX) + return vec_xor(lhs(), vec_splats(~0)); +#elif defined(DLIB_HAVE_NEON) + return vmvnq_s32(lhs); +#else + return simd4i(~lhs[0], + ~lhs[1], + ~lhs[2], + ~lhs[3]); +#endif + } + +// ---------------------------------------------------------------------------------------- + + inline simd4i operator<< (const simd4i& lhs, const int& rhs) + { +#ifdef DLIB_HAVE_SSE2 + return _mm_sll_epi32(lhs,_mm_cvtsi32_si128(rhs)); +#elif defined(DLIB_HAVE_VSX) + return vec_sl(lhs(), vec_splats((uint32_t)rhs)); +#elif defined(DLIB_HAVE_NEON) + return vshlq_s32(lhs, simd4i(rhs)); +#else + return simd4i(lhs[0]<<rhs, + lhs[1]<<rhs, + lhs[2]<<rhs, + lhs[3]<<rhs); +#endif + } + inline simd4i& operator<<= (simd4i& lhs, const int& rhs) + { return lhs = lhs << rhs; return lhs;} + +// ---------------------------------------------------------------------------------------- + + inline simd4i operator>> (const simd4i& lhs, const int& rhs) + { +#ifdef DLIB_HAVE_SSE2 + return _mm_sra_epi32(lhs,_mm_cvtsi32_si128(rhs)); +#elif defined(DLIB_HAVE_VSX) + return vec_sr(lhs(), vec_splats((uint32_t)rhs)); +#elif defined(DLIB_HAVE_NEON) + int32 _lhs[4]; lhs.store(_lhs); + return simd4i(_lhs[0]>>rhs, + _lhs[1]>>rhs, + _lhs[2]>>rhs, + _lhs[3]>>rhs); +#else + return simd4i(lhs[0]>>rhs, + lhs[1]>>rhs, + lhs[2]>>rhs, + lhs[3]>>rhs); 
+#endif + } + inline simd4i& operator>>= (simd4i& lhs, const int& rhs) + { return lhs = lhs >> rhs; return lhs;} + +// ---------------------------------------------------------------------------------------- + + inline simd4i operator== (const simd4i& lhs, const simd4i& rhs) + { +#ifdef DLIB_HAVE_SSE2 + return _mm_cmpeq_epi32(lhs, rhs); +#elif defined(DLIB_HAVE_VSX) + return vec_cmpeq(lhs(), rhs()); +#elif defined(DLIB_HAVE_NEON) + return (int32x4_t)vceqq_s32(lhs,rhs); +#else + return simd4i(lhs[0]==rhs[0] ? 0xFFFFFFFF : 0, + lhs[1]==rhs[1] ? 0xFFFFFFFF : 0, + lhs[2]==rhs[2] ? 0xFFFFFFFF : 0, + lhs[3]==rhs[3] ? 0xFFFFFFFF : 0); +#endif + } + +// ---------------------------------------------------------------------------------------- + + inline simd4i operator!= (const simd4i& lhs, const simd4i& rhs) + { +#if defined(DLIB_HAVE_SSE2) || defined(DLIB_HAVE_VSX) || defined(DLIB_HAVE_NEON) + return ~(lhs==rhs); +#else + return simd4i(lhs[0]!=rhs[0] ? 0xFFFFFFFF : 0, + lhs[1]!=rhs[1] ? 0xFFFFFFFF : 0, + lhs[2]!=rhs[2] ? 0xFFFFFFFF : 0, + lhs[3]!=rhs[3] ? 0xFFFFFFFF : 0); +#endif + } + +// ---------------------------------------------------------------------------------------- + + inline simd4i operator< (const simd4i& lhs, const simd4i& rhs) + { +#ifdef DLIB_HAVE_SSE2 + return _mm_cmplt_epi32(lhs, rhs); +#elif defined(DLIB_HAVE_VSX) + return vec_cmplt(lhs(), rhs()); +#elif defined(DLIB_HAVE_NEON) + return (int32x4_t)vcltq_s32(lhs, rhs); +#else + return simd4i(lhs[0]<rhs[0] ? 0xFFFFFFFF : 0, + lhs[1]<rhs[1] ? 0xFFFFFFFF : 0, + lhs[2]<rhs[2] ? 0xFFFFFFFF : 0, + lhs[3]<rhs[3] ? 0xFFFFFFFF : 0); +#endif + } + +// ---------------------------------------------------------------------------------------- + + inline simd4i operator> (const simd4i& lhs, const simd4i& rhs) + { + return rhs < lhs; + } + +// ---------------------------------------------------------------------------------------- + + inline simd4i operator<= (const simd4i& lhs, const simd4i& rhs) + { +#ifdef DLIB_HAVE_SSE2 + return ~(lhs > rhs); +#elif defined(DLIB_HAVE_NEON) + return (int32x4_t)vcleq_s32(lhs, rhs); +#else + return simd4i(lhs[0]<=rhs[0] ? 0xFFFFFFFF : 0, + lhs[1]<=rhs[1] ? 0xFFFFFFFF : 0, + lhs[2]<=rhs[2] ? 0xFFFFFFFF : 0, + lhs[3]<=rhs[3] ? 
0xFFFFFFFF : 0); +#endif + } + +// ---------------------------------------------------------------------------------------- + + inline simd4i operator>= (const simd4i& lhs, const simd4i& rhs) + { + return rhs <= lhs; + } + +// ---------------------------------------------------------------------------------------- + + inline simd4i min (const simd4i& lhs, const simd4i& rhs) + { +#ifdef DLIB_HAVE_SSE41 + return _mm_min_epi32(lhs, rhs); +#elif defined(DLIB_HAVE_SSE2) + int32 _lhs[4]; lhs.store(_lhs); + int32 _rhs[4]; rhs.store(_rhs); + return simd4i(std::min(_lhs[0],_rhs[0]), + std::min(_lhs[1],_rhs[1]), + std::min(_lhs[2],_rhs[2]), + std::min(_lhs[3],_rhs[3])); +#elif defined(DLIB_HAVE_VSX) + return vec_min(lhs(), rhs()); +#elif defined(DLIB_HAVE_NEON) + return (int32x4_t)vminq_s32(lhs, rhs); +#else + return simd4i(std::min(lhs[0],rhs[0]), + std::min(lhs[1],rhs[1]), + std::min(lhs[2],rhs[2]), + std::min(lhs[3],rhs[3])); +#endif + } + +// ---------------------------------------------------------------------------------------- + + inline simd4i max (const simd4i& lhs, const simd4i& rhs) + { +#ifdef DLIB_HAVE_SSE41 + return _mm_max_epi32(lhs, rhs); +#elif defined(DLIB_HAVE_SSE2) + int32 _lhs[4]; lhs.store(_lhs); + int32 _rhs[4]; rhs.store(_rhs); + return simd4i(std::max(_lhs[0],_rhs[0]), + std::max(_lhs[1],_rhs[1]), + std::max(_lhs[2],_rhs[2]), + std::max(_lhs[3],_rhs[3])); +#elif defined(DLIB_HAVE_VSX) + return vec_max(lhs(), rhs()); +#elif defined(DLIB_HAVE_NEON) + return vmaxq_s32(lhs, rhs); +#else + return simd4i(std::max(lhs[0],rhs[0]), + std::max(lhs[1],rhs[1]), + std::max(lhs[2],rhs[2]), + std::max(lhs[3],rhs[3])); +#endif + } + +// ---------------------------------------------------------------------------------------- + + inline int32 sum(const simd4i& item) + { +#ifdef DLIB_HAVE_SSE3 + simd4i temp = _mm_hadd_epi32(item,item); + temp = _mm_hadd_epi32(temp,temp); + return _mm_cvtsi128_si32(temp); +#elif defined(DLIB_HAVE_SSE2) + int32 temp[4]; + item.store(temp); + return temp[0]+temp[1]+temp[2]+temp[3]; +#elif defined(DLIB_HAVE_NEON) + int32x2_t r = vadd_s32(vget_high_s32(item), vget_low_s32(item)); + return vget_lane_s32(vpadd_s32(r, r), 0); +#else + return item[0]+item[1]+item[2]+item[3]; +#endif + } + +// ---------------------------------------------------------------------------------------- + + // perform cmp ? a : b + inline simd4i select(const simd4i& cmp, const simd4i& a, const simd4i& b) + { +#ifdef DLIB_HAVE_SSE41 + return _mm_blendv_epi8(b,a,cmp); +#elif defined(DLIB_HAVE_SSE2) + return ((cmp&a) | _mm_andnot_si128(cmp,b)); +#elif defined(DLIB_HAVE_VSX) + return vec_sel(b(), a(), cmp.to_bool()); +#elif defined(DLIB_HAVE_NEON) + return vbslq_s32(cmp, a, b); +#else + return ((cmp&a) | (~cmp&b)); +#endif + } + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_sIMD4I_Hh_ + diff --git a/ml/dlib/dlib/simd/simd8f.h b/ml/dlib/dlib/simd/simd8f.h new file mode 100644 index 000000000..628ba74ee --- /dev/null +++ b/ml/dlib/dlib/simd/simd8f.h @@ -0,0 +1,402 @@ +// Copyright (C) 2013 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. 
+#ifndef DLIB_sIMD8F_Hh_ +#define DLIB_sIMD8F_Hh_ + +#include "simd_check.h" +#include "simd4f.h" +#include "simd8i.h" + +namespace dlib +{ +#ifdef DLIB_HAVE_AVX + class simd8f + { + public: + typedef float type; + + inline simd8f() {} + inline simd8f(const simd4f& low, const simd4f& high) + { + x = _mm256_insertf128_ps(_mm256_castps128_ps256(low),high,1); + } + inline simd8f(float f) { x = _mm256_set1_ps(f); } + inline simd8f(float r0, float r1, float r2, float r3, float r4, float r5, float r6, float r7) + { x = _mm256_setr_ps(r0,r1,r2,r3,r4,r5,r6,r7); } + + inline simd8f(const simd8i& val):x(_mm256_cvtepi32_ps(val)) {} + inline simd8f(const __m256& val):x(val) {} + inline simd8f& operator=(const __m256& val) + { + x = val; + return *this; + } + inline operator __m256() const { return x; } + + // truncate to 32bit integers + inline operator __m256i() const { return _mm256_cvttps_epi32(x); } + + inline void load_aligned(const type* ptr) { x = _mm256_load_ps(ptr); } + inline void store_aligned(type* ptr) const { _mm256_store_ps(ptr, x); } + inline void load(const type* ptr) { x = _mm256_loadu_ps(ptr); } + inline void store(type* ptr) const { _mm256_storeu_ps(ptr, x); } + + inline simd8f& operator=(const simd8i& rhs) { *this = simd8f(rhs); return *this; } + inline simd8f& operator=(const float& val) + { + x = simd8f(val); + return *this; + } + + inline unsigned int size() const { return 8; } + inline float operator[](unsigned int idx) const + { + float temp[8]; + store(temp); + return temp[idx]; + } + + inline simd4f low() const { return _mm256_castps256_ps128(x); } + inline simd4f high() const { return _mm256_extractf128_ps(x,1); } + + private: + __m256 x; + }; + + + class simd8f_bool + { + public: + typedef float type; + + inline simd8f_bool() {} + inline simd8f_bool(const __m256& val):x(val) {} + inline simd8f_bool(const simd4f_bool& low, const simd4f_bool& high) + { + x = _mm256_insertf128_ps(_mm256_castps128_ps256(low),high,1); + } + + inline simd8f_bool& operator=(const __m256& val) + { + x = val; + return *this; + } + + inline operator __m256() const { return x; } + + + private: + __m256 x; + }; + +#else + class simd8f + { + public: + typedef float type; + + inline simd8f() {} + inline simd8f(const simd4f& low_, const simd4f& high_): _low(low_),_high(high_){} + inline simd8f(float f) :_low(f),_high(f) {} + inline simd8f(float r0, float r1, float r2, float r3, float r4, float r5, float r6, float r7) : + _low(r0,r1,r2,r3), _high(r4,r5,r6,r7) {} + inline simd8f(const simd8i& val) : _low(val.low()), _high(val.high()) { } + + // truncate to 32bit integers + inline operator simd8i::rawarray() const + { + simd8i::rawarray temp; + temp.low = simd4i(_low); + temp.high = simd4i(_high); + return temp; + } + + inline void load_aligned(const type* ptr) { _low.load_aligned(ptr); _high.load_aligned(ptr+4); } + inline void store_aligned(type* ptr) const { _low.store_aligned(ptr); _high.store_aligned(ptr+4); } + inline void load(const type* ptr) { _low.load(ptr); _high.load(ptr+4); } + inline void store(type* ptr) const { _low.store(ptr); _high.store(ptr+4); } + + inline unsigned int size() const { return 8; } + inline float operator[](unsigned int idx) const + { + if (idx < 4) + return _low[idx]; + else + return _high[idx-4]; + } + + inline const simd4f& low() const { return _low; } + inline const simd4f& high() const { return _high; } + + private: + simd4f _low, _high; + }; + + class simd8f_bool + { + public: + typedef float type; + + inline simd8f_bool() {} + inline simd8f_bool(const simd4f_bool& 
low_, const simd4f_bool& high_): _low(low_),_high(high_){} + + + inline const simd4f_bool& low() const { return _low; } + inline const simd4f_bool& high() const { return _high; } + private: + simd4f_bool _low,_high; + }; +#endif + +// ---------------------------------------------------------------------------------------- + + inline std::ostream& operator<<(std::ostream& out, const simd8f& item) + { + float temp[8]; + item.store(temp); + out << "(" << temp[0] << ", " << temp[1] << ", " << temp[2] << ", " << temp[3] << ", " + << temp[4] << ", " << temp[5] << ", " << temp[6] << ", " << temp[7] << ")"; + return out; + } + +// ---------------------------------------------------------------------------------------- + + inline simd8f operator+ (const simd8f& lhs, const simd8f& rhs) + { +#ifdef DLIB_HAVE_AVX + return _mm256_add_ps(lhs, rhs); +#else + return simd8f(lhs.low()+rhs.low(), + lhs.high()+rhs.high()); +#endif + } + inline simd8f& operator+= (simd8f& lhs, const simd8f& rhs) + { lhs = lhs + rhs; return lhs; } + +// ---------------------------------------------------------------------------------------- + + inline simd8f operator- (const simd8f& lhs, const simd8f& rhs) + { +#ifdef DLIB_HAVE_AVX + return _mm256_sub_ps(lhs, rhs); +#else + return simd8f(lhs.low()-rhs.low(), + lhs.high()-rhs.high()); +#endif + } + inline simd8f& operator-= (simd8f& lhs, const simd8f& rhs) + { lhs = lhs - rhs; return lhs; } + +// ---------------------------------------------------------------------------------------- + + inline simd8f operator* (const simd8f& lhs, const simd8f& rhs) + { +#ifdef DLIB_HAVE_AVX + return _mm256_mul_ps(lhs, rhs); +#else + return simd8f(lhs.low()*rhs.low(), + lhs.high()*rhs.high()); +#endif + } + inline simd8f& operator*= (simd8f& lhs, const simd8f& rhs) + { lhs = lhs * rhs; return lhs; } + +// ---------------------------------------------------------------------------------------- + + inline simd8f operator/ (const simd8f& lhs, const simd8f& rhs) + { +#ifdef DLIB_HAVE_AVX + return _mm256_div_ps(lhs, rhs); +#else + return simd8f(lhs.low()/rhs.low(), + lhs.high()/rhs.high()); +#endif + } + inline simd8f& operator/= (simd8f& lhs, const simd8f& rhs) + { lhs = lhs / rhs; return lhs; } + +// ---------------------------------------------------------------------------------------- + + inline simd8f_bool operator== (const simd8f& lhs, const simd8f& rhs) + { +#ifdef DLIB_HAVE_AVX + return _mm256_cmp_ps(lhs, rhs, 0); +#else + return simd8f_bool(lhs.low() ==rhs.low(), + lhs.high()==rhs.high()); +#endif + } + +// ---------------------------------------------------------------------------------------- + + inline simd8f_bool operator!= (const simd8f& lhs, const simd8f& rhs) + { +#ifdef DLIB_HAVE_AVX + return _mm256_cmp_ps(lhs, rhs, 4); +#else + return simd8f_bool(lhs.low() !=rhs.low(), + lhs.high()!=rhs.high()); +#endif + } + +// ---------------------------------------------------------------------------------------- + + inline simd8f_bool operator< (const simd8f& lhs, const simd8f& rhs) + { +#ifdef DLIB_HAVE_AVX + return _mm256_cmp_ps(lhs, rhs, 1); +#else + return simd8f_bool(lhs.low() <rhs.low(), + lhs.high()<rhs.high()); +#endif + } + +// ---------------------------------------------------------------------------------------- + + inline simd8f_bool operator> (const simd8f& lhs, const simd8f& rhs) + { + return rhs < lhs; + } + +// ---------------------------------------------------------------------------------------- + + inline simd8f_bool operator<= (const simd8f& lhs, const simd8f& rhs) + { 
+#ifdef DLIB_HAVE_AVX + return _mm256_cmp_ps(lhs, rhs, 2); +#else + return simd8f_bool(lhs.low() <=rhs.low(), + lhs.high()<=rhs.high()); +#endif + } + +// ---------------------------------------------------------------------------------------- + + inline simd8f_bool operator>= (const simd8f& lhs, const simd8f& rhs) + { + return rhs <= lhs; + } + +// ---------------------------------------------------------------------------------------- + + inline simd8f min (const simd8f& lhs, const simd8f& rhs) + { +#ifdef DLIB_HAVE_AVX + return _mm256_min_ps(lhs, rhs); +#else + return simd8f(min(lhs.low(), rhs.low()), + min(lhs.high(),rhs.high())); +#endif + } + +// ---------------------------------------------------------------------------------------- + + inline simd8f max (const simd8f& lhs, const simd8f& rhs) + { +#ifdef DLIB_HAVE_AVX + return _mm256_max_ps(lhs, rhs); +#else + return simd8f(max(lhs.low(), rhs.low()), + max(lhs.high(),rhs.high())); +#endif + } + +// ---------------------------------------------------------------------------------------- + + inline simd8f reciprocal (const simd8f& item) + { +#ifdef DLIB_HAVE_AVX + return _mm256_rcp_ps(item); +#else + return simd8f(reciprocal(item.low()), + reciprocal(item.high())); +#endif + } + +// ---------------------------------------------------------------------------------------- + + inline simd8f reciprocal_sqrt (const simd8f& item) + { +#ifdef DLIB_HAVE_AVX + return _mm256_rsqrt_ps(item); +#else + return simd8f(reciprocal_sqrt(item.low()), + reciprocal_sqrt(item.high())); +#endif + } + +// ---------------------------------------------------------------------------------------- + + inline float sum(const simd8f& item) + { +#ifdef DLIB_HAVE_AVX + simd8f temp = _mm256_hadd_ps(item,item); + simd8f temp2 = _mm256_hadd_ps(temp,temp); + return _mm_cvtss_f32(_mm_add_ss(_mm256_castps256_ps128(temp2),_mm256_extractf128_ps(temp2,1))); +#else + return sum(item.low()+item.high()); +#endif + } + +// ---------------------------------------------------------------------------------------- + + inline float dot(const simd8f& lhs, const simd8f& rhs) + { + return sum(lhs*rhs); + } + +// ---------------------------------------------------------------------------------------- + + inline simd8f sqrt(const simd8f& item) + { +#ifdef DLIB_HAVE_AVX + return _mm256_sqrt_ps(item); +#else + return simd8f(sqrt(item.low()), + sqrt(item.high())); +#endif + } + +// ---------------------------------------------------------------------------------------- + + inline simd8f ceil(const simd8f& item) + { +#ifdef DLIB_HAVE_AVX + return _mm256_ceil_ps(item); +#else + return simd8f(ceil(item.low()), + ceil(item.high())); +#endif + } + +// ---------------------------------------------------------------------------------------- + + inline simd8f floor(const simd8f& item) + { +#ifdef DLIB_HAVE_AVX + return _mm256_floor_ps(item); +#else + return simd8f(floor(item.low()), + floor(item.high())); +#endif + } + +// ---------------------------------------------------------------------------------------- + + // perform cmp ? 
a : b + inline simd8f select(const simd8f_bool& cmp, const simd8f& a, const simd8f& b) + { +#ifdef DLIB_HAVE_AVX + return _mm256_blendv_ps(b,a,cmp); +#else + return simd8f(select(cmp.low(), a.low(), b.low()), + select(cmp.high(), a.high(), b.high())); +#endif + } + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_sIMD8F_Hh_ + diff --git a/ml/dlib/dlib/simd/simd8i.h b/ml/dlib/dlib/simd/simd8i.h new file mode 100644 index 000000000..18c06ec7e --- /dev/null +++ b/ml/dlib/dlib/simd/simd8i.h @@ -0,0 +1,339 @@ +// Copyright (C) 2013 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_sIMD8I_Hh_ +#define DLIB_sIMD8I_Hh_ + +#include "simd_check.h" +#include "../uintn.h" + +namespace dlib +{ + +#ifdef DLIB_HAVE_AVX + class simd8i + { + public: + typedef int32 type; + + inline simd8i() {} + inline simd8i(int32 f) { x = _mm256_set1_epi32(f); } + inline simd8i(int32 r0, int32 r1, int32 r2, int32 r3, + int32 r4, int32 r5, int32 r6, int32 r7 ) + { x = _mm256_setr_epi32(r0,r1,r2,r3,r4,r5,r6,r7); } + + inline simd8i(const __m256i& val):x(val) {} + + inline simd8i(const simd4i& low, const simd4i& high) + { + x = _mm256_insertf128_si256(_mm256_castsi128_si256(low),high,1); + } + + inline simd8i& operator=(const __m256i& val) + { + x = val; + return *this; + } + + inline operator __m256i() const { return x; } + + inline void load_aligned(const type* ptr) { x = _mm256_load_si256((const __m256i*)ptr); } + inline void store_aligned(type* ptr) const { _mm256_store_si256((__m256i*)ptr, x); } + inline void load(const type* ptr) { x = _mm256_loadu_si256((const __m256i*)ptr); } + inline void store(type* ptr) const { _mm256_storeu_si256((__m256i*)ptr, x); } + + inline simd4i low() const { return _mm256_castsi256_si128(x); } + inline simd4i high() const { return _mm256_extractf128_si256(x,1); } + + inline unsigned int size() const { return 8; } + inline int32 operator[](unsigned int idx) const + { + int32 temp[8]; + store(temp); + return temp[idx]; + } + + private: + __m256i x; + }; +#else + class simd8i + { + public: + typedef int32 type; + + inline simd8i() {} + inline simd8i(const simd4i& low_, const simd4i& high_): _low(low_),_high(high_){} + inline simd8i(int32 f) :_low(f),_high(f) {} + inline simd8i(int32 r0, int32 r1, int32 r2, int32 r3, int32 r4, int32 r5, int32 r6, int32 r7) : + _low(r0,r1,r2,r3), _high(r4,r5,r6,r7) {} + + struct rawarray + { + simd4i low, high; + }; + inline simd8i(const rawarray& a) + { + _low = a.low; + _high = a.high; + } + + inline void load_aligned(const type* ptr) { _low.load_aligned(ptr); _high.load_aligned(ptr+4); } + inline void store_aligned(type* ptr) const { _low.store_aligned(ptr); _high.store_aligned(ptr+4); } + inline void load(const type* ptr) { _low.load(ptr); _high.load(ptr+4); } + inline void store(type* ptr) const { _low.store(ptr); _high.store(ptr+4); } + + inline unsigned int size() const { return 8; } + inline int32 operator[](unsigned int idx) const + { + if (idx < 4) + return _low[idx]; + else + return _high[idx-4]; + } + + inline const simd4i& low() const { return _low; } + inline const simd4i& high() const { return _high; } + + private: + simd4i _low, _high; + }; + +#endif + +// ---------------------------------------------------------------------------------------- + + inline std::ostream& operator<<(std::ostream& out, const simd8i& item) + { + int32 temp[8]; + item.store(temp); + out << "(" << temp[0] << ", " << temp[1] << ", " << 
temp[2] << ", " << temp[3] << ", " + << temp[4] << ", " << temp[5] << ", " << temp[6] << ", " << temp[7] << ")"; + return out; + } + +// ---------------------------------------------------------------------------------------- + + inline simd8i operator+ (const simd8i& lhs, const simd8i& rhs) + { +#ifdef DLIB_HAVE_AVX2 + return _mm256_add_epi32(lhs, rhs); +#else + return simd8i(lhs.low()+rhs.low(), + lhs.high()+rhs.high()); +#endif + } + inline simd8i& operator+= (simd8i& lhs, const simd8i& rhs) + { return lhs = lhs + rhs; return lhs;} + +// ---------------------------------------------------------------------------------------- + + inline simd8i operator- (const simd8i& lhs, const simd8i& rhs) + { +#ifdef DLIB_HAVE_AVX2 + return _mm256_sub_epi32(lhs, rhs); +#else + return simd8i(lhs.low()-rhs.low(), + lhs.high()-rhs.high()); +#endif + } + inline simd8i& operator-= (simd8i& lhs, const simd8i& rhs) + { return lhs = lhs - rhs; return lhs;} + +// ---------------------------------------------------------------------------------------- + + inline simd8i operator* (const simd8i& lhs, const simd8i& rhs) + { +#ifdef DLIB_HAVE_AVX2 + return _mm256_mullo_epi32(lhs, rhs); +#else + return simd8i(lhs.low()*rhs.low(), + lhs.high()*rhs.high()); +#endif + } + inline simd8i& operator*= (simd8i& lhs, const simd8i& rhs) + { return lhs = lhs * rhs; return lhs;} + +// ---------------------------------------------------------------------------------------- + + inline simd8i operator& (const simd8i& lhs, const simd8i& rhs) + { +#ifdef DLIB_HAVE_AVX2 + return _mm256_and_si256(lhs, rhs); +#else + return simd8i(lhs.low()&rhs.low(), + lhs.high()&rhs.high()); +#endif + } + inline simd8i& operator&= (simd8i& lhs, const simd8i& rhs) + { return lhs = lhs & rhs; return lhs;} + +// ---------------------------------------------------------------------------------------- + + inline simd8i operator| (const simd8i& lhs, const simd8i& rhs) + { +#ifdef DLIB_HAVE_AVX2 + return _mm256_or_si256(lhs, rhs); +#else + return simd8i(lhs.low()|rhs.low(), + lhs.high()|rhs.high()); +#endif + } + inline simd8i& operator|= (simd8i& lhs, const simd8i& rhs) + { return lhs = lhs | rhs; return lhs;} + +// ---------------------------------------------------------------------------------------- + + inline simd8i operator^ (const simd8i& lhs, const simd8i& rhs) + { +#ifdef DLIB_HAVE_AVX2 + return _mm256_xor_si256(lhs, rhs); +#else + return simd8i(lhs.low()^rhs.low(), + lhs.high()^rhs.high()); +#endif + } + inline simd8i& operator^= (simd8i& lhs, const simd8i& rhs) + { return lhs = lhs ^ rhs; return lhs;} + +// ---------------------------------------------------------------------------------------- + + inline simd8i operator~ (const simd8i& lhs) + { +#ifdef DLIB_HAVE_AVX2 + return _mm256_xor_si256(lhs, _mm256_set1_epi32(0xFFFFFFFF)); +#else + return simd8i(~lhs.low(), ~lhs.high()); +#endif + } + +// ---------------------------------------------------------------------------------------- + + inline simd8i operator<< (const simd8i& lhs, const int& rhs) + { +#ifdef DLIB_HAVE_AVX2 + return _mm256_sll_epi32(lhs,_mm_cvtsi32_si128(rhs)); +#else + return simd8i(lhs.low()<<rhs, + lhs.high()<<rhs); +#endif + } + inline simd8i& operator<<= (simd8i& lhs, const int& rhs) + { return lhs = lhs << rhs; return lhs;} + +// ---------------------------------------------------------------------------------------- + + inline simd8i operator>> (const simd8i& lhs, const int& rhs) + { +#ifdef DLIB_HAVE_AVX2 + return _mm256_sra_epi32(lhs,_mm_cvtsi32_si128(rhs)); +#else + 
return simd8i(lhs.low()>>rhs, + lhs.high()>>rhs); +#endif + } + inline simd8i& operator>>= (simd8i& lhs, const int& rhs) + { return lhs = lhs >> rhs; return lhs;} + +// ---------------------------------------------------------------------------------------- + + inline simd8i operator== (const simd8i& lhs, const simd8i& rhs) + { +#ifdef DLIB_HAVE_AVX2 + return _mm256_cmpeq_epi32(lhs, rhs); +#else + return simd8i(lhs.low()==rhs.low(), + lhs.high()==rhs.high()); +#endif + } + +// ---------------------------------------------------------------------------------------- + + inline simd8i operator!= (const simd8i& lhs, const simd8i& rhs) + { + return ~(lhs==rhs); + } + +// ---------------------------------------------------------------------------------------- + + inline simd8i operator> (const simd8i& lhs, const simd8i& rhs) + { +#ifdef DLIB_HAVE_AVX2 + return _mm256_cmpgt_epi32(lhs, rhs); +#else + return simd8i(lhs.low()>rhs.low(), + lhs.high()>rhs.high()); +#endif + } + +// ---------------------------------------------------------------------------------------- + + inline simd8i operator< (const simd8i& lhs, const simd8i& rhs) + { + return rhs > lhs; + } + +// ---------------------------------------------------------------------------------------- + + inline simd8i operator<= (const simd8i& lhs, const simd8i& rhs) + { + return ~(lhs > rhs); + } + +// ---------------------------------------------------------------------------------------- + + inline simd8i operator>= (const simd8i& lhs, const simd8i& rhs) + { + return rhs <= lhs; + } + +// ---------------------------------------------------------------------------------------- + + inline simd8i min (const simd8i& lhs, const simd8i& rhs) + { +#ifdef DLIB_HAVE_AVX2 + return _mm256_min_epi32(lhs, rhs); +#else + return simd8i(min(lhs.low(),rhs.low()), + min(lhs.high(),rhs.high())); +#endif + } + +// ---------------------------------------------------------------------------------------- + + inline simd8i max (const simd8i& lhs, const simd8i& rhs) + { +#ifdef DLIB_HAVE_AVX2 + return _mm256_max_epi32(lhs, rhs); +#else + return simd8i(max(lhs.low(),rhs.low()), + max(lhs.high(),rhs.high())); +#endif + } + +// ---------------------------------------------------------------------------------------- + + inline int32 sum(const simd8i& item) + { + return sum(item.low()+item.high()); + } + +// ---------------------------------------------------------------------------------------- + + // perform cmp ? a : b + inline simd8i select(const simd8i& cmp, const simd8i& a, const simd8i& b) + { +#ifdef DLIB_HAVE_AVX2 + return _mm256_blendv_epi8(b,a,cmp); +#else + return simd8i(select(cmp.low(), a.low(), b.low()), + select(cmp.high(), a.high(), b.high())); +#endif + } + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_sIMD8I_Hh_ + + diff --git a/ml/dlib/dlib/simd/simd_check.h b/ml/dlib/dlib/simd/simd_check.h new file mode 100644 index 000000000..c4ca0c3b8 --- /dev/null +++ b/ml/dlib/dlib/simd/simd_check.h @@ -0,0 +1,177 @@ +// Copyright (C) 2013 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_SIMd_CHECK_Hh_ +#define DLIB_SIMd_CHECK_Hh_ + +#include <array> +#include <iostream> + +//#define DLIB_DO_NOT_USE_SIMD + +// figure out which SIMD instructions we can use. 
+#ifndef DLIB_DO_NOT_USE_SIMD + #if defined(_MSC_VER) + #ifdef __AVX__ + #ifndef DLIB_HAVE_SSE2 + #define DLIB_HAVE_SSE2 + #endif + #ifndef DLIB_HAVE_SSE3 + #define DLIB_HAVE_SSE3 + #endif + #ifndef DLIB_HAVE_SSE41 + #define DLIB_HAVE_SSE41 + #endif + #ifndef DLIB_HAVE_AVX + #define DLIB_HAVE_AVX + #endif + #endif + #if (defined( _M_X64) || defined(_M_IX86_FP) && _M_IX86_FP >= 2) && !defined(DLIB_HAVE_SSE2) + #define DLIB_HAVE_SSE2 + #endif + #else + #ifdef __SSE2__ + #ifndef DLIB_HAVE_SSE2 + #define DLIB_HAVE_SSE2 + #endif + #endif + #ifdef __SSSE3__ + #ifndef DLIB_HAVE_SSE3 + #define DLIB_HAVE_SSE3 + #endif + #endif + #ifdef __SSE4_1__ + #ifndef DLIB_HAVE_SSE41 + #define DLIB_HAVE_SSE41 + #endif + #endif + #ifdef __AVX__ + #ifndef DLIB_HAVE_AVX + #define DLIB_HAVE_AVX + #endif + #endif + #ifdef __AVX2__ + #ifndef DLIB_HAVE_AVX2 + #define DLIB_HAVE_AVX2 + #endif + #endif + #ifdef __ALTIVEC__ + #ifndef DLIB_HAVE_ALTIVEC + #define DLIB_HAVE_ALTIVEC + #endif + #endif + #ifdef __VSX__ + #ifndef DLIB_HAVE_VSX + #define DLIB_HAVE_VSX + #endif + #endif + #ifdef __VEC__ // __VEC__ = 10206 + #ifndef DLIB_HAVE_POWER_VEC // vector and vec_ intrinsics + #define DLIB_HAVE_POWER_VEC + #endif + #endif + #ifdef __ARM_NEON + #ifndef DLIB_HAVE_NEON + #define DLIB_HAVE_NEON + #endif + #endif + #endif +#endif + + +// ---------------------------------------------------------------------------------------- + + +#ifdef DLIB_HAVE_ALTIVEC +#include <altivec.h> +#endif + +#ifdef DLIB_HAVE_SSE2 + #include <xmmintrin.h> + #include <emmintrin.h> + #include <mmintrin.h> +#endif +#ifdef DLIB_HAVE_SSE3 + #include <pmmintrin.h> // SSE3 + #include <tmmintrin.h> +#endif +#ifdef DLIB_HAVE_SSE41 + #include <smmintrin.h> // SSE4 +#endif +#ifdef DLIB_HAVE_AVX + #include <immintrin.h> // AVX +#endif +#ifdef DLIB_HAVE_AVX2 + #include <immintrin.h> // AVX +// #include <avx2intrin.h> +#endif +#ifdef DLIB_HAVE_NEON + #include <arm_neon.h> // ARM NEON +#endif + +// ---------------------------------------------------------------------------------------- +// Define functions to check, at runtime, what instructions are available + +#if defined(_MSC_VER) && (defined(_M_I86) || defined(_M_IX86) || defined(_M_X64) || defined(_M_AMD64) ) + #include <intrin.h> + + inline std::array<unsigned int,4> cpuid(int function_id) + { + std::array<unsigned int,4> info; + // Load EAX, EBX, ECX, EDX into info + __cpuid((int*)info.data(), function_id); + return info; + } + +#elif (defined(__GNUC__) || defined(__clang__)) && (defined(__i386__) || defined(__i686__) || defined(__amd64__) || defined(__x86_64__)) + #include <cpuid.h> + + inline std::array<unsigned int,4> cpuid(int function_id) + { + std::array<unsigned int,4> info; + // Load EAX, EBX, ECX, EDX into info + __cpuid(function_id, info[0], info[1], info[2], info[3]); + return info; + } + +#else + + inline std::array<unsigned int,4> cpuid(int) + { + return std::array<unsigned int,4>{}; + } + +#endif + + inline bool cpu_has_sse2_instructions() { return 0!=(cpuid(1)[3]&(1<<26)); } + inline bool cpu_has_sse3_instructions() { return 0!=(cpuid(1)[2]&(1<<0)); } + inline bool cpu_has_sse41_instructions() { return 0!=(cpuid(1)[2]&(1<<19)); } + inline bool cpu_has_sse42_instructions() { return 0!=(cpuid(1)[2]&(1<<20)); } + inline bool cpu_has_avx_instructions() { return 0!=(cpuid(1)[2]&(1<<28)); } + inline bool cpu_has_avx2_instructions() { return 0!=(cpuid(7)[1]&(1<<5)); } + inline bool cpu_has_avx512_instructions() { return 0!=(cpuid(7)[1]&(1<<16)); } + + inline void 
warn_about_unavailable_but_used_cpu_instructions() + { +#if defined(DLIB_HAVE_AVX2) + if (!cpu_has_avx2_instructions()) + std::cerr << "Dlib was compiled to use AVX2 instructions, but these aren't available on your machine." << std::endl; +#elif defined(DLIB_HAVE_AVX) + if (!cpu_has_avx_instructions()) + std::cerr << "Dlib was compiled to use AVX instructions, but these aren't available on your machine." << std::endl; +#elif defined(DLIB_HAVE_SSE41) + if (!cpu_has_sse41_instructions()) + std::cerr << "Dlib was compiled to use SSE41 instructions, but these aren't available on your machine." << std::endl; +#elif defined(DLIB_HAVE_SSE3) + if (!cpu_has_sse3_instructions()) + std::cerr << "Dlib was compiled to use SSE3 instructions, but these aren't available on your machine." << std::endl; +#elif defined(DLIB_HAVE_SSE2) + if (!cpu_has_sse2_instructions()) + std::cerr << "Dlib was compiled to use SSE2 instructions, but these aren't available on your machine." << std::endl; +#endif + } + +// ---------------------------------------------------------------------------------------- + +#endif // DLIB_SIMd_CHECK_Hh_ + + |
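
The DLIB_HAVE_* macros are fixed at compile time, while the cpuid-based helpers at the end of simd_check.h run at execution time, so a binary built with AVX enabled can still be copied onto a machine that lacks it. A small, hypothetical startup check using the functions defined above (the include path is an assumption about how the tree is laid out):

    #include "simd_check.h"   // or the full ml/dlib/dlib/simd/ path, depending on the build setup
    #include <iostream>

    int main()
    {
        // Report what the running CPU supports, independent of the compile-time flags.
        std::cout << std::boolalpha
                  << "SSE2:   " << cpu_has_sse2_instructions()  << "\n"
                  << "SSE4.1: " << cpu_has_sse41_instructions() << "\n"
                  << "AVX:    " << cpu_has_avx_instructions()   << "\n"
                  << "AVX2:   " << cpu_has_avx2_instructions()  << "\n";

        // Writes to std::cerr if the binary was compiled with SIMD flags
        // (e.g. DLIB_HAVE_AVX2) that this machine cannot actually execute.
        warn_about_unavailable_but_used_cpu_instructions();
        return 0;
    }
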