diff options
Diffstat (limited to 'ml/dlib/dlib/simd/simd4f.h')
-rw-r--r-- | ml/dlib/dlib/simd/simd4f.h | 685 |
1 files changed, 0 insertions, 685 deletions
diff --git a/ml/dlib/dlib/simd/simd4f.h b/ml/dlib/dlib/simd/simd4f.h deleted file mode 100644 index 2bfadd23f..000000000 --- a/ml/dlib/dlib/simd/simd4f.h +++ /dev/null @@ -1,685 +0,0 @@ -// Copyright (C) 2013 Davis E. King (davis@dlib.net) -// License: Boost Software License See LICENSE.txt for the full license. -#ifndef DLIB_sIMD4F_Hh_ -#define DLIB_sIMD4F_Hh_ - -#include "simd_check.h" -#include "simd4i.h" -#include <cmath> -#include <iostream> - -namespace dlib -{ - -#ifdef DLIB_HAVE_SSE2 - class simd4f - { - public: - typedef float type; - - inline simd4f() {} - inline simd4f(float f) { x = _mm_set1_ps(f); } - inline simd4f(float r0, float r1, float r2, float r3) { x = _mm_setr_ps(r0,r1,r2,r3); } - inline simd4f(const __m128& val):x(val) {} - inline simd4f(const simd4i& val):x(_mm_cvtepi32_ps(val)) {} - - inline simd4f& operator=(const simd4i& val) - { - x = simd4f(val); - return *this; - } - - inline simd4f& operator=(const float& val) - { - x = simd4f(val); - return *this; - } - - inline simd4f& operator=(const __m128& val) - { - x = val; - return *this; - } - - inline operator __m128() const { return x; } - - // truncate to 32bit integers - inline operator __m128i() const { return _mm_cvttps_epi32(x); } - - inline void load_aligned(const type* ptr) { x = _mm_load_ps(ptr); } - inline void store_aligned(type* ptr) const { _mm_store_ps(ptr, x); } - inline void load(const type* ptr) { x = _mm_loadu_ps(ptr); } - inline void store(type* ptr) const { _mm_storeu_ps(ptr, x); } - - inline unsigned int size() const { return 4; } - inline float operator[](unsigned int idx) const - { - float temp[4]; - store(temp); - return temp[idx]; - } - - private: - __m128 x; - }; - - class simd4f_bool - { - public: - typedef float type; - - inline simd4f_bool() {} - inline simd4f_bool(const __m128& val):x(val) {} - - inline simd4f_bool& operator=(const __m128& val) - { - x = val; - return *this; - } - - inline operator __m128() const { return x; } - - - private: - __m128 x; - }; - -#elif defined(DLIB_HAVE_VSX) - - class simd4f - { - typedef union { - vector float v; - float x[4]; - } v4f; - - v4f x; - - public: - inline simd4f() : x{0,0,0,0} {} - inline simd4f(const simd4f& v) : x(v.x) { } - inline simd4f(const vector float& v) : x{v} { } - - inline simd4f(const simd4i& v) { - x.x[0]=v[0]; x.x[1]=v[1]; x.x[2]=v[2]; x.x[3]=v[3]; - } - - - inline simd4f(float f) : x{f,f,f,f} { } - inline simd4f(float r0, float r1, float r2, float r3) - : x{r0,r1,r2,r3} { } - - inline simd4f& operator=(const simd4f& v) { x = v.x; return *this; } - inline simd4f& operator=(const float& v) { *this = simd4f(v); return *this; } - - inline vector float operator() () const { return x.v; } - inline float operator[](unsigned int idx) const { return x.x[idx]; } - - inline void load_aligned(const float* ptr) { x.v = vec_ld(0, ptr); } - inline void store_aligned(float* ptr) const { vec_st(x.v, 0, ptr); } - inline void load(const float* ptr) { x.v = vec_vsx_ld(0, ptr); } - inline void store(float* ptr) const { vec_vsx_st(x.v, 0, ptr); } - - - // truncate to 32bit integers - inline operator simd4i::rawarray() const - { - simd4i::rawarray temp; - temp.v.x[0] = x.x[0]; - temp.v.x[1] = x.x[1]; - temp.v.x[2] = x.x[2]; - temp.v.x[3] = x.x[3]; - return temp; - } - }; - - typedef simd4i simd4f_bool; - -#elif defined(DLIB_HAVE_NEON) - - class simd4f - { - public: - typedef float type; - - inline simd4f() {} - inline simd4f(float f) { x = vdupq_n_f32(f); } - inline simd4f(float r0, float r1, float r2, float r3) - { - float __attribute__ ((aligned (16))) data[4] = { r0, r1, r2, r3 }; - x = vld1q_f32(data); - } - inline simd4f(const float32x4_t& val):x(val) {} - inline simd4f(const simd4i& val):x(vcvtq_f32_s32(val)) {} - - inline simd4f& operator=(const simd4i& val) - { - x = simd4f(val); - return *this; - } - - inline simd4f& operator=(const float& val) - { - x = simd4f(val); - return *this; - } - - inline simd4f& operator=(const float32x4_t& val) - { - x = val; - return *this; - } - - inline operator float32x4_t() const { return x; } - - // truncate to 32bit integers - inline operator int32x4_t() const { return vcvtq_s32_f32(x); } - - inline void load_aligned(const type* ptr) { x = vld1q_f32(ptr); } - inline void store_aligned(type* ptr) const { vst1q_f32(ptr, x); } - inline void load(const type* ptr) { x = vld1q_f32(ptr); } - inline void store(type* ptr) const { vst1q_f32(ptr, x); } - - inline unsigned int size() const { return 4; } - inline float operator[](unsigned int idx) const - { - float temp[4]; - store(temp); - return temp[idx]; - } - - private: - float32x4_t x; - }; - - - typedef simd4i simd4f_bool; -#else - class simd4f - { - public: - typedef float type; - - inline simd4f() {} - inline simd4f(float f) { x[0]=f; x[1]=f; x[2]=f; x[3]=f; } - inline simd4f(float r0, float r1, float r2, float r3) { x[0]=r0; x[1]=r1; x[2]=r2; x[3]=r3;} - inline simd4f(const simd4i& val) { x[0]=val[0]; x[1]=val[1]; x[2]=val[2]; x[3]=val[3];} - - // truncate to 32bit integers - inline operator simd4i::rawarray() const - { - simd4i::rawarray temp; - temp.a[0] = (int32)x[0]; - temp.a[1] = (int32)x[1]; - temp.a[2] = (int32)x[2]; - temp.a[3] = (int32)x[3]; - return temp; - } - - inline simd4f& operator=(const float& val) - { - *this = simd4f(val); - return *this; - } - - inline simd4f& operator=(const simd4i& val) - { - x[0] = val[0]; - x[1] = val[1]; - x[2] = val[2]; - x[3] = val[3]; - return *this; - } - - - inline void load_aligned(const type* ptr) - { - x[0] = ptr[0]; - x[1] = ptr[1]; - x[2] = ptr[2]; - x[3] = ptr[3]; - } - - inline void store_aligned(type* ptr) const - { - ptr[0] = x[0]; - ptr[1] = x[1]; - ptr[2] = x[2]; - ptr[3] = x[3]; - } - - inline void load(const type* ptr) - { - x[0] = ptr[0]; - x[1] = ptr[1]; - x[2] = ptr[2]; - x[3] = ptr[3]; - } - - inline void store(type* ptr) const - { - ptr[0] = x[0]; - ptr[1] = x[1]; - ptr[2] = x[2]; - ptr[3] = x[3]; - } - - inline unsigned int size() const { return 4; } - inline float operator[](unsigned int idx) const { return x[idx]; } - - private: - float x[4]; - }; - - - class simd4f_bool - { - public: - typedef float type; - - inline simd4f_bool() {} - inline simd4f_bool(bool r0, bool r1, bool r2, bool r3) { x[0]=r0; x[1]=r1; x[2]=r2; x[3]=r3;} - - inline bool operator[](unsigned int idx) const { return x[idx]; } - private: - bool x[4]; - }; - -#endif - -// ---------------------------------------------------------------------------------------- - - inline std::ostream& operator<<(std::ostream& out, const simd4f& item) - { - float temp[4]; - item.store(temp); - out << "(" << temp[0] << ", " << temp[1] << ", " << temp[2] << ", " << temp[3] << ")"; - return out; - } - -// ---------------------------------------------------------------------------------------- - - inline simd4f operator+ (const simd4f& lhs, const simd4f& rhs) - { -#ifdef DLIB_HAVE_SSE2 - return _mm_add_ps(lhs, rhs); -#elif defined(DLIB_HAVE_VSX) - return vec_add(lhs(), rhs()); -#elif defined(DLIB_HAVE_NEON) - return vaddq_f32(lhs, rhs); -#else - return simd4f(lhs[0]+rhs[0], - lhs[1]+rhs[1], - lhs[2]+rhs[2], - lhs[3]+rhs[3]); -#endif - } - inline simd4f& operator+= (simd4f& lhs, const simd4f& rhs) - { lhs = lhs + rhs; return lhs; } - -// ---------------------------------------------------------------------------------------- - - inline simd4f operator- (const simd4f& lhs, const simd4f& rhs) - { -#ifdef DLIB_HAVE_SSE2 - return _mm_sub_ps(lhs, rhs); -#elif defined(DLIB_HAVE_VSX) - return vec_sub(lhs(), rhs()); -#elif defined(DLIB_HAVE_NEON) - return vsubq_f32(lhs, rhs); -#else - return simd4f(lhs[0]-rhs[0], - lhs[1]-rhs[1], - lhs[2]-rhs[2], - lhs[3]-rhs[3]); -#endif - } - inline simd4f& operator-= (simd4f& lhs, const simd4f& rhs) - { lhs = lhs - rhs; return lhs; } - -// ---------------------------------------------------------------------------------------- - - inline simd4f operator* (const simd4f& lhs, const simd4f& rhs) - { -#ifdef DLIB_HAVE_SSE2 - return _mm_mul_ps(lhs, rhs); -#elif defined(DLIB_HAVE_VSX) - return vec_mul(lhs(), rhs()); -#elif defined(DLIB_HAVE_NEON) - return vmulq_f32(lhs, rhs); -#else - return simd4f(lhs[0]*rhs[0], - lhs[1]*rhs[1], - lhs[2]*rhs[2], - lhs[3]*rhs[3]); -#endif - } - inline simd4f& operator*= (simd4f& lhs, const simd4f& rhs) - { lhs = lhs * rhs; return lhs; } - -// ---------------------------------------------------------------------------------------- - - inline simd4f operator/ (const simd4f& lhs, const simd4f& rhs) - { -#ifdef DLIB_HAVE_SSE2 - return _mm_div_ps(lhs, rhs); -#elif defined(DLIB_HAVE_VSX) - return vec_div(lhs(), rhs()); -#elif defined(DLIB_HAVE_NEON) - float32x4_t reciprocal = vrecpeq_f32(rhs); - reciprocal = vmulq_f32(vrecpsq_f32(rhs, reciprocal), reciprocal); - reciprocal = vmulq_f32(vrecpsq_f32(rhs, reciprocal), reciprocal); - float32x4_t result = vmulq_f32(lhs,reciprocal); - return result; -#else - return simd4f(lhs[0]/rhs[0], - lhs[1]/rhs[1], - lhs[2]/rhs[2], - lhs[3]/rhs[3]); -#endif - } - inline simd4f& operator/= (simd4f& lhs, const simd4f& rhs) - { lhs = lhs / rhs; return lhs; } - -// ---------------------------------------------------------------------------------------- - - inline simd4f_bool operator== (const simd4f& lhs, const simd4f& rhs) - { -#ifdef DLIB_HAVE_SSE2 - return _mm_cmpeq_ps(lhs, rhs); -#elif defined(DLIB_HAVE_VSX) - return vec_cmpeq(lhs(), rhs()); -#elif defined(DLIB_HAVE_NEON) - return (int32x4_t)vceqq_f32(lhs, rhs); -#else - return simd4f_bool(lhs[0]==rhs[0], - lhs[1]==rhs[1], - lhs[2]==rhs[2], - lhs[3]==rhs[3]); -#endif - } - -// ---------------------------------------------------------------------------------------- - - inline simd4f_bool operator!= (const simd4f& lhs, const simd4f& rhs) - { -#ifdef DLIB_HAVE_SSE2 - return _mm_cmpneq_ps(lhs, rhs); -#elif defined(DLIB_HAVE_VSX) || defined(DLIB_HAVE_NEON) - return ~(lhs==rhs); // simd4f_bool is simd4i typedef, can use ~ -#else - return simd4f_bool(lhs[0]!=rhs[0], - lhs[1]!=rhs[1], - lhs[2]!=rhs[2], - lhs[3]!=rhs[3]); -#endif - } - -// ---------------------------------------------------------------------------------------- - - inline simd4f_bool operator< (const simd4f& lhs, const simd4f& rhs) - { -#ifdef DLIB_HAVE_SSE2 - return _mm_cmplt_ps(lhs, rhs); -#elif defined(DLIB_HAVE_VSX) - return vec_cmplt(lhs(), rhs()); -#elif defined(DLIB_HAVE_NEON) - return (int32x4_t)vcltq_f32(lhs, rhs); -#else - return simd4f_bool(lhs[0]<rhs[0], - lhs[1]<rhs[1], - lhs[2]<rhs[2], - lhs[3]<rhs[3]); -#endif - } - -// ---------------------------------------------------------------------------------------- - - inline simd4f_bool operator> (const simd4f& lhs, const simd4f& rhs) - { - return rhs < lhs; - } - -// ---------------------------------------------------------------------------------------- - - inline simd4f_bool operator<= (const simd4f& lhs, const simd4f& rhs) - { -#ifdef DLIB_HAVE_SSE2 - return _mm_cmple_ps(lhs, rhs); -#elif defined(DLIB_HAVE_VSX) - return vec_cmple(lhs(), rhs()); -#elif defined(DLIB_HAVE_NEON) - return (int32x4_t)vcleq_f32(lhs, rhs); -#else - return simd4f_bool(lhs[0]<=rhs[0], - lhs[1]<=rhs[1], - lhs[2]<=rhs[2], - lhs[3]<=rhs[3]); -#endif - } - -// ---------------------------------------------------------------------------------------- - - inline simd4f_bool operator>= (const simd4f& lhs, const simd4f& rhs) - { - return rhs <= lhs; - } - -// ---------------------------------------------------------------------------------------- - - inline simd4f min (const simd4f& lhs, const simd4f& rhs) - { -#ifdef DLIB_HAVE_SSE2 - return _mm_min_ps(lhs, rhs); -#elif defined(DLIB_HAVE_VSX) - return vec_min(lhs(), rhs()); -#elif defined(DLIB_HAVE_NEON) - return vminq_f32(lhs, rhs); -#else - return simd4f(std::min(lhs[0],rhs[0]), - std::min(lhs[1],rhs[1]), - std::min(lhs[2],rhs[2]), - std::min(lhs[3],rhs[3])); -#endif - } - -// ---------------------------------------------------------------------------------------- - - inline simd4f max (const simd4f& lhs, const simd4f& rhs) - { -#ifdef DLIB_HAVE_SSE2 - return _mm_max_ps(lhs, rhs); -#elif defined(DLIB_HAVE_VSX) - return vec_max(lhs(), rhs()); -#elif defined(DLIB_HAVE_NEON) - return vmaxq_f32(lhs, rhs); -#else - return simd4f(std::max(lhs[0],rhs[0]), - std::max(lhs[1],rhs[1]), - std::max(lhs[2],rhs[2]), - std::max(lhs[3],rhs[3])); -#endif - } - -// ---------------------------------------------------------------------------------------- - - inline simd4f reciprocal (const simd4f& item) - { -#ifdef DLIB_HAVE_SSE2 - return _mm_rcp_ps(item); -#elif defined(DLIB_HAVE_VSX) - return vec_re(item()); -#elif defined(DLIB_HAVE_NEON) - float32x4_t estimate = vrecpeq_f32(item); - estimate = vmulq_f32(vrecpsq_f32(estimate , item), estimate ); - estimate = vmulq_f32(vrecpsq_f32(estimate , item), estimate ); - return estimate ; -#else - return simd4f(1.0f/item[0], - 1.0f/item[1], - 1.0f/item[2], - 1.0f/item[3]); -#endif - } - -// ---------------------------------------------------------------------------------------- - - inline simd4f reciprocal_sqrt (const simd4f& item) - { -#ifdef DLIB_HAVE_SSE2 - return _mm_rsqrt_ps(item); -#elif defined(DLIB_HAVE_VSX) - return vec_rsqrt(item()); -#elif defined(DLIB_HAVE_NEON) - float32x4_t estimate = vrsqrteq_f32(item); - simd4f estimate2 = vmulq_f32(estimate, item); - estimate = vmulq_f32(estimate, vrsqrtsq_f32(estimate2, estimate)); - return estimate; -#else - return simd4f(1.0f/std::sqrt(item[0]), - 1.0f/std::sqrt(item[1]), - 1.0f/std::sqrt(item[2]), - 1.0f/std::sqrt(item[3])); -#endif - } - -// ---------------------------------------------------------------------------------------- - - inline float dot(const simd4f& lhs, const simd4f& rhs); - inline float sum(const simd4f& item) - { -#ifdef DLIB_HAVE_SSE41 - return dot(simd4f(1), item); -#elif defined(DLIB_HAVE_SSE3) - simd4f temp = _mm_hadd_ps(item,item); - return _mm_cvtss_f32(_mm_hadd_ps(temp,temp)); -#elif defined(DLIB_HAVE_SSE2) && (!defined(_MSC_VER) || _MSC_VER!=1400) - simd4f temp = _mm_add_ps(item,_mm_movehl_ps(item,item)); - simd4f temp2 = _mm_shuffle_ps(temp,temp,1); - return _mm_cvtss_f32(_mm_add_ss(temp,temp2)); -#elif defined(DLIB_HAVE_NEON) - float32x2_t r = vadd_f32(vget_high_f32(item), vget_low_f32(item)); - return vget_lane_f32(vpadd_f32(r, r), 0); -#else - return item[0]+item[1]+item[2]+item[3]; -#endif - } - -// ---------------------------------------------------------------------------------------- - - inline float dot(const simd4f& lhs, const simd4f& rhs) - { -#ifdef DLIB_HAVE_SSE41 - return _mm_cvtss_f32(_mm_dp_ps(lhs, rhs, 0xff)); -#else - return sum(lhs*rhs); -#endif - } - -// ---------------------------------------------------------------------------------------- - - inline simd4f sqrt(const simd4f& item) - { -#ifdef DLIB_HAVE_SSE2 - return _mm_sqrt_ps(item); -#elif defined(DLIB_HAVE_VSX) - return vec_sqrt(item()); -#elif defined(DLIB_HAVE_NEON) - float32x4_t q_step_0 = vrsqrteq_f32(item); - float32x4_t q_step_parm0 = vmulq_f32(item, q_step_0); - float32x4_t q_step_result0 = vrsqrtsq_f32(q_step_parm0, q_step_0); - float32x4_t q_step_1 = vmulq_f32(q_step_0, q_step_result0); - float32x4_t q_step_parm1 = vmulq_f32(item, q_step_1); - float32x4_t q_step_result1 = vrsqrtsq_f32(q_step_parm1, q_step_1); - float32x4_t q_step_2 = vmulq_f32(q_step_1, q_step_result1); - float32x4_t res3 = vmulq_f32(item, q_step_2); - - // normalize sqrt(0)=0 - uint32x4_t zcomp = vceqq_f32(vdupq_n_f32(0), item); - float32x4_t rcorr = vbslq_f32(zcomp, item, res3); - return rcorr; -#else - return simd4f(std::sqrt(item[0]), - std::sqrt(item[1]), - std::sqrt(item[2]), - std::sqrt(item[3])); -#endif - } - -// ---------------------------------------------------------------------------------------- - - inline simd4f ceil(const simd4f& item) - { -#ifdef DLIB_HAVE_SSE41 - return _mm_ceil_ps(item); -#elif defined(DLIB_HAVE_SSE2) || defined(DLIB_HAVE_NEON) - float temp[4]; - item.store(temp); - temp[0] = std::ceil(temp[0]); - temp[1] = std::ceil(temp[1]); - temp[2] = std::ceil(temp[2]); - temp[3] = std::ceil(temp[3]); - simd4f temp2; - temp2.load(temp); - return temp2; -#elif defined(DLIB_HAVE_VSX) - return vec_ceil(item()); -#else - return simd4f(std::ceil(item[0]), - std::ceil(item[1]), - std::ceil(item[2]), - std::ceil(item[3])); -#endif - } - -// ---------------------------------------------------------------------------------------- - - inline simd4f floor(const simd4f& item) - { -#ifdef DLIB_HAVE_SSE41 - return _mm_floor_ps(item); -#elif defined(DLIB_HAVE_SSE2) || defined(DLIB_HAVE_NEON) - float temp[4]; - item.store(temp); - temp[0] = std::floor(temp[0]); - temp[1] = std::floor(temp[1]); - temp[2] = std::floor(temp[2]); - temp[3] = std::floor(temp[3]); - simd4f temp2; - temp2.load(temp); - return temp2; -#elif defined(DLIB_HAVE_VSX) - return vec_floor(item()); -#else - return simd4f(std::floor(item[0]), - std::floor(item[1]), - std::floor(item[2]), - std::floor(item[3])); -#endif - } - -// ---------------------------------------------------------------------------------------- - - // perform cmp ? a : b - inline simd4f select(const simd4f_bool& cmp, const simd4f& a, const simd4f& b) - { -#ifdef DLIB_HAVE_SSE41 - return _mm_blendv_ps(b,a,cmp); -#elif defined(DLIB_HAVE_SSE2) - return _mm_or_ps(_mm_and_ps(cmp,a) , _mm_andnot_ps(cmp,b)); -#elif defined(DLIB_HAVE_NEON) - return vbslq_f32(cmp, a, b); -#else - return simd4f(cmp[0]?a[0]:b[0], - cmp[1]?a[1]:b[1], - cmp[2]?a[2]:b[2], - cmp[3]?a[3]:b[3]); -#endif - } - -// ---------------------------------------------------------------------------------------- - -} - -#endif // DLIB_sIMD4F_Hh_ - |