/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 * Copyright (c) Anutosh Bhat                                               *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_WASM_HPP
#define XSIMD_WASM_HPP

#include <type_traits>

#include "../types/xsimd_wasm_register.hpp"

namespace xsimd
{
    template <typename T, class A, bool... Values>
    struct batch_bool_constant;

    template <class T_out, class T_in, class A>
    inline batch<T_out, A> bitwise_cast(batch<T_in, A> const& x) noexcept;

    template <typename T, class A, T... Values>
    struct batch_constant;

    namespace kernel
    {
        using namespace types;

        // fwd
        template <class A, class T, size_t I>
        inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<generic>) noexcept;
        template <class A, typename T, typename ITy, ITy... Indices>
        inline batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<ITy, A, Indices...>, requires_arch<generic>) noexcept;
        template <class A, class T>
        inline batch<T, A> avg(batch<T, A> const&, batch<T, A> const&, requires_arch<generic>) noexcept;

        // abs
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value, void>::type>
        inline batch<T, A> abs(batch<T, A> const& self, requires_arch<wasm>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return wasm_i8x16_abs(self);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return wasm_i16x8_abs(self);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return wasm_i32x4_abs(self);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                return wasm_i64x2_abs(self);
            }
            else
            {
                assert(false && "unsupported arch/op combination");
                return {};
            }
        }
        template <class A>
        inline batch<float, A> abs(batch<float, A> const& self, requires_arch<wasm>) noexcept
        {
            return wasm_f32x4_abs(self);
        }
        template <class A>
        inline batch<double, A> abs(batch<double, A> const& self, requires_arch<wasm>) noexcept
        {
            return wasm_f64x2_abs(self);
        }

        // add
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return wasm_i8x16_add(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return wasm_i16x8_add(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return wasm_i32x4_add(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                return wasm_i64x2_add(self, other);
            }
            else
            {
                assert(false && "unsupported arch/op combination");
                return {};
            }
        }
        template <class A>
        inline batch<float, A> add(batch<float, A> const& self, batch<float, A> const& other, requires_arch<wasm>) noexcept
        {
            return wasm_f32x4_add(self, other);
        }
        template <class A>
        inline batch<double, A> add(batch<double, A> const& self, batch<double, A> const& other, requires_arch<wasm>) noexcept
        {
            return wasm_f64x2_add(self, other);
        }

        // avgr
        template <class A, class T, class = typename std::enable_if<std::is_unsigned<T>::value, void>::type>
        inline batch<T, A> avgr(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return wasm_u8x16_avgr(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return wasm_u16x8_avgr(self, other);
            }
            else
            {
                return avgr(self, other, generic {});
            }
        }

        // avg
        template <class A, class T, class = typename std::enable_if<std::is_unsigned<T>::value, void>::type>
        inline batch<T, A> avg(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                // avgr rounds up, so subtract 1 exactly when (self + other) is odd;
                // ((self ^ other) << 7) >> 7 extracts that low bit in every lane.
                auto adj = ((self ^ other) << 7) >> 7;
                return avgr(self, other, A {}) - adj;
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                auto adj = ((self ^ other) << 15) >> 15;
                return avgr(self, other, A {}) - adj;
            }
            else
            {
                return avg(self, other, generic {});
            }
        }

        // all
        template <class A>
        inline bool all(batch_bool<float, A> const& self, requires_arch<wasm>) noexcept
        {
            return wasm_i32x4_bitmask(self) == 0x0F;
        }
        template <class A>
        inline bool all(batch_bool<double, A> const& self, requires_arch<wasm>) noexcept
        {
            return wasm_i64x2_bitmask(self) == 0x03;
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline bool all(batch_bool<T, A> const& self,
requires_arch) noexcept { return wasm_i8x16_bitmask(self) == 0xFFFF; } // any template inline bool any(batch_bool const& self, requires_arch) noexcept { return wasm_i32x4_bitmask(self) != 0; } template inline bool any(batch_bool const& self, requires_arch) noexcept { return wasm_i64x2_bitmask(self) != 0; } template ::value, void>::type> inline bool any(batch_bool const& self, requires_arch) noexcept { return wasm_i8x16_bitmask(self) != 0; } // batch_bool_cast template inline batch_bool batch_bool_cast(batch_bool const& self, batch_bool const&, requires_arch) noexcept { return { bitwise_cast(batch(self.data)).data }; } // bitwise_and template inline batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept { return wasm_v128_and(self, other); } template inline batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return wasm_v128_and(self, other); } // bitwise_andnot template inline batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept { return wasm_v128_andnot(self, other); } template inline batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return wasm_v128_andnot(self, other); } // bitwise_cast template inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { return batch(self.data); } // bitwise_or template inline batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept { return wasm_v128_or(self, other); } template inline batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return wasm_v128_or(self, other); } // bitwise_lshift template ::value, void>::type> inline batch bitwise_lshift(batch const& self, int32_t other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return wasm_i8x16_shl(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return wasm_i16x8_shl(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return wasm_i32x4_shl(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return wasm_i64x2_shl(self, other); } else { assert(false && "unsupported arch/op combination"); return {}; } } // bitwise_rshift template ::value, void>::type> inline batch bitwise_rshift(batch const& self, int32_t other, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return wasm_i8x16_shr(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return wasm_i16x8_shr(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return wasm_i32x4_shr(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return wasm_i64x2_shr(self, other); } else { assert(false && "unsupported arch/op combination"); return {}; } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return wasm_u8x16_shr(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return wasm_u16x8_shr(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return wasm_u32x4_shr(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return wasm_u64x2_shr(self, other); } else { assert(false && "unsupported arch/op combination"); return {}; } } } // bitwise_not template inline batch bitwise_not(batch const& self, requires_arch) noexcept { return wasm_v128_not(self); } template inline batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept { return wasm_v128_not(self); } // bitwise_xor template inline batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept { return wasm_v128_xor(self, other); } 
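        // Note: batch_bool values are stored as full-width lane masks (every lane is either
        // all-ones or all-zeros), so the batch_bool overloads in this file can reuse the same
        // v128 bitwise intrinsics as their batch counterparts.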
template inline batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return wasm_v128_xor(self, other); } // broadcast template batch inline broadcast(float val, requires_arch) noexcept { return wasm_f32x4_splat(val); } template ::value, void>::type> inline batch broadcast(T val, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return wasm_i8x16_splat(val); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return wasm_i16x8_splat(val); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return wasm_i32x4_splat(val); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return wasm_i64x2_splat(val); } else { assert(false && "unsupported arch/op combination"); return {}; } } template inline batch broadcast(double val, requires_arch) noexcept { return wasm_f64x2_splat(val); } // ceil template inline batch ceil(batch const& self, requires_arch) noexcept { return wasm_f32x4_ceil(self); } template inline batch ceil(batch const& self, requires_arch) noexcept { return wasm_f64x2_ceil(self); } // div template inline batch div(batch const& self, batch const& other, requires_arch) noexcept { return wasm_f32x4_div(self, other); } template inline batch div(batch const& self, batch const& other, requires_arch) noexcept { return wasm_f64x2_div(self, other); } // eq template inline batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept { return wasm_f32x4_eq(self, other); } template inline batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return wasm_i32x4_eq(self, other); } template ::value, void>::type> inline batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return wasm_i8x16_eq(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return wasm_i16x8_eq(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return wasm_i32x4_eq(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return wasm_i64x2_eq(self, other); } else { assert(false && "unsupported arch/op combination"); return {}; } } template ::value, void>::type> inline batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return wasm_i8x16_eq(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return wasm_i16x8_eq(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return wasm_i32x4_eq(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return wasm_i64x2_eq(self, other); } else { assert(false && "unsupported arch/op combination"); return {}; } } template inline batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept { return wasm_f64x2_eq(self, other); } template inline batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return wasm_i64x2_eq(self, other); } // fast_cast namespace detail { template inline batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return wasm_f32x4_convert_i32x4(self); } template inline batch fast_cast(batch const& x, batch const&, requires_arch) noexcept { // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx // adapted to wasm v128_t xH = wasm_u64x2_shr(x, 32); xH = wasm_v128_or(xH, wasm_f64x2_splat(19342813113834066795298816.)); // 2^84 v128_t mask = wasm_i16x8_make(0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000); v128_t xL = wasm_v128_or(wasm_v128_and(mask, x), 
wasm_v128_andnot(wasm_f64x2_splat(0x0010000000000000), mask)); // 2^52 v128_t f = wasm_f64x2_sub(xH, wasm_f64x2_splat(19342813118337666422669312.)); // 2^84 + 2^52 return wasm_f64x2_add(f, xL); } template inline batch fast_cast(batch const& x, batch const&, requires_arch) noexcept { // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx // adapted to wasm v128_t xH = wasm_i32x4_shr(x, 16); xH = wasm_v128_and(xH, wasm_i16x8_make(0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF)); xH = wasm_i64x2_add(xH, wasm_f64x2_splat(442721857769029238784.)); // 3*2^67 v128_t mask = wasm_i16x8_make(0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000); v128_t xL = wasm_v128_or(wasm_v128_and(mask, x), wasm_v128_andnot(wasm_f64x2_splat(0x0010000000000000), mask)); // 2^52 v128_t f = wasm_f64x2_sub(xH, wasm_f64x2_splat(442726361368656609280.)); // 3*2^67 + 2^52 return wasm_f64x2_add(f, xL); } template inline batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return wasm_i32x4_make( static_cast(wasm_f32x4_extract_lane(self, 0)), static_cast(wasm_f32x4_extract_lane(self, 1)), static_cast(wasm_f32x4_extract_lane(self, 2)), static_cast(wasm_f32x4_extract_lane(self, 3))); } } // floor template inline batch floor(batch const& self, requires_arch) noexcept { return wasm_f32x4_floor(self); } template inline batch floor(batch const& self, requires_arch) noexcept { return wasm_f64x2_floor(self); } // from_mask template inline batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept { alignas(A::alignment()) static const uint32_t lut[][4] = { { 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 }, { 0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000 }, { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 }, { 0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000 }, { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 }, { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 }, { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 }, { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF }, { 0xFFFFFFFF, 0x00000000, 0x00000000, 0xFFFFFFFF }, { 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF }, { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF }, { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF }, { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF }, { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }, { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }, }; assert(!(mask & ~0xFul) && "inbound mask"); return wasm_v128_load((const v128_t*)lut[mask]); } template inline batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept { alignas(A::alignment()) static const uint64_t lut[][4] = { { 0x0000000000000000ul, 0x0000000000000000ul }, { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul }, { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul }, { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul }, }; assert(!(mask & ~0x3ul) && "inbound mask"); return wasm_v128_load((const v128_t*)lut[mask]); } template ::value, void>::type> inline batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept { alignas(A::alignment()) static const uint64_t lut64[] = { 0x0000000000000000, 0x000000000000FFFF, 0x00000000FFFF0000, 0x00000000FFFFFFFF, 0x0000FFFF00000000, 0x0000FFFF0000FFFF, 0x0000FFFFFFFF0000, 0x0000FFFFFFFFFFFF, 0xFFFF000000000000, 0xFFFF00000000FFFF, 0xFFFF0000FFFF0000, 0xFFFF0000FFFFFFFF, 0xFFFFFFFF00000000, 0xFFFFFFFF0000FFFF, 0xFFFFFFFFFFFF0000, 0xFFFFFFFFFFFFFFFF, }; 
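            // Each lookup-table entry expands a group of mask bits into lanes that are either
            // all-ones or all-zeros: the 8- and 16-bit element paths assemble the register one
            // nibble of the mask at a time, while the 32- and 64-bit paths load a full 128-bit
            // row directly.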
alignas(A::alignment()) static const uint32_t lut32[] = { 0x00000000, 0x000000FF, 0x0000FF00, 0x0000FFFF, 0x00FF0000, 0x00FF00FF, 0x00FFFF00, 0x00FFFFFF, 0xFF000000, 0xFF0000FF, 0xFF00FF00, 0xFF00FFFF, 0xFFFF0000, 0xFFFF00FF, 0xFFFFFF00, 0xFFFFFFFF, }; alignas(A::alignment()) static const uint32_t lut16[][4] = { { 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 }, { 0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000 }, { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 }, { 0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000 }, { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 }, { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 }, { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 }, { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF }, { 0xFFFFFFFF, 0x00000000, 0x00000000, 0xFFFFFFFF }, { 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF }, { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF }, { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF }, { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF }, { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }, { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }, }; alignas(A::alignment()) static const uint64_t lut8[][4] = { { 0x0000000000000000ul, 0x0000000000000000ul }, { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul }, { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul }, { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul }, }; XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { assert(!(mask & ~0xFFFF) && "inbound mask"); return wasm_i32x4_make(lut32[mask & 0xF], lut32[(mask >> 4) & 0xF], lut32[(mask >> 8) & 0xF], lut32[mask >> 12]); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { assert(!(mask & ~0xFF) && "inbound mask"); return wasm_i64x2_make(lut64[mask & 0xF], lut64[mask >> 4]); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { assert(!(mask & ~0xFul) && "inbound mask"); return wasm_v128_load((const v128_t*)lut16[mask]); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { assert(!(mask & ~0x3ul) && "inbound mask"); return wasm_v128_load((const v128_t*)lut8[mask]); } } // ge template inline batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept { return wasm_f32x4_ge(self, other); } template inline batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept { return wasm_f64x2_ge(self, other); } // gt template inline batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept { return wasm_f32x4_gt(self, other); } template ::value, void>::type> inline batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return wasm_i8x16_gt(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return wasm_i16x8_gt(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return wasm_i32x4_gt(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return wasm_i64x2_gt(self, other); } else { assert(false && "unsupported arch/op combination"); return {}; } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return wasm_u8x16_gt(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return wasm_u16x8_gt(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return wasm_u32x4_gt(self, other); } else { return gt(self, other, generic {}); } } } template inline batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept { return wasm_f64x2_gt(self, other); } // haddp template inline batch haddp(batch const* row, requires_arch) noexcept { v128_t tmp0 = wasm_i32x4_shuffle(row[0], row[1], 0, 4, 1, 5); v128_t tmp1 = 
wasm_i32x4_shuffle(row[0], row[1], 2, 6, 3, 7); v128_t tmp2 = wasm_i32x4_shuffle(row[2], row[3], 2, 6, 3, 7); tmp0 = wasm_f32x4_add(tmp0, tmp1); tmp1 = wasm_i32x4_shuffle(row[2], row[3], 0, 4, 1, 5); tmp1 = wasm_f32x4_add(tmp1, tmp2); tmp2 = wasm_i32x4_shuffle(tmp1, tmp0, 6, 7, 2, 3); tmp0 = wasm_i32x4_shuffle(tmp0, tmp1, 0, 1, 4, 5); return wasm_f32x4_add(tmp0, tmp2); } template inline batch haddp(batch const* row, requires_arch) noexcept { return wasm_f64x2_add(wasm_i64x2_shuffle(row[0], row[1], 0, 2), wasm_i64x2_shuffle(row[0], row[1], 1, 3)); } // insert template inline batch insert(batch const& self, float val, index pos, requires_arch) noexcept { return wasm_f32x4_replace_lane(self, pos, val); } template ::value, void>::type> inline batch insert(batch const& self, T val, index pos, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return wasm_i8x16_replace_lane(self, pos, val); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return wasm_i16x8_replace_lane(self, pos, val); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return wasm_i32x4_replace_lane(self, pos, val); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return wasm_i64x2_replace_lane(self, pos, val); } else { assert(false && "unsupported arch/op combination"); return {}; } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return wasm_u8x16_replace_lane(self, pos, val); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return wasm_u16x8_replace_lane(self, pos, val); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return wasm_u32x4_replace_lane(self, pos, val); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return wasm_u64x2_replace_lane(self, pos, val); } else { assert(false && "unsupported arch/op combination"); return {}; } } } template inline batch insert(batch const& self, double val, index pos, requires_arch) noexcept { return wasm_f64x2_replace_lane(self, pos, val); } // isnan template inline batch_bool isnan(batch const& self, requires_arch) noexcept { return wasm_v128_or(wasm_f32x4_ne(self, self), wasm_f32x4_ne(self, self)); } template inline batch_bool isnan(batch const& self, requires_arch) noexcept { return wasm_v128_or(wasm_f64x2_ne(self, self), wasm_f64x2_ne(self, self)); } // le template inline batch_bool le(batch const& self, batch const& other, requires_arch) noexcept { return wasm_f32x4_le(self, other); } template inline batch_bool le(batch const& self, batch const& other, requires_arch) noexcept { return wasm_f64x2_le(self, other); } // load_aligned template inline batch load_aligned(float const* mem, convert, requires_arch) noexcept { return wasm_v128_load(mem); } template ::value, void>::type> inline batch load_aligned(T const* mem, convert, requires_arch) noexcept { return wasm_v128_load((v128_t const*)mem); } template inline batch load_aligned(double const* mem, convert, requires_arch) noexcept { return wasm_v128_load(mem); } // load_complex namespace detail { template inline batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept { return { wasm_i32x4_shuffle(hi, lo, 0, 2, 4, 6), wasm_i32x4_shuffle(hi, lo, 1, 3, 5, 7) }; } template inline batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept { return { wasm_i64x2_shuffle(hi, lo, 0, 2), wasm_i64x2_shuffle(hi, lo, 1, 3) }; } } // load_unaligned template inline batch load_unaligned(float const* mem, convert, requires_arch) noexcept { return wasm_v128_load(mem); } template ::value, void>::type> inline batch load_unaligned(T const* mem, convert, requires_arch) noexcept { return 
wasm_v128_load((v128_t const*)mem); } template inline batch load_unaligned(double const* mem, convert, requires_arch) noexcept { return wasm_v128_load(mem); } // lt template inline batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept { return wasm_f32x4_lt(self, other); } template ::value, void>::type> inline batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return wasm_i8x16_lt(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return wasm_i16x8_lt(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return wasm_i32x4_lt(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return wasm_i64x2_lt(self, other); } else { assert(false && "unsupported arch/op combination"); return {}; } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return wasm_u8x16_lt(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return wasm_u16x8_lt(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return wasm_u32x4_lt(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { auto xself = wasm_v128_xor(self, wasm_i64x2_splat(std::numeric_limits::lowest())); auto xother = wasm_v128_xor(other, wasm_i64x2_splat(std::numeric_limits::lowest())); v128_t tmp1 = wasm_i64x2_sub(xself, xother); v128_t tmp2 = wasm_v128_xor(xself, xother); v128_t tmp3 = wasm_v128_andnot(xself, xother); v128_t tmp4 = wasm_v128_andnot(tmp1, tmp2); v128_t tmp5 = wasm_v128_or(tmp3, tmp4); v128_t tmp6 = wasm_i32x4_shr(tmp5, 31); return wasm_i32x4_shuffle(tmp6, wasm_i32x4_splat(0), 1, 1, 3, 3); } else { assert(false && "unsupported arch/op combination"); return {}; } } } template inline batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept { return wasm_f64x2_lt(self, other); } // mask template ::value, void>::type> inline uint64_t mask(batch_bool const& self, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return wasm_i8x16_bitmask(self); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return wasm_i16x8_bitmask(self); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return wasm_i32x4_bitmask(self); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return wasm_i64x2_bitmask(self); } else { assert(false && "unsupported arch/op combination"); return {}; } } template inline uint64_t mask(batch_bool const& self, requires_arch) noexcept { return wasm_i32x4_bitmask(self); } template inline uint64_t mask(batch_bool const& self, requires_arch) noexcept { return wasm_i64x2_bitmask(self); } // max template inline batch max(batch const& self, batch const& other, requires_arch) noexcept { return wasm_f32x4_pmax(self, other); } template ::value, void>::type> inline batch max(batch const& self, batch const& other, requires_arch) noexcept { return select(self > other, self, other); } template inline batch max(batch const& self, batch const& other, requires_arch) noexcept { return wasm_f64x2_pmax(self, other); } // min template inline batch min(batch const& self, batch const& other, requires_arch) noexcept { return wasm_f32x4_pmin(self, other); } template ::value, void>::type> inline batch min(batch const& self, batch const& other, requires_arch) noexcept { return select(self <= other, self, other); } template inline batch min(batch const& self, batch const& other, requires_arch) noexcept { return wasm_f64x2_pmin(self, other); } // mul template inline batch mul(batch const& self, batch const& other, requires_arch) noexcept { return wasm_f32x4_mul(self, other); } template inline batch mul(batch const& 
self, batch const& other, requires_arch) noexcept { return wasm_f64x2_mul(self, other); } // neg template ::value, void>::type> inline batch neg(batch const& self, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return wasm_i8x16_neg(self); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return wasm_i16x8_neg(self); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return wasm_i32x4_neg(self); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return wasm_i64x2_neg(self); } else { assert(false && "unsupported arch/op combination"); return {}; } } template inline batch neg(batch const& self, requires_arch) noexcept { return wasm_f32x4_neg(self); } template inline batch neg(batch const& self, requires_arch) noexcept { return wasm_f64x2_neg(self); } // neq template inline batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept { return wasm_f32x4_ne(self, other); } template ::value, void>::type> inline batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept { return ~(self == other); } template inline batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return wasm_f32x4_ne(self, other); } template ::value, void>::type> inline batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return ~(self == other); } template inline batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept { return wasm_f64x2_ne(self, other); } template inline batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return wasm_f64x2_ne(self, other); } // reciprocal template inline batch reciprocal(batch const& self, requires_arch) noexcept { v128_t one = wasm_f32x4_splat(1.0f); return wasm_f32x4_div(one, self); } template inline batch reciprocal(batch const& self, requires_arch) noexcept { v128_t one = wasm_f64x2_splat(1.0); return wasm_f64x2_div(one, self); } // reduce_add template inline float reduce_add(batch const& self, requires_arch) noexcept { v128_t tmp0 = wasm_f32x4_add(self, wasm_i32x4_shuffle(self, self, 6, 7, 2, 3)); v128_t tmp1 = wasm_i32x4_shuffle(tmp0, tmp0, 1, 0, 4, 4); v128_t tmp2 = wasm_f32x4_add(tmp0, tmp1); v128_t tmp3 = wasm_i32x4_shuffle(tmp0, tmp2, 4, 1, 2, 3); return wasm_f32x4_extract_lane(tmp3, 0); } template ::value, void>::type> inline T reduce_add(batch const& self, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { v128_t tmp0 = wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), 2, 3, 0, 0); v128_t tmp1 = wasm_i32x4_add(self, tmp0); v128_t tmp2 = wasm_i32x4_shuffle(tmp1, wasm_i32x4_splat(0), 1, 0, 0, 0); v128_t tmp3 = wasm_i32x4_add(tmp1, tmp2); return wasm_i32x4_extract_lane(tmp3, 0); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { v128_t tmp0 = wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), 2, 3, 0, 0); v128_t tmp1 = wasm_i64x2_add(self, tmp0); return wasm_i64x2_extract_lane(tmp1, 0); } else { return hadd(self, generic {}); } } template inline double reduce_add(batch const& self, requires_arch) noexcept { v128_t tmp0 = wasm_i64x2_shuffle(self, self, 1, 3); v128_t tmp1 = wasm_f64x2_add(self, tmp0); v128_t tmp2 = wasm_i64x2_shuffle(tmp0, tmp1, 2, 1); return wasm_f64x2_extract_lane(tmp2, 0); } // rsqrt template inline batch rsqrt(batch const& self, requires_arch) noexcept { v128_t one = wasm_f32x4_splat(1.0f); return wasm_f32x4_div(one, wasm_f32x4_sqrt(self)); } template inline batch rsqrt(batch const& self, requires_arch) noexcept { v128_t one = wasm_f64x2_splat(1.0); return wasm_f64x2_div(one, wasm_f64x2_sqrt(self)); } // 
slide_left template inline batch slide_left(batch const& x, requires_arch) noexcept { return wasm_i8x16_shuffle( wasm_i64x2_const(0, 0), x, ((N) & 0xF0) ? 0 : 16 - ((N) & 0xF), ((N) & 0xF0) ? 0 : 17 - ((N) & 0xF), ((N) & 0xF0) ? 0 : 18 - ((N) & 0xF), ((N) & 0xF0) ? 0 : 19 - ((N) & 0xF), ((N) & 0xF0) ? 0 : 20 - ((N) & 0xF), ((N) & 0xF0) ? 0 : 21 - ((N) & 0xF), ((N) & 0xF0) ? 0 : 22 - ((N) & 0xF), ((N) & 0xF0) ? 0 : 23 - ((N) & 0xF), ((N) & 0xF0) ? 0 : 24 - ((N) & 0xF), ((N) & 0xF0) ? 0 : 25 - ((N) & 0xF), ((N) & 0xF0) ? 0 : 26 - ((N) & 0xF), ((N) & 0xF0) ? 0 : 27 - ((N) & 0xF), ((N) & 0xF0) ? 0 : 28 - ((N) & 0xF), ((N) & 0xF0) ? 0 : 29 - ((N) & 0xF), ((N) & 0xF0) ? 0 : 30 - ((N) & 0xF), ((N) & 0xF0) ? 0 : 31 - ((N) & 0xF)); } // slide_right template inline batch slide_right(batch const& x, requires_arch) noexcept { return wasm_i8x16_shuffle( x, wasm_i64x2_const(0, 0), ((N) & 0xF0) ? 16 : ((N) & 0xF) + 0, ((N) & 0xF0) ? 16 : ((N) & 0xF) + 1, ((N) & 0xF0) ? 16 : ((N) & 0xF) + 2, ((N) & 0xF0) ? 16 : ((N) & 0xF) + 3, ((N) & 0xF0) ? 16 : ((N) & 0xF) + 4, ((N) & 0xF0) ? 16 : ((N) & 0xF) + 5, ((N) & 0xF0) ? 16 : ((N) & 0xF) + 6, ((N) & 0xF0) ? 16 : ((N) & 0xF) + 7, ((N) & 0xF0) ? 16 : ((N) & 0xF) + 8, ((N) & 0xF0) ? 16 : ((N) & 0xF) + 9, ((N) & 0xF0) ? 16 : ((N) & 0xF) + 10, ((N) & 0xF0) ? 16 : ((N) & 0xF) + 11, ((N) & 0xF0) ? 16 : ((N) & 0xF) + 12, ((N) & 0xF0) ? 16 : ((N) & 0xF) + 13, ((N) & 0xF0) ? 16 : ((N) & 0xF) + 14, ((N) & 0xF0) ? 16 : ((N) & 0xF) + 15); } // sadd template ::value, void>::type> inline batch sadd(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return wasm_i8x16_add_sat(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return wasm_i16x8_add_sat(self, other); } else { return sadd(self, other, generic {}); } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return wasm_u8x16_add_sat(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return wasm_u16x8_add_sat(self, other); } else { return sadd(self, other, generic {}); } } } // select template inline batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept { return wasm_v128_or(wasm_v128_and(cond, true_br), wasm_v128_andnot(false_br, cond)); } template ::value, void>::type> inline batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept { return wasm_v128_or(wasm_v128_and(cond, true_br), wasm_v128_andnot(false_br, cond)); } template ::value, void>::type> inline batch select(batch_bool_constant const&, batch const& true_br, batch const& false_br, requires_arch) noexcept { return select(batch_bool { Values... }, true_br, false_br, wasm {}); } template inline batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept { return wasm_v128_or(wasm_v128_and(cond, true_br), wasm_v128_andnot(false_br, cond)); } // shuffle template inline batch shuffle(batch const& x, batch const& y, batch_constant, requires_arch) noexcept { return wasm_i32x4_shuffle(x, y, I0, I1, I2, I3); } template inline batch shuffle(batch const& x, batch const& y, batch_constant, requires_arch) noexcept { return wasm_i64x2_shuffle(x, y, I0, I1); } // set template inline batch set(batch const&, requires_arch, Values... 
values) noexcept { static_assert(sizeof...(Values) == batch::size, "consistent init"); return wasm_f32x4_make(values...); } template ::value, void>::type> inline batch set(batch const&, requires_arch, T v0, T v1) noexcept { return wasm_i64x2_make(v0, v1); } template ::value, void>::type> inline batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3) noexcept { return wasm_i32x4_make(v0, v1, v2, v3); } template ::value, void>::type> inline batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) noexcept { return wasm_i16x8_make(v0, v1, v2, v3, v4, v5, v6, v7); } template ::value, void>::type> inline batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) noexcept { return wasm_i8x16_make(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); } template inline batch set(batch const&, requires_arch, Values... values) noexcept { static_assert(sizeof...(Values) == batch::size, "consistent init"); return wasm_f64x2_make(values...); } template ::value, void>::type> inline batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept { return set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data; } template inline batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept { static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); return set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data; } template inline batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept { static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); return set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data; } // ssub template ::value, void>::type> inline batch ssub(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return wasm_i8x16_sub_sat(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return wasm_i16x8_sub_sat(self, other); } else { return ssub(self, other, generic {}); } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return wasm_u8x16_sub_sat(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return wasm_u16x8_sub_sat(self, other); } else { return ssub(self, other, generic {}); } } } // store_aligned template inline void store_aligned(float* mem, batch const& self, requires_arch) noexcept { return wasm_v128_store(mem, self); } template ::value, void>::type> inline void store_aligned(T* mem, batch const& self, requires_arch) noexcept { return wasm_v128_store((v128_t*)mem, self); } template ::value, void>::type> inline void store_aligned(T* mem, batch_bool const& self, requires_arch) noexcept { return wasm_v128_store((v128_t*)mem, self); } template inline void store_aligned(double* mem, batch const& self, requires_arch) noexcept { return wasm_v128_store(mem, self); } // store_complex namespace detail { // complex_low template inline batch complex_low(batch, A> const& self, requires_arch) noexcept { return wasm_i32x4_shuffle(self.real(), self.imag(), 0, 4, 1, 5); } // complex_high template inline batch complex_high(batch, A> const& self, requires_arch) noexcept { return wasm_i32x4_shuffle(self.real(), self.imag(), 2, 6, 3, 7); } template inline batch complex_low(batch, A> const& self, requires_arch) noexcept { return wasm_i64x2_shuffle(self.real(), self.imag(), 0, 2); } template inline batch complex_high(batch, A> const& self, requires_arch) noexcept { return 
wasm_i64x2_shuffle(self.real(), self.imag(), 1, 3); } } // store_unaligned template inline void store_unaligned(float* mem, batch const& self, requires_arch) noexcept { return wasm_v128_store(mem, self); } template ::value, void>::type> inline void store_unaligned(T* mem, batch const& self, requires_arch) noexcept { return wasm_v128_store((v128_t*)mem, self); } template ::value, void>::type> inline void store_unaligned(T* mem, batch_bool const& self, requires_arch) noexcept { return wasm_v128_store((v128_t*)mem, self); } template inline void store_unaligned(double* mem, batch const& self, requires_arch) noexcept { return wasm_v128_store(mem, self); } // sub template inline batch sub(batch const& self, batch const& other, requires_arch) noexcept { return wasm_f32x4_sub(self, other); } template ::value, void>::type> inline batch sub(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return wasm_i8x16_sub(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return wasm_i16x8_sub(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return wasm_i32x4_sub(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return wasm_i64x2_sub(self, other); } else { assert(false && "unsupported arch/op combination"); return {}; } } template inline batch sub(batch const& self, batch const& other, requires_arch) noexcept { return wasm_f64x2_sub(self, other); } // sqrt template inline batch sqrt(batch const& val, requires_arch) noexcept { return wasm_f32x4_sqrt(val); } template inline batch sqrt(batch const& val, requires_arch) noexcept { return wasm_f64x2_sqrt(val); } // swizzle template inline batch swizzle(batch const& self, batch_constant, requires_arch) noexcept { return wasm_i32x4_shuffle(self, self, V0, V1, V2, V3); } template inline batch swizzle(batch const& self, batch_constant, requires_arch) noexcept { return wasm_i64x2_shuffle(self, self, V0, V1); } template inline batch swizzle(batch const& self, batch_constant, requires_arch) noexcept { return wasm_i64x2_shuffle(self, self, V0, V1); } template inline batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { return bitwise_cast(swizzle(bitwise_cast(self), mask, wasm {})); } template inline batch swizzle(batch const& self, batch_constant, requires_arch) noexcept { return wasm_i32x4_shuffle(self, self, V0, V1, V2, V3); } template inline batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { return bitwise_cast(swizzle(bitwise_cast(self), mask, wasm {})); } template inline batch swizzle(batch const& self, batch_constant, requires_arch) noexcept { return wasm_i16x8_shuffle(self, self, V0, V1, V2, V3, V4, V5, V6, V7); } template inline batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { return bitwise_cast(swizzle(bitwise_cast(self), mask, wasm {})); } template inline batch swizzle(batch const& self, batch_constant, requires_arch) noexcept { return wasm_i8x16_shuffle(self, self, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15); } template inline batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { return bitwise_cast(swizzle(bitwise_cast(self), mask, wasm {})); } // trunc template inline batch trunc(batch const& self, requires_arch) noexcept { return wasm_f32x4_trunc(self); } template inline batch trunc(batch const& self, requires_arch) noexcept { return wasm_f64x2_trunc(self); } // zip_hi template inline batch zip_hi(batch const& self, batch const& other, 
                                  requires_arch<wasm>) noexcept
        {
            return wasm_i32x4_shuffle(self, other, 2, 6, 3, 7);
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return wasm_i8x16_shuffle(self, other, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return wasm_i16x8_shuffle(self, other, 4, 12, 5, 13, 6, 14, 7, 15);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return wasm_i32x4_shuffle(self, other, 2, 6, 3, 7);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                return wasm_i64x2_shuffle(self, other, 1, 3);
            }
            else
            {
                assert(false && "unsupported arch/op combination");
                return {};
            }
        }
        template <class A>
        inline batch<double, A> zip_hi(batch<double, A> const& self, batch<double, A> const& other, requires_arch<wasm>) noexcept
        {
            return wasm_i64x2_shuffle(self, other, 1, 3);
        }

        // zip_lo
        template <class A>
        inline batch<float, A> zip_lo(batch<float, A> const& self, batch<float, A> const& other, requires_arch<wasm>) noexcept
        {
            return wasm_i32x4_shuffle(self, other, 0, 4, 1, 5);
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return wasm_i8x16_shuffle(self, other, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return wasm_i16x8_shuffle(self, other, 0, 8, 1, 9, 2, 10, 3, 11);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return wasm_i32x4_shuffle(self, other, 0, 4, 1, 5);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                return wasm_i64x2_shuffle(self, other, 0, 2);
            }
            else
            {
                assert(false && "unsupported arch/op combination");
                return {};
            }
        }
        template <class A>
        inline batch<double, A> zip_lo(batch<double, A> const& self, batch<double, A> const& other, requires_arch<wasm>) noexcept
        {
            return wasm_i64x2_shuffle(self, other, 0, 2);
        }
    }
}

#endif
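// Usage sketch (illustrative only, assuming an Emscripten build with -msimd128 so that the
// xsimd::wasm architecture is enabled): the kernels above are not called directly, they are
// reached through the regular batch API, e.g.
//
//   #include "xsimd/xsimd.hpp"
//
//   xsimd::batch<float, xsimd::wasm> a(1.0f, 2.0f, 3.0f, 4.0f);
//   xsimd::batch<float, xsimd::wasm> b(5.0f, 6.0f, 7.0f, 8.0f);
//   auto c = a + b; // dispatches to kernel::add(..., requires_arch<wasm> {})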