// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Target-independent types/functions defined after target-specific ops.

#include "hwy/base.h"

// Define detail::Shuffle1230 etc, but only when viewing the current header;
// normally this is included via highway.h, which includes ops/*.h.
#if HWY_IDE && !defined(HWY_HIGHWAY_INCLUDED)
#include "hwy/detect_targets.h"
#include "hwy/ops/emu128-inl.h"
#endif  // HWY_IDE

// Relies on the external include guard in highway.h.
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

// The lane type of a vector type, e.g. float for Vec<ScalableTag<float>>.
template <class V>
using LaneType = decltype(GetLane(V()));

// Vector type, e.g. Vec128<float> for CappedTag<float, 4>. Useful as the
// return type of functions that do not take a vector argument, or as an
// argument type if the function only has a template argument for D, or for
// explicit type names instead of auto. This may be a built-in type.
template <class D>
using Vec = decltype(Zero(D()));

// Mask type. Useful as the return type of functions that do not take a mask
// argument, or as an argument type if the function only has a template
// argument for D, or for explicit type names instead of auto.
template <class D>
using Mask = decltype(MaskFromVec(Zero(D())));

// Returns the closest value to v within [lo, hi].
template <class V>
HWY_API V Clamp(const V v, const V lo, const V hi) {
  return Min(Max(lo, v), hi);
}

// CombineShiftRightBytes (and -Lanes) are not available for the scalar target,
// and RVV has its own implementation of -Lanes.
#if HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_RVV

template <size_t kLanes, class D>
HWY_API VFromD<D> CombineShiftRightLanes(D d, VFromD<D> hi, VFromD<D> lo) {
  constexpr size_t kBytes = kLanes * sizeof(TFromD<D>);
  static_assert(kBytes < 16, "Shift count is per-block");
  return CombineShiftRightBytes<kBytes>(d, hi, lo);
}

#endif

// Returns lanes with the most significant bit set and all other bits zero.
template <class D>
HWY_API Vec<D> SignBit(D d) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, Set(du, SignMask<TFromD<D>>()));
}

// Returns quiet NaN.
template <class D>
HWY_API Vec<D> NaN(D d) {
  const RebindToSigned<decltype(d)> di;
  // LimitsMax sets all exponent and mantissa bits to 1. The exponent plus
  // mantissa MSB (to indicate quiet) would be sufficient.
  return BitCast(d, Set(di, LimitsMax<TFromD<decltype(di)>>()));
}

// Returns positive infinity.
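// Usage sketch (comment only, not part of the API) for the helpers above and
// for Inf(), which the comment above introduces. `v` and `m` are assumed
// caller-side vector/mask names:
//   const ScalableTag<float> df;
//   const auto clamped = Clamp(v, Zero(df), Set(df, 1.0f));  // clamp to [0,1]
//   const auto negated = Xor(v, SignBit(df));                // flip sign bits
//   const auto marked = IfThenElse(m, v, NaN(df));   // NaN where m is false
//   const auto inf = Inf(df);                        // +infinity in all lanes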
template HWY_API Vec Inf(D d) { const RebindToUnsigned du; using T = TFromD; using TU = TFromD; const TU max_x2 = static_cast(MaxExponentTimes2()); return BitCast(d, Set(du, max_x2 >> 1)); } // ------------------------------ ZeroExtendResizeBitCast // The implementation of detail::ZeroExtendResizeBitCast for the HWY_EMU128 // target is in emu128-inl.h, and the implementation of // detail::ZeroExtendResizeBitCast for the HWY_SCALAR target is in scalar-inl.h #if HWY_TARGET != HWY_EMU128 && HWY_TARGET != HWY_SCALAR namespace detail { #if HWY_HAVE_SCALABLE template HWY_INLINE VFromD ZeroExtendResizeBitCast( hwy::SizeTag /* from_size_tag */, hwy::SizeTag /* to_size_tag */, DTo d_to, DFrom d_from, VFromD v) { using TFrom = TFromD; using TTo = TFromD; using TResize = UnsignedFromSize; const Repartition d_resize_from; const Repartition d_resize_to; return BitCast(d_to, IfThenElseZero(FirstN(d_resize_to, Lanes(d_resize_from)), ResizeBitCast(d_resize_to, v))); } #else // target that uses fixed-size vectors // Truncating or same-size resizing cast: same as ResizeBitCast template HWY_INLINE VFromD ZeroExtendResizeBitCast( hwy::SizeTag /* from_size_tag */, hwy::SizeTag /* to_size_tag */, DTo d_to, DFrom /*d_from*/, VFromD v) { return ResizeBitCast(d_to, v); } // Resizing cast to vector that has twice the number of lanes of the source // vector template HWY_INLINE VFromD ZeroExtendResizeBitCast( hwy::SizeTag /* from_size_tag */, hwy::SizeTag /* to_size_tag */, DTo d_to, DFrom d_from, VFromD v) { const Twice dt_from; return BitCast(d_to, ZeroExtendVector(dt_from, v)); } // Resizing cast to vector that has more than twice the number of lanes of the // source vector template HWY_INLINE VFromD ZeroExtendResizeBitCast( hwy::SizeTag /* from_size_tag */, hwy::SizeTag /* to_size_tag */, DTo d_to, DFrom /*d_from*/, VFromD v) { using TFrom = TFromD; constexpr size_t kNumOfFromLanes = kFromVectSize / sizeof(TFrom); const Repartition d_resize_to; return BitCast(d_to, IfThenElseZero(FirstN(d_resize_to, kNumOfFromLanes), ResizeBitCast(d_resize_to, v))); } #endif // HWY_HAVE_SCALABLE } // namespace detail #endif // HWY_TARGET != HWY_EMU128 && HWY_TARGET != HWY_SCALAR template HWY_API VFromD ZeroExtendResizeBitCast(DTo d_to, DFrom d_from, VFromD v) { return detail::ZeroExtendResizeBitCast(hwy::SizeTag(), hwy::SizeTag(), d_to, d_from, v); } // ------------------------------ SafeFillN template > HWY_API void SafeFillN(const size_t num, const T value, D d, T* HWY_RESTRICT to) { #if HWY_MEM_OPS_MIGHT_FAULT (void)d; for (size_t i = 0; i < num; ++i) { to[i] = value; } #else BlendedStore(Set(d, value), FirstN(d, num), d, to); #endif } // ------------------------------ SafeCopyN template > HWY_API void SafeCopyN(const size_t num, D d, const T* HWY_RESTRICT from, T* HWY_RESTRICT to) { #if HWY_MEM_OPS_MIGHT_FAULT (void)d; for (size_t i = 0; i < num; ++i) { to[i] = from[i]; } #else const Mask mask = FirstN(d, num); BlendedStore(MaskedLoad(mask, d, from), mask, d, to); #endif } // ------------------------------ BitwiseIfThenElse #if (defined(HWY_NATIVE_BITWISE_IF_THEN_ELSE) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE #undef HWY_NATIVE_BITWISE_IF_THEN_ELSE #else #define HWY_NATIVE_BITWISE_IF_THEN_ELSE #endif template HWY_API V BitwiseIfThenElse(V mask, V yes, V no) { return Or(And(mask, yes), AndNot(mask, no)); } #endif // HWY_NATIVE_BITWISE_IF_THEN_ELSE // "Include guard": skip if native instructions are available. 
The generic // implementation is currently shared between x86_* and wasm_*, and is too large // to duplicate. #if HWY_IDE || \ (defined(HWY_NATIVE_LOAD_STORE_INTERLEAVED) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED #undef HWY_NATIVE_LOAD_STORE_INTERLEAVED #else #define HWY_NATIVE_LOAD_STORE_INTERLEAVED #endif // ------------------------------ LoadInterleaved2 template HWY_API void LoadInterleaved2(D d, const TFromD* HWY_RESTRICT unaligned, VFromD& v0, VFromD& v1) { const VFromD A = LoadU(d, unaligned); // v1[1] v0[1] v1[0] v0[0] const VFromD B = LoadU(d, unaligned + Lanes(d)); v0 = ConcatEven(d, B, A); v1 = ConcatOdd(d, B, A); } template HWY_API void LoadInterleaved2(D d, const TFromD* HWY_RESTRICT unaligned, VFromD& v0, VFromD& v1) { v0 = LoadU(d, unaligned + 0); v1 = LoadU(d, unaligned + 1); } // ------------------------------ LoadInterleaved3 (CombineShiftRightBytes) namespace detail { #if HWY_IDE template HWY_INLINE V ShuffleTwo1230(V a, V /* b */) { return a; } template HWY_INLINE V ShuffleTwo2301(V a, V /* b */) { return a; } template HWY_INLINE V ShuffleTwo3012(V a, V /* b */) { return a; } #endif // HWY_IDE // Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload. template HWY_INLINE void LoadTransposedBlocks3(D d, const TFromD* HWY_RESTRICT unaligned, VFromD& A, VFromD& B, VFromD& C) { constexpr size_t kN = MaxLanes(d); A = LoadU(d, unaligned + 0 * kN); B = LoadU(d, unaligned + 1 * kN); C = LoadU(d, unaligned + 2 * kN); } } // namespace detail template HWY_API void LoadInterleaved3(D d, const TFromD* HWY_RESTRICT unaligned, VFromD& v0, VFromD& v1, VFromD& v2) { const RebindToUnsigned du; using V = VFromD; // Compact notation so these fit on one line: 12 := v1[2]. V A; // 05 24 14 04 23 13 03 22 12 02 21 11 01 20 10 00 V B; // 1a 0a 29 19 09 28 18 08 27 17 07 26 16 06 25 15 V C; // 2f 1f 0f 2e 1e 0e 2d 1d 0d 2c 1c 0c 2b 1b 0b 2a detail::LoadTransposedBlocks3(d, unaligned, A, B, C); // Compress all lanes belonging to v0 into consecutive lanes. 
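// In this 16-lane case, v0's lanes sit at byte offsets 0, 3, 6, ... of the
// concatenation A:B:C (stride 3). Each kIdx_* table below gathers one source
// vector's contribution into its final byte positions; Z = 0x80 makes
// TableLookupBytesOr0 produce zero elsewhere, so the three partial results
// have disjoint nonzero bytes and Xor3 merges them (XOR equals OR for
// disjoint nonzero lanes).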
constexpr uint8_t Z = 0x80; alignas(16) static constexpr uint8_t kIdx_v0A[16] = { 0, 3, 6, 9, 12, 15, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z}; alignas(16) static constexpr uint8_t kIdx_v0B[16] = { Z, Z, Z, Z, Z, Z, 2, 5, 8, 11, 14, Z, Z, Z, Z, Z}; alignas(16) static constexpr uint8_t kIdx_v0C[16] = { Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 1, 4, 7, 10, 13}; alignas(16) static constexpr uint8_t kIdx_v1A[16] = { 1, 4, 7, 10, 13, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z}; alignas(16) static constexpr uint8_t kIdx_v1B[16] = { Z, Z, Z, Z, Z, 0, 3, 6, 9, 12, 15, Z, Z, Z, Z, Z}; alignas(16) static constexpr uint8_t kIdx_v1C[16] = { Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 2, 5, 8, 11, 14}; alignas(16) static constexpr uint8_t kIdx_v2A[16] = { 2, 5, 8, 11, 14, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z}; alignas(16) static constexpr uint8_t kIdx_v2B[16] = { Z, Z, Z, Z, Z, 1, 4, 7, 10, 13, Z, Z, Z, Z, Z, Z}; alignas(16) static constexpr uint8_t kIdx_v2C[16] = { Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0, 3, 6, 9, 12, 15}; const V v0L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v0A))); const V v0M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v0B))); const V v0U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v0C))); const V v1L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v1A))); const V v1M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v1B))); const V v1U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v1C))); const V v2L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v2A))); const V v2M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v2B))); const V v2U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v2C))); v0 = Xor3(v0L, v0M, v0U); v1 = Xor3(v1L, v1M, v1U); v2 = Xor3(v2L, v2M, v2U); } // 8-bit lanes x8 template HWY_API void LoadInterleaved3(D d, const TFromD* HWY_RESTRICT unaligned, VFromD& v0, VFromD& v1, VFromD& v2) { const RebindToUnsigned du; using V = VFromD; V A; // v1[2] v0[2] v2[1] v1[1] v0[1] v2[0] v1[0] v0[0] V B; // v0[5] v2[4] v1[4] v0[4] v2[3] v1[3] v0[3] v2[2] V C; // v2[7] v1[7] v0[7] v2[6] v1[6] v0[6] v2[5] v1[5] detail::LoadTransposedBlocks3(d, unaligned, A, B, C); // Compress all lanes belonging to v0 into consecutive lanes. 
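// Same gather-and-merge scheme as the 16-lane case above, restricted to 8
// byte-lanes (only the first 8 table entries affect the result for these
// <= 64-bit vectors). For example, kIdx_v0A picks bytes 0, 3, 6 of A
// (= v0[0..2]) and kIdx_v0B picks bytes 1, 4, 7 of B (= v0[3..5]).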
constexpr uint8_t Z = 0x80; alignas(16) static constexpr uint8_t kIdx_v0A[16] = {0, 3, 6, Z, Z, Z, Z, Z}; alignas(16) static constexpr uint8_t kIdx_v0B[16] = {Z, Z, Z, 1, 4, 7, Z, Z}; alignas(16) static constexpr uint8_t kIdx_v0C[16] = {Z, Z, Z, Z, Z, Z, 2, 5}; alignas(16) static constexpr uint8_t kIdx_v1A[16] = {1, 4, 7, Z, Z, Z, Z, Z}; alignas(16) static constexpr uint8_t kIdx_v1B[16] = {Z, Z, Z, 2, 5, Z, Z, Z}; alignas(16) static constexpr uint8_t kIdx_v1C[16] = {Z, Z, Z, Z, Z, 0, 3, 6}; alignas(16) static constexpr uint8_t kIdx_v2A[16] = {2, 5, Z, Z, Z, Z, Z, Z}; alignas(16) static constexpr uint8_t kIdx_v2B[16] = {Z, Z, 0, 3, 6, Z, Z, Z}; alignas(16) static constexpr uint8_t kIdx_v2C[16] = {Z, Z, Z, Z, Z, 1, 4, 7}; const V v0L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v0A))); const V v0M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v0B))); const V v0U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v0C))); const V v1L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v1A))); const V v1M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v1B))); const V v1U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v1C))); const V v2L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v2A))); const V v2M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v2B))); const V v2U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v2C))); v0 = Xor3(v0L, v0M, v0U); v1 = Xor3(v1L, v1M, v1U); v2 = Xor3(v2L, v2M, v2U); } // 16-bit lanes x8 template HWY_API void LoadInterleaved3(D d, const TFromD* HWY_RESTRICT unaligned, VFromD& v0, VFromD& v1, VFromD& v2) { const RebindToUnsigned du; const Repartition du8; using V = VFromD; V A; // v1[2] v0[2] v2[1] v1[1] v0[1] v2[0] v1[0] v0[0] V B; // v0[5] v2[4] v1[4] v0[4] v2[3] v1[3] v0[3] v2[2] V C; // v2[7] v1[7] v0[7] v2[6] v1[6] v0[6] v2[5] v1[5] detail::LoadTransposedBlocks3(d, unaligned, A, B, C); // Compress all lanes belonging to v0 into consecutive lanes. Same as above, // but each element of the array contains a byte index for a byte of a lane. 
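// For example, kIdx_v0A below gathers the byte pairs {0x00,0x01}, {0x06,0x07}
// and {0x0C,0x0D}, i.e. 16-bit lanes 0, 3 and 6 of A, which hold v0[0], v0[1]
// and v0[2]; the other tables follow the same pattern for the remaining lanes.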
constexpr uint8_t Z = 0x80; alignas(16) static constexpr uint8_t kIdx_v0A[16] = { 0x00, 0x01, 0x06, 0x07, 0x0C, 0x0D, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z}; alignas(16) static constexpr uint8_t kIdx_v0B[16] = { Z, Z, Z, Z, Z, Z, 0x02, 0x03, 0x08, 0x09, 0x0E, 0x0F, Z, Z, Z, Z}; alignas(16) static constexpr uint8_t kIdx_v0C[16] = { Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0x04, 0x05, 0x0A, 0x0B}; alignas(16) static constexpr uint8_t kIdx_v1A[16] = { 0x02, 0x03, 0x08, 0x09, 0x0E, 0x0F, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z}; alignas(16) static constexpr uint8_t kIdx_v1B[16] = { Z, Z, Z, Z, Z, Z, 0x04, 0x05, 0x0A, 0x0B, Z, Z, Z, Z, Z, Z}; alignas(16) static constexpr uint8_t kIdx_v1C[16] = { Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0x00, 0x01, 0x06, 0x07, 0x0C, 0x0D}; alignas(16) static constexpr uint8_t kIdx_v2A[16] = { 0x04, 0x05, 0x0A, 0x0B, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z}; alignas(16) static constexpr uint8_t kIdx_v2B[16] = { Z, Z, Z, Z, 0x00, 0x01, 0x06, 0x07, 0x0C, 0x0D, Z, Z, Z, Z, Z, Z}; alignas(16) static constexpr uint8_t kIdx_v2C[16] = { Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0x02, 0x03, 0x08, 0x09, 0x0E, 0x0F}; const V v0L = TableLookupBytesOr0(A, BitCast(d, LoadDup128(du8, kIdx_v0A))); const V v0M = TableLookupBytesOr0(B, BitCast(d, LoadDup128(du8, kIdx_v0B))); const V v0U = TableLookupBytesOr0(C, BitCast(d, LoadDup128(du8, kIdx_v0C))); const V v1L = TableLookupBytesOr0(A, BitCast(d, LoadDup128(du8, kIdx_v1A))); const V v1M = TableLookupBytesOr0(B, BitCast(d, LoadDup128(du8, kIdx_v1B))); const V v1U = TableLookupBytesOr0(C, BitCast(d, LoadDup128(du8, kIdx_v1C))); const V v2L = TableLookupBytesOr0(A, BitCast(d, LoadDup128(du8, kIdx_v2A))); const V v2M = TableLookupBytesOr0(B, BitCast(d, LoadDup128(du8, kIdx_v2B))); const V v2U = TableLookupBytesOr0(C, BitCast(d, LoadDup128(du8, kIdx_v2C))); v0 = Xor3(v0L, v0M, v0U); v1 = Xor3(v1L, v1M, v1U); v2 = Xor3(v2L, v2M, v2U); } template HWY_API void LoadInterleaved3(D d, const TFromD* HWY_RESTRICT unaligned, VFromD& v0, VFromD& v1, VFromD& v2) { using V = VFromD; V A; // v0[1] v2[0] v1[0] v0[0] V B; // v1[2] v0[2] v2[1] v1[1] V C; // v2[3] v1[3] v0[3] v2[2] detail::LoadTransposedBlocks3(d, unaligned, A, B, C); const V vxx_02_03_xx = OddEven(C, B); v0 = detail::ShuffleTwo1230(A, vxx_02_03_xx); // Shuffle2301 takes the upper/lower halves of the output from one input, so // we cannot just combine 13 and 10 with 12 and 11 (similar to v0/v2). Use // OddEven because it may have higher throughput than Shuffle. const V vxx_xx_10_11 = OddEven(A, B); const V v12_13_xx_xx = OddEven(B, C); v1 = detail::ShuffleTwo2301(vxx_xx_10_11, v12_13_xx_xx); const V vxx_20_21_xx = OddEven(B, A); v2 = detail::ShuffleTwo3012(vxx_20_21_xx, C); } template HWY_API void LoadInterleaved3(D d, const TFromD* HWY_RESTRICT unaligned, VFromD& v0, VFromD& v1, VFromD& v2) { VFromD A; // v1[0] v0[0] VFromD B; // v0[1] v2[0] VFromD C; // v2[1] v1[1] detail::LoadTransposedBlocks3(d, unaligned, A, B, C); v0 = OddEven(B, A); v1 = CombineShiftRightBytes)>(d, C, A); v2 = OddEven(C, B); } template , HWY_IF_LANES_D(D, 1)> HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned, VFromD& v0, VFromD& v1, VFromD& v2) { v0 = LoadU(d, unaligned + 0); v1 = LoadU(d, unaligned + 1); v2 = LoadU(d, unaligned + 2); } // ------------------------------ LoadInterleaved4 namespace detail { // Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload. 
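// As with LoadTransposedBlocks3 above, the helper below only loads four
// consecutive unaligned vectors; the LoadInterleaved4 overloads that follow
// then de-interleave them using a small network of InterleaveLower /
// InterleaveUpper operations (BitCast to wider lanes for the final rounds),
// without any per-lane shuffle tables.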
template HWY_INLINE void LoadTransposedBlocks4(D d, const TFromD* HWY_RESTRICT unaligned, VFromD& vA, VFromD& vB, VFromD& vC, VFromD& vD) { constexpr size_t kN = MaxLanes(d); vA = LoadU(d, unaligned + 0 * kN); vB = LoadU(d, unaligned + 1 * kN); vC = LoadU(d, unaligned + 2 * kN); vD = LoadU(d, unaligned + 3 * kN); } } // namespace detail template HWY_API void LoadInterleaved4(D d, const TFromD* HWY_RESTRICT unaligned, VFromD& v0, VFromD& v1, VFromD& v2, VFromD& v3) { const Repartition d64; using V64 = VFromD; using V = VFromD; // 16 lanes per block; the lowest four blocks are at the bottom of vA..vD. // Here int[i] means the four interleaved values of the i-th 4-tuple and // int[3..0] indicates four consecutive 4-tuples (0 = least-significant). V vA; // int[13..10] int[3..0] V vB; // int[17..14] int[7..4] V vC; // int[1b..18] int[b..8] V vD; // int[1f..1c] int[f..c] detail::LoadTransposedBlocks4(d, unaligned, vA, vB, vC, vD); // For brevity, the comments only list the lower block (upper = lower + 0x10) const V v5140 = InterleaveLower(d, vA, vB); // int[5,1,4,0] const V vd9c8 = InterleaveLower(d, vC, vD); // int[d,9,c,8] const V v7362 = InterleaveUpper(d, vA, vB); // int[7,3,6,2] const V vfbea = InterleaveUpper(d, vC, vD); // int[f,b,e,a] const V v6420 = InterleaveLower(d, v5140, v7362); // int[6,4,2,0] const V veca8 = InterleaveLower(d, vd9c8, vfbea); // int[e,c,a,8] const V v7531 = InterleaveUpper(d, v5140, v7362); // int[7,5,3,1] const V vfdb9 = InterleaveUpper(d, vd9c8, vfbea); // int[f,d,b,9] const V64 v10L = BitCast(d64, InterleaveLower(d, v6420, v7531)); // v10[7..0] const V64 v10U = BitCast(d64, InterleaveLower(d, veca8, vfdb9)); // v10[f..8] const V64 v32L = BitCast(d64, InterleaveUpper(d, v6420, v7531)); // v32[7..0] const V64 v32U = BitCast(d64, InterleaveUpper(d, veca8, vfdb9)); // v32[f..8] v0 = BitCast(d, InterleaveLower(d64, v10L, v10U)); v1 = BitCast(d, InterleaveUpper(d64, v10L, v10U)); v2 = BitCast(d, InterleaveLower(d64, v32L, v32U)); v3 = BitCast(d, InterleaveUpper(d64, v32L, v32U)); } template HWY_API void LoadInterleaved4(D d, const TFromD* HWY_RESTRICT unaligned, VFromD& v0, VFromD& v1, VFromD& v2, VFromD& v3) { // In the last step, we interleave by half of the block size, which is usually // 8 bytes but half that for 8-bit x8 vectors. using TW = hwy::UnsignedFromSize; const Repartition dw; using VW = VFromD; // (Comments are for 256-bit vectors.) // 8 lanes per block; the lowest four blocks are at the bottom of vA..vD. 
VFromD vA; // v3210[9]v3210[8] v3210[1]v3210[0] VFromD vB; // v3210[b]v3210[a] v3210[3]v3210[2] VFromD vC; // v3210[d]v3210[c] v3210[5]v3210[4] VFromD vD; // v3210[f]v3210[e] v3210[7]v3210[6] detail::LoadTransposedBlocks4(d, unaligned, vA, vB, vC, vD); const VFromD va820 = InterleaveLower(d, vA, vB); // v3210[a,8] v3210[2,0] const VFromD vec64 = InterleaveLower(d, vC, vD); // v3210[e,c] v3210[6,4] const VFromD vb931 = InterleaveUpper(d, vA, vB); // v3210[b,9] v3210[3,1] const VFromD vfd75 = InterleaveUpper(d, vC, vD); // v3210[f,d] v3210[7,5] const VW v10_b830 = // v10[b..8] v10[3..0] BitCast(dw, InterleaveLower(d, va820, vb931)); const VW v10_fc74 = // v10[f..c] v10[7..4] BitCast(dw, InterleaveLower(d, vec64, vfd75)); const VW v32_b830 = // v32[b..8] v32[3..0] BitCast(dw, InterleaveUpper(d, va820, vb931)); const VW v32_fc74 = // v32[f..c] v32[7..4] BitCast(dw, InterleaveUpper(d, vec64, vfd75)); v0 = BitCast(d, InterleaveLower(dw, v10_b830, v10_fc74)); v1 = BitCast(d, InterleaveUpper(dw, v10_b830, v10_fc74)); v2 = BitCast(d, InterleaveLower(dw, v32_b830, v32_fc74)); v3 = BitCast(d, InterleaveUpper(dw, v32_b830, v32_fc74)); } template HWY_API void LoadInterleaved4(D d, const TFromD* HWY_RESTRICT unaligned, VFromD& v0, VFromD& v1, VFromD& v2, VFromD& v3) { using V = VFromD; V vA; // v3210[4] v3210[0] V vB; // v3210[5] v3210[1] V vC; // v3210[6] v3210[2] V vD; // v3210[7] v3210[3] detail::LoadTransposedBlocks4(d, unaligned, vA, vB, vC, vD); const V v10e = InterleaveLower(d, vA, vC); // v1[6,4] v0[6,4] v1[2,0] v0[2,0] const V v10o = InterleaveLower(d, vB, vD); // v1[7,5] v0[7,5] v1[3,1] v0[3,1] const V v32e = InterleaveUpper(d, vA, vC); // v3[6,4] v2[6,4] v3[2,0] v2[2,0] const V v32o = InterleaveUpper(d, vB, vD); // v3[7,5] v2[7,5] v3[3,1] v2[3,1] v0 = InterleaveLower(d, v10e, v10o); v1 = InterleaveUpper(d, v10e, v10o); v2 = InterleaveLower(d, v32e, v32o); v3 = InterleaveUpper(d, v32e, v32o); } template HWY_API void LoadInterleaved4(D d, const TFromD* HWY_RESTRICT unaligned, VFromD& v0, VFromD& v1, VFromD& v2, VFromD& v3) { VFromD vA, vB, vC, vD; detail::LoadTransposedBlocks4(d, unaligned, vA, vB, vC, vD); v0 = InterleaveLower(d, vA, vC); v1 = InterleaveUpper(d, vA, vC); v2 = InterleaveLower(d, vB, vD); v3 = InterleaveUpper(d, vB, vD); } // Any T x1 template , HWY_IF_LANES_D(D, 1)> HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned, VFromD& v0, VFromD& v1, VFromD& v2, VFromD& v3) { v0 = LoadU(d, unaligned + 0); v1 = LoadU(d, unaligned + 1); v2 = LoadU(d, unaligned + 2); v3 = LoadU(d, unaligned + 3); } // ------------------------------ StoreInterleaved2 namespace detail { // Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload. template HWY_INLINE void StoreTransposedBlocks2(VFromD A, VFromD B, D d, TFromD* HWY_RESTRICT unaligned) { constexpr size_t kN = MaxLanes(d); StoreU(A, d, unaligned + 0 * kN); StoreU(B, d, unaligned + 1 * kN); } } // namespace detail // >= 128 bit vector template HWY_API void StoreInterleaved2(VFromD v0, VFromD v1, D d, TFromD* HWY_RESTRICT unaligned) { const auto v10L = InterleaveLower(d, v0, v1); // .. v1[0] v0[0] const auto v10U = InterleaveUpper(d, v0, v1); // .. 
v1[kN/2] v0[kN/2] detail::StoreTransposedBlocks2(v10L, v10U, d, unaligned); } // <= 64 bits template HWY_API void StoreInterleaved2(V part0, V part1, D d, TFromD* HWY_RESTRICT unaligned) { const Twice d2; const auto v0 = ZeroExtendVector(d2, part0); const auto v1 = ZeroExtendVector(d2, part1); const auto v10 = InterleaveLower(d2, v0, v1); StoreU(v10, d2, unaligned); } // ------------------------------ StoreInterleaved3 (CombineShiftRightBytes, // TableLookupBytes) namespace detail { // Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload. template HWY_INLINE void StoreTransposedBlocks3(VFromD A, VFromD B, VFromD C, D d, TFromD* HWY_RESTRICT unaligned) { constexpr size_t kN = MaxLanes(d); StoreU(A, d, unaligned + 0 * kN); StoreU(B, d, unaligned + 1 * kN); StoreU(C, d, unaligned + 2 * kN); } } // namespace detail // >= 128-bit vector, 8-bit lanes template HWY_API void StoreInterleaved3(VFromD v0, VFromD v1, VFromD v2, D d, TFromD* HWY_RESTRICT unaligned) { const RebindToUnsigned du; using TU = TFromD; const auto k5 = Set(du, TU{5}); const auto k6 = Set(du, TU{6}); // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right): // v0[5], v2[4],v1[4],v0[4] .. v2[0],v1[0],v0[0]. We're expanding v0 lanes // to their place, with 0x80 so lanes to be filled from other vectors are 0 // to enable blending by ORing together. alignas(16) static constexpr uint8_t tbl_v0[16] = { 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, // 3, 0x80, 0x80, 4, 0x80, 0x80, 5}; alignas(16) static constexpr uint8_t tbl_v1[16] = { 0x80, 0, 0x80, 0x80, 1, 0x80, // 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80}; // The interleaved vectors will be named A, B, C; temporaries with suffix // 0..2 indicate which input vector's lanes they hold. const auto shuf_A0 = LoadDup128(du, tbl_v0); const auto shuf_A1 = LoadDup128(du, tbl_v1); // cannot reuse shuf_A0 (has 5) const auto shuf_A2 = CombineShiftRightBytes<15>(du, shuf_A1, shuf_A1); const auto A0 = TableLookupBytesOr0(v0, shuf_A0); // 5..4..3..2..1..0 const auto A1 = TableLookupBytesOr0(v1, shuf_A1); // ..4..3..2..1..0. const auto A2 = TableLookupBytesOr0(v2, shuf_A2); // .4..3..2..1..0.. const VFromD A = BitCast(d, A0 | A1 | A2); // B: v1[10],v0[10], v2[9],v1[9],v0[9] .. , v2[6],v1[6],v0[6], v2[5],v1[5] const auto shuf_B0 = shuf_A2 + k6; // .A..9..8..7..6.. const auto shuf_B1 = shuf_A0 + k5; // A..9..8..7..6..5 const auto shuf_B2 = shuf_A1 + k5; // ..9..8..7..6..5. const auto B0 = TableLookupBytesOr0(v0, shuf_B0); const auto B1 = TableLookupBytesOr0(v1, shuf_B1); const auto B2 = TableLookupBytesOr0(v2, shuf_B2); const VFromD B = BitCast(d, B0 | B1 | B2); // C: v2[15],v1[15],v0[15], v2[11],v1[11],v0[11], v2[10] const auto shuf_C0 = shuf_B2 + k6; // ..F..E..D..C..B. const auto shuf_C1 = shuf_B0 + k5; // .F..E..D..C..B.. const auto shuf_C2 = shuf_B1 + k5; // F..E..D..C..B..A const auto C0 = TableLookupBytesOr0(v0, shuf_C0); const auto C1 = TableLookupBytesOr0(v1, shuf_C1); const auto C2 = TableLookupBytesOr0(v2, shuf_C2); const VFromD C = BitCast(d, C0 | C1 | C2); detail::StoreTransposedBlocks3(A, B, C, d, unaligned); } // >= 128-bit vector, 16-bit lanes template HWY_API void StoreInterleaved3(VFromD v0, VFromD v1, VFromD v2, D d, TFromD* HWY_RESTRICT unaligned) { const Repartition du8; const auto k2 = Set(du8, uint8_t{2 * sizeof(TFromD)}); const auto k3 = Set(du8, uint8_t{3 * sizeof(TFromD)}); // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right): // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. 
0x80 so lanes to be // filled from other vectors are 0 for blending. Note that these are byte // indices for 16-bit lanes. alignas(16) static constexpr uint8_t tbl_v1[16] = { 0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80, 4, 5}; alignas(16) static constexpr uint8_t tbl_v2[16] = { 0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80}; // The interleaved vectors will be named A, B, C; temporaries with suffix // 0..2 indicate which input vector's lanes they hold. const auto shuf_A1 = LoadDup128(du8, tbl_v1); // 2..1..0. // .2..1..0 const auto shuf_A0 = CombineShiftRightBytes<2>(du8, shuf_A1, shuf_A1); const auto shuf_A2 = LoadDup128(du8, tbl_v2); // ..1..0.. const auto A0 = TableLookupBytesOr0(v0, shuf_A0); const auto A1 = TableLookupBytesOr0(v1, shuf_A1); const auto A2 = TableLookupBytesOr0(v2, shuf_A2); const VFromD A = BitCast(d, A0 | A1 | A2); // B: v0[5] v2[4],v1[4],v0[4], v2[3],v1[3],v0[3], v2[2] const auto shuf_B0 = shuf_A1 + k3; // 5..4..3. const auto shuf_B1 = shuf_A2 + k3; // ..4..3.. const auto shuf_B2 = shuf_A0 + k2; // .4..3..2 const auto B0 = TableLookupBytesOr0(v0, shuf_B0); const auto B1 = TableLookupBytesOr0(v1, shuf_B1); const auto B2 = TableLookupBytesOr0(v2, shuf_B2); const VFromD B = BitCast(d, B0 | B1 | B2); // C: v2[7],v1[7],v0[7], v2[6],v1[6],v0[6], v2[5],v1[5] const auto shuf_C0 = shuf_B1 + k3; // ..7..6.. const auto shuf_C1 = shuf_B2 + k3; // .7..6..5 const auto shuf_C2 = shuf_B0 + k2; // 7..6..5. const auto C0 = TableLookupBytesOr0(v0, shuf_C0); const auto C1 = TableLookupBytesOr0(v1, shuf_C1); const auto C2 = TableLookupBytesOr0(v2, shuf_C2); const VFromD C = BitCast(d, C0 | C1 | C2); detail::StoreTransposedBlocks3(A, B, C, d, unaligned); } // >= 128-bit vector, 32-bit lanes template HWY_API void StoreInterleaved3(VFromD v0, VFromD v1, VFromD v2, D d, TFromD* HWY_RESTRICT unaligned) { const RepartitionToWide dw; const VFromD v10_v00 = InterleaveLower(d, v0, v1); const VFromD v01_v20 = OddEven(v0, v2); // A: v0[1], v2[0],v1[0],v0[0] (<- lane 0) const VFromD A = BitCast( d, InterleaveLower(dw, BitCast(dw, v10_v00), BitCast(dw, v01_v20))); const VFromD v1_321 = ShiftRightLanes<1>(d, v1); const VFromD v0_32 = ShiftRightLanes<2>(d, v0); const VFromD v21_v11 = OddEven(v2, v1_321); const VFromD v12_v02 = OddEven(v1_321, v0_32); // B: v1[2],v0[2], v2[1],v1[1] const VFromD B = BitCast( d, InterleaveLower(dw, BitCast(dw, v21_v11), BitCast(dw, v12_v02))); // Notation refers to the upper 2 lanes of the vector for InterleaveUpper. const VFromD v23_v13 = OddEven(v2, v1_321); const VFromD v03_v22 = OddEven(v0, v2); // C: v2[3],v1[3],v0[3], v2[2] const VFromD C = BitCast( d, InterleaveUpper(dw, BitCast(dw, v03_v22), BitCast(dw, v23_v13))); detail::StoreTransposedBlocks3(A, B, C, d, unaligned); } // >= 128-bit vector, 64-bit lanes template HWY_API void StoreInterleaved3(VFromD v0, VFromD v1, VFromD v2, D d, TFromD* HWY_RESTRICT unaligned) { const VFromD A = InterleaveLower(d, v0, v1); const VFromD B = OddEven(v0, v2); const VFromD C = InterleaveUpper(d, v1, v2); detail::StoreTransposedBlocks3(A, B, C, d, unaligned); } // 64-bit vector, 8-bit lanes template HWY_API void StoreInterleaved3(VFromD part0, VFromD part1, VFromD part2, D d, TFromD* HWY_RESTRICT unaligned) { // Use full vectors for the shuffles and first result. 
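// 8 lanes x 3 inputs = 24 output bytes: one full 16-byte store of A below,
// followed by an 8-byte store of the remaining half (B).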
constexpr size_t kFullN = 16 / sizeof(TFromD); const Full128 du; const Full128> d_full; const auto k5 = Set(du, uint8_t{5}); const auto k6 = Set(du, uint8_t{6}); const VFromD v0{part0.raw}; const VFromD v1{part1.raw}; const VFromD v2{part2.raw}; // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right): // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. 0x80 so lanes to be // filled from other vectors are 0 for blending. alignas(16) static constexpr uint8_t tbl_v0[16] = { 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, // 3, 0x80, 0x80, 4, 0x80, 0x80, 5}; alignas(16) static constexpr uint8_t tbl_v1[16] = { 0x80, 0, 0x80, 0x80, 1, 0x80, // 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80}; // The interleaved vectors will be named A, B, C; temporaries with suffix // 0..2 indicate which input vector's lanes they hold. const auto shuf_A0 = Load(du, tbl_v0); const auto shuf_A1 = Load(du, tbl_v1); // cannot reuse shuf_A0 (5 in MSB) const auto shuf_A2 = CombineShiftRightBytes<15>(du, shuf_A1, shuf_A1); const auto A0 = TableLookupBytesOr0(v0, shuf_A0); // 5..4..3..2..1..0 const auto A1 = TableLookupBytesOr0(v1, shuf_A1); // ..4..3..2..1..0. const auto A2 = TableLookupBytesOr0(v2, shuf_A2); // .4..3..2..1..0.. const auto A = BitCast(d_full, A0 | A1 | A2); StoreU(A, d_full, unaligned + 0 * kFullN); // Second (HALF) vector: v2[7],v1[7],v0[7], v2[6],v1[6],v0[6], v2[5],v1[5] const auto shuf_B0 = shuf_A2 + k6; // ..7..6.. const auto shuf_B1 = shuf_A0 + k5; // .7..6..5 const auto shuf_B2 = shuf_A1 + k5; // 7..6..5. const auto B0 = TableLookupBytesOr0(v0, shuf_B0); const auto B1 = TableLookupBytesOr0(v1, shuf_B1); const auto B2 = TableLookupBytesOr0(v2, shuf_B2); const VFromD B{BitCast(d_full, B0 | B1 | B2).raw}; StoreU(B, d, unaligned + 1 * kFullN); } // 64-bit vector, 16-bit lanes template HWY_API void StoreInterleaved3(VFromD part0, VFromD part1, VFromD part2, D dh, TFromD* HWY_RESTRICT unaligned) { const Twice d_full; const Full128 du8; const auto k2 = Set(du8, uint8_t{2 * sizeof(TFromD)}); const auto k3 = Set(du8, uint8_t{3 * sizeof(TFromD)}); const VFromD v0{part0.raw}; const VFromD v1{part1.raw}; const VFromD v2{part2.raw}; // Interleave part (v0,v1,v2) to full (MSB on left, lane 0 on right): // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. We're expanding v0 lanes // to their place, with 0x80 so lanes to be filled from other vectors are 0 // to enable blending by ORing together. alignas(16) static constexpr uint8_t tbl_v1[16] = { 0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80, 4, 5}; alignas(16) static constexpr uint8_t tbl_v2[16] = { 0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80}; // The interleaved vectors will be named A, B; temporaries with suffix // 0..2 indicate which input vector's lanes they hold. const auto shuf_A1 = Load(du8, tbl_v1); // 2..1..0. // .2..1..0 const auto shuf_A0 = CombineShiftRightBytes<2>(du8, shuf_A1, shuf_A1); const auto shuf_A2 = Load(du8, tbl_v2); // ..1..0.. const auto A0 = TableLookupBytesOr0(v0, shuf_A0); const auto A1 = TableLookupBytesOr0(v1, shuf_A1); const auto A2 = TableLookupBytesOr0(v2, shuf_A2); const VFromD A = BitCast(d_full, A0 | A1 | A2); StoreU(A, d_full, unaligned); // Second (HALF) vector: v2[3],v1[3],v0[3], v2[2] const auto shuf_B0 = shuf_A1 + k3; // ..3. const auto shuf_B1 = shuf_A2 + k3; // .3.. 
const auto shuf_B2 = shuf_A0 + k2; // 3..2 const auto B0 = TableLookupBytesOr0(v0, shuf_B0); const auto B1 = TableLookupBytesOr0(v1, shuf_B1); const auto B2 = TableLookupBytesOr0(v2, shuf_B2); const VFromD B = BitCast(d_full, B0 | B1 | B2); StoreU(VFromD{B.raw}, dh, unaligned + MaxLanes(d_full)); } // 64-bit vector, 32-bit lanes template HWY_API void StoreInterleaved3(VFromD v0, VFromD v1, VFromD v2, D d, TFromD* HWY_RESTRICT unaligned) { // (same code as 128-bit vector, 64-bit lanes) const VFromD v10_v00 = InterleaveLower(d, v0, v1); const VFromD v01_v20 = OddEven(v0, v2); const VFromD v21_v11 = InterleaveUpper(d, v1, v2); constexpr size_t kN = MaxLanes(d); StoreU(v10_v00, d, unaligned + 0 * kN); StoreU(v01_v20, d, unaligned + 1 * kN); StoreU(v21_v11, d, unaligned + 2 * kN); } // 64-bit lanes are handled by the N=1 case below. // <= 32-bit vector, 8-bit lanes template HWY_API void StoreInterleaved3(VFromD part0, VFromD part1, VFromD part2, D d, TFromD* HWY_RESTRICT unaligned) { // Use full vectors for the shuffles and result. const Full128 du; const Full128> d_full; const VFromD v0{part0.raw}; const VFromD v1{part1.raw}; const VFromD v2{part2.raw}; // Interleave (v0,v1,v2). We're expanding v0 lanes to their place, with 0x80 // so lanes to be filled from other vectors are 0 to enable blending by ORing // together. alignas(16) static constexpr uint8_t tbl_v0[16] = { 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; // The interleaved vector will be named A; temporaries with suffix // 0..2 indicate which input vector's lanes they hold. const auto shuf_A0 = Load(du, tbl_v0); const auto shuf_A1 = CombineShiftRightBytes<15>(du, shuf_A0, shuf_A0); const auto shuf_A2 = CombineShiftRightBytes<14>(du, shuf_A0, shuf_A0); const auto A0 = TableLookupBytesOr0(v0, shuf_A0); // ......3..2..1..0 const auto A1 = TableLookupBytesOr0(v1, shuf_A1); // .....3..2..1..0. const auto A2 = TableLookupBytesOr0(v2, shuf_A2); // ....3..2..1..0.. const VFromD A = BitCast(d_full, A0 | A1 | A2); alignas(16) TFromD buf[MaxLanes(d_full)]; StoreU(A, d_full, buf); CopyBytes(buf, unaligned); } // 32-bit vector, 16-bit lanes template HWY_API void StoreInterleaved3(VFromD part0, VFromD part1, VFromD part2, D d, TFromD* HWY_RESTRICT unaligned) { // Use full vectors for the shuffles and result. const Full128 du8; const Full128> d_full; const VFromD v0{part0.raw}; const VFromD v1{part1.raw}; const VFromD v2{part2.raw}; // Interleave (v0,v1,v2). We're expanding v0 lanes to their place, with 0x80 // so lanes to be filled from other vectors are 0 to enable blending by ORing // together. alignas(16) static constexpr uint8_t tbl_v2[16] = { 0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80}; // The interleaved vector will be named A; temporaries with suffix // 0..2 indicate which input vector's lanes they hold. const auto shuf_A2 = // ..1..0.. Load(du8, tbl_v2); const auto shuf_A1 = // ...1..0. CombineShiftRightBytes<2>(du8, shuf_A2, shuf_A2); const auto shuf_A0 = // ....1..0 CombineShiftRightBytes<4>(du8, shuf_A2, shuf_A2); const auto A0 = TableLookupBytesOr0(v0, shuf_A0); // ..1..0 const auto A1 = TableLookupBytesOr0(v1, shuf_A1); // .1..0. const auto A2 = TableLookupBytesOr0(v2, shuf_A2); // 1..0.. 
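// A0, A1 and A2 have disjoint nonzero bytes, so a single OR assembles the
// interleaved result; only the low 2 * 3 = 6 lanes (12 bytes) are meaningful
// and are copied to unaligned via the stack buffer below.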
const auto A = BitCast(d_full, A0 | A1 | A2); alignas(16) TFromD buf[MaxLanes(d_full)]; StoreU(A, d_full, buf); CopyBytes(buf, unaligned); } // Single-element vector, any lane size: just store directly template HWY_API void StoreInterleaved3(VFromD v0, VFromD v1, VFromD v2, D d, TFromD* HWY_RESTRICT unaligned) { StoreU(v0, d, unaligned + 0); StoreU(v1, d, unaligned + 1); StoreU(v2, d, unaligned + 2); } // ------------------------------ StoreInterleaved4 namespace detail { // Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload. template HWY_INLINE void StoreTransposedBlocks4(VFromD vA, VFromD vB, VFromD vC, VFromD vD, D d, TFromD* HWY_RESTRICT unaligned) { constexpr size_t kN = MaxLanes(d); StoreU(vA, d, unaligned + 0 * kN); StoreU(vB, d, unaligned + 1 * kN); StoreU(vC, d, unaligned + 2 * kN); StoreU(vD, d, unaligned + 3 * kN); } } // namespace detail // >= 128-bit vector, 8..32-bit lanes template HWY_API void StoreInterleaved4(VFromD v0, VFromD v1, VFromD v2, VFromD v3, D d, TFromD* HWY_RESTRICT unaligned) { const RepartitionToWide dw; const auto v10L = ZipLower(dw, v0, v1); // .. v1[0] v0[0] const auto v32L = ZipLower(dw, v2, v3); const auto v10U = ZipUpper(dw, v0, v1); const auto v32U = ZipUpper(dw, v2, v3); // The interleaved vectors are vA, vB, vC, vD. const VFromD vA = BitCast(d, InterleaveLower(dw, v10L, v32L)); // 3210 const VFromD vB = BitCast(d, InterleaveUpper(dw, v10L, v32L)); const VFromD vC = BitCast(d, InterleaveLower(dw, v10U, v32U)); const VFromD vD = BitCast(d, InterleaveUpper(dw, v10U, v32U)); detail::StoreTransposedBlocks4(vA, vB, vC, vD, d, unaligned); } // >= 128-bit vector, 64-bit lanes template HWY_API void StoreInterleaved4(VFromD v0, VFromD v1, VFromD v2, VFromD v3, D d, TFromD* HWY_RESTRICT unaligned) { // The interleaved vectors are vA, vB, vC, vD. const VFromD vA = InterleaveLower(d, v0, v1); // v1[0] v0[0] const VFromD vB = InterleaveLower(d, v2, v3); const VFromD vC = InterleaveUpper(d, v0, v1); const VFromD vD = InterleaveUpper(d, v2, v3); detail::StoreTransposedBlocks4(vA, vB, vC, vD, d, unaligned); } // 64-bit vector, 8..32-bit lanes template HWY_API void StoreInterleaved4(VFromD part0, VFromD part1, VFromD part2, VFromD part3, D /* tag */, TFromD* HWY_RESTRICT unaligned) { // Use full vectors to reduce the number of stores. const Full128> d_full; const RepartitionToWide dw; const VFromD v0{part0.raw}; const VFromD v1{part1.raw}; const VFromD v2{part2.raw}; const VFromD v3{part3.raw}; const auto v10 = ZipLower(dw, v0, v1); // v1[0] v0[0] const auto v32 = ZipLower(dw, v2, v3); const auto A = BitCast(d_full, InterleaveLower(dw, v10, v32)); const auto B = BitCast(d_full, InterleaveUpper(dw, v10, v32)); StoreU(A, d_full, unaligned); StoreU(B, d_full, unaligned + MaxLanes(d_full)); } // 64-bit vector, 64-bit lane template HWY_API void StoreInterleaved4(VFromD part0, VFromD part1, VFromD part2, VFromD part3, D /* tag */, TFromD* HWY_RESTRICT unaligned) { // Use full vectors to reduce the number of stores. const Full128> d_full; const VFromD v0{part0.raw}; const VFromD v1{part1.raw}; const VFromD v2{part2.raw}; const VFromD v3{part3.raw}; const auto A = InterleaveLower(d_full, v0, v1); // v1[0] v0[0] const auto B = InterleaveLower(d_full, v2, v3); StoreU(A, d_full, unaligned); StoreU(B, d_full, unaligned + MaxLanes(d_full)); } // <= 32-bit vectors template HWY_API void StoreInterleaved4(VFromD part0, VFromD part1, VFromD part2, VFromD part3, D d, TFromD* HWY_RESTRICT unaligned) { // Use full vectors to reduce the number of stores. 
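// All four parts (at most 4 bytes each) fit into a single 128-bit vector:
// zip v0/v1 and v2/v3, interleave once more, then copy only the valid low
// bytes to unaligned via the stack buffer.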
const Full128> d_full; const RepartitionToWide dw; const VFromD v0{part0.raw}; const VFromD v1{part1.raw}; const VFromD v2{part2.raw}; const VFromD v3{part3.raw}; const auto v10 = ZipLower(dw, v0, v1); // .. v1[0] v0[0] const auto v32 = ZipLower(dw, v2, v3); const auto v3210 = BitCast(d_full, InterleaveLower(dw, v10, v32)); alignas(16) TFromD buf[MaxLanes(d_full)]; StoreU(v3210, d_full, buf); CopyBytes(buf, unaligned); } #endif // HWY_NATIVE_LOAD_STORE_INTERLEAVED // ------------------------------ Integer AbsDiff and SumsOf8AbsDiff #if (defined(HWY_NATIVE_INTEGER_ABS_DIFF) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_INTEGER_ABS_DIFF #undef HWY_NATIVE_INTEGER_ABS_DIFF #else #define HWY_NATIVE_INTEGER_ABS_DIFF #endif template HWY_API V AbsDiff(V a, V b) { return Sub(Max(a, b), Min(a, b)); } #endif // HWY_NATIVE_INTEGER_ABS_DIFF #if (defined(HWY_NATIVE_SUMS_OF_8_ABS_DIFF) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_SUMS_OF_8_ABS_DIFF #undef HWY_NATIVE_SUMS_OF_8_ABS_DIFF #else #define HWY_NATIVE_SUMS_OF_8_ABS_DIFF #endif template ), HWY_IF_V_SIZE_GT_D(DFromV, (HWY_TARGET == HWY_SCALAR ? 0 : 4))> HWY_API Vec>> SumsOf8AbsDiff(V a, V b) { return SumsOf8(AbsDiff(a, b)); } #endif // HWY_NATIVE_SUMS_OF_8_ABS_DIFF // ------------------------------ SaturatedAdd/SaturatedSub for UI32/UI64 #if (defined(HWY_NATIVE_I32_SATURATED_ADDSUB) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_I32_SATURATED_ADDSUB #undef HWY_NATIVE_I32_SATURATED_ADDSUB #else #define HWY_NATIVE_I32_SATURATED_ADDSUB #endif template )> HWY_API V SaturatedAdd(V a, V b) { const DFromV d; const auto sum = Add(a, b); const auto overflow_mask = MaskFromVec(BroadcastSignBit(AndNot(Xor(a, b), Xor(a, sum)))); const auto overflow_result = Xor(BroadcastSignBit(a), Set(d, LimitsMax())); return IfThenElse(overflow_mask, overflow_result, sum); } template )> HWY_API V SaturatedSub(V a, V b) { const DFromV d; const auto diff = Sub(a, b); const auto overflow_mask = MaskFromVec(BroadcastSignBit(And(Xor(a, b), Xor(a, diff)))); const auto overflow_result = Xor(BroadcastSignBit(a), Set(d, LimitsMax())); return IfThenElse(overflow_mask, overflow_result, diff); } #endif // HWY_NATIVE_I32_SATURATED_ADDSUB #if (defined(HWY_NATIVE_I64_SATURATED_ADDSUB) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_I64_SATURATED_ADDSUB #undef HWY_NATIVE_I64_SATURATED_ADDSUB #else #define HWY_NATIVE_I64_SATURATED_ADDSUB #endif template )> HWY_API V SaturatedAdd(V a, V b) { const DFromV d; const auto sum = Add(a, b); const auto overflow_mask = MaskFromVec(BroadcastSignBit(AndNot(Xor(a, b), Xor(a, sum)))); const auto overflow_result = Xor(BroadcastSignBit(a), Set(d, LimitsMax())); return IfThenElse(overflow_mask, overflow_result, sum); } template )> HWY_API V SaturatedSub(V a, V b) { const DFromV d; const auto diff = Sub(a, b); const auto overflow_mask = MaskFromVec(BroadcastSignBit(And(Xor(a, b), Xor(a, diff)))); const auto overflow_result = Xor(BroadcastSignBit(a), Set(d, LimitsMax())); return IfThenElse(overflow_mask, overflow_result, diff); } #endif // HWY_NATIVE_I64_SATURATED_ADDSUB #if (defined(HWY_NATIVE_U32_SATURATED_ADDSUB) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_U32_SATURATED_ADDSUB #undef HWY_NATIVE_U32_SATURATED_ADDSUB #else #define HWY_NATIVE_U32_SATURATED_ADDSUB #endif template )> HWY_API V SaturatedAdd(V a, V b) { return Add(a, Min(b, Not(a))); } template )> HWY_API V SaturatedSub(V a, V b) { return Sub(a, Min(a, b)); } #endif // HWY_NATIVE_U32_SATURATED_ADDSUB #if (defined(HWY_NATIVE_U64_SATURATED_ADDSUB) == defined(HWY_TARGET_TOGGLE)) 
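// As in the u32 case above: Not(a) is the largest value that can be added to
// a without wrapping, so Add(a, Min(b, Not(a))) yields min(a + b, all-ones);
// likewise Sub(a, Min(a, b)) yields a - b clamped to zero.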
#ifdef HWY_NATIVE_U64_SATURATED_ADDSUB #undef HWY_NATIVE_U64_SATURATED_ADDSUB #else #define HWY_NATIVE_U64_SATURATED_ADDSUB #endif template )> HWY_API V SaturatedAdd(V a, V b) { return Add(a, Min(b, Not(a))); } template )> HWY_API V SaturatedSub(V a, V b) { return Sub(a, Min(a, b)); } #endif // HWY_NATIVE_U64_SATURATED_ADDSUB // ------------------------------ Unsigned to signed demotions template , DN>>, hwy::EnableIf<(sizeof(TFromD) < sizeof(TFromV))>* = nullptr, HWY_IF_LANES_D(DFromV, HWY_MAX_LANES_D(DFromV))> HWY_API VFromD DemoteTo(DN dn, V v) { const DFromV d; const RebindToSigned di; const RebindToUnsigned dn_u; // First, do a signed to signed demotion. This will convert any values // that are greater than hwy::HighestValue>>() to a // negative value. const auto i2i_demote_result = DemoteTo(dn, BitCast(di, v)); // Second, convert any negative values to hwy::HighestValue>() // using an unsigned Min operation. const auto max_signed_val = Set(dn, hwy::HighestValue>()); return BitCast( dn, Min(BitCast(dn_u, i2i_demote_result), BitCast(dn_u, max_signed_val))); } #if HWY_TARGET != HWY_SCALAR || HWY_IDE template , DN>>, HWY_IF_T_SIZE_V(V, sizeof(TFromD) * 2), HWY_IF_LANES_D(DFromV, HWY_MAX_LANES_D(DFromV))> HWY_API VFromD ReorderDemote2To(DN dn, V a, V b) { const DFromV d; const RebindToSigned di; const RebindToUnsigned dn_u; // First, do a signed to signed demotion. This will convert any values // that are greater than hwy::HighestValue>>() to a // negative value. const auto i2i_demote_result = ReorderDemote2To(dn, BitCast(di, a), BitCast(di, b)); // Second, convert any negative values to hwy::HighestValue>() // using an unsigned Min operation. const auto max_signed_val = Set(dn, hwy::HighestValue>()); return BitCast( dn, Min(BitCast(dn_u, i2i_demote_result), BitCast(dn_u, max_signed_val))); } #endif // ------------------------------ OrderedTruncate2To #if HWY_IDE || \ (defined(HWY_NATIVE_ORDERED_TRUNCATE_2_TO) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_ORDERED_TRUNCATE_2_TO #undef HWY_NATIVE_ORDERED_TRUNCATE_2_TO #else #define HWY_NATIVE_ORDERED_TRUNCATE_2_TO #endif // (Must come after HWY_TARGET_TOGGLE, else we don't reset it for scalar) #if HWY_TARGET != HWY_SCALAR || HWY_IDE template ) * 2), HWY_IF_LANES_D(DFromV>, HWY_MAX_LANES_D(DFromV) * 2)> HWY_API VFromD OrderedTruncate2To(DN dn, V a, V b) { return ConcatEven(dn, BitCast(dn, b), BitCast(dn, a)); } #endif // HWY_TARGET != HWY_SCALAR #endif // HWY_NATIVE_ORDERED_TRUNCATE_2_TO // -------------------- LeadingZeroCount, TrailingZeroCount, HighestSetBitIndex #if (defined(HWY_NATIVE_LEADING_ZERO_COUNT) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_LEADING_ZERO_COUNT #undef HWY_NATIVE_LEADING_ZERO_COUNT #else #define HWY_NATIVE_LEADING_ZERO_COUNT #endif namespace detail { template HWY_INLINE VFromD UIntToF32BiasedExp(D d, VFromD v) { const RebindToFloat df; #if HWY_TARGET > HWY_AVX3 && HWY_TARGET <= HWY_SSE2 const RebindToSigned di; const Repartition di16; // On SSE2/SSSE3/SSE4/AVX2, do an int32_t to float conversion, followed // by a unsigned right shift of the uint32_t bit representation of the // floating point values by 23, followed by an int16_t Min // operation as we are only interested in the biased exponent that would // result from a uint32_t to float conversion. 
// An int32_t to float vector conversion is also much more efficient on // SSE2/SSSE3/SSE4/AVX2 than an uint32_t vector to float vector conversion // as an uint32_t vector to float vector conversion on SSE2/SSSE3/SSE4/AVX2 // requires multiple instructions whereas an int32_t to float vector // conversion can be carried out using a single instruction on // SSE2/SSSE3/SSE4/AVX2. const auto f32_bits = BitCast(d, ConvertTo(df, BitCast(di, v))); return BitCast(d, Min(BitCast(di16, ShiftRight<23>(f32_bits)), BitCast(di16, Set(d, 158)))); #else const auto f32_bits = BitCast(d, ConvertTo(df, v)); return BitCast(d, ShiftRight<23>(f32_bits)); #endif } template )> HWY_INLINE V I32RangeU32ToF32BiasedExp(V v) { // I32RangeU32ToF32BiasedExp is similar to UIntToF32BiasedExp, but // I32RangeU32ToF32BiasedExp assumes that v[i] is between 0 and 2147483647. const DFromV d; const RebindToFloat df; #if HWY_TARGET > HWY_AVX3 && HWY_TARGET <= HWY_SSE2 const RebindToSigned d_src; #else const RebindToUnsigned d_src; #endif const auto f32_bits = BitCast(d, ConvertTo(df, BitCast(d_src, v))); return ShiftRight<23>(f32_bits); } template HWY_INLINE VFromD UIntToF32BiasedExp(D d, VFromD v) { const Rebind du32; const auto f32_biased_exp_as_u32 = I32RangeU32ToF32BiasedExp(PromoteTo(du32, v)); return TruncateTo(d, f32_biased_exp_as_u32); } #if HWY_TARGET != HWY_SCALAR template HWY_INLINE VFromD UIntToF32BiasedExp(D d, VFromD v) { const Half dh; const Rebind du32; const auto lo_u32 = PromoteTo(du32, LowerHalf(dh, v)); const auto hi_u32 = PromoteTo(du32, UpperHalf(dh, v)); const auto lo_f32_biased_exp_as_u32 = I32RangeU32ToF32BiasedExp(lo_u32); const auto hi_f32_biased_exp_as_u32 = I32RangeU32ToF32BiasedExp(hi_u32); #if HWY_TARGET <= HWY_SSE2 const RebindToSigned di32; const RebindToSigned di; return BitCast(d, OrderedDemote2To(di, BitCast(di32, lo_f32_biased_exp_as_u32), BitCast(di32, hi_f32_biased_exp_as_u32))); #else return OrderedTruncate2To(d, lo_f32_biased_exp_as_u32, hi_f32_biased_exp_as_u32); #endif } #endif // HWY_TARGET != HWY_SCALAR template HWY_INLINE VFromD UIntToF32BiasedExp(D d, VFromD v) { const Rebind du32; const auto f32_biased_exp_as_u32 = I32RangeU32ToF32BiasedExp(PromoteTo(du32, v)); return U8FromU32(f32_biased_exp_as_u32); } #if HWY_TARGET != HWY_SCALAR template HWY_INLINE VFromD UIntToF32BiasedExp(D d, VFromD v) { const Half dh; const Rebind du32; const Repartition du16; const auto lo_u32 = PromoteTo(du32, LowerHalf(dh, v)); const auto hi_u32 = PromoteTo(du32, UpperHalf(dh, v)); const auto lo_f32_biased_exp_as_u32 = I32RangeU32ToF32BiasedExp(lo_u32); const auto hi_f32_biased_exp_as_u32 = I32RangeU32ToF32BiasedExp(hi_u32); #if HWY_TARGET <= HWY_SSE2 const RebindToSigned di32; const RebindToSigned di16; const auto f32_biased_exp_as_i16 = OrderedDemote2To(di16, BitCast(di32, lo_f32_biased_exp_as_u32), BitCast(di32, hi_f32_biased_exp_as_u32)); return DemoteTo(d, f32_biased_exp_as_i16); #else const auto f32_biased_exp_as_u16 = OrderedTruncate2To( du16, lo_f32_biased_exp_as_u32, hi_f32_biased_exp_as_u32); return TruncateTo(d, f32_biased_exp_as_u16); #endif } template HWY_INLINE VFromD UIntToF32BiasedExp(D d, VFromD v) { const Half dh; const Half dq; const Rebind du32; const Repartition du16; const auto lo_half = LowerHalf(dh, v); const auto hi_half = UpperHalf(dh, v); const auto u32_q0 = PromoteTo(du32, LowerHalf(dq, lo_half)); const auto u32_q1 = PromoteTo(du32, UpperHalf(dq, lo_half)); const auto u32_q2 = PromoteTo(du32, LowerHalf(dq, hi_half)); const auto u32_q3 = PromoteTo(du32, UpperHalf(dq, hi_half)); 
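// Each quarter of the u8 input has now been widened to u32. Below, each
// quarter is converted to float so that its biased exponent (bits 30..23)
// identifies the highest set bit, and the four u32 results are then narrowed
// back to u8 lanes.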
const auto f32_biased_exp_as_u32_q0 = I32RangeU32ToF32BiasedExp(u32_q0); const auto f32_biased_exp_as_u32_q1 = I32RangeU32ToF32BiasedExp(u32_q1); const auto f32_biased_exp_as_u32_q2 = I32RangeU32ToF32BiasedExp(u32_q2); const auto f32_biased_exp_as_u32_q3 = I32RangeU32ToF32BiasedExp(u32_q3); #if HWY_TARGET <= HWY_SSE2 const RebindToSigned di32; const RebindToSigned di16; const auto lo_f32_biased_exp_as_i16 = OrderedDemote2To(di16, BitCast(di32, f32_biased_exp_as_u32_q0), BitCast(di32, f32_biased_exp_as_u32_q1)); const auto hi_f32_biased_exp_as_i16 = OrderedDemote2To(di16, BitCast(di32, f32_biased_exp_as_u32_q2), BitCast(di32, f32_biased_exp_as_u32_q3)); return OrderedDemote2To(d, lo_f32_biased_exp_as_i16, hi_f32_biased_exp_as_i16); #else const auto lo_f32_biased_exp_as_u16 = OrderedTruncate2To( du16, f32_biased_exp_as_u32_q0, f32_biased_exp_as_u32_q1); const auto hi_f32_biased_exp_as_u16 = OrderedTruncate2To( du16, f32_biased_exp_as_u32_q2, f32_biased_exp_as_u32_q3); return OrderedTruncate2To(d, lo_f32_biased_exp_as_u16, hi_f32_biased_exp_as_u16); #endif } #endif // HWY_TARGET != HWY_SCALAR #if HWY_TARGET == HWY_SCALAR template using F32ExpLzcntMinMaxRepartition = RebindToUnsigned; #elif HWY_TARGET >= HWY_SSSE3 && HWY_TARGET <= HWY_SSE2 template using F32ExpLzcntMinMaxRepartition = Repartition; #else template using F32ExpLzcntMinMaxRepartition = Repartition), 4)>, D>; #endif template using F32ExpLzcntMinMaxCmpV = VFromD>>; template HWY_INLINE F32ExpLzcntMinMaxCmpV F32ExpLzcntMinMaxBitCast(V v) { const DFromV d; const F32ExpLzcntMinMaxRepartition d2; return BitCast(d2, v); } template HWY_INLINE VFromD UIntToF32BiasedExp(D d, VFromD v) { #if HWY_TARGET == HWY_SCALAR const uint64_t u64_val = GetLane(v); const float f32_val = static_cast(u64_val); uint32_t f32_bits; CopySameSize(&f32_val, &f32_bits); return Set(d, static_cast(f32_bits >> 23)); #else const Repartition du32; const auto f32_biased_exp = UIntToF32BiasedExp(du32, BitCast(du32, v)); const auto f32_biased_exp_adj = IfThenZeroElse(Eq(f32_biased_exp, Zero(du32)), BitCast(du32, Set(d, 0x0000002000000000u))); const auto adj_f32_biased_exp = Add(f32_biased_exp, f32_biased_exp_adj); return ShiftRight<32>(BitCast( d, Max(F32ExpLzcntMinMaxBitCast(adj_f32_biased_exp), F32ExpLzcntMinMaxBitCast(Reverse2(du32, adj_f32_biased_exp))))); #endif } template HWY_INLINE V UIntToF32BiasedExp(V v) { const DFromV d; return UIntToF32BiasedExp(d, v); } template HWY_INLINE V NormalizeForUIntTruncConvToF32(V v) { return v; } template HWY_INLINE V NormalizeForUIntTruncConvToF32(V v) { // If v[i] >= 16777216 is true, make sure that the bit at // HighestSetBitIndex(v[i]) - 24 is zeroed out to ensure that any inexact // conversion to single-precision floating point is rounded down. // This zeroing-out can be accomplished through the AndNot operation below. 
return AndNot(ShiftRight<24>(v), v); } } // namespace detail template HWY_API V HighestSetBitIndex(V v) { const DFromV d; const RebindToUnsigned du; using TU = TFromD; const auto f32_biased_exp = detail::UIntToF32BiasedExp( detail::NormalizeForUIntTruncConvToF32(BitCast(du, v))); return BitCast(d, Sub(f32_biased_exp, Set(du, TU{127}))); } template HWY_API V LeadingZeroCount(V v) { const DFromV d; const RebindToUnsigned du; using TU = TFromD; constexpr TU kNumOfBitsInT{sizeof(TU) * 8}; const auto f32_biased_exp = detail::UIntToF32BiasedExp( detail::NormalizeForUIntTruncConvToF32(BitCast(du, v))); const auto lz_count = Sub(Set(du, TU{kNumOfBitsInT + 126}), f32_biased_exp); return BitCast(d, Min(detail::F32ExpLzcntMinMaxBitCast(lz_count), detail::F32ExpLzcntMinMaxBitCast(Set(du, kNumOfBitsInT)))); } template HWY_API V TrailingZeroCount(V v) { const DFromV d; const RebindToUnsigned du; const RebindToSigned di; using TU = TFromD; const auto vi = BitCast(di, v); const auto lowest_bit = BitCast(du, And(vi, Neg(vi))); constexpr TU kNumOfBitsInT{sizeof(TU) * 8}; const auto f32_biased_exp = detail::UIntToF32BiasedExp(lowest_bit); const auto tz_count = Sub(f32_biased_exp, Set(du, TU{127})); return BitCast(d, Min(detail::F32ExpLzcntMinMaxBitCast(tz_count), detail::F32ExpLzcntMinMaxBitCast(Set(du, kNumOfBitsInT)))); } #endif // HWY_NATIVE_LEADING_ZERO_COUNT // ------------------------------ AESRound // Cannot implement on scalar: need at least 16 bytes for TableLookupBytes. #if HWY_TARGET != HWY_SCALAR || HWY_IDE // Define for white-box testing, even if native instructions are available. namespace detail { // Constant-time: computes inverse in GF(2^4) based on "Accelerating AES with // Vector Permute Instructions" and the accompanying assembly language // implementation: https://crypto.stanford.edu/vpaes/vpaes.tgz. See also Botan: // https://botan.randombit.net/doxygen/aes__vperm_8cpp_source.html . // // A brute-force 256 byte table lookup can also be made constant-time, and // possibly competitive on NEON, but this is more performance-portable // especially for x86 and large vectors. template // u8 HWY_INLINE V SubBytesMulInverseAndAffineLookup(V state, V affine_tblL, V affine_tblU) { const DFromV du; const auto mask = Set(du, uint8_t{0xF}); // Change polynomial basis to GF(2^4) { alignas(16) static constexpr uint8_t basisL[16] = { 0x00, 0x70, 0x2A, 0x5A, 0x98, 0xE8, 0xB2, 0xC2, 0x08, 0x78, 0x22, 0x52, 0x90, 0xE0, 0xBA, 0xCA}; alignas(16) static constexpr uint8_t basisU[16] = { 0x00, 0x4D, 0x7C, 0x31, 0x7D, 0x30, 0x01, 0x4C, 0x81, 0xCC, 0xFD, 0xB0, 0xFC, 0xB1, 0x80, 0xCD}; const auto sL = And(state, mask); const auto sU = ShiftRight<4>(state); // byte shift => upper bits are zero const auto gf4L = TableLookupBytes(LoadDup128(du, basisL), sL); const auto gf4U = TableLookupBytes(LoadDup128(du, basisU), sU); state = Xor(gf4L, gf4U); } // Inversion in GF(2^4). Elements 0 represent "infinity" (division by 0) and // cause TableLookupBytesOr0 to return 0. 
alignas(16) static constexpr uint8_t kZetaInv[16] = { 0x80, 7, 11, 15, 6, 10, 4, 1, 9, 8, 5, 2, 12, 14, 13, 3}; alignas(16) static constexpr uint8_t kInv[16] = { 0x80, 1, 8, 13, 15, 6, 5, 14, 2, 12, 11, 10, 9, 3, 7, 4}; const auto tbl = LoadDup128(du, kInv); const auto sL = And(state, mask); // L=low nibble, U=upper const auto sU = ShiftRight<4>(state); // byte shift => upper bits are zero const auto sX = Xor(sU, sL); const auto invL = TableLookupBytes(LoadDup128(du, kZetaInv), sL); const auto invU = TableLookupBytes(tbl, sU); const auto invX = TableLookupBytes(tbl, sX); const auto outL = Xor(sX, TableLookupBytesOr0(tbl, Xor(invL, invU))); const auto outU = Xor(sU, TableLookupBytesOr0(tbl, Xor(invL, invX))); const auto affL = TableLookupBytesOr0(affine_tblL, outL); const auto affU = TableLookupBytesOr0(affine_tblU, outU); return Xor(affL, affU); } template // u8 HWY_INLINE V SubBytes(V state) { const DFromV du; // Linear skew (cannot bake 0x63 bias into the table because out* indices // may have the infinity flag set). alignas(16) static constexpr uint8_t kAffineL[16] = { 0x00, 0xC7, 0xBD, 0x6F, 0x17, 0x6D, 0xD2, 0xD0, 0x78, 0xA8, 0x02, 0xC5, 0x7A, 0xBF, 0xAA, 0x15}; alignas(16) static constexpr uint8_t kAffineU[16] = { 0x00, 0x6A, 0xBB, 0x5F, 0xA5, 0x74, 0xE4, 0xCF, 0xFA, 0x35, 0x2B, 0x41, 0xD1, 0x90, 0x1E, 0x8E}; return Xor(SubBytesMulInverseAndAffineLookup(state, LoadDup128(du, kAffineL), LoadDup128(du, kAffineU)), Set(du, uint8_t{0x63})); } template // u8 HWY_INLINE V InvSubBytes(V state) { const DFromV du; alignas(16) static constexpr uint8_t kGF2P4InvToGF2P8InvL[16]{ 0x00, 0x40, 0xF9, 0x7E, 0x53, 0xEA, 0x87, 0x13, 0x2D, 0x3E, 0x94, 0xD4, 0xB9, 0x6D, 0xAA, 0xC7}; alignas(16) static constexpr uint8_t kGF2P4InvToGF2P8InvU[16]{ 0x00, 0x1D, 0x44, 0x93, 0x0F, 0x56, 0xD7, 0x12, 0x9C, 0x8E, 0xC5, 0xD8, 0x59, 0x81, 0x4B, 0xCA}; // Apply the inverse affine transformation const auto b = Xor(Xor3(Or(ShiftLeft<1>(state), ShiftRight<7>(state)), Or(ShiftLeft<3>(state), ShiftRight<5>(state)), Or(ShiftLeft<6>(state), ShiftRight<2>(state))), Set(du, uint8_t{0x05})); // The GF(2^8) multiplicative inverse is computed as follows: // - Changing the polynomial basis to GF(2^4) // - Computing the GF(2^4) multiplicative inverse // - Converting the GF(2^4) multiplicative inverse to the GF(2^8) // multiplicative inverse through table lookups using the // kGF2P4InvToGF2P8InvL and kGF2P4InvToGF2P8InvU tables return SubBytesMulInverseAndAffineLookup( b, LoadDup128(du, kGF2P4InvToGF2P8InvL), LoadDup128(du, kGF2P4InvToGF2P8InvU)); } } // namespace detail #endif // HWY_TARGET != HWY_SCALAR // "Include guard": skip if native AES instructions are available. 
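// Illustrative sketch only (not part of this header): whether the native or
// the generic implementation below is used, AESRound and AESLastRound compose
// into AES-128 encryption of one block roughly as follows, assuming
// round_keys[] holds the 11 expanded round keys:
//   auto state = Xor(block, round_keys[0]);            // initial AddRoundKey
//   for (int r = 1; r < 10; ++r) state = AESRound(state, round_keys[r]);
//   state = AESLastRound(state, round_keys[10]);       // omits MixColumns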
#if (defined(HWY_NATIVE_AES) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_AES #undef HWY_NATIVE_AES #else #define HWY_NATIVE_AES #endif // (Must come after HWY_TARGET_TOGGLE, else we don't reset it for scalar) #if HWY_TARGET != HWY_SCALAR namespace detail { template // u8 HWY_API V ShiftRows(const V state) { const DFromV du; alignas(16) static constexpr uint8_t kShiftRow[16] = { 0, 5, 10, 15, // transposed: state is column major 4, 9, 14, 3, // 8, 13, 2, 7, // 12, 1, 6, 11}; const auto shift_row = LoadDup128(du, kShiftRow); return TableLookupBytes(state, shift_row); } template // u8 HWY_API V InvShiftRows(const V state) { const DFromV du; alignas(16) static constexpr uint8_t kShiftRow[16] = { 0, 13, 10, 7, // transposed: state is column major 4, 1, 14, 11, // 8, 5, 2, 15, // 12, 9, 6, 3}; const auto shift_row = LoadDup128(du, kShiftRow); return TableLookupBytes(state, shift_row); } template // u8 HWY_API V GF2P8Mod11BMulBy2(V v) { const DFromV du; const RebindToSigned di; // can only do signed comparisons const auto msb = Lt(BitCast(di, v), Zero(di)); const auto overflow = BitCast(du, IfThenElseZero(msb, Set(di, int8_t{0x1B}))); return Xor(Add(v, v), overflow); // = v*2 in GF(2^8). } template // u8 HWY_API V MixColumns(const V state) { const DFromV du; // For each column, the rows are the sum of GF(2^8) matrix multiplication by: // 2 3 1 1 // Let s := state*1, d := state*2, t := state*3. // 1 2 3 1 // d are on diagonal, no permutation needed. // 1 1 2 3 // t1230 indicates column indices of threes for the 4 rows. // 3 1 1 2 // We also need to compute s2301 and s3012 (=1230 o 2301). alignas(16) static constexpr uint8_t k2301[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; alignas(16) static constexpr uint8_t k1230[16] = { 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12}; const auto d = GF2P8Mod11BMulBy2(state); // = state*2 in GF(2^8). 
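  // Sketch of the algebra in the lines below: with s = state and d = 2*s,
  // lane 0 of the result is (d0 ^ s2) ^ (s1 ^ d1 ^ s3) = 2*s0 ^ 3*s1 ^ s2 ^ s3,
  // i.e. the first row of the matrix above; the other rows follow because the
  // same terms are rotated within each column by the k2301/k1230 shuffles.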
const auto s2301 = TableLookupBytes(state, LoadDup128(du, k2301)); const auto d_s2301 = Xor(d, s2301); const auto t_s2301 = Xor(state, d_s2301); // t(s*3) = XOR-sum {s, d(s*2)} const auto t1230_s3012 = TableLookupBytes(t_s2301, LoadDup128(du, k1230)); return Xor(d_s2301, t1230_s3012); // XOR-sum of 4 terms } template // u8 HWY_API V InvMixColumns(const V state) { const DFromV du; // For each column, the rows are the sum of GF(2^8) matrix multiplication by: // 14 11 13 9 // 9 14 11 13 // 13 9 14 11 // 11 13 9 14 alignas(16) static constexpr uint8_t k2301[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; alignas(16) static constexpr uint8_t k1230[16] = { 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12}; const auto v1230 = LoadDup128(du, k1230); const auto sx2 = GF2P8Mod11BMulBy2(state); /* = state*2 in GF(2^8) */ const auto sx4 = GF2P8Mod11BMulBy2(sx2); /* = state*4 in GF(2^8) */ const auto sx8 = GF2P8Mod11BMulBy2(sx4); /* = state*8 in GF(2^8) */ const auto sx9 = Xor(sx8, state); /* = state*9 in GF(2^8) */ const auto sx11 = Xor(sx9, sx2); /* = state*11 in GF(2^8) */ const auto sx13 = Xor(sx9, sx4); /* = state*13 in GF(2^8) */ const auto sx14 = Xor3(sx8, sx4, sx2); /* = state*14 in GF(2^8) */ const auto sx13_0123_sx9_1230 = Xor(sx13, TableLookupBytes(sx9, v1230)); const auto sx14_0123_sx11_1230 = Xor(sx14, TableLookupBytes(sx11, v1230)); const auto sx13_2301_sx9_3012 = TableLookupBytes(sx13_0123_sx9_1230, LoadDup128(du, k2301)); return Xor(sx14_0123_sx11_1230, sx13_2301_sx9_3012); } } // namespace detail template // u8 HWY_API V AESRound(V state, const V round_key) { // Intel docs swap the first two steps, but it does not matter because // ShiftRows is a permutation and SubBytes is independent of lane index. state = detail::SubBytes(state); state = detail::ShiftRows(state); state = detail::MixColumns(state); state = Xor(state, round_key); // AddRoundKey return state; } template // u8 HWY_API V AESLastRound(V state, const V round_key) { // Like AESRound, but without MixColumns. state = detail::SubBytes(state); state = detail::ShiftRows(state); state = Xor(state, round_key); // AddRoundKey return state; } template HWY_API V AESInvMixColumns(V state) { return detail::InvMixColumns(state); } template // u8 HWY_API V AESRoundInv(V state, const V round_key) { state = detail::InvSubBytes(state); state = detail::InvShiftRows(state); state = detail::InvMixColumns(state); state = Xor(state, round_key); // AddRoundKey return state; } template // u8 HWY_API V AESLastRoundInv(V state, const V round_key) { // Like AESRoundInv, but without InvMixColumns. state = detail::InvSubBytes(state); state = detail::InvShiftRows(state); state = Xor(state, round_key); // AddRoundKey return state; } template )> HWY_API V AESKeyGenAssist(V v) { alignas(16) static constexpr uint8_t kRconXorMask[16] = { 0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0, 0, 0, kRcon, 0, 0, 0}; alignas(16) static constexpr uint8_t kRotWordShuffle[16] = { 4, 5, 6, 7, 5, 6, 7, 4, 12, 13, 14, 15, 13, 14, 15, 12}; const DFromV d; const auto sub_word_result = detail::SubBytes(v); const auto rot_word_result = TableLookupBytes(sub_word_result, LoadDup128(d, kRotWordShuffle)); return Xor(rot_word_result, LoadDup128(d, kRconXorMask)); } // Constant-time implementation inspired by // https://www.bearssl.org/constanttime.html, but about half the cost because we // use 64x64 multiplies and 128-bit XORs.
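// Sketch of the idea: a and b are each split into four slices holding only
// bits whose position is 0, 1, 2 or 3 (mod 4). m0..m3 XOR together the
// MulEven products of slice pairs whose position residues sum to 0..3 (mod 4);
// per the linked article, the sparseness of the slices keeps integer carries
// away from the bit positions that the final k1/k2/k4/k8 masks retain, so the
// OR of the masked m0..m3 is the 128-bit carry-less product of the even
// (lower) u64 lanes. CLMulUpper does the same with MulOdd for the odd lanes.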
template HWY_API V CLMulLower(V a, V b) { const DFromV d; static_assert(IsSame, uint64_t>(), "V must be u64"); const auto k1 = Set(d, 0x1111111111111111ULL); const auto k2 = Set(d, 0x2222222222222222ULL); const auto k4 = Set(d, 0x4444444444444444ULL); const auto k8 = Set(d, 0x8888888888888888ULL); const auto a0 = And(a, k1); const auto a1 = And(a, k2); const auto a2 = And(a, k4); const auto a3 = And(a, k8); const auto b0 = And(b, k1); const auto b1 = And(b, k2); const auto b2 = And(b, k4); const auto b3 = And(b, k8); auto m0 = Xor(MulEven(a0, b0), MulEven(a1, b3)); auto m1 = Xor(MulEven(a0, b1), MulEven(a1, b0)); auto m2 = Xor(MulEven(a0, b2), MulEven(a1, b1)); auto m3 = Xor(MulEven(a0, b3), MulEven(a1, b2)); m0 = Xor(m0, Xor(MulEven(a2, b2), MulEven(a3, b1))); m1 = Xor(m1, Xor(MulEven(a2, b3), MulEven(a3, b2))); m2 = Xor(m2, Xor(MulEven(a2, b0), MulEven(a3, b3))); m3 = Xor(m3, Xor(MulEven(a2, b1), MulEven(a3, b0))); return Or(Or(And(m0, k1), And(m1, k2)), Or(And(m2, k4), And(m3, k8))); } template HWY_API V CLMulUpper(V a, V b) { const DFromV d; static_assert(IsSame, uint64_t>(), "V must be u64"); const auto k1 = Set(d, 0x1111111111111111ULL); const auto k2 = Set(d, 0x2222222222222222ULL); const auto k4 = Set(d, 0x4444444444444444ULL); const auto k8 = Set(d, 0x8888888888888888ULL); const auto a0 = And(a, k1); const auto a1 = And(a, k2); const auto a2 = And(a, k4); const auto a3 = And(a, k8); const auto b0 = And(b, k1); const auto b1 = And(b, k2); const auto b2 = And(b, k4); const auto b3 = And(b, k8); auto m0 = Xor(MulOdd(a0, b0), MulOdd(a1, b3)); auto m1 = Xor(MulOdd(a0, b1), MulOdd(a1, b0)); auto m2 = Xor(MulOdd(a0, b2), MulOdd(a1, b1)); auto m3 = Xor(MulOdd(a0, b3), MulOdd(a1, b2)); m0 = Xor(m0, Xor(MulOdd(a2, b2), MulOdd(a3, b1))); m1 = Xor(m1, Xor(MulOdd(a2, b3), MulOdd(a3, b2))); m2 = Xor(m2, Xor(MulOdd(a2, b0), MulOdd(a3, b3))); m3 = Xor(m3, Xor(MulOdd(a2, b1), MulOdd(a3, b0))); return Or(Or(And(m0, k1), And(m1, k2)), Or(And(m2, k4), And(m3, k8))); } #endif // HWY_NATIVE_AES #endif // HWY_TARGET != HWY_SCALAR // ------------------------------ PopulationCount // "Include guard": skip if native POPCNT-related instructions are available. #if (defined(HWY_NATIVE_POPCNT) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_POPCNT #undef HWY_NATIVE_POPCNT #else #define HWY_NATIVE_POPCNT #endif // This overload requires vectors to be at least 16 bytes, which is the case // for LMUL >= 2. #undef HWY_IF_POPCNT #if HWY_TARGET == HWY_RVV #define HWY_IF_POPCNT(D) \ hwy::EnableIf= 1 && D().MaxLanes() >= 16>* = nullptr #else // Other targets only have these two overloads which are mutually exclusive, so // no further conditions are required. #define HWY_IF_POPCNT(D) void* = nullptr #endif // HWY_TARGET == HWY_RVV template , HWY_IF_U8_D(D), HWY_IF_V_SIZE_GT_D(D, 8), HWY_IF_POPCNT(D)> HWY_API V PopulationCount(V v) { const D d; HWY_ALIGN constexpr uint8_t kLookup[16] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, }; const auto lo = And(v, Set(d, uint8_t{0xF})); const auto hi = ShiftRight<4>(v); const auto lookup = LoadDup128(d, kLookup); return Add(TableLookupBytes(lookup, hi), TableLookupBytes(lookup, lo)); } // RVV has a specialization that avoids the Set(). #if HWY_TARGET != HWY_RVV // Slower fallback for capped vectors. 
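// Sketch of the bit trick below (same idea as Figure 3 of the cited paper):
// the first step leaves each 2-bit field holding the count of its two bits,
// the second sums adjacent 2-bit fields into nibble counts (mask 0x33), and
// the last adds the two nibbles of each byte and masks with 0x0F.
// E.g. 0xB1 -> 0x61 -> 0x31 -> 0x04 (four bits set).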
template , HWY_IF_U8_D(D), HWY_IF_V_SIZE_LE_D(D, 8)> HWY_API V PopulationCount(V v) { const D d; // See https://arxiv.org/pdf/1611.07612.pdf, Figure 3 const V k33 = Set(d, uint8_t{0x33}); v = Sub(v, And(ShiftRight<1>(v), Set(d, uint8_t{0x55}))); v = Add(And(ShiftRight<2>(v), k33), And(v, k33)); return And(Add(v, ShiftRight<4>(v)), Set(d, uint8_t{0x0F})); } #endif // HWY_TARGET != HWY_RVV template , HWY_IF_U16_D(D)> HWY_API V PopulationCount(V v) { const D d; const Repartition d8; const auto vals = BitCast(d, PopulationCount(BitCast(d8, v))); return Add(ShiftRight<8>(vals), And(vals, Set(d, uint16_t{0xFF}))); } template , HWY_IF_U32_D(D)> HWY_API V PopulationCount(V v) { const D d; Repartition d16; auto vals = BitCast(d, PopulationCount(BitCast(d16, v))); return Add(ShiftRight<16>(vals), And(vals, Set(d, uint32_t{0xFF}))); } #if HWY_HAVE_INTEGER64 template , HWY_IF_U64_D(D)> HWY_API V PopulationCount(V v) { const D d; Repartition d32; auto vals = BitCast(d, PopulationCount(BitCast(d32, v))); return Add(ShiftRight<32>(vals), And(vals, Set(d, 0xFFULL))); } #endif #endif // HWY_NATIVE_POPCNT // ------------------------------ 8-bit multiplication // "Include guard": skip if native 8-bit mul instructions are available. #if (defined(HWY_NATIVE_MUL_8) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE #ifdef HWY_NATIVE_MUL_8 #undef HWY_NATIVE_MUL_8 #else #define HWY_NATIVE_MUL_8 #endif // 8 bit and fits in wider reg: promote template HWY_API V operator*(const V a, const V b) { const DFromV d; const Rebind>, decltype(d)> dw; const RebindToUnsigned du; // TruncateTo result const RebindToUnsigned dwu; // TruncateTo input const VFromD mul = PromoteTo(dw, a) * PromoteTo(dw, b); // TruncateTo is cheaper than ConcatEven. return BitCast(d, TruncateTo(du, BitCast(dwu, mul))); } // 8 bit full reg: promote halves template HWY_API V operator*(const V a, const V b) { const DFromV d; const Half dh; const Twice> dw; const VFromD a0 = PromoteTo(dw, LowerHalf(dh, a)); const VFromD a1 = PromoteTo(dw, UpperHalf(dh, a)); const VFromD b0 = PromoteTo(dw, LowerHalf(dh, b)); const VFromD b1 = PromoteTo(dw, UpperHalf(dh, b)); const VFromD m0 = a0 * b0; const VFromD m1 = a1 * b1; return ConcatEven(d, BitCast(d, m1), BitCast(d, m0)); } #endif // HWY_NATIVE_MUL_8 // ------------------------------ 64-bit multiplication // "Include guard": skip if native 64-bit mul instructions are available. 
#if (defined(HWY_NATIVE_MUL_64) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE #ifdef HWY_NATIVE_MUL_64 #undef HWY_NATIVE_MUL_64 #else #define HWY_NATIVE_MUL_64 #endif // Single-lane i64 or u64 template HWY_API V operator*(V x, V y) { const DFromV d; using T = TFromD; using TU = MakeUnsigned; const TU xu = static_cast(GetLane(x)); const TU yu = static_cast(GetLane(y)); return Set(d, static_cast(xu * yu)); } template , HWY_IF_U64_D(D64), HWY_IF_V_SIZE_GT_D(D64, 8)> HWY_API V operator*(V x, V y) { RepartitionToNarrow d32; auto x32 = BitCast(d32, x); auto y32 = BitCast(d32, y); auto lolo = BitCast(d32, MulEven(x32, y32)); auto lohi = BitCast(d32, MulEven(x32, BitCast(d32, ShiftRight<32>(y)))); auto hilo = BitCast(d32, MulEven(BitCast(d32, ShiftRight<32>(x)), y32)); auto hi = BitCast(d32, ShiftLeft<32>(BitCast(D64{}, lohi + hilo))); return BitCast(D64{}, lolo + hi); } template , HWY_IF_I64_D(DI64), HWY_IF_V_SIZE_GT_D(DI64, 8)> HWY_API V operator*(V x, V y) { RebindToUnsigned du64; return BitCast(DI64{}, BitCast(du64, x) * BitCast(du64, y)); } #endif // HWY_NATIVE_MUL_64 // ------------------------------ MulAdd / NegMulAdd // "Include guard": skip if native int MulAdd instructions are available. #if (defined(HWY_NATIVE_INT_FMA) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_INT_FMA #undef HWY_NATIVE_INT_FMA #else #define HWY_NATIVE_INT_FMA #endif template HWY_API V MulAdd(V mul, V x, V add) { return Add(Mul(mul, x), add); } template HWY_API V NegMulAdd(V mul, V x, V add) { return Sub(add, Mul(mul, x)); } #endif // HWY_NATIVE_INT_FMA // ------------------------------ Compress* // "Include guard": skip if native 8-bit compress instructions are available. #if (defined(HWY_NATIVE_COMPRESS8) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_COMPRESS8 #undef HWY_NATIVE_COMPRESS8 #else #define HWY_NATIVE_COMPRESS8 #endif template HWY_API size_t CompressBitsStore(V v, const uint8_t* HWY_RESTRICT bits, D d, T* unaligned) { HWY_ALIGN T lanes[MaxLanes(d)]; Store(v, d, lanes); const Simd d8; T* HWY_RESTRICT pos = unaligned; HWY_ALIGN constexpr T table[2048] = { 0, 1, 2, 3, 4, 5, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, // 1, 0, 2, 3, 4, 5, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, // 2, 0, 1, 3, 4, 5, 6, 7, /**/ 0, 2, 1, 3, 4, 5, 6, 7, // 1, 2, 0, 3, 4, 5, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, // 3, 0, 1, 2, 4, 5, 6, 7, /**/ 0, 3, 1, 2, 4, 5, 6, 7, // 1, 3, 0, 2, 4, 5, 6, 7, /**/ 0, 1, 3, 2, 4, 5, 6, 7, // 2, 3, 0, 1, 4, 5, 6, 7, /**/ 0, 2, 3, 1, 4, 5, 6, 7, // 1, 2, 3, 0, 4, 5, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, // 4, 0, 1, 2, 3, 5, 6, 7, /**/ 0, 4, 1, 2, 3, 5, 6, 7, // 1, 4, 0, 2, 3, 5, 6, 7, /**/ 0, 1, 4, 2, 3, 5, 6, 7, // 2, 4, 0, 1, 3, 5, 6, 7, /**/ 0, 2, 4, 1, 3, 5, 6, 7, // 1, 2, 4, 0, 3, 5, 6, 7, /**/ 0, 1, 2, 4, 3, 5, 6, 7, // 3, 4, 0, 1, 2, 5, 6, 7, /**/ 0, 3, 4, 1, 2, 5, 6, 7, // 1, 3, 4, 0, 2, 5, 6, 7, /**/ 0, 1, 3, 4, 2, 5, 6, 7, // 2, 3, 4, 0, 1, 5, 6, 7, /**/ 0, 2, 3, 4, 1, 5, 6, 7, // 1, 2, 3, 4, 0, 5, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, // 5, 0, 1, 2, 3, 4, 6, 7, /**/ 0, 5, 1, 2, 3, 4, 6, 7, // 1, 5, 0, 2, 3, 4, 6, 7, /**/ 0, 1, 5, 2, 3, 4, 6, 7, // 2, 5, 0, 1, 3, 4, 6, 7, /**/ 0, 2, 5, 1, 3, 4, 6, 7, // 1, 2, 5, 0, 3, 4, 6, 7, /**/ 0, 1, 2, 5, 3, 4, 6, 7, // 3, 5, 0, 1, 2, 4, 6, 7, /**/ 0, 3, 5, 1, 2, 4, 6, 7, // 1, 3, 5, 0, 2, 4, 6, 7, /**/ 0, 1, 3, 5, 2, 4, 6, 7, // 2, 3, 5, 0, 1, 4, 6, 7, /**/ 0, 2, 3, 5, 1, 4, 6, 7, // 1, 2, 3, 5, 0, 4, 6, 7, /**/ 0, 1, 2, 3, 5, 4, 6, 7, // 4, 5, 0, 1, 2, 3, 6, 7, /**/ 0, 4, 5, 1, 2, 3, 6, 7, // 1, 4, 5, 0, 2, 3, 6, 7, /**/ 0, 1, 4, 5, 2, 3, 6, 7, // 2, 4, 5, 0, 1, 3, 6, 
7, /**/ 0, 2, 4, 5, 1, 3, 6, 7, // 1, 2, 4, 5, 0, 3, 6, 7, /**/ 0, 1, 2, 4, 5, 3, 6, 7, // 3, 4, 5, 0, 1, 2, 6, 7, /**/ 0, 3, 4, 5, 1, 2, 6, 7, // 1, 3, 4, 5, 0, 2, 6, 7, /**/ 0, 1, 3, 4, 5, 2, 6, 7, // 2, 3, 4, 5, 0, 1, 6, 7, /**/ 0, 2, 3, 4, 5, 1, 6, 7, // 1, 2, 3, 4, 5, 0, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, // 6, 0, 1, 2, 3, 4, 5, 7, /**/ 0, 6, 1, 2, 3, 4, 5, 7, // 1, 6, 0, 2, 3, 4, 5, 7, /**/ 0, 1, 6, 2, 3, 4, 5, 7, // 2, 6, 0, 1, 3, 4, 5, 7, /**/ 0, 2, 6, 1, 3, 4, 5, 7, // 1, 2, 6, 0, 3, 4, 5, 7, /**/ 0, 1, 2, 6, 3, 4, 5, 7, // 3, 6, 0, 1, 2, 4, 5, 7, /**/ 0, 3, 6, 1, 2, 4, 5, 7, // 1, 3, 6, 0, 2, 4, 5, 7, /**/ 0, 1, 3, 6, 2, 4, 5, 7, // 2, 3, 6, 0, 1, 4, 5, 7, /**/ 0, 2, 3, 6, 1, 4, 5, 7, // 1, 2, 3, 6, 0, 4, 5, 7, /**/ 0, 1, 2, 3, 6, 4, 5, 7, // 4, 6, 0, 1, 2, 3, 5, 7, /**/ 0, 4, 6, 1, 2, 3, 5, 7, // 1, 4, 6, 0, 2, 3, 5, 7, /**/ 0, 1, 4, 6, 2, 3, 5, 7, // 2, 4, 6, 0, 1, 3, 5, 7, /**/ 0, 2, 4, 6, 1, 3, 5, 7, // 1, 2, 4, 6, 0, 3, 5, 7, /**/ 0, 1, 2, 4, 6, 3, 5, 7, // 3, 4, 6, 0, 1, 2, 5, 7, /**/ 0, 3, 4, 6, 1, 2, 5, 7, // 1, 3, 4, 6, 0, 2, 5, 7, /**/ 0, 1, 3, 4, 6, 2, 5, 7, // 2, 3, 4, 6, 0, 1, 5, 7, /**/ 0, 2, 3, 4, 6, 1, 5, 7, // 1, 2, 3, 4, 6, 0, 5, 7, /**/ 0, 1, 2, 3, 4, 6, 5, 7, // 5, 6, 0, 1, 2, 3, 4, 7, /**/ 0, 5, 6, 1, 2, 3, 4, 7, // 1, 5, 6, 0, 2, 3, 4, 7, /**/ 0, 1, 5, 6, 2, 3, 4, 7, // 2, 5, 6, 0, 1, 3, 4, 7, /**/ 0, 2, 5, 6, 1, 3, 4, 7, // 1, 2, 5, 6, 0, 3, 4, 7, /**/ 0, 1, 2, 5, 6, 3, 4, 7, // 3, 5, 6, 0, 1, 2, 4, 7, /**/ 0, 3, 5, 6, 1, 2, 4, 7, // 1, 3, 5, 6, 0, 2, 4, 7, /**/ 0, 1, 3, 5, 6, 2, 4, 7, // 2, 3, 5, 6, 0, 1, 4, 7, /**/ 0, 2, 3, 5, 6, 1, 4, 7, // 1, 2, 3, 5, 6, 0, 4, 7, /**/ 0, 1, 2, 3, 5, 6, 4, 7, // 4, 5, 6, 0, 1, 2, 3, 7, /**/ 0, 4, 5, 6, 1, 2, 3, 7, // 1, 4, 5, 6, 0, 2, 3, 7, /**/ 0, 1, 4, 5, 6, 2, 3, 7, // 2, 4, 5, 6, 0, 1, 3, 7, /**/ 0, 2, 4, 5, 6, 1, 3, 7, // 1, 2, 4, 5, 6, 0, 3, 7, /**/ 0, 1, 2, 4, 5, 6, 3, 7, // 3, 4, 5, 6, 0, 1, 2, 7, /**/ 0, 3, 4, 5, 6, 1, 2, 7, // 1, 3, 4, 5, 6, 0, 2, 7, /**/ 0, 1, 3, 4, 5, 6, 2, 7, // 2, 3, 4, 5, 6, 0, 1, 7, /**/ 0, 2, 3, 4, 5, 6, 1, 7, // 1, 2, 3, 4, 5, 6, 0, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, // 7, 0, 1, 2, 3, 4, 5, 6, /**/ 0, 7, 1, 2, 3, 4, 5, 6, // 1, 7, 0, 2, 3, 4, 5, 6, /**/ 0, 1, 7, 2, 3, 4, 5, 6, // 2, 7, 0, 1, 3, 4, 5, 6, /**/ 0, 2, 7, 1, 3, 4, 5, 6, // 1, 2, 7, 0, 3, 4, 5, 6, /**/ 0, 1, 2, 7, 3, 4, 5, 6, // 3, 7, 0, 1, 2, 4, 5, 6, /**/ 0, 3, 7, 1, 2, 4, 5, 6, // 1, 3, 7, 0, 2, 4, 5, 6, /**/ 0, 1, 3, 7, 2, 4, 5, 6, // 2, 3, 7, 0, 1, 4, 5, 6, /**/ 0, 2, 3, 7, 1, 4, 5, 6, // 1, 2, 3, 7, 0, 4, 5, 6, /**/ 0, 1, 2, 3, 7, 4, 5, 6, // 4, 7, 0, 1, 2, 3, 5, 6, /**/ 0, 4, 7, 1, 2, 3, 5, 6, // 1, 4, 7, 0, 2, 3, 5, 6, /**/ 0, 1, 4, 7, 2, 3, 5, 6, // 2, 4, 7, 0, 1, 3, 5, 6, /**/ 0, 2, 4, 7, 1, 3, 5, 6, // 1, 2, 4, 7, 0, 3, 5, 6, /**/ 0, 1, 2, 4, 7, 3, 5, 6, // 3, 4, 7, 0, 1, 2, 5, 6, /**/ 0, 3, 4, 7, 1, 2, 5, 6, // 1, 3, 4, 7, 0, 2, 5, 6, /**/ 0, 1, 3, 4, 7, 2, 5, 6, // 2, 3, 4, 7, 0, 1, 5, 6, /**/ 0, 2, 3, 4, 7, 1, 5, 6, // 1, 2, 3, 4, 7, 0, 5, 6, /**/ 0, 1, 2, 3, 4, 7, 5, 6, // 5, 7, 0, 1, 2, 3, 4, 6, /**/ 0, 5, 7, 1, 2, 3, 4, 6, // 1, 5, 7, 0, 2, 3, 4, 6, /**/ 0, 1, 5, 7, 2, 3, 4, 6, // 2, 5, 7, 0, 1, 3, 4, 6, /**/ 0, 2, 5, 7, 1, 3, 4, 6, // 1, 2, 5, 7, 0, 3, 4, 6, /**/ 0, 1, 2, 5, 7, 3, 4, 6, // 3, 5, 7, 0, 1, 2, 4, 6, /**/ 0, 3, 5, 7, 1, 2, 4, 6, // 1, 3, 5, 7, 0, 2, 4, 6, /**/ 0, 1, 3, 5, 7, 2, 4, 6, // 2, 3, 5, 7, 0, 1, 4, 6, /**/ 0, 2, 3, 5, 7, 1, 4, 6, // 1, 2, 3, 5, 7, 0, 4, 6, /**/ 0, 1, 2, 3, 5, 7, 4, 6, // 4, 5, 7, 0, 1, 2, 3, 6, /**/ 0, 4, 5, 7, 1, 2, 3, 6, // 1, 4, 5, 7, 0, 2, 3, 6, /**/ 0, 1, 4, 5, 7, 2, 
3, 6, // 2, 4, 5, 7, 0, 1, 3, 6, /**/ 0, 2, 4, 5, 7, 1, 3, 6, // 1, 2, 4, 5, 7, 0, 3, 6, /**/ 0, 1, 2, 4, 5, 7, 3, 6, // 3, 4, 5, 7, 0, 1, 2, 6, /**/ 0, 3, 4, 5, 7, 1, 2, 6, // 1, 3, 4, 5, 7, 0, 2, 6, /**/ 0, 1, 3, 4, 5, 7, 2, 6, // 2, 3, 4, 5, 7, 0, 1, 6, /**/ 0, 2, 3, 4, 5, 7, 1, 6, // 1, 2, 3, 4, 5, 7, 0, 6, /**/ 0, 1, 2, 3, 4, 5, 7, 6, // 6, 7, 0, 1, 2, 3, 4, 5, /**/ 0, 6, 7, 1, 2, 3, 4, 5, // 1, 6, 7, 0, 2, 3, 4, 5, /**/ 0, 1, 6, 7, 2, 3, 4, 5, // 2, 6, 7, 0, 1, 3, 4, 5, /**/ 0, 2, 6, 7, 1, 3, 4, 5, // 1, 2, 6, 7, 0, 3, 4, 5, /**/ 0, 1, 2, 6, 7, 3, 4, 5, // 3, 6, 7, 0, 1, 2, 4, 5, /**/ 0, 3, 6, 7, 1, 2, 4, 5, // 1, 3, 6, 7, 0, 2, 4, 5, /**/ 0, 1, 3, 6, 7, 2, 4, 5, // 2, 3, 6, 7, 0, 1, 4, 5, /**/ 0, 2, 3, 6, 7, 1, 4, 5, // 1, 2, 3, 6, 7, 0, 4, 5, /**/ 0, 1, 2, 3, 6, 7, 4, 5, // 4, 6, 7, 0, 1, 2, 3, 5, /**/ 0, 4, 6, 7, 1, 2, 3, 5, // 1, 4, 6, 7, 0, 2, 3, 5, /**/ 0, 1, 4, 6, 7, 2, 3, 5, // 2, 4, 6, 7, 0, 1, 3, 5, /**/ 0, 2, 4, 6, 7, 1, 3, 5, // 1, 2, 4, 6, 7, 0, 3, 5, /**/ 0, 1, 2, 4, 6, 7, 3, 5, // 3, 4, 6, 7, 0, 1, 2, 5, /**/ 0, 3, 4, 6, 7, 1, 2, 5, // 1, 3, 4, 6, 7, 0, 2, 5, /**/ 0, 1, 3, 4, 6, 7, 2, 5, // 2, 3, 4, 6, 7, 0, 1, 5, /**/ 0, 2, 3, 4, 6, 7, 1, 5, // 1, 2, 3, 4, 6, 7, 0, 5, /**/ 0, 1, 2, 3, 4, 6, 7, 5, // 5, 6, 7, 0, 1, 2, 3, 4, /**/ 0, 5, 6, 7, 1, 2, 3, 4, // 1, 5, 6, 7, 0, 2, 3, 4, /**/ 0, 1, 5, 6, 7, 2, 3, 4, // 2, 5, 6, 7, 0, 1, 3, 4, /**/ 0, 2, 5, 6, 7, 1, 3, 4, // 1, 2, 5, 6, 7, 0, 3, 4, /**/ 0, 1, 2, 5, 6, 7, 3, 4, // 3, 5, 6, 7, 0, 1, 2, 4, /**/ 0, 3, 5, 6, 7, 1, 2, 4, // 1, 3, 5, 6, 7, 0, 2, 4, /**/ 0, 1, 3, 5, 6, 7, 2, 4, // 2, 3, 5, 6, 7, 0, 1, 4, /**/ 0, 2, 3, 5, 6, 7, 1, 4, // 1, 2, 3, 5, 6, 7, 0, 4, /**/ 0, 1, 2, 3, 5, 6, 7, 4, // 4, 5, 6, 7, 0, 1, 2, 3, /**/ 0, 4, 5, 6, 7, 1, 2, 3, // 1, 4, 5, 6, 7, 0, 2, 3, /**/ 0, 1, 4, 5, 6, 7, 2, 3, // 2, 4, 5, 6, 7, 0, 1, 3, /**/ 0, 2, 4, 5, 6, 7, 1, 3, // 1, 2, 4, 5, 6, 7, 0, 3, /**/ 0, 1, 2, 4, 5, 6, 7, 3, // 3, 4, 5, 6, 7, 0, 1, 2, /**/ 0, 3, 4, 5, 6, 7, 1, 2, // 1, 3, 4, 5, 6, 7, 0, 2, /**/ 0, 1, 3, 4, 5, 6, 7, 2, // 2, 3, 4, 5, 6, 7, 0, 1, /**/ 0, 2, 3, 4, 5, 6, 7, 1, // 1, 2, 3, 4, 5, 6, 7, 0, /**/ 0, 1, 2, 3, 4, 5, 6, 7}; for (size_t i = 0; i < Lanes(d); i += 8) { // Each byte worth of bits is the index of one of 256 8-byte ranges, and its // population count determines how far to advance the write position. const size_t bits8 = bits[i / 8]; const auto indices = Load(d8, table + bits8 * 8); const auto compressed = TableLookupBytes(LoadU(d8, lanes + i), indices); StoreU(compressed, d8, pos); pos += PopCount(bits8); } return static_cast(pos - unaligned); } template HWY_API size_t CompressStore(V v, M mask, D d, T* HWY_RESTRICT unaligned) { uint8_t bits[HWY_MAX(size_t{8}, MaxLanes(d) / 8)]; (void)StoreMaskBits(d, mask, bits); return CompressBitsStore(v, bits, d, unaligned); } template HWY_API size_t CompressBlendedStore(V v, M mask, D d, T* HWY_RESTRICT unaligned) { HWY_ALIGN T buf[MaxLanes(d)]; const size_t bytes = CompressStore(v, mask, d, buf); BlendedStore(Load(d, buf), FirstN(d, bytes), d, unaligned); return bytes; } // For reasons unknown, HWY_IF_T_SIZE_V is a compile error in SVE. 
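// Illustrative usage sketch (hypothetical caller, not part of this header;
// `v` is a u8 vector and `out` points to at least Lanes(d) writable bytes):
//   const ScalableTag<uint8_t> d;
//   const auto mask = Lt(v, Set(d, uint8_t{16}));      // keep small lanes
//   const size_t kept = CompressStore(v, mask, d, out); // may write Lanes(d)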
template , HWY_IF_T_SIZE(T, 1)> HWY_API V Compress(V v, const M mask) { const DFromV d; HWY_ALIGN T lanes[MaxLanes(d)]; (void)CompressStore(v, mask, d, lanes); return Load(d, lanes); } template , HWY_IF_T_SIZE(T, 1)> HWY_API V CompressBits(V v, const uint8_t* HWY_RESTRICT bits) { const DFromV d; HWY_ALIGN T lanes[MaxLanes(d)]; (void)CompressBitsStore(v, bits, d, lanes); return Load(d, lanes); } template , HWY_IF_T_SIZE(T, 1)> HWY_API V CompressNot(V v, M mask) { return Compress(v, Not(mask)); } #endif // HWY_NATIVE_COMPRESS8 // ------------------------------ Expand // "Include guard": skip if native 8/16-bit Expand/LoadExpand are available. // Note that this generic implementation assumes <= 128 bit fixed vectors; // the SVE and RVV targets provide their own native implementations. #if (defined(HWY_NATIVE_EXPAND) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE #ifdef HWY_NATIVE_EXPAND #undef HWY_NATIVE_EXPAND #else #define HWY_NATIVE_EXPAND #endif namespace detail { #if HWY_IDE template HWY_INLINE uint64_t BitsFromMask(M /* mask */) { return 0; } #endif // HWY_IDE template HWY_INLINE Vec128 IndicesForExpandFromBits(uint64_t mask_bits) { static_assert(N <= 8, "Should only be called for half-vectors"); const Simd du8; HWY_DASSERT(mask_bits < 0x100); alignas(16) static constexpr uint8_t table[2048] = { // PrintExpand8x8Tables 128, 128, 128, 128, 128, 128, 128, 128, // 0, 128, 128, 128, 128, 128, 128, 128, // 128, 0, 128, 128, 128, 128, 128, 128, // 0, 1, 128, 128, 128, 128, 128, 128, // 128, 128, 0, 128, 128, 128, 128, 128, // 0, 128, 1, 128, 128, 128, 128, 128, // 128, 0, 1, 128, 128, 128, 128, 128, // 0, 1, 2, 128, 128, 128, 128, 128, // 128, 128, 128, 0, 128, 128, 128, 128, // 0, 128, 128, 1, 128, 128, 128, 128, // 128, 0, 128, 1, 128, 128, 128, 128, // 0, 1, 128, 2, 128, 128, 128, 128, // 128, 128, 0, 1, 128, 128, 128, 128, // 0, 128, 1, 2, 128, 128, 128, 128, // 128, 0, 1, 2, 128, 128, 128, 128, // 0, 1, 2, 3, 128, 128, 128, 128, // 128, 128, 128, 128, 0, 128, 128, 128, // 0, 128, 128, 128, 1, 128, 128, 128, // 128, 0, 128, 128, 1, 128, 128, 128, // 0, 1, 128, 128, 2, 128, 128, 128, // 128, 128, 0, 128, 1, 128, 128, 128, // 0, 128, 1, 128, 2, 128, 128, 128, // 128, 0, 1, 128, 2, 128, 128, 128, // 0, 1, 2, 128, 3, 128, 128, 128, // 128, 128, 128, 0, 1, 128, 128, 128, // 0, 128, 128, 1, 2, 128, 128, 128, // 128, 0, 128, 1, 2, 128, 128, 128, // 0, 1, 128, 2, 3, 128, 128, 128, // 128, 128, 0, 1, 2, 128, 128, 128, // 0, 128, 1, 2, 3, 128, 128, 128, // 128, 0, 1, 2, 3, 128, 128, 128, // 0, 1, 2, 3, 4, 128, 128, 128, // 128, 128, 128, 128, 128, 0, 128, 128, // 0, 128, 128, 128, 128, 1, 128, 128, // 128, 0, 128, 128, 128, 1, 128, 128, // 0, 1, 128, 128, 128, 2, 128, 128, // 128, 128, 0, 128, 128, 1, 128, 128, // 0, 128, 1, 128, 128, 2, 128, 128, // 128, 0, 1, 128, 128, 2, 128, 128, // 0, 1, 2, 128, 128, 3, 128, 128, // 128, 128, 128, 0, 128, 1, 128, 128, // 0, 128, 128, 1, 128, 2, 128, 128, // 128, 0, 128, 1, 128, 2, 128, 128, // 0, 1, 128, 2, 128, 3, 128, 128, // 128, 128, 0, 1, 128, 2, 128, 128, // 0, 128, 1, 2, 128, 3, 128, 128, // 128, 0, 1, 2, 128, 3, 128, 128, // 0, 1, 2, 3, 128, 4, 128, 128, // 128, 128, 128, 128, 0, 1, 128, 128, // 0, 128, 128, 128, 1, 2, 128, 128, // 128, 0, 128, 128, 1, 2, 128, 128, // 0, 1, 128, 128, 2, 3, 128, 128, // 128, 128, 0, 128, 1, 2, 128, 128, // 0, 128, 1, 128, 2, 3, 128, 128, // 128, 0, 1, 128, 2, 3, 128, 128, // 0, 1, 2, 128, 3, 4, 128, 128, // 128, 128, 128, 0, 1, 2, 128, 128, // 0, 128, 128, 1, 2, 3, 128, 128, // 128, 0, 128, 1, 2, 3, 128, 128, // 0, 1, 128, 
2, 3, 4, 128, 128, // 128, 128, 0, 1, 2, 3, 128, 128, // 0, 128, 1, 2, 3, 4, 128, 128, // 128, 0, 1, 2, 3, 4, 128, 128, // 0, 1, 2, 3, 4, 5, 128, 128, // 128, 128, 128, 128, 128, 128, 0, 128, // 0, 128, 128, 128, 128, 128, 1, 128, // 128, 0, 128, 128, 128, 128, 1, 128, // 0, 1, 128, 128, 128, 128, 2, 128, // 128, 128, 0, 128, 128, 128, 1, 128, // 0, 128, 1, 128, 128, 128, 2, 128, // 128, 0, 1, 128, 128, 128, 2, 128, // 0, 1, 2, 128, 128, 128, 3, 128, // 128, 128, 128, 0, 128, 128, 1, 128, // 0, 128, 128, 1, 128, 128, 2, 128, // 128, 0, 128, 1, 128, 128, 2, 128, // 0, 1, 128, 2, 128, 128, 3, 128, // 128, 128, 0, 1, 128, 128, 2, 128, // 0, 128, 1, 2, 128, 128, 3, 128, // 128, 0, 1, 2, 128, 128, 3, 128, // 0, 1, 2, 3, 128, 128, 4, 128, // 128, 128, 128, 128, 0, 128, 1, 128, // 0, 128, 128, 128, 1, 128, 2, 128, // 128, 0, 128, 128, 1, 128, 2, 128, // 0, 1, 128, 128, 2, 128, 3, 128, // 128, 128, 0, 128, 1, 128, 2, 128, // 0, 128, 1, 128, 2, 128, 3, 128, // 128, 0, 1, 128, 2, 128, 3, 128, // 0, 1, 2, 128, 3, 128, 4, 128, // 128, 128, 128, 0, 1, 128, 2, 128, // 0, 128, 128, 1, 2, 128, 3, 128, // 128, 0, 128, 1, 2, 128, 3, 128, // 0, 1, 128, 2, 3, 128, 4, 128, // 128, 128, 0, 1, 2, 128, 3, 128, // 0, 128, 1, 2, 3, 128, 4, 128, // 128, 0, 1, 2, 3, 128, 4, 128, // 0, 1, 2, 3, 4, 128, 5, 128, // 128, 128, 128, 128, 128, 0, 1, 128, // 0, 128, 128, 128, 128, 1, 2, 128, // 128, 0, 128, 128, 128, 1, 2, 128, // 0, 1, 128, 128, 128, 2, 3, 128, // 128, 128, 0, 128, 128, 1, 2, 128, // 0, 128, 1, 128, 128, 2, 3, 128, // 128, 0, 1, 128, 128, 2, 3, 128, // 0, 1, 2, 128, 128, 3, 4, 128, // 128, 128, 128, 0, 128, 1, 2, 128, // 0, 128, 128, 1, 128, 2, 3, 128, // 128, 0, 128, 1, 128, 2, 3, 128, // 0, 1, 128, 2, 128, 3, 4, 128, // 128, 128, 0, 1, 128, 2, 3, 128, // 0, 128, 1, 2, 128, 3, 4, 128, // 128, 0, 1, 2, 128, 3, 4, 128, // 0, 1, 2, 3, 128, 4, 5, 128, // 128, 128, 128, 128, 0, 1, 2, 128, // 0, 128, 128, 128, 1, 2, 3, 128, // 128, 0, 128, 128, 1, 2, 3, 128, // 0, 1, 128, 128, 2, 3, 4, 128, // 128, 128, 0, 128, 1, 2, 3, 128, // 0, 128, 1, 128, 2, 3, 4, 128, // 128, 0, 1, 128, 2, 3, 4, 128, // 0, 1, 2, 128, 3, 4, 5, 128, // 128, 128, 128, 0, 1, 2, 3, 128, // 0, 128, 128, 1, 2, 3, 4, 128, // 128, 0, 128, 1, 2, 3, 4, 128, // 0, 1, 128, 2, 3, 4, 5, 128, // 128, 128, 0, 1, 2, 3, 4, 128, // 0, 128, 1, 2, 3, 4, 5, 128, // 128, 0, 1, 2, 3, 4, 5, 128, // 0, 1, 2, 3, 4, 5, 6, 128, // 128, 128, 128, 128, 128, 128, 128, 0, // 0, 128, 128, 128, 128, 128, 128, 1, // 128, 0, 128, 128, 128, 128, 128, 1, // 0, 1, 128, 128, 128, 128, 128, 2, // 128, 128, 0, 128, 128, 128, 128, 1, // 0, 128, 1, 128, 128, 128, 128, 2, // 128, 0, 1, 128, 128, 128, 128, 2, // 0, 1, 2, 128, 128, 128, 128, 3, // 128, 128, 128, 0, 128, 128, 128, 1, // 0, 128, 128, 1, 128, 128, 128, 2, // 128, 0, 128, 1, 128, 128, 128, 2, // 0, 1, 128, 2, 128, 128, 128, 3, // 128, 128, 0, 1, 128, 128, 128, 2, // 0, 128, 1, 2, 128, 128, 128, 3, // 128, 0, 1, 2, 128, 128, 128, 3, // 0, 1, 2, 3, 128, 128, 128, 4, // 128, 128, 128, 128, 0, 128, 128, 1, // 0, 128, 128, 128, 1, 128, 128, 2, // 128, 0, 128, 128, 1, 128, 128, 2, // 0, 1, 128, 128, 2, 128, 128, 3, // 128, 128, 0, 128, 1, 128, 128, 2, // 0, 128, 1, 128, 2, 128, 128, 3, // 128, 0, 1, 128, 2, 128, 128, 3, // 0, 1, 2, 128, 3, 128, 128, 4, // 128, 128, 128, 0, 1, 128, 128, 2, // 0, 128, 128, 1, 2, 128, 128, 3, // 128, 0, 128, 1, 2, 128, 128, 3, // 0, 1, 128, 2, 3, 128, 128, 4, // 128, 128, 0, 1, 2, 128, 128, 3, // 0, 128, 1, 2, 3, 128, 128, 4, // 128, 0, 1, 2, 3, 128, 128, 4, // 0, 1, 2, 3, 4, 128, 128, 5, // 128, 
128, 128, 128, 128, 0, 128, 1, // 0, 128, 128, 128, 128, 1, 128, 2, // 128, 0, 128, 128, 128, 1, 128, 2, // 0, 1, 128, 128, 128, 2, 128, 3, // 128, 128, 0, 128, 128, 1, 128, 2, // 0, 128, 1, 128, 128, 2, 128, 3, // 128, 0, 1, 128, 128, 2, 128, 3, // 0, 1, 2, 128, 128, 3, 128, 4, // 128, 128, 128, 0, 128, 1, 128, 2, // 0, 128, 128, 1, 128, 2, 128, 3, // 128, 0, 128, 1, 128, 2, 128, 3, // 0, 1, 128, 2, 128, 3, 128, 4, // 128, 128, 0, 1, 128, 2, 128, 3, // 0, 128, 1, 2, 128, 3, 128, 4, // 128, 0, 1, 2, 128, 3, 128, 4, // 0, 1, 2, 3, 128, 4, 128, 5, // 128, 128, 128, 128, 0, 1, 128, 2, // 0, 128, 128, 128, 1, 2, 128, 3, // 128, 0, 128, 128, 1, 2, 128, 3, // 0, 1, 128, 128, 2, 3, 128, 4, // 128, 128, 0, 128, 1, 2, 128, 3, // 0, 128, 1, 128, 2, 3, 128, 4, // 128, 0, 1, 128, 2, 3, 128, 4, // 0, 1, 2, 128, 3, 4, 128, 5, // 128, 128, 128, 0, 1, 2, 128, 3, // 0, 128, 128, 1, 2, 3, 128, 4, // 128, 0, 128, 1, 2, 3, 128, 4, // 0, 1, 128, 2, 3, 4, 128, 5, // 128, 128, 0, 1, 2, 3, 128, 4, // 0, 128, 1, 2, 3, 4, 128, 5, // 128, 0, 1, 2, 3, 4, 128, 5, // 0, 1, 2, 3, 4, 5, 128, 6, // 128, 128, 128, 128, 128, 128, 0, 1, // 0, 128, 128, 128, 128, 128, 1, 2, // 128, 0, 128, 128, 128, 128, 1, 2, // 0, 1, 128, 128, 128, 128, 2, 3, // 128, 128, 0, 128, 128, 128, 1, 2, // 0, 128, 1, 128, 128, 128, 2, 3, // 128, 0, 1, 128, 128, 128, 2, 3, // 0, 1, 2, 128, 128, 128, 3, 4, // 128, 128, 128, 0, 128, 128, 1, 2, // 0, 128, 128, 1, 128, 128, 2, 3, // 128, 0, 128, 1, 128, 128, 2, 3, // 0, 1, 128, 2, 128, 128, 3, 4, // 128, 128, 0, 1, 128, 128, 2, 3, // 0, 128, 1, 2, 128, 128, 3, 4, // 128, 0, 1, 2, 128, 128, 3, 4, // 0, 1, 2, 3, 128, 128, 4, 5, // 128, 128, 128, 128, 0, 128, 1, 2, // 0, 128, 128, 128, 1, 128, 2, 3, // 128, 0, 128, 128, 1, 128, 2, 3, // 0, 1, 128, 128, 2, 128, 3, 4, // 128, 128, 0, 128, 1, 128, 2, 3, // 0, 128, 1, 128, 2, 128, 3, 4, // 128, 0, 1, 128, 2, 128, 3, 4, // 0, 1, 2, 128, 3, 128, 4, 5, // 128, 128, 128, 0, 1, 128, 2, 3, // 0, 128, 128, 1, 2, 128, 3, 4, // 128, 0, 128, 1, 2, 128, 3, 4, // 0, 1, 128, 2, 3, 128, 4, 5, // 128, 128, 0, 1, 2, 128, 3, 4, // 0, 128, 1, 2, 3, 128, 4, 5, // 128, 0, 1, 2, 3, 128, 4, 5, // 0, 1, 2, 3, 4, 128, 5, 6, // 128, 128, 128, 128, 128, 0, 1, 2, // 0, 128, 128, 128, 128, 1, 2, 3, // 128, 0, 128, 128, 128, 1, 2, 3, // 0, 1, 128, 128, 128, 2, 3, 4, // 128, 128, 0, 128, 128, 1, 2, 3, // 0, 128, 1, 128, 128, 2, 3, 4, // 128, 0, 1, 128, 128, 2, 3, 4, // 0, 1, 2, 128, 128, 3, 4, 5, // 128, 128, 128, 0, 128, 1, 2, 3, // 0, 128, 128, 1, 128, 2, 3, 4, // 128, 0, 128, 1, 128, 2, 3, 4, // 0, 1, 128, 2, 128, 3, 4, 5, // 128, 128, 0, 1, 128, 2, 3, 4, // 0, 128, 1, 2, 128, 3, 4, 5, // 128, 0, 1, 2, 128, 3, 4, 5, // 0, 1, 2, 3, 128, 4, 5, 6, // 128, 128, 128, 128, 0, 1, 2, 3, // 0, 128, 128, 128, 1, 2, 3, 4, // 128, 0, 128, 128, 1, 2, 3, 4, // 0, 1, 128, 128, 2, 3, 4, 5, // 128, 128, 0, 128, 1, 2, 3, 4, // 0, 128, 1, 128, 2, 3, 4, 5, // 128, 0, 1, 128, 2, 3, 4, 5, // 0, 1, 2, 128, 3, 4, 5, 6, // 128, 128, 128, 0, 1, 2, 3, 4, // 0, 128, 128, 1, 2, 3, 4, 5, // 128, 0, 128, 1, 2, 3, 4, 5, // 0, 1, 128, 2, 3, 4, 5, 6, // 128, 128, 0, 1, 2, 3, 4, 5, // 0, 128, 1, 2, 3, 4, 5, 6, // 128, 0, 1, 2, 3, 4, 5, 6, // 0, 1, 2, 3, 4, 5, 6, 7}; return LoadU(du8, table + mask_bits * 8); } } // namespace detail // Half vector of bytes: one table lookup template HWY_API Vec128 Expand(Vec128 v, Mask128 mask) { const DFromV d; const uint64_t mask_bits = detail::BitsFromMask(mask); const Vec128 indices = detail::IndicesForExpandFromBits(mask_bits); return BitCast(d, TableLookupBytesOr0(v, indices)); } // 
Full vector of bytes: two table lookups template HWY_API Vec128 Expand(Vec128 v, Mask128 mask) { const Full128 d; const RebindToUnsigned du; const Half duh; const Vec128 vu = BitCast(du, v); const uint64_t mask_bits = detail::BitsFromMask(mask); const uint64_t maskL = mask_bits & 0xFF; const uint64_t maskH = mask_bits >> 8; // We want to skip past the v bytes already consumed by idxL. There is no // instruction for shift-reg by variable bytes. Storing v itself would work // but would involve a store-load forwarding stall. We instead shuffle using // loaded indices. multishift_epi64_epi8 would also help, but if we have that, // we probably also have native 8-bit Expand. alignas(16) static constexpr uint8_t iota[32] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}; const VFromD shift = LoadU(du, iota + PopCount(maskL)); const VFromD vL = LowerHalf(duh, vu); const VFromD vH = LowerHalf(duh, TableLookupBytesOr0(vu, shift)); const VFromD idxL = detail::IndicesForExpandFromBits<8>(maskL); const VFromD idxH = detail::IndicesForExpandFromBits<8>(maskH); const VFromD expandL = TableLookupBytesOr0(vL, idxL); const VFromD expandH = TableLookupBytesOr0(vH, idxH); return BitCast(d, Combine(du, expandH, expandL)); } template HWY_API Vec128 Expand(Vec128 v, Mask128 mask) { const DFromV d; const RebindToUnsigned du; const Rebind du8; const uint64_t mask_bits = detail::BitsFromMask(mask); // Storing as 8-bit reduces table size from 4 KiB to 2 KiB. We cannot apply // the nibble trick used below because not all indices fit within one lane. alignas(16) static constexpr uint8_t table[2048] = { // PrintExpand16x8ByteTables 128, 128, 128, 128, 128, 128, 128, 128, // 0, 128, 128, 128, 128, 128, 128, 128, // 128, 0, 128, 128, 128, 128, 128, 128, // 0, 2, 128, 128, 128, 128, 128, 128, // 128, 128, 0, 128, 128, 128, 128, 128, // 0, 128, 2, 128, 128, 128, 128, 128, // 128, 0, 2, 128, 128, 128, 128, 128, // 0, 2, 4, 128, 128, 128, 128, 128, // 128, 128, 128, 0, 128, 128, 128, 128, // 0, 128, 128, 2, 128, 128, 128, 128, // 128, 0, 128, 2, 128, 128, 128, 128, // 0, 2, 128, 4, 128, 128, 128, 128, // 128, 128, 0, 2, 128, 128, 128, 128, // 0, 128, 2, 4, 128, 128, 128, 128, // 128, 0, 2, 4, 128, 128, 128, 128, // 0, 2, 4, 6, 128, 128, 128, 128, // 128, 128, 128, 128, 0, 128, 128, 128, // 0, 128, 128, 128, 2, 128, 128, 128, // 128, 0, 128, 128, 2, 128, 128, 128, // 0, 2, 128, 128, 4, 128, 128, 128, // 128, 128, 0, 128, 2, 128, 128, 128, // 0, 128, 2, 128, 4, 128, 128, 128, // 128, 0, 2, 128, 4, 128, 128, 128, // 0, 2, 4, 128, 6, 128, 128, 128, // 128, 128, 128, 0, 2, 128, 128, 128, // 0, 128, 128, 2, 4, 128, 128, 128, // 128, 0, 128, 2, 4, 128, 128, 128, // 0, 2, 128, 4, 6, 128, 128, 128, // 128, 128, 0, 2, 4, 128, 128, 128, // 0, 128, 2, 4, 6, 128, 128, 128, // 128, 0, 2, 4, 6, 128, 128, 128, // 0, 2, 4, 6, 8, 128, 128, 128, // 128, 128, 128, 128, 128, 0, 128, 128, // 0, 128, 128, 128, 128, 2, 128, 128, // 128, 0, 128, 128, 128, 2, 128, 128, // 0, 2, 128, 128, 128, 4, 128, 128, // 128, 128, 0, 128, 128, 2, 128, 128, // 0, 128, 2, 128, 128, 4, 128, 128, // 128, 0, 2, 128, 128, 4, 128, 128, // 0, 2, 4, 128, 128, 6, 128, 128, // 128, 128, 128, 0, 128, 2, 128, 128, // 0, 128, 128, 2, 128, 4, 128, 128, // 128, 0, 128, 2, 128, 4, 128, 128, // 0, 2, 128, 4, 128, 6, 128, 128, // 128, 128, 0, 2, 128, 4, 128, 128, // 0, 128, 2, 4, 128, 6, 128, 128, // 128, 0, 2, 4, 128, 6, 128, 128, // 0, 2, 4, 6, 128, 8, 128, 128, // 128, 128, 128, 128, 0, 2, 
128, 128, // 0, 128, 128, 128, 2, 4, 128, 128, // 128, 0, 128, 128, 2, 4, 128, 128, // 0, 2, 128, 128, 4, 6, 128, 128, // 128, 128, 0, 128, 2, 4, 128, 128, // 0, 128, 2, 128, 4, 6, 128, 128, // 128, 0, 2, 128, 4, 6, 128, 128, // 0, 2, 4, 128, 6, 8, 128, 128, // 128, 128, 128, 0, 2, 4, 128, 128, // 0, 128, 128, 2, 4, 6, 128, 128, // 128, 0, 128, 2, 4, 6, 128, 128, // 0, 2, 128, 4, 6, 8, 128, 128, // 128, 128, 0, 2, 4, 6, 128, 128, // 0, 128, 2, 4, 6, 8, 128, 128, // 128, 0, 2, 4, 6, 8, 128, 128, // 0, 2, 4, 6, 8, 10, 128, 128, // 128, 128, 128, 128, 128, 128, 0, 128, // 0, 128, 128, 128, 128, 128, 2, 128, // 128, 0, 128, 128, 128, 128, 2, 128, // 0, 2, 128, 128, 128, 128, 4, 128, // 128, 128, 0, 128, 128, 128, 2, 128, // 0, 128, 2, 128, 128, 128, 4, 128, // 128, 0, 2, 128, 128, 128, 4, 128, // 0, 2, 4, 128, 128, 128, 6, 128, // 128, 128, 128, 0, 128, 128, 2, 128, // 0, 128, 128, 2, 128, 128, 4, 128, // 128, 0, 128, 2, 128, 128, 4, 128, // 0, 2, 128, 4, 128, 128, 6, 128, // 128, 128, 0, 2, 128, 128, 4, 128, // 0, 128, 2, 4, 128, 128, 6, 128, // 128, 0, 2, 4, 128, 128, 6, 128, // 0, 2, 4, 6, 128, 128, 8, 128, // 128, 128, 128, 128, 0, 128, 2, 128, // 0, 128, 128, 128, 2, 128, 4, 128, // 128, 0, 128, 128, 2, 128, 4, 128, // 0, 2, 128, 128, 4, 128, 6, 128, // 128, 128, 0, 128, 2, 128, 4, 128, // 0, 128, 2, 128, 4, 128, 6, 128, // 128, 0, 2, 128, 4, 128, 6, 128, // 0, 2, 4, 128, 6, 128, 8, 128, // 128, 128, 128, 0, 2, 128, 4, 128, // 0, 128, 128, 2, 4, 128, 6, 128, // 128, 0, 128, 2, 4, 128, 6, 128, // 0, 2, 128, 4, 6, 128, 8, 128, // 128, 128, 0, 2, 4, 128, 6, 128, // 0, 128, 2, 4, 6, 128, 8, 128, // 128, 0, 2, 4, 6, 128, 8, 128, // 0, 2, 4, 6, 8, 128, 10, 128, // 128, 128, 128, 128, 128, 0, 2, 128, // 0, 128, 128, 128, 128, 2, 4, 128, // 128, 0, 128, 128, 128, 2, 4, 128, // 0, 2, 128, 128, 128, 4, 6, 128, // 128, 128, 0, 128, 128, 2, 4, 128, // 0, 128, 2, 128, 128, 4, 6, 128, // 128, 0, 2, 128, 128, 4, 6, 128, // 0, 2, 4, 128, 128, 6, 8, 128, // 128, 128, 128, 0, 128, 2, 4, 128, // 0, 128, 128, 2, 128, 4, 6, 128, // 128, 0, 128, 2, 128, 4, 6, 128, // 0, 2, 128, 4, 128, 6, 8, 128, // 128, 128, 0, 2, 128, 4, 6, 128, // 0, 128, 2, 4, 128, 6, 8, 128, // 128, 0, 2, 4, 128, 6, 8, 128, // 0, 2, 4, 6, 128, 8, 10, 128, // 128, 128, 128, 128, 0, 2, 4, 128, // 0, 128, 128, 128, 2, 4, 6, 128, // 128, 0, 128, 128, 2, 4, 6, 128, // 0, 2, 128, 128, 4, 6, 8, 128, // 128, 128, 0, 128, 2, 4, 6, 128, // 0, 128, 2, 128, 4, 6, 8, 128, // 128, 0, 2, 128, 4, 6, 8, 128, // 0, 2, 4, 128, 6, 8, 10, 128, // 128, 128, 128, 0, 2, 4, 6, 128, // 0, 128, 128, 2, 4, 6, 8, 128, // 128, 0, 128, 2, 4, 6, 8, 128, // 0, 2, 128, 4, 6, 8, 10, 128, // 128, 128, 0, 2, 4, 6, 8, 128, // 0, 128, 2, 4, 6, 8, 10, 128, // 128, 0, 2, 4, 6, 8, 10, 128, // 0, 2, 4, 6, 8, 10, 12, 128, // 128, 128, 128, 128, 128, 128, 128, 0, // 0, 128, 128, 128, 128, 128, 128, 2, // 128, 0, 128, 128, 128, 128, 128, 2, // 0, 2, 128, 128, 128, 128, 128, 4, // 128, 128, 0, 128, 128, 128, 128, 2, // 0, 128, 2, 128, 128, 128, 128, 4, // 128, 0, 2, 128, 128, 128, 128, 4, // 0, 2, 4, 128, 128, 128, 128, 6, // 128, 128, 128, 0, 128, 128, 128, 2, // 0, 128, 128, 2, 128, 128, 128, 4, // 128, 0, 128, 2, 128, 128, 128, 4, // 0, 2, 128, 4, 128, 128, 128, 6, // 128, 128, 0, 2, 128, 128, 128, 4, // 0, 128, 2, 4, 128, 128, 128, 6, // 128, 0, 2, 4, 128, 128, 128, 6, // 0, 2, 4, 6, 128, 128, 128, 8, // 128, 128, 128, 128, 0, 128, 128, 2, // 0, 128, 128, 128, 2, 128, 128, 4, // 128, 0, 128, 128, 2, 128, 128, 4, // 0, 2, 128, 128, 4, 128, 128, 6, // 128, 128, 0, 128, 2, 128, 128, 
4, // 0, 128, 2, 128, 4, 128, 128, 6, // 128, 0, 2, 128, 4, 128, 128, 6, // 0, 2, 4, 128, 6, 128, 128, 8, // 128, 128, 128, 0, 2, 128, 128, 4, // 0, 128, 128, 2, 4, 128, 128, 6, // 128, 0, 128, 2, 4, 128, 128, 6, // 0, 2, 128, 4, 6, 128, 128, 8, // 128, 128, 0, 2, 4, 128, 128, 6, // 0, 128, 2, 4, 6, 128, 128, 8, // 128, 0, 2, 4, 6, 128, 128, 8, // 0, 2, 4, 6, 8, 128, 128, 10, // 128, 128, 128, 128, 128, 0, 128, 2, // 0, 128, 128, 128, 128, 2, 128, 4, // 128, 0, 128, 128, 128, 2, 128, 4, // 0, 2, 128, 128, 128, 4, 128, 6, // 128, 128, 0, 128, 128, 2, 128, 4, // 0, 128, 2, 128, 128, 4, 128, 6, // 128, 0, 2, 128, 128, 4, 128, 6, // 0, 2, 4, 128, 128, 6, 128, 8, // 128, 128, 128, 0, 128, 2, 128, 4, // 0, 128, 128, 2, 128, 4, 128, 6, // 128, 0, 128, 2, 128, 4, 128, 6, // 0, 2, 128, 4, 128, 6, 128, 8, // 128, 128, 0, 2, 128, 4, 128, 6, // 0, 128, 2, 4, 128, 6, 128, 8, // 128, 0, 2, 4, 128, 6, 128, 8, // 0, 2, 4, 6, 128, 8, 128, 10, // 128, 128, 128, 128, 0, 2, 128, 4, // 0, 128, 128, 128, 2, 4, 128, 6, // 128, 0, 128, 128, 2, 4, 128, 6, // 0, 2, 128, 128, 4, 6, 128, 8, // 128, 128, 0, 128, 2, 4, 128, 6, // 0, 128, 2, 128, 4, 6, 128, 8, // 128, 0, 2, 128, 4, 6, 128, 8, // 0, 2, 4, 128, 6, 8, 128, 10, // 128, 128, 128, 0, 2, 4, 128, 6, // 0, 128, 128, 2, 4, 6, 128, 8, // 128, 0, 128, 2, 4, 6, 128, 8, // 0, 2, 128, 4, 6, 8, 128, 10, // 128, 128, 0, 2, 4, 6, 128, 8, // 0, 128, 2, 4, 6, 8, 128, 10, // 128, 0, 2, 4, 6, 8, 128, 10, // 0, 2, 4, 6, 8, 10, 128, 12, // 128, 128, 128, 128, 128, 128, 0, 2, // 0, 128, 128, 128, 128, 128, 2, 4, // 128, 0, 128, 128, 128, 128, 2, 4, // 0, 2, 128, 128, 128, 128, 4, 6, // 128, 128, 0, 128, 128, 128, 2, 4, // 0, 128, 2, 128, 128, 128, 4, 6, // 128, 0, 2, 128, 128, 128, 4, 6, // 0, 2, 4, 128, 128, 128, 6, 8, // 128, 128, 128, 0, 128, 128, 2, 4, // 0, 128, 128, 2, 128, 128, 4, 6, // 128, 0, 128, 2, 128, 128, 4, 6, // 0, 2, 128, 4, 128, 128, 6, 8, // 128, 128, 0, 2, 128, 128, 4, 6, // 0, 128, 2, 4, 128, 128, 6, 8, // 128, 0, 2, 4, 128, 128, 6, 8, // 0, 2, 4, 6, 128, 128, 8, 10, // 128, 128, 128, 128, 0, 128, 2, 4, // 0, 128, 128, 128, 2, 128, 4, 6, // 128, 0, 128, 128, 2, 128, 4, 6, // 0, 2, 128, 128, 4, 128, 6, 8, // 128, 128, 0, 128, 2, 128, 4, 6, // 0, 128, 2, 128, 4, 128, 6, 8, // 128, 0, 2, 128, 4, 128, 6, 8, // 0, 2, 4, 128, 6, 128, 8, 10, // 128, 128, 128, 0, 2, 128, 4, 6, // 0, 128, 128, 2, 4, 128, 6, 8, // 128, 0, 128, 2, 4, 128, 6, 8, // 0, 2, 128, 4, 6, 128, 8, 10, // 128, 128, 0, 2, 4, 128, 6, 8, // 0, 128, 2, 4, 6, 128, 8, 10, // 128, 0, 2, 4, 6, 128, 8, 10, // 0, 2, 4, 6, 8, 128, 10, 12, // 128, 128, 128, 128, 128, 0, 2, 4, // 0, 128, 128, 128, 128, 2, 4, 6, // 128, 0, 128, 128, 128, 2, 4, 6, // 0, 2, 128, 128, 128, 4, 6, 8, // 128, 128, 0, 128, 128, 2, 4, 6, // 0, 128, 2, 128, 128, 4, 6, 8, // 128, 0, 2, 128, 128, 4, 6, 8, // 0, 2, 4, 128, 128, 6, 8, 10, // 128, 128, 128, 0, 128, 2, 4, 6, // 0, 128, 128, 2, 128, 4, 6, 8, // 128, 0, 128, 2, 128, 4, 6, 8, // 0, 2, 128, 4, 128, 6, 8, 10, // 128, 128, 0, 2, 128, 4, 6, 8, // 0, 128, 2, 4, 128, 6, 8, 10, // 128, 0, 2, 4, 128, 6, 8, 10, // 0, 2, 4, 6, 128, 8, 10, 12, // 128, 128, 128, 128, 0, 2, 4, 6, // 0, 128, 128, 128, 2, 4, 6, 8, // 128, 0, 128, 128, 2, 4, 6, 8, // 0, 2, 128, 128, 4, 6, 8, 10, // 128, 128, 0, 128, 2, 4, 6, 8, // 0, 128, 2, 128, 4, 6, 8, 10, // 128, 0, 2, 128, 4, 6, 8, 10, // 0, 2, 4, 128, 6, 8, 10, 12, // 128, 128, 128, 0, 2, 4, 6, 8, // 0, 128, 128, 2, 4, 6, 8, 10, // 128, 0, 128, 2, 4, 6, 8, 10, // 0, 2, 128, 4, 6, 8, 10, 12, // 128, 128, 0, 2, 4, 6, 8, 10, // 0, 128, 2, 4, 6, 8, 10, 12, 
// 128, 0, 2, 4, 6, 8, 10, 12, // 0, 2, 4, 6, 8, 10, 12, 14}; // Extend to double length because InterleaveLower will only use the (valid) // lower half, and we want N u16. const Twice du8x2; const Vec128 indices8 = ZeroExtendVector(du8x2, Load(du8, table + mask_bits * 8)); const Vec128 indices16 = BitCast(du, InterleaveLower(du8x2, indices8, indices8)); // TableLookupBytesOr0 operates on bytes. To convert u16 lane indices to byte // indices, add 0 to even and 1 to odd byte lanes. const Vec128 byte_indices = Add(indices16, Set(du, 0x0100)); return BitCast(d, TableLookupBytesOr0(v, byte_indices)); } template HWY_API Vec128 Expand(Vec128 v, Mask128 mask) { const DFromV d; const RebindToUnsigned du; const uint64_t mask_bits = detail::BitsFromMask(mask); alignas(16) static constexpr uint32_t packed_array[16] = { // PrintExpand64x4Nibble - same for 32x4. 0x0000ffff, 0x0000fff0, 0x0000ff0f, 0x0000ff10, 0x0000f0ff, 0x0000f1f0, 0x0000f10f, 0x0000f210, 0x00000fff, 0x00001ff0, 0x00001f0f, 0x00002f10, 0x000010ff, 0x000021f0, 0x0000210f, 0x00003210}; // For lane i, shift the i-th 4-bit index down to bits [0, 2). const Vec128 packed = Set(du, packed_array[mask_bits]); alignas(16) static constexpr uint32_t shifts[4] = {0, 4, 8, 12}; Vec128 indices = packed >> Load(du, shifts); // AVX2 _mm256_permutexvar_epi32 will ignore upper bits, but IndicesFromVec // checks bounds, so clear the upper bits. indices = And(indices, Set(du, N - 1)); const Vec128 expand = TableLookupLanes(BitCast(du, v), IndicesFromVec(du, indices)); // TableLookupLanes cannot also zero masked-off lanes, so do that now. return IfThenElseZero(mask, BitCast(d, expand)); } template HWY_API Vec128 Expand(Vec128 v, Mask128 mask) { // Same as Compress, just zero out the mask=false lanes. return IfThenElseZero(mask, Compress(v, mask)); } // For single-element vectors, this is at least as fast as native. template HWY_API Vec128 Expand(Vec128 v, Mask128 mask) { return IfThenElseZero(mask, v); } // ------------------------------ LoadExpand template HWY_API VFromD LoadExpand(MFromD mask, D d, const TFromD* HWY_RESTRICT unaligned) { return Expand(LoadU(d, unaligned), mask); } #endif // HWY_NATIVE_EXPAND // ------------------------------ TwoTablesLookupLanes template using IndicesFromD = decltype(IndicesFromVec(D(), Zero(RebindToUnsigned()))); // RVV/SVE have their own implementations of // TwoTablesLookupLanes(D d, VFromD a, VFromD b, IndicesFromD idx) #if HWY_TARGET != HWY_RVV && HWY_TARGET != HWY_SVE && \ HWY_TARGET != HWY_SVE2 && HWY_TARGET != HWY_SVE_256 && \ HWY_TARGET != HWY_SVE2_128 template HWY_API VFromD TwoTablesLookupLanes(D /*d*/, VFromD a, VFromD b, IndicesFromD idx) { return TwoTablesLookupLanes(a, b, idx); } #endif // ------------------------------ Reverse2, Reverse4, Reverse8 (8-bit) #if (defined(HWY_NATIVE_REVERSE2_8) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE #ifdef HWY_NATIVE_REVERSE2_8 #undef HWY_NATIVE_REVERSE2_8 #else #define HWY_NATIVE_REVERSE2_8 #endif #undef HWY_PREFER_ROTATE // Platforms on which RotateRight is likely faster than TableLookupBytes. // RVV and SVE anyway have their own implementation of this. #if HWY_TARGET == HWY_SSE2 || HWY_TARGET <= HWY_AVX3 || \ HWY_TARGET == HWY_WASM || HWY_TARGET == HWY_PPC8 #define HWY_PREFER_ROTATE 1 #else #define HWY_PREFER_ROTATE 0 #endif template HWY_API VFromD Reverse2(D d, VFromD v) { // Exclude AVX3 because its 16-bit RotateRight is actually 3 instructions. 
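  // Two equivalent strategies: rotate each u16 lane by 8 bits (swapping its
  // two bytes), or shuffle bytes via TableLookupBytes with a fixed index
  // vector; HWY_PREFER_ROTATE selects whichever is likely cheaper.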
#if HWY_PREFER_ROTATE && HWY_TARGET > HWY_AVX3 const Repartition du16; return BitCast(d, RotateRight<8>(BitCast(du16, v))); #else alignas(16) static constexpr TFromD kShuffle[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; return TableLookupBytes(v, LoadDup128(d, kShuffle)); #endif } template HWY_API VFromD Reverse4(D d, VFromD v) { #if HWY_PREFER_ROTATE const Repartition du16; return BitCast(d, Reverse2(du16, BitCast(du16, Reverse2(d, v)))); #else alignas(16) static constexpr uint8_t kShuffle[16] = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12}; const Repartition du8; return TableLookupBytes(v, BitCast(d, LoadDup128(du8, kShuffle))); #endif } template HWY_API VFromD Reverse8(D d, VFromD v) { #if HWY_PREFER_ROTATE const Repartition du32; return BitCast(d, Reverse2(du32, BitCast(du32, Reverse4(d, v)))); #else alignas(16) static constexpr uint8_t kShuffle[16] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8}; const Repartition du8; return TableLookupBytes(v, BitCast(d, LoadDup128(du8, kShuffle))); #endif } #endif // HWY_NATIVE_REVERSE2_8 // ------------------------------ ReverseLaneBytes #if (defined(HWY_NATIVE_REVERSE_LANE_BYTES) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_REVERSE_LANE_BYTES #undef HWY_NATIVE_REVERSE_LANE_BYTES #else #define HWY_NATIVE_REVERSE_LANE_BYTES #endif template HWY_API V ReverseLaneBytes(V v) { const DFromV d; const Repartition du8; return BitCast(d, Reverse2(du8, BitCast(du8, v))); } template HWY_API V ReverseLaneBytes(V v) { const DFromV d; const Repartition du8; return BitCast(d, Reverse4(du8, BitCast(du8, v))); } template HWY_API V ReverseLaneBytes(V v) { const DFromV d; const Repartition du8; return BitCast(d, Reverse8(du8, BitCast(du8, v))); } #endif // HWY_NATIVE_REVERSE_LANE_BYTES // ------------------------------ ReverseBits // On these targets, we emulate 8-bit shifts using 16-bit shifts and therefore // require at least two lanes to BitCast to 16-bit. We avoid Highway's 8-bit // shifts because those would add extra masking already taken care of by // UI8ReverseBitsStep. Note that AVX3_DL/AVX3_ZEN4 support GFNI and use it to // implement ReverseBits, so this code is not used there. 
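// The generic path below reverses the bits within each byte in three passes:
// swap adjacent bits (shift 1, mask 0x55), then 2-bit pairs (shift 2, mask
// 0x33), then nibbles (shift 4, mask 0x0F). E.g. 0b10110001 -> 0b01110010 ->
// 0b11011000 -> 0b10001101, the bit-reversal of the input byte.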
#undef HWY_REVERSE_BITS_MIN_BYTES #if ((HWY_TARGET >= HWY_AVX3 && HWY_TARGET <= HWY_SSE2) || \ HWY_TARGET == HWY_WASM || HWY_TARGET == HWY_WASM_EMU256) #define HWY_REVERSE_BITS_MIN_BYTES 2 #else #define HWY_REVERSE_BITS_MIN_BYTES 1 #endif #if (defined(HWY_NATIVE_REVERSE_BITS_UI8) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_REVERSE_BITS_UI8 #undef HWY_NATIVE_REVERSE_BITS_UI8 #else #define HWY_NATIVE_REVERSE_BITS_UI8 #endif namespace detail { template , HWY_REVERSE_BITS_MIN_BYTES - 1)> HWY_INLINE V UI8ReverseBitsStep(V v) { const DFromV d; const RebindToUnsigned du; #if HWY_REVERSE_BITS_MIN_BYTES == 2 const Repartition d_shift; #else const RebindToUnsigned d_shift; #endif const auto v_to_shift = BitCast(d_shift, v); const auto shl_result = BitCast(d, ShiftLeft(v_to_shift)); const auto shr_result = BitCast(d, ShiftRight(v_to_shift)); const auto shr_result_mask = BitCast(d, Set(du, static_cast(kShrResultMask))); return Or(And(shr_result, shr_result_mask), AndNot(shr_result_mask, shl_result)); } #if HWY_REVERSE_BITS_MIN_BYTES == 2 template , 1)> HWY_INLINE V UI8ReverseBitsStep(V v) { return V{UI8ReverseBitsStep(Vec128{v.raw}) .raw}; } #endif } // namespace detail template HWY_API V ReverseBits(V v) { auto result = detail::UI8ReverseBitsStep<1, 0x55>(v); result = detail::UI8ReverseBitsStep<2, 0x33>(result); result = detail::UI8ReverseBitsStep<4, 0x0F>(result); return result; } #endif // HWY_NATIVE_REVERSE_BITS_UI8 #if (defined(HWY_NATIVE_REVERSE_BITS_UI16_32_64) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_REVERSE_BITS_UI16_32_64 #undef HWY_NATIVE_REVERSE_BITS_UI16_32_64 #else #define HWY_NATIVE_REVERSE_BITS_UI16_32_64 #endif template HWY_API V ReverseBits(V v) { const DFromV d; const Repartition du8; return ReverseLaneBytes(BitCast(d, ReverseBits(BitCast(du8, v)))); } #endif // HWY_NATIVE_REVERSE_BITS_UI16_32_64 // ================================================== Operator wrapper // SVE* and RVV currently cannot define operators and have already defined // (only) the corresponding functions such as Add. #if (defined(HWY_NATIVE_OPERATOR_REPLACEMENTS) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_OPERATOR_REPLACEMENTS #undef HWY_NATIVE_OPERATOR_REPLACEMENTS #else #define HWY_NATIVE_OPERATOR_REPLACEMENTS #endif template HWY_API V Add(V a, V b) { return a + b; } template HWY_API V Sub(V a, V b) { return a - b; } template HWY_API V Mul(V a, V b) { return a * b; } template HWY_API V Div(V a, V b) { return a / b; } template V Shl(V a, V b) { return a << b; } template V Shr(V a, V b) { return a >> b; } template HWY_API auto Eq(V a, V b) -> decltype(a == b) { return a == b; } template HWY_API auto Ne(V a, V b) -> decltype(a == b) { return a != b; } template HWY_API auto Lt(V a, V b) -> decltype(a == b) { return a < b; } template HWY_API auto Gt(V a, V b) -> decltype(a == b) { return a > b; } template HWY_API auto Ge(V a, V b) -> decltype(a == b) { return a >= b; } template HWY_API auto Le(V a, V b) -> decltype(a == b) { return a <= b; } #endif // HWY_NATIVE_OPERATOR_REPLACEMENTS // NOLINTNEXTLINE(google-readability-namespace-comments) } // namespace HWY_NAMESPACE } // namespace hwy HWY_AFTER_NAMESPACE();
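// Illustrative usage sketch (hypothetical caller inside a HWY_NAMESPACE
// function, not part of this header): compact the lanes whose value is a
// power of two, assuming `out` points to at least Lanes(d) writable bytes.
//   const ScalableTag<uint8_t> d;
//   const auto v = Iota(d, 1);
//   const auto is_pow2 = Eq(PopulationCount(v), Set(d, uint8_t{1}));
//   const size_t num = CompressStore(v, is_pow2, d, out);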