diff options
Diffstat (limited to 'third_party/highway/hwy/ops')
-rw-r--r-- | third_party/highway/hwy/ops/arm_neon-inl.h | 7725 | ||||
-rw-r--r-- | third_party/highway/hwy/ops/arm_sve-inl.h | 4596 | ||||
-rw-r--r-- | third_party/highway/hwy/ops/emu128-inl.h | 2704 | ||||
-rw-r--r-- | third_party/highway/hwy/ops/generic_ops-inl.h | 3190 | ||||
-rw-r--r-- | third_party/highway/hwy/ops/ppc_vsx-inl.h | 4920 | ||||
-rw-r--r-- | third_party/highway/hwy/ops/rvv-inl.h | 4229 | ||||
-rw-r--r-- | third_party/highway/hwy/ops/scalar-inl.h | 1845 | ||||
-rw-r--r-- | third_party/highway/hwy/ops/set_macros-inl.h | 566 | ||||
-rw-r--r-- | third_party/highway/hwy/ops/shared-inl.h | 488 | ||||
-rw-r--r-- | third_party/highway/hwy/ops/tuple-inl.h | 86 | ||||
-rw-r--r-- | third_party/highway/hwy/ops/wasm_128-inl.h | 5060 | ||||
-rw-r--r-- | third_party/highway/hwy/ops/wasm_256-inl.h | 2030 | ||||
-rw-r--r-- | third_party/highway/hwy/ops/x86_128-inl.h | 9038 | ||||
-rw-r--r-- | third_party/highway/hwy/ops/x86_256-inl.h | 6476 | ||||
-rw-r--r-- | third_party/highway/hwy/ops/x86_512-inl.h | 5733 |
15 files changed, 58686 insertions, 0 deletions
diff --git a/third_party/highway/hwy/ops/arm_neon-inl.h b/third_party/highway/hwy/ops/arm_neon-inl.h new file mode 100644 index 0000000000..bd2cddcb86 --- /dev/null +++ b/third_party/highway/hwy/ops/arm_neon-inl.h @@ -0,0 +1,7725 @@ +// Copyright 2019 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// 128-bit Arm NEON vectors and operations. +// External include guard in highway.h - see comment there. + +// Arm NEON intrinsics are documented at: +// https://developer.arm.com/architectures/instruction-sets/intrinsics/#f:@navigationhierarchiessimdisa=[Neon] + +#include "hwy/ops/shared-inl.h" + +HWY_BEFORE_NAMESPACE(); + +// Must come after HWY_BEFORE_NAMESPACE so that the intrinsics are compiled with +// the same target attribute as our code, see #834. +HWY_DIAGNOSTICS(push) +HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized") +#include <arm_neon.h> // NOLINT(build/include_order) +HWY_DIAGNOSTICS(pop) + +// Must come after arm_neon.h. +namespace hwy { +namespace HWY_NAMESPACE { + +namespace detail { // for code folding and Raw128 + +// Macros used to define single and double function calls for multiple types +// for full and half vectors. These macros are undefined at the end of the file. + +// HWY_NEON_BUILD_TPL_* is the template<...> prefix to the function. 
// The numeric suffix (1, 2 or 3) of each HWY_NEON_BUILD_* macro below is the
// number of vector operands of the generated function. All three TPL variants
// are empty, i.e. the generated functions are not templates.
#define HWY_NEON_BUILD_TPL_1
#define HWY_NEON_BUILD_TPL_2
#define HWY_NEON_BUILD_TPL_3

// HWY_NEON_BUILD_RET_* is return type; type arg is without _t suffix so we can
// extend it to int32x4x2_t packs.
#define HWY_NEON_BUILD_RET_1(type, size) Vec128<type##_t, size>
#define HWY_NEON_BUILD_RET_2(type, size) Vec128<type##_t, size>
#define HWY_NEON_BUILD_RET_3(type, size) Vec128<type##_t, size>

// HWY_NEON_BUILD_PARAM_* is the list of parameters the function receives.
// The operands are named a, b, c; HWY_NEON_BUILD_ARG_* relies on these names.
#define HWY_NEON_BUILD_PARAM_1(type, size) const Vec128<type##_t, size> a
#define HWY_NEON_BUILD_PARAM_2(type, size) \
  const Vec128<type##_t, size> a, const Vec128<type##_t, size> b
#define HWY_NEON_BUILD_PARAM_3(type, size)                        \
  const Vec128<type##_t, size> a, const Vec128<type##_t, size> b, \
      const Vec128<type##_t, size> c

// HWY_NEON_BUILD_ARG_* is the list of arguments passed to the underlying
// function: the raw NEON registers of the operands declared by the matching
// HWY_NEON_BUILD_PARAM_*.
#define HWY_NEON_BUILD_ARG_1 a.raw
#define HWY_NEON_BUILD_ARG_2 a.raw, b.raw
#define HWY_NEON_BUILD_ARG_3 a.raw, b.raw, c.raw

// We use HWY_NEON_EVAL(func, ...) to delay the evaluation of func until after
// the __VA_ARGS__ have been expanded. This allows "func" to itself be a macro,
// as is the case for some of the library "functions" such as vshlq_u8. For
// example, HWY_NEON_EVAL(vshlq_u8, MY_PARAMS) where MY_PARAMS is defined as
// "a, b" (without the quotes) will end up expanding to "vshlq_u8(a, b)" if
// needed. Directly writing vshlq_u8(MY_PARAMS) would fail since the vshlq_u8()
// macro expects two arguments.
#define HWY_NEON_EVAL(func, ...) func(__VA_ARGS__)

// Main macro definition that defines a single function for the given type and
// size of vector, using the underlying (prefix##infix##suffix) function and
// the template, return type, parameters and arguments defined by the "args"
// parameters passed here (see HWY_NEON_BUILD_* macros defined before).
// Parameters: `type` is the lane type without the _t suffix (e.g. uint8),
// `size` the lane count, `name` the generated function's name, and
// prefix/infix/suffix are pasted together to form the intrinsic that is called.
// `args` selects which HWY_NEON_BUILD_{TPL,RET,PARAM,ARG}_<args> set is used.
#define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args) \
  HWY_CONCAT(HWY_NEON_BUILD_TPL_, args)                                      \
  HWY_API HWY_CONCAT(HWY_NEON_BUILD_RET_, args)(type, size)                  \
      name(HWY_CONCAT(HWY_NEON_BUILD_PARAM_, args)(type, size)) {            \
    return HWY_CONCAT(HWY_NEON_BUILD_RET_, args)(type, size)(                \
        HWY_NEON_EVAL(prefix##infix##suffix, HWY_NEON_BUILD_ARG_##args));    \
  }

// The HWY_NEON_DEF_FUNCTION_* macros define all the variants of a function
// called "name" using the set of neon functions starting with the given
// "prefix" for all the variants of certain types, as specified next to each
// macro. For example, the prefix "vsub" can be used to define the operator-
// using args=2.
//
// In every per-type macro below, the first line is the full 128-bit vector and
// uses the prefix##q intrinsic; the remaining lines are the partial vectors
// (64 bits or fewer), which all use the non-q intrinsic.

// uint8_t
#define HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args)        \
  HWY_NEON_DEF_FUNCTION(uint8, 16, name, prefix##q, infix, u8, args)   \
  HWY_NEON_DEF_FUNCTION(uint8, 8, name, prefix, infix, u8, args)       \
  HWY_NEON_DEF_FUNCTION(uint8, 4, name, prefix, infix, u8, args)       \
  HWY_NEON_DEF_FUNCTION(uint8, 2, name, prefix, infix, u8, args)       \
  HWY_NEON_DEF_FUNCTION(uint8, 1, name, prefix, infix, u8, args)

// int8_t
#define HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args)        \
  HWY_NEON_DEF_FUNCTION(int8, 16, name, prefix##q, infix, s8, args)   \
  HWY_NEON_DEF_FUNCTION(int8, 8, name, prefix, infix, s8, args)       \
  HWY_NEON_DEF_FUNCTION(int8, 4, name, prefix, infix, s8, args)       \
  HWY_NEON_DEF_FUNCTION(int8, 2, name, prefix, infix, s8, args)       \
  HWY_NEON_DEF_FUNCTION(int8, 1, name, prefix, infix, s8, args)

// uint16_t
#define HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args)        \
  HWY_NEON_DEF_FUNCTION(uint16, 8, name, prefix##q, infix, u16, args)   \
  HWY_NEON_DEF_FUNCTION(uint16, 4, name, prefix, infix, u16, args)      \
  HWY_NEON_DEF_FUNCTION(uint16, 2, name, prefix, infix, u16, args)      \
  HWY_NEON_DEF_FUNCTION(uint16, 1, name, prefix, infix, u16, args)

// int16_t
#define HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args)        \
  HWY_NEON_DEF_FUNCTION(int16, 8, name, prefix##q, infix, s16, args)   \
  HWY_NEON_DEF_FUNCTION(int16, 4, name, prefix, infix, s16, args)      \
  HWY_NEON_DEF_FUNCTION(int16, 2, name, prefix, infix, s16, args)      \
  HWY_NEON_DEF_FUNCTION(int16, 1, name, prefix, infix, s16, args)

// uint32_t
#define HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args)        \
  HWY_NEON_DEF_FUNCTION(uint32, 4, name, prefix##q, infix, u32, args)   \
  HWY_NEON_DEF_FUNCTION(uint32, 2, name, prefix, infix, u32, args)      \
  HWY_NEON_DEF_FUNCTION(uint32, 1, name, prefix, infix, u32, args)

// int32_t
#define HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args)        \
  HWY_NEON_DEF_FUNCTION(int32, 4, name, prefix##q, infix, s32, args)   \
  HWY_NEON_DEF_FUNCTION(int32, 2, name, prefix, infix, s32, args)      \
  HWY_NEON_DEF_FUNCTION(int32, 1, name, prefix, infix, s32, args)

// uint64_t
#define HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args)        \
  HWY_NEON_DEF_FUNCTION(uint64, 2, name, prefix##q, infix, u64, args)   \
  HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args)

// int64_t
#define HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args)        \
  HWY_NEON_DEF_FUNCTION(int64, 2, name, prefix##q, infix, s64, args)   \
  HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args)

// float
#define HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args)        \
  HWY_NEON_DEF_FUNCTION(float32, 4, name, prefix##q, infix, f32, args)   \
  HWY_NEON_DEF_FUNCTION(float32, 2, name, prefix, infix, f32, args)      \
  HWY_NEON_DEF_FUNCTION(float32, 1, name, prefix, infix, f32, args)

// double: only generated when HWY_ARCH_ARM_A64 is set; otherwise the macro
// expands to nothing so that callers need no #if of their own.
#if HWY_ARCH_ARM_A64
#define HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args)        \
  HWY_NEON_DEF_FUNCTION(float64, 2, name, prefix##q, infix, f64, args)   \
  HWY_NEON_DEF_FUNCTION(float64, 1, name, prefix, infix, f64, args)
#else
#define HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args)
#endif

// float and double

#define HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args)         \
  HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args)

// Helper macros to define for more than one type.
// uint8_t, uint16_t and uint32_t
#define HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args)             \
  HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args)            \
  HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args)

// int8_t, int16_t and int32_t
#define HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args)             \
  HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args)            \
  HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args)

// uint8_t, uint16_t, uint32_t and uint64_t
#define HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args)  \
  HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args)

// int8_t, int16_t, int32_t and int64_t
#define HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args)  \
  HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args)

// All int*_t and uint*_t up to 64
#define HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args)             \
  HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args)

// All previous types.
#define HWY_NEON_DEF_FUNCTION_ALL_TYPES(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args)

// Unsigned and signed 8/16/32-bit integers plus float (no 64-bit types).
#define HWY_NEON_DEF_FUNCTION_UIF81632(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args)   \
  HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args)    \
  HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args)

// For eor3q, which is only defined for full vectors.
// Generates `name` only for full 128-bit vectors of every unsigned and signed
// integer lane type; every expansion uses the prefix##q intrinsic.
#define HWY_NEON_DEF_FUNCTION_FULL_UI(name, prefix, infix, args)        \
  HWY_NEON_DEF_FUNCTION(uint8, 16, name, prefix##q, infix, u8, args)    \
  HWY_NEON_DEF_FUNCTION(uint16, 8, name, prefix##q, infix, u16, args)   \
  HWY_NEON_DEF_FUNCTION(uint32, 4, name, prefix##q, infix, u32, args)   \
  HWY_NEON_DEF_FUNCTION(uint64, 2, name, prefix##q, infix, u64, args)   \
  HWY_NEON_DEF_FUNCTION(int8, 16, name, prefix##q, infix, s8, args)     \
  HWY_NEON_DEF_FUNCTION(int16, 8, name, prefix##q, infix, s16, args)    \
  HWY_NEON_DEF_FUNCTION(int32, 4, name, prefix##q, infix, s32, args)    \
  HWY_NEON_DEF_FUNCTION(int64, 2, name, prefix##q, infix, s64, args)

// Emulation of some intrinsics on armv7.
// Each vuzp1/vuzp2/vzip1/vzip2 name is mapped onto the two-result vuzp/vzip
// intrinsic of the same type: the *1 variants select .val[0] of the result and
// the *2 variants select .val[1].
#if HWY_ARCH_ARM_V7
#define vuzp1_s8(x, y) vuzp_s8(x, y).val[0]
#define vuzp1_u8(x, y) vuzp_u8(x, y).val[0]
#define vuzp1_s16(x, y) vuzp_s16(x, y).val[0]
#define vuzp1_u16(x, y) vuzp_u16(x, y).val[0]
#define vuzp1_s32(x, y) vuzp_s32(x, y).val[0]
#define vuzp1_u32(x, y) vuzp_u32(x, y).val[0]
#define vuzp1_f32(x, y) vuzp_f32(x, y).val[0]
#define vuzp1q_s8(x, y) vuzpq_s8(x, y).val[0]
#define vuzp1q_u8(x, y) vuzpq_u8(x, y).val[0]
#define vuzp1q_s16(x, y) vuzpq_s16(x, y).val[0]
#define vuzp1q_u16(x, y) vuzpq_u16(x, y).val[0]
#define vuzp1q_s32(x, y) vuzpq_s32(x, y).val[0]
#define vuzp1q_u32(x, y) vuzpq_u32(x, y).val[0]
#define vuzp1q_f32(x, y) vuzpq_f32(x, y).val[0]
#define vuzp2_s8(x, y) vuzp_s8(x, y).val[1]
#define vuzp2_u8(x, y) vuzp_u8(x, y).val[1]
#define vuzp2_s16(x, y) vuzp_s16(x, y).val[1]
#define vuzp2_u16(x, y) vuzp_u16(x, y).val[1]
#define vuzp2_s32(x, y) vuzp_s32(x, y).val[1]
#define vuzp2_u32(x, y) vuzp_u32(x, y).val[1]
#define vuzp2_f32(x, y) vuzp_f32(x, y).val[1]
#define vuzp2q_s8(x, y) vuzpq_s8(x, y).val[1]
#define vuzp2q_u8(x, y) vuzpq_u8(x, y).val[1]
#define vuzp2q_s16(x, y) vuzpq_s16(x, y).val[1]
#define vuzp2q_u16(x, y) vuzpq_u16(x, y).val[1]
#define vuzp2q_s32(x, y) vuzpq_s32(x, y).val[1]
#define vuzp2q_u32(x, y) vuzpq_u32(x, y).val[1]
#define vuzp2q_f32(x, y) vuzpq_f32(x, y).val[1]
#define vzip1_s8(x, y) vzip_s8(x, y).val[0]
#define vzip1_u8(x, y) vzip_u8(x, y).val[0]
#define vzip1_s16(x, y) vzip_s16(x, y).val[0]
#define vzip1_u16(x, y) vzip_u16(x, y).val[0]
#define vzip1_f32(x, y) vzip_f32(x, y).val[0]
#define vzip1_u32(x, y) vzip_u32(x, y).val[0]
#define vzip1_s32(x, y) vzip_s32(x, y).val[0]
#define vzip1q_s8(x, y) vzipq_s8(x, y).val[0]
#define vzip1q_u8(x, y) vzipq_u8(x, y).val[0]
#define vzip1q_s16(x, y) vzipq_s16(x, y).val[0]
#define vzip1q_u16(x, y) vzipq_u16(x, y).val[0]
#define vzip1q_s32(x, y) vzipq_s32(x, y).val[0]
#define vzip1q_u32(x, y) vzipq_u32(x, y).val[0]
#define vzip1q_f32(x, y) vzipq_f32(x, y).val[0]
#define vzip2_s8(x, y) vzip_s8(x, y).val[1]
#define vzip2_u8(x, y) vzip_u8(x, y).val[1]
#define vzip2_s16(x, y) vzip_s16(x, y).val[1]
#define vzip2_u16(x, y) vzip_u16(x, y).val[1]
#define vzip2_s32(x, y) vzip_s32(x, y).val[1]
#define vzip2_u32(x, y) vzip_u32(x, y).val[1]
#define vzip2_f32(x, y) vzip_f32(x, y).val[1]
#define vzip2q_s8(x, y) vzipq_s8(x, y).val[1]
#define vzip2q_u8(x, y) vzipq_u8(x, y).val[1]
#define vzip2q_s16(x, y) vzipq_s16(x, y).val[1]
#define vzip2q_u16(x, y) vzipq_u16(x, y).val[1]
#define vzip2q_s32(x, y) vzipq_s32(x, y).val[1]
#define vzip2q_u32(x, y) vzipq_u32(x, y).val[1]
#define vzip2q_f32(x, y) vzipq_f32(x, y).val[1]
#endif

// Wrappers over uint8x16x2_t etc. so we can define StoreInterleaved2 overloads
// for all vector types, even those (bfloat16_t) where the underlying vector is
// the same as others (uint16_t).
// Primary templates are only declared; every supported <lane type, lane count>
// pair is provided by a specialization below. For each lane type, the explicit
// specialization with the full lane count wraps the 128-bit ("q") tuple type,
// and the partial specialization on N wraps the 64-bit tuple type for all other
// lane counts.
template <typename T, size_t N>
struct Tuple2;
template <typename T, size_t N>
struct Tuple3;
template <typename T, size_t N>
struct Tuple4;

// ---- Tuple2: pairs of vectors.
template <>
struct Tuple2<uint8_t, 16> {
  uint8x16x2_t raw;
};
template <size_t N>
struct Tuple2<uint8_t, N> {
  uint8x8x2_t raw;
};
template <>
struct Tuple2<int8_t, 16> {
  int8x16x2_t raw;
};
template <size_t N>
struct Tuple2<int8_t, N> {
  int8x8x2_t raw;
};
template <>
struct Tuple2<uint16_t, 8> {
  uint16x8x2_t raw;
};
template <size_t N>
struct Tuple2<uint16_t, N> {
  uint16x4x2_t raw;
};
template <>
struct Tuple2<int16_t, 8> {
  int16x8x2_t raw;
};
template <size_t N>
struct Tuple2<int16_t, N> {
  int16x4x2_t raw;
};
template <>
struct Tuple2<uint32_t, 4> {
  uint32x4x2_t raw;
};
template <size_t N>
struct Tuple2<uint32_t, N> {
  uint32x2x2_t raw;
};
template <>
struct Tuple2<int32_t, 4> {
  int32x4x2_t raw;
};
template <size_t N>
struct Tuple2<int32_t, N> {
  int32x2x2_t raw;
};
template <>
struct Tuple2<uint64_t, 2> {
  uint64x2x2_t raw;
};
template <size_t N>
struct Tuple2<uint64_t, N> {
  uint64x1x2_t raw;
};
template <>
struct Tuple2<int64_t, 2> {
  int64x2x2_t raw;
};
template <size_t N>
struct Tuple2<int64_t, N> {
  int64x1x2_t raw;
};

// float16_t and bfloat16_t are stored in uint16 tuples.
template <>
struct Tuple2<float16_t, 8> {
  uint16x8x2_t raw;
};
template <size_t N>
struct Tuple2<float16_t, N> {
  uint16x4x2_t raw;
};
template <>
struct Tuple2<bfloat16_t, 8> {
  uint16x8x2_t raw;
};
template <size_t N>
struct Tuple2<bfloat16_t, N> {
  uint16x4x2_t raw;
};

template <>
struct Tuple2<float32_t, 4> {
  float32x4x2_t raw;
};
template <size_t N>
struct Tuple2<float32_t, N> {
  float32x2x2_t raw;
};
#if HWY_ARCH_ARM_A64
template <>
struct Tuple2<float64_t, 2> {
  float64x2x2_t raw;
};
template <size_t N>
struct Tuple2<float64_t, N> {
  float64x1x2_t raw;
};
#endif  // HWY_ARCH_ARM_A64

// ---- Tuple3: triples of vectors.
template <>
struct Tuple3<uint8_t, 16> {
  uint8x16x3_t raw;
};
template <size_t N>
struct Tuple3<uint8_t, N> {
  uint8x8x3_t raw;
};
template <>
struct Tuple3<int8_t, 16> {
  int8x16x3_t raw;
};
template <size_t N>
struct Tuple3<int8_t, N> {
  int8x8x3_t raw;
};
template <>
struct Tuple3<uint16_t, 8> {
  uint16x8x3_t raw;
};
template <size_t N>
struct Tuple3<uint16_t, N> {
  uint16x4x3_t raw;
};
template <>
struct Tuple3<int16_t, 8> {
  int16x8x3_t raw;
};
template <size_t N>
struct Tuple3<int16_t, N> {
  int16x4x3_t raw;
};
template <>
struct Tuple3<uint32_t, 4> {
  uint32x4x3_t raw;
};
template <size_t N>
struct Tuple3<uint32_t, N> {
  uint32x2x3_t raw;
};
template <>
struct Tuple3<int32_t, 4> {
  int32x4x3_t raw;
};
template <size_t N>
struct Tuple3<int32_t, N> {
  int32x2x3_t raw;
};
template <>
struct Tuple3<uint64_t, 2> {
  uint64x2x3_t raw;
};
template <size_t N>
struct Tuple3<uint64_t, N> {
  uint64x1x3_t raw;
};
template <>
struct Tuple3<int64_t, 2> {
  int64x2x3_t raw;
};
template <size_t N>
struct Tuple3<int64_t, N> {
  int64x1x3_t raw;
};

// float16_t and bfloat16_t are stored in uint16 tuples.
template <>
struct Tuple3<float16_t, 8> {
  uint16x8x3_t raw;
};
template <size_t N>
struct Tuple3<float16_t, N> {
  uint16x4x3_t raw;
};
template <>
struct Tuple3<bfloat16_t, 8> {
  uint16x8x3_t raw;
};
template <size_t N>
struct Tuple3<bfloat16_t, N> {
  uint16x4x3_t raw;
};

template <>
struct Tuple3<float32_t, 4> {
  float32x4x3_t raw;
};
template <size_t N>
struct Tuple3<float32_t, N> {
  float32x2x3_t raw;
};
#if HWY_ARCH_ARM_A64
template <>
struct Tuple3<float64_t, 2> {
  float64x2x3_t raw;
};
template <size_t N>
struct Tuple3<float64_t, N> {
  float64x1x3_t raw;
};
#endif  // HWY_ARCH_ARM_A64

// ---- Tuple4: quadruples of vectors.
template <>
struct Tuple4<uint8_t, 16> {
  uint8x16x4_t raw;
};
template <size_t N>
struct Tuple4<uint8_t, N> {
  uint8x8x4_t raw;
};
template <>
struct Tuple4<int8_t, 16> {
  int8x16x4_t raw;
};
template <size_t N>
struct Tuple4<int8_t, N> {
  int8x8x4_t raw;
};
template <>
struct Tuple4<uint16_t, 8> {
  uint16x8x4_t raw;
};
template <size_t N>
struct Tuple4<uint16_t, N> {
  uint16x4x4_t raw;
};
template <>
struct Tuple4<int16_t, 8> {
  int16x8x4_t raw;
};
template <size_t N>
struct Tuple4<int16_t, N> {
  int16x4x4_t raw;
};
template <>
struct Tuple4<uint32_t, 4> {
  uint32x4x4_t raw;
};
template <size_t N>
struct Tuple4<uint32_t, N> {
  uint32x2x4_t raw;
};
template <>
struct Tuple4<int32_t, 4> {
  int32x4x4_t raw;
};
template <size_t N>
struct Tuple4<int32_t, N> {
  int32x2x4_t raw;
};
template <>
struct Tuple4<uint64_t, 2> {
  uint64x2x4_t raw;
};
template <size_t N>
struct Tuple4<uint64_t, N> {
  uint64x1x4_t raw;
};
template <>
struct Tuple4<int64_t, 2> {
  int64x2x4_t raw;
};
template <size_t N>
struct Tuple4<int64_t, N> {
  int64x1x4_t raw;
};

// float16_t and bfloat16_t are stored in uint16 tuples.
template <>
struct Tuple4<float16_t, 8> {
  uint16x8x4_t raw;
};
template <size_t N>
struct Tuple4<float16_t, N> {
  uint16x4x4_t raw;
};
template <>
struct Tuple4<bfloat16_t, 8> {
  uint16x8x4_t raw;
};
template <size_t N>
struct Tuple4<bfloat16_t, N> {
  uint16x4x4_t raw;
};

template <>
struct Tuple4<float32_t, 4> {
  float32x4x4_t raw;
};
template <size_t N>
struct Tuple4<float32_t, N> {
  float32x2x4_t raw;
};
#if HWY_ARCH_ARM_A64
template <>
struct Tuple4<float64_t, 2> {
  float64x2x4_t raw;
};
template <size_t N>
struct Tuple4<float64_t, N> {
  float64x1x4_t raw;
};
#endif  // HWY_ARCH_ARM_A64

// Raw128<T, N>::type is the NEON register type used to store N lanes of T.
// Only declared here; each supported combination is an explicit specialization.
template <typename T, size_t N>
struct Raw128;

// 128
template <>
struct Raw128<uint8_t, 16> {
  using type = uint8x16_t;
};

template <>
struct Raw128<uint16_t, 8> {
  using type = uint16x8_t;
};

template <>
struct Raw128<uint32_t, 4> {
  using type = uint32x4_t;
};

template <>
struct Raw128<uint64_t, 2> {
  using type = uint64x2_t;
};

template <>
struct Raw128<int8_t, 16> {
  using type = int8x16_t;
};

template <>
struct Raw128<int16_t, 8> {
  using type = int16x8_t;
};

template <>
struct Raw128<int32_t, 4> {
  using type = int32x4_t;
};

+template <> +struct Raw128<int64_t, 2> { + using type = int64x2_t; +}; + +template <> +struct Raw128<float16_t, 8> { + using type = uint16x8_t; +}; + +template <> +struct Raw128<bfloat16_t, 8> { + using type = uint16x8_t; +}; + +template <> +struct Raw128<float, 4> { + using type = float32x4_t; +}; + +#if HWY_ARCH_ARM_A64 +template <> +struct Raw128<double, 2> { + using type = float64x2_t; +}; +#endif + +// 64 +template <> +struct Raw128<uint8_t, 8> { + using type = uint8x8_t; +}; + +template <> +struct Raw128<uint16_t, 4> { + using type = uint16x4_t; +}; + +template <> +struct Raw128<uint32_t, 2> { + using type = uint32x2_t; +}; + +template <> +struct Raw128<uint64_t, 1> { + using type = uint64x1_t; +}; + +template <> +struct Raw128<int8_t, 8> { + using type = int8x8_t; +}; + +template <> +struct Raw128<int16_t, 4> { + using type = int16x4_t; +}; + +template <> +struct Raw128<int32_t, 2> { + using type = int32x2_t; +}; + +template <> +struct Raw128<int64_t, 1> { + using type = int64x1_t; +}; + +template <> +struct Raw128<float16_t, 4> { + using type = uint16x4_t; +}; + +template <> +struct Raw128<bfloat16_t, 4> { + using type = uint16x4_t; +}; + +template <> +struct Raw128<float, 2> { + using type = float32x2_t; +}; + +#if HWY_ARCH_ARM_A64 +template <> +struct Raw128<double, 1> { + using type = float64x1_t; +}; +#endif + +// 32 (same as 64) +template <> +struct Raw128<uint8_t, 4> : public Raw128<uint8_t, 8> {}; + +template <> +struct Raw128<uint16_t, 2> : public Raw128<uint16_t, 4> {}; + +template <> +struct Raw128<uint32_t, 1> : public Raw128<uint32_t, 2> {}; + +template <> +struct Raw128<int8_t, 4> : public Raw128<int8_t, 8> {}; + +template <> +struct Raw128<int16_t, 2> : public Raw128<int16_t, 4> {}; + +template <> +struct Raw128<int32_t, 1> : public Raw128<int32_t, 2> {}; + +template <> +struct Raw128<float16_t, 2> : public Raw128<float16_t, 4> {}; + +template <> +struct Raw128<bfloat16_t, 2> : public Raw128<bfloat16_t, 4> {}; + +template <> +struct 
Raw128<float, 1> : public Raw128<float, 2> {}; + +// 16 (same as 64) +template <> +struct Raw128<uint8_t, 2> : public Raw128<uint8_t, 8> {}; + +template <> +struct Raw128<uint16_t, 1> : public Raw128<uint16_t, 4> {}; + +template <> +struct Raw128<int8_t, 2> : public Raw128<int8_t, 8> {}; + +template <> +struct Raw128<int16_t, 1> : public Raw128<int16_t, 4> {}; + +template <> +struct Raw128<float16_t, 1> : public Raw128<float16_t, 4> {}; + +template <> +struct Raw128<bfloat16_t, 1> : public Raw128<bfloat16_t, 4> {}; + +// 8 (same as 64) +template <> +struct Raw128<uint8_t, 1> : public Raw128<uint8_t, 8> {}; + +template <> +struct Raw128<int8_t, 1> : public Raw128<int8_t, 8> {}; + +} // namespace detail + +template <typename T, size_t N = 16 / sizeof(T)> +class Vec128 { + using Raw = typename detail::Raw128<T, N>::type; + + public: + using PrivateT = T; // only for DFromV + static constexpr size_t kPrivateN = N; // only for DFromV + + HWY_INLINE Vec128() {} + Vec128(const Vec128&) = default; + Vec128& operator=(const Vec128&) = default; + HWY_INLINE explicit Vec128(const Raw raw) : raw(raw) {} + + // Compound assignment. Only usable if there is a corresponding non-member + // binary operator overload. For example, only f32 and f64 support division. 
+ HWY_INLINE Vec128& operator*=(const Vec128 other) { + return *this = (*this * other); + } + HWY_INLINE Vec128& operator/=(const Vec128 other) { + return *this = (*this / other); + } + HWY_INLINE Vec128& operator+=(const Vec128 other) { + return *this = (*this + other); + } + HWY_INLINE Vec128& operator-=(const Vec128 other) { + return *this = (*this - other); + } + HWY_INLINE Vec128& operator&=(const Vec128 other) { + return *this = (*this & other); + } + HWY_INLINE Vec128& operator|=(const Vec128 other) { + return *this = (*this | other); + } + HWY_INLINE Vec128& operator^=(const Vec128 other) { + return *this = (*this ^ other); + } + + Raw raw; +}; + +template <typename T> +using Vec64 = Vec128<T, 8 / sizeof(T)>; + +template <typename T> +using Vec32 = Vec128<T, 4 / sizeof(T)>; + +template <typename T> +using Vec16 = Vec128<T, 2 / sizeof(T)>; + +// FF..FF or 0. +template <typename T, size_t N = 16 / sizeof(T)> +class Mask128 { + // Arm C Language Extensions return and expect unsigned type. + using Raw = typename detail::Raw128<MakeUnsigned<T>, N>::type; + + public: + using PrivateT = T; // only for DFromM + static constexpr size_t kPrivateN = N; // only for DFromM + + HWY_INLINE Mask128() {} + Mask128(const Mask128&) = default; + Mask128& operator=(const Mask128&) = default; + HWY_INLINE explicit Mask128(const Raw raw) : raw(raw) {} + + Raw raw; +}; + +template <typename T> +using Mask64 = Mask128<T, 8 / sizeof(T)>; + +template <class V> +using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>; + +template <class M> +using DFromM = Simd<typename M::PrivateT, M::kPrivateN, 0>; + +template <class V> +using TFromV = typename V::PrivateT; + +// ------------------------------ Set + +namespace detail { +// We want to route any combination of N/kPow2 to the intrinsics depending on +// whether the requested size is <= 64 bits or 128. HWY_NEON_BUILD_TPL is +// unconditional and currently does not accept inputs (such as whether the +// vector is 64 or 128-bit). 
Thus we are not able to use HWY_IF_V_SIZE_D for +// SFINAE. We instead define a private NativeSet which receives a Simd<> whose +// kPow2 has already been folded into its N. +#define HWY_NEON_BUILD_TPL_HWY_SET +#define HWY_NEON_BUILD_RET_HWY_SET(type, size) Vec128<type##_t, size> +#define HWY_NEON_BUILD_PARAM_HWY_SET(type, size) \ + Simd<type##_t, size, 0> /* tag */, type##_t t +#define HWY_NEON_BUILD_ARG_HWY_SET t + +HWY_NEON_DEF_FUNCTION_ALL_TYPES(NativeSet, vdup, _n_, HWY_SET) + +#undef HWY_NEON_BUILD_TPL_HWY_SET +#undef HWY_NEON_BUILD_RET_HWY_SET +#undef HWY_NEON_BUILD_PARAM_HWY_SET +#undef HWY_NEON_BUILD_ARG_HWY_SET + +} // namespace detail + +// Full vector. Cannot yet use VFromD because that is defined in terms of Set. +// Do not use a typename T = TFromD<D> argument because T will be deduced from +// the actual argument type, which can differ from TFromD<D>. +template <class D, HWY_IF_V_SIZE_D(D, 16), typename T> +HWY_INLINE Vec128<TFromD<D>> Set(D /* tag */, T t) { + return detail::NativeSet(Full128<TFromD<D>>(), static_cast<TFromD<D>>(t)); +} + +// Partial vector: create 64-bit and return wrapper. +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T> +HWY_API Vec128<TFromD<D>, MaxLanes(D())> Set(D /* tag */, T t) { + const Full64<TFromD<D>> dfull; + return Vec128<TFromD<D>, MaxLanes(D())>( + detail::NativeSet(dfull, static_cast<TFromD<D>>(t)).raw); +} + +// BF16: return u16. +template <class D, HWY_IF_BF16_D(D)> +HWY_API Vec128<bfloat16_t, MaxLanes(D())> Set(D d, bfloat16_t t) { + uint16_t tu; + CopyBytes<sizeof(tu)>(&t, &tu); + return Vec128<bfloat16_t, d.MaxLanes()>(Set(RebindToUnsigned<D>(), tu).raw); +} + +// F16: return u16. 
+template <class D, HWY_IF_F16_D(D)> +HWY_API Vec128<float16_t, MaxLanes(D())> Set(D d, float16_t t) { + uint16_t tu; + CopyBytes<sizeof(tu)>(&t, &tu); + return Vec128<float16_t, d.MaxLanes()>(Set(RebindToUnsigned<D>(), tu).raw); +} + +template <class D> +using VFromD = decltype(Set(D(), TFromD<D>())); + +template <class D> +HWY_API VFromD<D> Zero(D d) { + // Default ctor also works for bfloat16_t and float16_t. + return Set(d, TFromD<D>{}); +} + +HWY_DIAGNOSTICS(push) +HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized") +#if HWY_COMPILER_GCC_ACTUAL +HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wmaybe-uninitialized") +#endif + +template <class D> +HWY_API VFromD<D> Undefined(D /*tag*/) { + VFromD<D> v; + return v; +} + +HWY_DIAGNOSTICS(pop) + +namespace detail { + +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_T_SIZE_D(D, 1)> +HWY_INLINE VFromD<D> Iota0(D d) { + const RebindToUnsigned<decltype(d)> du; +#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL + typedef uint8_t GccU8RawVectType __attribute__((__vector_size__(8))); + constexpr GccU8RawVectType kU8Iota0 = {0, 1, 2, 3, 4, 5, 6, 7}; + const VFromD<decltype(du)> vu8_iota0(reinterpret_cast<uint8x8_t>(kU8Iota0)); +#else + alignas(8) static constexpr uint8_t kU8Iota0[8] = {0, 1, 2, 3, 4, 5, 6, 7}; + const VFromD<decltype(du)> vu8_iota0( + Load(Full64<TFromD<decltype(du)>>(), kU8Iota0).raw); +#endif + return BitCast(d, vu8_iota0); +} + +template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)> +HWY_INLINE VFromD<D> Iota0(D d) { + const RebindToUnsigned<decltype(d)> du; +#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL + typedef uint8_t GccU8RawVectType __attribute__((__vector_size__(16))); + constexpr GccU8RawVectType kU8Iota0 = {0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15}; + const VFromD<decltype(du)> vu8_iota0(reinterpret_cast<uint8x16_t>(kU8Iota0)); +#else + alignas(16) static constexpr uint8_t kU8Iota0[16] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + const auto 
vu8_iota0 = Load(du, kU8Iota0); +#endif + return BitCast(d, vu8_iota0); +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_T_SIZE_D(D, 2), + HWY_IF_NOT_SPECIAL_FLOAT_D(D)> +HWY_INLINE VFromD<D> Iota0(D d) { + const RebindToUnsigned<decltype(d)> du; +#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL + typedef uint16_t GccU16RawVectType __attribute__((__vector_size__(8))); + constexpr GccU16RawVectType kU16Iota0 = {0, 1, 2, 3}; + const VFromD<decltype(du)> vu16_iota0( + reinterpret_cast<uint16x4_t>(kU16Iota0)); +#else + alignas(8) static constexpr uint16_t kU16Iota0[4] = {0, 1, 2, 3}; + const VFromD<decltype(du)> vu16_iota0{ + Load(Full64<TFromD<decltype(du)>>(), kU16Iota0).raw}; +#endif + return BitCast(d, vu16_iota0); +} + +template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 2), + HWY_IF_NOT_SPECIAL_FLOAT_D(D)> +HWY_INLINE VFromD<D> Iota0(D d) { + const RebindToUnsigned<decltype(d)> du; +#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL + typedef uint16_t GccU16RawVectType __attribute__((__vector_size__(16))); + constexpr GccU16RawVectType kU16Iota0 = {0, 1, 2, 3, 4, 5, 6, 7}; + const VFromD<decltype(du)> vu16_iota0( + reinterpret_cast<uint16x8_t>(kU16Iota0)); +#else + alignas(16) static constexpr uint16_t kU16Iota0[8] = {0, 1, 2, 3, 4, 5, 6, 7}; + const auto vu16_iota0 = Load(du, kU16Iota0); +#endif + return BitCast(d, vu16_iota0); +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_UI32_D(D)> +HWY_INLINE VFromD<D> Iota0(D d) { + const RebindToUnsigned<decltype(d)> du; +#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL + typedef uint32_t GccU32RawVectType __attribute__((__vector_size__(8))); + constexpr GccU32RawVectType kU32Iota0 = {0, 1}; + const VFromD<decltype(du)> vu32_iota0( + reinterpret_cast<uint32x2_t>(kU32Iota0)); +#else + alignas(8) static constexpr uint32_t kU32Iota0[2] = {0, 1}; + const VFromD<decltype(du)> vu32_iota0{ + Load(Full64<TFromD<decltype(du)>>(), kU32Iota0).raw}; +#endif + return BitCast(d, vu32_iota0); +} + +template <class D, 
HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI32_D(D)> +HWY_INLINE VFromD<D> Iota0(D d) { + const RebindToUnsigned<decltype(d)> du; +#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL + typedef uint32_t GccU32RawVectType __attribute__((__vector_size__(16))); + constexpr GccU32RawVectType kU32Iota0 = {0, 1, 2, 3}; + const VFromD<decltype(du)> vu32_iota0( + reinterpret_cast<uint32x4_t>(kU32Iota0)); +#else + alignas(16) static constexpr uint32_t kU32Iota0[4] = {0, 1, 2, 3}; + const auto vu32_iota0 = Load(du, kU32Iota0); +#endif + return BitCast(d, vu32_iota0); +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)> +HWY_INLINE VFromD<D> Iota0(D d) { +#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL + typedef float GccF32RawVectType __attribute__((__vector_size__(8))); + constexpr GccF32RawVectType kF32Iota0 = {0.0f, 1.0f}; + return VFromD<decltype(d)>(reinterpret_cast<float32x2_t>(kF32Iota0)); +#else + alignas(8) static constexpr float kF32Iota0[2] = {0.0f, 1.0f}; + return VFromD<decltype(d)>{ + Load(Full64<TFromD<decltype(d)>>(), kF32Iota0).raw}; +#endif +} + +template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)> +HWY_INLINE VFromD<D> Iota0(D d) { +#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL + typedef float GccF32RawVectType __attribute__((__vector_size__(16))); + constexpr GccF32RawVectType kF32Iota0 = {0.0f, 1.0f, 2.0f, 3.0f}; + return VFromD<decltype(d)>(reinterpret_cast<float32x4_t>(kF32Iota0)); +#else + alignas(16) static constexpr float kF32Iota0[4] = {0.0f, 1.0f, 2.0f, 3.0f}; + return Load(d, kF32Iota0); +#endif +} + +template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_T_SIZE_D(D, 8)> +HWY_INLINE VFromD<D> Iota0(D d) { + return Zero(d); +} + +template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI64_D(D)> +HWY_INLINE VFromD<D> Iota0(D d) { + const RebindToUnsigned<decltype(d)> du; +#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL + typedef uint64_t GccU64RawVectType __attribute__((__vector_size__(16))); + constexpr GccU64RawVectType kU64Iota0 = {0, 1}; + const 
VFromD<decltype(du)> vu64_iota0( + reinterpret_cast<uint64x2_t>(kU64Iota0)); +#else + alignas(16) static constexpr uint64_t kU64Iota0[4] = {0, 1}; + const auto vu64_iota0 = Load(du, kU64Iota0); +#endif + return BitCast(d, vu64_iota0); +} + +#if HWY_ARCH_ARM_A64 +template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)> +HWY_INLINE VFromD<D> Iota0(D d) { +#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL + typedef double GccF64RawVectType __attribute__((__vector_size__(16))); + constexpr GccF64RawVectType kF64Iota0 = {0.0, 1.0}; + return VFromD<decltype(d)>(reinterpret_cast<float64x2_t>(kF64Iota0)); +#else + alignas(16) static constexpr double kF64Iota0[4] = {0.0, 1.0}; + return Load(d, kF64Iota0); +#endif +} +#endif // HWY_ARCH_ARM_A64 + +#if HWY_COMPILER_MSVC +template <class V, HWY_IF_V_SIZE_LE_V(V, 4)> +static HWY_INLINE V MaskOutIota(V v) { + constexpr size_t kVecSizeInBytes = HWY_MAX_LANES_V(V) * sizeof(TFromV<V>); + constexpr uint64_t kU64MaskOutMask = + hwy::LimitsMax<hwy::UnsignedFromSize<kVecSizeInBytes>>(); + + const DFromV<decltype(v)> d; + const Repartition<uint8_t, decltype(d)> du8; + using VU8 = VFromD<decltype(du8)>; + const auto mask_out_mask = + BitCast(d, VU8(vreinterpret_u8_u64(vdup_n_u64(kU64MaskOutMask)))); + return v & mask_out_mask; +} +template <class V, HWY_IF_V_SIZE_GT_V(V, 4)> +static HWY_INLINE V MaskOutIota(V v) { + return v; +} +#endif + +} // namespace detail + +template <class D, typename T2> +HWY_API VFromD<D> Iota(D d, const T2 first) { + const auto result_iota = + detail::Iota0(d) + Set(d, static_cast<TFromD<D>>(first)); +#if HWY_COMPILER_MSVC + return detail::MaskOutIota(result_iota); +#else + return result_iota; +#endif +} + +// ------------------------------ Tuple (VFromD) +#include "hwy/ops/tuple-inl.h" + +// ------------------------------ Combine + +// Full result +template <class D, HWY_IF_U8_D(D)> +HWY_API Vec128<uint8_t> Combine(D /* tag */, Vec64<uint8_t> hi, + Vec64<uint8_t> lo) { + return 
Vec128<uint8_t>(vcombine_u8(lo.raw, hi.raw)); +} +template <class D, HWY_IF_U16_D(D)> +HWY_API Vec128<uint16_t> Combine(D /* tag */, Vec64<uint16_t> hi, + Vec64<uint16_t> lo) { + return Vec128<uint16_t>(vcombine_u16(lo.raw, hi.raw)); +} +template <class D, HWY_IF_U32_D(D)> +HWY_API Vec128<uint32_t> Combine(D /* tag */, Vec64<uint32_t> hi, + Vec64<uint32_t> lo) { + return Vec128<uint32_t>(vcombine_u32(lo.raw, hi.raw)); +} +template <class D, HWY_IF_U64_D(D)> +HWY_API Vec128<uint64_t> Combine(D /* tag */, Vec64<uint64_t> hi, + Vec64<uint64_t> lo) { + return Vec128<uint64_t>(vcombine_u64(lo.raw, hi.raw)); +} + +template <class D, HWY_IF_I8_D(D)> +HWY_API Vec128<int8_t> Combine(D /* tag */, Vec64<int8_t> hi, + Vec64<int8_t> lo) { + return Vec128<int8_t>(vcombine_s8(lo.raw, hi.raw)); +} +template <class D, HWY_IF_I16_D(D)> +HWY_API Vec128<int16_t> Combine(D /* tag */, Vec64<int16_t> hi, + Vec64<int16_t> lo) { + return Vec128<int16_t>(vcombine_s16(lo.raw, hi.raw)); +} +template <class D, HWY_IF_I32_D(D)> +HWY_API Vec128<int32_t> Combine(D /* tag */, Vec64<int32_t> hi, + Vec64<int32_t> lo) { + return Vec128<int32_t>(vcombine_s32(lo.raw, hi.raw)); +} +template <class D, HWY_IF_I64_D(D)> +HWY_API Vec128<int64_t> Combine(D /* tag */, Vec64<int64_t> hi, + Vec64<int64_t> lo) { + return Vec128<int64_t>(vcombine_s64(lo.raw, hi.raw)); +} + +template <class D, HWY_IF_F32_D(D)> +HWY_API Vec128<float> Combine(D /* tag */, Vec64<float> hi, Vec64<float> lo) { + return Vec128<float>(vcombine_f32(lo.raw, hi.raw)); +} +#if HWY_ARCH_ARM_A64 +template <class D, HWY_IF_F64_D(D)> +HWY_API Vec128<double> Combine(D /* tag */, Vec64<double> hi, + Vec64<double> lo) { + return Vec128<double>(vcombine_f64(lo.raw, hi.raw)); +} +#endif + +// ------------------------------ BitCast + +namespace detail { + +// Converts from Vec128<T, N> to Vec128<uint8_t, N * sizeof(T)> using the +// vreinterpret*_u8_*() set of functions. 
+#define HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8 +#define HWY_NEON_BUILD_RET_HWY_CAST_TO_U8(type, size) \ + Vec128<uint8_t, size * sizeof(type##_t)> +#define HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8(type, size) Vec128<type##_t, size> v +#define HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8 v.raw + +// Special case of u8 to u8 since vreinterpret*_u8_u8 is obviously not defined. +template <size_t N> +HWY_INLINE Vec128<uint8_t, N> BitCastToByte(Vec128<uint8_t, N> v) { + return v; +} + +HWY_NEON_DEF_FUNCTION_ALL_FLOATS(BitCastToByte, vreinterpret, _u8_, + HWY_CAST_TO_U8) +HWY_NEON_DEF_FUNCTION_INTS(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8) +HWY_NEON_DEF_FUNCTION_UINT_16(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8) +HWY_NEON_DEF_FUNCTION_UINT_32(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8) +HWY_NEON_DEF_FUNCTION_UINT_64(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8) + +// Special cases for [b]float16_t, which have the same Raw as uint16_t. +template <size_t N> +HWY_INLINE Vec128<uint8_t, N * 2> BitCastToByte(Vec128<float16_t, N> v) { + return BitCastToByte(Vec128<uint16_t, N>(v.raw)); +} +template <size_t N> +HWY_INLINE Vec128<uint8_t, N * 2> BitCastToByte(Vec128<bfloat16_t, N> v) { + return BitCastToByte(Vec128<uint16_t, N>(v.raw)); +} + +#undef HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8 +#undef HWY_NEON_BUILD_RET_HWY_CAST_TO_U8 +#undef HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8 +#undef HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8 + +template <class D, HWY_IF_U8_D(D)> +HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */, VFromD<D> v) { + return v; +} + +// 64-bit or less: + +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I8_D(D)> +HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */, + VFromD<RebindToUnsigned<D>> v) { + return VFromD<D>(vreinterpret_s8_u8(v.raw)); +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)> +HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */, + VFromD<Repartition<uint8_t, D>> v) { + return VFromD<D>(vreinterpret_u16_u8(v.raw)); +} +template <class D, 
HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I16_D(D)> +HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */, + VFromD<Repartition<uint8_t, D>> v) { + return VFromD<D>(vreinterpret_s16_u8(v.raw)); +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)> +HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */, + VFromD<Repartition<uint8_t, D>> v) { + return VFromD<D>(vreinterpret_u32_u8(v.raw)); +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)> +HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */, + VFromD<Repartition<uint8_t, D>> v) { + return VFromD<D>(vreinterpret_s32_u8(v.raw)); +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)> +HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */, + VFromD<Repartition<uint8_t, D>> v) { + return VFromD<D>(vreinterpret_f32_u8(v.raw)); +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U64_D(D)> +HWY_INLINE Vec64<uint64_t> BitCastFromByte(D /* tag */, Vec64<uint8_t> v) { + return Vec64<uint64_t>(vreinterpret_u64_u8(v.raw)); +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I64_D(D)> +HWY_INLINE Vec64<int64_t> BitCastFromByte(D /* tag */, Vec64<uint8_t> v) { + return Vec64<int64_t>(vreinterpret_s64_u8(v.raw)); +} +#if HWY_ARCH_ARM_A64 +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F64_D(D)> +HWY_INLINE Vec64<double> BitCastFromByte(D /* tag */, Vec64<uint8_t> v) { + return Vec64<double>(vreinterpret_f64_u8(v.raw)); +} +#endif + +// 128-bit full: + +template <class D, HWY_IF_I8_D(D)> +HWY_INLINE Vec128<int8_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) { + return Vec128<int8_t>(vreinterpretq_s8_u8(v.raw)); +} +template <class D, HWY_IF_U16_D(D)> +HWY_INLINE Vec128<uint16_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) { + return Vec128<uint16_t>(vreinterpretq_u16_u8(v.raw)); +} +template <class D, HWY_IF_I16_D(D)> +HWY_INLINE Vec128<int16_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) { + return Vec128<int16_t>(vreinterpretq_s16_u8(v.raw)); +} +template <class D, HWY_IF_U32_D(D)> +HWY_INLINE 
Vec128<uint32_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) { + return Vec128<uint32_t>(vreinterpretq_u32_u8(v.raw)); +} +template <class D, HWY_IF_I32_D(D)> +HWY_INLINE Vec128<int32_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) { + return Vec128<int32_t>(vreinterpretq_s32_u8(v.raw)); +} +template <class D, HWY_IF_F32_D(D)> +HWY_INLINE Vec128<float> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) { + return Vec128<float>(vreinterpretq_f32_u8(v.raw)); +} +template <class D, HWY_IF_U64_D(D)> +HWY_INLINE Vec128<uint64_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) { + return Vec128<uint64_t>(vreinterpretq_u64_u8(v.raw)); +} +template <class D, HWY_IF_I64_D(D)> +HWY_INLINE Vec128<int64_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) { + return Vec128<int64_t>(vreinterpretq_s64_u8(v.raw)); +} + +#if HWY_ARCH_ARM_A64 +template <class D, HWY_IF_F64_D(D)> +HWY_INLINE Vec128<double> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) { + return Vec128<double>(vreinterpretq_f64_u8(v.raw)); +} +#endif + +// Special cases for [b]float16_t, which have the same Raw as uint16_t. 
+template <class D, HWY_IF_F16_D(D)> +HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */, + VFromD<Repartition<uint8_t, D>> v) { + return VFromD<D>(BitCastFromByte(RebindToUnsigned<D>(), v).raw); +} +template <class D, HWY_IF_BF16_D(D)> +HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */, + VFromD<Repartition<uint8_t, D>> v) { + return VFromD<D>(BitCastFromByte(RebindToUnsigned<D>(), v).raw); +} + +} // namespace detail + +template <class D, class FromT> +HWY_API VFromD<D> BitCast(D d, + Vec128<FromT, Repartition<FromT, D>().MaxLanes()> v) { + return detail::BitCastFromByte(d, detail::BitCastToByte(v)); +} + +// ------------------------------ ResizeBitCast + +// <= 8 byte vector to <= 8 byte vector +template <class D, class FromV, HWY_IF_V_SIZE_LE_V(FromV, 8), + HWY_IF_V_SIZE_LE_D(D, 8)> +HWY_API VFromD<D> ResizeBitCast(D d, FromV v) { + const Repartition<uint8_t, decltype(d)> du8; + return BitCast(d, VFromD<decltype(du8)>{detail::BitCastToByte(v).raw}); +} + +// 16-byte vector to 16-byte vector: same as BitCast +template <class D, class FromV, HWY_IF_V_SIZE_V(FromV, 16), + HWY_IF_V_SIZE_D(D, 16)> +HWY_API VFromD<D> ResizeBitCast(D d, FromV v) { + return BitCast(d, v); +} + +// 16-byte vector to <= 8-byte vector +template <class D, class FromV, HWY_IF_V_SIZE_V(FromV, 16), + HWY_IF_V_SIZE_LE_D(D, 8)> +HWY_API VFromD<D> ResizeBitCast(D d, FromV v) { + const DFromV<decltype(v)> d_from; + const Half<decltype(d_from)> dh_from; + return ResizeBitCast(d, LowerHalf(dh_from, v)); +} + +// <= 8-bit vector to 16-byte vector +template <class D, class FromV, HWY_IF_V_SIZE_LE_V(FromV, 8), + HWY_IF_V_SIZE_D(D, 16)> +HWY_API VFromD<D> ResizeBitCast(D d, FromV v) { + const Full64<TFromV<FromV>> d_full64_from; + const Full128<TFromV<FromV>> d_full128_from; + return BitCast(d, Combine(d_full128_from, Zero(d_full64_from), + ResizeBitCast(d_full64_from, v))); +} + +// ------------------------------ GetLane + +namespace detail { +#define HWY_NEON_BUILD_TPL_HWY_GET template <size_t kLane> 
+#define HWY_NEON_BUILD_RET_HWY_GET(type, size) type##_t +#define HWY_NEON_BUILD_PARAM_HWY_GET(type, size) Vec128<type##_t, size> v +#define HWY_NEON_BUILD_ARG_HWY_GET v.raw, kLane + +HWY_NEON_DEF_FUNCTION_ALL_TYPES(GetLane, vget, _lane_, HWY_GET) + +#undef HWY_NEON_BUILD_TPL_HWY_GET +#undef HWY_NEON_BUILD_RET_HWY_GET +#undef HWY_NEON_BUILD_PARAM_HWY_GET +#undef HWY_NEON_BUILD_ARG_HWY_GET + +} // namespace detail + +template <class V> +HWY_API TFromV<V> GetLane(const V v) { + return detail::GetLane<0>(v); +} + +// ------------------------------ ExtractLane + +// Requires one overload per vector length because GetLane<3> is a compile error +// if v is a uint32x2_t. +template <typename T> +HWY_API T ExtractLane(const Vec128<T, 1> v, size_t i) { + HWY_DASSERT(i == 0); + (void)i; + return detail::GetLane<0>(v); +} + +template <typename T> +HWY_API T ExtractLane(const Vec128<T, 2> v, size_t i) { +#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang + if (__builtin_constant_p(i)) { + switch (i) { + case 0: + return detail::GetLane<0>(v); + case 1: + return detail::GetLane<1>(v); + } + } +#endif + alignas(16) T lanes[2]; + Store(v, DFromV<decltype(v)>(), lanes); + return lanes[i]; +} + +template <typename T> +HWY_API T ExtractLane(const Vec128<T, 4> v, size_t i) { +#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang + if (__builtin_constant_p(i)) { + switch (i) { + case 0: + return detail::GetLane<0>(v); + case 1: + return detail::GetLane<1>(v); + case 2: + return detail::GetLane<2>(v); + case 3: + return detail::GetLane<3>(v); + } + } +#endif + alignas(16) T lanes[4]; + Store(v, DFromV<decltype(v)>(), lanes); + return lanes[i]; +} + +template <typename T> +HWY_API T ExtractLane(const Vec128<T, 8> v, size_t i) { +#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang + if (__builtin_constant_p(i)) { + switch (i) { + case 0: + return detail::GetLane<0>(v); + case 1: + return detail::GetLane<1>(v); + case 2: + return detail::GetLane<2>(v); + case 
3: + return detail::GetLane<3>(v); + case 4: + return detail::GetLane<4>(v); + case 5: + return detail::GetLane<5>(v); + case 6: + return detail::GetLane<6>(v); + case 7: + return detail::GetLane<7>(v); + } + } +#endif + alignas(16) T lanes[8]; + Store(v, DFromV<decltype(v)>(), lanes); + return lanes[i]; +} + +template <typename T> +HWY_API T ExtractLane(const Vec128<T, 16> v, size_t i) { +#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang + if (__builtin_constant_p(i)) { + switch (i) { + case 0: + return detail::GetLane<0>(v); + case 1: + return detail::GetLane<1>(v); + case 2: + return detail::GetLane<2>(v); + case 3: + return detail::GetLane<3>(v); + case 4: + return detail::GetLane<4>(v); + case 5: + return detail::GetLane<5>(v); + case 6: + return detail::GetLane<6>(v); + case 7: + return detail::GetLane<7>(v); + case 8: + return detail::GetLane<8>(v); + case 9: + return detail::GetLane<9>(v); + case 10: + return detail::GetLane<10>(v); + case 11: + return detail::GetLane<11>(v); + case 12: + return detail::GetLane<12>(v); + case 13: + return detail::GetLane<13>(v); + case 14: + return detail::GetLane<14>(v); + case 15: + return detail::GetLane<15>(v); + } + } +#endif + alignas(16) T lanes[16]; + Store(v, DFromV<decltype(v)>(), lanes); + return lanes[i]; +} + +// ------------------------------ InsertLane + +namespace detail { +#define HWY_NEON_BUILD_TPL_HWY_INSERT template <size_t kLane> +#define HWY_NEON_BUILD_RET_HWY_INSERT(type, size) Vec128<type##_t, size> +#define HWY_NEON_BUILD_PARAM_HWY_INSERT(type, size) \ + Vec128<type##_t, size> v, type##_t t +#define HWY_NEON_BUILD_ARG_HWY_INSERT t, v.raw, kLane + +HWY_NEON_DEF_FUNCTION_ALL_TYPES(InsertLane, vset, _lane_, HWY_INSERT) + +#undef HWY_NEON_BUILD_TPL_HWY_INSERT +#undef HWY_NEON_BUILD_RET_HWY_INSERT +#undef HWY_NEON_BUILD_PARAM_HWY_INSERT +#undef HWY_NEON_BUILD_ARG_HWY_INSERT + +} // namespace detail + +// Requires one overload per vector length because InsertLane<3> may be a +// compile error. 
+ +template <typename T> +HWY_API Vec128<T, 1> InsertLane(const Vec128<T, 1> v, size_t i, T t) { + HWY_DASSERT(i == 0); + (void)i; + return Set(DFromV<decltype(v)>(), t); +} + +template <typename T> +HWY_API Vec128<T, 2> InsertLane(const Vec128<T, 2> v, size_t i, T t) { +#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang + if (__builtin_constant_p(i)) { + switch (i) { + case 0: + return detail::InsertLane<0>(v, t); + case 1: + return detail::InsertLane<1>(v, t); + } + } +#endif + const DFromV<decltype(v)> d; + alignas(16) T lanes[2]; + Store(v, d, lanes); + lanes[i] = t; + return Load(d, lanes); +} + +template <typename T> +HWY_API Vec128<T, 4> InsertLane(const Vec128<T, 4> v, size_t i, T t) { +#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang + if (__builtin_constant_p(i)) { + switch (i) { + case 0: + return detail::InsertLane<0>(v, t); + case 1: + return detail::InsertLane<1>(v, t); + case 2: + return detail::InsertLane<2>(v, t); + case 3: + return detail::InsertLane<3>(v, t); + } + } +#endif + const DFromV<decltype(v)> d; + alignas(16) T lanes[4]; + Store(v, d, lanes); + lanes[i] = t; + return Load(d, lanes); +} + +template <typename T> +HWY_API Vec128<T, 8> InsertLane(const Vec128<T, 8> v, size_t i, T t) { +#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang + if (__builtin_constant_p(i)) { + switch (i) { + case 0: + return detail::InsertLane<0>(v, t); + case 1: + return detail::InsertLane<1>(v, t); + case 2: + return detail::InsertLane<2>(v, t); + case 3: + return detail::InsertLane<3>(v, t); + case 4: + return detail::InsertLane<4>(v, t); + case 5: + return detail::InsertLane<5>(v, t); + case 6: + return detail::InsertLane<6>(v, t); + case 7: + return detail::InsertLane<7>(v, t); + } + } +#endif + const DFromV<decltype(v)> d; + alignas(16) T lanes[8]; + Store(v, d, lanes); + lanes[i] = t; + return Load(d, lanes); +} + +template <typename T> +HWY_API Vec128<T, 16> InsertLane(const Vec128<T, 16> v, size_t i, T t) { +#if 
!HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang + if (__builtin_constant_p(i)) { + switch (i) { + case 0: + return detail::InsertLane<0>(v, t); + case 1: + return detail::InsertLane<1>(v, t); + case 2: + return detail::InsertLane<2>(v, t); + case 3: + return detail::InsertLane<3>(v, t); + case 4: + return detail::InsertLane<4>(v, t); + case 5: + return detail::InsertLane<5>(v, t); + case 6: + return detail::InsertLane<6>(v, t); + case 7: + return detail::InsertLane<7>(v, t); + case 8: + return detail::InsertLane<8>(v, t); + case 9: + return detail::InsertLane<9>(v, t); + case 10: + return detail::InsertLane<10>(v, t); + case 11: + return detail::InsertLane<11>(v, t); + case 12: + return detail::InsertLane<12>(v, t); + case 13: + return detail::InsertLane<13>(v, t); + case 14: + return detail::InsertLane<14>(v, t); + case 15: + return detail::InsertLane<15>(v, t); + } + } +#endif + const DFromV<decltype(v)> d; + alignas(16) T lanes[16]; + Store(v, d, lanes); + lanes[i] = t; + return Load(d, lanes); +} + +// ================================================== ARITHMETIC + +// ------------------------------ Addition +HWY_NEON_DEF_FUNCTION_ALL_TYPES(operator+, vadd, _, 2) + +// ------------------------------ Subtraction +HWY_NEON_DEF_FUNCTION_ALL_TYPES(operator-, vsub, _, 2) + +// ------------------------------ SumsOf8 + +HWY_API Vec128<uint64_t> SumsOf8(const Vec128<uint8_t> v) { + return Vec128<uint64_t>(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v.raw)))); +} +HWY_API Vec64<uint64_t> SumsOf8(const Vec64<uint8_t> v) { + return Vec64<uint64_t>(vpaddl_u32(vpaddl_u16(vpaddl_u8(v.raw)))); +} + +// ------------------------------ SaturatedAdd + +#ifdef HWY_NATIVE_I32_SATURATED_ADDSUB +#undef HWY_NATIVE_I32_SATURATED_ADDSUB +#else +#define HWY_NATIVE_I32_SATURATED_ADDSUB +#endif + +#ifdef HWY_NATIVE_U32_SATURATED_ADDSUB +#undef HWY_NATIVE_U32_SATURATED_ADDSUB +#else +#define HWY_NATIVE_U32_SATURATED_ADDSUB +#endif + +#ifdef HWY_NATIVE_I64_SATURATED_ADDSUB +#undef 
HWY_NATIVE_I64_SATURATED_ADDSUB +#else +#define HWY_NATIVE_I64_SATURATED_ADDSUB +#endif + +#ifdef HWY_NATIVE_U64_SATURATED_ADDSUB +#undef HWY_NATIVE_U64_SATURATED_ADDSUB +#else +#define HWY_NATIVE_U64_SATURATED_ADDSUB +#endif + +// Returns a + b clamped to the destination range. +HWY_NEON_DEF_FUNCTION_INTS_UINTS(SaturatedAdd, vqadd, _, 2) + +// ------------------------------ SaturatedSub + +// Returns a - b clamped to the destination range. +HWY_NEON_DEF_FUNCTION_INTS_UINTS(SaturatedSub, vqsub, _, 2) + +// ------------------------------ Average + +// Returns (a + b + 1) / 2 +HWY_NEON_DEF_FUNCTION_UINT_8(AverageRound, vrhadd, _, 2) +HWY_NEON_DEF_FUNCTION_UINT_16(AverageRound, vrhadd, _, 2) + +// ------------------------------ Neg + +HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Neg, vneg, _, 1) +HWY_NEON_DEF_FUNCTION_INT_8_16_32(Neg, vneg, _, 1) // i64 implemented below + +HWY_API Vec64<int64_t> Neg(const Vec64<int64_t> v) { +#if HWY_ARCH_ARM_A64 + return Vec64<int64_t>(vneg_s64(v.raw)); +#else + return Zero(DFromV<decltype(v)>()) - v; +#endif +} + +HWY_API Vec128<int64_t> Neg(const Vec128<int64_t> v) { +#if HWY_ARCH_ARM_A64 + return Vec128<int64_t>(vnegq_s64(v.raw)); +#else + return Zero(DFromV<decltype(v)>()) - v; +#endif +} + +// ------------------------------ ShiftLeft + +// Customize HWY_NEON_DEF_FUNCTION to special-case count=0 (not supported). +#pragma push_macro("HWY_NEON_DEF_FUNCTION") +#undef HWY_NEON_DEF_FUNCTION +#define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args) \ + template <int kBits> \ + HWY_API Vec128<type##_t, size> name(const Vec128<type##_t, size> v) { \ + return kBits == 0 ? 
v \ + : Vec128<type##_t, size>(HWY_NEON_EVAL( \ + prefix##infix##suffix, v.raw, HWY_MAX(1, kBits))); \ + } + +HWY_NEON_DEF_FUNCTION_INTS_UINTS(ShiftLeft, vshl, _n_, ignored) + +HWY_NEON_DEF_FUNCTION_UINTS(ShiftRight, vshr, _n_, ignored) +HWY_NEON_DEF_FUNCTION_INTS(ShiftRight, vshr, _n_, ignored) + +#pragma pop_macro("HWY_NEON_DEF_FUNCTION") + +// ------------------------------ RotateRight (ShiftRight, Or) +template <int kBits, typename T, size_t N> +HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) { + constexpr size_t kSizeInBits = sizeof(T) * 8; + static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count"); + if (kBits == 0) return v; + return Or(ShiftRight<kBits>(v), + ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v)); +} + +// NOTE: vxarq_u64 can be applied to uint64_t, but we do not yet have a +// mechanism for checking for extensions to Armv8. + +// ------------------------------ Shl + +HWY_API Vec128<uint8_t> operator<<(Vec128<uint8_t> v, Vec128<uint8_t> bits) { + return Vec128<uint8_t>(vshlq_u8(v.raw, vreinterpretq_s8_u8(bits.raw))); +} +template <size_t N, HWY_IF_V_SIZE_LE(uint8_t, N, 8)> +HWY_API Vec128<uint8_t, N> operator<<(Vec128<uint8_t, N> v, + Vec128<uint8_t, N> bits) { + return Vec128<uint8_t, N>(vshl_u8(v.raw, vreinterpret_s8_u8(bits.raw))); +} + +HWY_API Vec128<uint16_t> operator<<(Vec128<uint16_t> v, Vec128<uint16_t> bits) { + return Vec128<uint16_t>(vshlq_u16(v.raw, vreinterpretq_s16_u16(bits.raw))); +} +template <size_t N, HWY_IF_V_SIZE_LE(uint16_t, N, 8)> +HWY_API Vec128<uint16_t, N> operator<<(Vec128<uint16_t, N> v, + Vec128<uint16_t, N> bits) { + return Vec128<uint16_t, N>(vshl_u16(v.raw, vreinterpret_s16_u16(bits.raw))); +} + +HWY_API Vec128<uint32_t> operator<<(Vec128<uint32_t> v, Vec128<uint32_t> bits) { + return Vec128<uint32_t>(vshlq_u32(v.raw, vreinterpretq_s32_u32(bits.raw))); +} +template <size_t N, HWY_IF_V_SIZE_LE(uint32_t, N, 8)> +HWY_API Vec128<uint32_t, N> operator<<(Vec128<uint32_t, N> v, + 
Vec128<uint32_t, N> bits) { + return Vec128<uint32_t, N>(vshl_u32(v.raw, vreinterpret_s32_u32(bits.raw))); +} + +HWY_API Vec128<uint64_t> operator<<(Vec128<uint64_t> v, Vec128<uint64_t> bits) { + return Vec128<uint64_t>(vshlq_u64(v.raw, vreinterpretq_s64_u64(bits.raw))); +} +HWY_API Vec64<uint64_t> operator<<(Vec64<uint64_t> v, Vec64<uint64_t> bits) { + return Vec64<uint64_t>(vshl_u64(v.raw, vreinterpret_s64_u64(bits.raw))); +} + +HWY_API Vec128<int8_t> operator<<(Vec128<int8_t> v, Vec128<int8_t> bits) { + return Vec128<int8_t>(vshlq_s8(v.raw, bits.raw)); +} +template <size_t N, HWY_IF_V_SIZE_LE(int8_t, N, 8)> +HWY_API Vec128<int8_t, N> operator<<(Vec128<int8_t, N> v, + Vec128<int8_t, N> bits) { + return Vec128<int8_t, N>(vshl_s8(v.raw, bits.raw)); +} + +HWY_API Vec128<int16_t> operator<<(Vec128<int16_t> v, Vec128<int16_t> bits) { + return Vec128<int16_t>(vshlq_s16(v.raw, bits.raw)); +} +template <size_t N, HWY_IF_V_SIZE_LE(int16_t, N, 8)> +HWY_API Vec128<int16_t, N> operator<<(Vec128<int16_t, N> v, + Vec128<int16_t, N> bits) { + return Vec128<int16_t, N>(vshl_s16(v.raw, bits.raw)); +} + +HWY_API Vec128<int32_t> operator<<(Vec128<int32_t> v, Vec128<int32_t> bits) { + return Vec128<int32_t>(vshlq_s32(v.raw, bits.raw)); +} +template <size_t N, HWY_IF_V_SIZE_LE(int32_t, N, 8)> +HWY_API Vec128<int32_t, N> operator<<(Vec128<int32_t, N> v, + Vec128<int32_t, N> bits) { + return Vec128<int32_t, N>(vshl_s32(v.raw, bits.raw)); +} + +HWY_API Vec128<int64_t> operator<<(Vec128<int64_t> v, Vec128<int64_t> bits) { + return Vec128<int64_t>(vshlq_s64(v.raw, bits.raw)); +} +HWY_API Vec64<int64_t> operator<<(Vec64<int64_t> v, Vec64<int64_t> bits) { + return Vec64<int64_t>(vshl_s64(v.raw, bits.raw)); +} + +// ------------------------------ Shr (Neg) + +HWY_API Vec128<uint8_t> operator>>(Vec128<uint8_t> v, Vec128<uint8_t> bits) { + const RebindToSigned<DFromV<decltype(v)>> di; + const int8x16_t neg_bits = Neg(BitCast(di, bits)).raw; + return Vec128<uint8_t>(vshlq_u8(v.raw, neg_bits)); 
+} +template <size_t N, HWY_IF_V_SIZE_LE(uint8_t, N, 8)> +HWY_API Vec128<uint8_t, N> operator>>(Vec128<uint8_t, N> v, + Vec128<uint8_t, N> bits) { + const RebindToSigned<DFromV<decltype(v)>> di; + const int8x8_t neg_bits = Neg(BitCast(di, bits)).raw; + return Vec128<uint8_t, N>(vshl_u8(v.raw, neg_bits)); +} + +HWY_API Vec128<uint16_t> operator>>(Vec128<uint16_t> v, Vec128<uint16_t> bits) { + const RebindToSigned<DFromV<decltype(v)>> di; + const int16x8_t neg_bits = Neg(BitCast(di, bits)).raw; + return Vec128<uint16_t>(vshlq_u16(v.raw, neg_bits)); +} +template <size_t N, HWY_IF_V_SIZE_LE(uint16_t, N, 8)> +HWY_API Vec128<uint16_t, N> operator>>(Vec128<uint16_t, N> v, + Vec128<uint16_t, N> bits) { + const RebindToSigned<DFromV<decltype(v)>> di; + const int16x4_t neg_bits = Neg(BitCast(di, bits)).raw; + return Vec128<uint16_t, N>(vshl_u16(v.raw, neg_bits)); +} + +HWY_API Vec128<uint32_t> operator>>(Vec128<uint32_t> v, Vec128<uint32_t> bits) { + const RebindToSigned<DFromV<decltype(v)>> di; + const int32x4_t neg_bits = Neg(BitCast(di, bits)).raw; + return Vec128<uint32_t>(vshlq_u32(v.raw, neg_bits)); +} +template <size_t N, HWY_IF_V_SIZE_LE(uint32_t, N, 8)> +HWY_API Vec128<uint32_t, N> operator>>(Vec128<uint32_t, N> v, + Vec128<uint32_t, N> bits) { + const RebindToSigned<DFromV<decltype(v)>> di; + const int32x2_t neg_bits = Neg(BitCast(di, bits)).raw; + return Vec128<uint32_t, N>(vshl_u32(v.raw, neg_bits)); +} + +HWY_API Vec128<uint64_t> operator>>(Vec128<uint64_t> v, Vec128<uint64_t> bits) { + const RebindToSigned<DFromV<decltype(v)>> di; + const int64x2_t neg_bits = Neg(BitCast(di, bits)).raw; + return Vec128<uint64_t>(vshlq_u64(v.raw, neg_bits)); +} +HWY_API Vec64<uint64_t> operator>>(Vec64<uint64_t> v, Vec64<uint64_t> bits) { + const RebindToSigned<DFromV<decltype(v)>> di; + const int64x1_t neg_bits = Neg(BitCast(di, bits)).raw; + return Vec64<uint64_t>(vshl_u64(v.raw, neg_bits)); +} + +HWY_API Vec128<int8_t> operator>>(Vec128<int8_t> v, Vec128<int8_t> bits) { + 
return Vec128<int8_t>(vshlq_s8(v.raw, Neg(bits).raw)); +} +template <size_t N, HWY_IF_V_SIZE_LE(int8_t, N, 8)> +HWY_API Vec128<int8_t, N> operator>>(Vec128<int8_t, N> v, + Vec128<int8_t, N> bits) { + return Vec128<int8_t, N>(vshl_s8(v.raw, Neg(bits).raw)); +} + +HWY_API Vec128<int16_t> operator>>(Vec128<int16_t> v, Vec128<int16_t> bits) { + return Vec128<int16_t>(vshlq_s16(v.raw, Neg(bits).raw)); +} +template <size_t N, HWY_IF_V_SIZE_LE(int16_t, N, 8)> +HWY_API Vec128<int16_t, N> operator>>(Vec128<int16_t, N> v, + Vec128<int16_t, N> bits) { + return Vec128<int16_t, N>(vshl_s16(v.raw, Neg(bits).raw)); +} + +HWY_API Vec128<int32_t> operator>>(Vec128<int32_t> v, Vec128<int32_t> bits) { + return Vec128<int32_t>(vshlq_s32(v.raw, Neg(bits).raw)); +} +template <size_t N, HWY_IF_V_SIZE_LE(int32_t, N, 8)> +HWY_API Vec128<int32_t, N> operator>>(Vec128<int32_t, N> v, + Vec128<int32_t, N> bits) { + return Vec128<int32_t, N>(vshl_s32(v.raw, Neg(bits).raw)); +} + +HWY_API Vec128<int64_t> operator>>(Vec128<int64_t> v, Vec128<int64_t> bits) { + return Vec128<int64_t>(vshlq_s64(v.raw, Neg(bits).raw)); +} +HWY_API Vec64<int64_t> operator>>(Vec64<int64_t> v, Vec64<int64_t> bits) { + return Vec64<int64_t>(vshl_s64(v.raw, Neg(bits).raw)); +} + +// ------------------------------ ShiftLeftSame (Shl) + +template <typename T, size_t N> +HWY_API Vec128<T, N> ShiftLeftSame(const Vec128<T, N> v, int bits) { + return v << Set(DFromV<decltype(v)>(), static_cast<T>(bits)); +} +template <typename T, size_t N> +HWY_API Vec128<T, N> ShiftRightSame(const Vec128<T, N> v, int bits) { + return v >> Set(DFromV<decltype(v)>(), static_cast<T>(bits)); +} + +// ------------------------------ Int/float multiplication + +// Per-target flag to prevent generic_ops-inl.h from defining 8-bit operator*. 
+#ifdef HWY_NATIVE_MUL_8 +#undef HWY_NATIVE_MUL_8 +#else +#define HWY_NATIVE_MUL_8 +#endif + +// All except ui64 +HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator*, vmul, _, 2) +HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator*, vmul, _, 2) +HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator*, vmul, _, 2) + +// ------------------------------ Integer multiplication + +// Returns the upper 16 bits of a * b in each lane. +HWY_API Vec128<int16_t> MulHigh(Vec128<int16_t> a, Vec128<int16_t> b) { + int32x4_t rlo = vmull_s16(vget_low_s16(a.raw), vget_low_s16(b.raw)); +#if HWY_ARCH_ARM_A64 + int32x4_t rhi = vmull_high_s16(a.raw, b.raw); +#else + int32x4_t rhi = vmull_s16(vget_high_s16(a.raw), vget_high_s16(b.raw)); +#endif + return Vec128<int16_t>( + vuzp2q_s16(vreinterpretq_s16_s32(rlo), vreinterpretq_s16_s32(rhi))); +} +HWY_API Vec128<uint16_t> MulHigh(Vec128<uint16_t> a, Vec128<uint16_t> b) { + uint32x4_t rlo = vmull_u16(vget_low_u16(a.raw), vget_low_u16(b.raw)); +#if HWY_ARCH_ARM_A64 + uint32x4_t rhi = vmull_high_u16(a.raw, b.raw); +#else + uint32x4_t rhi = vmull_u16(vget_high_u16(a.raw), vget_high_u16(b.raw)); +#endif + return Vec128<uint16_t>( + vuzp2q_u16(vreinterpretq_u16_u32(rlo), vreinterpretq_u16_u32(rhi))); +} + +template <size_t N, HWY_IF_V_SIZE_LE(int16_t, N, 8)> +HWY_API Vec128<int16_t, N> MulHigh(Vec128<int16_t, N> a, Vec128<int16_t, N> b) { + int16x8_t hi_lo = vreinterpretq_s16_s32(vmull_s16(a.raw, b.raw)); + return Vec128<int16_t, N>(vget_low_s16(vuzp2q_s16(hi_lo, hi_lo))); +} +template <size_t N, HWY_IF_V_SIZE_LE(uint16_t, N, 8)> +HWY_API Vec128<uint16_t, N> MulHigh(Vec128<uint16_t, N> a, + Vec128<uint16_t, N> b) { + uint16x8_t hi_lo = vreinterpretq_u16_u32(vmull_u16(a.raw, b.raw)); + return Vec128<uint16_t, N>(vget_low_u16(vuzp2q_u16(hi_lo, hi_lo))); +} + +HWY_API Vec128<int16_t> MulFixedPoint15(Vec128<int16_t> a, Vec128<int16_t> b) { + return Vec128<int16_t>(vqrdmulhq_s16(a.raw, b.raw)); +} +template <size_t N, HWY_IF_V_SIZE_LE(int16_t, N, 8)> +HWY_API Vec128<int16_t, N> 
MulFixedPoint15(Vec128<int16_t, N> a, + Vec128<int16_t, N> b) { + return Vec128<int16_t, N>(vqrdmulh_s16(a.raw, b.raw)); +} + +// ------------------------------ Floating-point division + +// Approximate reciprocal +HWY_API Vec128<float> ApproximateReciprocal(const Vec128<float> v) { + return Vec128<float>(vrecpeq_f32(v.raw)); +} +template <size_t N> +HWY_API Vec128<float, N> ApproximateReciprocal(const Vec128<float, N> v) { + return Vec128<float, N>(vrecpe_f32(v.raw)); +} + +#if HWY_ARCH_ARM_A64 +HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator/, vdiv, _, 2) +#else +// Not defined on armv7: approximate +namespace detail { + +HWY_INLINE Vec128<float> ReciprocalNewtonRaphsonStep( + const Vec128<float> recip, const Vec128<float> divisor) { + return Vec128<float>(vrecpsq_f32(recip.raw, divisor.raw)); +} +template <size_t N> +HWY_INLINE Vec128<float, N> ReciprocalNewtonRaphsonStep( + const Vec128<float, N> recip, Vec128<float, N> divisor) { + return Vec128<float, N>(vrecps_f32(recip.raw, divisor.raw)); +} + +} // namespace detail + +template <size_t N> +HWY_API Vec128<float, N> operator/(Vec128<float, N> a, Vec128<float, N> b) { + auto x = ApproximateReciprocal(b); + x *= detail::ReciprocalNewtonRaphsonStep(x, b); + x *= detail::ReciprocalNewtonRaphsonStep(x, b); + x *= detail::ReciprocalNewtonRaphsonStep(x, b); + return a * x; +} +#endif + +// ------------------------------ Absolute value of difference. 
+ +HWY_API Vec128<float> AbsDiff(const Vec128<float> a, const Vec128<float> b) { + return Vec128<float>(vabdq_f32(a.raw, b.raw)); +} +template <size_t N, HWY_IF_V_SIZE_LE(float, N, 8)> +HWY_API Vec128<float, N> AbsDiff(const Vec128<float, N> a, + const Vec128<float, N> b) { + return Vec128<float, N>(vabd_f32(a.raw, b.raw)); +} + +#ifdef HWY_NATIVE_INTEGER_ABS_DIFF +#undef HWY_NATIVE_INTEGER_ABS_DIFF +#else +#define HWY_NATIVE_INTEGER_ABS_DIFF +#endif + +HWY_API Vec128<int8_t> AbsDiff(const Vec128<int8_t> a, const Vec128<int8_t> b) { + return Vec128<int8_t>(vabdq_s8(a.raw, b.raw)); +} + +HWY_API Vec128<uint8_t> AbsDiff(const Vec128<uint8_t> a, + const Vec128<uint8_t> b) { + return Vec128<uint8_t>(vabdq_u8(a.raw, b.raw)); +} + +template <size_t N, HWY_IF_V_SIZE_LE(uint8_t, N, 8)> +HWY_API Vec128<int8_t, N> AbsDiff(const Vec128<int8_t, N> a, + const Vec128<int8_t, N> b) { + return Vec128<int8_t, N>(vabd_s8(a.raw, b.raw)); +} + +template <size_t N, HWY_IF_V_SIZE_LE(uint8_t, N, 8)> +HWY_API Vec128<uint8_t, N> AbsDiff(const Vec128<uint8_t, N> a, + const Vec128<uint8_t, N> b) { + return Vec128<uint8_t, N>(vabd_u8(a.raw, b.raw)); +} + +HWY_API Vec128<int16_t> AbsDiff(const Vec128<int16_t> a, + const Vec128<int16_t> b) { + return Vec128<int16_t>(vabdq_s16(a.raw, b.raw)); +} + +HWY_API Vec128<uint16_t> AbsDiff(const Vec128<uint16_t> a, + const Vec128<uint16_t> b) { + return Vec128<uint16_t>(vabdq_u16(a.raw, b.raw)); +} + +template <size_t N, HWY_IF_V_SIZE_LE(uint16_t, N, 8)> +HWY_API Vec128<int16_t, N> AbsDiff(const Vec128<int16_t, N> a, + const Vec128<int16_t, N> b) { + return Vec128<int16_t, N>(vabd_s16(a.raw, b.raw)); +} + +template <size_t N, HWY_IF_V_SIZE_LE(uint16_t, N, 8)> +HWY_API Vec128<uint16_t, N> AbsDiff(const Vec128<uint16_t, N> a, + const Vec128<uint16_t, N> b) { + return Vec128<uint16_t, N>(vabd_u16(a.raw, b.raw)); +} + +HWY_API Vec128<int32_t> AbsDiff(const Vec128<int32_t> a, + const Vec128<int32_t> b) { + return Vec128<int32_t>(vabdq_s32(a.raw, b.raw)); +} + 
+HWY_API Vec128<uint32_t> AbsDiff(const Vec128<uint32_t> a, + const Vec128<uint32_t> b) { + return Vec128<uint32_t>(vabdq_u32(a.raw, b.raw)); +} + +template <size_t N, HWY_IF_V_SIZE_LE(uint32_t, N, 8)> +HWY_API Vec128<int32_t, N> AbsDiff(const Vec128<int32_t, N> a, + const Vec128<int32_t, N> b) { + return Vec128<int32_t, N>(vabd_s32(a.raw, b.raw)); +} + +template <size_t N, HWY_IF_V_SIZE_LE(uint32_t, N, 8)> +HWY_API Vec128<uint32_t, N> AbsDiff(const Vec128<uint32_t, N> a, + const Vec128<uint32_t, N> b) { + return Vec128<uint32_t, N>(vabd_u32(a.raw, b.raw)); +} + +// ------------------------------ Integer multiply-add + +// Per-target flag to prevent generic_ops-inl.h from defining int MulAdd. +#ifdef HWY_NATIVE_INT_FMA +#undef HWY_NATIVE_INT_FMA +#else +#define HWY_NATIVE_INT_FMA +#endif + +// Wrappers for changing argument order to what intrinsics expect. +namespace detail { +// All except ui64 +HWY_NEON_DEF_FUNCTION_UINT_8_16_32(MulAdd, vmla, _, 3) +HWY_NEON_DEF_FUNCTION_INT_8_16_32(MulAdd, vmla, _, 3) +HWY_NEON_DEF_FUNCTION_UINT_8_16_32(NegMulAdd, vmls, _, 3) +HWY_NEON_DEF_FUNCTION_INT_8_16_32(NegMulAdd, vmls, _, 3) +} // namespace detail + +template <typename T, size_t N, HWY_IF_NOT_FLOAT(T), HWY_IF_NOT_T_SIZE(T, 8)> +HWY_API Vec128<T, N> MulAdd(Vec128<T, N> mul, Vec128<T, N> x, + Vec128<T, N> add) { + return detail::MulAdd(add, mul, x); +} + +template <typename T, size_t N, HWY_IF_NOT_FLOAT(T), HWY_IF_NOT_T_SIZE(T, 8)> +HWY_API Vec128<T, N> NegMulAdd(Vec128<T, N> mul, Vec128<T, N> x, + Vec128<T, N> add) { + return detail::NegMulAdd(add, mul, x); +} + +// 64-bit integer +template <typename T, size_t N, HWY_IF_NOT_FLOAT(T), HWY_IF_T_SIZE(T, 8)> +HWY_API Vec128<T, N> MulAdd(Vec128<T, N> mul, Vec128<T, N> x, + Vec128<T, N> add) { + return Add(Mul(mul, x), add); +} + +template <typename T, size_t N, HWY_IF_NOT_FLOAT(T), HWY_IF_T_SIZE(T, 8)> +HWY_API Vec128<T, N> NegMulAdd(Vec128<T, N> mul, Vec128<T, N> x, + Vec128<T, N> add) { + return Sub(add, Mul(mul, x)); +} + 
+// ------------------------------ Floating-point multiply-add variants + +#if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64 + +template <size_t N, HWY_IF_V_SIZE_LE(float, N, 8)> +HWY_API Vec128<float, N> MulAdd(Vec128<float, N> mul, Vec128<float, N> x, + Vec128<float, N> add) { + return Vec128<float, N>(vfma_f32(add.raw, mul.raw, x.raw)); +} +HWY_API Vec128<float> MulAdd(Vec128<float> mul, Vec128<float> x, + Vec128<float> add) { + return Vec128<float>(vfmaq_f32(add.raw, mul.raw, x.raw)); +} + +template <size_t N, HWY_IF_V_SIZE_LE(float, N, 8)> +HWY_API Vec128<float, N> NegMulAdd(Vec128<float, N> mul, Vec128<float, N> x, + Vec128<float, N> add) { + return Vec128<float, N>(vfms_f32(add.raw, mul.raw, x.raw)); +} +HWY_API Vec128<float> NegMulAdd(Vec128<float> mul, Vec128<float> x, + Vec128<float> add) { + return Vec128<float>(vfmsq_f32(add.raw, mul.raw, x.raw)); +} + +#else // emulate + +template <size_t N> +HWY_API Vec128<float, N> MulAdd(Vec128<float, N> mul, Vec128<float, N> x, + Vec128<float, N> add) { + return mul * x + add; +} + +template <size_t N> +HWY_API Vec128<float, N> NegMulAdd(Vec128<float, N> mul, Vec128<float, N> x, + Vec128<float, N> add) { + return add - mul * x; +} + +#endif // defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64 + +#if HWY_ARCH_ARM_A64 + +HWY_API Vec64<double> MulAdd(Vec64<double> mul, Vec64<double> x, + Vec64<double> add) { + return Vec64<double>(vfma_f64(add.raw, mul.raw, x.raw)); +} +HWY_API Vec128<double> MulAdd(Vec128<double> mul, Vec128<double> x, + Vec128<double> add) { + return Vec128<double>(vfmaq_f64(add.raw, mul.raw, x.raw)); +} + +HWY_API Vec64<double> NegMulAdd(Vec64<double> mul, Vec64<double> x, + Vec64<double> add) { + return Vec64<double>(vfms_f64(add.raw, mul.raw, x.raw)); +} +HWY_API Vec128<double> NegMulAdd(Vec128<double> mul, Vec128<double> x, + Vec128<double> add) { + return Vec128<double>(vfmsq_f64(add.raw, mul.raw, x.raw)); +} + +#endif // HWY_ARCH_ARM_A64 + +template <typename T, size_t N> +HWY_API Vec128<T, N> 
MulSub(Vec128<T, N> mul, Vec128<T, N> x, + Vec128<T, N> sub) { + return MulAdd(mul, x, Neg(sub)); +} + +template <typename T, size_t N> +HWY_API Vec128<T, N> NegMulSub(Vec128<T, N> mul, Vec128<T, N> x, + Vec128<T, N> sub) { + return Neg(MulAdd(mul, x, sub)); +} + +// ------------------------------ Floating-point square root (IfThenZeroElse) + +// Approximate reciprocal square root +HWY_API Vec128<float> ApproximateReciprocalSqrt(const Vec128<float> v) { + return Vec128<float>(vrsqrteq_f32(v.raw)); +} +template <size_t N> +HWY_API Vec128<float, N> ApproximateReciprocalSqrt(const Vec128<float, N> v) { + return Vec128<float, N>(vrsqrte_f32(v.raw)); +} + +// Full precision square root +#if HWY_ARCH_ARM_A64 +HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Sqrt, vsqrt, _, 1) +#else +namespace detail { + +HWY_INLINE Vec128<float> ReciprocalSqrtStep(Vec128<float> root, + Vec128<float> recip) { + return Vec128<float>(vrsqrtsq_f32(root.raw, recip.raw)); +} +template <size_t N> +HWY_INLINE Vec128<float, N> ReciprocalSqrtStep(Vec128<float, N> root, + Vec128<float, N> recip) { + return Vec128<float, N>(vrsqrts_f32(root.raw, recip.raw)); +} + +} // namespace detail + +// Not defined on armv7: approximate +template <size_t N> +HWY_API Vec128<float, N> Sqrt(const Vec128<float, N> v) { + auto recip = ApproximateReciprocalSqrt(v); + + recip *= detail::ReciprocalSqrtStep(v * recip, recip); + recip *= detail::ReciprocalSqrtStep(v * recip, recip); + recip *= detail::ReciprocalSqrtStep(v * recip, recip); + + const auto root = v * recip; + return IfThenZeroElse(v == Zero(Simd<float, N, 0>()), root); +} +#endif + +// ================================================== LOGICAL + +// ------------------------------ Not + +// There is no 64-bit vmvn, so cast instead of using HWY_NEON_DEF_FUNCTION. 
+template <typename T> +HWY_API Vec128<T> Not(const Vec128<T> v) { + const DFromV<decltype(v)> d; + const Repartition<uint8_t, decltype(d)> d8; + return BitCast(d, Vec128<uint8_t>(vmvnq_u8(BitCast(d8, v).raw))); +} +template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)> +HWY_API Vec128<T, N> Not(const Vec128<T, N> v) { + const DFromV<decltype(v)> d; + const Repartition<uint8_t, decltype(d)> d8; + using V8 = decltype(Zero(d8)); + return BitCast(d, V8(vmvn_u8(BitCast(d8, v).raw))); +} + +// ------------------------------ And +HWY_NEON_DEF_FUNCTION_INTS_UINTS(And, vand, _, 2) + +// Uses the u32/64 defined above. +template <typename T, size_t N, HWY_IF_FLOAT(T)> +HWY_API Vec128<T, N> And(const Vec128<T, N> a, const Vec128<T, N> b) { + const DFromV<decltype(a)> d; + const RebindToUnsigned<decltype(d)> du; + return BitCast(d, BitCast(du, a) & BitCast(du, b)); +} + +// ------------------------------ AndNot + +namespace detail { +// reversed_andnot returns a & ~b. +HWY_NEON_DEF_FUNCTION_INTS_UINTS(reversed_andnot, vbic, _, 2) +} // namespace detail + +// Returns ~not_mask & mask. +template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)> +HWY_API Vec128<T, N> AndNot(const Vec128<T, N> not_mask, + const Vec128<T, N> mask) { + return detail::reversed_andnot(mask, not_mask); +} + +// Uses the u32/64 defined above. +template <typename T, size_t N, HWY_IF_FLOAT(T)> +HWY_API Vec128<T, N> AndNot(const Vec128<T, N> not_mask, + const Vec128<T, N> mask) { + const DFromV<decltype(mask)> d; + const RebindToUnsigned<decltype(d)> du; + VFromD<decltype(du)> ret = + detail::reversed_andnot(BitCast(du, mask), BitCast(du, not_mask)); + return BitCast(d, ret); +} + +// ------------------------------ Or + +HWY_NEON_DEF_FUNCTION_INTS_UINTS(Or, vorr, _, 2) + +// Uses the u32/64 defined above. 
+template <typename T, size_t N, HWY_IF_FLOAT(T)> +HWY_API Vec128<T, N> Or(const Vec128<T, N> a, const Vec128<T, N> b) { + const DFromV<decltype(a)> d; + const RebindToUnsigned<decltype(d)> du; + return BitCast(d, BitCast(du, a) | BitCast(du, b)); +} + +// ------------------------------ Xor + +HWY_NEON_DEF_FUNCTION_INTS_UINTS(Xor, veor, _, 2) + +// Uses the u32/64 defined above. +template <typename T, size_t N, HWY_IF_FLOAT(T)> +HWY_API Vec128<T, N> Xor(const Vec128<T, N> a, const Vec128<T, N> b) { + const DFromV<decltype(a)> d; + const RebindToUnsigned<decltype(d)> du; + return BitCast(d, BitCast(du, a) ^ BitCast(du, b)); +} + +// ------------------------------ Xor3 +#if HWY_ARCH_ARM_A64 && defined(__ARM_FEATURE_SHA3) +HWY_NEON_DEF_FUNCTION_FULL_UI(Xor3, veor3, _, 3) + +// Half vectors are not natively supported. Two Xor are likely more efficient +// than Combine to 128-bit. +template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8), HWY_IF_NOT_FLOAT(T)> +HWY_API Vec128<T, N> Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) { + return Xor(x1, Xor(x2, x3)); +} + +template <typename T, size_t N, HWY_IF_FLOAT(T)> +HWY_API Vec128<T, N> Xor3(const Vec128<T, N> x1, const Vec128<T, N> x2, + const Vec128<T, N> x3) { + const DFromV<decltype(x1)> d; + const RebindToUnsigned<decltype(d)> du; + return BitCast(d, Xor3(BitCast(du, x1), BitCast(du, x2), BitCast(du, x3))); +} + +#else +template <typename T, size_t N> +HWY_API Vec128<T, N> Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) { + return Xor(x1, Xor(x2, x3)); +} +#endif + +// ------------------------------ Or3 +template <typename T, size_t N> +HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) { + return Or(o1, Or(o2, o3)); +} + +// ------------------------------ OrAnd +template <typename T, size_t N> +HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) { + return Or(o, And(a1, a2)); +} + +// ------------------------------ IfVecThenElse +template <typename 
T, size_t N> +HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes, + Vec128<T, N> no) { + return IfThenElse(MaskFromVec(mask), yes, no); +} + +// ------------------------------ BitwiseIfThenElse + +#ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE +#undef HWY_NATIVE_BITWISE_IF_THEN_ELSE +#else +#define HWY_NATIVE_BITWISE_IF_THEN_ELSE +#endif + +template <class V> +HWY_API V BitwiseIfThenElse(V mask, V yes, V no) { + return IfVecThenElse(mask, yes, no); +} + +// ------------------------------ Operator overloads (internal-only if float) + +template <typename T, size_t N> +HWY_API Vec128<T, N> operator&(const Vec128<T, N> a, const Vec128<T, N> b) { + return And(a, b); +} + +template <typename T, size_t N> +HWY_API Vec128<T, N> operator|(const Vec128<T, N> a, const Vec128<T, N> b) { + return Or(a, b); +} + +template <typename T, size_t N> +HWY_API Vec128<T, N> operator^(const Vec128<T, N> a, const Vec128<T, N> b) { + return Xor(a, b); +} + +// ------------------------------ I64/U64 AbsDiff + +template <size_t N> +HWY_API Vec128<int64_t, N> AbsDiff(const Vec128<int64_t, N> a, + const Vec128<int64_t, N> b) { + return Max(a, b) - Min(a, b); +} + +template <size_t N> +HWY_API Vec128<uint64_t, N> AbsDiff(const Vec128<uint64_t, N> a, + const Vec128<uint64_t, N> b) { + return Or(SaturatedSub(a, b), SaturatedSub(b, a)); +} + +// ------------------------------ PopulationCount + +#ifdef HWY_NATIVE_POPCNT +#undef HWY_NATIVE_POPCNT +#else +#define HWY_NATIVE_POPCNT +#endif + +namespace detail { + +template <typename T> +HWY_INLINE Vec128<T> PopulationCount(hwy::SizeTag<1> /* tag */, Vec128<T> v) { + const Full128<uint8_t> d8; + return Vec128<T>(vcntq_u8(BitCast(d8, v).raw)); +} +template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)> +HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<1> /* tag */, + Vec128<T, N> v) { + const Simd<uint8_t, N, 0> d8; + return Vec128<T, N>(vcnt_u8(BitCast(d8, v).raw)); +} + +// NEON lacks popcount for lane sizes > 1, so take pairwise 
sums of the bytes. +template <typename T> +HWY_INLINE Vec128<T> PopulationCount(hwy::SizeTag<2> /* tag */, Vec128<T> v) { + const Full128<uint8_t> d8; + const uint8x16_t bytes = vcntq_u8(BitCast(d8, v).raw); + return Vec128<T>(vpaddlq_u8(bytes)); +} +template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)> +HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<2> /* tag */, + Vec128<T, N> v) { + const Repartition<uint8_t, DFromV<decltype(v)>> d8; + const uint8x8_t bytes = vcnt_u8(BitCast(d8, v).raw); + return Vec128<T, N>(vpaddl_u8(bytes)); +} + +template <typename T> +HWY_INLINE Vec128<T> PopulationCount(hwy::SizeTag<4> /* tag */, Vec128<T> v) { + const Full128<uint8_t> d8; + const uint8x16_t bytes = vcntq_u8(BitCast(d8, v).raw); + return Vec128<T>(vpaddlq_u16(vpaddlq_u8(bytes))); +} +template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)> +HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<4> /* tag */, + Vec128<T, N> v) { + const Repartition<uint8_t, DFromV<decltype(v)>> d8; + const uint8x8_t bytes = vcnt_u8(BitCast(d8, v).raw); + return Vec128<T, N>(vpaddl_u16(vpaddl_u8(bytes))); +} + +template <typename T> +HWY_INLINE Vec128<T> PopulationCount(hwy::SizeTag<8> /* tag */, Vec128<T> v) { + const Full128<uint8_t> d8; + const uint8x16_t bytes = vcntq_u8(BitCast(d8, v).raw); + return Vec128<T>(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(bytes)))); +} +template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)> +HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<8> /* tag */, + Vec128<T, N> v) { + const Repartition<uint8_t, DFromV<decltype(v)>> d8; + const uint8x8_t bytes = vcnt_u8(BitCast(d8, v).raw); + return Vec128<T, N>(vpaddl_u32(vpaddl_u16(vpaddl_u8(bytes)))); +} + +} // namespace detail + +template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)> +HWY_API Vec128<T, N> PopulationCount(Vec128<T, N> v) { + return detail::PopulationCount(hwy::SizeTag<sizeof(T)>(), v); +} + +// ================================================== SIGN + +// ------------------------------ 
Abs + +// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1. +HWY_API Vec128<int8_t> Abs(const Vec128<int8_t> v) { + return Vec128<int8_t>(vabsq_s8(v.raw)); +} +HWY_API Vec128<int16_t> Abs(const Vec128<int16_t> v) { + return Vec128<int16_t>(vabsq_s16(v.raw)); +} +HWY_API Vec128<int32_t> Abs(const Vec128<int32_t> v) { + return Vec128<int32_t>(vabsq_s32(v.raw)); +} +// i64 is implemented after BroadcastSignBit. +HWY_API Vec128<float> Abs(const Vec128<float> v) { + return Vec128<float>(vabsq_f32(v.raw)); +} + +template <size_t N, HWY_IF_V_SIZE_LE(int8_t, N, 8)> +HWY_API Vec128<int8_t, N> Abs(const Vec128<int8_t, N> v) { + return Vec128<int8_t, N>(vabs_s8(v.raw)); +} +template <size_t N, HWY_IF_V_SIZE_LE(int16_t, N, 8)> +HWY_API Vec128<int16_t, N> Abs(const Vec128<int16_t, N> v) { + return Vec128<int16_t, N>(vabs_s16(v.raw)); +} +template <size_t N, HWY_IF_V_SIZE_LE(int32_t, N, 8)> +HWY_API Vec128<int32_t, N> Abs(const Vec128<int32_t, N> v) { + return Vec128<int32_t, N>(vabs_s32(v.raw)); +} +template <size_t N, HWY_IF_V_SIZE_LE(float, N, 8)> +HWY_API Vec128<float, N> Abs(const Vec128<float, N> v) { + return Vec128<float, N>(vabs_f32(v.raw)); +} + +#if HWY_ARCH_ARM_A64 +HWY_API Vec128<double> Abs(const Vec128<double> v) { + return Vec128<double>(vabsq_f64(v.raw)); +} + +HWY_API Vec64<double> Abs(const Vec64<double> v) { + return Vec64<double>(vabs_f64(v.raw)); +} +#endif + +// ------------------------------ CopySign + +template <typename T, size_t N> +HWY_API Vec128<T, N> CopySign(Vec128<T, N> magn, Vec128<T, N> sign) { + static_assert(IsFloat<T>(), "Only makes sense for floating-point"); + const auto msb = SignBit(DFromV<decltype(magn)>()); + return Or(AndNot(msb, magn), And(msb, sign)); +} + +template <typename T, size_t N> +HWY_API Vec128<T, N> CopySignToAbs(Vec128<T, N> abs, Vec128<T, N> sign) { + static_assert(IsFloat<T>(), "Only makes sense for floating-point"); + return Or(abs, And(SignBit(DFromV<decltype(abs)>()), sign)); +} + +// 
------------------------------ BroadcastSignBit + +template <typename T, size_t N, HWY_IF_SIGNED(T)> +HWY_API Vec128<T, N> BroadcastSignBit(const Vec128<T, N> v) { + return ShiftRight<sizeof(T) * 8 - 1>(v); +} + +// ================================================== MASK + +// ------------------------------ To/from vector + +// Mask and Vec have the same representation (true = FF..FF). +template <typename T, size_t N> +HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) { + const Simd<MakeUnsigned<T>, N, 0> du; + return Mask128<T, N>(BitCast(du, v).raw); +} + +template <class D> +using MFromD = decltype(MaskFromVec(VFromD<D>())); + +template <class D> +HWY_API VFromD<D> VecFromMask(D d, const MFromD<D> m) { + // Raw type of masks is unsigned. + const RebindToUnsigned<D> du; + return BitCast(d, VFromD<decltype(du)>(m.raw)); +} + +// ------------------------------ RebindMask (MaskFromVec) + +template <typename TFrom, size_t NFrom, class DTo> +HWY_API MFromD<DTo> RebindMask(DTo /* tag */, Mask128<TFrom, NFrom> m) { + static_assert(sizeof(TFrom) == sizeof(TFromD<DTo>), "Must have same size"); + return MFromD<DTo>(m.raw); +} + +// ------------------------------ IfThenElse + +#define HWY_NEON_BUILD_TPL_HWY_IF +#define HWY_NEON_BUILD_RET_HWY_IF(type, size) Vec128<type##_t, size> +#define HWY_NEON_BUILD_PARAM_HWY_IF(type, size) \ + const Mask128<type##_t, size> mask, const Vec128<type##_t, size> yes, \ + const Vec128<type##_t, size> no +#define HWY_NEON_BUILD_ARG_HWY_IF mask.raw, yes.raw, no.raw + +HWY_NEON_DEF_FUNCTION_ALL_TYPES(IfThenElse, vbsl, _, HWY_IF) + +#undef HWY_NEON_BUILD_TPL_HWY_IF +#undef HWY_NEON_BUILD_RET_HWY_IF +#undef HWY_NEON_BUILD_PARAM_HWY_IF +#undef HWY_NEON_BUILD_ARG_HWY_IF + +// mask ? yes : 0 +template <typename T, size_t N> +HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) { + return yes & VecFromMask(DFromV<decltype(yes)>(), mask); +} + +// mask ? 
0 : no +template <typename T, size_t N> +HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) { + return AndNot(VecFromMask(DFromV<decltype(no)>(), mask), no); +} + +template <typename T, size_t N> +HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes, + Vec128<T, N> no) { + static_assert(IsSigned<T>(), "Only works for signed/float"); + const DFromV<decltype(no)> d; + const RebindToSigned<decltype(d)> di; + + Mask128<T, N> m = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v)))); + return IfThenElse(m, yes, no); +} + +template <typename T, size_t N> +HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) { + const auto zero = Zero(DFromV<decltype(v)>()); + return Max(zero, v); +} + +// ------------------------------ Mask logical + +template <typename T, size_t N> +HWY_API Mask128<T, N> Not(const Mask128<T, N> m) { + return MaskFromVec(Not(VecFromMask(DFromM<decltype(m)>(), m))); +} + +template <typename T, size_t N> +HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) { + const DFromM<decltype(a)> d; + return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b))); +} + +template <typename T, size_t N> +HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) { + const DFromM<decltype(a)> d; + return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b))); +} + +template <typename T, size_t N> +HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) { + const DFromM<decltype(a)> d; + return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b))); +} + +template <typename T, size_t N> +HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) { + const DFromM<decltype(a)> d; + return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b))); +} + +template <typename T, size_t N> +HWY_API Mask128<T, N> ExclusiveNeither(const Mask128<T, N> a, Mask128<T, N> b) { + const DFromM<decltype(a)> d; + return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b)))); +} + +// 
================================================== COMPARE + +// Comparisons fill a lane with 1-bits if the condition is true, else 0. + +// ------------------------------ Shuffle2301 (for i64 compares) + +// Swap 32-bit halves in 64-bits +HWY_API Vec64<uint32_t> Shuffle2301(const Vec64<uint32_t> v) { + return Vec64<uint32_t>(vrev64_u32(v.raw)); +} +HWY_API Vec64<int32_t> Shuffle2301(const Vec64<int32_t> v) { + return Vec64<int32_t>(vrev64_s32(v.raw)); +} +HWY_API Vec64<float> Shuffle2301(const Vec64<float> v) { + return Vec64<float>(vrev64_f32(v.raw)); +} +HWY_API Vec128<uint32_t> Shuffle2301(const Vec128<uint32_t> v) { + return Vec128<uint32_t>(vrev64q_u32(v.raw)); +} +HWY_API Vec128<int32_t> Shuffle2301(const Vec128<int32_t> v) { + return Vec128<int32_t>(vrev64q_s32(v.raw)); +} +HWY_API Vec128<float> Shuffle2301(const Vec128<float> v) { + return Vec128<float>(vrev64q_f32(v.raw)); +} + +#define HWY_NEON_BUILD_TPL_HWY_COMPARE +#define HWY_NEON_BUILD_RET_HWY_COMPARE(type, size) Mask128<type##_t, size> +#define HWY_NEON_BUILD_PARAM_HWY_COMPARE(type, size) \ + const Vec128<type##_t, size> a, const Vec128<type##_t, size> b +#define HWY_NEON_BUILD_ARG_HWY_COMPARE a.raw, b.raw + +// ------------------------------ Equality +HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator==, vceq, _, HWY_COMPARE) +#if HWY_ARCH_ARM_A64 +HWY_NEON_DEF_FUNCTION_INTS_UINTS(operator==, vceq, _, HWY_COMPARE) +#else +// No 64-bit comparisons on armv7: emulate them below, after Shuffle2301. 
+HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator==, vceq, _, HWY_COMPARE) +HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator==, vceq, _, HWY_COMPARE) +#endif + +// ------------------------------ Strict inequality (signed, float) +#if HWY_ARCH_ARM_A64 +HWY_NEON_DEF_FUNCTION_INTS_UINTS(operator<, vclt, _, HWY_COMPARE) +#else +HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator<, vclt, _, HWY_COMPARE) +HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator<, vclt, _, HWY_COMPARE) +#endif +HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<, vclt, _, HWY_COMPARE) + +// ------------------------------ Weak inequality (float) +#if HWY_ARCH_ARM_A64 +HWY_NEON_DEF_FUNCTION_INTS_UINTS(operator<=, vcle, _, HWY_COMPARE) +#else +HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator<=, vcle, _, HWY_COMPARE) +HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator<=, vcle, _, HWY_COMPARE) +#endif +HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<=, vcle, _, HWY_COMPARE) + +#undef HWY_NEON_BUILD_TPL_HWY_COMPARE +#undef HWY_NEON_BUILD_RET_HWY_COMPARE +#undef HWY_NEON_BUILD_PARAM_HWY_COMPARE +#undef HWY_NEON_BUILD_ARG_HWY_COMPARE + +// ------------------------------ Armv7 i64 compare (Shuffle2301, Eq) + +#if HWY_ARCH_ARM_V7 + +template <size_t N> +HWY_API Mask128<int64_t, N> operator==(const Vec128<int64_t, N> a, + const Vec128<int64_t, N> b) { + const Simd<int32_t, N * 2, 0> d32; + const Simd<int64_t, N, 0> d64; + const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b))); + const auto cmp64 = cmp32 & Shuffle2301(cmp32); + return MaskFromVec(BitCast(d64, cmp64)); +} + +template <size_t N> +HWY_API Mask128<uint64_t, N> operator==(const Vec128<uint64_t, N> a, + const Vec128<uint64_t, N> b) { + const Simd<uint32_t, N * 2, 0> d32; + const Simd<uint64_t, N, 0> d64; + const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b))); + const auto cmp64 = cmp32 & Shuffle2301(cmp32); + return MaskFromVec(BitCast(d64, cmp64)); +} + +HWY_API Mask128<int64_t> operator<(const Vec128<int64_t> a, + const Vec128<int64_t> b) { + const int64x2_t 
sub = vqsubq_s64(a.raw, b.raw); + return MaskFromVec(BroadcastSignBit(Vec128<int64_t>(sub))); +} +HWY_API Mask128<int64_t, 1> operator<(const Vec64<int64_t> a, + const Vec64<int64_t> b) { + const int64x1_t sub = vqsub_s64(a.raw, b.raw); + return MaskFromVec(BroadcastSignBit(Vec64<int64_t>(sub))); +} + +template <size_t N> +HWY_API Mask128<uint64_t, N> operator<(const Vec128<uint64_t, N> a, + const Vec128<uint64_t, N> b) { + const DFromV<decltype(a)> du; + const RebindToSigned<decltype(du)> di; + const Vec128<uint64_t, N> msb = AndNot(a, b) | AndNot(a ^ b, a - b); + return MaskFromVec(BitCast(du, BroadcastSignBit(BitCast(di, msb)))); +} + +template <size_t N> +HWY_API Mask128<int64_t, N> operator<=(const Vec128<int64_t, N> a, + const Vec128<int64_t, N> b) { + return Not(b < a); +} + +template <size_t N> +HWY_API Mask128<uint64_t, N> operator<=(const Vec128<uint64_t, N> a, + const Vec128<uint64_t, N> b) { + return Not(b < a); +} + +#endif + +// ------------------------------ operator!= (operator==) + +// Customize HWY_NEON_DEF_FUNCTION to call 2 functions. +#pragma push_macro("HWY_NEON_DEF_FUNCTION") +#undef HWY_NEON_DEF_FUNCTION +// This cannot have _any_ template argument (in x86_128 we can at least have N +// as an argument), otherwise it is not more specialized than rewritten +// operator== in C++20, leading to compile errors. 
+#define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args) \ + HWY_API Mask128<type##_t, size> name(Vec128<type##_t, size> a, \ + Vec128<type##_t, size> b) { \ + return Not(a == b); \ + } + +HWY_NEON_DEF_FUNCTION_ALL_TYPES(operator!=, ignored, ignored, ignored) + +#pragma pop_macro("HWY_NEON_DEF_FUNCTION") + +// ------------------------------ Reversed comparisons + +template <typename T, size_t N> +HWY_API Mask128<T, N> operator>(Vec128<T, N> a, Vec128<T, N> b) { + return operator<(b, a); +} +template <typename T, size_t N> +HWY_API Mask128<T, N> operator>=(Vec128<T, N> a, Vec128<T, N> b) { + return operator<=(b, a); +} + +// ------------------------------ FirstN (Iota, Lt) + +template <class D> +HWY_API MFromD<D> FirstN(D d, size_t num) { + const RebindToSigned<decltype(d)> di; // Signed comparisons are cheaper. + using TI = TFromD<decltype(di)>; + return RebindMask(d, detail::Iota0(di) < Set(di, static_cast<TI>(num))); +} + +// ------------------------------ TestBit (Eq) + +#define HWY_NEON_BUILD_TPL_HWY_TESTBIT +#define HWY_NEON_BUILD_RET_HWY_TESTBIT(type, size) Mask128<type##_t, size> +#define HWY_NEON_BUILD_PARAM_HWY_TESTBIT(type, size) \ + Vec128<type##_t, size> v, Vec128<type##_t, size> bit +#define HWY_NEON_BUILD_ARG_HWY_TESTBIT v.raw, bit.raw + +#if HWY_ARCH_ARM_A64 +HWY_NEON_DEF_FUNCTION_INTS_UINTS(TestBit, vtst, _, HWY_TESTBIT) +#else +// No 64-bit versions on armv7 +HWY_NEON_DEF_FUNCTION_UINT_8_16_32(TestBit, vtst, _, HWY_TESTBIT) +HWY_NEON_DEF_FUNCTION_INT_8_16_32(TestBit, vtst, _, HWY_TESTBIT) + +template <size_t N> +HWY_API Mask128<uint64_t, N> TestBit(Vec128<uint64_t, N> v, + Vec128<uint64_t, N> bit) { + return (v & bit) == bit; +} +template <size_t N> +HWY_API Mask128<int64_t, N> TestBit(Vec128<int64_t, N> v, + Vec128<int64_t, N> bit) { + return (v & bit) == bit; +} + +#endif +#undef HWY_NEON_BUILD_TPL_HWY_TESTBIT +#undef HWY_NEON_BUILD_RET_HWY_TESTBIT +#undef HWY_NEON_BUILD_PARAM_HWY_TESTBIT +#undef 
HWY_NEON_BUILD_ARG_HWY_TESTBIT + +// ------------------------------ Abs i64 (IfThenElse, BroadcastSignBit) +HWY_API Vec128<int64_t> Abs(const Vec128<int64_t> v) { +#if HWY_ARCH_ARM_A64 + return Vec128<int64_t>(vabsq_s64(v.raw)); +#else + const auto zero = Zero(DFromV<decltype(v)>()); + return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v); +#endif +} +HWY_API Vec64<int64_t> Abs(const Vec64<int64_t> v) { +#if HWY_ARCH_ARM_A64 + return Vec64<int64_t>(vabs_s64(v.raw)); +#else + const auto zero = Zero(DFromV<decltype(v)>()); + return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v); +#endif +} + +// ------------------------------ Min (IfThenElse, BroadcastSignBit) + +// Unsigned +HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Min, vmin, _, 2) + +template <size_t N> +HWY_API Vec128<uint64_t, N> Min(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) { +#if HWY_ARCH_ARM_A64 + return IfThenElse(b < a, b, a); +#else + const DFromV<decltype(a)> du; + const RebindToSigned<decltype(du)> di; + return BitCast(du, BitCast(di, a) - BitCast(di, SaturatedSub(a, b))); +#endif +} + +// Signed +HWY_NEON_DEF_FUNCTION_INT_8_16_32(Min, vmin, _, 2) + +template <size_t N> +HWY_API Vec128<int64_t, N> Min(Vec128<int64_t, N> a, Vec128<int64_t, N> b) { +#if HWY_ARCH_ARM_A64 + return IfThenElse(b < a, b, a); +#else + const Vec128<int64_t, N> sign = SaturatedSub(a, b); + return IfThenElse(MaskFromVec(BroadcastSignBit(sign)), a, b); +#endif +} + +// Float: IEEE minimumNumber on v8 +#if HWY_ARCH_ARM_A64 + +HWY_NEON_DEF_FUNCTION_FLOAT_32(Min, vminnm, _, 2) + +// GCC 6.5 and earlier are missing the 64-bit (non-q) intrinsic, so define +// in terms of the 128-bit intrinsic. 
+#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700 +namespace detail { + +template <class V, HWY_IF_V_SIZE_V(V, 8), HWY_IF_T_SIZE_V(V, 8)> +HWY_INLINE V F64Vec64Min(V a, V b) { + const DFromV<decltype(a)> d; + const Twice<decltype(d)> dt; + return LowerHalf(d, Min(ZeroExtendVector(dt, a), ZeroExtendVector(dt, b))); +} + +} // namespace detail +#endif // HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700 + +HWY_API Vec64<double> Min(Vec64<double> a, Vec64<double> b) { +#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700 + return detail::F64Vec64Min(a, b); +#else + return Vec64<double>(vminnm_f64(a.raw, b.raw)); +#endif +} + +HWY_API Vec128<double> Min(Vec128<double> a, Vec128<double> b) { + return Vec128<double>(vminnmq_f64(a.raw, b.raw)); +} + +#else +// Armv7: NaN if any is NaN. +HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Min, vmin, _, 2) +#endif // HWY_ARCH_ARM_A64 + +// ------------------------------ Max (IfThenElse, BroadcastSignBit) + +// Unsigned (no u64) +HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Max, vmax, _, 2) + +template <size_t N> +HWY_API Vec128<uint64_t, N> Max(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) { +#if HWY_ARCH_ARM_A64 + return IfThenElse(b < a, a, b); +#else + const DFromV<decltype(a)> du; + const RebindToSigned<decltype(du)> di; + return BitCast(du, BitCast(di, b) + BitCast(di, SaturatedSub(a, b))); +#endif +} + +// Signed (no i64) +HWY_NEON_DEF_FUNCTION_INT_8_16_32(Max, vmax, _, 2) + +template <size_t N> +HWY_API Vec128<int64_t, N> Max(Vec128<int64_t, N> a, Vec128<int64_t, N> b) { +#if HWY_ARCH_ARM_A64 + return IfThenElse(b < a, a, b); +#else + const Vec128<int64_t, N> sign = SaturatedSub(a, b); + return IfThenElse(MaskFromVec(BroadcastSignBit(sign)), b, a); +#endif +} + +// Float: IEEE minimumNumber on v8 +#if HWY_ARCH_ARM_A64 + +HWY_NEON_DEF_FUNCTION_FLOAT_32(Max, vmaxnm, _, 2) + +// GCC 6.5 and earlier are missing the 64-bit (non-q) intrinsic, so define +// in terms of the 128-bit intrinsic. 
+#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700 +namespace detail { + +template <class V, HWY_IF_V_SIZE_V(V, 8), HWY_IF_T_SIZE_V(V, 8)> +HWY_INLINE V F64Vec64Max(V a, V b) { + const DFromV<decltype(a)> d; + const Twice<decltype(d)> dt; + return LowerHalf(d, Max(ZeroExtendVector(dt, a), ZeroExtendVector(dt, b))); +} + +} // namespace detail +#endif // HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700 + +HWY_API Vec64<double> Max(Vec64<double> a, Vec64<double> b) { +#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700 + return detail::F64Vec64Max(a, b); +#else + return Vec64<double>(vmaxnm_f64(a.raw, b.raw)); +#endif +} + +HWY_API Vec128<double> Max(Vec128<double> a, Vec128<double> b) { + return Vec128<double>(vmaxnmq_f64(a.raw, b.raw)); +} + +#else +// Armv7: NaN if any is NaN. +HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Max, vmax, _, 2) +#endif // HWY_ARCH_ARM_A64 + +// ================================================== MEMORY + +// ------------------------------ Load 128 + +template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)> +HWY_API Vec128<uint8_t> LoadU(D /* tag */, + const uint8_t* HWY_RESTRICT unaligned) { + return Vec128<uint8_t>(vld1q_u8(unaligned)); +} +template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)> +HWY_API Vec128<uint16_t> LoadU(D /* tag */, + const uint16_t* HWY_RESTRICT unaligned) { + return Vec128<uint16_t>(vld1q_u16(unaligned)); +} +template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)> +HWY_API Vec128<uint32_t> LoadU(D /* tag */, + const uint32_t* HWY_RESTRICT unaligned) { + return Vec128<uint32_t>(vld1q_u32(unaligned)); +} +template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U64_D(D)> +HWY_API Vec128<uint64_t> LoadU(D /* tag */, + const uint64_t* HWY_RESTRICT unaligned) { + return Vec128<uint64_t>(vld1q_u64(unaligned)); +} +template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I8_D(D)> +HWY_API Vec128<int8_t> LoadU(D /* tag */, + const int8_t* HWY_RESTRICT unaligned) { + return 
Vec128<int8_t>(vld1q_s8(unaligned)); +} +template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I16_D(D)> +HWY_API Vec128<int16_t> LoadU(D /* tag */, + const int16_t* HWY_RESTRICT unaligned) { + return Vec128<int16_t>(vld1q_s16(unaligned)); +} +template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)> +HWY_API Vec128<int32_t> LoadU(D /* tag */, + const int32_t* HWY_RESTRICT unaligned) { + return Vec128<int32_t>(vld1q_s32(unaligned)); +} +template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I64_D(D)> +HWY_API Vec128<int64_t> LoadU(D /* tag */, + const int64_t* HWY_RESTRICT unaligned) { + return Vec128<int64_t>(vld1q_s64(unaligned)); +} +template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)> +HWY_API Vec128<float> LoadU(D /* tag */, const float* HWY_RESTRICT unaligned) { + return Vec128<float>(vld1q_f32(unaligned)); +} +#if HWY_ARCH_ARM_A64 +template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)> +HWY_API Vec128<double> LoadU(D /* tag */, + const double* HWY_RESTRICT unaligned) { + return Vec128<double>(vld1q_f64(unaligned)); +} +#endif + +// ------------------------------ Load 64 + +template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)> +HWY_API Vec64<uint8_t> LoadU(D /* tag */, const uint8_t* HWY_RESTRICT p) { + return Vec64<uint8_t>(vld1_u8(p)); +} +template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U16_D(D)> +HWY_API Vec64<uint16_t> LoadU(D /* tag */, const uint16_t* HWY_RESTRICT p) { + return Vec64<uint16_t>(vld1_u16(p)); +} +template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U32_D(D)> +HWY_API Vec64<uint32_t> LoadU(D /* tag */, const uint32_t* HWY_RESTRICT p) { + return Vec64<uint32_t>(vld1_u32(p)); +} +template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U64_D(D)> +HWY_API Vec64<uint64_t> LoadU(D /* tag */, const uint64_t* HWY_RESTRICT p) { + return Vec64<uint64_t>(vld1_u64(p)); +} +template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I8_D(D)> +HWY_API Vec64<int8_t> LoadU(D /* tag */, const int8_t* HWY_RESTRICT p) { + return Vec64<int8_t>(vld1_s8(p)); +} +template 
<class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I16_D(D)> +HWY_API Vec64<int16_t> LoadU(D /* tag */, const int16_t* HWY_RESTRICT p) { + return Vec64<int16_t>(vld1_s16(p)); +} +template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I32_D(D)> +HWY_API Vec64<int32_t> LoadU(D /* tag */, const int32_t* HWY_RESTRICT p) { + return Vec64<int32_t>(vld1_s32(p)); +} +template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I64_D(D)> +HWY_API Vec64<int64_t> LoadU(D /* tag */, const int64_t* HWY_RESTRICT p) { + return Vec64<int64_t>(vld1_s64(p)); +} +template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)> +HWY_API Vec64<float> LoadU(D /* tag */, const float* HWY_RESTRICT p) { + return Vec64<float>(vld1_f32(p)); +} +#if HWY_ARCH_ARM_A64 +template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F64_D(D)> +HWY_API Vec64<double> LoadU(D /* tag */, const double* HWY_RESTRICT p) { + return Vec64<double>(vld1_f64(p)); +} +#endif +// ------------------------------ Load 32 + +// Actual 32-bit broadcast load - used to implement the other lane types +// because reinterpret_cast of the pointer leads to incorrect codegen on GCC. 
+template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U32_D(D)>
+HWY_API Vec32<uint32_t> LoadU(D /*tag*/, const uint32_t* HWY_RESTRICT p) {
+  return Vec32<uint32_t>(vld1_dup_u32(p));
+}
+template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I32_D(D)>
+HWY_API Vec32<int32_t> LoadU(D /*tag*/, const int32_t* HWY_RESTRICT p) {
+  return Vec32<int32_t>(vld1_dup_s32(p));
+}
+template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)>
+HWY_API Vec32<float> LoadU(D /*tag*/, const float* HWY_RESTRICT p) {
+  return Vec32<float>(vld1_dup_f32(p));
+}
+
+// 4-byte vectors of 8- or 16-bit lanes (excluding the special float types):
+// copy the four bytes into a uint32_t scalar, load that through the u32
+// overload above, and reinterpret the result as the requested lane type.
+template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_NOT_SPECIAL_FLOAT_D(D),
+          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
+HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
+  const Repartition<uint32_t, decltype(d)> du32;
+  uint32_t bits;
+  CopyBytes<4>(p, &bits);  // byte copy instead of casting the pointer
+  const auto loaded = LoadU(du32, &bits);
+  return BitCast(d, loaded);
+}
+
+// ------------------------------ Load 16
+
+// Actual 16-bit broadcast load - used to implement the other lane types
+// because reinterpret_cast of the pointer leads to incorrect codegen on GCC.
+template <class D, HWY_IF_LANES_D(D, 1), HWY_IF_U16_D(D)>
+HWY_API VFromD<D> LoadU(D /* tag */, const uint16_t* HWY_RESTRICT p) {
+  return VFromD<D>(vld1_dup_u16(p));
+}
+template <class D, HWY_IF_LANES_D(D, 1), HWY_IF_I16_D(D)>
+HWY_API VFromD<D> LoadU(D /* tag */, const int16_t* HWY_RESTRICT p) {
+  return VFromD<D>(vld1_dup_s16(p));
+}
+
+// 8-bit x2: copy both bytes into a uint16_t scalar, load it through the
+// single-lane u16 overload, then reinterpret as the requested 8-bit type.
+template <class D, HWY_IF_LANES_D(D, 2), HWY_IF_T_SIZE_D(D, 1)>
+HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
+  const Repartition<uint16_t, decltype(d)> du16;
+  uint16_t bits;
+  CopyBytes<2>(p, &bits);  // byte copy instead of casting the pointer
+  const auto loaded = LoadU(du16, &bits);
+  return BitCast(d, loaded);
+}
+
+// ------------------------------ Load 8
+template <class D, HWY_IF_LANES_D(D, 1), HWY_IF_U8_D(D)>
+HWY_API VFromD<D> LoadU(D /* tag */, const uint8_t* HWY_RESTRICT p) {
+  return VFromD<D>(vld1_dup_u8(p));
+}
+template <class D, HWY_IF_LANES_D(D, 1), HWY_IF_I8_D(D)>
+HWY_API VFromD<D> LoadU(D /* tag */, const int8_t* HWY_RESTRICT p) {
+  return VFromD<D>(vld1_dup_s8(p));
+}
+
+// ------------------------------ Load misc
+
+// [b]float16_t use the same Raw as uint16_t, so forward to that.
+template <class D, HWY_IF_F16_D(D)>
+HWY_API VFromD<D> LoadU(D d, const float16_t* HWY_RESTRICT p) {
+  const RebindToUnsigned<decltype(d)> du16;
+  const uint16_t* HWY_RESTRICT as_u16 = reinterpret_cast<const uint16_t*>(p);
+  const auto loaded = LoadU(du16, as_u16);
+  return VFromD<D>(loaded.raw);
+}
+template <class D, HWY_IF_BF16_D(D)>
+HWY_API VFromD<D> LoadU(D d, const bfloat16_t* HWY_RESTRICT p) {
+  const RebindToUnsigned<decltype(d)> du16;
+  const uint16_t* HWY_RESTRICT as_u16 = reinterpret_cast<const uint16_t*>(p);
+  const auto loaded = LoadU(du16, as_u16);
+  return VFromD<D>(loaded.raw);
+}
+
+// On Arm, Load is the same as LoadU.
+template <class D> +HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT p) { + return LoadU(d, p); +} + +template <class D> +HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d, + const TFromD<D>* HWY_RESTRICT aligned) { + return IfThenElseZero(m, Load(d, aligned)); +} + +template <class D> +HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d, + const TFromD<D>* HWY_RESTRICT aligned) { + return IfThenElse(m, Load(d, aligned), v); +} + +// 128-bit SIMD => nothing to duplicate, same as an unaligned load. +template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* HWY_RESTRICT p) { + return LoadU(d, p); +} + +// ------------------------------ Store 128 + +template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)> +HWY_API void StoreU(Vec128<uint8_t> v, D /* tag */, + uint8_t* HWY_RESTRICT unaligned) { + vst1q_u8(unaligned, v.raw); +} +template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)> +HWY_API void StoreU(Vec128<uint16_t> v, D /* tag */, + uint16_t* HWY_RESTRICT unaligned) { + vst1q_u16(unaligned, v.raw); +} +template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)> +HWY_API void StoreU(Vec128<uint32_t> v, D /* tag */, + uint32_t* HWY_RESTRICT unaligned) { + vst1q_u32(unaligned, v.raw); +} +template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U64_D(D)> +HWY_API void StoreU(Vec128<uint64_t> v, D /* tag */, + uint64_t* HWY_RESTRICT unaligned) { + vst1q_u64(unaligned, v.raw); +} +template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I8_D(D)> +HWY_API void StoreU(Vec128<int8_t> v, D /* tag */, + int8_t* HWY_RESTRICT unaligned) { + vst1q_s8(unaligned, v.raw); +} +template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I16_D(D)> +HWY_API void StoreU(Vec128<int16_t> v, D /* tag */, + int16_t* HWY_RESTRICT unaligned) { + vst1q_s16(unaligned, v.raw); +} +template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)> +HWY_API void StoreU(Vec128<int32_t> v, D /* tag */, + int32_t* HWY_RESTRICT unaligned) { + vst1q_s32(unaligned, 
v.raw); +} +template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I64_D(D)> +HWY_API void StoreU(Vec128<int64_t> v, D /* tag */, + int64_t* HWY_RESTRICT unaligned) { + vst1q_s64(unaligned, v.raw); +} +template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)> +HWY_API void StoreU(Vec128<float> v, D /* tag */, + float* HWY_RESTRICT unaligned) { + vst1q_f32(unaligned, v.raw); +} +#if HWY_ARCH_ARM_A64 +template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)> +HWY_API void StoreU(Vec128<double> v, D /* tag */, + double* HWY_RESTRICT unaligned) { + vst1q_f64(unaligned, v.raw); +} +#endif + +// ------------------------------ Store 64 + +template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)> +HWY_API void StoreU(Vec64<uint8_t> v, D /* tag */, uint8_t* HWY_RESTRICT p) { + vst1_u8(p, v.raw); +} +template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U16_D(D)> +HWY_API void StoreU(Vec64<uint16_t> v, D /* tag */, uint16_t* HWY_RESTRICT p) { + vst1_u16(p, v.raw); +} +template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U32_D(D)> +HWY_API void StoreU(Vec64<uint32_t> v, D /* tag */, uint32_t* HWY_RESTRICT p) { + vst1_u32(p, v.raw); +} +template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U64_D(D)> +HWY_API void StoreU(Vec64<uint64_t> v, D /* tag */, uint64_t* HWY_RESTRICT p) { + vst1_u64(p, v.raw); +} +template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I8_D(D)> +HWY_API void StoreU(Vec64<int8_t> v, D /* tag */, int8_t* HWY_RESTRICT p) { + vst1_s8(p, v.raw); +} +template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I16_D(D)> +HWY_API void StoreU(Vec64<int16_t> v, D /* tag */, int16_t* HWY_RESTRICT p) { + vst1_s16(p, v.raw); +} +template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I32_D(D)> +HWY_API void StoreU(Vec64<int32_t> v, D /* tag */, int32_t* HWY_RESTRICT p) { + vst1_s32(p, v.raw); +} +template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I64_D(D)> +HWY_API void StoreU(Vec64<int64_t> v, D /* tag */, int64_t* HWY_RESTRICT p) { + vst1_s64(p, v.raw); +} +template <class D, HWY_IF_V_SIZE_D(D, 8), 
HWY_IF_F32_D(D)> +HWY_API void StoreU(Vec64<float> v, D /* tag */, float* HWY_RESTRICT p) { + vst1_f32(p, v.raw); +} +#if HWY_ARCH_ARM_A64 +template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F64_D(D)> +HWY_API void StoreU(Vec64<double> v, D /* tag */, double* HWY_RESTRICT p) { + vst1_f64(p, v.raw); +} +#endif + +// ------------------------------ Store 32 + +template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U32_D(D)> +HWY_API void StoreU(Vec32<uint32_t> v, D, uint32_t* HWY_RESTRICT p) { + vst1_lane_u32(p, v.raw, 0); +} +template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I32_D(D)> +HWY_API void StoreU(Vec32<int32_t> v, D, int32_t* HWY_RESTRICT p) { + vst1_lane_s32(p, v.raw, 0); +} +template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)> +HWY_API void StoreU(Vec32<float> v, D, float* HWY_RESTRICT p) { + vst1_lane_f32(p, v.raw, 0); +} + +// Overload 16-bit types directly to avoid ambiguity with [b]float16_t. +template <class D, HWY_IF_V_SIZE_D(D, 4), typename T = TFromD<D>, + HWY_IF_T_SIZE(T, 1)> +HWY_API void StoreU(Vec32<T> v, D d, T* HWY_RESTRICT p) { + Repartition<uint32_t, decltype(d)> d32; + uint32_t buf = GetLane(BitCast(d32, v)); + CopyBytes<4>(&buf, p); +} + +template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U16_D(D)> +HWY_API void StoreU(Vec32<uint16_t> v, D d, uint16_t* HWY_RESTRICT p) { + Repartition<uint32_t, decltype(d)> d32; + uint32_t buf = GetLane(BitCast(d32, v)); + CopyBytes<4>(&buf, p); +} + +template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I16_D(D)> +HWY_API void StoreU(Vec32<int16_t> v, D d, int16_t* HWY_RESTRICT p) { + Repartition<uint32_t, decltype(d)> d32; + uint32_t buf = GetLane(BitCast(d32, v)); + CopyBytes<4>(&buf, p); +} + +// ------------------------------ Store 16 + +template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_U16_D(D)> +HWY_API void StoreU(Vec16<uint16_t> v, D, uint16_t* HWY_RESTRICT p) { + vst1_lane_u16(p, v.raw, 0); +} +template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_I16_D(D)> +HWY_API void StoreU(Vec16<int16_t> v, D, int16_t* 
HWY_RESTRICT p) { + vst1_lane_s16(p, v.raw, 0); +} + +template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_T_SIZE_D(D, 1)> +HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) { + const Repartition<uint16_t, decltype(d)> d16; + const uint16_t buf = GetLane(BitCast(d16, v)); + CopyBytes<2>(&buf, p); +} + +// ------------------------------ Store 8 + +template <class D, HWY_IF_V_SIZE_D(D, 1), HWY_IF_U8_D(D)> +HWY_API void StoreU(Vec128<uint8_t, 1> v, D, uint8_t* HWY_RESTRICT p) { + vst1_lane_u8(p, v.raw, 0); +} +template <class D, HWY_IF_V_SIZE_D(D, 1), HWY_IF_I8_D(D)> +HWY_API void StoreU(Vec128<int8_t, 1> v, D, int8_t* HWY_RESTRICT p) { + vst1_lane_s8(p, v.raw, 0); +} + +// [b]float16_t use the same Raw as uint16_t, so forward to that. +template <class D, HWY_IF_F16_D(D)> +HWY_API void StoreU(VFromD<D> v, D d, float16_t* HWY_RESTRICT p) { + const RebindToUnsigned<decltype(d)> du16; + const auto pu16 = reinterpret_cast<uint16_t*>(p); + return StoreU(BitCast(du16, v), du16, pu16); +} +template <class D, HWY_IF_BF16_D(D)> +HWY_API void StoreU(VFromD<D> v, D d, bfloat16_t* HWY_RESTRICT p) { + const RebindToUnsigned<decltype(d)> du16; + const auto pu16 = reinterpret_cast<uint16_t*>(p); + return StoreU(BitCast(du16, v), du16, pu16); +} + +HWY_DIAGNOSTICS(push) +#if HWY_COMPILER_GCC_ACTUAL +HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wmaybe-uninitialized") +#endif + +// On Arm, Store is the same as StoreU. +template <class D> +HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) { + StoreU(v, d, aligned); +} + +HWY_DIAGNOSTICS(pop) + +template <class D> +HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d, + TFromD<D>* HWY_RESTRICT p) { + // Treat as unsigned so that we correctly support float16. 
+ const RebindToUnsigned<decltype(d)> du; + const auto blended = + IfThenElse(RebindMask(du, m), BitCast(du, v), BitCast(du, LoadU(d, p))); + StoreU(BitCast(d, blended), d, p); +} + +// ------------------------------ Non-temporal stores + +// Same as aligned stores on non-x86. + +template <class D> +HWY_API void Stream(const VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) { +#if HWY_ARCH_ARM_A64 +#if HWY_COMPILER_GCC + __builtin_prefetch(aligned, 1, 0); +#elif HWY_COMPILER_MSVC + __prefetch2(aligned, 0x11); +#endif +#endif + Store(v, d, aligned); +} + +// ================================================== CONVERT + +// ------------------------------ ConvertTo + +template <class D, HWY_IF_F32_D(D)> +HWY_API Vec128<float> ConvertTo(D /* tag */, Vec128<int32_t> v) { + return Vec128<float>(vcvtq_f32_s32(v.raw)); +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)> +HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<RebindToSigned<D>> v) { + return VFromD<D>(vcvt_f32_s32(v.raw)); +} + +template <class D, HWY_IF_F32_D(D)> +HWY_API Vec128<float> ConvertTo(D /* tag */, Vec128<uint32_t> v) { + return Vec128<float>(vcvtq_f32_u32(v.raw)); +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)> +HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<RebindToUnsigned<D>> v) { + return VFromD<D>(vcvt_f32_u32(v.raw)); +} + +// Truncates (rounds toward zero). 
+template <class D, HWY_IF_I32_D(D)> +HWY_API Vec128<int32_t> ConvertTo(D /* tag */, Vec128<float> v) { + return Vec128<int32_t>(vcvtq_s32_f32(v.raw)); +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)> +HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<RebindToFloat<D>> v) { + return VFromD<D>(vcvt_s32_f32(v.raw)); +} + +#if HWY_ARCH_ARM_A64 + +template <class D, HWY_IF_F64_D(D)> +HWY_API Vec128<double> ConvertTo(D /* tag */, Vec128<int64_t> v) { + return Vec128<double>(vcvtq_f64_s64(v.raw)); +} +template <class D, HWY_IF_F64_D(D)> +HWY_API Vec64<double> ConvertTo(D /* tag */, Vec64<int64_t> v) { +// GCC 6.5 and earlier are missing the 64-bit (non-q) intrinsic. +#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700 + return Set(Full64<double>(), static_cast<double>(GetLane(v))); +#else + return Vec64<double>(vcvt_f64_s64(v.raw)); +#endif // HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700 +} + +template <class D, HWY_IF_F64_D(D)> +HWY_API Vec128<double> ConvertTo(D /* tag */, Vec128<uint64_t> v) { + return Vec128<double>(vcvtq_f64_u64(v.raw)); +} +template <class D, HWY_IF_F64_D(D)> +HWY_API Vec64<double> ConvertTo(D /* tag */, Vec64<uint64_t> v) { + return Vec64<double>(vcvt_f64_u64(v.raw)); +} + +// Truncates (rounds toward zero). +template <class D, HWY_IF_I64_D(D)> +HWY_API Vec128<int64_t> ConvertTo(D /* tag */, Vec128<double> v) { + return Vec128<int64_t>(vcvtq_s64_f64(v.raw)); +} +template <class D, HWY_IF_I64_D(D)> +HWY_API Vec64<int64_t> ConvertTo(D di, Vec64<double> v) { + // GCC 6.5 and earlier are missing the 64-bit (non-q) intrinsic. Use the + // 128-bit version to avoid UB from casting double -> int64_t. 
+#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700 + const Full128<double> ddt; + const Twice<decltype(di)> dit; + return LowerHalf(di, ConvertTo(dit, Combine(ddt, v, v))); +#else + (void)di; + return Vec64<int64_t>(vcvt_s64_f64(v.raw)); +#endif +} + +#endif + +// ------------------------------ PromoteTo (ConvertTo) + +// Unsigned: zero-extend to full vector. +template <class D, HWY_IF_U16_D(D)> +HWY_API Vec128<uint16_t> PromoteTo(D /* tag */, Vec64<uint8_t> v) { + return Vec128<uint16_t>(vmovl_u8(v.raw)); +} +template <class D, HWY_IF_U32_D(D)> +HWY_API Vec128<uint32_t> PromoteTo(D /* tag */, Vec32<uint8_t> v) { + uint16x8_t a = vmovl_u8(v.raw); + return Vec128<uint32_t>(vmovl_u16(vget_low_u16(a))); +} +template <class D, HWY_IF_U32_D(D)> +HWY_API Vec128<uint32_t> PromoteTo(D /* tag */, Vec64<uint16_t> v) { + return Vec128<uint32_t>(vmovl_u16(v.raw)); +} +template <class D, HWY_IF_U64_D(D)> +HWY_API Vec128<uint64_t> PromoteTo(D /* tag */, Vec64<uint32_t> v) { + return Vec128<uint64_t>(vmovl_u32(v.raw)); +} +template <class D, HWY_IF_I16_D(D)> +HWY_API Vec128<int16_t> PromoteTo(D d, Vec64<uint8_t> v) { + return BitCast(d, Vec128<uint16_t>(vmovl_u8(v.raw))); +} +template <class D, HWY_IF_I32_D(D)> +HWY_API Vec128<int32_t> PromoteTo(D d, Vec32<uint8_t> v) { + uint16x8_t a = vmovl_u8(v.raw); + return BitCast(d, Vec128<uint32_t>(vmovl_u16(vget_low_u16(a)))); +} +template <class D, HWY_IF_I32_D(D)> +HWY_API Vec128<int32_t> PromoteTo(D d, Vec64<uint16_t> v) { + return BitCast(d, Vec128<uint32_t>(vmovl_u16(v.raw))); +} +template <class D, HWY_IF_I64_D(D)> +HWY_API Vec128<int64_t> PromoteTo(D d, Vec64<uint32_t> v) { + return BitCast(d, Vec128<uint64_t>(vmovl_u32(v.raw))); +} + +// Unsigned: zero-extend to half vector. 
+template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)> +HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint8_t, D>> v) { + return VFromD<D>(vget_low_u16(vmovl_u8(v.raw))); +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)> +HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint8_t, D>> v) { + return VFromD<D>(vget_low_u32(vmovl_u16(vget_low_u16(vmovl_u8(v.raw))))); +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)> +HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) { + return VFromD<D>(vget_low_u32(vmovl_u16(v.raw))); +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U64_D(D)> +HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) { + return VFromD<D>(vget_low_u64(vmovl_u32(v.raw))); +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I16_D(D)> +HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint8_t, D>> v) { + using VU16 = VFromD<RebindToUnsigned<D>>; + return BitCast(d, VU16(vget_low_u16(vmovl_u8(v.raw)))); +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)> +HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint8_t, D>> v) { + const uint32x4_t u32 = vmovl_u16(vget_low_u16(vmovl_u8(v.raw))); + return VFromD<D>(vget_low_s32(vreinterpretq_s32_u32(u32))); +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)> +HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) { + return VFromD<D>(vget_low_s32(vreinterpretq_s32_u32(vmovl_u16(v.raw)))); +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I64_D(D)> +HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint32_t, D>> v) { + using DU = RebindToUnsigned<D>; + return BitCast(d, VFromD<DU>(vget_low_u64(vmovl_u32(v.raw)))); +} + +// U8/U16 to U64/I64: First, zero-extend to U32, and then zero-extend to +// TFromD<D> +template <class D, class V, HWY_IF_UI64_D(D), + HWY_IF_LANES_D(D, HWY_MAX_LANES_V(V)), HWY_IF_UNSIGNED_V(V), + HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))> 
+HWY_API VFromD<D> PromoteTo(D d, V v) { + const Rebind<uint32_t, decltype(d)> du32; + return PromoteTo(d, PromoteTo(du32, v)); +} + +// Signed: replicate sign bit to full vector. +template <class D, HWY_IF_I16_D(D)> +HWY_API Vec128<int16_t> PromoteTo(D /* tag */, Vec64<int8_t> v) { + return Vec128<int16_t>(vmovl_s8(v.raw)); +} +template <class D, HWY_IF_I32_D(D)> +HWY_API Vec128<int32_t> PromoteTo(D /* tag */, Vec32<int8_t> v) { + int16x8_t a = vmovl_s8(v.raw); + return Vec128<int32_t>(vmovl_s16(vget_low_s16(a))); +} +template <class D, HWY_IF_I32_D(D)> +HWY_API Vec128<int32_t> PromoteTo(D /* tag */, Vec64<int16_t> v) { + return Vec128<int32_t>(vmovl_s16(v.raw)); +} +template <class D, HWY_IF_I64_D(D)> +HWY_API Vec128<int64_t> PromoteTo(D /* tag */, Vec64<int32_t> v) { + return Vec128<int64_t>(vmovl_s32(v.raw)); +} + +// Signed: replicate sign bit to half vector. +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I16_D(D)> +HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int8_t, D>> v) { + return VFromD<D>(vget_low_s16(vmovl_s8(v.raw))); +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)> +HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int8_t, D>> v) { + return VFromD<D>(vget_low_s32(vmovl_s16(vget_low_s16(vmovl_s8(v.raw))))); +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)> +HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) { + return VFromD<D>(vget_low_s32(vmovl_s16(v.raw))); +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I64_D(D)> +HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) { + return VFromD<D>(vget_low_s64(vmovl_s32(v.raw))); +} + +// I8/I16 to I64: First, promote to I32, and then promote to I64 +template <class D, class V, HWY_IF_I64_D(D), + HWY_IF_LANES_D(D, HWY_MAX_LANES_V(V)), HWY_IF_SIGNED_V(V), + HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))> +HWY_API VFromD<D> PromoteTo(D d, V v) { + const Rebind<int32_t, decltype(d)> di32; + return PromoteTo(d, 
PromoteTo(di32, v)); +} + +#if __ARM_FP & 2 + +template <class D, HWY_IF_F32_D(D)> +HWY_API Vec128<float> PromoteTo(D /* tag */, Vec64<float16_t> v) { + return Vec128<float>(vcvt_f32_f16(vreinterpret_f16_u16(v.raw))); +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)> +HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<float16_t, D>> v) { + return VFromD<D>(vget_low_f32(vcvt_f32_f16(vreinterpret_f16_u16(v.raw)))); +} + +#else + +template <class D, HWY_IF_F32_D(D)> +HWY_API VFromD<D> PromoteTo(D df32, VFromD<Rebind<float16_t, D>> v) { + const Rebind<uint16_t, decltype(df32)> du16; + const RebindToSigned<decltype(df32)> di32; + const RebindToUnsigned<decltype(df32)> du32; + using VU32 = VFromD<decltype(du32)>; + // Expand to u32 so we can shift. + const VU32 bits16 = PromoteTo(du32, BitCast(du16, v)); + const VU32 sign = ShiftRight<15>(bits16); + const VU32 biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F); + const VU32 mantissa = bits16 & Set(du32, 0x3FF); + const VU32 subnormal = + BitCast(du32, ConvertTo(df32, BitCast(di32, mantissa)) * + Set(df32, 1.0f / 16384 / 1024)); + + const VU32 biased_exp32 = biased_exp + Set(du32, 127 - 15); + const VU32 mantissa32 = ShiftLeft<23 - 10>(mantissa); + const VU32 normal = ShiftLeft<23>(biased_exp32) | mantissa32; + const VU32 bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal); + return BitCast(df32, ShiftLeft<31>(sign) | bits32); +} + +#endif + +#if HWY_ARCH_ARM_A64 + +template <class D, HWY_IF_F64_D(D)> +HWY_API Vec128<double> PromoteTo(D /* tag */, Vec64<float> v) { + return Vec128<double>(vcvt_f64_f32(v.raw)); +} + +template <class D, HWY_IF_F64_D(D)> +HWY_API Vec64<double> PromoteTo(D /* tag */, Vec32<float> v) { + return Vec64<double>(vget_low_f64(vcvt_f64_f32(v.raw))); +} + +template <class D, HWY_IF_F64_D(D)> +HWY_API Vec128<double> PromoteTo(D /* tag */, Vec64<int32_t> v) { + const int64x2_t i64 = vmovl_s32(v.raw); + return Vec128<double>(vcvtq_f64_s64(i64)); +} + +template <class 
D, HWY_IF_F64_D(D)> +HWY_API Vec64<double> PromoteTo(D d, Vec32<int32_t> v) { + return ConvertTo(d, Vec64<int64_t>(vget_low_s64(vmovl_s32(v.raw)))); +} + +#endif + +// ------------------------------ DemoteTo (ConvertTo) + +// From full vector to half or quarter +template <class D, HWY_IF_U16_D(D)> +HWY_API Vec64<uint16_t> DemoteTo(D /* tag */, Vec128<int32_t> v) { + return Vec64<uint16_t>(vqmovun_s32(v.raw)); +} +template <class D, HWY_IF_I16_D(D)> +HWY_API Vec64<int16_t> DemoteTo(D /* tag */, Vec128<int32_t> v) { + return Vec64<int16_t>(vqmovn_s32(v.raw)); +} +template <class D, HWY_IF_U8_D(D)> +HWY_API Vec32<uint8_t> DemoteTo(D /* tag */, Vec128<int32_t> v) { + const uint16x4_t a = vqmovun_s32(v.raw); + return Vec32<uint8_t>(vqmovn_u16(vcombine_u16(a, a))); +} +template <class D, HWY_IF_U8_D(D)> +HWY_API Vec64<uint8_t> DemoteTo(D /* tag */, Vec128<int16_t> v) { + return Vec64<uint8_t>(vqmovun_s16(v.raw)); +} +template <class D, HWY_IF_I8_D(D)> +HWY_API Vec32<int8_t> DemoteTo(D /* tag */, Vec128<int32_t> v) { + const int16x4_t a = vqmovn_s32(v.raw); + return Vec32<int8_t>(vqmovn_s16(vcombine_s16(a, a))); +} +template <class D, HWY_IF_I8_D(D)> +HWY_API Vec64<int8_t> DemoteTo(D /* tag */, Vec128<int16_t> v) { + return Vec64<int8_t>(vqmovn_s16(v.raw)); +} +template <class D, HWY_IF_U16_D(D)> +HWY_API Vec64<uint16_t> DemoteTo(D /* tag */, Vec128<uint32_t> v) { + return Vec64<uint16_t>(vqmovn_u32(v.raw)); +} +template <class D, HWY_IF_U8_D(D)> +HWY_API Vec32<uint8_t> DemoteTo(D /* tag */, Vec128<uint32_t> v) { + const uint16x4_t a = vqmovn_u32(v.raw); + return Vec32<uint8_t>(vqmovn_u16(vcombine_u16(a, a))); +} +template <class D, HWY_IF_U8_D(D)> +HWY_API Vec64<uint8_t> DemoteTo(D /* tag */, Vec128<uint16_t> v) { + return Vec64<uint8_t>(vqmovn_u16(v.raw)); +} + +// From half vector to partial half +template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U16_D(D)> +HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) { + return 
VFromD<D>(vqmovun_s32(vcombine_s32(v.raw, v.raw))); +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_I16_D(D)> +HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) { + return VFromD<D>(vqmovn_s32(vcombine_s32(v.raw, v.raw))); +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 2), HWY_IF_U8_D(D)> +HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) { + const uint16x4_t a = vqmovun_s32(vcombine_s32(v.raw, v.raw)); + return VFromD<D>(vqmovn_u16(vcombine_u16(a, a))); +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U8_D(D)> +HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) { + return VFromD<D>(vqmovun_s16(vcombine_s16(v.raw, v.raw))); +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 2), HWY_IF_I8_D(D)> +HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) { + const int16x4_t a = vqmovn_s32(vcombine_s32(v.raw, v.raw)); + return VFromD<D>(vqmovn_s16(vcombine_s16(a, a))); +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_I8_D(D)> +HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) { + return VFromD<D>(vqmovn_s16(vcombine_s16(v.raw, v.raw))); +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U16_D(D)> +HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) { + return VFromD<D>(vqmovn_u32(vcombine_u32(v.raw, v.raw))); +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 2), HWY_IF_U8_D(D)> +HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) { + const uint16x4_t a = vqmovn_u32(vcombine_u32(v.raw, v.raw)); + return VFromD<D>(vqmovn_u16(vcombine_u16(a, a))); +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U8_D(D)> +HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) { + return VFromD<D>(vqmovn_u16(vcombine_u16(v.raw, v.raw))); +} + +template <class D, HWY_IF_I32_D(D)> +HWY_API Vec64<int32_t> DemoteTo(D /* tag */, Vec128<int64_t> v) { + return Vec64<int32_t>(vqmovn_s64(v.raw)); +} +template <class D, HWY_IF_U32_D(D)> 
+HWY_API Vec64<uint32_t> DemoteTo(D /* tag */, Vec128<int64_t> v) { + return Vec64<uint32_t>(vqmovun_s64(v.raw)); +} +template <class D, HWY_IF_U32_D(D)> +HWY_API Vec64<uint32_t> DemoteTo(D /* tag */, Vec128<uint64_t> v) { + return Vec64<uint32_t>(vqmovn_u64(v.raw)); +} +template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)), + HWY_IF_SIGNED_D(D)> +HWY_API VFromD<D> DemoteTo(D d, Vec128<uint64_t> v) { + const Rebind<int32_t, D> di32; + return DemoteTo(d, DemoteTo(di32, v)); +} +template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)), + HWY_IF_UNSIGNED_D(D)> +HWY_API VFromD<D> DemoteTo(D d, Vec128<int64_t> v) { + const Rebind<uint32_t, D> du32; + return DemoteTo(d, DemoteTo(du32, v)); +} +template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)), + HWY_IF_UNSIGNED_D(D)> +HWY_API VFromD<D> DemoteTo(D d, Vec128<uint64_t> v) { + const Rebind<uint32_t, D> du32; + return DemoteTo(d, DemoteTo(du32, v)); +} + +template <class D, HWY_IF_I32_D(D)> +HWY_API Vec32<int32_t> DemoteTo(D /* tag */, Vec64<int64_t> v) { + return Vec32<int32_t>(vqmovn_s64(vcombine_s64(v.raw, v.raw))); +} +template <class D, HWY_IF_U32_D(D)> +HWY_API Vec32<uint32_t> DemoteTo(D /* tag */, Vec64<int64_t> v) { + return Vec32<uint32_t>(vqmovun_s64(vcombine_s64(v.raw, v.raw))); +} +template <class D, HWY_IF_U32_D(D)> +HWY_API Vec32<uint32_t> DemoteTo(D /* tag */, Vec64<uint64_t> v) { + return Vec32<uint32_t>(vqmovn_u64(vcombine_u64(v.raw, v.raw))); +} +template <class D, HWY_IF_SIGNED_D(D), + HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))> +HWY_API VFromD<D> DemoteTo(D d, Vec64<int64_t> v) { + const Rebind<int32_t, D> di32; + return DemoteTo(d, DemoteTo(di32, v)); +} +template <class D, HWY_IF_UNSIGNED_D(D), + HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))> +HWY_API VFromD<D> DemoteTo(D d, Vec64<int64_t> v) { + const Rebind<uint32_t, D> du32; + return DemoteTo(d, DemoteTo(du32, v)); +} +template <class D, HWY_IF_LANES_D(D, 1), HWY_IF_UNSIGNED_D(D), + HWY_IF_T_SIZE_ONE_OF_D(D, 
(1 << 1) | (1 << 2))> +HWY_API VFromD<D> DemoteTo(D d, Vec64<uint64_t> v) { + const Rebind<uint32_t, D> du32; + return DemoteTo(d, DemoteTo(du32, v)); +} + +#if __ARM_FP & 2 + +template <class D, HWY_IF_F16_D(D)> +HWY_API Vec64<float16_t> DemoteTo(D /* tag */, Vec128<float> v) { + return Vec64<float16_t>{vreinterpret_u16_f16(vcvt_f16_f32(v.raw))}; +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_F16_D(D)> +HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<float, D>> v) { + const float16x4_t f16 = vcvt_f16_f32(vcombine_f32(v.raw, v.raw)); + return VFromD<D>(vreinterpret_u16_f16(f16)); +} + +#else + +template <class D, HWY_IF_F16_D(D)> +HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<float, D>> v) { + const RebindToUnsigned<decltype(df16)> du16; + const Rebind<uint32_t, decltype(du16)> du; + const RebindToSigned<decltype(du)> di; + const auto bits32 = BitCast(du, v); + const auto sign = ShiftRight<31>(bits32); + const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF); + const auto mantissa32 = bits32 & Set(du, 0x7FFFFF); + + const auto k15 = Set(di, 15); + const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15); + const auto is_tiny = exp < Set(di, -24); + + const auto is_subnormal = exp < Set(di, -14); + const auto biased_exp16 = + BitCast(du, IfThenZeroElse(is_subnormal, exp + k15)); + const auto sub_exp = BitCast(du, Set(di, -14) - exp); // [1, 11) + const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) + + (mantissa32 >> (Set(du, 13) + sub_exp)); + const auto mantissa16 = IfThenElse(RebindMask(du, is_subnormal), sub_m, + ShiftRight<13>(mantissa32)); // <1024 + + const auto sign16 = ShiftLeft<15>(sign); + const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16; + const auto bits16 = IfThenZeroElse(is_tiny, BitCast(di, normal16)); + return VFromD<D>(DemoteTo(du16, bits16).raw); +} + +#endif + +template <class D, HWY_IF_BF16_D(D)> +HWY_API VFromD<D> DemoteTo(D dbf16, VFromD<Rebind<float, D>> v) { + const 
Rebind<int32_t, decltype(dbf16)> di32; + const Rebind<uint32_t, decltype(dbf16)> du32; // for logical shift right + const Rebind<uint16_t, decltype(dbf16)> du16; + const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v))); + return BitCast(dbf16, DemoteTo(du16, bits_in_32)); +} + +#if HWY_ARCH_ARM_A64 + +template <class D, HWY_IF_F32_D(D)> +HWY_API Vec64<float> DemoteTo(D /* tag */, Vec128<double> v) { + return Vec64<float>(vcvt_f32_f64(v.raw)); +} +template <class D, HWY_IF_F32_D(D)> +HWY_API Vec32<float> DemoteTo(D /* tag */, Vec64<double> v) { + return Vec32<float>(vcvt_f32_f64(vcombine_f64(v.raw, v.raw))); +} + +template <class D, HWY_IF_I32_D(D)> +HWY_API Vec64<int32_t> DemoteTo(D /* tag */, Vec128<double> v) { + const int64x2_t i64 = vcvtq_s64_f64(v.raw); + return Vec64<int32_t>(vqmovn_s64(i64)); +} +template <class D, HWY_IF_I32_D(D)> +HWY_API Vec32<int32_t> DemoteTo(D /* tag */, Vec64<double> v) { + // There is no i64x1 -> i32x1 narrow, so Combine to 128-bit. Do so with the + // f64 input already to also avoid the missing vcvt_s64_f64 in GCC 6.4. 
+ const Full128<double> ddt; + const Full128<int64_t> dit; + return Vec32<int32_t>(vqmovn_s64(ConvertTo(dit, Combine(ddt, v, v)).raw)); +} + +#endif + +HWY_API Vec32<uint8_t> U8FromU32(Vec128<uint32_t> v) { + const uint8x16_t org_v = detail::BitCastToByte(v).raw; + const uint8x16_t w = vuzp1q_u8(org_v, org_v); + return Vec32<uint8_t>(vget_low_u8(vuzp1q_u8(w, w))); +} +template <size_t N, HWY_IF_V_SIZE_LE(uint32_t, N, 8)> +HWY_API Vec128<uint8_t, N> U8FromU32(Vec128<uint32_t, N> v) { + const uint8x8_t org_v = detail::BitCastToByte(v).raw; + const uint8x8_t w = vuzp1_u8(org_v, org_v); + return Vec128<uint8_t, N>(vuzp1_u8(w, w)); +} + +// ------------------------------ Round (IfThenElse, mask, logical) + +#if HWY_ARCH_ARM_A64 +// Toward nearest integer +HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Round, vrndn, _, 1) + +// Toward zero, aka truncate +HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Trunc, vrnd, _, 1) + +// Toward +infinity, aka ceiling +HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Ceil, vrndp, _, 1) + +// Toward -infinity, aka floor +HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Floor, vrndm, _, 1) +#else + +// ------------------------------ Trunc + +// Armv7 only supports truncation to integer. We can either convert back to +// float (3 floating-point and 2 logic operations) or manipulate the binary32 +// representation, clearing the lowest 23-exp mantissa bits. This requires 9 +// integer operations and 3 constants, which is likely more expensive. + +namespace detail { + +// The original value is already the desired result if NaN or the magnitude is +// large (i.e. the value is already an integer). 
+template <size_t N> +HWY_INLINE Mask128<float, N> UseInt(const Vec128<float, N> v) { + return Abs(v) < Set(Simd<float, N, 0>(), MantissaEnd<float>()); +} + +} // namespace detail + +template <size_t N> +HWY_API Vec128<float, N> Trunc(const Vec128<float, N> v) { + const DFromV<decltype(v)> df; + const RebindToSigned<decltype(df)> di; + + const auto integer = ConvertTo(di, v); // round toward 0 + const auto int_f = ConvertTo(df, integer); + + return IfThenElse(detail::UseInt(v), int_f, v); +} + +template <size_t N> +HWY_API Vec128<float, N> Round(const Vec128<float, N> v) { + const DFromV<decltype(v)> df; + + // Armv7 also lacks a native NearestInt, but we can instead rely on rounding + // (we assume the current mode is nearest-even) after addition with a large + // value such that no mantissa bits remain. We may need a compiler flag for + // precise floating-point to prevent this from being "optimized" out. + const auto max = Set(df, MantissaEnd<float>()); + const auto large = CopySignToAbs(max, v); + const auto added = large + v; + const auto rounded = added - large; + + // Keep original if NaN or the magnitude is large (already an int). + return IfThenElse(Abs(v) < max, rounded, v); +} + +template <size_t N> +HWY_API Vec128<float, N> Ceil(const Vec128<float, N> v) { + const DFromV<decltype(v)> df; + const RebindToSigned<decltype(df)> di; + + const auto integer = ConvertTo(di, v); // round toward 0 + const auto int_f = ConvertTo(df, integer); + + // Truncating a positive non-integer ends up smaller; if so, add 1. 
+ const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f < v))); + + return IfThenElse(detail::UseInt(v), int_f - neg1, v); +} + +template <size_t N> +HWY_API Vec128<float, N> Floor(const Vec128<float, N> v) { + const DFromV<decltype(v)> df; + const RebindToSigned<decltype(df)> di; + + const auto integer = ConvertTo(di, v); // round toward 0 + const auto int_f = ConvertTo(df, integer); + + // Truncating a negative non-integer ends up larger; if so, subtract 1. + const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f > v))); + + return IfThenElse(detail::UseInt(v), int_f + neg1, v); +} + +#endif + +// ------------------------------ NearestInt (Round) + +#if HWY_ARCH_ARM_A64 + +HWY_API Vec128<int32_t> NearestInt(const Vec128<float> v) { + return Vec128<int32_t>(vcvtnq_s32_f32(v.raw)); +} +template <size_t N, HWY_IF_V_SIZE_LE(float, N, 8)> +HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) { + return Vec128<int32_t, N>(vcvtn_s32_f32(v.raw)); +} + +#else + +template <size_t N> +HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) { + const RebindToSigned<DFromV<decltype(v)>> di; + return ConvertTo(di, Round(v)); +} + +#endif + +// ------------------------------ Floating-point classification +template <typename T, size_t N> +HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) { + return v != v; +} + +template <typename T, size_t N, HWY_IF_FLOAT(T)> +HWY_API Mask128<T, N> IsInf(const Vec128<T, N> v) { + const DFromV<decltype(v)> d; + const RebindToSigned<decltype(d)> di; + const VFromD<decltype(di)> vi = BitCast(di, v); + // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0. + return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2<T>()))); +} + +// Returns whether normal/subnormal/zero. 
+template <typename T, size_t N, HWY_IF_FLOAT(T)> +HWY_API Mask128<T, N> IsFinite(const Vec128<T, N> v) { + const DFromV<decltype(v)> d; + const RebindToUnsigned<decltype(d)> du; + const RebindToSigned<decltype(d)> di; // cheaper than unsigned comparison + const VFromD<decltype(du)> vu = BitCast(du, v); + // 'Shift left' to clear the sign bit, then right so we can compare with the + // max exponent (cannot compare with MaxExponentTimes2 directly because it is + // negative and non-negative floats would be greater). + const VFromD<decltype(di)> exp = + BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu))); + return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>()))); +} + +// ================================================== SWIZZLE + +// ------------------------------ LowerHalf + +// <= 64 bit: just return different type +template <typename T, size_t N, HWY_IF_V_SIZE_LE(uint8_t, N, 8)> +HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) { + return Vec128<T, N / 2>(v.raw); +} + +HWY_API Vec64<uint8_t> LowerHalf(Vec128<uint8_t> v) { + return Vec64<uint8_t>(vget_low_u8(v.raw)); +} +HWY_API Vec64<uint16_t> LowerHalf(Vec128<uint16_t> v) { + return Vec64<uint16_t>(vget_low_u16(v.raw)); +} +HWY_API Vec64<uint32_t> LowerHalf(Vec128<uint32_t> v) { + return Vec64<uint32_t>(vget_low_u32(v.raw)); +} +HWY_API Vec64<uint64_t> LowerHalf(Vec128<uint64_t> v) { + return Vec64<uint64_t>(vget_low_u64(v.raw)); +} +HWY_API Vec64<int8_t> LowerHalf(Vec128<int8_t> v) { + return Vec64<int8_t>(vget_low_s8(v.raw)); +} +HWY_API Vec64<int16_t> LowerHalf(Vec128<int16_t> v) { + return Vec64<int16_t>(vget_low_s16(v.raw)); +} +HWY_API Vec64<int32_t> LowerHalf(Vec128<int32_t> v) { + return Vec64<int32_t>(vget_low_s32(v.raw)); +} +HWY_API Vec64<int64_t> LowerHalf(Vec128<int64_t> v) { + return Vec64<int64_t>(vget_low_s64(v.raw)); +} +HWY_API Vec64<float> LowerHalf(Vec128<float> v) { + return Vec64<float>(vget_low_f32(v.raw)); +} +#if HWY_ARCH_ARM_A64 +HWY_API Vec64<double> 
LowerHalf(Vec128<double> v) { + return Vec64<double>(vget_low_f64(v.raw)); +} +#endif +HWY_API Vec64<bfloat16_t> LowerHalf(Vec128<bfloat16_t> v) { + const Full128<uint16_t> du; + const Full64<bfloat16_t> dbh; + return BitCast(dbh, LowerHalf(BitCast(du, v))); +} + +template <class DH> +HWY_API VFromD<DH> LowerHalf(DH /* tag */, VFromD<Twice<DH>> v) { + return LowerHalf(v); +} + +// ------------------------------ CombineShiftRightBytes + +// 128-bit +template <int kBytes, class D, typename T = TFromD<D>> +HWY_API Vec128<T> CombineShiftRightBytes(D d, Vec128<T> hi, Vec128<T> lo) { + static_assert(0 < kBytes && kBytes < 16, "kBytes must be in [1, 15]"); + const Repartition<uint8_t, decltype(d)> d8; + uint8x16_t v8 = vextq_u8(BitCast(d8, lo).raw, BitCast(d8, hi).raw, kBytes); + return BitCast(d, Vec128<uint8_t>(v8)); +} + +// 64-bit +template <int kBytes, class D, typename T = TFromD<D>> +HWY_API Vec64<T> CombineShiftRightBytes(D d, Vec64<T> hi, Vec64<T> lo) { + static_assert(0 < kBytes && kBytes < 8, "kBytes must be in [1, 7]"); + const Repartition<uint8_t, decltype(d)> d8; + uint8x8_t v8 = vext_u8(BitCast(d8, lo).raw, BitCast(d8, hi).raw, kBytes); + return BitCast(d, VFromD<decltype(d8)>(v8)); +} + +// <= 32-bit defined after ShiftLeftBytes. + +// ------------------------------ Shift vector by constant #bytes + +namespace detail { + +// Partially specialize because kBytes = 0 and >= size are compile errors; +// callers replace the latter with 0xFF for easier specialization. +template <int kBytes> +struct ShiftLeftBytesT { + // Full + template <class T> + HWY_INLINE Vec128<T> operator()(const Vec128<T> v) { + const Full128<T> d; + return CombineShiftRightBytes<16 - kBytes>(d, v, Zero(d)); + } + + // Partial + template <class T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)> + HWY_INLINE Vec128<T, N> operator()(const Vec128<T, N> v) { + // Expand to 64-bit so we only use the native EXT instruction. 
+ const Full64<T> d64; + const auto zero64 = Zero(d64); + const decltype(zero64) v64(v.raw); + return Vec128<T, N>( + CombineShiftRightBytes<8 - kBytes>(d64, v64, zero64).raw); + } +}; +template <> +struct ShiftLeftBytesT<0> { + template <class T, size_t N> + HWY_INLINE Vec128<T, N> operator()(const Vec128<T, N> v) { + return v; + } +}; +template <> +struct ShiftLeftBytesT<0xFF> { + template <class T, size_t N> + HWY_INLINE Vec128<T, N> operator()(const Vec128<T, N> v) { + return Xor(v, v); + } +}; + +template <int kBytes> +struct ShiftRightBytesT { + template <class T, size_t N> + HWY_INLINE Vec128<T, N> operator()(Vec128<T, N> v) { + const DFromV<decltype(v)> d; + // For < 64-bit vectors, zero undefined lanes so we shift in zeros. + if (d.MaxBytes() < 8) { + constexpr size_t kReg = d.MaxBytes() == 16 ? 16 : 8; + const Simd<T, kReg / sizeof(T), 0> dreg; + v = Vec128<T, N>( + IfThenElseZero(FirstN(dreg, N), VFromD<decltype(dreg)>(v.raw)).raw); + } + return CombineShiftRightBytes<kBytes>(d, Zero(d), v); + } +}; +template <> +struct ShiftRightBytesT<0> { + template <class T, size_t N> + HWY_INLINE Vec128<T, N> operator()(const Vec128<T, N> v) { + return v; + } +}; +template <> +struct ShiftRightBytesT<0xFF> { + template <class T, size_t N> + HWY_INLINE Vec128<T, N> operator()(const Vec128<T, N> v) { + return Xor(v, v); + } +}; + +} // namespace detail + +template <int kBytes, class D> +HWY_API VFromD<D> ShiftLeftBytes(D d, VFromD<D> v) { + return detail::ShiftLeftBytesT<(kBytes >= d.MaxBytes() ? 
0xFF : kBytes)>()(v); +} + +template <int kBytes, typename T, size_t N> +HWY_API Vec128<T, N> ShiftLeftBytes(Vec128<T, N> v) { + return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v); +} + +template <int kLanes, class D> +HWY_API VFromD<D> ShiftLeftLanes(D d, VFromD<D> v) { + const Repartition<uint8_t, decltype(d)> d8; + return BitCast(d, ShiftLeftBytes<kLanes * sizeof(TFromD<D>)>(BitCast(d8, v))); +} + +template <int kLanes, typename T, size_t N> +HWY_API Vec128<T, N> ShiftLeftLanes(Vec128<T, N> v) { + return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v); +} + +// 0x01..0F, kBytes = 1 => 0x0001..0E +template <int kBytes, class D> +HWY_API VFromD<D> ShiftRightBytes(D d, VFromD<D> v) { + return detail::ShiftRightBytesT<(kBytes >= d.MaxBytes() ? 0xFF : kBytes)>()( + v); +} + +template <int kLanes, class D> +HWY_API VFromD<D> ShiftRightLanes(D d, VFromD<D> v) { + const Repartition<uint8_t, decltype(d)> d8; + return BitCast( + d, ShiftRightBytes<kLanes * sizeof(TFromD<D>)>(d8, BitCast(d8, v))); +} + +// Calls ShiftLeftBytes +template <int kBytes, class D, HWY_IF_V_SIZE_LE_D(D, 4)> +HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) { + constexpr size_t kSize = d.MaxBytes(); + static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid"); + const Repartition<uint8_t, decltype(d)> d8; + const Full64<uint8_t> d_full8; + const Repartition<TFromD<D>, decltype(d_full8)> d_full; + using V64 = VFromD<decltype(d_full8)>; + const V64 hi64(BitCast(d8, hi).raw); + // Move into most-significant bytes + const V64 lo64 = ShiftLeftBytes<8 - kSize>(V64(BitCast(d8, lo).raw)); + const V64 r = CombineShiftRightBytes<8 - kSize + kBytes>(d_full8, hi64, lo64); + // After casting to full 64-bit vector of correct type, shrink to 32-bit + return VFromD<D>(BitCast(d_full, r).raw); +} + +// ------------------------------ UpperHalf (ShiftRightBytes) + +// Full input +template <class D, HWY_IF_U8_D(D)> +HWY_API Vec64<uint8_t> UpperHalf(D /* tag */, Vec128<uint8_t> v) { + 
return Vec64<uint8_t>(vget_high_u8(v.raw)); +} +template <class D, HWY_IF_U16_D(D)> +HWY_API Vec64<uint16_t> UpperHalf(D /* tag */, Vec128<uint16_t> v) { + return Vec64<uint16_t>(vget_high_u16(v.raw)); +} +template <class D, HWY_IF_U32_D(D)> +HWY_API Vec64<uint32_t> UpperHalf(D /* tag */, Vec128<uint32_t> v) { + return Vec64<uint32_t>(vget_high_u32(v.raw)); +} +template <class D, HWY_IF_U64_D(D)> +HWY_API Vec64<uint64_t> UpperHalf(D /* tag */, Vec128<uint64_t> v) { + return Vec64<uint64_t>(vget_high_u64(v.raw)); +} +template <class D, HWY_IF_I8_D(D)> +HWY_API Vec64<int8_t> UpperHalf(D /* tag */, Vec128<int8_t> v) { + return Vec64<int8_t>(vget_high_s8(v.raw)); +} +template <class D, HWY_IF_I16_D(D)> +HWY_API Vec64<int16_t> UpperHalf(D /* tag */, Vec128<int16_t> v) { + return Vec64<int16_t>(vget_high_s16(v.raw)); +} +template <class D, HWY_IF_I32_D(D)> +HWY_API Vec64<int32_t> UpperHalf(D /* tag */, Vec128<int32_t> v) { + return Vec64<int32_t>(vget_high_s32(v.raw)); +} +template <class D, HWY_IF_I64_D(D)> +HWY_API Vec64<int64_t> UpperHalf(D /* tag */, Vec128<int64_t> v) { + return Vec64<int64_t>(vget_high_s64(v.raw)); +} +template <class D, HWY_IF_F32_D(D)> +HWY_API Vec64<float> UpperHalf(D /* tag */, Vec128<float> v) { + return Vec64<float>(vget_high_f32(v.raw)); +} +#if HWY_ARCH_ARM_A64 +template <class D, HWY_IF_F64_D(D)> +HWY_API Vec64<double> UpperHalf(D /* tag */, Vec128<double> v) { + return Vec64<double>(vget_high_f64(v.raw)); +} +#endif + +template <class D, HWY_IF_BF16_D(D)> +HWY_API Vec64<bfloat16_t> UpperHalf(D dbh, Vec128<bfloat16_t> v) { + const RebindToUnsigned<decltype(dbh)> duh; + const Twice<decltype(duh)> du; + return BitCast(dbh, UpperHalf(duh, BitCast(du, v))); +} + +// Partial +template <class DH, HWY_IF_V_SIZE_LE_D(DH, 4)> +HWY_API VFromD<DH> UpperHalf(DH dh, VFromD<Twice<DH>> v) { + const Twice<DH> d; + const RebindToUnsigned<decltype(d)> du; + const VFromD<decltype(du)> upper = + ShiftRightBytes<dh.MaxBytes()>(du, BitCast(du, v)); + return 
VFromD<DH>(BitCast(d, upper).raw); +} + +// ------------------------------ Broadcast/splat any lane + +#if HWY_ARCH_ARM_A64 +// Unsigned +template <int kLane> +HWY_API Vec128<uint16_t> Broadcast(Vec128<uint16_t> v) { + static_assert(0 <= kLane && kLane < 8, "Invalid lane"); + return Vec128<uint16_t>(vdupq_laneq_u16(v.raw, kLane)); +} +template <int kLane, size_t N, HWY_IF_V_SIZE_LE(uint16_t, N, 8)> +HWY_API Vec128<uint16_t, N> Broadcast(Vec128<uint16_t, N> v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); + return Vec128<uint16_t, N>(vdup_lane_u16(v.raw, kLane)); +} +template <int kLane> +HWY_API Vec128<uint32_t> Broadcast(Vec128<uint32_t> v) { + static_assert(0 <= kLane && kLane < 4, "Invalid lane"); + return Vec128<uint32_t>(vdupq_laneq_u32(v.raw, kLane)); +} +template <int kLane, size_t N, HWY_IF_V_SIZE_LE(uint32_t, N, 8)> +HWY_API Vec128<uint32_t, N> Broadcast(Vec128<uint32_t, N> v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); + return Vec128<uint32_t, N>(vdup_lane_u32(v.raw, kLane)); +} +template <int kLane> +HWY_API Vec128<uint64_t> Broadcast(Vec128<uint64_t> v) { + static_assert(0 <= kLane && kLane < 2, "Invalid lane"); + return Vec128<uint64_t>(vdupq_laneq_u64(v.raw, kLane)); +} +// Vec64<uint64_t> is defined below. 
+ +// Signed +template <int kLane> +HWY_API Vec128<int16_t> Broadcast(Vec128<int16_t> v) { + static_assert(0 <= kLane && kLane < 8, "Invalid lane"); + return Vec128<int16_t>(vdupq_laneq_s16(v.raw, kLane)); +} +template <int kLane, size_t N, HWY_IF_V_SIZE_LE(int16_t, N, 8)> +HWY_API Vec128<int16_t, N> Broadcast(Vec128<int16_t, N> v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); + return Vec128<int16_t, N>(vdup_lane_s16(v.raw, kLane)); +} +template <int kLane> +HWY_API Vec128<int32_t> Broadcast(Vec128<int32_t> v) { + static_assert(0 <= kLane && kLane < 4, "Invalid lane"); + return Vec128<int32_t>(vdupq_laneq_s32(v.raw, kLane)); +} +template <int kLane, size_t N, HWY_IF_V_SIZE_LE(int32_t, N, 8)> +HWY_API Vec128<int32_t, N> Broadcast(Vec128<int32_t, N> v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); + return Vec128<int32_t, N>(vdup_lane_s32(v.raw, kLane)); +} +template <int kLane> +HWY_API Vec128<int64_t> Broadcast(Vec128<int64_t> v) { + static_assert(0 <= kLane && kLane < 2, "Invalid lane"); + return Vec128<int64_t>(vdupq_laneq_s64(v.raw, kLane)); +} +// Vec64<int64_t> is defined below. + +// Float +template <int kLane> +HWY_API Vec128<float> Broadcast(Vec128<float> v) { + static_assert(0 <= kLane && kLane < 4, "Invalid lane"); + return Vec128<float>(vdupq_laneq_f32(v.raw, kLane)); +} +template <int kLane, size_t N, HWY_IF_V_SIZE_LE(float, N, 8)> +HWY_API Vec128<float, N> Broadcast(Vec128<float, N> v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); + return Vec128<float, N>(vdup_lane_f32(v.raw, kLane)); +} +template <int kLane> +HWY_API Vec128<double> Broadcast(Vec128<double> v) { + static_assert(0 <= kLane && kLane < 2, "Invalid lane"); + return Vec128<double>(vdupq_laneq_f64(v.raw, kLane)); +} +template <int kLane> +HWY_API Vec64<double> Broadcast(Vec64<double> v) { + static_assert(0 <= kLane && kLane < 1, "Invalid lane"); + return v; +} + +#else +// No vdupq_laneq_* on armv7: use vgetq_lane_* + vdupq_n_*. 
+ +// Unsigned +template <int kLane> +HWY_API Vec128<uint16_t> Broadcast(Vec128<uint16_t> v) { + static_assert(0 <= kLane && kLane < 8, "Invalid lane"); + return Vec128<uint16_t>(vdupq_n_u16(vgetq_lane_u16(v.raw, kLane))); +} +template <int kLane, size_t N, HWY_IF_V_SIZE_LE(uint16_t, N, 8)> +HWY_API Vec128<uint16_t, N> Broadcast(Vec128<uint16_t, N> v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); + return Vec128<uint16_t, N>(vdup_lane_u16(v.raw, kLane)); +} +template <int kLane> +HWY_API Vec128<uint32_t> Broadcast(Vec128<uint32_t> v) { + static_assert(0 <= kLane && kLane < 4, "Invalid lane"); + return Vec128<uint32_t>(vdupq_n_u32(vgetq_lane_u32(v.raw, kLane))); +} +template <int kLane, size_t N, HWY_IF_V_SIZE_LE(uint32_t, N, 8)> +HWY_API Vec128<uint32_t, N> Broadcast(Vec128<uint32_t, N> v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); + return Vec128<uint32_t, N>(vdup_lane_u32(v.raw, kLane)); +} +template <int kLane> +HWY_API Vec128<uint64_t> Broadcast(Vec128<uint64_t> v) { + static_assert(0 <= kLane && kLane < 2, "Invalid lane"); + return Vec128<uint64_t>(vdupq_n_u64(vgetq_lane_u64(v.raw, kLane))); +} +// Vec64<uint64_t> is defined below. 
+ +// Signed +template <int kLane> +HWY_API Vec128<int16_t> Broadcast(Vec128<int16_t> v) { + static_assert(0 <= kLane && kLane < 8, "Invalid lane"); + return Vec128<int16_t>(vdupq_n_s16(vgetq_lane_s16(v.raw, kLane))); +} +template <int kLane, size_t N, HWY_IF_V_SIZE_LE(int16_t, N, 8)> +HWY_API Vec128<int16_t, N> Broadcast(Vec128<int16_t, N> v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); + return Vec128<int16_t, N>(vdup_lane_s16(v.raw, kLane)); +} +template <int kLane> +HWY_API Vec128<int32_t> Broadcast(Vec128<int32_t> v) { + static_assert(0 <= kLane && kLane < 4, "Invalid lane"); + return Vec128<int32_t>(vdupq_n_s32(vgetq_lane_s32(v.raw, kLane))); +} +template <int kLane, size_t N, HWY_IF_V_SIZE_LE(int32_t, N, 8)> +HWY_API Vec128<int32_t, N> Broadcast(Vec128<int32_t, N> v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); + return Vec128<int32_t, N>(vdup_lane_s32(v.raw, kLane)); +} +template <int kLane> +HWY_API Vec128<int64_t> Broadcast(Vec128<int64_t> v) { + static_assert(0 <= kLane && kLane < 2, "Invalid lane"); + return Vec128<int64_t>(vdupq_n_s64(vgetq_lane_s64(v.raw, kLane))); +} +// Vec64<int64_t> is defined below. 
+ +// Float +template <int kLane> +HWY_API Vec128<float> Broadcast(Vec128<float> v) { + static_assert(0 <= kLane && kLane < 4, "Invalid lane"); + return Vec128<float>(vdupq_n_f32(vgetq_lane_f32(v.raw, kLane))); +} +template <int kLane, size_t N, HWY_IF_V_SIZE_LE(float, N, 8)> +HWY_API Vec128<float, N> Broadcast(Vec128<float, N> v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); + return Vec128<float, N>(vdup_lane_f32(v.raw, kLane)); +} + +#endif + +template <int kLane> +HWY_API Vec64<uint64_t> Broadcast(Vec64<uint64_t> v) { + static_assert(0 <= kLane && kLane < 1, "Invalid lane"); + return v; +} +template <int kLane> +HWY_API Vec64<int64_t> Broadcast(Vec64<int64_t> v) { + static_assert(0 <= kLane && kLane < 1, "Invalid lane"); + return v; +} + +// ------------------------------ TableLookupLanes + +// Returned by SetTableIndices for use by TableLookupLanes. +template <typename T, size_t N> +struct Indices128 { + typename detail::Raw128<T, N>::type raw; +}; + +namespace detail { + +template <class D, HWY_IF_T_SIZE_D(D, 1)> +HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes( + D d) { + const Repartition<uint8_t, decltype(d)> d8; + return Iota(d8, 0); +} + +template <class D, HWY_IF_T_SIZE_D(D, 2)> +HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes( + D d) { + const Repartition<uint8_t, decltype(d)> d8; + alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = { + 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}; + return Load(d8, kBroadcastLaneBytes); +} + +template <class D, HWY_IF_T_SIZE_D(D, 4)> +HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes( + D d) { + const Repartition<uint8_t, decltype(d)> d8; + alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = { + 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12}; + return Load(d8, kBroadcastLaneBytes); +} + +template <class D, HWY_IF_T_SIZE_D(D, 8)> +HWY_INLINE VFromD<Repartition<uint8_t, D>> 
IndicesFromVecBroadcastLaneBytes( + D d) { + const Repartition<uint8_t, decltype(d)> d8; + alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = { + 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8}; + return Load(d8, kBroadcastLaneBytes); +} + +template <class D, HWY_IF_T_SIZE_D(D, 1)> +HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) { + const Repartition<uint8_t, decltype(d)> d8; + return Zero(d8); +} + +template <class D, HWY_IF_T_SIZE_D(D, 2)> +HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) { + const Repartition<uint8_t, decltype(d)> d8; + alignas(16) static constexpr uint8_t kByteOffsets[16] = { + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1}; + return Load(d8, kByteOffsets); +} + +template <class D, HWY_IF_T_SIZE_D(D, 4)> +HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) { + const Repartition<uint8_t, decltype(d)> d8; + alignas(16) static constexpr uint8_t kByteOffsets[16] = { + 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3}; + return Load(d8, kByteOffsets); +} + +template <class D, HWY_IF_T_SIZE_D(D, 8)> +HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) { + const Repartition<uint8_t, decltype(d)> d8; + alignas(16) static constexpr uint8_t kByteOffsets[16] = { + 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7}; + return Load(d8, kByteOffsets); +} + +} // namespace detail + +template <class D, typename TI, HWY_IF_T_SIZE_D(D, 1)> +HWY_API Indices128<TFromD<D>, MaxLanes(D())> IndicesFromVec( + D d, Vec128<TI, MaxLanes(D())> vec) { + using T = TFromD<D>; + static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); +#if HWY_IS_DEBUG_BUILD + const RebindToUnsigned<decltype(d)> du; + using TU = TFromD<decltype(du)>; + HWY_DASSERT(AllTrue( + du, Lt(BitCast(du, vec), Set(du, static_cast<TU>(MaxLanes(d) * 2))))); +#endif + + (void)d; + return Indices128<TFromD<D>, MaxLanes(D())>{BitCast(d, vec).raw}; +} + +template <class D, typename TI, + 
HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 2) | (1 << 4) | (1 << 8))> +HWY_API Indices128<TFromD<D>, MaxLanes(D())> IndicesFromVec( + D d, Vec128<TI, MaxLanes(D())> vec) { + using T = TFromD<D>; + static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); +#if HWY_IS_DEBUG_BUILD + const RebindToUnsigned<decltype(d)> du; + using TU = TFromD<decltype(du)>; + HWY_DASSERT(AllTrue( + du, Lt(BitCast(du, vec), Set(du, static_cast<TU>(MaxLanes(d) * 2))))); +#endif + + const Repartition<uint8_t, decltype(d)> d8; + using V8 = VFromD<decltype(d8)>; + + // Broadcast each lane index to all bytes of T and shift to bytes + const V8 lane_indices = TableLookupBytes( + BitCast(d8, vec), detail::IndicesFromVecBroadcastLaneBytes(d)); + constexpr int kIndexShiftAmt = static_cast<int>(FloorLog2(sizeof(T))); + const V8 byte_indices = ShiftLeft<kIndexShiftAmt>(lane_indices); + const V8 sum = Add(byte_indices, detail::IndicesFromVecByteOffsets(d)); + return Indices128<TFromD<D>, MaxLanes(D())>{BitCast(d, sum).raw}; +} + +template <class D, typename TI> +HWY_API Indices128<TFromD<D>, MaxLanes(D())> SetTableIndices(D d, + const TI* idx) { + const Rebind<TI, decltype(d)> di; + return IndicesFromVec(d, LoadU(di, idx)); +} + +template <typename T, size_t N> +HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) { + const DFromV<decltype(v)> d; + const RebindToSigned<decltype(d)> di; + return BitCast( + d, TableLookupBytes(BitCast(di, v), BitCast(di, Vec128<T, N>{idx.raw}))); +} + +template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 4)> +HWY_API Vec128<T, N> TwoTablesLookupLanes(Vec128<T, N> a, Vec128<T, N> b, + Indices128<T, N> idx) { + const DFromV<decltype(a)> d; + const Twice<decltype(d)> dt; +// TableLookupLanes currently requires table and index vectors to be the same +// size, though a half-length index vector would be sufficient here. 
+#if HWY_IS_MSAN + const Vec128<T, N> idx_vec{idx.raw}; + const Indices128<T, N * 2> idx2{Combine(dt, idx_vec, idx_vec).raw}; +#else + // We only keep LowerHalf of the result, which is valid in idx. + const Indices128<T, N * 2> idx2{idx.raw}; +#endif + return LowerHalf(d, TableLookupLanes(Combine(dt, b, a), idx2)); +} + +template <typename T> +HWY_API Vec64<T> TwoTablesLookupLanes(Vec64<T> a, Vec64<T> b, + Indices128<T, 8 / sizeof(T)> idx) { + const DFromV<decltype(a)> d; + const Repartition<uint8_t, decltype(d)> du8; + const auto a_u8 = BitCast(du8, a); + const auto b_u8 = BitCast(du8, b); + const auto idx_u8 = BitCast(du8, Vec64<T>{idx.raw}); + +#if HWY_ARCH_ARM_A64 + const Twice<decltype(du8)> dt_u8; + return BitCast( + d, Vec64<uint8_t>{vqtbl1_u8(Combine(dt_u8, b_u8, a_u8).raw, idx_u8.raw)}); +#else + detail::Tuple2<uint8_t, du8.MaxLanes()> tup = {{{a_u8.raw, b_u8.raw}}}; + return BitCast(d, Vec64<uint8_t>{vtbl2_u8(tup.raw, idx_u8.raw)}); +#endif +} + +template <typename T> +HWY_API Vec128<T> TwoTablesLookupLanes(Vec128<T> a, Vec128<T> b, + Indices128<T, 16 / sizeof(T)> idx) { + const DFromV<decltype(a)> d; + const Repartition<uint8_t, decltype(d)> du8; + const auto a_u8 = BitCast(du8, a); + const auto b_u8 = BitCast(du8, b); + const auto idx_u8 = BitCast(du8, Vec128<T>{idx.raw}); + +#if HWY_ARCH_ARM_A64 + detail::Tuple2<uint8_t, du8.MaxLanes()> tup = {{{a_u8.raw, b_u8.raw}}}; + return BitCast(d, Vec128<uint8_t>{vqtbl2q_u8(tup.raw, idx_u8.raw)}); +#else + const Half<decltype(d)> dh; + const Repartition<uint8_t, decltype(dh)> dh_u8; + const auto a_lo_u8 = LowerHalf(dh_u8, a_u8); + const auto a_hi_u8 = UpperHalf(dh_u8, a_u8); + const auto b_lo_u8 = LowerHalf(dh_u8, b_u8); + const auto b_hi_u8 = UpperHalf(dh_u8, b_u8); + const auto idx_lo_u8 = LowerHalf(dh_u8, idx_u8); + const auto idx_hi_u8 = UpperHalf(dh_u8, idx_u8); + + detail::Tuple4<uint8_t, dh_u8.MaxLanes()> tup = { + {{a_lo_u8.raw, a_hi_u8.raw, b_lo_u8.raw, b_hi_u8.raw}}}; + const auto lo_result = + 
BitCast(dh, Vec64<uint8_t>{vtbl4_u8(tup.raw, idx_lo_u8.raw)}); + const auto hi_result = + BitCast(dh, Vec64<uint8_t>{vtbl4_u8(tup.raw, idx_hi_u8.raw)}); + return Combine(d, hi_result, lo_result); +#endif +} + +// ------------------------------ Reverse2 (CombineShiftRightBytes) + +// Per-target flag to prevent generic_ops-inl.h defining 8-bit Reverse2/4/8. +#ifdef HWY_NATIVE_REVERSE2_8 +#undef HWY_NATIVE_REVERSE2_8 +#else +#define HWY_NATIVE_REVERSE2_8 +#endif + +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_T_SIZE_D(D, 1)> +HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) { + const RebindToUnsigned<decltype(d)> du; + return BitCast(d, VFromD<decltype(du)>(vrev16_u8(BitCast(du, v).raw))); +} +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec128<T> Reverse2(D d, Vec128<T> v) { + const RebindToUnsigned<decltype(d)> du; + return BitCast(d, Vec128<uint8_t>(vrev16q_u8(BitCast(du, v).raw))); +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_T_SIZE_D(D, 2)> +HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) { + const RebindToUnsigned<decltype(d)> du; + return BitCast(d, VFromD<decltype(du)>(vrev32_u16(BitCast(du, v).raw))); +} +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)> +HWY_API Vec128<T> Reverse2(D d, Vec128<T> v) { + const RebindToUnsigned<decltype(d)> du; + return BitCast(d, Vec128<uint16_t>(vrev32q_u16(BitCast(du, v).raw))); +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_T_SIZE_D(D, 4)> +HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) { + const RebindToUnsigned<decltype(d)> du; + return BitCast(d, VFromD<decltype(du)>(vrev64_u32(BitCast(du, v).raw))); +} +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec128<T> Reverse2(D d, Vec128<T> v) { + const RebindToUnsigned<decltype(d)> du; + return BitCast(d, Vec128<uint32_t>(vrev64q_u32(BitCast(du, v).raw))); +} + +template <class D, HWY_IF_T_SIZE_D(D, 8)> +HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) { + return CombineShiftRightBytes<8>(d, 
v, v); +} + +// ------------------------------ Reverse4 (Reverse2) + +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_T_SIZE_D(D, 1)> +HWY_API VFromD<D> Reverse4(D d, VFromD<D> v) { + const RebindToUnsigned<decltype(d)> du; + return BitCast(d, VFromD<decltype(du)>(vrev32_u8(BitCast(du, v).raw))); +} +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec128<T> Reverse4(D d, Vec128<T> v) { + const RebindToUnsigned<decltype(d)> du; + return BitCast(d, Vec128<uint8_t>(vrev32q_u8(BitCast(du, v).raw))); +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_T_SIZE_D(D, 2)> +HWY_API VFromD<D> Reverse4(D d, VFromD<D> v) { + const RebindToUnsigned<decltype(d)> du; + return BitCast(d, VFromD<decltype(du)>(vrev64_u16(BitCast(du, v).raw))); +} +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)> +HWY_API Vec128<T> Reverse4(D d, Vec128<T> v) { + const RebindToUnsigned<decltype(d)> du; + return BitCast(d, Vec128<uint16_t>(vrev64q_u16(BitCast(du, v).raw))); +} + +template <class D, HWY_IF_T_SIZE_D(D, 4)> +HWY_API VFromD<D> Reverse4(D d, VFromD<D> v) { + const RepartitionToWide<RebindToUnsigned<decltype(d)>> duw; + return BitCast(d, Reverse2(duw, BitCast(duw, Reverse2(d, v)))); +} + +template <class D, HWY_IF_T_SIZE_D(D, 8)> +HWY_API VFromD<D> Reverse4(D /* tag */, VFromD<D>) { + HWY_ASSERT(0); // don't have 8 u64 lanes +} + +// ------------------------------ Reverse8 (Reverse2, Reverse4) + +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_T_SIZE_D(D, 1)> +HWY_API VFromD<D> Reverse8(D d, VFromD<D> v) { + const RebindToUnsigned<decltype(d)> du; + return BitCast(d, VFromD<decltype(du)>(vrev64_u8(BitCast(du, v).raw))); +} +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec128<T> Reverse8(D d, Vec128<T> v) { + const RebindToUnsigned<decltype(d)> du; + return BitCast(d, Vec128<uint8_t>(vrev64q_u8(BitCast(du, v).raw))); +} + +template <class D, HWY_IF_T_SIZE_D(D, 2)> +HWY_API VFromD<D> Reverse8(D d, VFromD<D> v) { + 
const Repartition<uint64_t, decltype(d)> du64; + return BitCast(d, Reverse2(du64, BitCast(du64, Reverse4(d, v)))); +} + +template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))> +HWY_API VFromD<D> Reverse8(D, VFromD<D>) { + HWY_ASSERT(0); // don't have 8 lanes if larger than 16-bit +} + +// ------------------------------ Reverse (Reverse2, Reverse4, Reverse8) + +template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 1)> +HWY_API Vec128<T, 1> Reverse(D /* tag */, Vec128<T, 1> v) { + return v; +} + +template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 2)> +HWY_API Vec128<T, 2> Reverse(D d, Vec128<T, 2> v) { + return Reverse2(d, v); +} + +template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 4)> +HWY_API Vec128<T, 4> Reverse(D d, Vec128<T, 4> v) { + return Reverse4(d, v); +} + +template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 8)> +HWY_API Vec128<T, 8> Reverse(D d, Vec128<T, 8> v) { + return Reverse8(d, v); +} + +template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 16)> +HWY_API Vec128<T> Reverse(D d, Vec128<T> v) { + const Repartition<uint64_t, decltype(d)> du64; + return BitCast(d, Reverse2(du64, BitCast(du64, Reverse8(d, v)))); +} + +// ------------------------------ ReverseBits + +#if HWY_ARCH_ARM_A64 + +#ifdef HWY_NATIVE_REVERSE_BITS_UI8 +#undef HWY_NATIVE_REVERSE_BITS_UI8 +#else +#define HWY_NATIVE_REVERSE_BITS_UI8 +#endif + +HWY_NEON_DEF_FUNCTION_INT_8(ReverseBits, vrbit, _, 1) +HWY_NEON_DEF_FUNCTION_UINT_8(ReverseBits, vrbit, _, 1) + +#endif // HWY_ARCH_ARM_A64 + +// ------------------------------ Other shuffles (TableLookupBytes) + +// Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant). +// Shuffle0321 rotates one lane to the right (the previous least-significant +// lane is now most-significant). These could also be implemented via +// CombineShiftRightBytes but the shuffle_abcd notation is more convenient. 
+ +// Swap 64-bit halves +template <typename T> +HWY_API Vec128<T> Shuffle1032(Vec128<T> v) { + return CombineShiftRightBytes<8>(DFromV<decltype(v)>(), v, v); +} +template <typename T> +HWY_API Vec128<T> Shuffle01(Vec128<T> v) { + return CombineShiftRightBytes<8>(DFromV<decltype(v)>(), v, v); +} + +// Rotate right 32 bits +template <typename T> +HWY_API Vec128<T> Shuffle0321(Vec128<T> v) { + return CombineShiftRightBytes<4>(DFromV<decltype(v)>(), v, v); +} + +// Rotate left 32 bits +template <typename T> +HWY_API Vec128<T> Shuffle2103(Vec128<T> v) { + return CombineShiftRightBytes<12>(DFromV<decltype(v)>(), v, v); +} + +// Reverse +template <typename T> +HWY_API Vec128<T> Shuffle0123(Vec128<T> v) { + return Reverse4(DFromV<decltype(v)>(), v); +} + +// ------------------------------ InterleaveLower + +// Interleaves lanes from halves of the 128-bit blocks of "a" (which provides +// the least-significant lane) and "b". To concatenate two half-width integers +// into one, use ZipLower/Upper instead (also works with scalar). +HWY_NEON_DEF_FUNCTION_INT_8_16_32(InterleaveLower, vzip1, _, 2) +HWY_NEON_DEF_FUNCTION_UINT_8_16_32(InterleaveLower, vzip1, _, 2) + +#if HWY_ARCH_ARM_A64 +// N=1 makes no sense (in that case, there would be no upper/lower). +HWY_API Vec128<uint64_t> InterleaveLower(Vec128<uint64_t> a, + Vec128<uint64_t> b) { + return Vec128<uint64_t>(vzip1q_u64(a.raw, b.raw)); +} +HWY_API Vec128<int64_t> InterleaveLower(Vec128<int64_t> a, Vec128<int64_t> b) { + return Vec128<int64_t>(vzip1q_s64(a.raw, b.raw)); +} +HWY_API Vec128<double> InterleaveLower(Vec128<double> a, Vec128<double> b) { + return Vec128<double>(vzip1q_f64(a.raw, b.raw)); +} +#else +// Emulated version for Armv7. 
+template <typename T, HWY_IF_T_SIZE(T, 8)> +HWY_API Vec128<T> InterleaveLower(Vec128<T> a, Vec128<T> b) { + const DFromV<decltype(a)> d; + return CombineShiftRightBytes<8>(d, b, Shuffle01(a)); +} +#endif + +// Floats +HWY_API Vec128<float> InterleaveLower(Vec128<float> a, Vec128<float> b) { + return Vec128<float>(vzip1q_f32(a.raw, b.raw)); +} +template <size_t N, HWY_IF_V_SIZE_LE(float, N, 8)> +HWY_API Vec128<float, N> InterleaveLower(Vec128<float, N> a, + Vec128<float, N> b) { + return Vec128<float, N>(vzip1_f32(a.raw, b.raw)); +} + +// < 64 bit parts +template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 4)> +HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) { + return Vec128<T, N>(InterleaveLower(Vec64<T>(a.raw), Vec64<T>(b.raw)).raw); +} + +// Additional overload for the optional Simd<> tag. +template <class D> +HWY_API VFromD<D> InterleaveLower(D /* tag */, VFromD<D> a, VFromD<D> b) { + return InterleaveLower(a, b); +} + +// ------------------------------ InterleaveUpper (UpperHalf) + +// All functions inside detail lack the required D parameter. +namespace detail { +HWY_NEON_DEF_FUNCTION_INT_8_16_32(InterleaveUpper, vzip2, _, 2) +HWY_NEON_DEF_FUNCTION_UINT_8_16_32(InterleaveUpper, vzip2, _, 2) + +#if HWY_ARCH_ARM_A64 +// N=1 makes no sense (in that case, there would be no upper/lower). +HWY_API Vec128<uint64_t> InterleaveUpper(Vec128<uint64_t> a, + Vec128<uint64_t> b) { + return Vec128<uint64_t>(vzip2q_u64(a.raw, b.raw)); +} +HWY_API Vec128<int64_t> InterleaveUpper(Vec128<int64_t> a, Vec128<int64_t> b) { + return Vec128<int64_t>(vzip2q_s64(a.raw, b.raw)); +} +HWY_API Vec128<double> InterleaveUpper(Vec128<double> a, Vec128<double> b) { + return Vec128<double>(vzip2q_f64(a.raw, b.raw)); +} +#else +// Emulated version for Armv7. 
+template <typename T, HWY_IF_T_SIZE(T, 8)> +HWY_API Vec128<T> InterleaveUpper(Vec128<T> a, Vec128<T> b) { + const DFromV<decltype(a)> d; + return CombineShiftRightBytes<8>(d, Shuffle01(b), a); +} +#endif + +HWY_API Vec128<float> InterleaveUpper(Vec128<float> a, Vec128<float> b) { + return Vec128<float>(vzip2q_f32(a.raw, b.raw)); +} +HWY_API Vec64<float> InterleaveUpper(Vec64<float> a, Vec64<float> b) { + return Vec64<float>(vzip2_f32(a.raw, b.raw)); +} + +} // namespace detail + +// Full register +template <class D, HWY_IF_V_SIZE_GT_D(D, 4)> +HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) { + return detail::InterleaveUpper(a, b); +} + +// Partial +template <class D, HWY_IF_V_SIZE_LE_D(D, 4)> +HWY_API VFromD<D> InterleaveUpper(D d, VFromD<D> a, VFromD<D> b) { + const Half<decltype(d)> d2; + const VFromD<D> a2(UpperHalf(d2, a).raw); + const VFromD<D> b2(UpperHalf(d2, b).raw); + return InterleaveLower(d, a2, b2); +} + +// ------------------------------ ZipLower/ZipUpper (InterleaveLower) + +// Same as Interleave*, except that the return lanes are double-width integers; +// this is necessary because the single-lane scalar cannot return two values. 
+template <class V, class DW = RepartitionToWide<DFromV<V>>> +HWY_API VFromD<DW> ZipLower(V a, V b) { + return BitCast(DW(), InterleaveLower(a, b)); +} +template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>> +HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) { + return BitCast(dw, InterleaveLower(D(), a, b)); +} + +template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>> +HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) { + return BitCast(dw, InterleaveUpper(D(), a, b)); +} + +// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower) + +template <class D32, HWY_IF_F32_D(D32), + class V16 = VFromD<Repartition<bfloat16_t, D32>>> +HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 df32, V16 a, V16 b, + const VFromD<D32> sum0, + VFromD<D32>& sum1) { + const RebindToUnsigned<decltype(df32)> du32; + using VU32 = VFromD<decltype(du32)>; + const VU32 odd = Set(du32, 0xFFFF0000u); + const VU32 ae = ShiftLeft<16>(BitCast(du32, a)); + const VU32 ao = And(BitCast(du32, a), odd); + const VU32 be = ShiftLeft<16>(BitCast(du32, b)); + const VU32 bo = And(BitCast(du32, b), odd); + sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1); + return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0); +} + +template <class D, HWY_IF_I32_D(D)> +HWY_API Vec128<int32_t> ReorderWidenMulAccumulate(D /*d32*/, Vec128<int16_t> a, + Vec128<int16_t> b, + const Vec128<int32_t> sum0, + Vec128<int32_t>& sum1) { +#if HWY_ARCH_ARM_A64 + sum1 = Vec128<int32_t>(vmlal_high_s16(sum1.raw, a.raw, b.raw)); +#else + const Full64<int16_t> dh; + sum1 = Vec128<int32_t>( + vmlal_s16(sum1.raw, UpperHalf(dh, a).raw, UpperHalf(dh, b).raw)); +#endif + return Vec128<int32_t>( + vmlal_s16(sum0.raw, LowerHalf(a).raw, LowerHalf(b).raw)); +} + +template <class D, HWY_IF_I32_D(D)> +HWY_API Vec64<int32_t> ReorderWidenMulAccumulate(D d32, Vec64<int16_t> a, + Vec64<int16_t> b, + const Vec64<int32_t> sum0, + Vec64<int32_t>& sum1) { + // vmlal writes into the upper half, which the 
caller cannot use, so + // split into two halves. + const Vec128<int32_t> mul_3210(vmull_s16(a.raw, b.raw)); + const Vec64<int32_t> mul_32 = UpperHalf(d32, mul_3210); + sum1 += mul_32; + return sum0 + LowerHalf(mul_3210); +} + +template <class D, HWY_IF_I32_D(D)> +HWY_API Vec32<int32_t> ReorderWidenMulAccumulate(D d32, Vec32<int16_t> a, + Vec32<int16_t> b, + const Vec32<int32_t> sum0, + Vec32<int32_t>& sum1) { + const Vec128<int32_t> mul_xx10(vmull_s16(a.raw, b.raw)); + const Vec64<int32_t> mul_10(LowerHalf(mul_xx10)); + const Vec32<int32_t> mul0 = LowerHalf(d32, mul_10); + const Vec32<int32_t> mul1 = UpperHalf(d32, mul_10); + sum1 += mul1; + return sum0 + mul0; +} + +// ------------------------------ Combine partial (InterleaveLower) +// < 64bit input, <= 64 bit result +template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> +HWY_API VFromD<D> Combine(D d, VFromD<Half<D>> hi, VFromD<Half<D>> lo) { + // First double N (only lower halves will be used). + const VFromD<D> hi2(hi.raw); + const VFromD<D> lo2(lo.raw); + // Repartition to two unsigned lanes (each the size of the valid input). + const Simd<UnsignedFromSize<d.MaxBytes() / 2>, 2, 0> du; + return BitCast(d, InterleaveLower(BitCast(du, lo2), BitCast(du, hi2))); +} + +// ------------------------------ RearrangeToOddPlusEven (Combine) + +template <size_t N> +HWY_API Vec128<float, N> RearrangeToOddPlusEven(Vec128<float, N> sum0, + Vec128<float, N> sum1) { + return Add(sum0, sum1); +} + +HWY_API Vec128<int32_t> RearrangeToOddPlusEven(Vec128<int32_t> sum0, + Vec128<int32_t> sum1) { +// vmlal_s16 multiplied the lower half into sum0 and upper into sum1. 
+#if HWY_ARCH_ARM_A64 // pairwise sum is available and what we want + return Vec128<int32_t>(vpaddq_s32(sum0.raw, sum1.raw)); +#else + const Full128<int32_t> d; + const Half<decltype(d)> d64; + const Vec64<int32_t> hi( + vpadd_s32(LowerHalf(d64, sum1).raw, UpperHalf(d64, sum1).raw)); + const Vec64<int32_t> lo( + vpadd_s32(LowerHalf(d64, sum0).raw, UpperHalf(d64, sum0).raw)); + return Combine(Full128<int32_t>(), hi, lo); +#endif +} + +HWY_API Vec64<int32_t> RearrangeToOddPlusEven(Vec64<int32_t> sum0, + Vec64<int32_t> sum1) { + // vmlal_s16 multiplied the lower half into sum0 and upper into sum1. + return Vec64<int32_t>(vpadd_s32(sum0.raw, sum1.raw)); +} + +HWY_API Vec32<int32_t> RearrangeToOddPlusEven(Vec32<int32_t> sum0, + Vec32<int32_t> sum1) { + // Only one widened sum per register, so add them for sum of odd and even. + return sum0 + sum1; +} + +// ------------------------------ WidenMulPairwiseAdd + +template <class D32, HWY_IF_F32_D(D32), + class V16 = VFromD<Repartition<bfloat16_t, D32>>> +HWY_API VFromD<D32> WidenMulPairwiseAdd(D32 df32, V16 a, V16 b) { + const RebindToUnsigned<decltype(df32)> du32; + using VU32 = VFromD<decltype(du32)>; + const VU32 odd = Set(du32, 0xFFFF0000u); + const VU32 ae = ShiftLeft<16>(BitCast(du32, a)); + const VU32 ao = And(BitCast(du32, a), odd); + const VU32 be = ShiftLeft<16>(BitCast(du32, b)); + const VU32 bo = And(BitCast(du32, b), odd); + return MulAdd(BitCast(df32, ae), BitCast(df32, be), + Mul(BitCast(df32, ao), BitCast(df32, bo))); +} + +template <class D, HWY_IF_I32_D(D)> +HWY_API Vec128<int32_t> WidenMulPairwiseAdd(D /*d32*/, Vec128<int16_t> a, + Vec128<int16_t> b) { + Vec128<int32_t> sum1; +#if HWY_ARCH_ARM_A64 + sum1 = Vec128<int32_t>(vmull_high_s16(a.raw, b.raw)); +#else + const Full64<int16_t> dh; + sum1 = Vec128<int32_t>(vmull_s16(UpperHalf(dh, a).raw, UpperHalf(dh, b).raw)); +#endif + Vec128<int32_t> sum0 = Vec128<int32_t>(vmull_s16(LowerHalf(a).raw, LowerHalf(b).raw)); + return RearrangeToOddPlusEven(sum0, sum1); 
+} + +template <class D, HWY_IF_I32_D(D)> +HWY_API Vec64<int32_t> WidenMulPairwiseAdd(D d32, Vec64<int16_t> a, + Vec64<int16_t> b) { + // vmlal writes into the upper half, which the caller cannot use, so + // split into two halves. + const Vec128<int32_t> mul_3210(vmull_s16(a.raw, b.raw)); + const Vec64<int32_t> mul0 = LowerHalf(mul_3210); + const Vec64<int32_t> mul1 = UpperHalf(d32, mul_3210); + return RearrangeToOddPlusEven(mul0, mul1); +} + +template <class D, HWY_IF_I32_D(D)> +HWY_API Vec32<int32_t> WidenMulPairwiseAdd(D d32, Vec32<int16_t> a, + Vec32<int16_t> b) { + const Vec128<int32_t> mul_xx10(vmull_s16(a.raw, b.raw)); + const Vec64<int32_t> mul_10(LowerHalf(mul_xx10)); + const Vec32<int32_t> mul0 = LowerHalf(d32, mul_10); + const Vec32<int32_t> mul1 = UpperHalf(d32, mul_10); + return RearrangeToOddPlusEven(mul0, mul1); +} + + +// ------------------------------ ZeroExtendVector (Combine) + +template <class D> +HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> lo) { + return Combine(d, Zero(Half<decltype(d)>()), lo); +} + +// ------------------------------ ConcatLowerLower + +// 64 or 128-bit input: just interleave +template <class D, HWY_IF_V_SIZE_GT_D(D, 4)> +HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) { + // Treat half-width input as a single lane and interleave them. + const Repartition<UnsignedFromSize<d.MaxBytes() / 2>, decltype(d)> du; + return BitCast(d, InterleaveLower(BitCast(du, lo), BitCast(du, hi))); +} + +namespace detail { +#if HWY_ARCH_ARM_A64 +HWY_NEON_DEF_FUNCTION_UIF81632(InterleaveEven, vtrn1, _, 2) +HWY_NEON_DEF_FUNCTION_UIF81632(InterleaveOdd, vtrn2, _, 2) +#else + +// vtrn returns a struct with even and odd result. +#define HWY_NEON_BUILD_TPL_HWY_TRN +#define HWY_NEON_BUILD_RET_HWY_TRN(type, size) type##x##size##x2_t +// Pass raw args so we can accept uint16x2 args, for which there is no +// corresponding uint16x2x2 return type. 
+#define HWY_NEON_BUILD_PARAM_HWY_TRN(TYPE, size) \ + Raw128<TYPE##_t, size>::type a, Raw128<TYPE##_t, size>::type b +#define HWY_NEON_BUILD_ARG_HWY_TRN a, b + +// Cannot use UINT8 etc. type macros because the x2_t tuples are only defined +// for full and half vectors. +HWY_NEON_DEF_FUNCTION(uint8, 16, InterleaveEvenOdd, vtrnq, _, u8, HWY_TRN) +HWY_NEON_DEF_FUNCTION(uint8, 8, InterleaveEvenOdd, vtrn, _, u8, HWY_TRN) +HWY_NEON_DEF_FUNCTION(uint16, 8, InterleaveEvenOdd, vtrnq, _, u16, HWY_TRN) +HWY_NEON_DEF_FUNCTION(uint16, 4, InterleaveEvenOdd, vtrn, _, u16, HWY_TRN) +HWY_NEON_DEF_FUNCTION(uint32, 4, InterleaveEvenOdd, vtrnq, _, u32, HWY_TRN) +HWY_NEON_DEF_FUNCTION(uint32, 2, InterleaveEvenOdd, vtrn, _, u32, HWY_TRN) +HWY_NEON_DEF_FUNCTION(int8, 16, InterleaveEvenOdd, vtrnq, _, s8, HWY_TRN) +HWY_NEON_DEF_FUNCTION(int8, 8, InterleaveEvenOdd, vtrn, _, s8, HWY_TRN) +HWY_NEON_DEF_FUNCTION(int16, 8, InterleaveEvenOdd, vtrnq, _, s16, HWY_TRN) +HWY_NEON_DEF_FUNCTION(int16, 4, InterleaveEvenOdd, vtrn, _, s16, HWY_TRN) +HWY_NEON_DEF_FUNCTION(int32, 4, InterleaveEvenOdd, vtrnq, _, s32, HWY_TRN) +HWY_NEON_DEF_FUNCTION(int32, 2, InterleaveEvenOdd, vtrn, _, s32, HWY_TRN) +HWY_NEON_DEF_FUNCTION(float32, 4, InterleaveEvenOdd, vtrnq, _, f32, HWY_TRN) +HWY_NEON_DEF_FUNCTION(float32, 2, InterleaveEvenOdd, vtrn, _, f32, HWY_TRN) +#endif +} // namespace detail + +// <= 32-bit input/output +template <class D, HWY_IF_V_SIZE_LE_D(D, 4)> +HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) { + // Treat half-width input as two lanes and take every second one. 
+ const Repartition<UnsignedFromSize<d.MaxBytes() / 2>, decltype(d)> du; +#if HWY_ARCH_ARM_A64 + return BitCast(d, detail::InterleaveEven(BitCast(du, lo), BitCast(du, hi))); +#else + using VU = VFromD<decltype(du)>; + return BitCast( + d, VU(detail::InterleaveEvenOdd(BitCast(du, lo).raw, BitCast(du, hi).raw) + .val[0])); +#endif +} + +// ------------------------------ ConcatUpperUpper + +// 64 or 128-bit input: just interleave +template <class D, HWY_IF_V_SIZE_GT_D(D, 4)> +HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) { + // Treat half-width input as a single lane and interleave them. + const Repartition<UnsignedFromSize<d.MaxBytes() / 2>, decltype(d)> du; + return BitCast(d, InterleaveUpper(du, BitCast(du, lo), BitCast(du, hi))); +} + +// <= 32-bit input/output +template <class D, HWY_IF_V_SIZE_LE_D(D, 4)> +HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) { + // Treat half-width input as two lanes and take every second one. + const Repartition<UnsignedFromSize<d.MaxBytes() / 2>, decltype(d)> du; +#if HWY_ARCH_ARM_A64 + return BitCast(d, detail::InterleaveOdd(BitCast(du, lo), BitCast(du, hi))); +#else + using VU = VFromD<decltype(du)>; + return BitCast( + d, VU(detail::InterleaveEvenOdd(BitCast(du, lo).raw, BitCast(du, hi).raw) + .val[1])); +#endif +} + +// ------------------------------ ConcatLowerUpper (ShiftLeftBytes) + +// 64 or 128-bit input: extract from concatenated +template <class D, HWY_IF_V_SIZE_GT_D(D, 4)> +HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) { + return CombineShiftRightBytes<d.MaxBytes() / 2>(d, hi, lo); +} + +// <= 32-bit input/output +template <class D, HWY_IF_V_SIZE_LE_D(D, 4)> +HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) { + constexpr size_t kSize = d.MaxBytes(); + const Repartition<uint8_t, decltype(d)> d8; + const Full64<uint8_t> d8x8; + const Full64<TFromD<D>> d64; + using V8x8 = VFromD<decltype(d8x8)>; + const V8x8 hi8x8(BitCast(d8, hi).raw); + // 
Move into most-significant bytes + const V8x8 lo8x8 = ShiftLeftBytes<8 - kSize>(V8x8(BitCast(d8, lo).raw)); + const V8x8 r = CombineShiftRightBytes<8 - kSize / 2>(d8x8, hi8x8, lo8x8); + // Back to original lane type, then shrink N. + return VFromD<D>(BitCast(d64, r).raw); +} + +// ------------------------------ ConcatUpperLower + +// Works for all N. +template <class D> +HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) { + return IfThenElse(FirstN(d, Lanes(d) / 2), lo, hi); +} + +// ------------------------------ ConcatOdd (InterleaveUpper) + +namespace detail { +// There is no vuzpq_u64. +HWY_NEON_DEF_FUNCTION_UIF81632(ConcatEven, vuzp1, _, 2) +HWY_NEON_DEF_FUNCTION_UIF81632(ConcatOdd, vuzp2, _, 2) +} // namespace detail + +// Full/half vector +template <class D, HWY_IF_V_SIZE_GT_D(D, 4)> +HWY_API VFromD<D> ConcatOdd(D /* tag */, VFromD<D> hi, VFromD<D> lo) { + return detail::ConcatOdd(lo, hi); +} + +// 8-bit x4 +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec32<T> ConcatOdd(D d, Vec32<T> hi, Vec32<T> lo) { + const Twice<decltype(d)> d2; + const Repartition<uint16_t, decltype(d2)> dw2; + const VFromD<decltype(d2)> hi2(hi.raw); + const VFromD<decltype(d2)> lo2(lo.raw); + const VFromD<decltype(dw2)> Hx1Lx1 = BitCast(dw2, ConcatOdd(d2, hi2, lo2)); + // Compact into two pairs of u8, skipping the invalid x lanes. Could also use + // vcopy_lane_u16, but that's A64-only. 
+ return Vec32<T>(BitCast(d2, ConcatEven(dw2, Hx1Lx1, Hx1Lx1)).raw); +} + +// Any type x2 +template <class D, HWY_IF_LANES_D(D, 2), typename T = TFromD<D>> +HWY_API Vec128<T, 2> ConcatOdd(D d, Vec128<T, 2> hi, Vec128<T, 2> lo) { + return InterleaveUpper(d, lo, hi); +} + +// ------------------------------ ConcatEven (InterleaveLower) + +// Full/half vector +template <class D, HWY_IF_V_SIZE_GT_D(D, 4)> +HWY_API VFromD<D> ConcatEven(D /* tag */, VFromD<D> hi, VFromD<D> lo) { + return detail::ConcatEven(lo, hi); +} + +// 8-bit x4 +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec32<T> ConcatEven(D d, Vec32<T> hi, Vec32<T> lo) { + const Twice<decltype(d)> d2; + const Repartition<uint16_t, decltype(d2)> dw2; + const VFromD<decltype(d2)> hi2(hi.raw); + const VFromD<decltype(d2)> lo2(lo.raw); + const VFromD<decltype(dw2)> Hx0Lx0 = BitCast(dw2, ConcatEven(d2, hi2, lo2)); + // Compact into two pairs of u8, skipping the invalid x lanes. Could also use + // vcopy_lane_u16, but that's A64-only. 
+ return Vec32<T>(BitCast(d2, ConcatEven(dw2, Hx0Lx0, Hx0Lx0)).raw); +} + +// Any type x2 +template <class D, HWY_IF_LANES_D(D, 2), typename T = TFromD<D>> +HWY_API Vec128<T, 2> ConcatEven(D d, Vec128<T, 2> hi, Vec128<T, 2> lo) { + return InterleaveLower(d, lo, hi); +} + +// ------------------------------ DupEven (InterleaveLower) + +template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) { +#if HWY_ARCH_ARM_A64 + return detail::InterleaveEven(v, v); +#else + return Vec128<T, N>(detail::InterleaveEvenOdd(v.raw, v.raw).val[0]); +#endif +} + +template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)> +HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) { + return InterleaveLower(DFromV<decltype(v)>(), v, v); +} + +// ------------------------------ DupOdd (InterleaveUpper) + +template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) { +#if HWY_ARCH_ARM_A64 + return detail::InterleaveOdd(v, v); +#else + return Vec128<T, N>(detail::InterleaveEvenOdd(v.raw, v.raw).val[1]); +#endif +} + +template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)> +HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) { + return InterleaveUpper(DFromV<decltype(v)>(), v, v); +} + +// ------------------------------ OddEven (IfThenElse) + +template <typename T, size_t N> +HWY_API Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) { + const DFromV<decltype(a)> d; + const Repartition<uint8_t, decltype(d)> d8; + alignas(16) static constexpr uint8_t kBytes[16] = { + ((0 / sizeof(T)) & 1) ? 0 : 0xFF, ((1 / sizeof(T)) & 1) ? 0 : 0xFF, + ((2 / sizeof(T)) & 1) ? 0 : 0xFF, ((3 / sizeof(T)) & 1) ? 0 : 0xFF, + ((4 / sizeof(T)) & 1) ? 0 : 0xFF, ((5 / sizeof(T)) & 1) ? 0 : 0xFF, + ((6 / sizeof(T)) & 1) ? 0 : 0xFF, ((7 / sizeof(T)) & 1) ? 0 : 0xFF, + ((8 / sizeof(T)) & 1) ? 0 : 0xFF, ((9 / sizeof(T)) & 1) ? 0 : 0xFF, + ((10 / sizeof(T)) & 1) ? 0 : 0xFF, ((11 / sizeof(T)) & 1) ? 0 : 0xFF, + ((12 / sizeof(T)) & 1) ? 
0 : 0xFF, ((13 / sizeof(T)) & 1) ? 0 : 0xFF, + ((14 / sizeof(T)) & 1) ? 0 : 0xFF, ((15 / sizeof(T)) & 1) ? 0 : 0xFF, + }; + const auto vec = BitCast(d, Load(d8, kBytes)); + return IfThenElse(MaskFromVec(vec), b, a); +} + +// ------------------------------ OddEvenBlocks +template <typename T, size_t N> +HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) { + return even; +} + +// ------------------------------ SwapAdjacentBlocks +template <typename T, size_t N> +HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) { + return v; +} + +// ------------------------------ ReverseBlocks +// Single block: no change +template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_API VFromD<D> ReverseBlocks(D /* tag */, VFromD<D> v) { + return v; +} + +// ------------------------------ ReorderDemote2To (OddEven) + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_BF16_D(D), + class V32 = VFromD<Repartition<float, D>>> +HWY_API VFromD<D> ReorderDemote2To(D dbf16, V32 a, V32 b) { + const RebindToUnsigned<decltype(dbf16)> du16; + return BitCast(dbf16, ConcatOdd(du16, BitCast(du16, b), BitCast(du16, a))); +} + +template <class D, HWY_IF_I32_D(D)> +HWY_API Vec128<int32_t> ReorderDemote2To(D d32, Vec128<int64_t> a, + Vec128<int64_t> b) { + const Vec64<int32_t> a32(vqmovn_s64(a.raw)); +#if HWY_ARCH_ARM_A64 + (void)d32; + return Vec128<int32_t>(vqmovn_high_s64(a32.raw, b.raw)); +#else + const Vec64<int32_t> b32(vqmovn_s64(b.raw)); + return Combine(d32, b32, a32); +#endif +} + +template <class D, HWY_IF_I32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)> +HWY_API VFromD<D> ReorderDemote2To(D d32, VFromD<Repartition<int64_t, D>> a, + VFromD<Repartition<int64_t, D>> b) { + const Rebind<int64_t, decltype(d32)> dt; + return DemoteTo(d32, Combine(dt, b, a)); +} + +template <class D, HWY_IF_U32_D(D)> +HWY_API Vec128<uint32_t> ReorderDemote2To(D d32, Vec128<int64_t> a, + Vec128<int64_t> b) { + const Vec64<uint32_t> a32(vqmovun_s64(a.raw)); +#if HWY_ARCH_ARM_A64 + (void)d32; + return 
Vec128<uint32_t>(vqmovun_high_s64(a32.raw, b.raw)); +#else + const Vec64<uint32_t> b32(vqmovun_s64(b.raw)); + return Combine(d32, b32, a32); +#endif +} + +template <class D, HWY_IF_U32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)> +HWY_API VFromD<D> ReorderDemote2To(D d32, VFromD<Repartition<int64_t, D>> a, + VFromD<Repartition<int64_t, D>> b) { + const Rebind<int64_t, decltype(d32)> dt; + return DemoteTo(d32, Combine(dt, b, a)); +} + +template <class D, HWY_IF_U32_D(D)> +HWY_API Vec128<uint32_t> ReorderDemote2To(D d32, Vec128<uint64_t> a, + Vec128<uint64_t> b) { + const Vec64<uint32_t> a32(vqmovn_u64(a.raw)); +#if HWY_ARCH_ARM_A64 + (void)d32; + return Vec128<uint32_t>(vqmovn_high_u64(a32.raw, b.raw)); +#else + const Vec64<uint32_t> b32(vqmovn_u64(b.raw)); + return Combine(d32, b32, a32); +#endif +} + +template <class D, HWY_IF_U32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)> +HWY_API VFromD<D> ReorderDemote2To(D d32, VFromD<Repartition<uint64_t, D>> a, + VFromD<Repartition<uint64_t, D>> b) { + const Rebind<uint64_t, decltype(d32)> dt; + return DemoteTo(d32, Combine(dt, b, a)); +} + +template <class D, HWY_IF_I16_D(D)> +HWY_API Vec128<int16_t> ReorderDemote2To(D d16, Vec128<int32_t> a, + Vec128<int32_t> b) { + const Vec64<int16_t> a16(vqmovn_s32(a.raw)); +#if HWY_ARCH_ARM_A64 + (void)d16; + return Vec128<int16_t>(vqmovn_high_s32(a16.raw, b.raw)); +#else + const Vec64<int16_t> b16(vqmovn_s32(b.raw)); + return Combine(d16, b16, a16); +#endif +} + +template <class D, HWY_IF_I16_D(D)> +HWY_API Vec64<int16_t> ReorderDemote2To(D /*d16*/, Vec64<int32_t> a, + Vec64<int32_t> b) { + const Full128<int32_t> d32; + const Vec128<int32_t> ab = Combine(d32, b, a); + return Vec64<int16_t>(vqmovn_s32(ab.raw)); +} + +template <class D, HWY_IF_I16_D(D)> +HWY_API Vec32<int16_t> ReorderDemote2To(D /*d16*/, Vec32<int32_t> a, + Vec32<int32_t> b) { + const Full128<int32_t> d32; + const Vec64<int32_t> ab(vzip1_s32(a.raw, b.raw)); + return Vec32<int16_t>(vqmovn_s32(Combine(d32, ab, ab).raw)); +} + +template <class D, 
HWY_IF_U16_D(D)> +HWY_API Vec128<uint16_t> ReorderDemote2To(D d16, Vec128<int32_t> a, + Vec128<int32_t> b) { + const Vec64<uint16_t> a16(vqmovun_s32(a.raw)); +#if HWY_ARCH_ARM_A64 + (void)d16; + return Vec128<uint16_t>(vqmovun_high_s32(a16.raw, b.raw)); +#else + const Vec64<uint16_t> b16(vqmovun_s32(b.raw)); + return Combine(d16, b16, a16); +#endif +} + +template <class D, HWY_IF_U16_D(D)> +HWY_API Vec64<uint16_t> ReorderDemote2To(D /*d16*/, Vec64<int32_t> a, + Vec64<int32_t> b) { + const Full128<int32_t> d32; + const Vec128<int32_t> ab = Combine(d32, b, a); + return Vec64<uint16_t>(vqmovun_s32(ab.raw)); +} + +template <class D, HWY_IF_U16_D(D)> +HWY_API Vec32<uint16_t> ReorderDemote2To(D /*d16*/, Vec32<int32_t> a, + Vec32<int32_t> b) { + const Full128<int32_t> d32; + const Vec64<int32_t> ab(vzip1_s32(a.raw, b.raw)); + return Vec32<uint16_t>(vqmovun_s32(Combine(d32, ab, ab).raw)); +} + +template <class D, HWY_IF_U16_D(D)> +HWY_API Vec128<uint16_t> ReorderDemote2To(D d16, Vec128<uint32_t> a, + Vec128<uint32_t> b) { + const Vec64<uint16_t> a16(vqmovn_u32(a.raw)); +#if HWY_ARCH_ARM_A64 + (void)d16; + return Vec128<uint16_t>(vqmovn_high_u32(a16.raw, b.raw)); +#else + const Vec64<uint16_t> b16(vqmovn_u32(b.raw)); + return Combine(d16, b16, a16); +#endif +} + +template <class D, HWY_IF_U16_D(D)> +HWY_API Vec64<uint16_t> ReorderDemote2To(D /*d16*/, Vec64<uint32_t> a, + Vec64<uint32_t> b) { + const Full128<uint32_t> d32; + const Vec128<uint32_t> ab = Combine(d32, b, a); + return Vec64<uint16_t>(vqmovn_u32(ab.raw)); +} + +template <class D, HWY_IF_U16_D(D)> +HWY_API Vec32<uint16_t> ReorderDemote2To(D /*d16*/, Vec32<uint32_t> a, + Vec32<uint32_t> b) { + const Full128<uint32_t> d32; + const Vec64<uint32_t> ab(vzip1_u32(a.raw, b.raw)); + return Vec32<uint16_t>(vqmovn_u32(Combine(d32, ab, ab).raw)); +} + +template <class D, HWY_IF_I8_D(D)> +HWY_API Vec128<int8_t> ReorderDemote2To(D d8, Vec128<int16_t> a, + Vec128<int16_t> b) { + const Vec64<int8_t> a8(vqmovn_s16(a.raw)); +#if 
HWY_ARCH_ARM_A64 + (void)d8; + return Vec128<int8_t>(vqmovn_high_s16(a8.raw, b.raw)); +#else + const Vec64<int8_t> b8(vqmovn_s16(b.raw)); + return Combine(d8, b8, a8); +#endif +} + +template <class D, HWY_IF_I8_D(D), HWY_IF_V_SIZE_LE_D(D, 8)> +HWY_API VFromD<D> ReorderDemote2To(D d8, VFromD<Repartition<int16_t, D>> a, + VFromD<Repartition<int16_t, D>> b) { + const Rebind<int16_t, decltype(d8)> dt; + return DemoteTo(d8, Combine(dt, b, a)); +} + +template <class D, HWY_IF_U8_D(D)> +HWY_API Vec128<uint8_t> ReorderDemote2To(D d8, Vec128<int16_t> a, + Vec128<int16_t> b) { + const Vec64<uint8_t> a8(vqmovun_s16(a.raw)); +#if HWY_ARCH_ARM_A64 + (void)d8; + return Vec128<uint8_t>(vqmovun_high_s16(a8.raw, b.raw)); +#else + const Vec64<uint8_t> b8(vqmovun_s16(b.raw)); + return Combine(d8, b8, a8); +#endif +} + +template <class D, HWY_IF_U8_D(D), HWY_IF_V_SIZE_LE_D(D, 8)> +HWY_API VFromD<D> ReorderDemote2To(D d8, VFromD<Repartition<int16_t, D>> a, + VFromD<Repartition<int16_t, D>> b) { + const Rebind<int16_t, decltype(d8)> dt; + return DemoteTo(d8, Combine(dt, b, a)); +} + +template <class D, HWY_IF_U8_D(D)> +HWY_API Vec128<uint8_t> ReorderDemote2To(D d8, Vec128<uint16_t> a, + Vec128<uint16_t> b) { + const Vec64<uint8_t> a8(vqmovn_u16(a.raw)); +#if HWY_ARCH_ARM_A64 + (void)d8; + return Vec128<uint8_t>(vqmovn_high_u16(a8.raw, b.raw)); +#else + const Vec64<uint8_t> b8(vqmovn_u16(b.raw)); + return Combine(d8, b8, a8); +#endif +} + +template <class D, HWY_IF_U8_D(D), HWY_IF_V_SIZE_LE_D(D, 8)> +HWY_API VFromD<D> ReorderDemote2To(D d8, VFromD<Repartition<uint16_t, D>> a, + VFromD<Repartition<uint16_t, D>> b) { + const Rebind<uint16_t, decltype(d8)> dt; + return DemoteTo(d8, Combine(dt, b, a)); +} + +template <class D, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), + HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>), + HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2)> +HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) { + return ReorderDemote2To(d, a, b); +} + +template <class D, 
HWY_IF_BF16_D(D), class V32 = VFromD<Repartition<float, D>>> +HWY_API VFromD<D> OrderedDemote2To(D dbf16, V32 a, V32 b) { + return ReorderDemote2To(dbf16, a, b); +} + +// ================================================== CRYPTO + +// (aarch64 or Arm7) and (__ARM_FEATURE_AES or HWY_HAVE_RUNTIME_DISPATCH). +// Otherwise, rely on generic_ops-inl.h to emulate AESRound / CLMul*. +#if HWY_TARGET == HWY_NEON + +#ifdef HWY_NATIVE_AES +#undef HWY_NATIVE_AES +#else +#define HWY_NATIVE_AES +#endif + +HWY_API Vec128<uint8_t> AESRound(Vec128<uint8_t> state, + Vec128<uint8_t> round_key) { + // NOTE: it is important that AESE and AESMC be consecutive instructions so + // they can be fused. AESE includes AddRoundKey, which is a different ordering + // than the AES-NI semantics we adopted, so XOR by 0 and later with the actual + // round key (the compiler will hopefully optimize this for multiple rounds). + return Vec128<uint8_t>(vaesmcq_u8(vaeseq_u8(state.raw, vdupq_n_u8(0)))) ^ + round_key; +} + +HWY_API Vec128<uint8_t> AESLastRound(Vec128<uint8_t> state, + Vec128<uint8_t> round_key) { + return Vec128<uint8_t>(vaeseq_u8(state.raw, vdupq_n_u8(0))) ^ round_key; +} + +HWY_API Vec128<uint8_t> AESInvMixColumns(Vec128<uint8_t> state) { + return Vec128<uint8_t>{vaesimcq_u8(state.raw)}; +} + +HWY_API Vec128<uint8_t> AESRoundInv(Vec128<uint8_t> state, + Vec128<uint8_t> round_key) { + // NOTE: it is important that AESD and AESIMC be consecutive instructions so + // they can be fused. AESD includes AddRoundKey, which is a different ordering + // than the AES-NI semantics we adopted, so XOR by 0 and later with the actual + // round key (the compiler will hopefully optimize this for multiple rounds). 
+ return Vec128<uint8_t>(vaesimcq_u8(vaesdq_u8(state.raw, vdupq_n_u8(0)))) ^ + round_key; +} + +HWY_API Vec128<uint8_t> AESLastRoundInv(Vec128<uint8_t> state, + Vec128<uint8_t> round_key) { + return Vec128<uint8_t>(vaesdq_u8(state.raw, vdupq_n_u8(0))) ^ round_key; +} + +HWY_API Vec128<uint64_t> CLMulLower(Vec128<uint64_t> a, Vec128<uint64_t> b) { + return Vec128<uint64_t>((uint64x2_t)vmull_p64(GetLane(a), GetLane(b))); +} + +HWY_API Vec128<uint64_t> CLMulUpper(Vec128<uint64_t> a, Vec128<uint64_t> b) { + return Vec128<uint64_t>( + (uint64x2_t)vmull_high_p64((poly64x2_t)a.raw, (poly64x2_t)b.raw)); +} + +#endif // HWY_TARGET == HWY_NEON + +// ================================================== MISC + +template <class D, HWY_IF_F32_D(D)> +HWY_API VFromD<D> PromoteTo(D df32, VFromD<Rebind<bfloat16_t, D>> v) { + const Rebind<uint16_t, decltype(df32)> du16; + const RebindToSigned<decltype(df32)> di32; + return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v)))); +} + +// ------------------------------ Truncations + +template <class DTo, typename TTo = TFromD<DTo>, typename TFrom, + HWY_IF_UNSIGNED(TFrom), HWY_IF_UNSIGNED(TTo), + hwy::EnableIf<(sizeof(TTo) < sizeof(TFrom))>* = nullptr> +HWY_API Vec128<TTo, 1> TruncateTo(DTo /* tag */, Vec128<TFrom, 1> v) { + const Repartition<TTo, DFromV<decltype(v)>> d; + return Vec128<TTo, 1>{BitCast(d, v).raw}; +} + +template <class D, HWY_IF_U8_D(D)> +HWY_API Vec16<uint8_t> TruncateTo(D /* tag */, Vec128<uint64_t> v) { + const Repartition<uint8_t, DFromV<decltype(v)>> d; + const auto v1 = BitCast(d, v); + const auto v2 = detail::ConcatEven(v1, v1); + const auto v3 = detail::ConcatEven(v2, v2); + const auto v4 = detail::ConcatEven(v3, v3); + return LowerHalf(LowerHalf(LowerHalf(v4))); +} + +template <class D, HWY_IF_U16_D(D)> +HWY_API Vec32<uint16_t> TruncateTo(D /* tag */, Vec128<uint64_t> v) { + const Repartition<uint16_t, DFromV<decltype(v)>> d; + const auto v1 = BitCast(d, v); + const auto v2 = detail::ConcatEven(v1, v1); 
+ const auto v3 = detail::ConcatEven(v2, v2); + return LowerHalf(LowerHalf(v3)); +} + +template <class D, HWY_IF_U32_D(D)> +HWY_API Vec64<uint32_t> TruncateTo(D /* tag */, Vec128<uint64_t> v) { + const Repartition<uint32_t, DFromV<decltype(v)>> d; + const auto v1 = BitCast(d, v); + const auto v2 = detail::ConcatEven(v1, v1); + return LowerHalf(v2); +} + +template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_GT_D(D, 1)> +HWY_API VFromD<D> TruncateTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) { + const Repartition<uint8_t, DFromV<decltype(v)>> d; + const auto v1 = BitCast(d, v); + const auto v2 = detail::ConcatEven(v1, v1); + const auto v3 = detail::ConcatEven(v2, v2); + return LowerHalf(LowerHalf(v3)); +} + +template <class D, HWY_IF_U16_D(D), HWY_IF_LANES_GT_D(D, 1)> +HWY_API VFromD<D> TruncateTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) { + const Repartition<uint16_t, DFromV<decltype(v)>> d; + const auto v1 = BitCast(d, v); + const auto v2 = detail::ConcatEven(v1, v1); + return LowerHalf(v2); +} + +template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_GT_D(D, 1)> +HWY_API VFromD<D> TruncateTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) { + const Repartition<uint8_t, DFromV<decltype(v)>> d; + const auto v1 = BitCast(d, v); + const auto v2 = detail::ConcatEven(v1, v1); + return LowerHalf(v2); +} + +// ------------------------------ MulEven (ConcatEven) + +// Multiplies even lanes (0, 2 ..) and places the double-wide result into +// even and the upper half into its odd neighbor lane. 
+HWY_API Vec128<int64_t> MulEven(Vec128<int32_t> a, Vec128<int32_t> b) { + const DFromV<decltype(a)> d; + int32x4_t a_packed = ConcatEven(d, a, a).raw; + int32x4_t b_packed = ConcatEven(d, b, b).raw; + return Vec128<int64_t>( + vmull_s32(vget_low_s32(a_packed), vget_low_s32(b_packed))); +} +HWY_API Vec128<uint64_t> MulEven(Vec128<uint32_t> a, Vec128<uint32_t> b) { + const DFromV<decltype(a)> d; + uint32x4_t a_packed = ConcatEven(d, a, a).raw; + uint32x4_t b_packed = ConcatEven(d, b, b).raw; + return Vec128<uint64_t>( + vmull_u32(vget_low_u32(a_packed), vget_low_u32(b_packed))); +} + +template <size_t N> +HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(Vec128<int32_t, N> a, + Vec128<int32_t, N> b) { + const DFromV<decltype(a)> d; + int32x2_t a_packed = ConcatEven(d, a, a).raw; + int32x2_t b_packed = ConcatEven(d, b, b).raw; + return Vec128<int64_t, (N + 1) / 2>( + vget_low_s64(vmull_s32(a_packed, b_packed))); +} +template <size_t N> +HWY_API Vec128<uint64_t, (N + 1) / 2> MulEven(Vec128<uint32_t, N> a, + Vec128<uint32_t, N> b) { + const DFromV<decltype(a)> d; + uint32x2_t a_packed = ConcatEven(d, a, a).raw; + uint32x2_t b_packed = ConcatEven(d, b, b).raw; + return Vec128<uint64_t, (N + 1) / 2>( + vget_low_u64(vmull_u32(a_packed, b_packed))); +} + +HWY_INLINE Vec128<uint64_t> MulEven(Vec128<uint64_t> a, Vec128<uint64_t> b) { + uint64_t hi; + uint64_t lo = Mul128(vgetq_lane_u64(a.raw, 0), vgetq_lane_u64(b.raw, 0), &hi); + return Vec128<uint64_t>(vsetq_lane_u64(hi, vdupq_n_u64(lo), 1)); +} + +HWY_INLINE Vec128<uint64_t> MulOdd(Vec128<uint64_t> a, Vec128<uint64_t> b) { + uint64_t hi; + uint64_t lo = Mul128(vgetq_lane_u64(a.raw, 1), vgetq_lane_u64(b.raw, 1), &hi); + return Vec128<uint64_t>(vsetq_lane_u64(hi, vdupq_n_u64(lo), 1)); +} + +// ------------------------------ TableLookupBytes (Combine, LowerHalf) + +// Both full +template <typename T, typename TI> +HWY_API Vec128<TI> TableLookupBytes(Vec128<T> bytes, Vec128<TI> from) { + const DFromV<decltype(from)> d; + const 
Repartition<uint8_t, decltype(d)> d8; +#if HWY_ARCH_ARM_A64 + return BitCast(d, Vec128<uint8_t>(vqtbl1q_u8(BitCast(d8, bytes).raw, + BitCast(d8, from).raw))); +#else + uint8x16_t table0 = BitCast(d8, bytes).raw; + uint8x8x2_t table; + table.val[0] = vget_low_u8(table0); + table.val[1] = vget_high_u8(table0); + uint8x16_t idx = BitCast(d8, from).raw; + uint8x8_t low = vtbl2_u8(table, vget_low_u8(idx)); + uint8x8_t hi = vtbl2_u8(table, vget_high_u8(idx)); + return BitCast(d, Vec128<uint8_t>(vcombine_u8(low, hi))); +#endif +} + +// Partial index vector +template <typename T, typename TI, size_t NI, HWY_IF_V_SIZE_LE(TI, NI, 8)> +HWY_API Vec128<TI, NI> TableLookupBytes(Vec128<T> bytes, Vec128<TI, NI> from) { + const Full128<TI> d_full; + const Vec64<TI> from64(from.raw); + const auto idx_full = Combine(d_full, from64, from64); + const auto out_full = TableLookupBytes(bytes, idx_full); + return Vec128<TI, NI>(LowerHalf(Half<decltype(d_full)>(), out_full).raw); +} + +// Partial table vector +template <typename T, size_t N, typename TI, HWY_IF_V_SIZE_LE(T, N, 8)> +HWY_API Vec128<TI> TableLookupBytes(Vec128<T, N> bytes, Vec128<TI> from) { + const Full128<T> d_full; + return TableLookupBytes(Combine(d_full, bytes, bytes), from); +} + +// Partial both +template <typename T, size_t N, typename TI, size_t NI, + HWY_IF_V_SIZE_LE(T, N, 8), HWY_IF_V_SIZE_LE(TI, NI, 8)> +HWY_API Vec128<TI, NI> TableLookupBytes(Vec128<T, N> bytes, + Vec128<TI, NI> from) { + const DFromV<decltype(bytes)> d; + const Simd<TI, NI, 0> d_idx; + const Repartition<uint8_t, decltype(d_idx)> d_idx8; + // uint8x8 + const auto bytes8 = BitCast(Repartition<uint8_t, decltype(d)>(), bytes); + const auto from8 = BitCast(d_idx8, from); + const VFromD<decltype(d_idx8)> v8(vtbl1_u8(bytes8.raw, from8.raw)); + return BitCast(d_idx, v8); +} + +// For all vector widths; Arm anyway zeroes if >= 0x10. 
+template <class V, class VI> +HWY_API VI TableLookupBytesOr0(V bytes, VI from) { + return TableLookupBytes(bytes, from); +} + +// ---------------------------- AESKeyGenAssist (AESLastRound, TableLookupBytes) + +#if HWY_TARGET == HWY_NEON +template <uint8_t kRcon> +HWY_API Vec128<uint8_t> AESKeyGenAssist(Vec128<uint8_t> v) { + alignas(16) static constexpr uint8_t kRconXorMask[16] = { + 0, kRcon, 0, 0, 0, 0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0, 0}; + alignas(16) static constexpr uint8_t kRotWordShuffle[16] = { + 0, 13, 10, 7, 1, 14, 11, 4, 8, 5, 2, 15, 9, 6, 3, 12}; + const DFromV<decltype(v)> d; + const Repartition<uint32_t, decltype(d)> du32; + const auto w13 = BitCast(d, DupOdd(BitCast(du32, v))); + const auto sub_word_result = AESLastRound(w13, Load(d, kRconXorMask)); + return TableLookupBytes(sub_word_result, Load(d, kRotWordShuffle)); +} +#endif // HWY_TARGET == HWY_NEON + +// ------------------------------ Scatter (Store) + +template <class D, typename T = TFromD<D>, class VI> +HWY_API void ScatterOffset(VFromD<D> v, D d, T* HWY_RESTRICT base, VI offset) { + using TI = TFromV<VI>; + static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match"); + + HWY_ALIGN T lanes[MaxLanes(d)]; + Store(v, d, lanes); + + HWY_ALIGN TI offset_lanes[MaxLanes(d)]; + Store(offset, Rebind<TI, decltype(d)>(), offset_lanes); + + uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base); + for (size_t i = 0; i < MaxLanes(d); ++i) { + CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]); + } +} + +template <class D, typename T = TFromD<D>, class VI> +HWY_API void ScatterIndex(VFromD<D> v, D d, T* HWY_RESTRICT base, VI index) { + using TI = TFromV<VI>; + static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match"); + + HWY_ALIGN T lanes[MaxLanes(d)]; + Store(v, d, lanes); + + HWY_ALIGN TI index_lanes[MaxLanes(d)]; + Store(index, Rebind<TI, decltype(d)>(), index_lanes); + + for (size_t i = 0; i < MaxLanes(d); ++i) { + base[index_lanes[i]] = lanes[i]; + } +} + +// 
------------------------------ Gather (Load/Store) + +template <class D, typename T = TFromD<D>, class VI> +HWY_API VFromD<D> GatherOffset(D d, const T* HWY_RESTRICT base, VI offset) { + using TI = TFromV<VI>; + static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match"); + + HWY_ALIGN TI offset_lanes[MaxLanes(d)]; + Store(offset, Rebind<TI, decltype(d)>(), offset_lanes); + + HWY_ALIGN T lanes[MaxLanes(d)]; + const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base); + for (size_t i = 0; i < MaxLanes(d); ++i) { + CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]); + } + return Load(d, lanes); +} + +template <class D, typename T = TFromD<D>, class VI> +HWY_API VFromD<D> GatherIndex(D d, const T* HWY_RESTRICT base, VI index) { + using TI = TFromV<VI>; + static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match"); + + HWY_ALIGN TI index_lanes[MaxLanes(d)]; + Store(index, Rebind<TI, decltype(d)>(), index_lanes); + + HWY_ALIGN T lanes[MaxLanes(d)]; + for (size_t i = 0; i < MaxLanes(d); ++i) { + lanes[i] = base[index_lanes[i]]; + } + return Load(d, lanes); +} + +// ------------------------------ Reductions + +namespace detail { + +// N=1 for any T: no-op +template <typename T> +HWY_INLINE T ReduceMin(hwy::SizeTag<sizeof(T)> /* tag */, Vec128<T, 1> v) { + return GetLane(v); +} +template <typename T> +HWY_INLINE T ReduceMax(hwy::SizeTag<sizeof(T)> /* tag */, Vec128<T, 1> v) { + return GetLane(v); +} +template <typename T> +HWY_INLINE T ReduceSum(hwy::SizeTag<sizeof(T)> /* tag */, Vec128<T, 1> v) { + return GetLane(v); +} +template <typename T> +HWY_INLINE Vec128<T, 1> SumOfLanes(hwy::SizeTag<sizeof(T)> /* tag */, + Vec128<T, 1> v) { + return v; +} +template <typename T> +HWY_INLINE Vec128<T, 1> MinOfLanes(hwy::SizeTag<sizeof(T)> /* tag */, + Vec128<T, 1> v) { + return v; +} +template <typename T> +HWY_INLINE Vec128<T, 1> MaxOfLanes(hwy::SizeTag<sizeof(T)> /* tag */, + Vec128<T, 1> v) { + return v; +} + +// full vectors +#if 
HWY_ARCH_ARM_A64 + +#define HWY_NEON_DEF_REDUCTION(type, size, name, prefix, infix, suffix) \ + HWY_API type##_t name(hwy::SizeTag<sizeof(type##_t)>, \ + Vec128<type##_t, size> v) { \ + return HWY_NEON_EVAL(prefix##infix##suffix, v.raw); \ + } + +// Note that u64/s64 don't have horizontal min/max for some reason. +#define HWY_NEON_DEF_REDUCTION_CORE_TYPES(name, prefix) \ + HWY_NEON_DEF_REDUCTION(uint8, 8, name, prefix, _, u8) \ + HWY_NEON_DEF_REDUCTION(uint8, 16, name, prefix##q, _, u8) \ + HWY_NEON_DEF_REDUCTION(uint16, 4, name, prefix, _, u16) \ + HWY_NEON_DEF_REDUCTION(uint16, 8, name, prefix##q, _, u16) \ + HWY_NEON_DEF_REDUCTION(uint32, 2, name, prefix, _, u32) \ + HWY_NEON_DEF_REDUCTION(uint32, 4, name, prefix##q, _, u32) \ + HWY_NEON_DEF_REDUCTION(int8, 8, name, prefix, _, s8) \ + HWY_NEON_DEF_REDUCTION(int8, 16, name, prefix##q, _, s8) \ + HWY_NEON_DEF_REDUCTION(int16, 4, name, prefix, _, s16) \ + HWY_NEON_DEF_REDUCTION(int16, 8, name, prefix##q, _, s16) \ + HWY_NEON_DEF_REDUCTION(int32, 2, name, prefix, _, s32) \ + HWY_NEON_DEF_REDUCTION(int32, 4, name, prefix##q, _, s32) \ + HWY_NEON_DEF_REDUCTION(float32, 2, name, prefix, _, f32) \ + HWY_NEON_DEF_REDUCTION(float32, 4, name, prefix##q, _, f32) \ + HWY_NEON_DEF_REDUCTION(float64, 2, name, prefix##q, _, f64) + +HWY_NEON_DEF_REDUCTION_CORE_TYPES(ReduceMin, vminv) +HWY_NEON_DEF_REDUCTION_CORE_TYPES(ReduceMax, vmaxv) + +// u64/s64 don't have horizontal min/max for some reason, but do have add. +#define HWY_NEON_DEF_REDUCTION_ALL_TYPES(name, prefix) \ + HWY_NEON_DEF_REDUCTION_CORE_TYPES(name, prefix) \ + HWY_NEON_DEF_REDUCTION(uint64, 2, name, prefix##q, _, u64) \ + HWY_NEON_DEF_REDUCTION(int64, 2, name, prefix##q, _, s64) + +HWY_NEON_DEF_REDUCTION_ALL_TYPES(ReduceSum, vaddv) + +#undef HWY_NEON_DEF_REDUCTION_ALL_TYPES +#undef HWY_NEON_DEF_REDUCTION_CORE_TYPES +#undef HWY_NEON_DEF_REDUCTION + +// Need some fallback implementations for [ui]64x2 and [ui]16x2. 
+#define HWY_IF_SUM_REDUCTION(T) HWY_IF_T_SIZE_ONE_OF(T, 1 << 2) +#define HWY_IF_MINMAX_REDUCTION(T) HWY_IF_T_SIZE_ONE_OF(T, (1 << 8) | (1 << 2)) + +// Implement Min/Max/SumOfLanes in terms of the corresponding reduction. +template <size_t N, typename V> +HWY_API V MinOfLanes(hwy::SizeTag<N> tag, V v) { + return Set(DFromV<decltype(v)>(), ReduceMin(tag, v)); +} +template <size_t N, typename V> +HWY_API V MaxOfLanes(hwy::SizeTag<N> tag, V v) { + return Set(DFromV<decltype(v)>(), ReduceMax(tag, v)); +} +template <size_t N, typename V> +HWY_API V SumOfLanes(hwy::SizeTag<N> tag, V v) { + return Set(DFromV<decltype(v)>(), ReduceSum(tag, v)); +} + +#else + +// For arm7, we implement reductions using a series of pairwise operations. This +// produces the full vector result, so we express Reduce* in terms of *OfLanes. +#define HWY_NEON_BUILD_TYPE_T(type, size) type##x##size##_t +#define HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION(type, size) Vec128<type##_t, size> +#define HWY_NEON_DEF_PAIRWISE_REDUCTION(type, size, name, prefix, suffix) \ + HWY_API HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION(type, size) name##OfLanes( \ + hwy::SizeTag<sizeof(type##_t)>, Vec128<type##_t, size> v) { \ + HWY_NEON_BUILD_TYPE_T(type, size) tmp = prefix##_##suffix(v.raw, v.raw); \ + if ((size / 2) > 1) tmp = prefix##_##suffix(tmp, tmp); \ + if ((size / 4) > 1) tmp = prefix##_##suffix(tmp, tmp); \ + return HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION(type, size)(tmp); \ + } \ + HWY_API type##_t Reduce##name(hwy::SizeTag<sizeof(type##_t)> tag, \ + Vec128<type##_t, size> v) { \ + return GetLane(name##OfLanes(tag, v)); \ + } + +// For the wide versions, the pairwise operations produce a half-length vector. +// We produce that value with a Reduce*Vector helper method, and express Reduce* +// and *OfLanes in terms of the helper. 
+#define HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(type, size, half, name, prefix, \ + suffix) \ + HWY_API HWY_NEON_BUILD_TYPE_T(type, half) \ + Reduce##name##Vector(Vec128<type##_t, size> v) { \ + HWY_NEON_BUILD_TYPE_T(type, half) tmp; \ + tmp = prefix##_##suffix(vget_high_##suffix(v.raw), \ + vget_low_##suffix(v.raw)); \ + if ((size / 2) > 1) tmp = prefix##_##suffix(tmp, tmp); \ + if ((size / 4) > 1) tmp = prefix##_##suffix(tmp, tmp); \ + if ((size / 8) > 1) tmp = prefix##_##suffix(tmp, tmp); \ + return tmp; \ + } \ + HWY_API type##_t Reduce##name(hwy::SizeTag<sizeof(type##_t)>, \ + Vec128<type##_t, size> v) { \ + const HWY_NEON_BUILD_TYPE_T(type, half) tmp = Reduce##name##Vector(v); \ + return HWY_NEON_EVAL(vget_lane_##suffix, tmp, 0); \ + } \ + HWY_API HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION(type, size) name##OfLanes( \ + hwy::SizeTag<sizeof(type##_t)>, Vec128<type##_t, size> v) { \ + const HWY_NEON_BUILD_TYPE_T(type, half) tmp = Reduce##name##Vector(v); \ + return HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION( \ + type, size)(vcombine_##suffix(tmp, tmp)); \ + } + +#define HWY_NEON_DEF_PAIRWISE_REDUCTIONS(name, prefix) \ + HWY_NEON_DEF_PAIRWISE_REDUCTION(uint32, 2, name, prefix, u32) \ + HWY_NEON_DEF_PAIRWISE_REDUCTION(uint16, 4, name, prefix, u16) \ + HWY_NEON_DEF_PAIRWISE_REDUCTION(uint8, 8, name, prefix, u8) \ + HWY_NEON_DEF_PAIRWISE_REDUCTION(int32, 2, name, prefix, s32) \ + HWY_NEON_DEF_PAIRWISE_REDUCTION(int16, 4, name, prefix, s16) \ + HWY_NEON_DEF_PAIRWISE_REDUCTION(int8, 8, name, prefix, s8) \ + HWY_NEON_DEF_PAIRWISE_REDUCTION(float32, 2, name, prefix, f32) \ + HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(uint32, 4, 2, name, prefix, u32) \ + HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(uint16, 8, 4, name, prefix, u16) \ + HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(uint8, 16, 8, name, prefix, u8) \ + HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(int32, 4, 2, name, prefix, s32) \ + HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(int16, 8, 4, name, prefix, s16) \ + HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(int8, 16, 
8, name, prefix, s8) \ + HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(float32, 4, 2, name, prefix, f32) + +HWY_NEON_DEF_PAIRWISE_REDUCTIONS(Sum, vpadd) +HWY_NEON_DEF_PAIRWISE_REDUCTIONS(Min, vpmin) +HWY_NEON_DEF_PAIRWISE_REDUCTIONS(Max, vpmax) + +#undef HWY_NEON_DEF_PAIRWISE_REDUCTIONS +#undef HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION +#undef HWY_NEON_DEF_PAIRWISE_REDUCTION +#undef HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION +#undef HWY_NEON_BUILD_TYPE_T + +// Need fallback min/max implementations for [ui]64x2 and [ui]16x2. +#define HWY_IF_SUM_REDUCTION(T) HWY_IF_T_SIZE_ONE_OF(T, 1 << 2 | 1 << 8) +#define HWY_IF_MINMAX_REDUCTION(T) HWY_IF_T_SIZE_ONE_OF(T, 1 << 2 | 1 << 8) + +#endif + +} // namespace detail + +// [ui]16/[ui]64: N=2 -- special case for pairs of very small or large lanes +template <class D, typename T, HWY_IF_SUM_REDUCTION(T)> +HWY_API Vec128<T, 2> SumOfLanes(D /* tag */, Vec128<T, 2> v10) { + return v10 + Reverse2(Simd<T, 2, 0>(), v10); +} + +template <class D, typename T, HWY_IF_SUM_REDUCTION(T)> +HWY_API T ReduceSum(D d, Vec128<T, 2> v10) { + return GetLane(SumOfLanes(d, v10)); +} + +template <class D, typename T, HWY_IF_MINMAX_REDUCTION(T)> +HWY_API Vec128<T, 2> MinOfLanes(D /* tag */, Vec128<T, 2> v10) { + return Min(v10, Reverse2(Simd<T, 2, 0>(), v10)); +} +template <class D, typename T, HWY_IF_MINMAX_REDUCTION(T)> +HWY_API Vec128<T, 2> MaxOfLanes(D /* tag */, Vec128<T, 2> v10) { + return Max(v10, Reverse2(Simd<T, 2, 0>(), v10)); +} + +#undef HWY_IF_SUM_REDUCTION +#undef HWY_IF_MINMAX_REDUCTION + +template <class D> +HWY_API VFromD<D> SumOfLanes(D /* tag */, VFromD<D> v) { + return detail::SumOfLanes(hwy::SizeTag<sizeof(TFromD<D>)>(), v); +} +template <class D> +HWY_API TFromD<D> ReduceSum(D /* tag */, VFromD<D> v) { + return detail::ReduceSum(hwy::SizeTag<sizeof(TFromD<D>)>(), v); +} +template <class D> +HWY_API VFromD<D> MinOfLanes(D /* tag */, VFromD<D> v) { + return detail::MinOfLanes(hwy::SizeTag<sizeof(TFromD<D>)>(), v); +} +template <class D> +HWY_API 
VFromD<D> MaxOfLanes(D /* tag */, VFromD<D> v) { + return detail::MaxOfLanes(hwy::SizeTag<sizeof(TFromD<D>)>(), v); +} + +// ------------------------------ LoadMaskBits (TestBit) + +namespace detail { + +// Helper function to set 64 bits and potentially return a smaller vector. The +// overload is required to call the q vs non-q intrinsics. Note that 8-bit +// LoadMaskBits only requires 16 bits, but 64 avoids casting. +template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> +HWY_INLINE VFromD<D> Set64(D /* tag */, uint64_t mask_bits) { + const auto v64 = Vec64<uint64_t>(vdup_n_u64(mask_bits)); + return VFromD<D>(BitCast(Full64<TFromD<D>>(), v64).raw); +} +template <typename T> +HWY_INLINE Vec128<T> Set64(Full128<T> d, uint64_t mask_bits) { + return BitCast(d, Vec128<uint64_t>(vdupq_n_u64(mask_bits))); +} + +template <class D, HWY_IF_T_SIZE_D(D, 1)> +HWY_INLINE MFromD<D> LoadMaskBits(D d, uint64_t mask_bits) { + const RebindToUnsigned<decltype(d)> du; + // Easier than Set(), which would require an >8-bit type, which would not + // compile for T=uint8_t, N=1. + const auto vmask_bits = Set64(du, mask_bits); + + // Replicate bytes 8x such that each byte contains the bit that governs it. 
+ alignas(16) static constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1}; + const auto rep8 = TableLookupBytes(vmask_bits, Load(du, kRep8)); + + alignas(16) static constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128, + 1, 2, 4, 8, 16, 32, 64, 128}; + return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit))); +} + +template <class D, HWY_IF_T_SIZE_D(D, 2)> +HWY_INLINE MFromD<D> LoadMaskBits(D d, uint64_t mask_bits) { + const RebindToUnsigned<decltype(d)> du; + alignas(16) static constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128}; + const auto vmask_bits = Set(du, static_cast<uint16_t>(mask_bits)); + return RebindMask(d, TestBit(vmask_bits, Load(du, kBit))); +} + +template <class D, HWY_IF_T_SIZE_D(D, 4)> +HWY_INLINE MFromD<D> LoadMaskBits(D d, uint64_t mask_bits) { + const RebindToUnsigned<decltype(d)> du; + alignas(16) static constexpr uint32_t kBit[8] = {1, 2, 4, 8}; + const auto vmask_bits = Set(du, static_cast<uint32_t>(mask_bits)); + return RebindMask(d, TestBit(vmask_bits, Load(du, kBit))); +} + +template <class D, HWY_IF_T_SIZE_D(D, 8)> +HWY_INLINE MFromD<D> LoadMaskBits(D d, uint64_t mask_bits) { + const RebindToUnsigned<decltype(d)> du; + alignas(16) static constexpr uint64_t kBit[8] = {1, 2}; + return RebindMask(d, TestBit(Set(du, mask_bits), Load(du, kBit))); +} + +} // namespace detail + +// `p` points to at least 8 readable bytes, not all of which need be valid. +template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) { + uint64_t mask_bits = 0; + CopyBytes<(d.MaxLanes() + 7) / 8>(bits, &mask_bits); + return detail::LoadMaskBits(d, mask_bits); +} + +// ------------------------------ Mask + +namespace detail { + +// Returns mask[i]? 0xF : 0 in each nibble. This is more efficient than +// BitsFromMask for use in (partial) CountTrue, FindFirstTrue and AllFalse. 
+template <class D, HWY_IF_V_SIZE_D(D, 16)> +HWY_INLINE uint64_t NibblesFromMask(D d, MFromD<D> mask) { + const Full128<uint16_t> du16; + const Vec128<uint16_t> vu16 = BitCast(du16, VecFromMask(d, mask)); + const Vec64<uint8_t> nib(vshrn_n_u16(vu16.raw, 4)); + return GetLane(BitCast(Full64<uint64_t>(), nib)); +} + +template <class D, HWY_IF_V_SIZE_D(D, 8)> +HWY_INLINE uint64_t NibblesFromMask(D d, MFromD<D> mask) { + // There is no vshrn_n_u16 for uint16x4, so zero-extend. + const Twice<decltype(d)> d2; + const VFromD<decltype(d2)> v128 = ZeroExtendVector(d2, VecFromMask(d, mask)); + // No need to mask, upper half is zero thanks to ZeroExtendVector. + return NibblesFromMask(d2, MaskFromVec(v128)); +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 4)> +HWY_INLINE uint64_t NibblesFromMask(D d, MFromD<D> mask) { + const Mask64<TFromD<D>> mask64(mask.raw); + const uint64_t nib = NibblesFromMask(Full64<TFromD<D>>(), mask64); + // Clear nibbles from upper half of 64-bits + return nib & ((1ull << (d.MaxBytes() * 4)) - 1); +} + +template <typename T> +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, Mask128<T> mask) { + alignas(16) static constexpr uint8_t kSliceLanes[16] = { + 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80, 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80, + }; + const Full128<uint8_t> du; + const Vec128<uint8_t> values = + BitCast(du, VecFromMask(Full128<T>(), mask)) & Load(du, kSliceLanes); + +#if HWY_ARCH_ARM_A64 + // Can't vaddv - we need two separate bytes (16 bits). + const uint8x8_t x2 = vget_low_u8(vpaddq_u8(values.raw, values.raw)); + const uint8x8_t x4 = vpadd_u8(x2, x2); + const uint8x8_t x8 = vpadd_u8(x4, x4); + return vget_lane_u64(vreinterpret_u64_u8(x8), 0) & 0xFFFF; +#else + // Don't have vpaddq, so keep doubling lane size. 
+ const uint16x8_t x2 = vpaddlq_u8(values.raw); + const uint32x4_t x4 = vpaddlq_u16(x2); + const uint64x2_t x8 = vpaddlq_u32(x4); + return (vgetq_lane_u64(x8, 1) << 8) | vgetq_lane_u64(x8, 0); +#endif +} + +template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)> +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, Mask128<T, N> mask) { + // Upper lanes of partial loads are undefined. OnlyActive will fix this if + // we load all kSliceLanes so the upper lanes do not pollute the valid bits. + alignas(8) static constexpr uint8_t kSliceLanes[8] = {1, 2, 4, 8, + 0x10, 0x20, 0x40, 0x80}; + const DFromM<decltype(mask)> d; + const RebindToUnsigned<decltype(d)> du; + const Vec128<uint8_t, N> slice(Load(Full64<uint8_t>(), kSliceLanes).raw); + const Vec128<uint8_t, N> values = BitCast(du, VecFromMask(d, mask)) & slice; + +#if HWY_ARCH_ARM_A64 + return vaddv_u8(values.raw); +#else + const uint16x4_t x2 = vpaddl_u8(values.raw); + const uint32x2_t x4 = vpaddl_u16(x2); + const uint64x1_t x8 = vpaddl_u32(x4); + return vget_lane_u64(x8, 0); +#endif +} + +template <typename T> +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, Mask128<T> mask) { + alignas(16) static constexpr uint16_t kSliceLanes[8] = { + 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80}; + const Full128<T> d; + const Full128<uint16_t> du; + const Vec128<uint16_t> values = + BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes); +#if HWY_ARCH_ARM_A64 + return vaddvq_u16(values.raw); +#else + const uint32x4_t x2 = vpaddlq_u16(values.raw); + const uint64x2_t x4 = vpaddlq_u32(x2); + return vgetq_lane_u64(x4, 0) + vgetq_lane_u64(x4, 1); +#endif +} + +template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)> +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, Mask128<T, N> mask) { + // Upper lanes of partial loads are undefined. OnlyActive will fix this if + // we load all kSliceLanes so the upper lanes do not pollute the valid bits. 
+ alignas(8) static constexpr uint16_t kSliceLanes[4] = {1, 2, 4, 8}; + const DFromM<decltype(mask)> d; + const RebindToUnsigned<decltype(d)> du; + const Vec128<uint16_t, N> slice(Load(Full64<uint16_t>(), kSliceLanes).raw); + const Vec128<uint16_t, N> values = BitCast(du, VecFromMask(d, mask)) & slice; +#if HWY_ARCH_ARM_A64 + return vaddv_u16(values.raw); +#else + const uint32x2_t x2 = vpaddl_u16(values.raw); + const uint64x1_t x4 = vpaddl_u32(x2); + return vget_lane_u64(x4, 0); +#endif +} + +template <typename T> +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, Mask128<T> mask) { + alignas(16) static constexpr uint32_t kSliceLanes[4] = {1, 2, 4, 8}; + const Full128<T> d; + const Full128<uint32_t> du; + const Vec128<uint32_t> values = + BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes); +#if HWY_ARCH_ARM_A64 + return vaddvq_u32(values.raw); +#else + const uint64x2_t x2 = vpaddlq_u32(values.raw); + return vgetq_lane_u64(x2, 0) + vgetq_lane_u64(x2, 1); +#endif +} + +template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)> +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, Mask128<T, N> mask) { + // Upper lanes of partial loads are undefined. OnlyActive will fix this if + // we load all kSliceLanes so the upper lanes do not pollute the valid bits. 
+ alignas(8) static constexpr uint32_t kSliceLanes[2] = {1, 2}; + const DFromM<decltype(mask)> d; + const RebindToUnsigned<decltype(d)> du; + const Vec128<uint32_t, N> slice(Load(Full64<uint32_t>(), kSliceLanes).raw); + const Vec128<uint32_t, N> values = BitCast(du, VecFromMask(d, mask)) & slice; +#if HWY_ARCH_ARM_A64 + return vaddv_u32(values.raw); +#else + const uint64x1_t x2 = vpaddl_u32(values.raw); + return vget_lane_u64(x2, 0); +#endif +} + +template <typename T> +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, Mask128<T> m) { + alignas(16) static constexpr uint64_t kSliceLanes[2] = {1, 2}; + const Full128<T> d; + const Full128<uint64_t> du; + const Vec128<uint64_t> values = + BitCast(du, VecFromMask(d, m)) & Load(du, kSliceLanes); +#if HWY_ARCH_ARM_A64 + return vaddvq_u64(values.raw); +#else + return vgetq_lane_u64(values.raw, 0) + vgetq_lane_u64(values.raw, 1); +#endif +} + +template <typename T> +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, Mask128<T, 1> m) { + const Full64<T> d; + const Full64<uint64_t> du; + const Vec64<uint64_t> values = BitCast(du, VecFromMask(d, m)) & Set(du, 1); + return vget_lane_u64(values.raw, 0); +} + +// Returns the lowest N for the BitsFromMask result. +template <typename T, size_t N> +constexpr uint64_t OnlyActive(uint64_t bits) { + return ((N * sizeof(T)) >= 8) ? bits : (bits & ((1ull << N) - 1)); +} + +template <typename T, size_t N> +HWY_INLINE uint64_t BitsFromMask(Mask128<T, N> mask) { + return OnlyActive<T, N>(BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask)); +} + +// Returns number of lanes whose mask is set. +// +// Masks are either FF..FF or 0. Unfortunately there is no reduce-sub op +// ("vsubv"). ANDing with 1 would work but requires a constant. Negating also +// changes each lane to 1 (if mask set) or 0. +// NOTE: PopCount also operates on vectors, so we still have to do horizontal +// sums separately. 
We specialize CountTrue for full vectors (negating instead +// of PopCount because it avoids an extra shift), and use PopCount of +// NibblesFromMask for partial vectors. + +template <typename T> +HWY_INLINE size_t CountTrue(hwy::SizeTag<1> /*tag*/, Mask128<T> mask) { + const Full128<int8_t> di; + const int8x16_t ones = + vnegq_s8(BitCast(di, VecFromMask(Full128<T>(), mask)).raw); + +#if HWY_ARCH_ARM_A64 + return static_cast<size_t>(vaddvq_s8(ones)); +#else + const int16x8_t x2 = vpaddlq_s8(ones); + const int32x4_t x4 = vpaddlq_s16(x2); + const int64x2_t x8 = vpaddlq_s32(x4); + return static_cast<size_t>(vgetq_lane_s64(x8, 0) + vgetq_lane_s64(x8, 1)); +#endif +} +template <typename T> +HWY_INLINE size_t CountTrue(hwy::SizeTag<2> /*tag*/, Mask128<T> mask) { + const Full128<int16_t> di; + const int16x8_t ones = + vnegq_s16(BitCast(di, VecFromMask(Full128<T>(), mask)).raw); + +#if HWY_ARCH_ARM_A64 + return static_cast<size_t>(vaddvq_s16(ones)); +#else + const int32x4_t x2 = vpaddlq_s16(ones); + const int64x2_t x4 = vpaddlq_s32(x2); + return static_cast<size_t>(vgetq_lane_s64(x4, 0) + vgetq_lane_s64(x4, 1)); +#endif +} + +template <typename T> +HWY_INLINE size_t CountTrue(hwy::SizeTag<4> /*tag*/, Mask128<T> mask) { + const Full128<int32_t> di; + const int32x4_t ones = + vnegq_s32(BitCast(di, VecFromMask(Full128<T>(), mask)).raw); + +#if HWY_ARCH_ARM_A64 + return static_cast<size_t>(vaddvq_s32(ones)); +#else + const int64x2_t x2 = vpaddlq_s32(ones); + return static_cast<size_t>(vgetq_lane_s64(x2, 0) + vgetq_lane_s64(x2, 1)); +#endif +} + +template <typename T> +HWY_INLINE size_t CountTrue(hwy::SizeTag<8> /*tag*/, Mask128<T> mask) { +#if HWY_ARCH_ARM_A64 + const Full128<int64_t> di; + const int64x2_t ones = + vnegq_s64(BitCast(di, VecFromMask(Full128<T>(), mask)).raw); + return static_cast<size_t>(vaddvq_s64(ones)); +#else + const Full128<uint64_t> du; + const auto mask_u = VecFromMask(du, RebindMask(du, mask)); + const uint64x2_t ones = vshrq_n_u64(mask_u.raw, 63); + 
return static_cast<size_t>(vgetq_lane_u64(ones, 0) + vgetq_lane_u64(ones, 1)); +#endif +} + +} // namespace detail + +// Full +template <class D, typename T = TFromD<D>> +HWY_API size_t CountTrue(D /* tag */, Mask128<T> mask) { + return detail::CountTrue(hwy::SizeTag<sizeof(T)>(), mask); +} + +// Partial +template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> +HWY_API size_t CountTrue(D d, MFromD<D> mask) { + constexpr int kDiv = 4 * sizeof(TFromD<D>); + return PopCount(detail::NibblesFromMask(d, mask)) / kDiv; +} + +template <class D> +HWY_API size_t FindKnownFirstTrue(D d, MFromD<D> mask) { + const uint64_t nib = detail::NibblesFromMask(d, mask); + constexpr size_t kDiv = 4 * sizeof(TFromD<D>); + return Num0BitsBelowLS1Bit_Nonzero64(nib) / kDiv; +} + +template <class D> +HWY_API intptr_t FindFirstTrue(D d, MFromD<D> mask) { + const uint64_t nib = detail::NibblesFromMask(d, mask); + if (nib == 0) return -1; + constexpr size_t kDiv = 4 * sizeof(TFromD<D>); + return static_cast<intptr_t>(Num0BitsBelowLS1Bit_Nonzero64(nib) / kDiv); +} + +template <class D> +HWY_API size_t FindKnownLastTrue(D d, MFromD<D> mask) { + const uint64_t nib = detail::NibblesFromMask(d, mask); + constexpr size_t kDiv = 4 * sizeof(TFromD<D>); + return (63 - Num0BitsAboveMS1Bit_Nonzero64(nib)) / kDiv; +} + +template <class D> +HWY_API intptr_t FindLastTrue(D d, MFromD<D> mask) { + const uint64_t nib = detail::NibblesFromMask(d, mask); + if (nib == 0) return -1; + constexpr size_t kDiv = 4 * sizeof(TFromD<D>); + return static_cast<intptr_t>((63 - Num0BitsAboveMS1Bit_Nonzero64(nib)) / + kDiv); +} + +// `p` points to at least 8 writable bytes. 
+template <class D> +HWY_API size_t StoreMaskBits(D d, MFromD<D> mask, uint8_t* bits) { + const uint64_t mask_bits = detail::BitsFromMask(mask); + const size_t kNumBytes = (d.MaxLanes() + 7) / 8; + CopyBytes<kNumBytes>(&mask_bits, bits); + return kNumBytes; +} + +template <class D> +HWY_API bool AllFalse(D d, MFromD<D> m) { + return detail::NibblesFromMask(d, m) == 0; +} + +// Full +template <class D, typename T = TFromD<D>> +HWY_API bool AllTrue(D d, Mask128<T> m) { + return detail::NibblesFromMask(d, m) == ~0ull; +} +// Partial +template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> +HWY_API bool AllTrue(D d, MFromD<D> m) { + return detail::NibblesFromMask(d, m) == (1ull << (d.MaxBytes() * 4)) - 1; +} + +// ------------------------------ Compress + +template <typename T> +struct CompressIsPartition { + enum { value = (sizeof(T) != 1) }; +}; + +namespace detail { + +// Load 8 bytes, replicate into upper half so ZipLower can use the lower half. +template <class D, HWY_IF_V_SIZE_D(D, 16)> +HWY_INLINE Vec128<uint8_t> Load8Bytes(D /*tag*/, const uint8_t* bytes) { + return Vec128<uint8_t>(vreinterpretq_u8_u64( + vld1q_dup_u64(reinterpret_cast<const uint64_t*>(bytes)))); +} + +// Load 8 bytes and return half-reg with N <= 8 bytes. +template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> +HWY_INLINE VFromD<D> Load8Bytes(D d, const uint8_t* bytes) { + return Load(d, bytes); +} + +template <typename T, size_t N> +HWY_INLINE Vec128<T, N> IdxFromBits(hwy::SizeTag<2> /*tag*/, + uint64_t mask_bits) { + HWY_DASSERT(mask_bits < 256); + const Simd<T, N, 0> d; + const Repartition<uint8_t, decltype(d)> d8; + const Simd<uint16_t, N, 0> du; + + // NEON does not provide an equivalent of AVX2 permutevar, so we need byte + // indices for VTBL (one vector's worth for each of 256 combinations of + // 8 mask bits). Loading them directly would require 4 KiB. We can instead + // store lane indices and convert to byte indices (2*lane + 0..1), with the + // doubling baked into the table. 
AVX2 Compress32 stores eight 4-bit lane + // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts. + // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles + // is likely more costly than the higher cache footprint from storing bytes. + alignas(16) static constexpr uint8_t table[256 * 8] = { + // PrintCompress16x8Tables + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // + 2, 0, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // + 4, 0, 2, 6, 8, 10, 12, 14, /**/ 0, 4, 2, 6, 8, 10, 12, 14, // + 2, 4, 0, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // + 6, 0, 2, 4, 8, 10, 12, 14, /**/ 0, 6, 2, 4, 8, 10, 12, 14, // + 2, 6, 0, 4, 8, 10, 12, 14, /**/ 0, 2, 6, 4, 8, 10, 12, 14, // + 4, 6, 0, 2, 8, 10, 12, 14, /**/ 0, 4, 6, 2, 8, 10, 12, 14, // + 2, 4, 6, 0, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // + 8, 0, 2, 4, 6, 10, 12, 14, /**/ 0, 8, 2, 4, 6, 10, 12, 14, // + 2, 8, 0, 4, 6, 10, 12, 14, /**/ 0, 2, 8, 4, 6, 10, 12, 14, // + 4, 8, 0, 2, 6, 10, 12, 14, /**/ 0, 4, 8, 2, 6, 10, 12, 14, // + 2, 4, 8, 0, 6, 10, 12, 14, /**/ 0, 2, 4, 8, 6, 10, 12, 14, // + 6, 8, 0, 2, 4, 10, 12, 14, /**/ 0, 6, 8, 2, 4, 10, 12, 14, // + 2, 6, 8, 0, 4, 10, 12, 14, /**/ 0, 2, 6, 8, 4, 10, 12, 14, // + 4, 6, 8, 0, 2, 10, 12, 14, /**/ 0, 4, 6, 8, 2, 10, 12, 14, // + 2, 4, 6, 8, 0, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // + 10, 0, 2, 4, 6, 8, 12, 14, /**/ 0, 10, 2, 4, 6, 8, 12, 14, // + 2, 10, 0, 4, 6, 8, 12, 14, /**/ 0, 2, 10, 4, 6, 8, 12, 14, // + 4, 10, 0, 2, 6, 8, 12, 14, /**/ 0, 4, 10, 2, 6, 8, 12, 14, // + 2, 4, 10, 0, 6, 8, 12, 14, /**/ 0, 2, 4, 10, 6, 8, 12, 14, // + 6, 10, 0, 2, 4, 8, 12, 14, /**/ 0, 6, 10, 2, 4, 8, 12, 14, // + 2, 6, 10, 0, 4, 8, 12, 14, /**/ 0, 2, 6, 10, 4, 8, 12, 14, // + 4, 6, 10, 0, 2, 8, 12, 14, /**/ 0, 4, 6, 10, 2, 8, 12, 14, // + 2, 4, 6, 10, 0, 8, 12, 14, /**/ 0, 2, 4, 6, 10, 8, 12, 14, // + 8, 10, 0, 2, 4, 6, 12, 14, /**/ 0, 8, 10, 2, 4, 6, 12, 14, // + 2, 8, 10, 0, 4, 6, 12, 14, /**/ 0, 
2, 8, 10, 4, 6, 12, 14, // + 4, 8, 10, 0, 2, 6, 12, 14, /**/ 0, 4, 8, 10, 2, 6, 12, 14, // + 2, 4, 8, 10, 0, 6, 12, 14, /**/ 0, 2, 4, 8, 10, 6, 12, 14, // + 6, 8, 10, 0, 2, 4, 12, 14, /**/ 0, 6, 8, 10, 2, 4, 12, 14, // + 2, 6, 8, 10, 0, 4, 12, 14, /**/ 0, 2, 6, 8, 10, 4, 12, 14, // + 4, 6, 8, 10, 0, 2, 12, 14, /**/ 0, 4, 6, 8, 10, 2, 12, 14, // + 2, 4, 6, 8, 10, 0, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // + 12, 0, 2, 4, 6, 8, 10, 14, /**/ 0, 12, 2, 4, 6, 8, 10, 14, // + 2, 12, 0, 4, 6, 8, 10, 14, /**/ 0, 2, 12, 4, 6, 8, 10, 14, // + 4, 12, 0, 2, 6, 8, 10, 14, /**/ 0, 4, 12, 2, 6, 8, 10, 14, // + 2, 4, 12, 0, 6, 8, 10, 14, /**/ 0, 2, 4, 12, 6, 8, 10, 14, // + 6, 12, 0, 2, 4, 8, 10, 14, /**/ 0, 6, 12, 2, 4, 8, 10, 14, // + 2, 6, 12, 0, 4, 8, 10, 14, /**/ 0, 2, 6, 12, 4, 8, 10, 14, // + 4, 6, 12, 0, 2, 8, 10, 14, /**/ 0, 4, 6, 12, 2, 8, 10, 14, // + 2, 4, 6, 12, 0, 8, 10, 14, /**/ 0, 2, 4, 6, 12, 8, 10, 14, // + 8, 12, 0, 2, 4, 6, 10, 14, /**/ 0, 8, 12, 2, 4, 6, 10, 14, // + 2, 8, 12, 0, 4, 6, 10, 14, /**/ 0, 2, 8, 12, 4, 6, 10, 14, // + 4, 8, 12, 0, 2, 6, 10, 14, /**/ 0, 4, 8, 12, 2, 6, 10, 14, // + 2, 4, 8, 12, 0, 6, 10, 14, /**/ 0, 2, 4, 8, 12, 6, 10, 14, // + 6, 8, 12, 0, 2, 4, 10, 14, /**/ 0, 6, 8, 12, 2, 4, 10, 14, // + 2, 6, 8, 12, 0, 4, 10, 14, /**/ 0, 2, 6, 8, 12, 4, 10, 14, // + 4, 6, 8, 12, 0, 2, 10, 14, /**/ 0, 4, 6, 8, 12, 2, 10, 14, // + 2, 4, 6, 8, 12, 0, 10, 14, /**/ 0, 2, 4, 6, 8, 12, 10, 14, // + 10, 12, 0, 2, 4, 6, 8, 14, /**/ 0, 10, 12, 2, 4, 6, 8, 14, // + 2, 10, 12, 0, 4, 6, 8, 14, /**/ 0, 2, 10, 12, 4, 6, 8, 14, // + 4, 10, 12, 0, 2, 6, 8, 14, /**/ 0, 4, 10, 12, 2, 6, 8, 14, // + 2, 4, 10, 12, 0, 6, 8, 14, /**/ 0, 2, 4, 10, 12, 6, 8, 14, // + 6, 10, 12, 0, 2, 4, 8, 14, /**/ 0, 6, 10, 12, 2, 4, 8, 14, // + 2, 6, 10, 12, 0, 4, 8, 14, /**/ 0, 2, 6, 10, 12, 4, 8, 14, // + 4, 6, 10, 12, 0, 2, 8, 14, /**/ 0, 4, 6, 10, 12, 2, 8, 14, // + 2, 4, 6, 10, 12, 0, 8, 14, /**/ 0, 2, 4, 6, 10, 12, 8, 14, // + 8, 10, 12, 0, 2, 4, 6, 14, /**/ 0, 8, 10, 12, 2, 
4, 6, 14, // + 2, 8, 10, 12, 0, 4, 6, 14, /**/ 0, 2, 8, 10, 12, 4, 6, 14, // + 4, 8, 10, 12, 0, 2, 6, 14, /**/ 0, 4, 8, 10, 12, 2, 6, 14, // + 2, 4, 8, 10, 12, 0, 6, 14, /**/ 0, 2, 4, 8, 10, 12, 6, 14, // + 6, 8, 10, 12, 0, 2, 4, 14, /**/ 0, 6, 8, 10, 12, 2, 4, 14, // + 2, 6, 8, 10, 12, 0, 4, 14, /**/ 0, 2, 6, 8, 10, 12, 4, 14, // + 4, 6, 8, 10, 12, 0, 2, 14, /**/ 0, 4, 6, 8, 10, 12, 2, 14, // + 2, 4, 6, 8, 10, 12, 0, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // + 14, 0, 2, 4, 6, 8, 10, 12, /**/ 0, 14, 2, 4, 6, 8, 10, 12, // + 2, 14, 0, 4, 6, 8, 10, 12, /**/ 0, 2, 14, 4, 6, 8, 10, 12, // + 4, 14, 0, 2, 6, 8, 10, 12, /**/ 0, 4, 14, 2, 6, 8, 10, 12, // + 2, 4, 14, 0, 6, 8, 10, 12, /**/ 0, 2, 4, 14, 6, 8, 10, 12, // + 6, 14, 0, 2, 4, 8, 10, 12, /**/ 0, 6, 14, 2, 4, 8, 10, 12, // + 2, 6, 14, 0, 4, 8, 10, 12, /**/ 0, 2, 6, 14, 4, 8, 10, 12, // + 4, 6, 14, 0, 2, 8, 10, 12, /**/ 0, 4, 6, 14, 2, 8, 10, 12, // + 2, 4, 6, 14, 0, 8, 10, 12, /**/ 0, 2, 4, 6, 14, 8, 10, 12, // + 8, 14, 0, 2, 4, 6, 10, 12, /**/ 0, 8, 14, 2, 4, 6, 10, 12, // + 2, 8, 14, 0, 4, 6, 10, 12, /**/ 0, 2, 8, 14, 4, 6, 10, 12, // + 4, 8, 14, 0, 2, 6, 10, 12, /**/ 0, 4, 8, 14, 2, 6, 10, 12, // + 2, 4, 8, 14, 0, 6, 10, 12, /**/ 0, 2, 4, 8, 14, 6, 10, 12, // + 6, 8, 14, 0, 2, 4, 10, 12, /**/ 0, 6, 8, 14, 2, 4, 10, 12, // + 2, 6, 8, 14, 0, 4, 10, 12, /**/ 0, 2, 6, 8, 14, 4, 10, 12, // + 4, 6, 8, 14, 0, 2, 10, 12, /**/ 0, 4, 6, 8, 14, 2, 10, 12, // + 2, 4, 6, 8, 14, 0, 10, 12, /**/ 0, 2, 4, 6, 8, 14, 10, 12, // + 10, 14, 0, 2, 4, 6, 8, 12, /**/ 0, 10, 14, 2, 4, 6, 8, 12, // + 2, 10, 14, 0, 4, 6, 8, 12, /**/ 0, 2, 10, 14, 4, 6, 8, 12, // + 4, 10, 14, 0, 2, 6, 8, 12, /**/ 0, 4, 10, 14, 2, 6, 8, 12, // + 2, 4, 10, 14, 0, 6, 8, 12, /**/ 0, 2, 4, 10, 14, 6, 8, 12, // + 6, 10, 14, 0, 2, 4, 8, 12, /**/ 0, 6, 10, 14, 2, 4, 8, 12, // + 2, 6, 10, 14, 0, 4, 8, 12, /**/ 0, 2, 6, 10, 14, 4, 8, 12, // + 4, 6, 10, 14, 0, 2, 8, 12, /**/ 0, 4, 6, 10, 14, 2, 8, 12, // + 2, 4, 6, 10, 14, 0, 8, 12, /**/ 0, 2, 4, 6, 10, 14, 8, 12, // + 
8, 10, 14, 0, 2, 4, 6, 12, /**/ 0, 8, 10, 14, 2, 4, 6, 12, // + 2, 8, 10, 14, 0, 4, 6, 12, /**/ 0, 2, 8, 10, 14, 4, 6, 12, // + 4, 8, 10, 14, 0, 2, 6, 12, /**/ 0, 4, 8, 10, 14, 2, 6, 12, // + 2, 4, 8, 10, 14, 0, 6, 12, /**/ 0, 2, 4, 8, 10, 14, 6, 12, // + 6, 8, 10, 14, 0, 2, 4, 12, /**/ 0, 6, 8, 10, 14, 2, 4, 12, // + 2, 6, 8, 10, 14, 0, 4, 12, /**/ 0, 2, 6, 8, 10, 14, 4, 12, // + 4, 6, 8, 10, 14, 0, 2, 12, /**/ 0, 4, 6, 8, 10, 14, 2, 12, // + 2, 4, 6, 8, 10, 14, 0, 12, /**/ 0, 2, 4, 6, 8, 10, 14, 12, // + 12, 14, 0, 2, 4, 6, 8, 10, /**/ 0, 12, 14, 2, 4, 6, 8, 10, // + 2, 12, 14, 0, 4, 6, 8, 10, /**/ 0, 2, 12, 14, 4, 6, 8, 10, // + 4, 12, 14, 0, 2, 6, 8, 10, /**/ 0, 4, 12, 14, 2, 6, 8, 10, // + 2, 4, 12, 14, 0, 6, 8, 10, /**/ 0, 2, 4, 12, 14, 6, 8, 10, // + 6, 12, 14, 0, 2, 4, 8, 10, /**/ 0, 6, 12, 14, 2, 4, 8, 10, // + 2, 6, 12, 14, 0, 4, 8, 10, /**/ 0, 2, 6, 12, 14, 4, 8, 10, // + 4, 6, 12, 14, 0, 2, 8, 10, /**/ 0, 4, 6, 12, 14, 2, 8, 10, // + 2, 4, 6, 12, 14, 0, 8, 10, /**/ 0, 2, 4, 6, 12, 14, 8, 10, // + 8, 12, 14, 0, 2, 4, 6, 10, /**/ 0, 8, 12, 14, 2, 4, 6, 10, // + 2, 8, 12, 14, 0, 4, 6, 10, /**/ 0, 2, 8, 12, 14, 4, 6, 10, // + 4, 8, 12, 14, 0, 2, 6, 10, /**/ 0, 4, 8, 12, 14, 2, 6, 10, // + 2, 4, 8, 12, 14, 0, 6, 10, /**/ 0, 2, 4, 8, 12, 14, 6, 10, // + 6, 8, 12, 14, 0, 2, 4, 10, /**/ 0, 6, 8, 12, 14, 2, 4, 10, // + 2, 6, 8, 12, 14, 0, 4, 10, /**/ 0, 2, 6, 8, 12, 14, 4, 10, // + 4, 6, 8, 12, 14, 0, 2, 10, /**/ 0, 4, 6, 8, 12, 14, 2, 10, // + 2, 4, 6, 8, 12, 14, 0, 10, /**/ 0, 2, 4, 6, 8, 12, 14, 10, // + 10, 12, 14, 0, 2, 4, 6, 8, /**/ 0, 10, 12, 14, 2, 4, 6, 8, // + 2, 10, 12, 14, 0, 4, 6, 8, /**/ 0, 2, 10, 12, 14, 4, 6, 8, // + 4, 10, 12, 14, 0, 2, 6, 8, /**/ 0, 4, 10, 12, 14, 2, 6, 8, // + 2, 4, 10, 12, 14, 0, 6, 8, /**/ 0, 2, 4, 10, 12, 14, 6, 8, // + 6, 10, 12, 14, 0, 2, 4, 8, /**/ 0, 6, 10, 12, 14, 2, 4, 8, // + 2, 6, 10, 12, 14, 0, 4, 8, /**/ 0, 2, 6, 10, 12, 14, 4, 8, // + 4, 6, 10, 12, 14, 0, 2, 8, /**/ 0, 4, 6, 10, 12, 14, 2, 8, // + 2, 4, 6, 10, 
12, 14, 0, 8, /**/ 0, 2, 4, 6, 10, 12, 14, 8, // + 8, 10, 12, 14, 0, 2, 4, 6, /**/ 0, 8, 10, 12, 14, 2, 4, 6, // + 2, 8, 10, 12, 14, 0, 4, 6, /**/ 0, 2, 8, 10, 12, 14, 4, 6, // + 4, 8, 10, 12, 14, 0, 2, 6, /**/ 0, 4, 8, 10, 12, 14, 2, 6, // + 2, 4, 8, 10, 12, 14, 0, 6, /**/ 0, 2, 4, 8, 10, 12, 14, 6, // + 6, 8, 10, 12, 14, 0, 2, 4, /**/ 0, 6, 8, 10, 12, 14, 2, 4, // + 2, 6, 8, 10, 12, 14, 0, 4, /**/ 0, 2, 6, 8, 10, 12, 14, 4, // + 4, 6, 8, 10, 12, 14, 0, 2, /**/ 0, 4, 6, 8, 10, 12, 14, 2, // + 2, 4, 6, 8, 10, 12, 14, 0, /**/ 0, 2, 4, 6, 8, 10, 12, 14}; + + const Vec128<uint8_t, 2 * N> byte_idx = Load8Bytes(d8, table + mask_bits * 8); + const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx); + return BitCast(d, pairs + Set(du, 0x0100)); +} + +template <typename T, size_t N> +HWY_INLINE Vec128<T, N> IdxFromNotBits(hwy::SizeTag<2> /*tag*/, + uint64_t mask_bits) { + HWY_DASSERT(mask_bits < 256); + const Simd<T, N, 0> d; + const Repartition<uint8_t, decltype(d)> d8; + const Simd<uint16_t, N, 0> du; + + // NEON does not provide an equivalent of AVX2 permutevar, so we need byte + // indices for VTBL (one vector's worth for each of 256 combinations of + // 8 mask bits). Loading them directly would require 4 KiB. We can instead + // store lane indices and convert to byte indices (2*lane + 0..1), with the + // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane + // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts. + // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles + // is likely more costly than the higher cache footprint from storing bytes. 
+ alignas(16) static constexpr uint8_t table[256 * 8] = { + // PrintCompressNot16x8Tables + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 14, 0, // + 0, 4, 6, 8, 10, 12, 14, 2, /**/ 4, 6, 8, 10, 12, 14, 0, 2, // + 0, 2, 6, 8, 10, 12, 14, 4, /**/ 2, 6, 8, 10, 12, 14, 0, 4, // + 0, 6, 8, 10, 12, 14, 2, 4, /**/ 6, 8, 10, 12, 14, 0, 2, 4, // + 0, 2, 4, 8, 10, 12, 14, 6, /**/ 2, 4, 8, 10, 12, 14, 0, 6, // + 0, 4, 8, 10, 12, 14, 2, 6, /**/ 4, 8, 10, 12, 14, 0, 2, 6, // + 0, 2, 8, 10, 12, 14, 4, 6, /**/ 2, 8, 10, 12, 14, 0, 4, 6, // + 0, 8, 10, 12, 14, 2, 4, 6, /**/ 8, 10, 12, 14, 0, 2, 4, 6, // + 0, 2, 4, 6, 10, 12, 14, 8, /**/ 2, 4, 6, 10, 12, 14, 0, 8, // + 0, 4, 6, 10, 12, 14, 2, 8, /**/ 4, 6, 10, 12, 14, 0, 2, 8, // + 0, 2, 6, 10, 12, 14, 4, 8, /**/ 2, 6, 10, 12, 14, 0, 4, 8, // + 0, 6, 10, 12, 14, 2, 4, 8, /**/ 6, 10, 12, 14, 0, 2, 4, 8, // + 0, 2, 4, 10, 12, 14, 6, 8, /**/ 2, 4, 10, 12, 14, 0, 6, 8, // + 0, 4, 10, 12, 14, 2, 6, 8, /**/ 4, 10, 12, 14, 0, 2, 6, 8, // + 0, 2, 10, 12, 14, 4, 6, 8, /**/ 2, 10, 12, 14, 0, 4, 6, 8, // + 0, 10, 12, 14, 2, 4, 6, 8, /**/ 10, 12, 14, 0, 2, 4, 6, 8, // + 0, 2, 4, 6, 8, 12, 14, 10, /**/ 2, 4, 6, 8, 12, 14, 0, 10, // + 0, 4, 6, 8, 12, 14, 2, 10, /**/ 4, 6, 8, 12, 14, 0, 2, 10, // + 0, 2, 6, 8, 12, 14, 4, 10, /**/ 2, 6, 8, 12, 14, 0, 4, 10, // + 0, 6, 8, 12, 14, 2, 4, 10, /**/ 6, 8, 12, 14, 0, 2, 4, 10, // + 0, 2, 4, 8, 12, 14, 6, 10, /**/ 2, 4, 8, 12, 14, 0, 6, 10, // + 0, 4, 8, 12, 14, 2, 6, 10, /**/ 4, 8, 12, 14, 0, 2, 6, 10, // + 0, 2, 8, 12, 14, 4, 6, 10, /**/ 2, 8, 12, 14, 0, 4, 6, 10, // + 0, 8, 12, 14, 2, 4, 6, 10, /**/ 8, 12, 14, 0, 2, 4, 6, 10, // + 0, 2, 4, 6, 12, 14, 8, 10, /**/ 2, 4, 6, 12, 14, 0, 8, 10, // + 0, 4, 6, 12, 14, 2, 8, 10, /**/ 4, 6, 12, 14, 0, 2, 8, 10, // + 0, 2, 6, 12, 14, 4, 8, 10, /**/ 2, 6, 12, 14, 0, 4, 8, 10, // + 0, 6, 12, 14, 2, 4, 8, 10, /**/ 6, 12, 14, 0, 2, 4, 8, 10, // + 0, 2, 4, 12, 14, 6, 8, 10, /**/ 2, 4, 12, 14, 0, 6, 8, 10, // + 0, 4, 12, 14, 2, 6, 8, 10, /**/ 4, 12, 14, 0, 2, 6, 
8, 10, // + 0, 2, 12, 14, 4, 6, 8, 10, /**/ 2, 12, 14, 0, 4, 6, 8, 10, // + 0, 12, 14, 2, 4, 6, 8, 10, /**/ 12, 14, 0, 2, 4, 6, 8, 10, // + 0, 2, 4, 6, 8, 10, 14, 12, /**/ 2, 4, 6, 8, 10, 14, 0, 12, // + 0, 4, 6, 8, 10, 14, 2, 12, /**/ 4, 6, 8, 10, 14, 0, 2, 12, // + 0, 2, 6, 8, 10, 14, 4, 12, /**/ 2, 6, 8, 10, 14, 0, 4, 12, // + 0, 6, 8, 10, 14, 2, 4, 12, /**/ 6, 8, 10, 14, 0, 2, 4, 12, // + 0, 2, 4, 8, 10, 14, 6, 12, /**/ 2, 4, 8, 10, 14, 0, 6, 12, // + 0, 4, 8, 10, 14, 2, 6, 12, /**/ 4, 8, 10, 14, 0, 2, 6, 12, // + 0, 2, 8, 10, 14, 4, 6, 12, /**/ 2, 8, 10, 14, 0, 4, 6, 12, // + 0, 8, 10, 14, 2, 4, 6, 12, /**/ 8, 10, 14, 0, 2, 4, 6, 12, // + 0, 2, 4, 6, 10, 14, 8, 12, /**/ 2, 4, 6, 10, 14, 0, 8, 12, // + 0, 4, 6, 10, 14, 2, 8, 12, /**/ 4, 6, 10, 14, 0, 2, 8, 12, // + 0, 2, 6, 10, 14, 4, 8, 12, /**/ 2, 6, 10, 14, 0, 4, 8, 12, // + 0, 6, 10, 14, 2, 4, 8, 12, /**/ 6, 10, 14, 0, 2, 4, 8, 12, // + 0, 2, 4, 10, 14, 6, 8, 12, /**/ 2, 4, 10, 14, 0, 6, 8, 12, // + 0, 4, 10, 14, 2, 6, 8, 12, /**/ 4, 10, 14, 0, 2, 6, 8, 12, // + 0, 2, 10, 14, 4, 6, 8, 12, /**/ 2, 10, 14, 0, 4, 6, 8, 12, // + 0, 10, 14, 2, 4, 6, 8, 12, /**/ 10, 14, 0, 2, 4, 6, 8, 12, // + 0, 2, 4, 6, 8, 14, 10, 12, /**/ 2, 4, 6, 8, 14, 0, 10, 12, // + 0, 4, 6, 8, 14, 2, 10, 12, /**/ 4, 6, 8, 14, 0, 2, 10, 12, // + 0, 2, 6, 8, 14, 4, 10, 12, /**/ 2, 6, 8, 14, 0, 4, 10, 12, // + 0, 6, 8, 14, 2, 4, 10, 12, /**/ 6, 8, 14, 0, 2, 4, 10, 12, // + 0, 2, 4, 8, 14, 6, 10, 12, /**/ 2, 4, 8, 14, 0, 6, 10, 12, // + 0, 4, 8, 14, 2, 6, 10, 12, /**/ 4, 8, 14, 0, 2, 6, 10, 12, // + 0, 2, 8, 14, 4, 6, 10, 12, /**/ 2, 8, 14, 0, 4, 6, 10, 12, // + 0, 8, 14, 2, 4, 6, 10, 12, /**/ 8, 14, 0, 2, 4, 6, 10, 12, // + 0, 2, 4, 6, 14, 8, 10, 12, /**/ 2, 4, 6, 14, 0, 8, 10, 12, // + 0, 4, 6, 14, 2, 8, 10, 12, /**/ 4, 6, 14, 0, 2, 8, 10, 12, // + 0, 2, 6, 14, 4, 8, 10, 12, /**/ 2, 6, 14, 0, 4, 8, 10, 12, // + 0, 6, 14, 2, 4, 8, 10, 12, /**/ 6, 14, 0, 2, 4, 8, 10, 12, // + 0, 2, 4, 14, 6, 8, 10, 12, /**/ 2, 4, 14, 0, 6, 8, 10, 12, // + 0, 
4, 14, 2, 6, 8, 10, 12, /**/ 4, 14, 0, 2, 6, 8, 10, 12, // + 0, 2, 14, 4, 6, 8, 10, 12, /**/ 2, 14, 0, 4, 6, 8, 10, 12, // + 0, 14, 2, 4, 6, 8, 10, 12, /**/ 14, 0, 2, 4, 6, 8, 10, 12, // + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 0, 14, // + 0, 4, 6, 8, 10, 12, 2, 14, /**/ 4, 6, 8, 10, 12, 0, 2, 14, // + 0, 2, 6, 8, 10, 12, 4, 14, /**/ 2, 6, 8, 10, 12, 0, 4, 14, // + 0, 6, 8, 10, 12, 2, 4, 14, /**/ 6, 8, 10, 12, 0, 2, 4, 14, // + 0, 2, 4, 8, 10, 12, 6, 14, /**/ 2, 4, 8, 10, 12, 0, 6, 14, // + 0, 4, 8, 10, 12, 2, 6, 14, /**/ 4, 8, 10, 12, 0, 2, 6, 14, // + 0, 2, 8, 10, 12, 4, 6, 14, /**/ 2, 8, 10, 12, 0, 4, 6, 14, // + 0, 8, 10, 12, 2, 4, 6, 14, /**/ 8, 10, 12, 0, 2, 4, 6, 14, // + 0, 2, 4, 6, 10, 12, 8, 14, /**/ 2, 4, 6, 10, 12, 0, 8, 14, // + 0, 4, 6, 10, 12, 2, 8, 14, /**/ 4, 6, 10, 12, 0, 2, 8, 14, // + 0, 2, 6, 10, 12, 4, 8, 14, /**/ 2, 6, 10, 12, 0, 4, 8, 14, // + 0, 6, 10, 12, 2, 4, 8, 14, /**/ 6, 10, 12, 0, 2, 4, 8, 14, // + 0, 2, 4, 10, 12, 6, 8, 14, /**/ 2, 4, 10, 12, 0, 6, 8, 14, // + 0, 4, 10, 12, 2, 6, 8, 14, /**/ 4, 10, 12, 0, 2, 6, 8, 14, // + 0, 2, 10, 12, 4, 6, 8, 14, /**/ 2, 10, 12, 0, 4, 6, 8, 14, // + 0, 10, 12, 2, 4, 6, 8, 14, /**/ 10, 12, 0, 2, 4, 6, 8, 14, // + 0, 2, 4, 6, 8, 12, 10, 14, /**/ 2, 4, 6, 8, 12, 0, 10, 14, // + 0, 4, 6, 8, 12, 2, 10, 14, /**/ 4, 6, 8, 12, 0, 2, 10, 14, // + 0, 2, 6, 8, 12, 4, 10, 14, /**/ 2, 6, 8, 12, 0, 4, 10, 14, // + 0, 6, 8, 12, 2, 4, 10, 14, /**/ 6, 8, 12, 0, 2, 4, 10, 14, // + 0, 2, 4, 8, 12, 6, 10, 14, /**/ 2, 4, 8, 12, 0, 6, 10, 14, // + 0, 4, 8, 12, 2, 6, 10, 14, /**/ 4, 8, 12, 0, 2, 6, 10, 14, // + 0, 2, 8, 12, 4, 6, 10, 14, /**/ 2, 8, 12, 0, 4, 6, 10, 14, // + 0, 8, 12, 2, 4, 6, 10, 14, /**/ 8, 12, 0, 2, 4, 6, 10, 14, // + 0, 2, 4, 6, 12, 8, 10, 14, /**/ 2, 4, 6, 12, 0, 8, 10, 14, // + 0, 4, 6, 12, 2, 8, 10, 14, /**/ 4, 6, 12, 0, 2, 8, 10, 14, // + 0, 2, 6, 12, 4, 8, 10, 14, /**/ 2, 6, 12, 0, 4, 8, 10, 14, // + 0, 6, 12, 2, 4, 8, 10, 14, /**/ 6, 12, 0, 2, 4, 8, 10, 14, // + 0, 2, 4, 12, 6, 8, 
10, 14, /**/ 2, 4, 12, 0, 6, 8, 10, 14, // + 0, 4, 12, 2, 6, 8, 10, 14, /**/ 4, 12, 0, 2, 6, 8, 10, 14, // + 0, 2, 12, 4, 6, 8, 10, 14, /**/ 2, 12, 0, 4, 6, 8, 10, 14, // + 0, 12, 2, 4, 6, 8, 10, 14, /**/ 12, 0, 2, 4, 6, 8, 10, 14, // + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 0, 12, 14, // + 0, 4, 6, 8, 10, 2, 12, 14, /**/ 4, 6, 8, 10, 0, 2, 12, 14, // + 0, 2, 6, 8, 10, 4, 12, 14, /**/ 2, 6, 8, 10, 0, 4, 12, 14, // + 0, 6, 8, 10, 2, 4, 12, 14, /**/ 6, 8, 10, 0, 2, 4, 12, 14, // + 0, 2, 4, 8, 10, 6, 12, 14, /**/ 2, 4, 8, 10, 0, 6, 12, 14, // + 0, 4, 8, 10, 2, 6, 12, 14, /**/ 4, 8, 10, 0, 2, 6, 12, 14, // + 0, 2, 8, 10, 4, 6, 12, 14, /**/ 2, 8, 10, 0, 4, 6, 12, 14, // + 0, 8, 10, 2, 4, 6, 12, 14, /**/ 8, 10, 0, 2, 4, 6, 12, 14, // + 0, 2, 4, 6, 10, 8, 12, 14, /**/ 2, 4, 6, 10, 0, 8, 12, 14, // + 0, 4, 6, 10, 2, 8, 12, 14, /**/ 4, 6, 10, 0, 2, 8, 12, 14, // + 0, 2, 6, 10, 4, 8, 12, 14, /**/ 2, 6, 10, 0, 4, 8, 12, 14, // + 0, 6, 10, 2, 4, 8, 12, 14, /**/ 6, 10, 0, 2, 4, 8, 12, 14, // + 0, 2, 4, 10, 6, 8, 12, 14, /**/ 2, 4, 10, 0, 6, 8, 12, 14, // + 0, 4, 10, 2, 6, 8, 12, 14, /**/ 4, 10, 0, 2, 6, 8, 12, 14, // + 0, 2, 10, 4, 6, 8, 12, 14, /**/ 2, 10, 0, 4, 6, 8, 12, 14, // + 0, 10, 2, 4, 6, 8, 12, 14, /**/ 10, 0, 2, 4, 6, 8, 12, 14, // + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 0, 10, 12, 14, // + 0, 4, 6, 8, 2, 10, 12, 14, /**/ 4, 6, 8, 0, 2, 10, 12, 14, // + 0, 2, 6, 8, 4, 10, 12, 14, /**/ 2, 6, 8, 0, 4, 10, 12, 14, // + 0, 6, 8, 2, 4, 10, 12, 14, /**/ 6, 8, 0, 2, 4, 10, 12, 14, // + 0, 2, 4, 8, 6, 10, 12, 14, /**/ 2, 4, 8, 0, 6, 10, 12, 14, // + 0, 4, 8, 2, 6, 10, 12, 14, /**/ 4, 8, 0, 2, 6, 10, 12, 14, // + 0, 2, 8, 4, 6, 10, 12, 14, /**/ 2, 8, 0, 4, 6, 10, 12, 14, // + 0, 8, 2, 4, 6, 10, 12, 14, /**/ 8, 0, 2, 4, 6, 10, 12, 14, // + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 0, 8, 10, 12, 14, // + 0, 4, 6, 2, 8, 10, 12, 14, /**/ 4, 6, 0, 2, 8, 10, 12, 14, // + 0, 2, 6, 4, 8, 10, 12, 14, /**/ 2, 6, 0, 4, 8, 10, 12, 14, // + 0, 6, 2, 4, 8, 10, 12, 14, /**/ 6, 
0, 2, 4, 8, 10, 12, 14, // + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 0, 6, 8, 10, 12, 14, // + 0, 4, 2, 6, 8, 10, 12, 14, /**/ 4, 0, 2, 6, 8, 10, 12, 14, // + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 0, 4, 6, 8, 10, 12, 14, // + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14}; + + const Vec128<uint8_t, 2 * N> byte_idx = Load8Bytes(d8, table + mask_bits * 8); + const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx); + return BitCast(d, pairs + Set(du, 0x0100)); +} + +template <typename T, size_t N> +HWY_INLINE Vec128<T, N> IdxFromBits(hwy::SizeTag<4> /*tag*/, + uint64_t mask_bits) { + HWY_DASSERT(mask_bits < 16); + + // There are only 4 lanes, so we can afford to load the index vector directly. + alignas(16) static constexpr uint8_t u8_indices[16 * 16] = { + // PrintCompress32x4Tables + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // + 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, // + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // + 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, // + 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, // + 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, // + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // + 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, // + 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, // + 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, // + 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, // + 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, // + 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, // + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, // + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + const Simd<T, N, 0> d; + const Repartition<uint8_t, decltype(d)> d8; + return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); +} + +template <typename T, size_t N> +HWY_INLINE Vec128<T, N> IdxFromNotBits(hwy::SizeTag<4> /*tag*/, + uint64_t 
mask_bits) { + HWY_DASSERT(mask_bits < 16); + + // There are only 4 lanes, so we can afford to load the index vector directly. + alignas(16) static constexpr uint8_t u8_indices[16 * 16] = { + // PrintCompressNot32x4Tables + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, + 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, + 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, + 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1, + 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, + 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3, + 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15}; + const Simd<T, N, 0> d; + const Repartition<uint8_t, decltype(d)> d8; + return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); +} + +#if HWY_HAVE_INTEGER64 || HWY_HAVE_FLOAT64 + +template <typename T, size_t N> +HWY_INLINE Vec128<T, N> IdxFromBits(hwy::SizeTag<8> /*tag*/, + uint64_t mask_bits) { + HWY_DASSERT(mask_bits < 4); + + // There are only 2 lanes, so we can afford to load the index vector directly. 
+ alignas(16) static constexpr uint8_t u8_indices[64] = { + // PrintCompress64x2Tables + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + + const Simd<T, N, 0> d; + const Repartition<uint8_t, decltype(d)> d8; + return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); +} + +template <typename T, size_t N> +HWY_INLINE Vec128<T, N> IdxFromNotBits(hwy::SizeTag<8> /*tag*/, + uint64_t mask_bits) { + HWY_DASSERT(mask_bits < 4); + + // There are only 2 lanes, so we can afford to load the index vector directly. + alignas(16) static constexpr uint8_t u8_indices[4 * 16] = { + // PrintCompressNot64x2Tables + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + + const Simd<T, N, 0> d; + const Repartition<uint8_t, decltype(d)> d8; + return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); +} + +#endif + +// Helper function called by both Compress and CompressStore - avoids a +// redundant BitsFromMask in the latter. 
+template <typename T, size_t N> +HWY_INLINE Vec128<T, N> Compress(Vec128<T, N> v, uint64_t mask_bits) { + const auto idx = + detail::IdxFromBits<T, N>(hwy::SizeTag<sizeof(T)>(), mask_bits); + using D = DFromV<decltype(v)>; + const RebindToSigned<D> di; + return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx))); +} + +template <typename T, size_t N> +HWY_INLINE Vec128<T, N> CompressNot(Vec128<T, N> v, uint64_t mask_bits) { + const auto idx = + detail::IdxFromNotBits<T, N>(hwy::SizeTag<sizeof(T)>(), mask_bits); + using D = DFromV<decltype(v)>; + const RebindToSigned<D> di; + return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx))); +} + +} // namespace detail + +// Single lane: no-op +template <typename T> +HWY_API Vec128<T, 1> Compress(Vec128<T, 1> v, Mask128<T, 1> /*m*/) { + return v; +} + +// Two lanes: conditional swap +template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)> +HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) { + // If mask[1] = 1 and mask[0] = 0, then swap both halves, else keep. + const DFromV<decltype(v)> d; + const Vec128<T, N> m = VecFromMask(d, mask); + const Vec128<T, N> maskL = DupEven(m); + const Vec128<T, N> maskH = DupOdd(m); + const Vec128<T, N> swap = AndNot(maskL, maskH); + return IfVecThenElse(swap, Shuffle01(v), v); +} + +// General case, 2 or 4 byte lanes +template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))> +HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) { + return detail::Compress(v, detail::BitsFromMask(mask)); +} + +// Single lane: no-op +template <typename T> +HWY_API Vec128<T, 1> CompressNot(Vec128<T, 1> v, Mask128<T, 1> /*m*/) { + return v; +} + +// Two lanes: conditional swap +template <typename T, HWY_IF_T_SIZE(T, 8)> +HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) { + // If mask[1] = 0 and mask[0] = 1, then swap both halves, else keep. 
+ const DFromV<decltype(v)> d; + const Vec128<T> m = VecFromMask(d, mask); + const Vec128<T> maskL = DupEven(m); + const Vec128<T> maskH = DupOdd(m); + const Vec128<T> swap = AndNot(maskH, maskL); + return IfVecThenElse(swap, Shuffle01(v), v); +} + +// General case, 2 or 4 byte lanes +template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))> +HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) { + // For partial vectors, we cannot pull the Not() into the table because + // BitsFromMask clears the upper bits. + if (N < 16 / sizeof(T)) { + return detail::Compress(v, detail::BitsFromMask(Not(mask))); + } + return detail::CompressNot(v, detail::BitsFromMask(mask)); +} + +// ------------------------------ CompressBlocksNot +HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v, + Mask128<uint64_t> /* m */) { + return v; +} + +// ------------------------------ CompressBits + +template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)> +HWY_INLINE Vec128<T, N> CompressBits(Vec128<T, N> v, + const uint8_t* HWY_RESTRICT bits) { + uint64_t mask_bits = 0; + constexpr size_t kNumBytes = (N + 7) / 8; + CopyBytes<kNumBytes>(bits, &mask_bits); + if (N < 8) { + mask_bits &= (1ull << N) - 1; + } + + return detail::Compress(v, mask_bits); +} + +// ------------------------------ CompressStore +template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)> +HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> mask, D d, + TFromD<D>* HWY_RESTRICT unaligned) { + const uint64_t mask_bits = detail::BitsFromMask(mask); + StoreU(detail::Compress(v, mask_bits), d, unaligned); + return PopCount(mask_bits); +} + +// ------------------------------ CompressBlendedStore +template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)> +HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d, + TFromD<D>* HWY_RESTRICT unaligned) { + const RebindToUnsigned<decltype(d)> du; // so we can support fp16/bf16 + const uint64_t mask_bits = detail::BitsFromMask(m); + const size_t count = 
PopCount(mask_bits); + const MFromD<D> store_mask = RebindMask(d, FirstN(du, count)); + const VFromD<decltype(du)> compressed = + detail::Compress(BitCast(du, v), mask_bits); + BlendedStore(BitCast(d, compressed), store_mask, d, unaligned); + return count; +} + +// ------------------------------ CompressBitsStore + +template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)> +HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits, + D d, TFromD<D>* HWY_RESTRICT unaligned) { + uint64_t mask_bits = 0; + constexpr size_t kNumBytes = (d.MaxLanes() + 7) / 8; + CopyBytes<kNumBytes>(bits, &mask_bits); + // The byte-granular copy above may bring in more bits than there are lanes; + // keep only the low MaxLanes() bits. + if (d.MaxLanes() < 8) { + mask_bits &= (1ull << d.MaxLanes()) - 1; + } + + StoreU(detail::Compress(v, mask_bits), d, unaligned); + return PopCount(mask_bits); +} + +// ------------------------------ LoadInterleaved2 + +// Per-target flag to prevent generic_ops-inl.h from defining LoadInterleaved2. +#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED +#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED +#else +#define HWY_NATIVE_LOAD_STORE_INTERLEAVED +#endif + +namespace detail { +#define HWY_NEON_BUILD_TPL_HWY_LOAD_INT +#define HWY_NEON_BUILD_ARG_HWY_LOAD_INT from + +#if HWY_ARCH_ARM_A64 +#define HWY_IF_LOAD_INT(D) HWY_IF_V_SIZE_GT_D(D, 4) +#define HWY_NEON_DEF_FUNCTION_LOAD_INT HWY_NEON_DEF_FUNCTION_ALL_TYPES +#else +// Exclude 64x2 and f64x1, which are only supported on aarch64 +#define HWY_IF_LOAD_INT(D) \ + HWY_IF_V_SIZE_GT_D(D, 4), \ + hwy::EnableIf<(HWY_MAX_LANES_D(D) == 1 || sizeof(TFromD<D>) < 8)>* = \ + nullptr +#define HWY_NEON_DEF_FUNCTION_LOAD_INT(name, prefix, infix, args) \ + HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \ + HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \ + HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args) \ + HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args) \ + HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args) +#endif // HWY_ARCH_ARM_A64 + +// Must return raw tuple because Tuple2 lacks 
a ctor, and we cannot use +// brace-initialization in HWY_NEON_DEF_FUNCTION because some functions return +// void. +#define HWY_NEON_BUILD_RET_HWY_LOAD_INT(type, size) \ + decltype(Tuple2<type##_t, size>().raw) +// Tuple tag arg allows overloading (cannot just overload on return type) +#define HWY_NEON_BUILD_PARAM_HWY_LOAD_INT(type, size) \ + const type##_t *from, Tuple2<type##_t, size> +HWY_NEON_DEF_FUNCTION_LOAD_INT(LoadInterleaved2, vld2, _, HWY_LOAD_INT) +#undef HWY_NEON_BUILD_RET_HWY_LOAD_INT +#undef HWY_NEON_BUILD_PARAM_HWY_LOAD_INT + +#define HWY_NEON_BUILD_RET_HWY_LOAD_INT(type, size) \ + decltype(Tuple3<type##_t, size>().raw) +#define HWY_NEON_BUILD_PARAM_HWY_LOAD_INT(type, size) \ + const type##_t *from, Tuple3<type##_t, size> +HWY_NEON_DEF_FUNCTION_LOAD_INT(LoadInterleaved3, vld3, _, HWY_LOAD_INT) +#undef HWY_NEON_BUILD_PARAM_HWY_LOAD_INT +#undef HWY_NEON_BUILD_RET_HWY_LOAD_INT + +#define HWY_NEON_BUILD_RET_HWY_LOAD_INT(type, size) \ + decltype(Tuple4<type##_t, size>().raw) +#define HWY_NEON_BUILD_PARAM_HWY_LOAD_INT(type, size) \ + const type##_t *from, Tuple4<type##_t, size> +HWY_NEON_DEF_FUNCTION_LOAD_INT(LoadInterleaved4, vld4, _, HWY_LOAD_INT) +#undef HWY_NEON_BUILD_PARAM_HWY_LOAD_INT +#undef HWY_NEON_BUILD_RET_HWY_LOAD_INT + +#undef HWY_NEON_DEF_FUNCTION_LOAD_INT +#undef HWY_NEON_BUILD_TPL_HWY_LOAD_INT +#undef HWY_NEON_BUILD_ARG_HWY_LOAD_INT +} // namespace detail + +template <class D, HWY_IF_LOAD_INT(D), typename T = TFromD<D>> +HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned, + VFromD<D>& v0, VFromD<D>& v1) { + auto raw = + detail::LoadInterleaved2(unaligned, detail::Tuple2<T, d.MaxLanes()>()); + v0 = VFromD<D>(raw.val[0]); + v1 = VFromD<D>(raw.val[1]); +} + +// <= 32 bits: avoid loading more than N bytes by copying to buffer +template <class D, HWY_IF_V_SIZE_LE_D(D, 4), typename T = TFromD<D>> +HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned, + VFromD<D>& v0, VFromD<D>& v1) { + // The smallest vector 
registers are 64-bits and we want space for two. + alignas(16) T buf[2 * 8 / sizeof(T)] = {}; + CopyBytes<d.MaxBytes() * 2>(unaligned, buf); + auto raw = detail::LoadInterleaved2(buf, detail::Tuple2<T, d.MaxLanes()>()); + v0 = VFromD<D>(raw.val[0]); + v1 = VFromD<D>(raw.val[1]); +} + +#if HWY_ARCH_ARM_V7 +// 64x2: split into two 64x1 +// `unaligned` is const-qualified like every other LoadInterleaved* overload; +// with a plain `T*`, a pointer-to-const argument made T deduce inconsistently +// (const from the pointer, non-const from the vectors) and no overload matched. +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)> +HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned, + Vec128<T>& v0, Vec128<T>& v1) { + const Half<decltype(d)> dh; + VFromD<decltype(dh)> v00, v10, v01, v11; + // Each half-vector load consumes two interleaved 64-bit elements. + LoadInterleaved2(dh, unaligned, v00, v10); + LoadInterleaved2(dh, unaligned + 2, v01, v11); + v0 = Combine(d, v01, v00); + v1 = Combine(d, v11, v10); +} +#endif // HWY_ARCH_ARM_V7 + +// ------------------------------ LoadInterleaved3 + +template <class D, HWY_IF_LOAD_INT(D), typename T = TFromD<D>> +HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned, + VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) { + auto raw = + detail::LoadInterleaved3(unaligned, detail::Tuple3<T, d.MaxLanes()>()); + v0 = VFromD<D>(raw.val[0]); + v1 = VFromD<D>(raw.val[1]); + v2 = VFromD<D>(raw.val[2]); +} + +// <= 32 bits: avoid loading more than N bytes by copying to buffer +template <class D, HWY_IF_V_SIZE_LE_D(D, 4), typename T = TFromD<D>> +HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned, + VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) { + // The smallest vector registers are 64-bits and we want space for three. 
+ alignas(16) T buf[3 * 8 / sizeof(T)] = {}; + CopyBytes<d.MaxBytes() * 3>(unaligned, buf); + auto raw = detail::LoadInterleaved3(buf, detail::Tuple3<T, d.MaxLanes()>()); + v0 = VFromD<D>(raw.val[0]); + v1 = VFromD<D>(raw.val[1]); + v2 = VFromD<D>(raw.val[2]); +} + +#if HWY_ARCH_ARM_V7 +// 64x2: split into two 64x1 +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)> +HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned, + Vec128<T>& v0, Vec128<T>& v1, Vec128<T>& v2) { + const Half<decltype(d)> dh; + VFromD<decltype(dh)> v00, v10, v20, v01, v11, v21; + // Each half-vector load consumes three interleaved 64-bit elements. + LoadInterleaved3(dh, unaligned, v00, v10, v20); + LoadInterleaved3(dh, unaligned + 3, v01, v11, v21); + v0 = Combine(d, v01, v00); + v1 = Combine(d, v11, v10); + v2 = Combine(d, v21, v20); +} +#endif // HWY_ARCH_ARM_V7 + +// ------------------------------ LoadInterleaved4 + +template <class D, HWY_IF_LOAD_INT(D), typename T = TFromD<D>> +HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned, + VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2, + VFromD<D>& v3) { + auto raw = + detail::LoadInterleaved4(unaligned, detail::Tuple4<T, d.MaxLanes()>()); + v0 = VFromD<D>(raw.val[0]); + v1 = VFromD<D>(raw.val[1]); + v2 = VFromD<D>(raw.val[2]); + v3 = VFromD<D>(raw.val[3]); +} + +// <= 32 bits: avoid loading more than N bytes by copying to buffer +template <class D, HWY_IF_V_SIZE_LE_D(D, 4), typename T = TFromD<D>> +HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned, + VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2, + VFromD<D>& v3) { + // The smallest vector registers are 64-bits and we want space for four. + alignas(16) T buf[4 * 8 / sizeof(T)] = {}; + CopyBytes<d.MaxBytes() * 4>(unaligned, buf); + auto raw = detail::LoadInterleaved4(buf, detail::Tuple4<T, d.MaxLanes()>()); + v0 = VFromD<D>(raw.val[0]); + v1 = VFromD<D>(raw.val[1]); + v2 = VFromD<D>(raw.val[2]); + v3 = VFromD<D>(raw.val[3]); +} + +#if HWY_ARCH_ARM_V7 +// 64x2: split into two 64x1 +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)> +HWY_API void LoadInterleaved4(D 
d, const T* HWY_RESTRICT unaligned, + Vec128<T>& v0, Vec128<T>& v1, Vec128<T>& v2, + Vec128<T>& v3) { + const Half<decltype(d)> dh; + VFromD<decltype(dh)> v00, v10, v20, v30, v01, v11, v21, v31; + LoadInterleaved4(dh, unaligned, v00, v10, v20, v30); + LoadInterleaved4(dh, unaligned + 4, v01, v11, v21, v31); + v0 = Combine(d, v01, v00); + v1 = Combine(d, v11, v10); + v2 = Combine(d, v21, v20); + v3 = Combine(d, v31, v30); +} +#endif // HWY_ARCH_ARM_V7 + +#undef HWY_IF_LOAD_INT + +// ------------------------------ StoreInterleaved2 + +namespace detail { +#define HWY_NEON_BUILD_TPL_HWY_STORE_INT +#define HWY_NEON_BUILD_RET_HWY_STORE_INT(type, size) void +#define HWY_NEON_BUILD_ARG_HWY_STORE_INT to, tup.raw + +#if HWY_ARCH_ARM_A64 +#define HWY_IF_STORE_INT(D) HWY_IF_V_SIZE_GT_D(D, 4) +#define HWY_NEON_DEF_FUNCTION_STORE_INT HWY_NEON_DEF_FUNCTION_ALL_TYPES +#else +// Exclude 64x2 and f64x1, which are only supported on aarch64 +#define HWY_IF_STORE_INT(D) \ + HWY_IF_V_SIZE_GT_D(D, 4), \ + hwy::EnableIf<(HWY_MAX_LANES_D(D) == 1 || sizeof(TFromD<D>) < 8)>* = \ + nullptr +#define HWY_NEON_DEF_FUNCTION_STORE_INT(name, prefix, infix, args) \ + HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \ + HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \ + HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args) \ + HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args) \ + HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args) +#endif // HWY_ARCH_ARM_A64 + +#define HWY_NEON_BUILD_PARAM_HWY_STORE_INT(type, size) \ + Tuple2<type##_t, size> tup, type##_t *to +HWY_NEON_DEF_FUNCTION_STORE_INT(StoreInterleaved2, vst2, _, HWY_STORE_INT) +#undef HWY_NEON_BUILD_PARAM_HWY_STORE_INT + +#define HWY_NEON_BUILD_PARAM_HWY_STORE_INT(type, size) \ + Tuple3<type##_t, size> tup, type##_t *to +HWY_NEON_DEF_FUNCTION_STORE_INT(StoreInterleaved3, vst3, _, HWY_STORE_INT) +#undef HWY_NEON_BUILD_PARAM_HWY_STORE_INT + +#define 
HWY_NEON_BUILD_PARAM_HWY_STORE_INT(type, size) \ + Tuple4<type##_t, size> tup, type##_t *to +HWY_NEON_DEF_FUNCTION_STORE_INT(StoreInterleaved4, vst4, _, HWY_STORE_INT) +#undef HWY_NEON_BUILD_PARAM_HWY_STORE_INT + +#undef HWY_NEON_DEF_FUNCTION_STORE_INT +#undef HWY_NEON_BUILD_TPL_HWY_STORE_INT +#undef HWY_NEON_BUILD_RET_HWY_STORE_INT +#undef HWY_NEON_BUILD_ARG_HWY_STORE_INT +} // namespace detail + +template <class D, HWY_IF_STORE_INT(D), typename T = TFromD<D>> +HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d, + T* HWY_RESTRICT unaligned) { + detail::Tuple2<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw}}}; + detail::StoreInterleaved2(tup, unaligned); +} + +// <= 32 bits: avoid writing more than N bytes by copying to buffer +template <class D, HWY_IF_V_SIZE_LE_D(D, 4), typename T = TFromD<D>> +HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d, + T* HWY_RESTRICT unaligned) { + alignas(16) T buf[2 * 8 / sizeof(T)]; + detail::Tuple2<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw}}}; + detail::StoreInterleaved2(tup, buf); + CopyBytes<d.MaxBytes() * 2>(buf, unaligned); +} + +#if HWY_ARCH_ARM_V7 +// 64x2: split into two 64x1 +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)> +HWY_API void StoreInterleaved2(Vec128<T> v0, Vec128<T> v1, D d, + T* HWY_RESTRICT unaligned) { + const Half<decltype(d)> dh; + StoreInterleaved2(LowerHalf(dh, v0), LowerHalf(dh, v1), dh, unaligned); + StoreInterleaved2(UpperHalf(dh, v0), UpperHalf(dh, v1), dh, unaligned + 2); +} +#endif // HWY_ARCH_ARM_V7 + +// ------------------------------ StoreInterleaved3 + +template <class D, HWY_IF_STORE_INT(D), typename T = TFromD<D>> +HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d, + T* HWY_RESTRICT unaligned) { + detail::Tuple3<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw, v2.raw}}}; + detail::StoreInterleaved3(tup, unaligned); +} + +// <= 32 bits: avoid writing more than N bytes by copying to buffer +template <class D, HWY_IF_V_SIZE_LE_D(D, 4), 
typename T = TFromD<D>> +HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d, + T* HWY_RESTRICT unaligned) { + alignas(16) T buf[3 * 8 / sizeof(T)]; + detail::Tuple3<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw, v2.raw}}}; + detail::StoreInterleaved3(tup, buf); + CopyBytes<d.MaxBytes() * 3>(buf, unaligned); +} + +#if HWY_ARCH_ARM_V7 +// 64x2: split into two 64x1 +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)> +HWY_API void StoreInterleaved3(Vec128<T> v0, Vec128<T> v1, Vec128<T> v2, D d, + T* HWY_RESTRICT unaligned) { + const Half<decltype(d)> dh; + StoreInterleaved3(LowerHalf(dh, v0), LowerHalf(dh, v1), LowerHalf(dh, v2), dh, + unaligned); + StoreInterleaved3(UpperHalf(dh, v0), UpperHalf(dh, v1), UpperHalf(dh, v2), dh, + unaligned + 3); +} +#endif // HWY_ARCH_ARM_V7 + +// ------------------------------ StoreInterleaved4 + +template <class D, HWY_IF_STORE_INT(D), typename T = TFromD<D>> +HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, + VFromD<D> v3, D d, T* HWY_RESTRICT unaligned) { + detail::Tuple4<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw, v2.raw, v3.raw}}}; + detail::StoreInterleaved4(tup, unaligned); +} + +// <= 32 bits: avoid writing more than N bytes by copying to buffer +template <class D, HWY_IF_V_SIZE_LE_D(D, 4), typename T = TFromD<D>> +HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, + VFromD<D> v3, D d, T* HWY_RESTRICT unaligned) { + alignas(16) T buf[4 * 8 / sizeof(T)]; + detail::Tuple4<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw, v2.raw, v3.raw}}}; + detail::StoreInterleaved4(tup, buf); + CopyBytes<d.MaxBytes() * 4>(buf, unaligned); +} + +#if HWY_ARCH_ARM_V7 +// 64x2: split into two 64x1 +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)> +HWY_API void StoreInterleaved4(Vec128<T> v0, Vec128<T> v1, Vec128<T> v2, + Vec128<T> v3, D d, T* HWY_RESTRICT unaligned) { + const Half<decltype(d)> dh; + StoreInterleaved4(LowerHalf(dh, v0), LowerHalf(dh, v1), 
LowerHalf(dh, v2), + LowerHalf(dh, v3), dh, unaligned); + StoreInterleaved4(UpperHalf(dh, v0), UpperHalf(dh, v1), UpperHalf(dh, v2), + UpperHalf(dh, v3), dh, unaligned + 4); +} +#endif // HWY_ARCH_ARM_V7 + +#undef HWY_IF_STORE_INT + +// ------------------------------ Lt128 + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_INLINE MFromD<D> Lt128(D d, VFromD<D> a, VFromD<D> b) { + static_assert(IsSame<TFromD<D>, uint64_t>(), "T must be u64"); + // Truth table of Eq and Lt for Hi and Lo u64. + // (removed lines with (=H && cH) or (=L && cL) - cannot both be true) + // =H =L cH cL | out = cH | (=H & cL) + // 0 0 0 0 | 0 + // 0 0 0 1 | 0 + // 0 0 1 0 | 1 + // 0 0 1 1 | 1 + // 0 1 0 0 | 0 + // 0 1 0 1 | 0 + // 0 1 1 0 | 1 + // 1 0 0 0 | 0 + // 1 0 0 1 | 1 + // 1 1 0 0 | 0 + const MFromD<D> eqHL = Eq(a, b); + const VFromD<D> ltHL = VecFromMask(d, Lt(a, b)); + // We need to bring cL to the upper lane/bit corresponding to cH. Comparing + // the result of InterleaveUpper/Lower requires 9 ops, whereas shifting the + // comparison result leftwards requires only 4. IfThenElse compiles to the + // same code as OrAnd(). 
+ const VFromD<D> ltLx = DupEven(ltHL); + const VFromD<D> outHx = IfThenElse(eqHL, ltLx, ltHL); + return MaskFromVec(DupOdd(outHx)); +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_INLINE MFromD<D> Lt128Upper(D d, VFromD<D> a, VFromD<D> b) { + const VFromD<D> ltHL = VecFromMask(d, Lt(a, b)); + return MaskFromVec(InterleaveUpper(d, ltHL, ltHL)); +} + +// ------------------------------ Eq128 + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_INLINE MFromD<D> Eq128(D d, VFromD<D> a, VFromD<D> b) { + static_assert(IsSame<TFromD<D>, uint64_t>(), "T must be u64"); + const VFromD<D> eqHL = VecFromMask(d, Eq(a, b)); + return MaskFromVec(And(Reverse2(d, eqHL), eqHL)); +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_INLINE MFromD<D> Eq128Upper(D d, VFromD<D> a, VFromD<D> b) { + const VFromD<D> eqHL = VecFromMask(d, Eq(a, b)); + return MaskFromVec(InterleaveUpper(d, eqHL, eqHL)); +} + +// ------------------------------ Ne128 + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_INLINE MFromD<D> Ne128(D d, VFromD<D> a, VFromD<D> b) { + static_assert(IsSame<TFromD<D>, uint64_t>(), "T must be u64"); + const VFromD<D> neHL = VecFromMask(d, Ne(a, b)); + return MaskFromVec(Or(Reverse2(d, neHL), neHL)); +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_INLINE MFromD<D> Ne128Upper(D d, VFromD<D> a, VFromD<D> b) { + const VFromD<D> neHL = VecFromMask(d, Ne(a, b)); + return MaskFromVec(InterleaveUpper(d, neHL, neHL)); +} + +// ------------------------------ Min128, Max128 (Lt128) + +// Without a native OddEven, it seems infeasible to go faster than Lt128. 
+template <class D> +HWY_INLINE VFromD<D> Min128(D d, VFromD<D> a, VFromD<D> b) { + return IfThenElse(Lt128(d, a, b), a, b); +} + +template <class D> +HWY_INLINE VFromD<D> Max128(D d, VFromD<D> a, VFromD<D> b) { + return IfThenElse(Lt128(d, b, a), a, b); +} + +template <class D> +HWY_INLINE VFromD<D> Min128Upper(D d, VFromD<D> a, VFromD<D> b) { + return IfThenElse(Lt128Upper(d, a, b), a, b); +} + +template <class D> +HWY_INLINE VFromD<D> Max128Upper(D d, VFromD<D> a, VFromD<D> b) { + return IfThenElse(Lt128Upper(d, b, a), a, b); +} + +// -------------------- LeadingZeroCount, TrailingZeroCount, HighestSetBitIndex + +#ifdef HWY_NATIVE_LEADING_ZERO_COUNT +#undef HWY_NATIVE_LEADING_ZERO_COUNT +#else +#define HWY_NATIVE_LEADING_ZERO_COUNT +#endif + +HWY_NEON_DEF_FUNCTION_INT_8_16_32(LeadingZeroCount, vclz, _, 1) +HWY_NEON_DEF_FUNCTION_UINT_8_16_32(LeadingZeroCount, vclz, _, 1) + +template <class V, HWY_IF_UI64_D(DFromV<V>)> +HWY_API V LeadingZeroCount(V v) { + const DFromV<decltype(v)> d; + const RebindToUnsigned<decltype(d)> du; + const Repartition<uint32_t, decltype(d)> du32; + + const auto v_k32 = BitCast(du32, Set(du, 32)); + const auto v_u32_lzcnt = LeadingZeroCount(BitCast(du32, v)) + v_k32; + const auto v_u32_lo_lzcnt = + And(v_u32_lzcnt, BitCast(du32, Set(du, 0xFFFFFFFFu))); + const auto v_u32_hi_lzcnt = + BitCast(du32, ShiftRight<32>(BitCast(du, v_u32_lzcnt))); + + return BitCast( + d, IfThenElse(v_u32_hi_lzcnt == v_k32, v_u32_lo_lzcnt, v_u32_hi_lzcnt)); +} + +template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)> +HWY_API V HighestSetBitIndex(V v) { + const DFromV<decltype(v)> d; + using T = TFromD<decltype(d)>; + return BitCast(d, Set(d, T{sizeof(T) * 8 - 1}) - LeadingZeroCount(v)); +} + +template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), HWY_IF_T_SIZE_V(V, 1)> +HWY_API V TrailingZeroCount(V v) { + return LeadingZeroCount(ReverseBits(v)); +} + +template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), + HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 2) | (1 << 4) | (1 << 
8))> +HWY_API V TrailingZeroCount(V v) { + const DFromV<decltype(v)> d; + const Repartition<uint8_t, decltype(d)> du8; + return LeadingZeroCount( + ReverseLaneBytes(BitCast(d, ReverseBits(BitCast(du8, v))))); +} + +namespace detail { // for code folding +#if HWY_ARCH_ARM_V7 +#undef vuzp1_s8 +#undef vuzp1_u8 +#undef vuzp1_s16 +#undef vuzp1_u16 +#undef vuzp1_s32 +#undef vuzp1_u32 +#undef vuzp1_f32 +#undef vuzp1q_s8 +#undef vuzp1q_u8 +#undef vuzp1q_s16 +#undef vuzp1q_u16 +#undef vuzp1q_s32 +#undef vuzp1q_u32 +#undef vuzp1q_f32 +#undef vuzp2_s8 +#undef vuzp2_u8 +#undef vuzp2_s16 +#undef vuzp2_u16 +#undef vuzp2_s32 +#undef vuzp2_u32 +#undef vuzp2_f32 +#undef vuzp2q_s8 +#undef vuzp2q_u8 +#undef vuzp2q_s16 +#undef vuzp2q_u16 +#undef vuzp2q_s32 +#undef vuzp2q_u32 +#undef vuzp2q_f32 +#undef vzip1_s8 +#undef vzip1_u8 +#undef vzip1_s16 +#undef vzip1_u16 +#undef vzip1_s32 +#undef vzip1_u32 +#undef vzip1_f32 +#undef vzip1q_s8 +#undef vzip1q_u8 +#undef vzip1q_s16 +#undef vzip1q_u16 +#undef vzip1q_s32 +#undef vzip1q_u32 +#undef vzip1q_f32 +#undef vzip2_s8 +#undef vzip2_u8 +#undef vzip2_s16 +#undef vzip2_u16 +#undef vzip2_s32 +#undef vzip2_u32 +#undef vzip2_f32 +#undef vzip2q_s8 +#undef vzip2q_u8 +#undef vzip2q_s16 +#undef vzip2q_u16 +#undef vzip2q_s32 +#undef vzip2q_u32 +#undef vzip2q_f32 +#endif + +#undef HWY_NEON_BUILD_ARG_1 +#undef HWY_NEON_BUILD_ARG_2 +#undef HWY_NEON_BUILD_ARG_3 +#undef HWY_NEON_BUILD_PARAM_1 +#undef HWY_NEON_BUILD_PARAM_2 +#undef HWY_NEON_BUILD_PARAM_3 +#undef HWY_NEON_BUILD_RET_1 +#undef HWY_NEON_BUILD_RET_2 +#undef HWY_NEON_BUILD_RET_3 +#undef HWY_NEON_BUILD_TPL_1 +#undef HWY_NEON_BUILD_TPL_2 +#undef HWY_NEON_BUILD_TPL_3 +#undef HWY_NEON_DEF_FUNCTION +#undef HWY_NEON_DEF_FUNCTION_ALL_FLOATS +#undef HWY_NEON_DEF_FUNCTION_ALL_TYPES +#undef HWY_NEON_DEF_FUNCTION_FLOAT_64 +#undef HWY_NEON_DEF_FUNCTION_FULL_UI +#undef HWY_NEON_DEF_FUNCTION_INT_16 +#undef HWY_NEON_DEF_FUNCTION_INT_32 +#undef HWY_NEON_DEF_FUNCTION_INT_8 +#undef HWY_NEON_DEF_FUNCTION_INT_8_16_32 
+#undef HWY_NEON_DEF_FUNCTION_INTS +#undef HWY_NEON_DEF_FUNCTION_INTS_UINTS +#undef HWY_NEON_DEF_FUNCTION_TPL +#undef HWY_NEON_DEF_FUNCTION_UIF81632 +#undef HWY_NEON_DEF_FUNCTION_UINT_16 +#undef HWY_NEON_DEF_FUNCTION_UINT_32 +#undef HWY_NEON_DEF_FUNCTION_UINT_8 +#undef HWY_NEON_DEF_FUNCTION_UINT_8_16_32 +#undef HWY_NEON_DEF_FUNCTION_UINTS +#undef HWY_NEON_EVAL +} // namespace detail + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); diff --git a/third_party/highway/hwy/ops/arm_sve-inl.h b/third_party/highway/hwy/ops/arm_sve-inl.h new file mode 100644 index 0000000000..6b69a3af30 --- /dev/null +++ b/third_party/highway/hwy/ops/arm_sve-inl.h @@ -0,0 +1,4596 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Arm SVE[2] vectors (length not known at compile time). +// External include guard in highway.h - see comment there. + +#include <arm_sve.h> + +#include "hwy/ops/shared-inl.h" + +// Arm C215 declares that SVE vector lengths will always be a power of two. +// We default to relying on this, which makes some operations more efficient. +// You can still opt into fixups by setting this to 0 (unsupported). 
// Defaults to 1 unless the includer has already defined it.
#ifndef HWY_SVE_IS_POW2
#define HWY_SVE_IS_POW2 1
#endif

// 1 when compiling for one of the SVE2 targets, otherwise 0.
#if HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128
#define HWY_SVE_HAVE_2 1
#else
#define HWY_SVE_HAVE_2 0
#endif

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

// Maps a vector type to its tag type. The primary template is empty; one
// specialization per vector type is generated by a macro.
template <class V>
struct DFromV_t {};  // specialized in macros
template <class V>
using DFromV = typename DFromV_t<RemoveConst<V>>::type;

// Lane type of vector type V.
template <class V>
using TFromV = TFromD<DFromV<V>>;

// ================================================== MACROS

// Generate specializations and function definitions using X macros. Although
// harder to read and debug, writing everything manually is too bulky.

namespace detail {  // for code folding

// Args: BASE, CHAR, BITS, HALF, NAME, OP
// BASE/BITS form the lane type name (e.g. uint + 8 -> uint8_t) and CHAR+BITS
// the intrinsic suffix (e.g. u8). NAME and OP are passed through unchanged.
// NOTE(review): HALF looks like the bit width of a narrower lane type (8 is
// reused for 8-bit lanes, 16 for f16) - confirm against the macros that use it.

// Unsigned:
#define HWY_SVE_FOREACH_U08(X_MACRO, NAME, OP) X_MACRO(uint, u, 8, 8, NAME, OP)
#define HWY_SVE_FOREACH_U16(X_MACRO, NAME, OP) X_MACRO(uint, u, 16, 8, NAME, OP)
#define HWY_SVE_FOREACH_U32(X_MACRO, NAME, OP) \
  X_MACRO(uint, u, 32, 16, NAME, OP)
#define HWY_SVE_FOREACH_U64(X_MACRO, NAME, OP) \
  X_MACRO(uint, u, 64, 32, NAME, OP)

// Signed:
#define HWY_SVE_FOREACH_I08(X_MACRO, NAME, OP) X_MACRO(int, s, 8, 8, NAME, OP)
#define HWY_SVE_FOREACH_I16(X_MACRO, NAME, OP) X_MACRO(int, s, 16, 8, NAME, OP)
#define HWY_SVE_FOREACH_I32(X_MACRO, NAME, OP) X_MACRO(int, s, 32, 16, NAME, OP)
#define HWY_SVE_FOREACH_I64(X_MACRO, NAME, OP) X_MACRO(int, s, 64, 32, NAME, OP)

// Float:
#define HWY_SVE_FOREACH_F16(X_MACRO, NAME, OP) \
  X_MACRO(float, f, 16, 16, NAME, OP)
#define HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP) \
  X_MACRO(float, f, 32, 16, NAME, OP)
#define HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP) \
  X_MACRO(float, f, 64, 32, NAME, OP)

// For all element sizes:
#define HWY_SVE_FOREACH_U(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U08(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_U16(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_U32(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_U64(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_I(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_I08(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_I16(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_I32(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_I64(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_F(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_F16(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP)

// Commonly used type categories for a given element size:
#define HWY_SVE_FOREACH_UI08(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U08(X_MACRO, NAME, OP)        \
  HWY_SVE_FOREACH_I08(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_UI16(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U16(X_MACRO, NAME, OP)        \
  HWY_SVE_FOREACH_I16(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_UI32(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U32(X_MACRO, NAME, OP)        \
  HWY_SVE_FOREACH_I32(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_UI64(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U64(X_MACRO, NAME, OP)        \
  HWY_SVE_FOREACH_I64(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_UIF3264(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_UI32(X_MACRO, NAME, OP)          \
  HWY_SVE_FOREACH_UI64(X_MACRO, NAME, OP)          \
  HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP)           \
  HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP)

// Commonly used type categories:
#define HWY_SVE_FOREACH_UI(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U(X_MACRO, NAME, OP)        \
  HWY_SVE_FOREACH_I(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_IF(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_I(X_MACRO, NAME, OP)        \
  HWY_SVE_FOREACH_F(X_MACRO, NAME, OP)

// All unsigned, signed and float types listed above (bfloat16 is not part of
// any FOREACH list).
#define HWY_SVE_FOREACH(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_I(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_F(X_MACRO, NAME, OP)

// Assemble types for use in x-macros
// HWY_SVE_T: scalar lane type, e.g. uint8_t.
// HWY_SVE_D: Simd<> tag for that lane type.
// HWY_SVE_V: vector type, e.g. svuint8_t.
// HWY_SVE_TUPLE: tuple of MUL vectors, e.g. svuint8x2_t.
#define HWY_SVE_T(BASE, BITS) BASE##BITS##_t
#define HWY_SVE_D(BASE, BITS, N, POW2) Simd<HWY_SVE_T(BASE, BITS), N, POW2>
#define HWY_SVE_V(BASE, BITS) sv##BASE##BITS##_t
#define HWY_SVE_TUPLE(BASE, BITS, MUL) sv##BASE##BITS##x##MUL##_t

}  // namespace detail

// Specializes DFromV_t for one vector type so that DFromV<V> yields the
// ScalableTag of its lane type. Instantiated for every type in
// HWY_SVE_FOREACH, then undefined.
#define HWY_SPECIALIZE(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <>                                            \
  struct DFromV_t<HWY_SVE_V(BASE, BITS)> {               \
    using type = ScalableTag<HWY_SVE_T(BASE, BITS)>;     \
  };

HWY_SVE_FOREACH(HWY_SPECIALIZE, _, _)
#undef HWY_SPECIALIZE

// Note: _x (don't-care value for inactive lanes) avoids additional MOVPRFX
// instructions, and we anyway only use it when the predicate is ptrue.

// Naming of the generators below: "ARGP..." variants pass HWY_SVE_PTRUE(BITS)
// as predicate to the _x form of the intrinsic; the others call the
// unpredicated intrinsic. V = vector argument, N = scalar argument.

// vector = f(vector), e.g. Not
#define HWY_SVE_RETV_ARGPV(BASE, CHAR, BITS, HALF, NAME, OP)    \
  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v);   \
  }
#define HWY_SVE_RETV_ARGV(BASE, CHAR, BITS, HALF, NAME, OP)     \
  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
    return sv##OP##_##CHAR##BITS(v);                            \
  }

// vector = f(vector, scalar), e.g. detail::AddN
#define HWY_SVE_RETV_ARGPVN(BASE, CHAR, BITS, HALF, NAME, OP)    \
  HWY_API HWY_SVE_V(BASE, BITS)                                  \
      NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) {   \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), a, b); \
  }
#define HWY_SVE_RETV_ARGVN(BASE, CHAR, BITS, HALF, NAME, OP)   \
  HWY_API HWY_SVE_V(BASE, BITS)                                \
      NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) { \
    return sv##OP##_##CHAR##BITS(a, b);                        \
  }

// vector = f(vector, vector), e.g. Add
#define HWY_SVE_RETV_ARGPVV(BASE, CHAR, BITS, HALF, NAME, OP)    \
  HWY_API HWY_SVE_V(BASE, BITS)                                  \
      NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) {   \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), a, b); \
  }
#define HWY_SVE_RETV_ARGVV(BASE, CHAR, BITS, HALF, NAME, OP)   \
  HWY_API HWY_SVE_V(BASE, BITS)                                \
      NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
    return sv##OP##_##CHAR##BITS(a, b);                        \
  }

// vector = f(vector, vector, vector), unpredicated
#define HWY_SVE_RETV_ARGVVV(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_V(BASE, BITS)                               \
      NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b,  \
           HWY_SVE_V(BASE, BITS) c) {                         \
    return sv##OP##_##CHAR##BITS(a, b, c);                    \
  }

// ------------------------------ Lanes

namespace detail {

// Returns actual lanes of a hardware vector without rounding to a power of two.
// One overload per lane size (1, 2, 4, 8 bytes), each using the matching
// svcnt*_pat(SV_ALL) intrinsic.
template <typename T, HWY_IF_T_SIZE(T, 1)>
HWY_INLINE size_t AllHardwareLanes() {
  return svcntb_pat(SV_ALL);
}
template <typename T, HWY_IF_T_SIZE(T, 2)>
HWY_INLINE size_t AllHardwareLanes() {
  return svcnth_pat(SV_ALL);
}
template <typename T, HWY_IF_T_SIZE(T, 4)>
HWY_INLINE size_t AllHardwareLanes() {
  return svcntw_pat(SV_ALL);
}
template <typename T, HWY_IF_T_SIZE(T, 8)>
HWY_INLINE size_t AllHardwareLanes() {
  return svcntd_pat(SV_ALL);
}

// All-true mask from a macro
// With HWY_SVE_IS_POW2 both macros are plain svptrue; otherwise
// HWY_SVE_ALL_PTRUE uses the SV_ALL pattern and HWY_SVE_PTRUE uses SV_POW2.

#if HWY_SVE_IS_POW2
#define HWY_SVE_ALL_PTRUE(BITS) svptrue_b##BITS()
#define HWY_SVE_PTRUE(BITS) svptrue_b##BITS()
#else
#define HWY_SVE_ALL_PTRUE(BITS) svptrue_pat_b##BITS(SV_ALL)
#define HWY_SVE_PTRUE(BITS) svptrue_pat_b##BITS(SV_POW2)
#endif  // HWY_SVE_IS_POW2

}  // namespace detail

#if HWY_HAVE_SCALABLE

// Returns actual number of lanes after capping by N and shifting. May return 0
// (e.g. for "1/8th" of a u32x4 - would be 1 for 1/8th of u32x8).
template <typename T, size_t N, int kPow2>
HWY_API size_t Lanes(Simd<T, N, kPow2> d) {
  const size_t actual = detail::AllHardwareLanes<T>();
  // Common case of full vectors: avoid any extra instructions.
  if (detail::IsFull(d)) return actual;
  return detail::ScaleByPower(HWY_MIN(actual, N), kPow2);
}

#endif  // HWY_HAVE_SCALABLE

// ================================================== MASK INIT

// One mask bit per byte; only the one belonging to the lowest byte is valid.

// ------------------------------ FirstN
// Generates FirstN(d, count): a whilelt predicate over [0, limit), where limit
// is count itself for full vectors and min(Lanes(d), count) otherwise. The
// limit is narrowed to uint32_t before being passed to the intrinsic.
#define HWY_SVE_FIRSTN(BASE, CHAR, BITS, HALF, NAME, OP)                       \
  template <size_t N, int kPow2>                                               \
  HWY_API svbool_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, size_t count) {     \
    const size_t limit = detail::IsFull(d) ? count : HWY_MIN(Lanes(d), count); \
    return sv##OP##_b##BITS##_u32(uint32_t{0}, static_cast<uint32_t>(limit));  \
  }
HWY_SVE_FOREACH(HWY_SVE_FIRSTN, FirstN, whilelt)
#undef HWY_SVE_FIRSTN

// Mask type for tag D: whatever FirstN returns for it.
template <class D>
using MFromD = decltype(FirstN(D(), 0));

namespace detail {

// Generates PTrue(d) and AllPTrue(d), thin wrappers over HWY_SVE_PTRUE and
// HWY_SVE_ALL_PTRUE for the lane size of d. The OP argument is unused.
#define HWY_SVE_WRAP_PTRUE(BASE, CHAR, BITS, HALF, NAME, OP)            \
  template <size_t N, int kPow2>                                        \
  HWY_API svbool_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */) {      \
    return HWY_SVE_PTRUE(BITS);                                         \
  }                                                                     \
  template <size_t N, int kPow2>                                        \
  HWY_API svbool_t All##NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */) { \
    return HWY_SVE_ALL_PTRUE(BITS);                                     \
  }

HWY_SVE_FOREACH(HWY_SVE_WRAP_PTRUE, PTrue, ptrue)  // return all-true
#undef HWY_SVE_WRAP_PTRUE

// All-false predicate.
HWY_API svbool_t PFalse() { return svpfalse_b(); }

// Returns all-true if d is HWY_FULL or FirstN(N) after capping N.
//
// This is used in functions that load/store memory; other functions (e.g.
// arithmetic) can ignore d and use PTrue instead.
template <class D>
svbool_t MakeMask(D d) {
  return IsFull(d) ? PTrue(d) : FirstN(d, Lanes(d));
}

}  // namespace detail

// ================================================== INIT

// ------------------------------ Set
// vector = f(d, scalar), e.g. Set
#define HWY_SVE_SET(BASE, CHAR, BITS, HALF, NAME, OP)                         \
  template <size_t N, int kPow2>                                              \
  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
                                     HWY_SVE_T(BASE, BITS) arg) {             \
    return sv##OP##_##CHAR##BITS(arg);                                        \
  }

HWY_SVE_FOREACH(HWY_SVE_SET, Set, dup_n)
#undef HWY_SVE_SET

// Required for Zero and VFromD
// bfloat16 vectors are represented as svuint16_t; the value is set from
// arg.bits via the unsigned tag.
template <size_t N, int kPow2>
svuint16_t Set(Simd<bfloat16_t, N, kPow2> d, bfloat16_t arg) {
  return Set(RebindToUnsigned<decltype(d)>(), arg.bits);
}

// Vector type for tag D: whatever Set returns for it.
template <class D>
using VFromD = decltype(Set(D(), TFromD<D>()));

// ------------------------------ Zero

// Returns a vector of zeros, produced as unsigned zero and bit-cast to D.
template <class D>
VFromD<D> Zero(D d) {
  // Cast to support bfloat16_t.
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, Set(du, 0));
}

// ------------------------------ Undefined

// Generates Undefined(d), which forwards to svundef_* for the lane type.
#define HWY_SVE_UNDEFINED(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2>                            \
  HWY_API HWY_SVE_V(BASE, BITS)                             \
      NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */) {       \
    return sv##OP##_##CHAR##BITS();                         \
  }

HWY_SVE_FOREACH(HWY_SVE_UNDEFINED, Undefined, undef)

// ------------------------------ BitCast

namespace detail {

// u8: no change
#define HWY_SVE_CAST_NOP(BASE, CHAR, BITS, HALF, NAME, OP)                \
  HWY_API HWY_SVE_V(BASE, BITS) BitCastToByte(HWY_SVE_V(BASE, BITS) v) {  \
    return v;                                                             \
  }                                                                       \
  template <size_t N, int kPow2>                                          \
  HWY_API HWY_SVE_V(BASE, BITS) BitCastFromByte(                          \
      HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, HWY_SVE_V(BASE, BITS) v) { \
    return v;                                                             \
  }

// All other types
// Both directions go through svreinterpret between the type and svuint8_t.
#define HWY_SVE_CAST(BASE, CHAR, BITS, HALF, NAME, OP)                        \
  HWY_INLINE svuint8_t BitCastToByte(HWY_SVE_V(BASE, BITS) v) {               \
    return sv##OP##_u8_##CHAR##BITS(v);                                       \
  }                                                                           \
  template <size_t N, int kPow2>                                              \
  HWY_INLINE HWY_SVE_V(BASE, BITS)                                            \
      BitCastFromByte(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, svuint8_t v) { \
    return sv##OP##_##CHAR##BITS##_u8(v);                                     \
  }

HWY_SVE_FOREACH_U08(HWY_SVE_CAST_NOP, _, _)
HWY_SVE_FOREACH_I08(HWY_SVE_CAST, _, reinterpret)
HWY_SVE_FOREACH_UI16(HWY_SVE_CAST, _, reinterpret)
HWY_SVE_FOREACH_UI32(HWY_SVE_CAST, _, reinterpret)
HWY_SVE_FOREACH_UI64(HWY_SVE_CAST, _, reinterpret)
HWY_SVE_FOREACH_F(HWY_SVE_CAST, _, reinterpret)

#undef HWY_SVE_CAST_NOP
#undef HWY_SVE_CAST

// bfloat16 tag: delegates to the uint16_t overload, returning svuint16_t.
template <size_t N, int kPow2>
HWY_INLINE svuint16_t BitCastFromByte(Simd<bfloat16_t, N, kPow2> /* d */,
                                      svuint8_t v) {
  return BitCastFromByte(Simd<uint16_t, N, kPow2>(), v);
}

}  // namespace detail

// Reinterprets v as the vector type of d by round-tripping through bytes.
template <class D, class FromV>
HWY_API VFromD<D> BitCast(D d, FromV v) {
  return detail::BitCastFromByte(d, detail::BitCastToByte(v));
}

// ------------------------------ Tuple

// tuples = f(d, v..), e.g. Create2
// Generates NAME2/NAME3/NAME4, which wrap svcreate2/3/4 for one lane type.
#define HWY_SVE_CREATE(BASE, CHAR, BITS, HALF, NAME, OP)                 \
  template <size_t N, int kPow2>                                         \
  HWY_API HWY_SVE_TUPLE(BASE, BITS, 2)                                   \
      NAME##2(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */,                   \
              HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1) {      \
    return sv##OP##2_##CHAR##BITS(v0, v1);                               \
  }                                                                      \
  template <size_t N, int kPow2>                                         \
  HWY_API HWY_SVE_TUPLE(BASE, BITS, 3) NAME##3(                          \
      HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, HWY_SVE_V(BASE, BITS) v0, \
      HWY_SVE_V(BASE, BITS) v1, HWY_SVE_V(BASE, BITS) v2) {              \
    return sv##OP##3_##CHAR##BITS(v0, v1, v2);                           \
  }                                                                      \
  template <size_t N, int kPow2>                                         \
  HWY_API HWY_SVE_TUPLE(BASE, BITS, 4)                                   \
      NAME##4(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */,                   \
              HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1,        \
              HWY_SVE_V(BASE, BITS) v2, HWY_SVE_V(BASE, BITS) v3) {      \
    return sv##OP##4_##CHAR##BITS(v0, v1, v2, v3);                       \
  }

HWY_SVE_FOREACH(HWY_SVE_CREATE, Create, create)
// bfloat16 is not included in FOREACH.
HWY_SVE_CREATE(bfloat, bf, 16, 8, Create, create)
#undef HWY_SVE_CREATE

// Tuple types for tag D, deduced from the return types of Create2/3/4.
template <class D>
using Vec2 = decltype(Create2(D(), Zero(D()), Zero(D())));
template <class D>
using Vec3 = decltype(Create3(D(), Zero(D()), Zero(D()), Zero(D())));
template <class D>
using Vec4 = decltype(Create4(D(), Zero(D()), Zero(D()), Zero(D()), Zero(D())));

// Generates NAME2/NAME3/NAME4<kIndex>(tuple), which wrap svget2/3/4 and
// return the vector at compile-time index kIndex.
#define HWY_SVE_GET(BASE, CHAR, BITS, HALF, NAME, OP)                         \
  template <size_t kIndex>                                                    \
  HWY_API HWY_SVE_V(BASE, BITS) NAME##2(HWY_SVE_TUPLE(BASE, BITS, 2) tuple) { \
    return sv##OP##2_##CHAR##BITS(tuple, kIndex);                             \
  }                                                                           \
  template <size_t kIndex>                                                    \
  HWY_API HWY_SVE_V(BASE, BITS) NAME##3(HWY_SVE_TUPLE(BASE, BITS, 3) tuple) { \
    return sv##OP##3_##CHAR##BITS(tuple, kIndex);                             \
  }                                                                           \
  template <size_t kIndex>                                                    \
  HWY_API HWY_SVE_V(BASE, BITS) NAME##4(HWY_SVE_TUPLE(BASE, BITS, 4) tuple) { \
    return sv##OP##4_##CHAR##BITS(tuple, kIndex);                             \
  }

HWY_SVE_FOREACH(HWY_SVE_GET, Get, get)
// bfloat16 is not included in FOREACH.
HWY_SVE_GET(bfloat, bf, 16, 8, Get, get)
#undef HWY_SVE_GET

// ------------------------------ ResizeBitCast

// Same as BitCast on SVE
template <class D, class FromV>
HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
  return BitCast(d, v);
}

// ================================================== LOGICAL

// detail::*N() functions accept a scalar argument to avoid extra Set().
+ +// ------------------------------ Not +HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPV, Not, not ) // NOLINT + +// ------------------------------ And + +namespace detail { +HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN, AndN, and_n) +} // namespace detail + +HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, And, and) + +template <class V, HWY_IF_FLOAT_V(V)> +HWY_API V And(const V a, const V b) { + const DFromV<V> df; + const RebindToUnsigned<decltype(df)> du; + return BitCast(df, And(BitCast(du, a), BitCast(du, b))); +} + +// ------------------------------ Or + +HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, Or, orr) + +template <class V, HWY_IF_FLOAT_V(V)> +HWY_API V Or(const V a, const V b) { + const DFromV<V> df; + const RebindToUnsigned<decltype(df)> du; + return BitCast(df, Or(BitCast(du, a), BitCast(du, b))); +} + +// ------------------------------ Xor + +namespace detail { +HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN, XorN, eor_n) +} // namespace detail + +HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, Xor, eor) + +template <class V, HWY_IF_FLOAT_V(V)> +HWY_API V Xor(const V a, const V b) { + const DFromV<V> df; + const RebindToUnsigned<decltype(df)> du; + return BitCast(df, Xor(BitCast(du, a), BitCast(du, b))); +} + +// ------------------------------ AndNot + +namespace detail { +#define HWY_SVE_RETV_ARGPVN_SWAP(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_API HWY_SVE_V(BASE, BITS) \ + NAME(HWY_SVE_T(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \ + return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), b, a); \ + } + +HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN_SWAP, AndNotN, bic_n) +#undef HWY_SVE_RETV_ARGPVN_SWAP +} // namespace detail + +#define HWY_SVE_RETV_ARGPVV_SWAP(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_API HWY_SVE_V(BASE, BITS) \ + NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \ + return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), b, a); \ + } +HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV_SWAP, AndNot, bic) +#undef HWY_SVE_RETV_ARGPVV_SWAP + +template <class V, HWY_IF_FLOAT_V(V)> +HWY_API V 
AndNot(const V a, const V b) { + const DFromV<V> df; + const RebindToUnsigned<decltype(df)> du; + return BitCast(df, AndNot(BitCast(du, a), BitCast(du, b))); +} + +// ------------------------------ Xor3 + +#if HWY_SVE_HAVE_2 + +HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGVVV, Xor3, eor3) + +template <class V, HWY_IF_FLOAT_V(V)> +HWY_API V Xor3(const V x1, const V x2, const V x3) { + const DFromV<V> df; + const RebindToUnsigned<decltype(df)> du; + return BitCast(df, Xor3(BitCast(du, x1), BitCast(du, x2), BitCast(du, x3))); +} + +#else +template <class V> +HWY_API V Xor3(V x1, V x2, V x3) { + return Xor(x1, Xor(x2, x3)); +} +#endif + +// ------------------------------ Or3 +template <class V> +HWY_API V Or3(V o1, V o2, V o3) { + return Or(o1, Or(o2, o3)); +} + +// ------------------------------ OrAnd +template <class V> +HWY_API V OrAnd(const V o, const V a1, const V a2) { + return Or(o, And(a1, a2)); +} + +// ------------------------------ PopulationCount + +#ifdef HWY_NATIVE_POPCNT +#undef HWY_NATIVE_POPCNT +#else +#define HWY_NATIVE_POPCNT +#endif + +// Need to return original type instead of unsigned. 
// Generates NAME(v) per lane type: calls sv<OP>_<CHAR><BITS>_x under an
// all-true predicate (HWY_SVE_PTRUE) and BitCasts the result to the type of v.
#define HWY_SVE_POPCNT(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
    return BitCast(DFromV<decltype(v)>(), \
                   sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v)); \
  }
HWY_SVE_FOREACH_UI(HWY_SVE_POPCNT, PopulationCount, cnt)
#undef HWY_SVE_POPCNT

// ================================================== SIGN

// ------------------------------ Neg
HWY_SVE_FOREACH_IF(HWY_SVE_RETV_ARGPV, Neg, neg)

// ------------------------------ Abs
HWY_SVE_FOREACH_IF(HWY_SVE_RETV_ARGPV, Abs, abs)

// ------------------------------ CopySign[ToAbs]

// Combines the bits of `magn` outside the SignBit mask with the bits of `sign`
// inside it (AndNot(msb, magn) is taken to mean ~msb & magn, matching the
// "reversed order" noted on the mask AndNot in this file).
template <class V>
HWY_API V CopySign(const V magn, const V sign) {
  const auto msb = SignBit(DFromV<V>());
  return Or(AndNot(msb, magn), And(msb, sign));
}

// Like CopySign, but ORs the sign bit of `sign` into `abs` without first
// clearing the sign bit of `abs`.
// NOTE(review): presumably callers guarantee `abs` is non-negative - confirm.
template <class V>
HWY_API V CopySignToAbs(const V abs, const V sign) {
  const auto msb = SignBit(DFromV<V>());
  return Or(abs, And(msb, sign));
}

// ================================================== ARITHMETIC

// Per-target flags to prevent generic_ops-inl.h defining Add etc.
#ifdef HWY_NATIVE_OPERATOR_REPLACEMENTS
#undef HWY_NATIVE_OPERATOR_REPLACEMENTS
#else
#define HWY_NATIVE_OPERATOR_REPLACEMENTS
#endif

// ------------------------------ Add

// AddN: vector + scalar variant, kept in detail (used e.g. by
// MulFixedPoint15 in this file).
namespace detail {
HWY_SVE_FOREACH(HWY_SVE_RETV_ARGPVN, AddN, add_n)
}  // namespace detail

HWY_SVE_FOREACH(HWY_SVE_RETV_ARGPVV, Add, add)

// ------------------------------ Sub

namespace detail {
// Can't use HWY_SVE_RETV_ARGPVN because caller wants to specify pg.
// The generated function takes the predicate as its first argument and calls
// the `_z` form of the intrinsic with it.
#define HWY_SVE_RETV_ARGPVN_MASK(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_V(BASE, BITS) \
      NAME(svbool_t pg, HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) { \
    return sv##OP##_##CHAR##BITS##_z(pg, a, b); \
  }

HWY_SVE_FOREACH(HWY_SVE_RETV_ARGPVN_MASK, SubN, sub_n)
#undef HWY_SVE_RETV_ARGPVN_MASK
}  // namespace detail

HWY_SVE_FOREACH(HWY_SVE_RETV_ARGPVV, Sub, sub)

// ------------------------------ SumsOf8
// Returns u64 lanes holding sums of u8 input lanes. Two steps: a dot product
// with the constant 1 accumulates bytes into u32 sums, then the upper and
// lower 32-bit halves of every u64 are added together.
HWY_API svuint64_t SumsOf8(const svuint8_t v) {
  const ScalableTag<uint32_t> du32;
  const ScalableTag<uint64_t> du64;
  const svbool_t pg = detail::PTrue(du64);

  const svuint32_t sums_of_4 = svdot_n_u32(Zero(du32), v, 1);
  // Compute pairwise sum of u32 and extend to u64.
  // TODO(janwas): on SVE2, we can instead use svaddp.
  const svuint64_t hi = svlsr_n_u64_x(pg, BitCast(du64, sums_of_4), 32);
  // Isolate the lower 32 bits (to be added to the upper 32 and zero-extended)
  const svuint64_t lo = svextw_u64_x(pg, BitCast(du64, sums_of_4));
  return Add(hi, lo);
}

// ------------------------------ SaturatedAdd

// Toggle the four 32/64-bit saturated add/sub flags (same idiom as the other
// HWY_NATIVE_* flags in this file).
#ifdef HWY_NATIVE_I32_SATURATED_ADDSUB
#undef HWY_NATIVE_I32_SATURATED_ADDSUB
#else
#define HWY_NATIVE_I32_SATURATED_ADDSUB
#endif

#ifdef HWY_NATIVE_U32_SATURATED_ADDSUB
#undef HWY_NATIVE_U32_SATURATED_ADDSUB
#else
#define HWY_NATIVE_U32_SATURATED_ADDSUB
#endif

#ifdef HWY_NATIVE_I64_SATURATED_ADDSUB
#undef HWY_NATIVE_I64_SATURATED_ADDSUB
#else
#define HWY_NATIVE_I64_SATURATED_ADDSUB
#endif

#ifdef HWY_NATIVE_U64_SATURATED_ADDSUB
#undef HWY_NATIVE_U64_SATURATED_ADDSUB
#else
#define HWY_NATIVE_U64_SATURATED_ADDSUB
#endif

HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGVV, SaturatedAdd, qadd)

// ------------------------------ SaturatedSub

HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGVV, SaturatedSub, qsub)

// ------------------------------ AbsDiff
#ifdef HWY_NATIVE_INTEGER_ABS_DIFF
#undef HWY_NATIVE_INTEGER_ABS_DIFF
#else
#define HWY_NATIVE_INTEGER_ABS_DIFF
#endif

HWY_SVE_FOREACH(HWY_SVE_RETV_ARGPVV, AbsDiff, abd)

// ------------------------------ ShiftLeft[Same]

// Generates two functions per lane type, both calling the same intrinsic under
// an all-true predicate:
//   NAME<kBits>(v)      - shift count is a template argument;
//   NAME##Same(v, bits) - shift count is a scalar function argument.
#define HWY_SVE_SHIFT_N(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <int kBits> \
  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v, kBits); \
  } \
  HWY_API HWY_SVE_V(BASE, BITS) \
      NAME##Same(HWY_SVE_V(BASE, BITS) v, HWY_SVE_T(uint, BITS) bits) { \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v, bits); \
  }

HWY_SVE_FOREACH_UI(HWY_SVE_SHIFT_N, ShiftLeft, lsl_n)

// ------------------------------ ShiftRight[Same]

// Unsigned lanes use lsr_n, signed lanes use asr_n.
HWY_SVE_FOREACH_U(HWY_SVE_SHIFT_N, ShiftRight, lsr_n)
HWY_SVE_FOREACH_I(HWY_SVE_SHIFT_N, ShiftRight, asr_n)

#undef HWY_SVE_SHIFT_N

// ------------------------------ RotateRight

// Rotates every lane right by kBits, built as
// ShiftRight<kBits> | ShiftLeft<lane_bits - kBits>.
// NOTE(review): ShiftRight is generated from asr_n for signed lanes, so this
// presumably expects an unsigned V - confirm.
// TODO(janwas): svxar on SVE2
template <int kBits, class V>
HWY_API V RotateRight(const V v) {
  constexpr size_t kSizeInBits = sizeof(TFromV<V>) * 8;
  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
  if (kBits == 0) return v;
  // The plain `if` above does not stop the expression below from being
  // instantiated for kBits == 0; HWY_MIN keeps the ShiftLeft count below the
  // lane width in that case.
  return Or(ShiftRight<kBits>(v),
            ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
}

// ------------------------------ Shl/r

// Generates NAME(v, bits) where `bits` is a vector of per-lane shift counts.
// The count vector is BitCast to the unsigned lane type before being passed to
// the intrinsic.
#define HWY_SVE_SHIFT(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_V(BASE, BITS) \
      NAME(HWY_SVE_V(BASE, BITS) v, HWY_SVE_V(BASE, BITS) bits) { \
    const RebindToUnsigned<DFromV<decltype(v)>> du; \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v, \
                                     BitCast(du, bits)); \
  }

HWY_SVE_FOREACH_UI(HWY_SVE_SHIFT, Shl, lsl)

HWY_SVE_FOREACH_U(HWY_SVE_SHIFT, Shr, lsr)
HWY_SVE_FOREACH_I(HWY_SVE_SHIFT, Shr, asr)

#undef HWY_SVE_SHIFT

// ------------------------------ Min/Max

// Integer lanes use min/max; float lanes use the minnm/maxnm variants.
HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, Min, min)
HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, Max, max)
HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPVV, Min, minnm)
HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPVV, Max, maxnm)

// Vector-vs-scalar variants for internal use.
namespace detail {
HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN, MinN, min_n)
HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN, MaxN, max_n)
}  // namespace detail

// ------------------------------ Mul

// Per-target flags to prevent generic_ops-inl.h defining 8/64-bit operator*.
#ifdef HWY_NATIVE_MUL_8
#undef HWY_NATIVE_MUL_8
#else
#define HWY_NATIVE_MUL_8
#endif
#ifdef HWY_NATIVE_MUL_64
#undef HWY_NATIVE_MUL_64
#else
#define HWY_NATIVE_MUL_64
#endif

HWY_SVE_FOREACH(HWY_SVE_RETV_ARGPVV, Mul, mul)

// ------------------------------ MulHigh
HWY_SVE_FOREACH_UI16(HWY_SVE_RETV_ARGPVV, MulHigh, mulh)
// Not part of API, used internally:
HWY_SVE_FOREACH_UI32(HWY_SVE_RETV_ARGPVV, MulHigh, mulh)
HWY_SVE_FOREACH_U64(HWY_SVE_RETV_ARGPVV, MulHigh, mulh)

// ------------------------------ MulFixedPoint15
// With HWY_SVE_HAVE_2 this is a single svqrdmulh_s16. Otherwise the result is
// assembled from the low (Mul) and high (MulHigh) 16-bit halves of the product:
// 2 * hi plus a rounding term derived from the top two bits of lo.
HWY_API svint16_t MulFixedPoint15(svint16_t a, svint16_t b) {
#if HWY_SVE_HAVE_2
  return svqrdmulh_s16(a, b);
#else
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;

  const svuint16_t lo = BitCast(du, Mul(a, b));
  const svint16_t hi = MulHigh(a, b);
  // We want (lo + 0x4000) >> 15, but that can overflow, and if it does we must
  // carry that into the result. Instead isolate the top two bits because only
  // they can influence the result.
  const svuint16_t lo_top2 = ShiftRight<14>(lo);
  // Bits 11: add 2, 10: add 1, 01: add 1, 00: add 0.
  const svuint16_t rounding = ShiftRight<1>(detail::AddN(lo_top2, 1));
  return Add(Add(hi, hi), BitCast(d, rounding));
#endif
}

// ------------------------------ Div
HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPVV, Div, div)

// ------------------------------ ApproximateReciprocal
HWY_SVE_FOREACH_F32(HWY_SVE_RETV_ARGV, ApproximateReciprocal, recpe)

// ------------------------------ Sqrt
HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPV, Sqrt, sqrt)

// ------------------------------ ApproximateReciprocalSqrt
HWY_SVE_FOREACH_F32(HWY_SVE_RETV_ARGV, ApproximateReciprocalSqrt, rsqrte)

// ------------------------------ MulAdd

// Per-target flag to prevent generic_ops-inl.h from defining int MulAdd.
#ifdef HWY_NATIVE_INT_FMA
#undef HWY_NATIVE_INT_FMA
#else
#define HWY_NATIVE_INT_FMA
#endif

// Generates NAME(mul, x, add). Note the argument order: the first two
// arguments are swapped when forwarded, i.e. the intrinsic receives
// (x, mul, add).
#define HWY_SVE_FMA(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_V(BASE, BITS) \
      NAME(HWY_SVE_V(BASE, BITS) mul, HWY_SVE_V(BASE, BITS) x, \
           HWY_SVE_V(BASE, BITS) add) { \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), x, mul, add); \
  }

HWY_SVE_FOREACH(HWY_SVE_FMA, MulAdd, mad)

// ------------------------------ NegMulAdd
HWY_SVE_FOREACH(HWY_SVE_FMA, NegMulAdd, msb)

// ------------------------------ MulSub
HWY_SVE_FOREACH_F(HWY_SVE_FMA, MulSub, nmsb)

// ------------------------------ NegMulSub
HWY_SVE_FOREACH_F(HWY_SVE_FMA, NegMulSub, nmad)

#undef HWY_SVE_FMA

// ------------------------------ Round etc.

// Float-only rounding wrappers; each maps to the rint* intrinsic named in the
// last macro argument.
HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPV, Round, rintn)
HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPV, Floor, rintm)
HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPV, Ceil, rintp)
HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPV, Trunc, rintz)

// ================================================== MASK

// ------------------------------ RebindMask
// Returns `mask` as svbool_t without modifying it; the tag is unused.
template <class D, typename MFrom>
HWY_API svbool_t RebindMask(const D /*d*/, const MFrom mask) {
  return mask;
}

// ------------------------------ Mask logical

HWY_API svbool_t Not(svbool_t m) {
  // We don't know the lane type, so assume 8-bit. For larger types, this will
  // de-canonicalize the predicate, i.e. set bits to 1 even though they do not
  // correspond to the lowest byte in the lane. Arm says such bits are ignored.
  return svnot_b_z(HWY_SVE_PTRUE(8), m);
}
HWY_API svbool_t And(svbool_t a, svbool_t b) {
  return svand_b_z(b, b, a);  // same order as AndNot for consistency
}
// Returns ~a & b (note which operand is inverted).
HWY_API svbool_t AndNot(svbool_t a, svbool_t b) {
  return svbic_b_z(b, b, a);  // reversed order like NEON
}
HWY_API svbool_t Or(svbool_t a, svbool_t b) {
  return svsel_b(a, a, b);  // a ? true : b
}
HWY_API svbool_t Xor(svbool_t a, svbool_t b) {
  return svsel_b(a, svnand_b_z(a, a, b), b);  // a ? !(a & b) : b.
}

// NOR of the two masks under an all-true 8-bit predicate. Per the inline
// comment, the result is only meaningful if a and b are never both set.
HWY_API svbool_t ExclusiveNeither(svbool_t a, svbool_t b) {
  return svnor_b_z(HWY_SVE_PTRUE(8), a, b);  // !a && !b, undefined if a && b.
}

// ------------------------------ CountTrue

// Generates NAME(d, m): calls sv<OP>_b<BITS> on `m`, governed by
// detail::MakeMask(d) rather than an all-true predicate.
#define HWY_SVE_COUNT_TRUE(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API size_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, svbool_t m) { \
    return sv##OP##_b##BITS(detail::MakeMask(d), m); \
  }

HWY_SVE_FOREACH(HWY_SVE_COUNT_TRUE, CountTrue, cntp)
#undef HWY_SVE_COUNT_TRUE

// For 16-bit Compress: full vector, not limited to SV_POW2.
namespace detail {

// Same as the CountTrue generator, except the tag is ignored and the governing
// predicate is svptrue_b<BITS>() instead of detail::MakeMask(d).
#define HWY_SVE_COUNT_TRUE_FULL(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API size_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, svbool_t m) { \
    return sv##OP##_b##BITS(svptrue_b##BITS(), m); \
  }

HWY_SVE_FOREACH(HWY_SVE_COUNT_TRUE_FULL, CountTrueFull, cntp)
#undef HWY_SVE_COUNT_TRUE_FULL

}  // namespace detail

// ------------------------------ AllFalse
// True if svptest_any finds no set element of `m` under detail::MakeMask(d).
template <class D>
HWY_API bool AllFalse(D d, svbool_t m) {
  return !svptest_any(detail::MakeMask(d), m);
}

// ------------------------------ AllTrue
// True if the number of set lanes equals Lanes(d).
template <class D>
HWY_API bool AllTrue(D d, svbool_t m) {
  return CountTrue(d, m) == Lanes(d);
}

// ------------------------------ FindFirstTrue
// Returns -1 if AllFalse(d, m); otherwise the CountTrue of the predicate
// produced by svbrkb_b_z(MakeMask(d), m).
// NOTE(review): svbrkb is presumably "break before first true", making the
// count equal to the index of the first set lane - confirm against ACLE.
template <class D>
HWY_API intptr_t FindFirstTrue(D d, svbool_t m) {
  return AllFalse(d, m) ? intptr_t{-1}
                        : static_cast<intptr_t>(
                              CountTrue(d, svbrkb_b_z(detail::MakeMask(d), m)));
}

// ------------------------------ FindKnownFirstTrue
// Same computation as FindFirstTrue, but without the AllFalse check.
template <class D>
HWY_API size_t FindKnownFirstTrue(D d, svbool_t m) {
  return CountTrue(d, svbrkb_b_z(detail::MakeMask(d), m));
}

// ------------------------------ IfThenElse
// Generates NAME(m, yes, no), forwarding directly to sv<OP>_<CHAR><BITS>.
#define HWY_SVE_IF_THEN_ELSE(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_V(BASE, BITS) \
      NAME(svbool_t m, HWY_SVE_V(BASE, BITS) yes, HWY_SVE_V(BASE, BITS) no) { \
    return sv##OP##_##CHAR##BITS(m, yes, no); \
  }

HWY_SVE_FOREACH(HWY_SVE_IF_THEN_ELSE, IfThenElse, sel)
#undef HWY_SVE_IF_THEN_ELSE

// ------------------------------ IfThenElseZero
// IfThenElse with Zero as the `no` operand.
template <class V>
HWY_API V IfThenElseZero(const svbool_t mask, const V yes) {
  return IfThenElse(mask, yes, Zero(DFromV<V>()));
}

// ------------------------------ IfThenZeroElse
// IfThenElse with Zero as the `yes` operand.
template <class V>
HWY_API V IfThenZeroElse(const svbool_t mask, const V no) {
  return IfThenElse(mask, Zero(DFromV<V>()), no);
}

// ================================================== COMPARE

// mask = f(vector, vector)
#define HWY_SVE_COMPARE(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API svbool_t NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
    return sv##OP##_##CHAR##BITS(HWY_SVE_PTRUE(BITS), a, b); \
  }
// mask = f(vector, scalar)
#define HWY_SVE_COMPARE_N(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API svbool_t NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) { \
    return sv##OP##_##CHAR##BITS(HWY_SVE_PTRUE(BITS), a, b); \
  }

// ------------------------------ Eq
HWY_SVE_FOREACH(HWY_SVE_COMPARE, Eq, cmpeq)
namespace detail {
HWY_SVE_FOREACH(HWY_SVE_COMPARE_N, EqN, cmpeq_n)
}  // namespace detail

// ------------------------------ Ne
HWY_SVE_FOREACH(HWY_SVE_COMPARE, Ne, cmpne)
namespace detail {
HWY_SVE_FOREACH(HWY_SVE_COMPARE_N, NeN, cmpne_n)
}  // namespace detail

// ------------------------------ Lt
HWY_SVE_FOREACH(HWY_SVE_COMPARE, Lt, cmplt)
namespace detail {
HWY_SVE_FOREACH(HWY_SVE_COMPARE_N, LtN, cmplt_n)
}  // namespace detail

// ------------------------------ Le
HWY_SVE_FOREACH(HWY_SVE_COMPARE, Le, cmple)
namespace detail {
HWY_SVE_FOREACH(HWY_SVE_COMPARE_N, LeN, cmple_n)
}  // namespace detail

// ------------------------------ Gt/Ge (swapped order)
template <class V>
HWY_API svbool_t Gt(const V a, const V b) {
  return Lt(b, a);
}
template <class V>
HWY_API svbool_t Ge(const V a, const V b) {
  return Le(b, a);
}
// The scalar variants cannot swap operands, so they use cmpge_n/cmpgt_n.
namespace detail {
HWY_SVE_FOREACH(HWY_SVE_COMPARE_N, GeN, cmpge_n)
HWY_SVE_FOREACH(HWY_SVE_COMPARE_N, GtN, cmpgt_n)
}  // namespace detail

#undef HWY_SVE_COMPARE
#undef HWY_SVE_COMPARE_N

// ------------------------------ TestBit
// Mask of lanes where (a & bit) is nonzero.
template <class V>
HWY_API svbool_t TestBit(const V a, const V bit) {
  return detail::NeN(And(a, bit), 0);
}

// ------------------------------ MaskFromVec (Ne)
// Mask of lanes that compare not-equal to zero.
template <class V>
HWY_API svbool_t MaskFromVec(const V v) {
  return detail::NeN(v, static_cast<TFromV<V>>(0));
}

// ------------------------------ VecFromMask
// Lanes selected by `mask` become -1 (set via the signed tag), all others
// zero; the result is then BitCast to the vector type of `d`.
template <class D>
HWY_API VFromD<D> VecFromMask(const D d, svbool_t mask) {
  const RebindToSigned<D> di;
  // This generates MOV imm, whereas svdup_n_s8_z generates MOV scalar, which
  // requires an extra instruction plus M0 pipeline.
  return BitCast(d, IfThenElseZero(mask, Set(di, -1)));
}

// ------------------------------ IfVecThenElse (MaskFromVec, IfThenElse)

#if HWY_SVE_HAVE_2

// Generates NAME(mask, yes, no). Note the reordering: the intrinsic is called
// as sv<OP>(yes, no, mask).
#define HWY_SVE_IF_VEC(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_V(BASE, BITS) \
      NAME(HWY_SVE_V(BASE, BITS) mask, HWY_SVE_V(BASE, BITS) yes, \
           HWY_SVE_V(BASE, BITS) no) { \
    return sv##OP##_##CHAR##BITS(yes, no, mask); \
  }

HWY_SVE_FOREACH_UI(HWY_SVE_IF_VEC, IfVecThenElse, bsl)
#undef HWY_SVE_IF_VEC

// Float lanes: route through the unsigned overloads via BitCast.
template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V IfVecThenElse(const V mask, const V yes, const V no) {
  const DFromV<V> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(
      d, IfVecThenElse(BitCast(du, mask), BitCast(du, yes), BitCast(du, no)));
}

#else

// Without HWY_SVE_HAVE_2: (mask & yes) | AndNot(mask, no).
template <class V>
HWY_API V IfVecThenElse(const V mask, const V yes, const V no) {
  return Or(And(mask, yes), AndNot(mask, no));
}

#endif  // HWY_SVE_HAVE_2

// ------------------------------ BitwiseIfThenElse

#ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE
#undef HWY_NATIVE_BITWISE_IF_THEN_ELSE
#else
#define HWY_NATIVE_BITWISE_IF_THEN_ELSE
#endif

// Forwards to IfVecThenElse.
template <class V>
HWY_API V BitwiseIfThenElse(V mask, V yes, V no) {
  return IfVecThenElse(mask, yes, no);
}

// ------------------------------ Floating-point classification (Ne)

template <class V>
HWY_API svbool_t IsNaN(const V v) {
  return Ne(v, v);  // could also use cmpuo
}

// Reinterprets v as signed integers, doubles it to discard the sign bit, and
// compares the result against hwy::MaxExponentTimes2<T>().
template <class V>
HWY_API svbool_t IsInf(const V v) {
  using T = TFromV<V>;
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;
  const VFromD<decltype(di)> vi = BitCast(di, v);
  // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
  return RebindMask(d, detail::EqN(Add(vi, vi), hwy::MaxExponentTimes2<T>()));
}

// Returns whether normal/subnormal/zero.
// Extracts the exponent field as a signed integer (double the unsigned bits to
// drop the sign, then shift right past the mantissa) and tests whether it is
// below hwy::MaxExponentField<T>().
template <class V>
HWY_API svbool_t IsFinite(const V v) {
  using T = TFromV<V>;
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const RebindToSigned<decltype(d)> di;  // cheaper than unsigned comparison
  const VFromD<decltype(du)> vu = BitCast(du, v);
  // 'Shift left' to clear the sign bit, then right so we can compare with the
  // max exponent (cannot compare with MaxExponentTimes2 directly because it is
  // negative and non-negative floats would be greater).
  const VFromD<decltype(di)> exp =
      BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu)));
  return RebindMask(d, detail::LtN(exp, hwy::MaxExponentField<T>()));
}

// ================================================== MEMORY

// ------------------------------ Load/MaskedLoad/LoadDup128/Store/Stream

// NAME(d, p): load governed by detail::MakeMask(d).
#define HWY_SVE_LOAD(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API HWY_SVE_V(BASE, BITS) \
      NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, \
           const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
    return sv##OP##_##CHAR##BITS(detail::MakeMask(d), p); \
  }

// NAME(m, d, p): load governed by the caller's mask `m`; the tag is unused.
#define HWY_SVE_MASKED_LOAD(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API HWY_SVE_V(BASE, BITS) \
      NAME(svbool_t m, HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
           const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
    return sv##OP##_##CHAR##BITS(m, p); \
  }

// NAME(d, p): load under an all-true 8-bit predicate; the tag is unused.
#define HWY_SVE_LOAD_DUP128(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API HWY_SVE_V(BASE, BITS) \
      NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
           const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
    /* All-true predicate to load all 128 bits. */ \
    return sv##OP##_##CHAR##BITS(HWY_SVE_PTRUE(8), p); \
  }

// NAME(v, d, p): store governed by detail::MakeMask(d).
#define HWY_SVE_STORE(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API void NAME(HWY_SVE_V(BASE, BITS) v, \
                    HWY_SVE_D(BASE, BITS, N, kPow2) d, \
                    HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
    sv##OP##_##CHAR##BITS(detail::MakeMask(d), p, v); \
  }

// NAME(v, m, d, p): store governed by the caller's mask `m`; the tag is unused.
#define HWY_SVE_BLENDED_STORE(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API void NAME(HWY_SVE_V(BASE, BITS) v, svbool_t m, \
                    HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
                    HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
    sv##OP##_##CHAR##BITS(m, p, v); \
  }

HWY_SVE_FOREACH(HWY_SVE_LOAD, Load, ld1)
HWY_SVE_FOREACH(HWY_SVE_MASKED_LOAD, MaskedLoad, ld1)
HWY_SVE_FOREACH(HWY_SVE_STORE, Store, st1)
HWY_SVE_FOREACH(HWY_SVE_STORE, Stream, stnt1)
HWY_SVE_FOREACH(HWY_SVE_BLENDED_STORE, BlendedStore, st1)

// LoadDupFull128 (ld1rq) is only generated when the target is not
// HWY_SVE2_128.
#if HWY_TARGET != HWY_SVE2_128
namespace detail {
HWY_SVE_FOREACH(HWY_SVE_LOAD_DUP128, LoadDupFull128, ld1rq)
}  // namespace detail
#endif  // HWY_TARGET != HWY_SVE2_128

#undef HWY_SVE_LOAD
#undef HWY_SVE_MASKED_LOAD
#undef HWY_SVE_LOAD_DUP128
#undef HWY_SVE_STORE
#undef HWY_SVE_BLENDED_STORE

// BF16 is the same as svuint16_t because BF16 is optional before v8.6.
// Implemented by reinterpreting the pointer as uint16_t and using the u16 Load.
template <size_t N, int kPow2>
HWY_API svuint16_t Load(Simd<bfloat16_t, N, kPow2> d,
                        const bfloat16_t* HWY_RESTRICT p) {
  return Load(RebindToUnsigned<decltype(d)>(),
              reinterpret_cast<const uint16_t * HWY_RESTRICT>(p));
}

#if HWY_TARGET == HWY_SVE2_128
// On the HWY_SVE2_128 target, LoadDup128 is the same as Load since vectors
// cannot exceed 16 bytes on the HWY_SVE2_128 target.
template <class D>
HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* HWY_RESTRICT p) {
  return Load(d, p);
}
#else
// If D().MaxBytes() <= 16 is true, simply do a Load operation.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* HWY_RESTRICT p) {
  return Load(d, p);
}

// If D().MaxBytes() > 16 is true, need to load the vector using ld1rq
// (this overload excludes bfloat16_t, which is handled separately below).
template <class D, HWY_IF_V_SIZE_GT_D(D, 16),
          hwy::EnableIf<!IsSame<TFromD<D>, bfloat16_t>()>* = nullptr>
HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* HWY_RESTRICT p) {
  return detail::LoadDupFull128(d, p);
}

// BF16 is the same as svuint16_t because BF16 is optional before v8.6.
// Reinterprets the pointer as uint16_t and uses the u16 LoadDupFull128.
template <class D, HWY_IF_V_SIZE_GT_D(D, 16), HWY_IF_BF16_D(D)>
HWY_API svuint16_t LoadDup128(D d, const bfloat16_t* HWY_RESTRICT p) {
  return detail::LoadDupFull128(
      RebindToUnsigned<decltype(d)>(),
      reinterpret_cast<const uint16_t * HWY_RESTRICT>(p));
}
#endif  // HWY_TARGET != HWY_SVE2_128

// BF16 store: reinterprets the destination as uint16_t and uses the u16 Store.
template <size_t N, int kPow2>
HWY_API void Store(svuint16_t v, Simd<bfloat16_t, N, kPow2> d,
                   bfloat16_t* HWY_RESTRICT p) {
  Store(v, RebindToUnsigned<decltype(d)>(),
        reinterpret_cast<uint16_t * HWY_RESTRICT>(p));
}

// ------------------------------ Load/StoreU

// SVE only requires lane alignment, not natural alignment of the entire
// vector. Hence the unaligned variants simply forward to Load/Store.
template <class D>
HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
  return Load(d, p);
}

template <class V, class D>
HWY_API void StoreU(const V v, D d, TFromD<D>* HWY_RESTRICT p) {
  Store(v, d, p);
}

// ------------------------------ MaskedLoadOr

// SVE MaskedLoad hard-codes zero, so this requires an extra blend.
// Lanes selected by `m` come from the masked load; all other lanes are taken
// from `v`.
template <class D>
HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d,
                               const TFromD<D>* HWY_RESTRICT p) {
  return IfThenElse(m, MaskedLoad(m, d, p), v);
}

// ------------------------------ ScatterOffset/Index

// NAME(v, d, base, offset): scatter-store using the s<BITS>offset form of the
// intrinsic; `offset` is a signed integer vector with the same lane width.
#define HWY_SVE_SCATTER_OFFSET(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API void NAME(HWY_SVE_V(BASE, BITS) v, \
                    HWY_SVE_D(BASE, BITS, N, kPow2) d, \
                    HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base, \
                    HWY_SVE_V(int, BITS) offset) { \
    sv##OP##_s##BITS##offset_##CHAR##BITS(detail::MakeMask(d), base, offset, \
                                          v); \
  }

// NAME(v, d, base, index): as above but using the s<BITS>index form.
#define HWY_SVE_SCATTER_INDEX(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API void NAME( \
      HWY_SVE_V(BASE, BITS) v, HWY_SVE_D(BASE, BITS, N, kPow2) d, \
      HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base, HWY_SVE_V(int, BITS) index) { \
    sv##OP##_s##BITS##index_##CHAR##BITS(detail::MakeMask(d), base, index, v); \
  }

HWY_SVE_FOREACH_UIF3264(HWY_SVE_SCATTER_OFFSET, ScatterOffset, st1_scatter)
HWY_SVE_FOREACH_UIF3264(HWY_SVE_SCATTER_INDEX, ScatterIndex, st1_scatter)
#undef HWY_SVE_SCATTER_OFFSET
#undef HWY_SVE_SCATTER_INDEX

// ------------------------------ GatherOffset/Index

// NAME(d, base, offset): gather-load counterpart of the scatter macros above.
#define HWY_SVE_GATHER_OFFSET(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API HWY_SVE_V(BASE, BITS) \
      NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, \
           const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base, \
           HWY_SVE_V(int, BITS) offset) { \
    return sv##OP##_s##BITS##offset_##CHAR##BITS(detail::MakeMask(d), base, \
                                                 offset); \
  }
#define HWY_SVE_GATHER_INDEX(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API HWY_SVE_V(BASE, BITS) \
      NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, \
           const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base, \
           HWY_SVE_V(int, BITS) index) { \
    return sv##OP##_s##BITS##index_##CHAR##BITS(detail::MakeMask(d), base, \
                                                index); \
  }

HWY_SVE_FOREACH_UIF3264(HWY_SVE_GATHER_OFFSET, GatherOffset, ld1_gather)
HWY_SVE_FOREACH_UIF3264(HWY_SVE_GATHER_INDEX, GatherIndex, ld1_gather)
#undef HWY_SVE_GATHER_OFFSET
#undef HWY_SVE_GATHER_INDEX

// ------------------------------ LoadInterleaved2

// Per-target flag to prevent generic_ops-inl.h from defining LoadInterleaved2.
#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#else
#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
#endif

// NAME(d, unaligned, v0, v1): loads a 2-vector tuple and writes its elements
// to the output references.
#define HWY_SVE_LOAD2(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API void NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, \
                    const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned, \
                    HWY_SVE_V(BASE, BITS) & v0, HWY_SVE_V(BASE, BITS) & v1) { \
    const HWY_SVE_TUPLE(BASE, BITS, 2) tuple = \
        sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned); \
    v0 = svget2(tuple, 0); \
    v1 = svget2(tuple, 1); \
  }
HWY_SVE_FOREACH(HWY_SVE_LOAD2, LoadInterleaved2, ld2)

#undef HWY_SVE_LOAD2

// ------------------------------ LoadInterleaved3

// Three-vector variant of the macro above.
#define HWY_SVE_LOAD3(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API void NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, \
                    const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned, \
                    HWY_SVE_V(BASE, BITS) & v0, HWY_SVE_V(BASE, BITS) & v1, \
                    HWY_SVE_V(BASE, BITS) & v2) { \
    const HWY_SVE_TUPLE(BASE, BITS, 3) tuple = \
        sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned); \
    v0 = svget3(tuple, 0); \
    v1 = svget3(tuple, 1); \
    v2 = svget3(tuple, 2); \
  }
HWY_SVE_FOREACH(HWY_SVE_LOAD3, LoadInterleaved3, ld3)

#undef HWY_SVE_LOAD3

// ------------------------------ LoadInterleaved4

// Four-vector variant of the macro above.
#define HWY_SVE_LOAD4(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API void NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, \
                    const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned, \
                    HWY_SVE_V(BASE, BITS) & v0, HWY_SVE_V(BASE, BITS) & v1, \
                    HWY_SVE_V(BASE, BITS) & v2, HWY_SVE_V(BASE, BITS) & v3) { \
    const HWY_SVE_TUPLE(BASE, BITS, 4) tuple = \
        sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned); \
    v0 = svget4(tuple, 0); \
    v1 = svget4(tuple, 1); \
    v2 = svget4(tuple, 2); \
    v3 = svget4(tuple, 3); \
  }
HWY_SVE_FOREACH(HWY_SVE_LOAD4, LoadInterleaved4, ld4)

#undef HWY_SVE_LOAD4

// ------------------------------ StoreInterleaved2

// NAME(v0, v1, d, unaligned): packs the inputs into a tuple with Create2 and
// stores it with the st2 intrinsic.
#define HWY_SVE_STORE2(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API void NAME(HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1, \
                    HWY_SVE_D(BASE, BITS, N, kPow2) d, \
                    HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) { \
    sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned, Create2(d, v0, v1)); \
  }
HWY_SVE_FOREACH(HWY_SVE_STORE2, StoreInterleaved2, st2)

#undef HWY_SVE_STORE2

// ------------------------------ StoreInterleaved3

#define HWY_SVE_STORE3(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API void NAME(HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1, \
                    HWY_SVE_V(BASE, BITS) v2, \
                    HWY_SVE_D(BASE, BITS, N, kPow2) d, \
                    HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) { \
    sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned, \
                          Create3(d, v0, v1, v2)); \
  }
HWY_SVE_FOREACH(HWY_SVE_STORE3, StoreInterleaved3, st3)

#undef HWY_SVE_STORE3

// ------------------------------ StoreInterleaved4

#define HWY_SVE_STORE4(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API void NAME(HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1, \
                    HWY_SVE_V(BASE, BITS) v2, HWY_SVE_V(BASE, BITS) v3, \
                    HWY_SVE_D(BASE, BITS, N, kPow2) d, \
                    HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) { \
    sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned, \
                          Create4(d, v0, v1, v2, v3)); \
  }
HWY_SVE_FOREACH(HWY_SVE_STORE4, StoreInterleaved4, st4)

#undef HWY_SVE_STORE4

// ================================================== CONVERT

// ------------------------------ PromoteTo

// Same sign
// NAME(tag, v): `v` has lanes of HALF the destination width; the tag only
// selects the overload.
#define HWY_SVE_PROMOTE_TO(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API HWY_SVE_V(BASE, BITS) NAME( \
      HWY_SVE_D(BASE, BITS, N, kPow2) /* tag */, HWY_SVE_V(BASE, HALF) v) { \
    return sv##OP##_##CHAR##BITS(v); \
  }

HWY_SVE_FOREACH_UI16(HWY_SVE_PROMOTE_TO, PromoteTo, unpklo)
HWY_SVE_FOREACH_UI32(HWY_SVE_PROMOTE_TO, PromoteTo, unpklo)
HWY_SVE_FOREACH_UI64(HWY_SVE_PROMOTE_TO, PromoteTo, unpklo)

// 2x
// Two doubling steps (e.g. 8 -> 16 -> 32 bit); d2 is the intermediate tag.
template <size_t N, int kPow2>
HWY_API svuint32_t PromoteTo(Simd<uint32_t, N, kPow2> dto, svuint8_t vfrom) {
  const RepartitionToWide<DFromV<decltype(vfrom)>> d2;
  return PromoteTo(dto, PromoteTo(d2, vfrom));
}
template <size_t N, int kPow2>
HWY_API svint32_t PromoteTo(Simd<int32_t, N, kPow2> dto, svint8_t vfrom) {
  const RepartitionToWide<DFromV<decltype(vfrom)>> d2;
  return PromoteTo(dto, PromoteTo(d2, vfrom));
}
template <size_t N, int kPow2>
HWY_API svuint64_t PromoteTo(Simd<uint64_t, N, kPow2> dto, svuint16_t vfrom) {
  const RepartitionToWide<DFromV<decltype(vfrom)>> d2;
  return PromoteTo(dto, PromoteTo(d2, vfrom));
}
template <size_t N, int kPow2>
HWY_API svint64_t PromoteTo(Simd<int64_t, N, kPow2> dto, svint16_t vfrom) {
  const RepartitionToWide<DFromV<decltype(vfrom)>> d2;
  return PromoteTo(dto, PromoteTo(d2, vfrom));
}

// 3x
// Three doubling steps (8 -> 16 -> 32 -> 64 bit); the intermediate tags are
// derived by repeatedly narrowing the destination tag.
template <size_t N, int kPow2>
HWY_API svuint64_t PromoteTo(Simd<uint64_t, N, kPow2> dto, svuint8_t vfrom) {
  const RepartitionToNarrow<decltype(dto)> d4;
  const RepartitionToNarrow<decltype(d4)> d2;
  return PromoteTo(dto, PromoteTo(d4, PromoteTo(d2, vfrom)));
}
template <size_t N, int kPow2>
HWY_API svint64_t PromoteTo(Simd<int64_t, N, kPow2> dto, svint8_t vfrom) {
  const RepartitionToNarrow<decltype(dto)> d4;
  const RepartitionToNarrow<decltype(d4)> d2;
  return PromoteTo(dto, PromoteTo(d4, PromoteTo(d2, vfrom)));
}

// Sign change
// Unsigned source to wider signed destination: promote within the unsigned
// domain, then BitCast to the signed type.
// NOTE(review): HWY_IF_LANES_GT is used here to compare two sizeof() values.
template <class D, class V, HWY_IF_SIGNED_D(D), HWY_IF_UNSIGNED_V(V),
          HWY_IF_LANES_GT(sizeof(TFromD<D>), sizeof(TFromV<V>))>
HWY_API VFromD<D> PromoteTo(D di, V v) {
  const RebindToUnsigned<decltype(di)> du;
  return BitCast(di, PromoteTo(du, v));
}

// ------------------------------ PromoteTo F

// Unlike Highway's ZipLower, this returns the same type.
namespace detail {
HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, ZipLowerSame, zip1)
}  // namespace detail

template <size_t N, int kPow2>
HWY_API svfloat32_t PromoteTo(Simd<float32_t, N, kPow2> /* d */,
                              const svfloat16_t v) {
  // svcvt* expects inputs in even lanes, whereas Highway wants lower lanes, so
  // first replicate each lane once.
  const svfloat16_t vv = detail::ZipLowerSame(v, v);
  return svcvt_f32_f16_x(detail::PTrue(Simd<float16_t, N, kPow2>()), vv);
}

// Same lane-duplication approach as the f16 -> f32 overload above.
template <size_t N, int kPow2>
HWY_API svfloat64_t PromoteTo(Simd<float64_t, N, kPow2> /* d */,
                              const svfloat32_t v) {
  const svfloat32_t vv = detail::ZipLowerSame(v, v);
  return svcvt_f64_f32_x(detail::PTrue(Simd<float32_t, N, kPow2>()), vv);
}

template <size_t N, int kPow2>
HWY_API svfloat64_t PromoteTo(Simd<float64_t, N, kPow2> /* d */,
                              const svint32_t v) {
  const svint32_t vv = detail::ZipLowerSame(v, v);
  return svcvt_f64_s32_x(detail::PTrue(Simd<int32_t, N, kPow2>()), vv);
}

// For 16-bit Compress
namespace detail {
// Same generator as PromoteTo, but with unpkhi instead of unpklo.
HWY_SVE_FOREACH_UI32(HWY_SVE_PROMOTE_TO, PromoteUpperTo, unpkhi)
#undef HWY_SVE_PROMOTE_TO

// f16 input: BitCast to u16, apply the u32 PromoteUpperTo, and BitCast the
// result to the float type. Note this widens the raw bits; no svcvt
// float conversion is performed here.
template <size_t N, int kPow2>
HWY_API svfloat32_t PromoteUpperTo(Simd<float, N, kPow2> df, svfloat16_t v) {
  const RebindToUnsigned<decltype(df)> du;
  const RepartitionToNarrow<decltype(du)> dn;
  return BitCast(df, PromoteUpperTo(du, BitCast(dn, v)));
}

}  // namespace detail

// ------------------------------ DemoteTo U

namespace detail {

// Saturates unsigned vectors to half/quarter-width TN.
// Only an upper clamp (to LimitsMax<TN>()) is applied; the lane width of the
// vector is unchanged.
template <typename TN, class VU>
VU SaturateU(VU v) {
  return detail::MinN(v, static_cast<TFromV<VU>>(LimitsMax<TN>()));
}

// Saturates signed vectors to half/quarter-width TN.
// Clamps every lane to [LimitsMin<TN>(), LimitsMax<TN>()]; the lane width of
// the vector is unchanged.
template <typename TN, class VI>
VI SaturateI(VI v) {
  return detail::MinN(detail::MaxN(v, LimitsMin<TN>()), LimitsMax<TN>());
}

}  // namespace detail

// Common shape of the DemoteTo overloads below: first saturate to the range of
// the narrow type (a saturating-narrow intrinsic with HWY_SVE_HAVE_2,
// otherwise an explicit clamp), then apply svuzp1 once per halving of the lane
// width.
template <size_t N, int kPow2>
HWY_API svuint8_t DemoteTo(Simd<uint8_t, N, kPow2> dn, const svint16_t v) {
#if HWY_SVE_HAVE_2
  const svuint8_t vn = BitCast(dn, svqxtunb_s16(v));
#else
  const DFromV<decltype(v)> di;
  const RebindToUnsigned<decltype(di)> du;
  using TN = TFromD<decltype(dn)>;
  // First clamp negative numbers to zero and cast to unsigned.
  const svuint16_t clamped = BitCast(du, detail::MaxN(v, 0));
  // Saturate to unsigned-max and halve the width.
  const svuint8_t vn = BitCast(dn, detail::SaturateU<TN>(clamped));
#endif
  return svuzp1_u8(vn, vn);
}

template <size_t N, int kPow2>
HWY_API svuint16_t DemoteTo(Simd<uint16_t, N, kPow2> dn, const svint32_t v) {
#if HWY_SVE_HAVE_2
  const svuint16_t vn = BitCast(dn, svqxtunb_s32(v));
#else
  const DFromV<decltype(v)> di;
  const RebindToUnsigned<decltype(di)> du;
  using TN = TFromD<decltype(dn)>;
  // First clamp negative numbers to zero and cast to unsigned.
  const svuint32_t clamped = BitCast(du, detail::MaxN(v, 0));
  // Saturate to unsigned-max and halve the width.
  const svuint16_t vn = BitCast(dn, detail::SaturateU<TN>(clamped));
#endif
  return svuzp1_u16(vn, vn);
}

// i32 -> u8: quarter width, hence two svuzp1 steps (u16, then u8).
template <size_t N, int kPow2>
HWY_API svuint8_t DemoteTo(Simd<uint8_t, N, kPow2> dn, const svint32_t v) {
  const DFromV<decltype(v)> di;
  const RebindToUnsigned<decltype(di)> du;
  const RepartitionToNarrow<decltype(du)> d2;
#if HWY_SVE_HAVE_2
  const svuint16_t cast16 = BitCast(d2, svqxtnb_u16(svqxtunb_s32(v)));
#else
  using TN = TFromD<decltype(dn)>;
  // First clamp negative numbers to zero and cast to unsigned.
  const svuint32_t clamped = BitCast(du, detail::MaxN(v, 0));
  // Saturate to unsigned-max and quarter the width.
  const svuint16_t cast16 = BitCast(d2, detail::SaturateU<TN>(clamped));
#endif
  const svuint8_t x2 = BitCast(dn, svuzp1_u16(cast16, cast16));
  return svuzp1_u8(x2, x2);
}

// Narrows u32 lanes to u8 with two svuzp1 steps and no saturation; the
// DemoteTo(u8 <- u32) overload below clamps with SaturateU before calling
// this.
HWY_API svuint8_t U8FromU32(const svuint32_t v) {
  const DFromV<svuint32_t> du32;
  const RepartitionToNarrow<decltype(du32)> du16;
  const RepartitionToNarrow<decltype(du16)> du8;

  const svuint16_t cast16 = BitCast(du16, v);
  const svuint16_t x2 = svuzp1_u16(cast16, cast16);
  const svuint8_t cast8 = BitCast(du8, x2);
  return svuzp1_u8(cast8, cast8);
}

template <size_t N, int kPow2>
HWY_API svuint8_t DemoteTo(Simd<uint8_t, N, kPow2> dn, const svuint16_t v) {
#if HWY_SVE_HAVE_2
  const svuint8_t vn = BitCast(dn, svqxtnb_u16(v));
#else
  using TN = TFromD<decltype(dn)>;
  const svuint8_t vn = BitCast(dn, detail::SaturateU<TN>(v));
#endif
  return svuzp1_u8(vn, vn);
}

template <size_t N, int kPow2>
HWY_API svuint16_t DemoteTo(Simd<uint16_t, N, kPow2> dn, const svuint32_t v) {
#if HWY_SVE_HAVE_2
  const svuint16_t vn = BitCast(dn, svqxtnb_u32(v));
#else
  using TN = TFromD<decltype(dn)>;
  const svuint16_t vn = BitCast(dn, detail::SaturateU<TN>(v));
#endif
  return svuzp1_u16(vn, vn);
}

template <size_t N, int kPow2>
HWY_API svuint8_t DemoteTo(Simd<uint8_t, N, kPow2> dn, const svuint32_t v) {
  using TN = TFromD<decltype(dn)>;
  return U8FromU32(detail::SaturateU<TN>(v));
}

// ------------------------------ Truncations

// Each overload BitCasts to the destination lane type and then applies svuzp1
// once per halving of the lane width (three times for 64 -> 8, twice for
// 64 -> 16 and 32 -> 8, once otherwise). No saturation is performed.
template <size_t N, int kPow2>
HWY_API svuint8_t TruncateTo(Simd<uint8_t, N, kPow2> /* tag */,
                             const svuint64_t v) {
  const DFromV<svuint8_t> d;
  const svuint8_t v1 = BitCast(d, v);
  const svuint8_t v2 = svuzp1_u8(v1, v1);
  const svuint8_t v3 = svuzp1_u8(v2, v2);
  return svuzp1_u8(v3, v3);
}

template <size_t N, int kPow2>
HWY_API svuint16_t TruncateTo(Simd<uint16_t, N, kPow2> /* tag */,
                              const svuint64_t v) {
  const DFromV<svuint16_t> d;
  const svuint16_t v1 = BitCast(d, v);
  const svuint16_t v2 = svuzp1_u16(v1, v1);
  return svuzp1_u16(v2, v2);
}

template <size_t N, int kPow2>
HWY_API svuint32_t TruncateTo(Simd<uint32_t, N, kPow2> /* tag */,
                              const svuint64_t v) {
  const DFromV<svuint32_t> d;
  const svuint32_t v1 = BitCast(d, v);
  return svuzp1_u32(v1, v1);
}

template <size_t N, int kPow2>
HWY_API svuint8_t TruncateTo(Simd<uint8_t, N, kPow2> /* tag */,
                             const svuint32_t v) {
  const DFromV<svuint8_t> d;
  const svuint8_t v1 = BitCast(d, v);
  const svuint8_t v2 = svuzp1_u8(v1, v1);
  return svuzp1_u8(v2, v2);
}

template <size_t N, int kPow2>
HWY_API svuint16_t TruncateTo(Simd<uint16_t, N, kPow2> /* tag */,
                              const svuint32_t v) {
  const DFromV<svuint16_t> d;
  const svuint16_t v1 = BitCast(d, v);
  return svuzp1_u16(v1, v1);
}

template <size_t N, int kPow2>
HWY_API svuint8_t TruncateTo(Simd<uint8_t, N, kPow2> /* tag */,
                             const svuint16_t v) {
  const DFromV<svuint8_t> d;
  const svuint8_t v1 = BitCast(d, v);
  return svuzp1_u8(v1, v1);
}

// ------------------------------ DemoteTo I

// Signed counterparts of the unsigned DemoteTo overloads: SaturateI (or the
// svqxtnb_s* intrinsics with HWY_SVE_HAVE_2) followed by svuzp1.
template <size_t N, int kPow2>
HWY_API svint8_t DemoteTo(Simd<int8_t, N, kPow2> dn, const svint16_t v) {
#if HWY_SVE_HAVE_2
  const svint8_t vn = BitCast(dn, svqxtnb_s16(v));
#else
  using TN = TFromD<decltype(dn)>;
  const svint8_t vn = BitCast(dn, detail::SaturateI<TN>(v));
#endif
  return svuzp1_s8(vn, vn);
}

template <size_t N, int kPow2>
HWY_API svint16_t DemoteTo(Simd<int16_t, N, kPow2> dn, const svint32_t v) {
#if HWY_SVE_HAVE_2
  const svint16_t vn = BitCast(dn, svqxtnb_s32(v));
#else
  using TN = TFromD<decltype(dn)>;
  const svint16_t vn = BitCast(dn, detail::SaturateI<TN>(v));
#endif
  return svuzp1_s16(vn, vn);
}

template <size_t N, int kPow2>
HWY_API svint8_t DemoteTo(Simd<int8_t, N, kPow2> dn, const svint32_t v) {
  const RepartitionToWide<decltype(dn)> d2;
#if HWY_SVE_HAVE_2
  const svint16_t cast16 = BitCast(d2, svqxtnb_s16(svqxtnb_s32(v)));
#else
  using TN = TFromD<decltype(dn)>;
  const svint16_t cast16 = BitCast(d2, detail::SaturateI<TN>(v));
#endif
  const svint8_t v2 = BitCast(dn, svuzp1_s16(cast16, cast16));
  return BitCast(dn, svuzp1_s8(v2, v2));
}

// ------------------------------ I64/U64 DemoteTo

// The 64-bit sources saturate first, reinterpret the result as u64, and reuse
// the unsigned TruncateTo overloads for the narrowing step.
template <size_t N, int kPow2>
HWY_API svint32_t DemoteTo(Simd<int32_t, N, kPow2> dn, const svint64_t v) {
  const Rebind<uint64_t, decltype(dn)> du64;
  const RebindToUnsigned<decltype(dn)> dn_u;
#if HWY_SVE_HAVE_2
  const svuint64_t vn = BitCast(du64, svqxtnb_s64(v));
#else
  using TN = TFromD<decltype(dn)>;
  const svuint64_t vn = BitCast(du64, detail::SaturateI<TN>(v));
#endif
  return BitCast(dn, TruncateTo(dn_u, vn));
}

template <size_t N, int kPow2>
HWY_API svint16_t DemoteTo(Simd<int16_t, N, kPow2> dn, const svint64_t v) {
  const Rebind<uint64_t, decltype(dn)> du64;
  const RebindToUnsigned<decltype(dn)> dn_u;
#if HWY_SVE_HAVE_2
  const svuint64_t vn = BitCast(du64, svqxtnb_s32(svqxtnb_s64(v)));
#else
  using TN = TFromD<decltype(dn)>;
  const svuint64_t vn = BitCast(du64, detail::SaturateI<TN>(v));
#endif
  return BitCast(dn, TruncateTo(dn_u, vn));
}

// i64 -> i8 has no HWY_SVE_HAVE_2 branch; it always uses SaturateI.
template <size_t N, int kPow2>
HWY_API svint8_t DemoteTo(Simd<int8_t, N, kPow2> dn, const svint64_t v) {
  const Rebind<uint64_t, decltype(dn)> du64;
  const RebindToUnsigned<decltype(dn)> dn_u;
  using TN = TFromD<decltype(dn)>;
  const svuint64_t vn = BitCast(du64, detail::SaturateI<TN>(v));
  return BitCast(dn, TruncateTo(dn_u, vn));
}

template <size_t N, int kPow2>
HWY_API svuint32_t DemoteTo(Simd<uint32_t, N, kPow2> dn, const svint64_t v) {
  const Rebind<uint64_t, decltype(dn)> du64;
#if HWY_SVE_HAVE_2
  const svuint64_t vn = BitCast(du64, svqxtunb_s64(v));
#else
  using TN = TFromD<decltype(dn)>;
  // First clamp negative numbers to zero and cast to unsigned.
+ const svuint64_t clamped = BitCast(du64, detail::MaxN(v, 0)); + // Saturate to unsigned-max + const svuint64_t vn = detail::SaturateU<TN>(clamped); +#endif + return TruncateTo(dn, vn); +} + +template <size_t N, int kPow2> +HWY_API svuint16_t DemoteTo(Simd<uint16_t, N, kPow2> dn, const svint64_t v) { + const Rebind<uint64_t, decltype(dn)> du64; +#if HWY_SVE_HAVE_2 + const svuint64_t vn = BitCast(du64, svqxtnb_u32(svqxtunb_s64(v))); +#else + using TN = TFromD<decltype(dn)>; + // First clamp negative numbers to zero and cast to unsigned. + const svuint64_t clamped = BitCast(du64, detail::MaxN(v, 0)); + // Saturate to unsigned-max + const svuint64_t vn = detail::SaturateU<TN>(clamped); +#endif + return TruncateTo(dn, vn); +} + +template <size_t N, int kPow2> +HWY_API svuint8_t DemoteTo(Simd<uint8_t, N, kPow2> dn, const svint64_t v) { + const Rebind<uint64_t, decltype(dn)> du64; + using TN = TFromD<decltype(dn)>; + // First clamp negative numbers to zero and cast to unsigned. + const svuint64_t clamped = BitCast(du64, detail::MaxN(v, 0)); + // Saturate to unsigned-max + const svuint64_t vn = detail::SaturateU<TN>(clamped); + return TruncateTo(dn, vn); +} + +template <size_t N, int kPow2> +HWY_API svuint32_t DemoteTo(Simd<uint32_t, N, kPow2> dn, const svuint64_t v) { + const Rebind<uint64_t, decltype(dn)> du64; +#if HWY_SVE_HAVE_2 + const svuint64_t vn = BitCast(du64, svqxtnb_u64(v)); +#else + using TN = TFromD<decltype(dn)>; + const svuint64_t vn = BitCast(du64, detail::SaturateU<TN>(v)); +#endif + return TruncateTo(dn, vn); +} + +template <size_t N, int kPow2> +HWY_API svuint16_t DemoteTo(Simd<uint16_t, N, kPow2> dn, const svuint64_t v) { + const Rebind<uint64_t, decltype(dn)> du64; +#if HWY_SVE_HAVE_2 + const svuint64_t vn = BitCast(du64, svqxtnb_u32(svqxtnb_u64(v))); +#else + using TN = TFromD<decltype(dn)>; + const svuint64_t vn = BitCast(du64, detail::SaturateU<TN>(v)); +#endif + return TruncateTo(dn, vn); +} + +template <size_t N, int kPow2> +HWY_API svuint8_t 
DemoteTo(Simd<uint8_t, N, kPow2> dn, const svuint64_t v) { + const Rebind<uint64_t, decltype(dn)> du64; + using TN = TFromD<decltype(dn)>; + const svuint64_t vn = BitCast(du64, detail::SaturateU<TN>(v)); + return TruncateTo(dn, vn); +} + +// ------------------------------ ConcatEven/ConcatOdd + +// WARNING: the upper half of these needs fixing up (uzp1/uzp2 use the +// full vector length, not rounded down to a power of two as we require). +namespace detail { + +#define HWY_SVE_CONCAT_EVERY_SECOND(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_INLINE HWY_SVE_V(BASE, BITS) \ + NAME(HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo) { \ + return sv##OP##_##CHAR##BITS(lo, hi); \ + } +HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenFull, uzp1) +HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddFull, uzp2) +#if defined(__ARM_FEATURE_SVE_MATMUL_FP64) +HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenBlocks, uzp1q) +HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddBlocks, uzp2q) +#endif +#undef HWY_SVE_CONCAT_EVERY_SECOND + +// Used to slide up / shift whole register left; mask indicates which range +// to take from lo, and the rest is filled from hi starting at its lowest. 
+#define HWY_SVE_SPLICE(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_API HWY_SVE_V(BASE, BITS) NAME( \ + HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo, svbool_t mask) { \ + return sv##OP##_##CHAR##BITS(mask, lo, hi); \ + } +HWY_SVE_FOREACH(HWY_SVE_SPLICE, Splice, splice) +#undef HWY_SVE_SPLICE + +} // namespace detail + +template <class D> +HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) { +#if HWY_SVE_IS_POW2 + if (detail::IsFull(d)) return detail::ConcatOddFull(hi, lo); +#endif + const VFromD<D> hi_odd = detail::ConcatOddFull(hi, hi); + const VFromD<D> lo_odd = detail::ConcatOddFull(lo, lo); + return detail::Splice(hi_odd, lo_odd, FirstN(d, Lanes(d) / 2)); +} + +template <class D> +HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) { +#if HWY_SVE_IS_POW2 + if (detail::IsFull(d)) return detail::ConcatEvenFull(hi, lo); +#endif + const VFromD<D> hi_odd = detail::ConcatEvenFull(hi, hi); + const VFromD<D> lo_odd = detail::ConcatEvenFull(lo, lo); + return detail::Splice(hi_odd, lo_odd, FirstN(d, Lanes(d) / 2)); +} + +// ------------------------------ DemoteTo F + +template <size_t N, int kPow2> +HWY_API svfloat16_t DemoteTo(Simd<float16_t, N, kPow2> d, const svfloat32_t v) { + const svfloat16_t in_even = svcvt_f16_f32_x(detail::PTrue(d), v); + return detail::ConcatEvenFull(in_even, + in_even); // lower half +} + +template <size_t N, int kPow2> +HWY_API svuint16_t DemoteTo(Simd<bfloat16_t, N, kPow2> /* d */, svfloat32_t v) { + const svuint16_t in_even = BitCast(ScalableTag<uint16_t>(), v); + return detail::ConcatOddFull(in_even, in_even); // lower half +} + +template <size_t N, int kPow2> +HWY_API svfloat32_t DemoteTo(Simd<float32_t, N, kPow2> d, const svfloat64_t v) { + const svfloat32_t in_even = svcvt_f32_f64_x(detail::PTrue(d), v); + return detail::ConcatEvenFull(in_even, + in_even); // lower half +} + +template <size_t N, int kPow2> +HWY_API svint32_t DemoteTo(Simd<int32_t, N, kPow2> d, const svfloat64_t v) { + const svint32_t in_even = 
svcvt_s32_f64_x(detail::PTrue(d), v); + return detail::ConcatEvenFull(in_even, + in_even); // lower half +} + +// ------------------------------ ConvertTo F + +#define HWY_SVE_CONVERT(BASE, CHAR, BITS, HALF, NAME, OP) \ + /* signed integers */ \ + template <size_t N, int kPow2> \ + HWY_API HWY_SVE_V(BASE, BITS) \ + NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, HWY_SVE_V(int, BITS) v) { \ + return sv##OP##_##CHAR##BITS##_s##BITS##_x(HWY_SVE_PTRUE(BITS), v); \ + } \ + /* unsigned integers */ \ + template <size_t N, int kPow2> \ + HWY_API HWY_SVE_V(BASE, BITS) \ + NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, HWY_SVE_V(uint, BITS) v) { \ + return sv##OP##_##CHAR##BITS##_u##BITS##_x(HWY_SVE_PTRUE(BITS), v); \ + } \ + /* Truncates (rounds toward zero). */ \ + template <size_t N, int kPow2> \ + HWY_API HWY_SVE_V(int, BITS) \ + NAME(HWY_SVE_D(int, BITS, N, kPow2) /* d */, HWY_SVE_V(BASE, BITS) v) { \ + return sv##OP##_s##BITS##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v); \ + } + +// API only requires f32 but we provide f64 for use by Iota. +HWY_SVE_FOREACH_F(HWY_SVE_CONVERT, ConvertTo, cvt) +#undef HWY_SVE_CONVERT + +// ------------------------------ NearestInt (Round, ConvertTo) +template <class VF, class DI = RebindToSigned<DFromV<VF>>> +HWY_API VFromD<DI> NearestInt(VF v) { + // No single instruction, round then truncate. 
+ return ConvertTo(DI(), Round(v)); +} + +// ------------------------------ Iota (Add, ConvertTo) + +#define HWY_SVE_IOTA(BASE, CHAR, BITS, HALF, NAME, OP) \ + template <size_t N, int kPow2> \ + HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \ + HWY_SVE_T(BASE, BITS) first) { \ + return sv##OP##_##CHAR##BITS(first, 1); \ + } + +HWY_SVE_FOREACH_UI(HWY_SVE_IOTA, Iota, index) +#undef HWY_SVE_IOTA + +template <class D, HWY_IF_FLOAT_D(D)> +HWY_API VFromD<D> Iota(const D d, TFromD<D> first) { + const RebindToSigned<D> di; + return detail::AddN(ConvertTo(d, Iota(di, 0)), first); +} + +// ------------------------------ InterleaveLower + +template <class D, class V> +HWY_API V InterleaveLower(D d, const V a, const V b) { + static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch"); +#if HWY_TARGET == HWY_SVE2_128 + (void)d; + return detail::ZipLowerSame(a, b); +#else + // Move lower halves of blocks to lower half of vector. + const Repartition<uint64_t, decltype(d)> d64; + const auto a64 = BitCast(d64, a); + const auto b64 = BitCast(d64, b); + const auto a_blocks = detail::ConcatEvenFull(a64, a64); // lower half + const auto b_blocks = detail::ConcatEvenFull(b64, b64); + return detail::ZipLowerSame(BitCast(d, a_blocks), BitCast(d, b_blocks)); +#endif +} + +template <class V> +HWY_API V InterleaveLower(const V a, const V b) { + return InterleaveLower(DFromV<V>(), a, b); +} + +// ------------------------------ InterleaveUpper + +// Only use zip2 if vector are a powers of two, otherwise getting the actual +// "upper half" requires MaskUpperHalf. +#if HWY_TARGET == HWY_SVE2_128 +namespace detail { +// Unlike Highway's ZipUpper, this returns the same type. 
+HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, ZipUpperSame, zip2) +} // namespace detail +#endif + +// Full vector: guaranteed to have at least one block +template <class D, class V = VFromD<D>, + hwy::EnableIf<detail::IsFull(D())>* = nullptr> +HWY_API V InterleaveUpper(D d, const V a, const V b) { +#if HWY_TARGET == HWY_SVE2_128 + (void)d; + return detail::ZipUpperSame(a, b); +#else + // Move upper halves of blocks to lower half of vector. + const Repartition<uint64_t, decltype(d)> d64; + const auto a64 = BitCast(d64, a); + const auto b64 = BitCast(d64, b); + const auto a_blocks = detail::ConcatOddFull(a64, a64); // lower half + const auto b_blocks = detail::ConcatOddFull(b64, b64); + return detail::ZipLowerSame(BitCast(d, a_blocks), BitCast(d, b_blocks)); +#endif +} + +// Capped/fraction: need runtime check +template <class D, class V = VFromD<D>, + hwy::EnableIf<!detail::IsFull(D())>* = nullptr> +HWY_API V InterleaveUpper(D d, const V a, const V b) { + // Less than one block: treat as capped + if (Lanes(d) * sizeof(TFromD<D>) < 16) { + const Half<decltype(d)> d2; + return InterleaveLower(d, UpperHalf(d2, a), UpperHalf(d2, b)); + } + return InterleaveUpper(DFromV<V>(), a, b); +} + +// ================================================== COMBINE + +namespace detail { + +#if HWY_TARGET == HWY_SVE_256 || HWY_IDE +template <class D, HWY_IF_T_SIZE_D(D, 1)> +svbool_t MaskLowerHalf(D d) { + switch (Lanes(d)) { + case 32: + return svptrue_pat_b8(SV_VL16); + case 16: + return svptrue_pat_b8(SV_VL8); + case 8: + return svptrue_pat_b8(SV_VL4); + case 4: + return svptrue_pat_b8(SV_VL2); + default: + return svptrue_pat_b8(SV_VL1); + } +} +template <class D, HWY_IF_T_SIZE_D(D, 2)> +svbool_t MaskLowerHalf(D d) { + switch (Lanes(d)) { + case 16: + return svptrue_pat_b16(SV_VL8); + case 8: + return svptrue_pat_b16(SV_VL4); + case 4: + return svptrue_pat_b16(SV_VL2); + default: + return svptrue_pat_b16(SV_VL1); + } +} +template <class D, HWY_IF_T_SIZE_D(D, 4)> +svbool_t MaskLowerHalf(D d) { 
+ switch (Lanes(d)) { + case 8: + return svptrue_pat_b32(SV_VL4); + case 4: + return svptrue_pat_b32(SV_VL2); + default: + return svptrue_pat_b32(SV_VL1); + } +} +template <class D, HWY_IF_T_SIZE_D(D, 8)> +svbool_t MaskLowerHalf(D d) { + switch (Lanes(d)) { + case 4: + return svptrue_pat_b64(SV_VL2); + default: + return svptrue_pat_b64(SV_VL1); + } +} +#endif +#if HWY_TARGET == HWY_SVE2_128 || HWY_IDE +template <class D, HWY_IF_T_SIZE_D(D, 1)> +svbool_t MaskLowerHalf(D d) { + switch (Lanes(d)) { + case 16: + return svptrue_pat_b8(SV_VL8); + case 8: + return svptrue_pat_b8(SV_VL4); + case 4: + return svptrue_pat_b8(SV_VL2); + case 2: + case 1: + default: + return svptrue_pat_b8(SV_VL1); + } +} +template <class D, HWY_IF_T_SIZE_D(D, 2)> +svbool_t MaskLowerHalf(D d) { + switch (Lanes(d)) { + case 8: + return svptrue_pat_b16(SV_VL4); + case 4: + return svptrue_pat_b16(SV_VL2); + case 2: + case 1: + default: + return svptrue_pat_b16(SV_VL1); + } +} +template <class D, HWY_IF_T_SIZE_D(D, 4)> +svbool_t MaskLowerHalf(D d) { + return svptrue_pat_b32(Lanes(d) == 4 ? SV_VL2 : SV_VL1); +} +template <class D, HWY_IF_T_SIZE_D(D, 8)> +svbool_t MaskLowerHalf(D /*d*/) { + return svptrue_pat_b64(SV_VL1); +} +#endif // HWY_TARGET == HWY_SVE2_128 +#if HWY_TARGET != HWY_SVE_256 && HWY_TARGET != HWY_SVE2_128 +template <class D> +svbool_t MaskLowerHalf(D d) { + return FirstN(d, Lanes(d) / 2); +} +#endif + +template <class D> +svbool_t MaskUpperHalf(D d) { + // TODO(janwas): WHILEGE on SVE2 + if (HWY_SVE_IS_POW2 && IsFull(d)) { + return Not(MaskLowerHalf(d)); + } + + // For Splice to work as intended, make sure bits above Lanes(d) are zero. + return AndNot(MaskLowerHalf(d), detail::MakeMask(d)); +} + +// Right-shift vector pair by constexpr; can be used to slide down (=N) or up +// (=Lanes()-N). 
+#define HWY_SVE_EXT(BASE, CHAR, BITS, HALF, NAME, OP) \ + template <size_t kIndex> \ + HWY_API HWY_SVE_V(BASE, BITS) \ + NAME(HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo) { \ + return sv##OP##_##CHAR##BITS(lo, hi, kIndex); \ + } +HWY_SVE_FOREACH(HWY_SVE_EXT, Ext, ext) +#undef HWY_SVE_EXT + +} // namespace detail + +// ------------------------------ ConcatUpperLower +template <class D, class V> +HWY_API V ConcatUpperLower(const D d, const V hi, const V lo) { + return IfThenElse(detail::MaskLowerHalf(d), lo, hi); +} + +// ------------------------------ ConcatLowerLower +template <class D, class V> +HWY_API V ConcatLowerLower(const D d, const V hi, const V lo) { + if (detail::IsFull(d)) { +#if defined(__ARM_FEATURE_SVE_MATMUL_FP64) && HWY_TARGET == HWY_SVE_256 + return detail::ConcatEvenBlocks(hi, lo); +#endif +#if HWY_TARGET == HWY_SVE2_128 + const Repartition<uint64_t, D> du64; + const auto lo64 = BitCast(du64, lo); + return BitCast(d, InterleaveLower(du64, lo64, BitCast(du64, hi))); +#endif + } + return detail::Splice(hi, lo, detail::MaskLowerHalf(d)); +} + +// ------------------------------ ConcatLowerUpper +template <class D, class V> +HWY_API V ConcatLowerUpper(const D d, const V hi, const V lo) { +#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128 // constexpr Lanes + if (detail::IsFull(d)) { + return detail::Ext<Lanes(d) / 2>(hi, lo); + } +#endif + return detail::Splice(hi, lo, detail::MaskUpperHalf(d)); +} + +// ------------------------------ ConcatUpperUpper +template <class D, class V> +HWY_API V ConcatUpperUpper(const D d, const V hi, const V lo) { + if (detail::IsFull(d)) { +#if defined(__ARM_FEATURE_SVE_MATMUL_FP64) && HWY_TARGET == HWY_SVE_256 + return detail::ConcatOddBlocks(hi, lo); +#endif +#if HWY_TARGET == HWY_SVE2_128 + const Repartition<uint64_t, D> du64; + const auto lo64 = BitCast(du64, lo); + return BitCast(d, InterleaveUpper(du64, lo64, BitCast(du64, hi))); +#endif + } + const svbool_t mask_upper = detail::MaskUpperHalf(d); + 
const V lo_upper = detail::Splice(lo, lo, mask_upper); + return IfThenElse(mask_upper, hi, lo_upper); +} + +// ------------------------------ Combine +template <class D, class V2> +HWY_API VFromD<D> Combine(const D d, const V2 hi, const V2 lo) { + return ConcatLowerLower(d, hi, lo); +} + +// ------------------------------ ZeroExtendVector +template <class D, class V> +HWY_API V ZeroExtendVector(const D d, const V lo) { + return Combine(d, Zero(Half<D>()), lo); +} + +// ------------------------------ Lower/UpperHalf + +template <class D2, class V> +HWY_API V LowerHalf(D2 /* tag */, const V v) { + return v; +} + +template <class V> +HWY_API V LowerHalf(const V v) { + return v; +} + +template <class DH, class V> +HWY_API V UpperHalf(const DH dh, const V v) { + const Twice<decltype(dh)> d; + // Cast so that we support bfloat16_t. + const RebindToUnsigned<decltype(d)> du; + const VFromD<decltype(du)> vu = BitCast(du, v); +#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128 // constexpr Lanes + return BitCast(d, detail::Ext<Lanes(dh)>(vu, vu)); +#else + const MFromD<decltype(du)> mask = detail::MaskUpperHalf(du); + return BitCast(d, detail::Splice(vu, vu, mask)); +#endif +} + +// ================================================== REDUCE + +// These return T, whereas the Highway op returns a broadcasted vector. +namespace detail { +#define HWY_SVE_REDUCE_ADD(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_API HWY_SVE_T(BASE, BITS) NAME(svbool_t pg, HWY_SVE_V(BASE, BITS) v) { \ + /* The intrinsic returns [u]int64_t; truncate to T so we can broadcast. 
*/ \ + using T = HWY_SVE_T(BASE, BITS); \ + using TU = MakeUnsigned<T>; \ + constexpr uint64_t kMask = LimitsMax<TU>(); \ + return static_cast<T>(static_cast<TU>( \ + static_cast<uint64_t>(sv##OP##_##CHAR##BITS(pg, v)) & kMask)); \ + } + +#define HWY_SVE_REDUCE(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_API HWY_SVE_T(BASE, BITS) NAME(svbool_t pg, HWY_SVE_V(BASE, BITS) v) { \ + return sv##OP##_##CHAR##BITS(pg, v); \ + } + +HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE_ADD, SumOfLanesM, addv) +HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, SumOfLanesM, addv) + +HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE, MinOfLanesM, minv) +HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE, MaxOfLanesM, maxv) +// NaN if all are +HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MinOfLanesM, minnmv) +HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MaxOfLanesM, maxnmv) + +#undef HWY_SVE_REDUCE +#undef HWY_SVE_REDUCE_ADD +} // namespace detail + +template <class D, class V> +V SumOfLanes(D d, V v) { + return Set(d, detail::SumOfLanesM(detail::MakeMask(d), v)); +} + +template <class D, class V> +TFromV<V> ReduceSum(D d, V v) { + return detail::SumOfLanesM(detail::MakeMask(d), v); +} + +template <class D, class V> +V MinOfLanes(D d, V v) { + return Set(d, detail::MinOfLanesM(detail::MakeMask(d), v)); +} + +template <class D, class V> +V MaxOfLanes(D d, V v) { + return Set(d, detail::MaxOfLanesM(detail::MakeMask(d), v)); +} + +// ================================================== SWIZZLE + +// ------------------------------ GetLane + +namespace detail { +#define HWY_SVE_GET_LANE(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_INLINE HWY_SVE_T(BASE, BITS) \ + NAME(HWY_SVE_V(BASE, BITS) v, svbool_t mask) { \ + return sv##OP##_##CHAR##BITS(mask, v); \ + } + +HWY_SVE_FOREACH(HWY_SVE_GET_LANE, GetLaneM, lasta) +HWY_SVE_FOREACH(HWY_SVE_GET_LANE, ExtractLastMatchingLaneM, lastb) +#undef HWY_SVE_GET_LANE +} // namespace detail + +template <class V> +HWY_API TFromV<V> GetLane(V v) { + return detail::GetLaneM(v, detail::PFalse()); +} + +// ------------------------------ ExtractLane 
+template <class V> +HWY_API TFromV<V> ExtractLane(V v, size_t i) { + return detail::GetLaneM(v, FirstN(DFromV<V>(), i)); +} + +// ------------------------------ InsertLane (IfThenElse) +template <class V> +HWY_API V InsertLane(const V v, size_t i, TFromV<V> t) { + const DFromV<V> d; + const auto is_i = detail::EqN(Iota(d, 0), static_cast<TFromV<V>>(i)); + return IfThenElse(RebindMask(d, is_i), Set(d, t), v); +} + +// ------------------------------ DupEven + +namespace detail { +HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, InterleaveEven, trn1) +} // namespace detail + +template <class V> +HWY_API V DupEven(const V v) { + return detail::InterleaveEven(v, v); +} + +// ------------------------------ DupOdd + +namespace detail { +HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, InterleaveOdd, trn2) +} // namespace detail + +template <class V> +HWY_API V DupOdd(const V v) { + return detail::InterleaveOdd(v, v); +} + +// ------------------------------ OddEven + +#if HWY_SVE_HAVE_2 + +#define HWY_SVE_ODD_EVEN(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_API HWY_SVE_V(BASE, BITS) \ + NAME(HWY_SVE_V(BASE, BITS) odd, HWY_SVE_V(BASE, BITS) even) { \ + return sv##OP##_##CHAR##BITS(even, odd, /*xor=*/0); \ + } + +HWY_SVE_FOREACH_UI(HWY_SVE_ODD_EVEN, OddEven, eortb_n) +#undef HWY_SVE_ODD_EVEN + +template <class V, HWY_IF_FLOAT_V(V)> +HWY_API V OddEven(const V odd, const V even) { + const DFromV<V> d; + const RebindToUnsigned<decltype(d)> du; + return BitCast(d, OddEven(BitCast(du, odd), BitCast(du, even))); +} + +#else + +template <class V> +HWY_API V OddEven(const V odd, const V even) { + const auto odd_in_even = detail::Ext<1>(odd, odd); + return detail::InterleaveEven(even, odd_in_even); +} + +#endif // HWY_TARGET + +// ------------------------------ OddEvenBlocks +template <class V> +HWY_API V OddEvenBlocks(const V odd, const V even) { + const DFromV<V> d; +#if HWY_TARGET == HWY_SVE_256 + return ConcatUpperLower(d, odd, even); +#elif HWY_TARGET == HWY_SVE2_128 + (void)odd; + (void)d; + return even; 
+#else + const RebindToUnsigned<decltype(d)> du; + using TU = TFromD<decltype(du)>; + constexpr size_t kShift = CeilLog2(16 / sizeof(TU)); + const auto idx_block = ShiftRight<kShift>(Iota(du, 0)); + const auto lsb = detail::AndN(idx_block, static_cast<TU>(1)); + const svbool_t is_even = detail::EqN(lsb, static_cast<TU>(0)); + return IfThenElse(is_even, even, odd); +#endif +} + +// ------------------------------ TableLookupLanes + +template <class D, class VI> +HWY_API VFromD<RebindToUnsigned<D>> IndicesFromVec(D d, VI vec) { + using TI = TFromV<VI>; + static_assert(sizeof(TFromD<D>) == sizeof(TI), "Index/lane size mismatch"); + const RebindToUnsigned<D> du; + const auto indices = BitCast(du, vec); +#if HWY_IS_DEBUG_BUILD + using TU = MakeUnsigned<TI>; + const size_t twice_max_lanes = Lanes(d) * 2; + HWY_DASSERT(AllTrue( + du, + detail::Eq(indices, + detail::AndN(indices, static_cast<TU>(twice_max_lanes - 1))))); +#else + (void)d; +#endif + return indices; +} + +template <class D, typename TI> +HWY_API VFromD<RebindToUnsigned<D>> SetTableIndices(D d, const TI* idx) { + static_assert(sizeof(TFromD<D>) == sizeof(TI), "Index size must match lane"); + return IndicesFromVec(d, LoadU(Rebind<TI, D>(), idx)); +} + +#define HWY_SVE_TABLE(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_API HWY_SVE_V(BASE, BITS) \ + NAME(HWY_SVE_V(BASE, BITS) v, HWY_SVE_V(uint, BITS) idx) { \ + return sv##OP##_##CHAR##BITS(v, idx); \ + } + +HWY_SVE_FOREACH(HWY_SVE_TABLE, TableLookupLanes, tbl) +#undef HWY_SVE_TABLE + +#if HWY_SVE_HAVE_2 +namespace detail { +#define HWY_SVE_TABLE2(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_API HWY_SVE_V(BASE, BITS) \ + NAME(HWY_SVE_TUPLE(BASE, BITS, 2) tuple, HWY_SVE_V(uint, BITS) idx) { \ + return sv##OP##_##CHAR##BITS(tuple, idx); \ + } + +HWY_SVE_FOREACH(HWY_SVE_TABLE2, NativeTwoTableLookupLanes, tbl2) +#undef HWY_SVE_TABLE +} // namespace detail +#endif // HWY_SVE_HAVE_2 + +template <class D> +HWY_API VFromD<D> TwoTablesLookupLanes(D d, VFromD<D> a, VFromD<D> b, + 
VFromD<RebindToUnsigned<D>> idx) { + // SVE2 has an instruction for this, but it only works for full 2^n vectors. +#if HWY_SVE_HAVE_2 && HWY_SVE_IS_POW2 + if (detail::IsFull(d)) { + return detail::NativeTwoTableLookupLanes(Create2(d, a, b), idx); + } +#endif + const RebindToUnsigned<decltype(d)> du; + using TU = TFromD<decltype(du)>; + + const size_t num_of_lanes = Lanes(d); + const auto idx_mod = detail::AndN(idx, static_cast<TU>(num_of_lanes - 1)); + const auto sel_a_mask = Eq(idx, idx_mod); + + const auto a_lookup_result = TableLookupLanes(a, idx_mod); + const auto b_lookup_result = TableLookupLanes(b, idx_mod); + return IfThenElse(sel_a_mask, a_lookup_result, b_lookup_result); +} + +template <class V> +HWY_API V TwoTablesLookupLanes(V a, V b, + VFromD<RebindToUnsigned<DFromV<V>>> idx) { + const DFromV<decltype(a)> d; + return TwoTablesLookupLanes(d, a, b, idx); +} + +// ------------------------------ SwapAdjacentBlocks (TableLookupLanes) + +namespace detail { + +template <typename T, size_t N, int kPow2> +constexpr size_t LanesPerBlock(Simd<T, N, kPow2> d) { + // We might have a capped vector smaller than a block, so honor that. 
+ return HWY_MIN(16 / sizeof(T), MaxLanes(d)); +} + +} // namespace detail + +template <class V> +HWY_API V SwapAdjacentBlocks(const V v) { + const DFromV<V> d; +#if HWY_TARGET == HWY_SVE_256 + return ConcatLowerUpper(d, v, v); +#elif HWY_TARGET == HWY_SVE2_128 + (void)d; + return v; +#else + const RebindToUnsigned<decltype(d)> du; + constexpr auto kLanesPerBlock = + static_cast<TFromD<decltype(du)>>(detail::LanesPerBlock(d)); + const VFromD<decltype(du)> idx = detail::XorN(Iota(du, 0), kLanesPerBlock); + return TableLookupLanes(v, idx); +#endif +} + +// ------------------------------ Reverse + +namespace detail { + +#define HWY_SVE_REVERSE(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \ + return sv##OP##_##CHAR##BITS(v); \ + } + +HWY_SVE_FOREACH(HWY_SVE_REVERSE, ReverseFull, rev) +#undef HWY_SVE_REVERSE + +} // namespace detail + +template <class D, class V> +HWY_API V Reverse(D d, V v) { + using T = TFromD<D>; + const auto reversed = detail::ReverseFull(v); + if (HWY_SVE_IS_POW2 && detail::IsFull(d)) return reversed; + // Shift right to remove extra (non-pow2 and remainder) lanes. + // TODO(janwas): on SVE2, use WHILEGE. + // Avoids FirstN truncating to the return vector size. Must also avoid Not + // because that is limited to SV_POW2. + const ScalableTag<T> dfull; + const svbool_t all_true = detail::AllPTrue(dfull); + const size_t all_lanes = detail::AllHardwareLanes<T>(); + const size_t want_lanes = Lanes(d); + HWY_DASSERT(want_lanes <= all_lanes); + const svbool_t mask = + svnot_b_z(all_true, FirstN(dfull, all_lanes - want_lanes)); + return detail::Splice(reversed, reversed, mask); +} + +// ------------------------------ Reverse2 + +// Per-target flag to prevent generic_ops-inl.h defining 8-bit Reverse2/4/8. 
+#ifdef HWY_NATIVE_REVERSE2_8 +#undef HWY_NATIVE_REVERSE2_8 +#else +#define HWY_NATIVE_REVERSE2_8 +#endif + +template <class D, HWY_IF_T_SIZE_D(D, 1)> +HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) { + const RebindToUnsigned<decltype(d)> du; + const RepartitionToWide<decltype(du)> dw; + return BitCast(d, svrevb_u16_x(detail::PTrue(d), BitCast(dw, v))); +} + +template <class D, HWY_IF_T_SIZE_D(D, 2)> +HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) { + const RebindToUnsigned<decltype(d)> du; + const RepartitionToWide<decltype(du)> dw; + return BitCast(d, svrevh_u32_x(detail::PTrue(d), BitCast(dw, v))); +} + +template <class D, HWY_IF_T_SIZE_D(D, 4)> +HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) { + const RebindToUnsigned<decltype(d)> du; + const RepartitionToWide<decltype(du)> dw; + return BitCast(d, svrevw_u64_x(detail::PTrue(d), BitCast(dw, v))); +} + +template <class D, HWY_IF_T_SIZE_D(D, 8)> +HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) { // 3210 +#if HWY_TARGET == HWY_SVE2_128 + if (detail::IsFull(d)) { + return detail::Ext<1>(v, v); + } +#endif + (void)d; + const auto odd_in_even = detail::Ext<1>(v, v); // x321 + return detail::InterleaveEven(odd_in_even, v); // 2301 +} + +// ------------------------------ Reverse4 (TableLookupLanes) + +template <class D, HWY_IF_T_SIZE_D(D, 1)> +HWY_API VFromD<D> Reverse4(D d, const VFromD<D> v) { + const RebindToUnsigned<decltype(d)> du; + const RepartitionToWide<RepartitionToWide<decltype(du)>> du32; + return BitCast(d, svrevb_u32_x(detail::PTrue(d), BitCast(du32, v))); +} + +template <class D, HWY_IF_T_SIZE_D(D, 2)> +HWY_API VFromD<D> Reverse4(D d, const VFromD<D> v) { + const RebindToUnsigned<decltype(d)> du; + const RepartitionToWide<RepartitionToWide<decltype(du)>> du64; + return BitCast(d, svrevh_u64_x(detail::PTrue(d), BitCast(du64, v))); +} + +template <class D, HWY_IF_T_SIZE_D(D, 4)> +HWY_API VFromD<D> Reverse4(D d, const VFromD<D> v) { + if (HWY_TARGET == HWY_SVE2_128 && detail::IsFull(d)) { + 
return detail::ReverseFull(v); + } + // TODO(janwas): is this approach faster than Shuffle0123? + const RebindToUnsigned<decltype(d)> du; + const auto idx = detail::XorN(Iota(du, 0), 3); + return TableLookupLanes(v, idx); +} + +template <class D, HWY_IF_T_SIZE_D(D, 8)> +HWY_API VFromD<D> Reverse4(D d, const VFromD<D> v) { + if (HWY_TARGET == HWY_SVE_256 && detail::IsFull(d)) { + return detail::ReverseFull(v); + } + // TODO(janwas): is this approach faster than Shuffle0123? + const RebindToUnsigned<decltype(d)> du; + const auto idx = detail::XorN(Iota(du, 0), 3); + return TableLookupLanes(v, idx); +} + +// ------------------------------ Reverse8 (TableLookupLanes) + +template <class D, HWY_IF_T_SIZE_D(D, 1)> +HWY_API VFromD<D> Reverse8(D d, const VFromD<D> v) { + const Repartition<uint64_t, decltype(d)> du64; + return BitCast(d, svrevb_u64_x(detail::PTrue(d), BitCast(du64, v))); +} + +template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)> +HWY_API VFromD<D> Reverse8(D d, const VFromD<D> v) { + const RebindToUnsigned<decltype(d)> du; + const auto idx = detail::XorN(Iota(du, 0), 7); + return TableLookupLanes(v, idx); +} + +// ------------------------------- ReverseBits + +#ifdef HWY_NATIVE_REVERSE_BITS_UI8 +#undef HWY_NATIVE_REVERSE_BITS_UI8 +#else +#define HWY_NATIVE_REVERSE_BITS_UI8 +#endif + +#ifdef HWY_NATIVE_REVERSE_BITS_UI16_32_64 +#undef HWY_NATIVE_REVERSE_BITS_UI16_32_64 +#else +#define HWY_NATIVE_REVERSE_BITS_UI16_32_64 +#endif + +#define HWY_SVE_REVERSE_BITS(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \ + const DFromV<decltype(v)> d; \ + return sv##OP##_##CHAR##BITS##_x(detail::PTrue(d), v); \ + } + +HWY_SVE_FOREACH_UI(HWY_SVE_REVERSE_BITS, ReverseBits, rbit) +#undef HWY_SVE_REVERSE_BITS + +// ------------------------------ Compress (PromoteTo) + +template <typename T> +struct CompressIsPartition { +#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128 + // Optimization for 64-bit lanes (could also be 
applied to 32-bit, but that + // requires a larger table). + enum { value = (sizeof(T) == 8) }; +#else + enum { value = 0 }; +#endif // HWY_TARGET == HWY_SVE_256 +}; + +#define HWY_SVE_COMPRESS(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v, svbool_t mask) { \ + return sv##OP##_##CHAR##BITS(mask, v); \ + } + +#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128 +HWY_SVE_FOREACH_UI32(HWY_SVE_COMPRESS, Compress, compact) +HWY_SVE_FOREACH_F32(HWY_SVE_COMPRESS, Compress, compact) +#else +HWY_SVE_FOREACH_UIF3264(HWY_SVE_COMPRESS, Compress, compact) +#endif +#undef HWY_SVE_COMPRESS + +#if HWY_TARGET == HWY_SVE_256 || HWY_IDE +template <class V, HWY_IF_T_SIZE_V(V, 8)> +HWY_API V Compress(V v, svbool_t mask) { + const DFromV<V> d; + const RebindToUnsigned<decltype(d)> du64; + + // Convert mask into bitfield via horizontal sum (faster than ORV) of masked + // bits 1, 2, 4, 8. Pre-multiply by N so we can use it as an offset for + // SetTableIndices. + const svuint64_t bits = Shl(Set(du64, 1), Iota(du64, 2)); + const size_t offset = detail::SumOfLanesM(mask, bits); + + // See CompressIsPartition. + alignas(16) static constexpr uint64_t table[4 * 16] = { + // PrintCompress64x4Tables + 0, 1, 2, 3, 0, 1, 2, 3, 1, 0, 2, 3, 0, 1, 2, 3, 2, 0, 1, 3, 0, 2, + 1, 3, 1, 2, 0, 3, 0, 1, 2, 3, 3, 0, 1, 2, 0, 3, 1, 2, 1, 3, 0, 2, + 0, 1, 3, 2, 2, 3, 0, 1, 0, 2, 3, 1, 1, 2, 3, 0, 0, 1, 2, 3}; + return TableLookupLanes(v, SetTableIndices(d, table + offset)); +} + +#endif // HWY_TARGET == HWY_SVE_256 +#if HWY_TARGET == HWY_SVE2_128 || HWY_IDE +template <class V, HWY_IF_T_SIZE_V(V, 8)> +HWY_API V Compress(V v, svbool_t mask) { + // If mask == 10: swap via splice. A mask of 00 or 11 leaves v unchanged, 10 + // swaps upper/lower (the lower half is set to the upper half, and the + // remaining upper half is filled from the lower half of the second v), and + // 01 is invalid because it would ConcatLowerLower. 
zip1 and AndNot keep 10 + // unchanged and map everything else to 00. + const svbool_t maskLL = svzip1_b64(mask, mask); // broadcast lower lane + return detail::Splice(v, v, AndNot(maskLL, mask)); +} + +#endif // HWY_TARGET == HWY_SVE2_128 + +template <class V, HWY_IF_T_SIZE_V(V, 2)> +HWY_API V Compress(V v, svbool_t mask16) { + static_assert(!IsSame<V, svfloat16_t>(), "Must use overload"); + const DFromV<V> d16; + + // Promote vector and mask to 32-bit + const RepartitionToWide<decltype(d16)> dw; + const auto v32L = PromoteTo(dw, v); + const auto v32H = detail::PromoteUpperTo(dw, v); + const svbool_t mask32L = svunpklo_b(mask16); + const svbool_t mask32H = svunpkhi_b(mask16); + + const auto compressedL = Compress(v32L, mask32L); + const auto compressedH = Compress(v32H, mask32H); + + // Demote to 16-bit (already in range) - separately so we can splice + const V evenL = BitCast(d16, compressedL); + const V evenH = BitCast(d16, compressedH); + const V v16L = detail::ConcatEvenFull(evenL, evenL); // lower half + const V v16H = detail::ConcatEvenFull(evenH, evenH); + + // We need to combine two vectors of non-constexpr length, so the only option + // is Splice, which requires us to synthesize a mask. NOTE: this function uses + // full vectors (SV_ALL instead of SV_POW2), hence we need unmasked svcnt. + const size_t countL = detail::CountTrueFull(dw, mask32L); + const auto compressed_maskL = FirstN(d16, countL); + return detail::Splice(v16H, v16L, compressed_maskL); +} + +// Must treat float16_t as integers so we can ConcatEven. 
+HWY_API svfloat16_t Compress(svfloat16_t v, svbool_t mask16) { + const DFromV<decltype(v)> df; + const RebindToSigned<decltype(df)> di; + return BitCast(df, Compress(BitCast(di, v), mask16)); +} + +// ------------------------------ CompressNot + +// 2 or 4 bytes +template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 2) | (1 << 4))> +HWY_API V CompressNot(V v, const svbool_t mask) { + return Compress(v, Not(mask)); +} + +template <class V, HWY_IF_T_SIZE_V(V, 8)> +HWY_API V CompressNot(V v, svbool_t mask) { +#if HWY_TARGET == HWY_SVE2_128 || HWY_IDE + // If mask == 01: swap via splice. A mask of 00 or 11 leaves v unchanged, 10 + // swaps upper/lower (the lower half is set to the upper half, and the + // remaining upper half is filled from the lower half of the second v), and + // 01 is invalid because it would ConcatLowerLower. zip1 and AndNot map + // 01 to 10, and everything else to 00. + const svbool_t maskLL = svzip1_b64(mask, mask); // broadcast lower lane + return detail::Splice(v, v, AndNot(mask, maskLL)); +#endif +#if HWY_TARGET == HWY_SVE_256 || HWY_IDE + const DFromV<V> d; + const RebindToUnsigned<decltype(d)> du64; + + // Convert mask into bitfield via horizontal sum (faster than ORV) of masked + // bits 1, 2, 4, 8. Pre-multiply by N so we can use it as an offset for + // SetTableIndices. + const svuint64_t bits = Shl(Set(du64, 1), Iota(du64, 2)); + const size_t offset = detail::SumOfLanesM(mask, bits); + + // See CompressIsPartition. 
+ alignas(16) static constexpr uint64_t table[4 * 16] = { + // PrintCompressNot64x4Tables + 0, 1, 2, 3, 1, 2, 3, 0, 0, 2, 3, 1, 2, 3, 0, 1, 0, 1, 3, 2, 1, 3, + 0, 2, 0, 3, 1, 2, 3, 0, 1, 2, 0, 1, 2, 3, 1, 2, 0, 3, 0, 2, 1, 3, + 2, 0, 1, 3, 0, 1, 2, 3, 1, 0, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3}; + return TableLookupLanes(v, SetTableIndices(d, table + offset)); +#endif // HWY_TARGET == HWY_SVE_256 + + return Compress(v, Not(mask)); +} + +// ------------------------------ CompressBlocksNot +HWY_API svuint64_t CompressBlocksNot(svuint64_t v, svbool_t mask) { +#if HWY_TARGET == HWY_SVE2_128 + (void)mask; + return v; +#endif +#if HWY_TARGET == HWY_SVE_256 || HWY_IDE + uint64_t bits = 0; // predicate reg is 32-bit + CopyBytes<4>(&mask, &bits); // not same size - 64-bit more efficient + // Concatenate LSB for upper and lower blocks, pre-scale by 4 for table idx. + const size_t offset = ((bits & 1) ? 4u : 0u) + ((bits & 0x10000) ? 8u : 0u); + // See CompressIsPartition. Manually generated; flip halves if mask = [0, 1]. 
+ alignas(16) static constexpr uint64_t table[4 * 4] = {0, 1, 2, 3, 2, 3, 0, 1, + 0, 1, 2, 3, 0, 1, 2, 3}; + const ScalableTag<uint64_t> d; + return TableLookupLanes(v, SetTableIndices(d, table + offset)); +#endif + + return CompressNot(v, mask); +} + +// ------------------------------ CompressStore +template <class V, class D, HWY_IF_NOT_T_SIZE_D(D, 1)> +HWY_API size_t CompressStore(const V v, const svbool_t mask, const D d, + TFromD<D>* HWY_RESTRICT unaligned) { + StoreU(Compress(v, mask), d, unaligned); + return CountTrue(d, mask); +} + +// ------------------------------ CompressBlendedStore +template <class V, class D, HWY_IF_NOT_T_SIZE_D(D, 1)> +HWY_API size_t CompressBlendedStore(const V v, const svbool_t mask, const D d, + TFromD<D>* HWY_RESTRICT unaligned) { + const size_t count = CountTrue(d, mask); + const svbool_t store_mask = FirstN(d, count); + BlendedStore(Compress(v, mask), store_mask, d, unaligned); + return count; +} + +// ================================================== MASK (2) + +// ------------------------------ FindKnownLastTrue +template <class D> +HWY_API size_t FindKnownLastTrue(D d, svbool_t m) { + const RebindToUnsigned<decltype(d)> du; + return static_cast<size_t>(detail::ExtractLastMatchingLaneM( + Iota(du, 0), And(m, detail::MakeMask(d)))); +} + +// ------------------------------ FindLastTrue +template <class D> +HWY_API intptr_t FindLastTrue(D d, svbool_t m) { + return AllFalse(d, m) ? intptr_t{-1} + : static_cast<intptr_t>(FindKnownLastTrue(d, m)); +} + +// ================================================== BLOCKWISE + +// ------------------------------ CombineShiftRightBytes + +// Prevent accidentally using these for 128-bit vectors - should not be +// necessary. +#if HWY_TARGET != HWY_SVE2_128 +namespace detail { + +// For x86-compatible behaviour mandated by Highway API: TableLookupBytes +// offsets are implicitly relative to the start of their 128-bit block. 
+template <class D, class V> +HWY_INLINE V OffsetsOf128BitBlocks(const D d, const V iota0) { + using T = MakeUnsigned<TFromD<D>>; + return detail::AndNotN(static_cast<T>(LanesPerBlock(d) - 1), iota0); +} + +template <size_t kLanes, class D, HWY_IF_T_SIZE_D(D, 1)> +svbool_t FirstNPerBlock(D d) { + const RebindToUnsigned<decltype(d)> du; + constexpr size_t kLanesPerBlock = detail::LanesPerBlock(du); + const svuint8_t idx_mod = + svdupq_n_u8(0 % kLanesPerBlock, 1 % kLanesPerBlock, 2 % kLanesPerBlock, + 3 % kLanesPerBlock, 4 % kLanesPerBlock, 5 % kLanesPerBlock, + 6 % kLanesPerBlock, 7 % kLanesPerBlock, 8 % kLanesPerBlock, + 9 % kLanesPerBlock, 10 % kLanesPerBlock, 11 % kLanesPerBlock, + 12 % kLanesPerBlock, 13 % kLanesPerBlock, 14 % kLanesPerBlock, + 15 % kLanesPerBlock); + return detail::LtN(BitCast(du, idx_mod), kLanes); +} +template <size_t kLanes, class D, HWY_IF_T_SIZE_D(D, 2)> +svbool_t FirstNPerBlock(D d) { + const RebindToUnsigned<decltype(d)> du; + constexpr size_t kLanesPerBlock = detail::LanesPerBlock(du); + const svuint16_t idx_mod = + svdupq_n_u16(0 % kLanesPerBlock, 1 % kLanesPerBlock, 2 % kLanesPerBlock, + 3 % kLanesPerBlock, 4 % kLanesPerBlock, 5 % kLanesPerBlock, + 6 % kLanesPerBlock, 7 % kLanesPerBlock); + return detail::LtN(BitCast(du, idx_mod), kLanes); +} +template <size_t kLanes, class D, HWY_IF_T_SIZE_D(D, 4)> +svbool_t FirstNPerBlock(D d) { + const RebindToUnsigned<decltype(d)> du; + constexpr size_t kLanesPerBlock = detail::LanesPerBlock(du); + const svuint32_t idx_mod = + svdupq_n_u32(0 % kLanesPerBlock, 1 % kLanesPerBlock, 2 % kLanesPerBlock, + 3 % kLanesPerBlock); + return detail::LtN(BitCast(du, idx_mod), kLanes); +} +template <size_t kLanes, class D, HWY_IF_T_SIZE_D(D, 8)> +svbool_t FirstNPerBlock(D d) { + const RebindToUnsigned<decltype(d)> du; + constexpr size_t kLanesPerBlock = detail::LanesPerBlock(du); + const svuint64_t idx_mod = + svdupq_n_u64(0 % kLanesPerBlock, 1 % kLanesPerBlock); + return detail::LtN(BitCast(du, idx_mod), 
kLanes); +} + +} // namespace detail +#endif // HWY_TARGET != HWY_SVE2_128 + +template <size_t kBytes, class D, class V = VFromD<D>> +HWY_API V CombineShiftRightBytes(const D d, const V hi, const V lo) { + const Repartition<uint8_t, decltype(d)> d8; + const auto hi8 = BitCast(d8, hi); + const auto lo8 = BitCast(d8, lo); +#if HWY_TARGET == HWY_SVE2_128 + return BitCast(d, detail::Ext<kBytes>(hi8, lo8)); +#else + const auto hi_up = detail::Splice(hi8, hi8, FirstN(d8, 16 - kBytes)); + const auto lo_down = detail::Ext<kBytes>(lo8, lo8); + const svbool_t is_lo = detail::FirstNPerBlock<16 - kBytes>(d8); + return BitCast(d, IfThenElse(is_lo, lo_down, hi_up)); +#endif +} + +// ------------------------------ Shuffle2301 +template <class V> +HWY_API V Shuffle2301(const V v) { + const DFromV<V> d; + static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types"); + return Reverse2(d, v); +} + +// ------------------------------ Shuffle2103 +template <class V> +HWY_API V Shuffle2103(const V v) { + const DFromV<V> d; + const Repartition<uint8_t, decltype(d)> d8; + static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types"); + const svuint8_t v8 = BitCast(d8, v); + return BitCast(d, CombineShiftRightBytes<12>(d8, v8, v8)); +} + +// ------------------------------ Shuffle0321 +template <class V> +HWY_API V Shuffle0321(const V v) { + const DFromV<V> d; + const Repartition<uint8_t, decltype(d)> d8; + static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types"); + const svuint8_t v8 = BitCast(d8, v); + return BitCast(d, CombineShiftRightBytes<4>(d8, v8, v8)); +} + +// ------------------------------ Shuffle1032 +template <class V> +HWY_API V Shuffle1032(const V v) { + const DFromV<V> d; + const Repartition<uint8_t, decltype(d)> d8; + static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types"); + const svuint8_t v8 = BitCast(d8, v); + return BitCast(d, CombineShiftRightBytes<8>(d8, v8, v8)); +} + +// 
------------------------------ Shuffle01 +template <class V> +HWY_API V Shuffle01(const V v) { + const DFromV<V> d; + const Repartition<uint8_t, decltype(d)> d8; + static_assert(sizeof(TFromD<decltype(d)>) == 8, "Defined for 64-bit types"); + const svuint8_t v8 = BitCast(d8, v); + return BitCast(d, CombineShiftRightBytes<8>(d8, v8, v8)); +} + +// ------------------------------ Shuffle0123 +template <class V> +HWY_API V Shuffle0123(const V v) { + return Shuffle2301(Shuffle1032(v)); +} + +// ------------------------------ ReverseBlocks (Reverse, Shuffle01) +template <class D, class V = VFromD<D>> +HWY_API V ReverseBlocks(D d, V v) { +#if HWY_TARGET == HWY_SVE_256 + if (detail::IsFull(d)) { + return SwapAdjacentBlocks(v); + } else if (detail::IsFull(Twice<D>())) { + return v; + } +#elif HWY_TARGET == HWY_SVE2_128 + (void)d; + return v; +#endif + const Repartition<uint64_t, D> du64; + return BitCast(d, Shuffle01(Reverse(du64, BitCast(du64, v)))); +} + +// ------------------------------ TableLookupBytes + +template <class V, class VI> +HWY_API VI TableLookupBytes(const V v, const VI idx) { + const DFromV<VI> d; + const Repartition<uint8_t, decltype(d)> du8; +#if HWY_TARGET == HWY_SVE2_128 + return BitCast(d, TableLookupLanes(BitCast(du8, v), BitCast(du8, idx))); +#else + const auto offsets128 = detail::OffsetsOf128BitBlocks(du8, Iota(du8, 0)); + const auto idx8 = Add(BitCast(du8, idx), offsets128); + return BitCast(d, TableLookupLanes(BitCast(du8, v), idx8)); +#endif +} + +template <class V, class VI> +HWY_API VI TableLookupBytesOr0(const V v, const VI idx) { + const DFromV<VI> d; + // Mask size must match vector type, so cast everything to this type. 
+ const Repartition<int8_t, decltype(d)> di8; + + auto idx8 = BitCast(di8, idx); + const auto msb = detail::LtN(idx8, 0); + + const auto lookup = TableLookupBytes(BitCast(di8, v), idx8); + return BitCast(d, IfThenZeroElse(msb, lookup)); +} + +// ------------------------------ Broadcast + +#if HWY_TARGET == HWY_SVE2_128 +namespace detail { +#define HWY_SVE_BROADCAST(BASE, CHAR, BITS, HALF, NAME, OP) \ + template <int kLane> \ + HWY_INLINE HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \ + return sv##OP##_##CHAR##BITS(v, kLane); \ + } + +HWY_SVE_FOREACH(HWY_SVE_BROADCAST, BroadcastLane, dup_lane) +#undef HWY_SVE_BROADCAST +} // namespace detail +#endif + +template <int kLane, class V> +HWY_API V Broadcast(const V v) { + const DFromV<V> d; + const RebindToUnsigned<decltype(d)> du; + constexpr size_t kLanesPerBlock = detail::LanesPerBlock(du); + static_assert(0 <= kLane && kLane < kLanesPerBlock, "Invalid lane"); +#if HWY_TARGET == HWY_SVE2_128 + return detail::BroadcastLane<kLane>(v); +#else + auto idx = detail::OffsetsOf128BitBlocks(du, Iota(du, 0)); + if (kLane != 0) { + idx = detail::AddN(idx, kLane); + } + return TableLookupLanes(v, idx); +#endif +} + +// ------------------------------ ShiftLeftLanes + +template <size_t kLanes, class D, class V = VFromD<D>> +HWY_API V ShiftLeftLanes(D d, const V v) { + const auto zero = Zero(d); + const auto shifted = detail::Splice(v, zero, FirstN(d, kLanes)); +#if HWY_TARGET == HWY_SVE2_128 + return shifted; +#else + // Match x86 semantics by zeroing lower lanes in 128-bit blocks + return IfThenElse(detail::FirstNPerBlock<kLanes>(d), zero, shifted); +#endif +} + +template <size_t kLanes, class V> +HWY_API V ShiftLeftLanes(const V v) { + return ShiftLeftLanes<kLanes>(DFromV<V>(), v); +} + +// ------------------------------ ShiftRightLanes +template <size_t kLanes, class D, class V = VFromD<D>> +HWY_API V ShiftRightLanes(D d, V v) { + // For capped/fractional vectors, clear upper lanes so we shift in zeros. 
+ if (!detail::IsFull(d)) { + v = IfThenElseZero(detail::MakeMask(d), v); + } + +#if HWY_TARGET == HWY_SVE2_128 + return detail::Ext<kLanes>(Zero(d), v); +#else + const auto shifted = detail::Ext<kLanes>(v, v); + // Match x86 semantics by zeroing upper lanes in 128-bit blocks + constexpr size_t kLanesPerBlock = detail::LanesPerBlock(d); + const svbool_t mask = detail::FirstNPerBlock<kLanesPerBlock - kLanes>(d); + return IfThenElseZero(mask, shifted); +#endif +} + +// ------------------------------ ShiftLeftBytes + +template <int kBytes, class D, class V = VFromD<D>> +HWY_API V ShiftLeftBytes(const D d, const V v) { + const Repartition<uint8_t, decltype(d)> d8; + return BitCast(d, ShiftLeftLanes<kBytes>(BitCast(d8, v))); +} + +template <int kBytes, class V> +HWY_API V ShiftLeftBytes(const V v) { + return ShiftLeftBytes<kBytes>(DFromV<V>(), v); +} + +// ------------------------------ ShiftRightBytes +template <int kBytes, class D, class V = VFromD<D>> +HWY_API V ShiftRightBytes(const D d, const V v) { + const Repartition<uint8_t, decltype(d)> d8; + return BitCast(d, ShiftRightLanes<kBytes>(d8, BitCast(d8, v))); +} + +// ------------------------------ ZipLower + +template <class V, class DW = RepartitionToWide<DFromV<V>>> +HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) { + const RepartitionToNarrow<DW> dn; + static_assert(IsSame<TFromD<decltype(dn)>, TFromV<V>>(), "D/V mismatch"); + return BitCast(dw, InterleaveLower(dn, a, b)); +} +template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>> +HWY_API VFromD<DW> ZipLower(const V a, const V b) { + return BitCast(DW(), InterleaveLower(D(), a, b)); +} + +// ------------------------------ ZipUpper +template <class V, class DW = RepartitionToWide<DFromV<V>>> +HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) { + const RepartitionToNarrow<DW> dn; + static_assert(IsSame<TFromD<decltype(dn)>, TFromV<V>>(), "D/V mismatch"); + return BitCast(dw, InterleaveUpper(dn, a, b)); +} + +// 
================================================== Ops with dependencies + +// ------------------------------ PromoteTo bfloat16 (ZipLower) +template <size_t N, int kPow2> +HWY_API svfloat32_t PromoteTo(Simd<float32_t, N, kPow2> df32, + const svuint16_t v) { + return BitCast(df32, detail::ZipLowerSame(svdup_n_u16(0), v)); +} + +// ------------------------------ ReorderDemote2To (OddEven) + +template <size_t N, int kPow2> +HWY_API svuint16_t ReorderDemote2To(Simd<bfloat16_t, N, kPow2> dbf16, + svfloat32_t a, svfloat32_t b) { + const RebindToUnsigned<decltype(dbf16)> du16; + const Repartition<uint32_t, decltype(dbf16)> du32; + const svuint32_t b_in_even = ShiftRight<16>(BitCast(du32, b)); + return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even))); +} + +template <size_t N, int kPow2> +HWY_API svint16_t ReorderDemote2To(Simd<int16_t, N, kPow2> d16, svint32_t a, + svint32_t b) { +#if HWY_SVE_HAVE_2 + (void)d16; + const svint16_t a_in_even = svqxtnb_s32(a); + return svqxtnt_s32(a_in_even, b); +#else + const svint16_t a16 = BitCast(d16, detail::SaturateI<int16_t>(a)); + const svint16_t b16 = BitCast(d16, detail::SaturateI<int16_t>(b)); + return detail::InterleaveEven(a16, b16); +#endif +} + +template <size_t N, int kPow2> +HWY_API svuint16_t ReorderDemote2To(Simd<uint16_t, N, kPow2> d16, svint32_t a, + svint32_t b) { +#if HWY_SVE_HAVE_2 + (void)d16; + const svuint16_t a_in_even = svqxtunb_s32(a); + return svqxtunt_s32(a_in_even, b); +#else + const Repartition<uint32_t, decltype(d16)> du32; + const svuint32_t clamped_a = BitCast(du32, detail::MaxN(a, 0)); + const svuint32_t clamped_b = BitCast(du32, detail::MaxN(b, 0)); + const svuint16_t a16 = BitCast(d16, detail::SaturateU<uint16_t>(clamped_a)); + const svuint16_t b16 = BitCast(d16, detail::SaturateU<uint16_t>(clamped_b)); + return detail::InterleaveEven(a16, b16); +#endif +} + +template <size_t N, int kPow2> +HWY_API svuint16_t ReorderDemote2To(Simd<uint16_t, N, kPow2> d16, svuint32_t a, + svuint32_t 
b) { +#if HWY_SVE_HAVE_2 + (void)d16; + const svuint16_t a_in_even = svqxtnb_u32(a); + return svqxtnt_u32(a_in_even, b); +#else + const svuint16_t a16 = BitCast(d16, detail::SaturateU<uint16_t>(a)); + const svuint16_t b16 = BitCast(d16, detail::SaturateU<uint16_t>(b)); + return detail::InterleaveEven(a16, b16); +#endif +} + +template <size_t N, int kPow2> +HWY_API svint8_t ReorderDemote2To(Simd<int8_t, N, kPow2> d8, svint16_t a, + svint16_t b) { +#if HWY_SVE_HAVE_2 + (void)d8; + const svint8_t a_in_even = svqxtnb_s16(a); + return svqxtnt_s16(a_in_even, b); +#else + const svint8_t a8 = BitCast(d8, detail::SaturateI<int8_t>(a)); + const svint8_t b8 = BitCast(d8, detail::SaturateI<int8_t>(b)); + return detail::InterleaveEven(a8, b8); +#endif +} + +template <size_t N, int kPow2> +HWY_API svuint8_t ReorderDemote2To(Simd<uint8_t, N, kPow2> d8, svint16_t a, + svint16_t b) { +#if HWY_SVE_HAVE_2 + (void)d8; + const svuint8_t a_in_even = svqxtunb_s16(a); + return svqxtunt_s16(a_in_even, b); +#else + const Repartition<uint16_t, decltype(d8)> du16; + const svuint16_t clamped_a = BitCast(du16, detail::MaxN(a, 0)); + const svuint16_t clamped_b = BitCast(du16, detail::MaxN(b, 0)); + const svuint8_t a8 = BitCast(d8, detail::SaturateU<uint8_t>(clamped_a)); + const svuint8_t b8 = BitCast(d8, detail::SaturateU<uint8_t>(clamped_b)); + return detail::InterleaveEven(a8, b8); +#endif +} + +template <size_t N, int kPow2> +HWY_API svuint8_t ReorderDemote2To(Simd<uint8_t, N, kPow2> d8, svuint16_t a, + svuint16_t b) { +#if HWY_SVE_HAVE_2 + (void)d8; + const svuint8_t a_in_even = svqxtnb_u16(a); + return svqxtnt_u16(a_in_even, b); +#else + const svuint8_t a8 = BitCast(d8, detail::SaturateU<uint8_t>(a)); + const svuint8_t b8 = BitCast(d8, detail::SaturateU<uint8_t>(b)); + return detail::InterleaveEven(a8, b8); +#endif +} + +template <size_t N, int kPow2> +HWY_API svint32_t ReorderDemote2To(Simd<int32_t, N, kPow2> d32, svint64_t a, + svint64_t b) { +#if HWY_SVE_HAVE_2 + (void)d32; + const 
svint32_t a_in_even = svqxtnb_s64(a); + return svqxtnt_s64(a_in_even, b); +#else + const svint32_t a32 = BitCast(d32, detail::SaturateI<int32_t>(a)); + const svint32_t b32 = BitCast(d32, detail::SaturateI<int32_t>(b)); + return detail::InterleaveEven(a32, b32); +#endif +} + +template <size_t N, int kPow2> +HWY_API svuint32_t ReorderDemote2To(Simd<uint32_t, N, kPow2> d32, svint64_t a, + svint64_t b) { +#if HWY_SVE_HAVE_2 + (void)d32; + const svuint32_t a_in_even = svqxtunb_s64(a); + return svqxtunt_s64(a_in_even, b); +#else + const Repartition<uint64_t, decltype(d32)> du64; + const svuint64_t clamped_a = BitCast(du64, detail::MaxN(a, 0)); + const svuint64_t clamped_b = BitCast(du64, detail::MaxN(b, 0)); + const svuint32_t a32 = BitCast(d32, detail::SaturateU<uint32_t>(clamped_a)); + const svuint32_t b32 = BitCast(d32, detail::SaturateU<uint32_t>(clamped_b)); + return detail::InterleaveEven(a32, b32); +#endif +} + +template <size_t N, int kPow2> +HWY_API svuint32_t ReorderDemote2To(Simd<uint32_t, N, kPow2> d32, svuint64_t a, + svuint64_t b) { +#if HWY_SVE_HAVE_2 + (void)d32; + const svuint32_t a_in_even = svqxtnb_u64(a); + return svqxtnt_u64(a_in_even, b); +#else + const svuint32_t a32 = BitCast(d32, detail::SaturateU<uint32_t>(a)); + const svuint32_t b32 = BitCast(d32, detail::SaturateU<uint32_t>(b)); + return detail::InterleaveEven(a32, b32); +#endif +} + +template <class D, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>), + HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), + HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2)> +HWY_API VFromD<D> OrderedDemote2To(D dn, V a, V b) { + const Half<decltype(dn)> dnh; + const auto demoted_a = DemoteTo(dnh, a); + const auto demoted_b = DemoteTo(dnh, b); + return Combine(dn, demoted_b, demoted_a); +} + +template <class D, HWY_IF_BF16_D(D)> +HWY_API svuint16_t OrderedDemote2To(D dn, svfloat32_t a, svfloat32_t b) { + const Half<decltype(dn)> dnh; + const RebindToUnsigned<decltype(dn)> dn_u; + const RebindToUnsigned<decltype(dnh)> dnh_u; + const 
auto demoted_a = DemoteTo(dnh, a); + const auto demoted_b = DemoteTo(dnh, b); + return Combine(dn_u, BitCast(dnh_u, demoted_b), BitCast(dnh_u, demoted_a)); +} + +// ------------------------------ ZeroIfNegative (Lt, IfThenElse) +template <class V> +HWY_API V ZeroIfNegative(const V v) { + return IfThenZeroElse(detail::LtN(v, 0), v); +} + +// ------------------------------ BroadcastSignBit (ShiftRight) +template <class V> +HWY_API V BroadcastSignBit(const V v) { + return ShiftRight<sizeof(TFromV<V>) * 8 - 1>(v); +} + +// ------------------------------ IfNegativeThenElse (BroadcastSignBit) +template <class V> +HWY_API V IfNegativeThenElse(V v, V yes, V no) { + static_assert(IsSigned<TFromV<V>>(), "Only works for signed/float"); + const DFromV<V> d; + const RebindToSigned<decltype(d)> di; + + const svbool_t m = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v)))); + return IfThenElse(m, yes, no); +} + +// ------------------------------ AverageRound (ShiftRight) + +#if HWY_SVE_HAVE_2 +HWY_SVE_FOREACH_U08(HWY_SVE_RETV_ARGPVV, AverageRound, rhadd) +HWY_SVE_FOREACH_U16(HWY_SVE_RETV_ARGPVV, AverageRound, rhadd) +#else +template <class V> +V AverageRound(const V a, const V b) { + return ShiftRight<1>(detail::AddN(Add(a, b), 1)); +} +#endif // HWY_SVE_HAVE_2 + +// ------------------------------ LoadMaskBits (TestBit) + +// `p` points to at least 8 readable bytes, not all of which need be valid. +template <class D, HWY_IF_T_SIZE_D(D, 1)> +HWY_INLINE svbool_t LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) { + const RebindToUnsigned<D> du; + const svuint8_t iota = Iota(du, 0); + + // Load correct number of bytes (bits/8) with 7 zeros after each. + const svuint8_t bytes = BitCast(du, svld1ub_u64(detail::PTrue(d), bits)); + // Replicate bytes 8x such that each byte contains the bit that governs it. 
+ const svuint8_t rep8 = svtbl_u8(bytes, detail::AndNotN(7, iota)); + + const svuint8_t bit = + svdupq_n_u8(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128); + return TestBit(rep8, bit); +} + +template <class D, HWY_IF_T_SIZE_D(D, 2)> +HWY_INLINE svbool_t LoadMaskBits(D /* tag */, + const uint8_t* HWY_RESTRICT bits) { + const RebindToUnsigned<D> du; + const Repartition<uint8_t, D> du8; + + // There may be up to 128 bits; avoid reading past the end. + const svuint8_t bytes = svld1(FirstN(du8, (Lanes(du) + 7) / 8), bits); + + // Replicate bytes 16x such that each lane contains the bit that governs it. + const svuint8_t rep16 = svtbl_u8(bytes, ShiftRight<4>(Iota(du8, 0))); + + const svuint16_t bit = svdupq_n_u16(1, 2, 4, 8, 16, 32, 64, 128); + return TestBit(BitCast(du, rep16), bit); +} + +template <class D, HWY_IF_T_SIZE_D(D, 4)> +HWY_INLINE svbool_t LoadMaskBits(D /* tag */, + const uint8_t* HWY_RESTRICT bits) { + const RebindToUnsigned<D> du; + const Repartition<uint8_t, D> du8; + + // Upper bound = 2048 bits / 32 bit = 64 bits; at least 8 bytes are readable, + // so we can skip computing the actual length (Lanes(du)+7)/8. + const svuint8_t bytes = svld1(FirstN(du8, 8), bits); + + // Replicate bytes 32x such that each lane contains the bit that governs it. + const svuint8_t rep32 = svtbl_u8(bytes, ShiftRight<5>(Iota(du8, 0))); + + // 1, 2, 4, 8, 16, 32, 64, 128, 1, 2 .. + const svuint32_t bit = Shl(Set(du, 1), detail::AndN(Iota(du, 0), 7)); + + return TestBit(BitCast(du, rep32), bit); +} + +template <class D, HWY_IF_T_SIZE_D(D, 8)> +HWY_INLINE svbool_t LoadMaskBits(D /* tag */, + const uint8_t* HWY_RESTRICT bits) { + const RebindToUnsigned<D> du; + + // Max 2048 bits = 32 lanes = 32 input bits; replicate those into each lane. + // The "at least 8 byte" guarantee in quick_reference ensures this is safe. 
+ uint32_t mask_bits; + CopyBytes<4>(bits, &mask_bits); // copy from bytes + const auto vbits = Set(du, mask_bits); + + // 2 ^ {0,1, .., 31}, will not have more lanes than that. + const svuint64_t bit = Shl(Set(du, 1), Iota(du, 0)); + + return TestBit(vbits, bit); +} + +// ------------------------------ StoreMaskBits + +namespace detail { + +// For each mask lane (governing lane type T), store 1 or 0 in BYTE lanes. +template <class T, HWY_IF_T_SIZE(T, 1)> +HWY_INLINE svuint8_t BoolFromMask(svbool_t m) { + return svdup_n_u8_z(m, 1); +} +template <class T, HWY_IF_T_SIZE(T, 2)> +HWY_INLINE svuint8_t BoolFromMask(svbool_t m) { + const ScalableTag<uint8_t> d8; + const svuint8_t b16 = BitCast(d8, svdup_n_u16_z(m, 1)); + return detail::ConcatEvenFull(b16, b16); // lower half +} +template <class T, HWY_IF_T_SIZE(T, 4)> +HWY_INLINE svuint8_t BoolFromMask(svbool_t m) { + return U8FromU32(svdup_n_u32_z(m, 1)); +} +template <class T, HWY_IF_T_SIZE(T, 8)> +HWY_INLINE svuint8_t BoolFromMask(svbool_t m) { + const ScalableTag<uint32_t> d32; + const svuint32_t b64 = BitCast(d32, svdup_n_u64_z(m, 1)); + return U8FromU32(detail::ConcatEvenFull(b64, b64)); // lower half +} + +// Compacts groups of 8 u8 into 8 contiguous bits in a 64-bit lane. +HWY_INLINE svuint64_t BitsFromBool(svuint8_t x) { + const ScalableTag<uint8_t> d8; + const ScalableTag<uint16_t> d16; + const ScalableTag<uint32_t> d32; + const ScalableTag<uint64_t> d64; + // TODO(janwas): could use SVE2 BDEP, but it's optional. + x = Or(x, BitCast(d8, ShiftRight<7>(BitCast(d16, x)))); + x = Or(x, BitCast(d8, ShiftRight<14>(BitCast(d32, x)))); + x = Or(x, BitCast(d8, ShiftRight<28>(BitCast(d64, x)))); + return BitCast(d64, x); +} + +} // namespace detail + +// `p` points to at least 8 writable bytes. 
+// TODO(janwas): specialize for HWY_SVE_256 +template <class D> +HWY_API size_t StoreMaskBits(D d, svbool_t m, uint8_t* bits) { + svuint64_t bits_in_u64 = + detail::BitsFromBool(detail::BoolFromMask<TFromD<D>>(m)); + + const size_t num_bits = Lanes(d); + const size_t num_bytes = (num_bits + 8 - 1) / 8; // Round up, see below + + // Truncate each u64 to 8 bits and store to u8. + svst1b_u64(FirstN(ScalableTag<uint64_t>(), num_bytes), bits, bits_in_u64); + + // Non-full byte, need to clear the undefined upper bits. Can happen for + // capped/fractional vectors or large T and small hardware vectors. + if (num_bits < 8) { + const int mask = static_cast<int>((1ull << num_bits) - 1); + bits[0] = static_cast<uint8_t>(bits[0] & mask); + } + // Else: we wrote full bytes because num_bits is a power of two >= 8. + + return num_bytes; +} + +// ------------------------------ CompressBits (LoadMaskBits) +template <class V, HWY_IF_NOT_T_SIZE_V(V, 1)> +HWY_INLINE V CompressBits(V v, const uint8_t* HWY_RESTRICT bits) { + return Compress(v, LoadMaskBits(DFromV<V>(), bits)); +} + +// ------------------------------ CompressBitsStore (LoadMaskBits) +template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)> +HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits, + D d, TFromD<D>* HWY_RESTRICT unaligned) { + return CompressStore(v, LoadMaskBits(d, bits), d, unaligned); +} + +// ------------------------------ Expand (StoreMaskBits) + +#ifdef HWY_NATIVE_EXPAND +#undef HWY_NATIVE_EXPAND +#else +#define HWY_NATIVE_EXPAND +#endif + +namespace detail { + +HWY_INLINE svuint8_t IndicesForExpandFromBits(uint64_t mask_bits) { + const CappedTag<uint8_t, 8> du8; + alignas(16) static constexpr uint8_t table[8 * 256] = { + // PrintExpand8x8Tables + 128, 128, 128, 128, 128, 128, 128, 128, // + 0, 128, 128, 128, 128, 128, 128, 128, // + 128, 0, 128, 128, 128, 128, 128, 128, // + 0, 1, 128, 128, 128, 128, 128, 128, // + 128, 128, 0, 128, 128, 128, 128, 128, // + 0, 128, 1, 128, 128, 128, 
128, 128, // + 128, 0, 1, 128, 128, 128, 128, 128, // + 0, 1, 2, 128, 128, 128, 128, 128, // + 128, 128, 128, 0, 128, 128, 128, 128, // + 0, 128, 128, 1, 128, 128, 128, 128, // + 128, 0, 128, 1, 128, 128, 128, 128, // + 0, 1, 128, 2, 128, 128, 128, 128, // + 128, 128, 0, 1, 128, 128, 128, 128, // + 0, 128, 1, 2, 128, 128, 128, 128, // + 128, 0, 1, 2, 128, 128, 128, 128, // + 0, 1, 2, 3, 128, 128, 128, 128, // + 128, 128, 128, 128, 0, 128, 128, 128, // + 0, 128, 128, 128, 1, 128, 128, 128, // + 128, 0, 128, 128, 1, 128, 128, 128, // + 0, 1, 128, 128, 2, 128, 128, 128, // + 128, 128, 0, 128, 1, 128, 128, 128, // + 0, 128, 1, 128, 2, 128, 128, 128, // + 128, 0, 1, 128, 2, 128, 128, 128, // + 0, 1, 2, 128, 3, 128, 128, 128, // + 128, 128, 128, 0, 1, 128, 128, 128, // + 0, 128, 128, 1, 2, 128, 128, 128, // + 128, 0, 128, 1, 2, 128, 128, 128, // + 0, 1, 128, 2, 3, 128, 128, 128, // + 128, 128, 0, 1, 2, 128, 128, 128, // + 0, 128, 1, 2, 3, 128, 128, 128, // + 128, 0, 1, 2, 3, 128, 128, 128, // + 0, 1, 2, 3, 4, 128, 128, 128, // + 128, 128, 128, 128, 128, 0, 128, 128, // + 0, 128, 128, 128, 128, 1, 128, 128, // + 128, 0, 128, 128, 128, 1, 128, 128, // + 0, 1, 128, 128, 128, 2, 128, 128, // + 128, 128, 0, 128, 128, 1, 128, 128, // + 0, 128, 1, 128, 128, 2, 128, 128, // + 128, 0, 1, 128, 128, 2, 128, 128, // + 0, 1, 2, 128, 128, 3, 128, 128, // + 128, 128, 128, 0, 128, 1, 128, 128, // + 0, 128, 128, 1, 128, 2, 128, 128, // + 128, 0, 128, 1, 128, 2, 128, 128, // + 0, 1, 128, 2, 128, 3, 128, 128, // + 128, 128, 0, 1, 128, 2, 128, 128, // + 0, 128, 1, 2, 128, 3, 128, 128, // + 128, 0, 1, 2, 128, 3, 128, 128, // + 0, 1, 2, 3, 128, 4, 128, 128, // + 128, 128, 128, 128, 0, 1, 128, 128, // + 0, 128, 128, 128, 1, 2, 128, 128, // + 128, 0, 128, 128, 1, 2, 128, 128, // + 0, 1, 128, 128, 2, 3, 128, 128, // + 128, 128, 0, 128, 1, 2, 128, 128, // + 0, 128, 1, 128, 2, 3, 128, 128, // + 128, 0, 1, 128, 2, 3, 128, 128, // + 0, 1, 2, 128, 3, 4, 128, 128, // + 128, 128, 128, 0, 1, 2, 128, 
128, // + 0, 128, 128, 1, 2, 3, 128, 128, // + 128, 0, 128, 1, 2, 3, 128, 128, // + 0, 1, 128, 2, 3, 4, 128, 128, // + 128, 128, 0, 1, 2, 3, 128, 128, // + 0, 128, 1, 2, 3, 4, 128, 128, // + 128, 0, 1, 2, 3, 4, 128, 128, // + 0, 1, 2, 3, 4, 5, 128, 128, // + 128, 128, 128, 128, 128, 128, 0, 128, // + 0, 128, 128, 128, 128, 128, 1, 128, // + 128, 0, 128, 128, 128, 128, 1, 128, // + 0, 1, 128, 128, 128, 128, 2, 128, // + 128, 128, 0, 128, 128, 128, 1, 128, // + 0, 128, 1, 128, 128, 128, 2, 128, // + 128, 0, 1, 128, 128, 128, 2, 128, // + 0, 1, 2, 128, 128, 128, 3, 128, // + 128, 128, 128, 0, 128, 128, 1, 128, // + 0, 128, 128, 1, 128, 128, 2, 128, // + 128, 0, 128, 1, 128, 128, 2, 128, // + 0, 1, 128, 2, 128, 128, 3, 128, // + 128, 128, 0, 1, 128, 128, 2, 128, // + 0, 128, 1, 2, 128, 128, 3, 128, // + 128, 0, 1, 2, 128, 128, 3, 128, // + 0, 1, 2, 3, 128, 128, 4, 128, // + 128, 128, 128, 128, 0, 128, 1, 128, // + 0, 128, 128, 128, 1, 128, 2, 128, // + 128, 0, 128, 128, 1, 128, 2, 128, // + 0, 1, 128, 128, 2, 128, 3, 128, // + 128, 128, 0, 128, 1, 128, 2, 128, // + 0, 128, 1, 128, 2, 128, 3, 128, // + 128, 0, 1, 128, 2, 128, 3, 128, // + 0, 1, 2, 128, 3, 128, 4, 128, // + 128, 128, 128, 0, 1, 128, 2, 128, // + 0, 128, 128, 1, 2, 128, 3, 128, // + 128, 0, 128, 1, 2, 128, 3, 128, // + 0, 1, 128, 2, 3, 128, 4, 128, // + 128, 128, 0, 1, 2, 128, 3, 128, // + 0, 128, 1, 2, 3, 128, 4, 128, // + 128, 0, 1, 2, 3, 128, 4, 128, // + 0, 1, 2, 3, 4, 128, 5, 128, // + 128, 128, 128, 128, 128, 0, 1, 128, // + 0, 128, 128, 128, 128, 1, 2, 128, // + 128, 0, 128, 128, 128, 1, 2, 128, // + 0, 1, 128, 128, 128, 2, 3, 128, // + 128, 128, 0, 128, 128, 1, 2, 128, // + 0, 128, 1, 128, 128, 2, 3, 128, // + 128, 0, 1, 128, 128, 2, 3, 128, // + 0, 1, 2, 128, 128, 3, 4, 128, // + 128, 128, 128, 0, 128, 1, 2, 128, // + 0, 128, 128, 1, 128, 2, 3, 128, // + 128, 0, 128, 1, 128, 2, 3, 128, // + 0, 1, 128, 2, 128, 3, 4, 128, // + 128, 128, 0, 1, 128, 2, 3, 128, // + 0, 128, 1, 2, 128, 3, 4, 128, // + 
128, 0, 1, 2, 128, 3, 4, 128, // + 0, 1, 2, 3, 128, 4, 5, 128, // + 128, 128, 128, 128, 0, 1, 2, 128, // + 0, 128, 128, 128, 1, 2, 3, 128, // + 128, 0, 128, 128, 1, 2, 3, 128, // + 0, 1, 128, 128, 2, 3, 4, 128, // + 128, 128, 0, 128, 1, 2, 3, 128, // + 0, 128, 1, 128, 2, 3, 4, 128, // + 128, 0, 1, 128, 2, 3, 4, 128, // + 0, 1, 2, 128, 3, 4, 5, 128, // + 128, 128, 128, 0, 1, 2, 3, 128, // + 0, 128, 128, 1, 2, 3, 4, 128, // + 128, 0, 128, 1, 2, 3, 4, 128, // + 0, 1, 128, 2, 3, 4, 5, 128, // + 128, 128, 0, 1, 2, 3, 4, 128, // + 0, 128, 1, 2, 3, 4, 5, 128, // + 128, 0, 1, 2, 3, 4, 5, 128, // + 0, 1, 2, 3, 4, 5, 6, 128, // + 128, 128, 128, 128, 128, 128, 128, 0, // + 0, 128, 128, 128, 128, 128, 128, 1, // + 128, 0, 128, 128, 128, 128, 128, 1, // + 0, 1, 128, 128, 128, 128, 128, 2, // + 128, 128, 0, 128, 128, 128, 128, 1, // + 0, 128, 1, 128, 128, 128, 128, 2, // + 128, 0, 1, 128, 128, 128, 128, 2, // + 0, 1, 2, 128, 128, 128, 128, 3, // + 128, 128, 128, 0, 128, 128, 128, 1, // + 0, 128, 128, 1, 128, 128, 128, 2, // + 128, 0, 128, 1, 128, 128, 128, 2, // + 0, 1, 128, 2, 128, 128, 128, 3, // + 128, 128, 0, 1, 128, 128, 128, 2, // + 0, 128, 1, 2, 128, 128, 128, 3, // + 128, 0, 1, 2, 128, 128, 128, 3, // + 0, 1, 2, 3, 128, 128, 128, 4, // + 128, 128, 128, 128, 0, 128, 128, 1, // + 0, 128, 128, 128, 1, 128, 128, 2, // + 128, 0, 128, 128, 1, 128, 128, 2, // + 0, 1, 128, 128, 2, 128, 128, 3, // + 128, 128, 0, 128, 1, 128, 128, 2, // + 0, 128, 1, 128, 2, 128, 128, 3, // + 128, 0, 1, 128, 2, 128, 128, 3, // + 0, 1, 2, 128, 3, 128, 128, 4, // + 128, 128, 128, 0, 1, 128, 128, 2, // + 0, 128, 128, 1, 2, 128, 128, 3, // + 128, 0, 128, 1, 2, 128, 128, 3, // + 0, 1, 128, 2, 3, 128, 128, 4, // + 128, 128, 0, 1, 2, 128, 128, 3, // + 0, 128, 1, 2, 3, 128, 128, 4, // + 128, 0, 1, 2, 3, 128, 128, 4, // + 0, 1, 2, 3, 4, 128, 128, 5, // + 128, 128, 128, 128, 128, 0, 128, 1, // + 0, 128, 128, 128, 128, 1, 128, 2, // + 128, 0, 128, 128, 128, 1, 128, 2, // + 0, 1, 128, 128, 128, 2, 128, 3, // + 
128, 128, 0, 128, 128, 1, 128, 2, // + 0, 128, 1, 128, 128, 2, 128, 3, // + 128, 0, 1, 128, 128, 2, 128, 3, // + 0, 1, 2, 128, 128, 3, 128, 4, // + 128, 128, 128, 0, 128, 1, 128, 2, // + 0, 128, 128, 1, 128, 2, 128, 3, // + 128, 0, 128, 1, 128, 2, 128, 3, // + 0, 1, 128, 2, 128, 3, 128, 4, // + 128, 128, 0, 1, 128, 2, 128, 3, // + 0, 128, 1, 2, 128, 3, 128, 4, // + 128, 0, 1, 2, 128, 3, 128, 4, // + 0, 1, 2, 3, 128, 4, 128, 5, // + 128, 128, 128, 128, 0, 1, 128, 2, // + 0, 128, 128, 128, 1, 2, 128, 3, // + 128, 0, 128, 128, 1, 2, 128, 3, // + 0, 1, 128, 128, 2, 3, 128, 4, // + 128, 128, 0, 128, 1, 2, 128, 3, // + 0, 128, 1, 128, 2, 3, 128, 4, // + 128, 0, 1, 128, 2, 3, 128, 4, // + 0, 1, 2, 128, 3, 4, 128, 5, // + 128, 128, 128, 0, 1, 2, 128, 3, // + 0, 128, 128, 1, 2, 3, 128, 4, // + 128, 0, 128, 1, 2, 3, 128, 4, // + 0, 1, 128, 2, 3, 4, 128, 5, // + 128, 128, 0, 1, 2, 3, 128, 4, // + 0, 128, 1, 2, 3, 4, 128, 5, // + 128, 0, 1, 2, 3, 4, 128, 5, // + 0, 1, 2, 3, 4, 5, 128, 6, // + 128, 128, 128, 128, 128, 128, 0, 1, // + 0, 128, 128, 128, 128, 128, 1, 2, // + 128, 0, 128, 128, 128, 128, 1, 2, // + 0, 1, 128, 128, 128, 128, 2, 3, // + 128, 128, 0, 128, 128, 128, 1, 2, // + 0, 128, 1, 128, 128, 128, 2, 3, // + 128, 0, 1, 128, 128, 128, 2, 3, // + 0, 1, 2, 128, 128, 128, 3, 4, // + 128, 128, 128, 0, 128, 128, 1, 2, // + 0, 128, 128, 1, 128, 128, 2, 3, // + 128, 0, 128, 1, 128, 128, 2, 3, // + 0, 1, 128, 2, 128, 128, 3, 4, // + 128, 128, 0, 1, 128, 128, 2, 3, // + 0, 128, 1, 2, 128, 128, 3, 4, // + 128, 0, 1, 2, 128, 128, 3, 4, // + 0, 1, 2, 3, 128, 128, 4, 5, // + 128, 128, 128, 128, 0, 128, 1, 2, // + 0, 128, 128, 128, 1, 128, 2, 3, // + 128, 0, 128, 128, 1, 128, 2, 3, // + 0, 1, 128, 128, 2, 128, 3, 4, // + 128, 128, 0, 128, 1, 128, 2, 3, // + 0, 128, 1, 128, 2, 128, 3, 4, // + 128, 0, 1, 128, 2, 128, 3, 4, // + 0, 1, 2, 128, 3, 128, 4, 5, // + 128, 128, 128, 0, 1, 128, 2, 3, // + 0, 128, 128, 1, 2, 128, 3, 4, // + 128, 0, 128, 1, 2, 128, 3, 4, // + 0, 1, 128, 2, 3, 
128, 4, 5, // + 128, 128, 0, 1, 2, 128, 3, 4, // + 0, 128, 1, 2, 3, 128, 4, 5, // + 128, 0, 1, 2, 3, 128, 4, 5, // + 0, 1, 2, 3, 4, 128, 5, 6, // + 128, 128, 128, 128, 128, 0, 1, 2, // + 0, 128, 128, 128, 128, 1, 2, 3, // + 128, 0, 128, 128, 128, 1, 2, 3, // + 0, 1, 128, 128, 128, 2, 3, 4, // + 128, 128, 0, 128, 128, 1, 2, 3, // + 0, 128, 1, 128, 128, 2, 3, 4, // + 128, 0, 1, 128, 128, 2, 3, 4, // + 0, 1, 2, 128, 128, 3, 4, 5, // + 128, 128, 128, 0, 128, 1, 2, 3, // + 0, 128, 128, 1, 128, 2, 3, 4, // + 128, 0, 128, 1, 128, 2, 3, 4, // + 0, 1, 128, 2, 128, 3, 4, 5, // + 128, 128, 0, 1, 128, 2, 3, 4, // + 0, 128, 1, 2, 128, 3, 4, 5, // + 128, 0, 1, 2, 128, 3, 4, 5, // + 0, 1, 2, 3, 128, 4, 5, 6, // + 128, 128, 128, 128, 0, 1, 2, 3, // + 0, 128, 128, 128, 1, 2, 3, 4, // + 128, 0, 128, 128, 1, 2, 3, 4, // + 0, 1, 128, 128, 2, 3, 4, 5, // + 128, 128, 0, 128, 1, 2, 3, 4, // + 0, 128, 1, 128, 2, 3, 4, 5, // + 128, 0, 1, 128, 2, 3, 4, 5, // + 0, 1, 2, 128, 3, 4, 5, 6, // + 128, 128, 128, 0, 1, 2, 3, 4, // + 0, 128, 128, 1, 2, 3, 4, 5, // + 128, 0, 128, 1, 2, 3, 4, 5, // + 0, 1, 128, 2, 3, 4, 5, 6, // + 128, 128, 0, 1, 2, 3, 4, 5, // + 0, 128, 1, 2, 3, 4, 5, 6, // + 128, 0, 1, 2, 3, 4, 5, 6, // + 0, 1, 2, 3, 4, 5, 6, 7}; + return Load(du8, table + mask_bits * 8); +} + +template <class D, HWY_IF_T_SIZE_D(D, 1)> +HWY_INLINE svuint8_t LaneIndicesFromByteIndices(D, svuint8_t idx) { + return idx; +} +template <class D, class DU = RebindToUnsigned<D>, HWY_IF_NOT_T_SIZE_D(D, 1)> +HWY_INLINE VFromD<DU> LaneIndicesFromByteIndices(D, svuint8_t idx) { + return PromoteTo(DU(), idx); +} + +// General case when we don't know the vector size, 8 elements at a time. +template <class V> +HWY_INLINE V ExpandLoop(V v, svbool_t mask) { + const DFromV<V> d; + uint8_t mask_bytes[256 / 8]; + StoreMaskBits(d, mask, mask_bytes); + + // ShiftLeftLanes is expensive, so we're probably better off storing to memory + // and loading the final result. 
+ alignas(16) TFromV<V> out[2 * MaxLanes(d)]; + + svbool_t next = svpfalse_b(); + size_t input_consumed = 0; + const V iota = Iota(d, 0); + for (size_t i = 0; i < Lanes(d); i += 8) { + uint64_t mask_bits = mask_bytes[i / 8]; + + // We want to skip past the v lanes already consumed. There is no + // instruction for variable-shift-reg, but we can splice. + const V vH = detail::Splice(v, v, next); + input_consumed += PopCount(mask_bits); + next = detail::GeN(iota, static_cast<TFromV<V>>(input_consumed)); + + const auto idx = detail::LaneIndicesFromByteIndices( + d, detail::IndicesForExpandFromBits(mask_bits)); + const V expand = TableLookupLanes(vH, idx); + StoreU(expand, d, out + i); + } + return LoadU(d, out); +} + +} // namespace detail + +template <class V, HWY_IF_T_SIZE_V(V, 1)> +HWY_API V Expand(V v, svbool_t mask) { +#if HWY_TARGET == HWY_SVE2_128 || HWY_IDE + const DFromV<V> d; + uint8_t mask_bytes[256 / 8]; + StoreMaskBits(d, mask, mask_bytes); + const uint64_t maskL = mask_bytes[0]; + const uint64_t maskH = mask_bytes[1]; + + // We want to skip past the v bytes already consumed by expandL. There is no + // instruction for shift-reg by variable bytes, but we can splice. Instead of + // GeN, Not(FirstN()) would also work. + using T = TFromV<V>; + const T countL = static_cast<T>(PopCount(maskL)); + const V vH = detail::Splice(v, v, detail::GeN(Iota(d, 0), countL)); + + const svuint8_t idxL = detail::IndicesForExpandFromBits(maskL); + const svuint8_t idxH = detail::IndicesForExpandFromBits(maskH); + return Combine(d, TableLookupLanes(vH, idxH), TableLookupLanes(v, idxL)); +#else + return detail::ExpandLoop(v, mask); +#endif +} + +template <class V, HWY_IF_T_SIZE_V(V, 2)> +HWY_API V Expand(V v, svbool_t mask) { +#if HWY_TARGET == HWY_SVE2_128 || HWY_IDE // 16x8 + const DFromV<V> d; + const RebindToUnsigned<decltype(d)> du16; + const Rebind<uint8_t, decltype(d)> du8; + // Convert mask into bitfield via horizontal sum (faster than ORV) of 8 bits. 
+ // Pre-multiply by N so we can use it as an offset for Load. + const svuint16_t bits = Shl(Set(du16, 1), Iota(du16, 3)); + const size_t offset = detail::SumOfLanesM(mask, bits); + + // Storing as 8-bit reduces table size from 4 KiB to 2 KiB. We cannot apply + // the nibble trick used below because not all indices fit within one lane. + alignas(16) static constexpr uint8_t table[8 * 256] = { + // PrintExpand16x8LaneTables + 255, 255, 255, 255, 255, 255, 255, 255, // + 0, 255, 255, 255, 255, 255, 255, 255, // + 255, 0, 255, 255, 255, 255, 255, 255, // + 0, 1, 255, 255, 255, 255, 255, 255, // + 255, 255, 0, 255, 255, 255, 255, 255, // + 0, 255, 1, 255, 255, 255, 255, 255, // + 255, 0, 1, 255, 255, 255, 255, 255, // + 0, 1, 2, 255, 255, 255, 255, 255, // + 255, 255, 255, 0, 255, 255, 255, 255, // + 0, 255, 255, 1, 255, 255, 255, 255, // + 255, 0, 255, 1, 255, 255, 255, 255, // + 0, 1, 255, 2, 255, 255, 255, 255, // + 255, 255, 0, 1, 255, 255, 255, 255, // + 0, 255, 1, 2, 255, 255, 255, 255, // + 255, 0, 1, 2, 255, 255, 255, 255, // + 0, 1, 2, 3, 255, 255, 255, 255, // + 255, 255, 255, 255, 0, 255, 255, 255, // + 0, 255, 255, 255, 1, 255, 255, 255, // + 255, 0, 255, 255, 1, 255, 255, 255, // + 0, 1, 255, 255, 2, 255, 255, 255, // + 255, 255, 0, 255, 1, 255, 255, 255, // + 0, 255, 1, 255, 2, 255, 255, 255, // + 255, 0, 1, 255, 2, 255, 255, 255, // + 0, 1, 2, 255, 3, 255, 255, 255, // + 255, 255, 255, 0, 1, 255, 255, 255, // + 0, 255, 255, 1, 2, 255, 255, 255, // + 255, 0, 255, 1, 2, 255, 255, 255, // + 0, 1, 255, 2, 3, 255, 255, 255, // + 255, 255, 0, 1, 2, 255, 255, 255, // + 0, 255, 1, 2, 3, 255, 255, 255, // + 255, 0, 1, 2, 3, 255, 255, 255, // + 0, 1, 2, 3, 4, 255, 255, 255, // + 255, 255, 255, 255, 255, 0, 255, 255, // + 0, 255, 255, 255, 255, 1, 255, 255, // + 255, 0, 255, 255, 255, 1, 255, 255, // + 0, 1, 255, 255, 255, 2, 255, 255, // + 255, 255, 0, 255, 255, 1, 255, 255, // + 0, 255, 1, 255, 255, 2, 255, 255, // + 255, 0, 1, 255, 255, 2, 255, 255, // + 0, 1, 
2, 255, 255, 3, 255, 255, // + 255, 255, 255, 0, 255, 1, 255, 255, // + 0, 255, 255, 1, 255, 2, 255, 255, // + 255, 0, 255, 1, 255, 2, 255, 255, // + 0, 1, 255, 2, 255, 3, 255, 255, // + 255, 255, 0, 1, 255, 2, 255, 255, // + 0, 255, 1, 2, 255, 3, 255, 255, // + 255, 0, 1, 2, 255, 3, 255, 255, // + 0, 1, 2, 3, 255, 4, 255, 255, // + 255, 255, 255, 255, 0, 1, 255, 255, // + 0, 255, 255, 255, 1, 2, 255, 255, // + 255, 0, 255, 255, 1, 2, 255, 255, // + 0, 1, 255, 255, 2, 3, 255, 255, // + 255, 255, 0, 255, 1, 2, 255, 255, // + 0, 255, 1, 255, 2, 3, 255, 255, // + 255, 0, 1, 255, 2, 3, 255, 255, // + 0, 1, 2, 255, 3, 4, 255, 255, // + 255, 255, 255, 0, 1, 2, 255, 255, // + 0, 255, 255, 1, 2, 3, 255, 255, // + 255, 0, 255, 1, 2, 3, 255, 255, // + 0, 1, 255, 2, 3, 4, 255, 255, // + 255, 255, 0, 1, 2, 3, 255, 255, // + 0, 255, 1, 2, 3, 4, 255, 255, // + 255, 0, 1, 2, 3, 4, 255, 255, // + 0, 1, 2, 3, 4, 5, 255, 255, // + 255, 255, 255, 255, 255, 255, 0, 255, // + 0, 255, 255, 255, 255, 255, 1, 255, // + 255, 0, 255, 255, 255, 255, 1, 255, // + 0, 1, 255, 255, 255, 255, 2, 255, // + 255, 255, 0, 255, 255, 255, 1, 255, // + 0, 255, 1, 255, 255, 255, 2, 255, // + 255, 0, 1, 255, 255, 255, 2, 255, // + 0, 1, 2, 255, 255, 255, 3, 255, // + 255, 255, 255, 0, 255, 255, 1, 255, // + 0, 255, 255, 1, 255, 255, 2, 255, // + 255, 0, 255, 1, 255, 255, 2, 255, // + 0, 1, 255, 2, 255, 255, 3, 255, // + 255, 255, 0, 1, 255, 255, 2, 255, // + 0, 255, 1, 2, 255, 255, 3, 255, // + 255, 0, 1, 2, 255, 255, 3, 255, // + 0, 1, 2, 3, 255, 255, 4, 255, // + 255, 255, 255, 255, 0, 255, 1, 255, // + 0, 255, 255, 255, 1, 255, 2, 255, // + 255, 0, 255, 255, 1, 255, 2, 255, // + 0, 1, 255, 255, 2, 255, 3, 255, // + 255, 255, 0, 255, 1, 255, 2, 255, // + 0, 255, 1, 255, 2, 255, 3, 255, // + 255, 0, 1, 255, 2, 255, 3, 255, // + 0, 1, 2, 255, 3, 255, 4, 255, // + 255, 255, 255, 0, 1, 255, 2, 255, // + 0, 255, 255, 1, 2, 255, 3, 255, // + 255, 0, 255, 1, 2, 255, 3, 255, // + 0, 1, 255, 2, 3, 255, 4, 255, 
// + 255, 255, 0, 1, 2, 255, 3, 255, // + 0, 255, 1, 2, 3, 255, 4, 255, // + 255, 0, 1, 2, 3, 255, 4, 255, // + 0, 1, 2, 3, 4, 255, 5, 255, // + 255, 255, 255, 255, 255, 0, 1, 255, // + 0, 255, 255, 255, 255, 1, 2, 255, // + 255, 0, 255, 255, 255, 1, 2, 255, // + 0, 1, 255, 255, 255, 2, 3, 255, // + 255, 255, 0, 255, 255, 1, 2, 255, // + 0, 255, 1, 255, 255, 2, 3, 255, // + 255, 0, 1, 255, 255, 2, 3, 255, // + 0, 1, 2, 255, 255, 3, 4, 255, // + 255, 255, 255, 0, 255, 1, 2, 255, // + 0, 255, 255, 1, 255, 2, 3, 255, // + 255, 0, 255, 1, 255, 2, 3, 255, // + 0, 1, 255, 2, 255, 3, 4, 255, // + 255, 255, 0, 1, 255, 2, 3, 255, // + 0, 255, 1, 2, 255, 3, 4, 255, // + 255, 0, 1, 2, 255, 3, 4, 255, // + 0, 1, 2, 3, 255, 4, 5, 255, // + 255, 255, 255, 255, 0, 1, 2, 255, // + 0, 255, 255, 255, 1, 2, 3, 255, // + 255, 0, 255, 255, 1, 2, 3, 255, // + 0, 1, 255, 255, 2, 3, 4, 255, // + 255, 255, 0, 255, 1, 2, 3, 255, // + 0, 255, 1, 255, 2, 3, 4, 255, // + 255, 0, 1, 255, 2, 3, 4, 255, // + 0, 1, 2, 255, 3, 4, 5, 255, // + 255, 255, 255, 0, 1, 2, 3, 255, // + 0, 255, 255, 1, 2, 3, 4, 255, // + 255, 0, 255, 1, 2, 3, 4, 255, // + 0, 1, 255, 2, 3, 4, 5, 255, // + 255, 255, 0, 1, 2, 3, 4, 255, // + 0, 255, 1, 2, 3, 4, 5, 255, // + 255, 0, 1, 2, 3, 4, 5, 255, // + 0, 1, 2, 3, 4, 5, 6, 255, // + 255, 255, 255, 255, 255, 255, 255, 0, // + 0, 255, 255, 255, 255, 255, 255, 1, // + 255, 0, 255, 255, 255, 255, 255, 1, // + 0, 1, 255, 255, 255, 255, 255, 2, // + 255, 255, 0, 255, 255, 255, 255, 1, // + 0, 255, 1, 255, 255, 255, 255, 2, // + 255, 0, 1, 255, 255, 255, 255, 2, // + 0, 1, 2, 255, 255, 255, 255, 3, // + 255, 255, 255, 0, 255, 255, 255, 1, // + 0, 255, 255, 1, 255, 255, 255, 2, // + 255, 0, 255, 1, 255, 255, 255, 2, // + 0, 1, 255, 2, 255, 255, 255, 3, // + 255, 255, 0, 1, 255, 255, 255, 2, // + 0, 255, 1, 2, 255, 255, 255, 3, // + 255, 0, 1, 2, 255, 255, 255, 3, // + 0, 1, 2, 3, 255, 255, 255, 4, // + 255, 255, 255, 255, 0, 255, 255, 1, // + 0, 255, 255, 255, 1, 255, 255, 2, // 
+ 255, 0, 255, 255, 1, 255, 255, 2, // + 0, 1, 255, 255, 2, 255, 255, 3, // + 255, 255, 0, 255, 1, 255, 255, 2, // + 0, 255, 1, 255, 2, 255, 255, 3, // + 255, 0, 1, 255, 2, 255, 255, 3, // + 0, 1, 2, 255, 3, 255, 255, 4, // + 255, 255, 255, 0, 1, 255, 255, 2, // + 0, 255, 255, 1, 2, 255, 255, 3, // + 255, 0, 255, 1, 2, 255, 255, 3, // + 0, 1, 255, 2, 3, 255, 255, 4, // + 255, 255, 0, 1, 2, 255, 255, 3, // + 0, 255, 1, 2, 3, 255, 255, 4, // + 255, 0, 1, 2, 3, 255, 255, 4, // + 0, 1, 2, 3, 4, 255, 255, 5, // + 255, 255, 255, 255, 255, 0, 255, 1, // + 0, 255, 255, 255, 255, 1, 255, 2, // + 255, 0, 255, 255, 255, 1, 255, 2, // + 0, 1, 255, 255, 255, 2, 255, 3, // + 255, 255, 0, 255, 255, 1, 255, 2, // + 0, 255, 1, 255, 255, 2, 255, 3, // + 255, 0, 1, 255, 255, 2, 255, 3, // + 0, 1, 2, 255, 255, 3, 255, 4, // + 255, 255, 255, 0, 255, 1, 255, 2, // + 0, 255, 255, 1, 255, 2, 255, 3, // + 255, 0, 255, 1, 255, 2, 255, 3, // + 0, 1, 255, 2, 255, 3, 255, 4, // + 255, 255, 0, 1, 255, 2, 255, 3, // + 0, 255, 1, 2, 255, 3, 255, 4, // + 255, 0, 1, 2, 255, 3, 255, 4, // + 0, 1, 2, 3, 255, 4, 255, 5, // + 255, 255, 255, 255, 0, 1, 255, 2, // + 0, 255, 255, 255, 1, 2, 255, 3, // + 255, 0, 255, 255, 1, 2, 255, 3, // + 0, 1, 255, 255, 2, 3, 255, 4, // + 255, 255, 0, 255, 1, 2, 255, 3, // + 0, 255, 1, 255, 2, 3, 255, 4, // + 255, 0, 1, 255, 2, 3, 255, 4, // + 0, 1, 2, 255, 3, 4, 255, 5, // + 255, 255, 255, 0, 1, 2, 255, 3, // + 0, 255, 255, 1, 2, 3, 255, 4, // + 255, 0, 255, 1, 2, 3, 255, 4, // + 0, 1, 255, 2, 3, 4, 255, 5, // + 255, 255, 0, 1, 2, 3, 255, 4, // + 0, 255, 1, 2, 3, 4, 255, 5, // + 255, 0, 1, 2, 3, 4, 255, 5, // + 0, 1, 2, 3, 4, 5, 255, 6, // + 255, 255, 255, 255, 255, 255, 0, 1, // + 0, 255, 255, 255, 255, 255, 1, 2, // + 255, 0, 255, 255, 255, 255, 1, 2, // + 0, 1, 255, 255, 255, 255, 2, 3, // + 255, 255, 0, 255, 255, 255, 1, 2, // + 0, 255, 1, 255, 255, 255, 2, 3, // + 255, 0, 1, 255, 255, 255, 2, 3, // + 0, 1, 2, 255, 255, 255, 3, 4, // + 255, 255, 255, 0, 255, 255, 
1, 2, // + 0, 255, 255, 1, 255, 255, 2, 3, // + 255, 0, 255, 1, 255, 255, 2, 3, // + 0, 1, 255, 2, 255, 255, 3, 4, // + 255, 255, 0, 1, 255, 255, 2, 3, // + 0, 255, 1, 2, 255, 255, 3, 4, // + 255, 0, 1, 2, 255, 255, 3, 4, // + 0, 1, 2, 3, 255, 255, 4, 5, // + 255, 255, 255, 255, 0, 255, 1, 2, // + 0, 255, 255, 255, 1, 255, 2, 3, // + 255, 0, 255, 255, 1, 255, 2, 3, // + 0, 1, 255, 255, 2, 255, 3, 4, // + 255, 255, 0, 255, 1, 255, 2, 3, // + 0, 255, 1, 255, 2, 255, 3, 4, // + 255, 0, 1, 255, 2, 255, 3, 4, // + 0, 1, 2, 255, 3, 255, 4, 5, // + 255, 255, 255, 0, 1, 255, 2, 3, // + 0, 255, 255, 1, 2, 255, 3, 4, // + 255, 0, 255, 1, 2, 255, 3, 4, // + 0, 1, 255, 2, 3, 255, 4, 5, // + 255, 255, 0, 1, 2, 255, 3, 4, // + 0, 255, 1, 2, 3, 255, 4, 5, // + 255, 0, 1, 2, 3, 255, 4, 5, // + 0, 1, 2, 3, 4, 255, 5, 6, // + 255, 255, 255, 255, 255, 0, 1, 2, // + 0, 255, 255, 255, 255, 1, 2, 3, // + 255, 0, 255, 255, 255, 1, 2, 3, // + 0, 1, 255, 255, 255, 2, 3, 4, // + 255, 255, 0, 255, 255, 1, 2, 3, // + 0, 255, 1, 255, 255, 2, 3, 4, // + 255, 0, 1, 255, 255, 2, 3, 4, // + 0, 1, 2, 255, 255, 3, 4, 5, // + 255, 255, 255, 0, 255, 1, 2, 3, // + 0, 255, 255, 1, 255, 2, 3, 4, // + 255, 0, 255, 1, 255, 2, 3, 4, // + 0, 1, 255, 2, 255, 3, 4, 5, // + 255, 255, 0, 1, 255, 2, 3, 4, // + 0, 255, 1, 2, 255, 3, 4, 5, // + 255, 0, 1, 2, 255, 3, 4, 5, // + 0, 1, 2, 3, 255, 4, 5, 6, // + 255, 255, 255, 255, 0, 1, 2, 3, // + 0, 255, 255, 255, 1, 2, 3, 4, // + 255, 0, 255, 255, 1, 2, 3, 4, // + 0, 1, 255, 255, 2, 3, 4, 5, // + 255, 255, 0, 255, 1, 2, 3, 4, // + 0, 255, 1, 255, 2, 3, 4, 5, // + 255, 0, 1, 255, 2, 3, 4, 5, // + 0, 1, 2, 255, 3, 4, 5, 6, // + 255, 255, 255, 0, 1, 2, 3, 4, // + 0, 255, 255, 1, 2, 3, 4, 5, // + 255, 0, 255, 1, 2, 3, 4, 5, // + 0, 1, 255, 2, 3, 4, 5, 6, // + 255, 255, 0, 1, 2, 3, 4, 5, // + 0, 255, 1, 2, 3, 4, 5, 6, // + 255, 0, 1, 2, 3, 4, 5, 6, // + 0, 1, 2, 3, 4, 5, 6, 7}; + const svuint16_t indices = PromoteTo(du16, Load(du8, table + offset)); + return 
TableLookupLanes(v, indices); // already zeros mask=false lanes +#else + return detail::ExpandLoop(v, mask); +#endif +} + +template <class V, HWY_IF_T_SIZE_V(V, 4)> +HWY_API V Expand(V v, svbool_t mask) { +#if HWY_TARGET == HWY_SVE_256 || HWY_IDE // 32x8 + const DFromV<V> d; + const RebindToUnsigned<decltype(d)> du32; + // Convert mask into bitfield via horizontal sum (faster than ORV). + const svuint32_t bits = Shl(Set(du32, 1), Iota(du32, 0)); + const size_t code = detail::SumOfLanesM(mask, bits); + + alignas(16) constexpr uint32_t packed_array[256] = { + // PrintExpand32x8. + 0xffffffff, 0xfffffff0, 0xffffff0f, 0xffffff10, 0xfffff0ff, 0xfffff1f0, + 0xfffff10f, 0xfffff210, 0xffff0fff, 0xffff1ff0, 0xffff1f0f, 0xffff2f10, + 0xffff10ff, 0xffff21f0, 0xffff210f, 0xffff3210, 0xfff0ffff, 0xfff1fff0, + 0xfff1ff0f, 0xfff2ff10, 0xfff1f0ff, 0xfff2f1f0, 0xfff2f10f, 0xfff3f210, + 0xfff10fff, 0xfff21ff0, 0xfff21f0f, 0xfff32f10, 0xfff210ff, 0xfff321f0, + 0xfff3210f, 0xfff43210, 0xff0fffff, 0xff1ffff0, 0xff1fff0f, 0xff2fff10, + 0xff1ff0ff, 0xff2ff1f0, 0xff2ff10f, 0xff3ff210, 0xff1f0fff, 0xff2f1ff0, + 0xff2f1f0f, 0xff3f2f10, 0xff2f10ff, 0xff3f21f0, 0xff3f210f, 0xff4f3210, + 0xff10ffff, 0xff21fff0, 0xff21ff0f, 0xff32ff10, 0xff21f0ff, 0xff32f1f0, + 0xff32f10f, 0xff43f210, 0xff210fff, 0xff321ff0, 0xff321f0f, 0xff432f10, + 0xff3210ff, 0xff4321f0, 0xff43210f, 0xff543210, 0xf0ffffff, 0xf1fffff0, + 0xf1ffff0f, 0xf2ffff10, 0xf1fff0ff, 0xf2fff1f0, 0xf2fff10f, 0xf3fff210, + 0xf1ff0fff, 0xf2ff1ff0, 0xf2ff1f0f, 0xf3ff2f10, 0xf2ff10ff, 0xf3ff21f0, + 0xf3ff210f, 0xf4ff3210, 0xf1f0ffff, 0xf2f1fff0, 0xf2f1ff0f, 0xf3f2ff10, + 0xf2f1f0ff, 0xf3f2f1f0, 0xf3f2f10f, 0xf4f3f210, 0xf2f10fff, 0xf3f21ff0, + 0xf3f21f0f, 0xf4f32f10, 0xf3f210ff, 0xf4f321f0, 0xf4f3210f, 0xf5f43210, + 0xf10fffff, 0xf21ffff0, 0xf21fff0f, 0xf32fff10, 0xf21ff0ff, 0xf32ff1f0, + 0xf32ff10f, 0xf43ff210, 0xf21f0fff, 0xf32f1ff0, 0xf32f1f0f, 0xf43f2f10, + 0xf32f10ff, 0xf43f21f0, 0xf43f210f, 0xf54f3210, 0xf210ffff, 0xf321fff0, + 
0xf321ff0f, 0xf432ff10, 0xf321f0ff, 0xf432f1f0, 0xf432f10f, 0xf543f210, + 0xf3210fff, 0xf4321ff0, 0xf4321f0f, 0xf5432f10, 0xf43210ff, 0xf54321f0, + 0xf543210f, 0xf6543210, 0x0fffffff, 0x1ffffff0, 0x1fffff0f, 0x2fffff10, + 0x1ffff0ff, 0x2ffff1f0, 0x2ffff10f, 0x3ffff210, 0x1fff0fff, 0x2fff1ff0, + 0x2fff1f0f, 0x3fff2f10, 0x2fff10ff, 0x3fff21f0, 0x3fff210f, 0x4fff3210, + 0x1ff0ffff, 0x2ff1fff0, 0x2ff1ff0f, 0x3ff2ff10, 0x2ff1f0ff, 0x3ff2f1f0, + 0x3ff2f10f, 0x4ff3f210, 0x2ff10fff, 0x3ff21ff0, 0x3ff21f0f, 0x4ff32f10, + 0x3ff210ff, 0x4ff321f0, 0x4ff3210f, 0x5ff43210, 0x1f0fffff, 0x2f1ffff0, + 0x2f1fff0f, 0x3f2fff10, 0x2f1ff0ff, 0x3f2ff1f0, 0x3f2ff10f, 0x4f3ff210, + 0x2f1f0fff, 0x3f2f1ff0, 0x3f2f1f0f, 0x4f3f2f10, 0x3f2f10ff, 0x4f3f21f0, + 0x4f3f210f, 0x5f4f3210, 0x2f10ffff, 0x3f21fff0, 0x3f21ff0f, 0x4f32ff10, + 0x3f21f0ff, 0x4f32f1f0, 0x4f32f10f, 0x5f43f210, 0x3f210fff, 0x4f321ff0, + 0x4f321f0f, 0x5f432f10, 0x4f3210ff, 0x5f4321f0, 0x5f43210f, 0x6f543210, + 0x10ffffff, 0x21fffff0, 0x21ffff0f, 0x32ffff10, 0x21fff0ff, 0x32fff1f0, + 0x32fff10f, 0x43fff210, 0x21ff0fff, 0x32ff1ff0, 0x32ff1f0f, 0x43ff2f10, + 0x32ff10ff, 0x43ff21f0, 0x43ff210f, 0x54ff3210, 0x21f0ffff, 0x32f1fff0, + 0x32f1ff0f, 0x43f2ff10, 0x32f1f0ff, 0x43f2f1f0, 0x43f2f10f, 0x54f3f210, + 0x32f10fff, 0x43f21ff0, 0x43f21f0f, 0x54f32f10, 0x43f210ff, 0x54f321f0, + 0x54f3210f, 0x65f43210, 0x210fffff, 0x321ffff0, 0x321fff0f, 0x432fff10, + 0x321ff0ff, 0x432ff1f0, 0x432ff10f, 0x543ff210, 0x321f0fff, 0x432f1ff0, + 0x432f1f0f, 0x543f2f10, 0x432f10ff, 0x543f21f0, 0x543f210f, 0x654f3210, + 0x3210ffff, 0x4321fff0, 0x4321ff0f, 0x5432ff10, 0x4321f0ff, 0x5432f1f0, + 0x5432f10f, 0x6543f210, 0x43210fff, 0x54321ff0, 0x54321f0f, 0x65432f10, + 0x543210ff, 0x654321f0, 0x6543210f, 0x76543210}; + + // For lane i, shift the i-th 4-bit index down and mask with 0xF because + // svtbl zeros outputs if the index is out of bounds. 
+ const svuint32_t packed = Set(du32, packed_array[code]); + const svuint32_t indices = detail::AndN(Shr(packed, svindex_u32(0, 4)), 0xF); + return TableLookupLanes(v, indices); // already zeros mask=false lanes +#elif HWY_TARGET == HWY_SVE2_128 // 32x4 + const DFromV<V> d; + const RebindToUnsigned<decltype(d)> du32; + // Convert mask into bitfield via horizontal sum (faster than ORV). + const svuint32_t bits = Shl(Set(du32, 1), Iota(du32, 0)); + const size_t offset = detail::SumOfLanesM(mask, bits); + + alignas(16) constexpr uint32_t packed_array[16] = { + // PrintExpand64x4Nibble - same for 32x4. + 0x0000ffff, 0x0000fff0, 0x0000ff0f, 0x0000ff10, 0x0000f0ff, 0x0000f1f0, + 0x0000f10f, 0x0000f210, 0x00000fff, 0x00001ff0, 0x00001f0f, 0x00002f10, + 0x000010ff, 0x000021f0, 0x0000210f, 0x00003210}; + + // For lane i, shift the i-th 4-bit index down and mask with 0xF because + // svtbl zeros outputs if the index is out of bounds. + const svuint32_t packed = Set(du32, packed_array[offset]); + const svuint32_t indices = detail::AndN(Shr(packed, svindex_u32(0, 4)), 0xF); + return TableLookupLanes(v, indices); // already zeros mask=false lanes +#else + return detail::ExpandLoop(v, mask); +#endif +} + +template <class V, HWY_IF_T_SIZE_V(V, 8)> +HWY_API V Expand(V v, svbool_t mask) { +#if HWY_TARGET == HWY_SVE_256 || HWY_IDE // 64x4 + const DFromV<V> d; + const RebindToUnsigned<decltype(d)> du64; + + // Convert mask into bitfield via horizontal sum (faster than ORV) of masked + // bits 1, 2, 4, 8. Pre-multiply by N so we can use it as an offset for + // SetTableIndices. + const svuint64_t bits = Shl(Set(du64, 1), Iota(du64, 2)); + const size_t offset = detail::SumOfLanesM(mask, bits); + + alignas(16) static constexpr uint64_t table[4 * 16] = { + // PrintExpand64x4Tables - small enough to store uncompressed. 
+ 255, 255, 255, 255, 0, 255, 255, 255, 255, 0, 255, 255, 0, 1, 255, 255, + 255, 255, 0, 255, 0, 255, 1, 255, 255, 0, 1, 255, 0, 1, 2, 255, + 255, 255, 255, 0, 0, 255, 255, 1, 255, 0, 255, 1, 0, 1, 255, 2, + 255, 255, 0, 1, 0, 255, 1, 2, 255, 0, 1, 2, 0, 1, 2, 3}; + // This already zeros mask=false lanes. + return TableLookupLanes(v, SetTableIndices(d, table + offset)); +#elif HWY_TARGET == HWY_SVE2_128 // 64x2 + // Same as Compress, just zero out the mask=false lanes. + return IfThenElseZero(mask, Compress(v, mask)); +#else + return detail::ExpandLoop(v, mask); +#endif +} + +// ------------------------------ LoadExpand + +template <class D> +HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d, + const TFromD<D>* HWY_RESTRICT unaligned) { + return Expand(LoadU(d, unaligned), mask); +} + +// ------------------------------ MulEven (InterleaveEven) + +#if HWY_SVE_HAVE_2 +namespace detail { +#define HWY_SVE_MUL_EVEN(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_API HWY_SVE_V(BASE, BITS) \ + NAME(HWY_SVE_V(BASE, HALF) a, HWY_SVE_V(BASE, HALF) b) { \ + return sv##OP##_##CHAR##BITS(a, b); \ + } + +HWY_SVE_FOREACH_UI64(HWY_SVE_MUL_EVEN, MulEvenNative, mullb) +#undef HWY_SVE_MUL_EVEN +} // namespace detail +#endif + +template <class V, class DW = RepartitionToWide<DFromV<V>>, + HWY_IF_T_SIZE_V(V, 4)> +HWY_API VFromD<DW> MulEven(const V a, const V b) { +#if HWY_SVE_HAVE_2 + return BitCast(DW(), detail::MulEvenNative(a, b)); +#else + const auto lo = Mul(a, b); + const auto hi = MulHigh(a, b); + return BitCast(DW(), detail::InterleaveEven(lo, hi)); +#endif +} + +HWY_API svuint64_t MulEven(const svuint64_t a, const svuint64_t b) { + const auto lo = Mul(a, b); + const auto hi = MulHigh(a, b); + return detail::InterleaveEven(lo, hi); +} + +HWY_API svuint64_t MulOdd(const svuint64_t a, const svuint64_t b) { + const auto lo = Mul(a, b); + const auto hi = MulHigh(a, b); + return detail::InterleaveOdd(lo, hi); +} + +// ------------------------------ WidenMulPairwiseAdd +template <size_t N, 
int kPow2> +HWY_API svfloat32_t WidenMulPairwiseAdd(Simd<float, N, kPow2> df32, + svuint16_t a, svuint16_t b) { + // TODO(janwas): svbfmlalb_f32 if __ARM_FEATURE_SVE_BF16. + const RebindToUnsigned<decltype(df32)> du32; + // Using shift/and instead of Zip leads to the odd/even order that + // RearrangeToOddPlusEven prefers. + using VU32 = VFromD<decltype(du32)>; + const VU32 odd = Set(du32, 0xFFFF0000u); + const VU32 ae = ShiftLeft<16>(BitCast(du32, a)); + const VU32 ao = And(BitCast(du32, a), odd); + const VU32 be = ShiftLeft<16>(BitCast(du32, b)); + const VU32 bo = And(BitCast(du32, b), odd); + return MulAdd(BitCast(df32, ae), BitCast(df32, be), + Mul(BitCast(df32, ao), BitCast(df32, bo))); +} + +template <size_t N, int kPow2> +HWY_API svint32_t WidenMulPairwiseAdd(Simd<int32_t, N, kPow2> d32, + svint16_t a, svint16_t b) { +#if HWY_SVE_HAVE_2 + (void)d32; + return svmlalt_s32(svmullb_s32(a, b), a, b); +#else + const svbool_t pg = detail::PTrue(d32); + // Shifting extracts the odd lanes as RearrangeToOddPlusEven prefers. + // Fortunately SVE has sign-extension for the even lanes. + const svint32_t ae = svexth_s32_x(pg, BitCast(d32, a)); + const svint32_t be = svexth_s32_x(pg, BitCast(d32, b)); + const svint32_t ao = ShiftRight<16>(BitCast(d32, a)); + const svint32_t bo = ShiftRight<16>(BitCast(d32, b)); + return svmla_s32_x(pg, svmul_s32_x(pg, ao, bo), ae, be); +#endif +} + +// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower) + +template <size_t N, int kPow2> +HWY_API svfloat32_t ReorderWidenMulAccumulate(Simd<float, N, kPow2> df32, + svuint16_t a, svuint16_t b, + const svfloat32_t sum0, + svfloat32_t& sum1) { + // TODO(janwas): svbfmlalb_f32 if __ARM_FEATURE_SVE_BF16. + const RebindToUnsigned<decltype(df32)> du32; + // Using shift/and instead of Zip leads to the odd/even order that + // RearrangeToOddPlusEven prefers. 
+ using VU32 = VFromD<decltype(du32)>; + const VU32 odd = Set(du32, 0xFFFF0000u); + const VU32 ae = ShiftLeft<16>(BitCast(du32, a)); + const VU32 ao = And(BitCast(du32, a), odd); + const VU32 be = ShiftLeft<16>(BitCast(du32, b)); + const VU32 bo = And(BitCast(du32, b), odd); + sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1); + return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0); +} + +template <size_t N, int kPow2> +HWY_API svint32_t ReorderWidenMulAccumulate(Simd<int32_t, N, kPow2> d32, + svint16_t a, svint16_t b, + const svint32_t sum0, + svint32_t& sum1) { +#if HWY_SVE_HAVE_2 + (void)d32; + sum1 = svmlalt_s32(sum1, a, b); + return svmlalb_s32(sum0, a, b); +#else + const svbool_t pg = detail::PTrue(d32); + // Shifting extracts the odd lanes as RearrangeToOddPlusEven prefers. + // Fortunately SVE has sign-extension for the even lanes. + const svint32_t ae = svexth_s32_x(pg, BitCast(d32, a)); + const svint32_t be = svexth_s32_x(pg, BitCast(d32, b)); + const svint32_t ao = ShiftRight<16>(BitCast(d32, a)); + const svint32_t bo = ShiftRight<16>(BitCast(d32, b)); + sum1 = svmla_s32_x(pg, sum1, ao, bo); + return svmla_s32_x(pg, sum0, ae, be); +#endif +} + +// ------------------------------ RearrangeToOddPlusEven +template <class VW> +HWY_API VW RearrangeToOddPlusEven(const VW sum0, const VW sum1) { + // sum0 is the sum of bottom/even lanes and sum1 of top/odd lanes. + return Add(sum0, sum1); +} + +// ------------------------------ AESRound / CLMul + +#if defined(__ARM_FEATURE_SVE2_AES) || \ + (HWY_SVE_HAVE_2 && HWY_HAVE_RUNTIME_DISPATCH) + +// Per-target flag to prevent generic_ops-inl.h from defining AESRound. +#ifdef HWY_NATIVE_AES +#undef HWY_NATIVE_AES +#else +#define HWY_NATIVE_AES +#endif + +HWY_API svuint8_t AESRound(svuint8_t state, svuint8_t round_key) { + // It is not clear whether E and MC fuse like they did on NEON. 
+ return Xor(svaesmc_u8(svaese_u8(state, svdup_n_u8(0))), round_key); +} + +HWY_API svuint8_t AESLastRound(svuint8_t state, svuint8_t round_key) { + return Xor(svaese_u8(state, svdup_n_u8(0)), round_key); +} + +HWY_API svuint8_t AESInvMixColumns(svuint8_t state) { + return svaesimc_u8(state); +} + +HWY_API svuint8_t AESRoundInv(svuint8_t state, svuint8_t round_key) { + return Xor(svaesimc_u8(svaesd_u8(state, svdup_n_u8(0))), round_key); +} + +HWY_API svuint8_t AESLastRoundInv(svuint8_t state, svuint8_t round_key) { + return Xor(svaesd_u8(state, svdup_n_u8(0)), round_key); +} + +template <uint8_t kRcon> +HWY_API svuint8_t AESKeyGenAssist(svuint8_t v) { + alignas(16) static constexpr uint8_t kRconXorMask[16] = { + 0, kRcon, 0, 0, 0, 0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0, 0}; + alignas(16) static constexpr uint8_t kRotWordShuffle[16] = { + 0, 13, 10, 7, 1, 14, 11, 4, 8, 5, 2, 15, 9, 6, 3, 12}; + const DFromV<decltype(v)> d; + const Repartition<uint32_t, decltype(d)> du32; + const auto w13 = BitCast(d, DupOdd(BitCast(du32, v))); + const auto sub_word_result = AESLastRound(w13, LoadDup128(d, kRconXorMask)); + return TableLookupBytes(sub_word_result, LoadDup128(d, kRotWordShuffle)); +} + +HWY_API svuint64_t CLMulLower(const svuint64_t a, const svuint64_t b) { + return svpmullb_pair(a, b); +} + +HWY_API svuint64_t CLMulUpper(const svuint64_t a, const svuint64_t b) { + return svpmullt_pair(a, b); +} + +#endif // __ARM_FEATURE_SVE2_AES + +// ------------------------------ Lt128 + +namespace detail { +#define HWY_SVE_DUP(BASE, CHAR, BITS, HALF, NAME, OP) \ + template <size_t N, int kPow2> \ + HWY_API svbool_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /*d*/, svbool_t m) { \ + return sv##OP##_b##BITS(m, m); \ + } + +HWY_SVE_FOREACH_U(HWY_SVE_DUP, DupEvenB, trn1) // actually for bool +HWY_SVE_FOREACH_U(HWY_SVE_DUP, DupOddB, trn2) // actually for bool +#undef HWY_SVE_DUP + +#if HWY_TARGET == HWY_SVE_256 || HWY_IDE +template <class D> +HWY_INLINE svuint64_t Lt128Vec(D d, const svuint64_t 
a, const svuint64_t b) { + static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64"); + const svbool_t eqHx = Eq(a, b); // only odd lanes used + // Convert to vector: more pipelines can execute vector TRN* instructions + // than the predicate version. + const svuint64_t ltHL = VecFromMask(d, Lt(a, b)); + // Move into upper lane: ltL if the upper half is equal, otherwise ltH. + // Requires an extra IfThenElse because INSR, EXT, TRN2 are unpredicated. + const svuint64_t ltHx = IfThenElse(eqHx, DupEven(ltHL), ltHL); + // Duplicate upper lane into lower. + return DupOdd(ltHx); +} +#endif +} // namespace detail + +template <class D> +HWY_INLINE svbool_t Lt128(D d, const svuint64_t a, const svuint64_t b) { +#if HWY_TARGET == HWY_SVE_256 + return MaskFromVec(detail::Lt128Vec(d, a, b)); +#else + static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64"); + const svbool_t eqHx = Eq(a, b); // only odd lanes used + const svbool_t ltHL = Lt(a, b); + // Move into upper lane: ltL if the upper half is equal, otherwise ltH. + const svbool_t ltHx = svsel_b(eqHx, detail::DupEvenB(d, ltHL), ltHL); + // Duplicate upper lane into lower. + return detail::DupOddB(d, ltHx); +#endif // HWY_TARGET != HWY_SVE_256 +} + +// ------------------------------ Lt128Upper + +template <class D> +HWY_INLINE svbool_t Lt128Upper(D d, svuint64_t a, svuint64_t b) { + static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64"); + const svbool_t ltHL = Lt(a, b); + return detail::DupOddB(d, ltHL); +} + +// ------------------------------ Eq128, Ne128 + +#if HWY_TARGET == HWY_SVE_256 || HWY_IDE +namespace detail { + +template <class D> +HWY_INLINE svuint64_t Eq128Vec(D d, const svuint64_t a, const svuint64_t b) { + static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64"); + // Convert to vector: more pipelines can execute vector TRN* instructions + // than the predicate version. + const svuint64_t eqHL = VecFromMask(d, Eq(a, b)); + // Duplicate upper and lower. 
+ const svuint64_t eqHH = DupOdd(eqHL); + const svuint64_t eqLL = DupEven(eqHL); + return And(eqLL, eqHH); +} + +template <class D> +HWY_INLINE svuint64_t Ne128Vec(D d, const svuint64_t a, const svuint64_t b) { + static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64"); + // Convert to vector: more pipelines can execute vector TRN* instructions + // than the predicate version. + const svuint64_t neHL = VecFromMask(d, Ne(a, b)); + // Duplicate upper and lower. + const svuint64_t neHH = DupOdd(neHL); + const svuint64_t neLL = DupEven(neHL); + return Or(neLL, neHH); +} + +} // namespace detail +#endif + +template <class D> +HWY_INLINE svbool_t Eq128(D d, const svuint64_t a, const svuint64_t b) { +#if HWY_TARGET == HWY_SVE_256 + return MaskFromVec(detail::Eq128Vec(d, a, b)); +#else + static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64"); + const svbool_t eqHL = Eq(a, b); + const svbool_t eqHH = detail::DupOddB(d, eqHL); + const svbool_t eqLL = detail::DupEvenB(d, eqHL); + return And(eqLL, eqHH); +#endif // HWY_TARGET != HWY_SVE_256 +} + +template <class D> +HWY_INLINE svbool_t Ne128(D d, const svuint64_t a, const svuint64_t b) { +#if HWY_TARGET == HWY_SVE_256 + return MaskFromVec(detail::Ne128Vec(d, a, b)); +#else + static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64"); + const svbool_t neHL = Ne(a, b); + const svbool_t neHH = detail::DupOddB(d, neHL); + const svbool_t neLL = detail::DupEvenB(d, neHL); + return Or(neLL, neHH); +#endif // HWY_TARGET != HWY_SVE_256 +} + +// ------------------------------ Eq128Upper, Ne128Upper + +template <class D> +HWY_INLINE svbool_t Eq128Upper(D d, svuint64_t a, svuint64_t b) { + static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64"); + const svbool_t eqHL = Eq(a, b); + return detail::DupOddB(d, eqHL); +} + +template <class D> +HWY_INLINE svbool_t Ne128Upper(D d, svuint64_t a, svuint64_t b) { + static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64"); + const svbool_t neHL = Ne(a, b); + return 
detail::DupOddB(d, neHL); +} + +// ------------------------------ Min128, Max128 (Lt128) + +template <class D> +HWY_INLINE svuint64_t Min128(D d, const svuint64_t a, const svuint64_t b) { +#if HWY_TARGET == HWY_SVE_256 + return IfVecThenElse(detail::Lt128Vec(d, a, b), a, b); +#else + return IfThenElse(Lt128(d, a, b), a, b); +#endif +} + +template <class D> +HWY_INLINE svuint64_t Max128(D d, const svuint64_t a, const svuint64_t b) { +#if HWY_TARGET == HWY_SVE_256 + return IfVecThenElse(detail::Lt128Vec(d, b, a), a, b); +#else + return IfThenElse(Lt128(d, b, a), a, b); +#endif +} + +template <class D> +HWY_INLINE svuint64_t Min128Upper(D d, const svuint64_t a, const svuint64_t b) { + return IfThenElse(Lt128Upper(d, a, b), a, b); +} + +template <class D> +HWY_INLINE svuint64_t Max128Upper(D d, const svuint64_t a, const svuint64_t b) { + return IfThenElse(Lt128Upper(d, b, a), a, b); +} + +// -------------------- LeadingZeroCount, TrailingZeroCount, HighestSetBitIndex + +#ifdef HWY_NATIVE_LEADING_ZERO_COUNT +#undef HWY_NATIVE_LEADING_ZERO_COUNT +#else +#define HWY_NATIVE_LEADING_ZERO_COUNT +#endif + +#define HWY_SVE_LEADING_ZERO_COUNT(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \ + const DFromV<decltype(v)> d; \ + return BitCast(d, sv##OP##_##CHAR##BITS##_x(detail::PTrue(d), v)); \ + } + +HWY_SVE_FOREACH_UI(HWY_SVE_LEADING_ZERO_COUNT, LeadingZeroCount, clz) +#undef HWY_SVE_LEADING_ZERO_COUNT + +template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)> +HWY_API V TrailingZeroCount(V v) { + return LeadingZeroCount(ReverseBits(v)); +} + +template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)> +HWY_API V HighestSetBitIndex(V v) { + const DFromV<decltype(v)> d; + using T = TFromD<decltype(d)>; + return BitCast(d, Sub(Set(d, T{sizeof(T) * 8 - 1}), LeadingZeroCount(v))); +} + +// ================================================== END MACROS +namespace detail { // for code folding +#undef HWY_SVE_ALL_PTRUE +#undef HWY_SVE_D 
+#undef HWY_SVE_FOREACH +#undef HWY_SVE_FOREACH_F +#undef HWY_SVE_FOREACH_F16 +#undef HWY_SVE_FOREACH_F32 +#undef HWY_SVE_FOREACH_F64 +#undef HWY_SVE_FOREACH_I +#undef HWY_SVE_FOREACH_I08 +#undef HWY_SVE_FOREACH_I16 +#undef HWY_SVE_FOREACH_I32 +#undef HWY_SVE_FOREACH_I64 +#undef HWY_SVE_FOREACH_IF +#undef HWY_SVE_FOREACH_U +#undef HWY_SVE_FOREACH_U08 +#undef HWY_SVE_FOREACH_U16 +#undef HWY_SVE_FOREACH_U32 +#undef HWY_SVE_FOREACH_U64 +#undef HWY_SVE_FOREACH_UI +#undef HWY_SVE_FOREACH_UI08 +#undef HWY_SVE_FOREACH_UI16 +#undef HWY_SVE_FOREACH_UI32 +#undef HWY_SVE_FOREACH_UI64 +#undef HWY_SVE_FOREACH_UIF3264 +#undef HWY_SVE_HAVE_2 +#undef HWY_SVE_PTRUE +#undef HWY_SVE_RETV_ARGPV +#undef HWY_SVE_RETV_ARGPVN +#undef HWY_SVE_RETV_ARGPVV +#undef HWY_SVE_RETV_ARGV +#undef HWY_SVE_RETV_ARGVN +#undef HWY_SVE_RETV_ARGVV +#undef HWY_SVE_RETV_ARGVVV +#undef HWY_SVE_T +#undef HWY_SVE_UNDEFINED +#undef HWY_SVE_V + +} // namespace detail +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); diff --git a/third_party/highway/hwy/ops/emu128-inl.h b/third_party/highway/hwy/ops/emu128-inl.h new file mode 100644 index 0000000000..bbab2ee252 --- /dev/null +++ b/third_party/highway/hwy/ops/emu128-inl.h @@ -0,0 +1,2704 @@ +// Copyright 2022 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Emulated 128-bit vectors and operations: lanes are stored in a plain array and each op loops over them.
+// External include guard in highway.h - see comment there. + +#include <cmath> // std::abs, std::isnan + +#include "hwy/ops/shared-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +template <typename T> +using Full128 = Simd<T, 16 / sizeof(T), 0>; + +// (Wrapper class required for overloading comparison operators.) +template <typename T, size_t N = 16 / sizeof(T)> +struct Vec128 { + using PrivateT = T; // only for DFromV + static constexpr size_t kPrivateN = N; // only for DFromV + + HWY_INLINE Vec128() = default; + Vec128(const Vec128&) = default; + Vec128& operator=(const Vec128&) = default; + + HWY_INLINE Vec128& operator*=(const Vec128 other) { + return *this = (*this * other); + } + HWY_INLINE Vec128& operator/=(const Vec128 other) { + return *this = (*this / other); + } + HWY_INLINE Vec128& operator+=(const Vec128 other) { + return *this = (*this + other); + } + HWY_INLINE Vec128& operator-=(const Vec128 other) { + return *this = (*this - other); + } + HWY_INLINE Vec128& operator&=(const Vec128 other) { + return *this = (*this & other); + } + HWY_INLINE Vec128& operator|=(const Vec128 other) { + return *this = (*this | other); + } + HWY_INLINE Vec128& operator^=(const Vec128 other) { + return *this = (*this ^ other); + } + + // Behave like wasm128 (vectors can always hold 128 bits). generic_ops-inl.h + // relies on this for LoadInterleaved*. CAVEAT: this method of padding + // prevents using range for, especially in SumOfLanes, where it would be + // incorrect. Moving padding to another field would require handling the case + // where N = 16 / sizeof(T) (i.e. there is no padding), which is also awkward. + T raw[16 / sizeof(T)] = {}; +}; + +// 0 or FF..FF, same size as Vec128. +template <typename T, size_t N = 16 / sizeof(T)> +struct Mask128 { + using Raw = hwy::MakeUnsigned<T>; + static HWY_INLINE Raw FromBool(bool b) { + return b ? static_cast<Raw>(~Raw{0}) : 0; + } + + // Must match the size of Vec128. 
+ Raw bits[16 / sizeof(T)] = {}; +}; + +template <class V> +using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>; + +template <class V> +using TFromV = typename V::PrivateT; + +// ------------------------------ Zero + +// Use HWY_MAX_LANES_D here because VFromD is defined in terms of Zero. +template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_API Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> Zero(D /* tag */) { + Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> v; // zero-initialized + return v; +} + +template <class D> +using VFromD = decltype(Zero(D())); + +// ------------------------------ Tuple (VFromD) +#include "hwy/ops/tuple-inl.h" + +// ------------------------------ BitCast + +template <class D, class VFrom> +HWY_API VFromD<D> BitCast(D /* tag */, VFrom v) { + VFromD<D> to; + CopySameSize(&v, &to); + return to; +} + +// ------------------------------ ResizeBitCast + +template <class D, class VFrom> +HWY_API VFromD<D> ResizeBitCast(D d, VFrom v) { + using DFrom = DFromV<VFrom>; + using TFrom = TFromD<DFrom>; + using TTo = TFromD<D>; + + constexpr size_t kFromByteLen = sizeof(TFrom) * HWY_MAX_LANES_D(DFrom); + constexpr size_t kToByteLen = sizeof(TTo) * HWY_MAX_LANES_D(D); + constexpr size_t kCopyByteLen = HWY_MIN(kFromByteLen, kToByteLen); + + VFromD<D> to = Zero(d); + CopyBytes<kCopyByteLen>(&v, &to); + return to; +} + +namespace detail { + +// ResizeBitCast on the HWY_EMU128 target has zero-extending semantics if +// VFromD<DTo> is a larger vector than FromV +template <class FromSizeTag, class ToSizeTag, class DTo, class DFrom> +HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(FromSizeTag /* from_size_tag */, + ToSizeTag /* to_size_tag */, + DTo d_to, DFrom /* d_from */, + VFromD<DFrom> v) { + return ResizeBitCast(d_to, v); +} + +} // namespace detail + +// ------------------------------ Set +template <class D, typename T2> +HWY_API VFromD<D> Set(D d, const T2 t) { + VFromD<D> v; + for (size_t i = 0; i < MaxLanes(d); ++i) { + v.raw[i] = static_cast<TFromD<D>>(t); + } + 
return v; +} + +// ------------------------------ Undefined +template <class D> +HWY_API VFromD<D> Undefined(D d) { + return Zero(d); +} + +// ------------------------------ Iota + +template <class D, typename T = TFromD<D>, typename T2> +HWY_API VFromD<D> Iota(D d, T2 first) { + VFromD<D> v; + for (size_t i = 0; i < MaxLanes(d); ++i) { + v.raw[i] = + AddWithWraparound(hwy::IsFloatTag<T>(), static_cast<T>(first), i); + } + return v; +} + +// ================================================== LOGICAL + +// ------------------------------ Not +template <typename T, size_t N> +HWY_API Vec128<T, N> Not(Vec128<T, N> v) { + const DFromV<decltype(v)> d; + const RebindToUnsigned<decltype(d)> du; + using TU = TFromD<decltype(du)>; + VFromD<decltype(du)> vu = BitCast(du, v); + for (size_t i = 0; i < N; ++i) { + vu.raw[i] = static_cast<TU>(~vu.raw[i]); + } + return BitCast(d, vu); +} + +// ------------------------------ And +template <typename T, size_t N> +HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) { + const DFromV<decltype(a)> d; + const RebindToUnsigned<decltype(d)> du; + auto au = BitCast(du, a); + auto bu = BitCast(du, b); + for (size_t i = 0; i < N; ++i) { + au.raw[i] &= bu.raw[i]; + } + return BitCast(d, au); +} +template <typename T, size_t N> +HWY_API Vec128<T, N> operator&(Vec128<T, N> a, Vec128<T, N> b) { + return And(a, b); +} + +// ------------------------------ AndNot +template <typename T, size_t N> +HWY_API Vec128<T, N> AndNot(Vec128<T, N> a, Vec128<T, N> b) { + return And(Not(a), b); +} + +// ------------------------------ Or +template <typename T, size_t N> +HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) { + const DFromV<decltype(a)> d; + const RebindToUnsigned<decltype(d)> du; + auto au = BitCast(du, a); + auto bu = BitCast(du, b); + for (size_t i = 0; i < N; ++i) { + au.raw[i] |= bu.raw[i]; + } + return BitCast(d, au); +} +template <typename T, size_t N> +HWY_API Vec128<T, N> operator|(Vec128<T, N> a, Vec128<T, N> b) { + return Or(a, 
b); +} + +// ------------------------------ Xor +template <typename T, size_t N> +HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) { + const DFromV<decltype(a)> d; + const RebindToUnsigned<decltype(d)> du; + auto au = BitCast(du, a); + auto bu = BitCast(du, b); + for (size_t i = 0; i < N; ++i) { + au.raw[i] ^= bu.raw[i]; + } + return BitCast(d, au); +} +template <typename T, size_t N> +HWY_API Vec128<T, N> operator^(Vec128<T, N> a, Vec128<T, N> b) { + return Xor(a, b); +} + +// ------------------------------ Xor3 +template <typename T, size_t N> +HWY_API Vec128<T, N> Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) { + return Xor(x1, Xor(x2, x3)); +} + +// ------------------------------ Or3 +template <typename T, size_t N> +HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) { + return Or(o1, Or(o2, o3)); +} + +// ------------------------------ OrAnd +template <typename T, size_t N> +HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) { + return Or(o, And(a1, a2)); +} + +// ------------------------------ IfVecThenElse +template <typename T, size_t N> +HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes, + Vec128<T, N> no) { + return Or(And(mask, yes), AndNot(mask, no)); +} + +// ------------------------------ CopySign +template <typename T, size_t N> +HWY_API Vec128<T, N> CopySign(Vec128<T, N> magn, Vec128<T, N> sign) { + static_assert(IsFloat<T>(), "Only makes sense for floating-point"); + const DFromV<decltype(magn)> d; + const auto msb = SignBit(d); + return Or(AndNot(msb, magn), And(msb, sign)); +} + +template <typename T, size_t N> +HWY_API Vec128<T, N> CopySignToAbs(Vec128<T, N> abs, Vec128<T, N> sign) { + static_assert(IsFloat<T>(), "Only makes sense for floating-point"); + const DFromV<decltype(abs)> d; + return Or(abs, And(SignBit(d), sign)); +} + +// ------------------------------ BroadcastSignBit +template <typename T, size_t N> +HWY_API Vec128<T, N> 
BroadcastSignBit(Vec128<T, N> v) { + // This is used inside ShiftRight, so we cannot implement in terms of it. + for (size_t i = 0; i < N; ++i) { + v.raw[i] = v.raw[i] < 0 ? T(-1) : T(0); + } + return v; +} + +// ------------------------------ Mask + +// v must be 0 or FF..FF. +template <typename T, size_t N> +HWY_API Mask128<T, N> MaskFromVec(Vec128<T, N> v) { + Mask128<T, N> mask; + CopySameSize(&v, &mask); + return mask; +} + +template <class D> +using MFromD = decltype(MaskFromVec(VFromD<D>())); + +template <class DTo, class MFrom> +HWY_API MFromD<DTo> RebindMask(DTo /* tag */, MFrom mask) { + MFromD<DTo> to; + CopySameSize(&mask, &to); + return to; +} + +// Inverse of MaskFromVec: copies the mask bits into a vector of the same size. +// Marked HWY_API like the neighbouring mask ops (MaskFromVec, RebindMask, FirstN). +template <typename T, size_t N> +HWY_API Vec128<T, N> VecFromMask(Mask128<T, N> mask) { + Vec128<T, N> v; + CopySameSize(&mask, &v); + return v; +} + +// Tag-taking overload; the tag only selects the type and is otherwise unused. +template <class D> +HWY_API VFromD<D> VecFromMask(D /* tag */, MFromD<D> mask) { + return VecFromMask(mask); +} + +template <class D> +HWY_API MFromD<D> FirstN(D d, size_t n) { + MFromD<D> m; + for (size_t i = 0; i < MaxLanes(d); ++i) { + m.bits[i] = MFromD<D>::FromBool(i < n); + } + return m; +} + +// Returns mask ? yes : no. +template <typename T, size_t N> +HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes, + Vec128<T, N> no) { + return IfVecThenElse(VecFromMask(mask), yes, no); +} + +template <typename T, size_t N> +HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) { + const DFromV<decltype(yes)> d; + return IfVecThenElse(VecFromMask(mask), yes, Zero(d)); +} + +template <typename T, size_t N> +HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) { + const DFromV<decltype(no)> d; + return IfVecThenElse(VecFromMask(mask), Zero(d), no); +} + +template <typename T, size_t N> +HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes, + Vec128<T, N> no) { + for (size_t i = 0; i < N; ++i) { + v.raw[i] = v.raw[i] < 0 ?
yes.raw[i] : no.raw[i]; + } + return v; +} + +template <typename T, size_t N> +HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) { + const DFromV<decltype(v)> d; + return IfNegativeThenElse(v, Zero(d), v); +} + +// ------------------------------ Mask logical + +template <typename T, size_t N> +HWY_API Mask128<T, N> Not(Mask128<T, N> m) { + return MaskFromVec(Not(VecFromMask(Simd<T, N, 0>(), m))); +} + +template <typename T, size_t N> +HWY_API Mask128<T, N> And(Mask128<T, N> a, Mask128<T, N> b) { + const Simd<T, N, 0> d; + return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b))); +} + +template <typename T, size_t N> +HWY_API Mask128<T, N> AndNot(Mask128<T, N> a, Mask128<T, N> b) { + const Simd<T, N, 0> d; + return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b))); +} + +template <typename T, size_t N> +HWY_API Mask128<T, N> Or(Mask128<T, N> a, Mask128<T, N> b) { + const Simd<T, N, 0> d; + return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b))); +} + +template <typename T, size_t N> +HWY_API Mask128<T, N> Xor(Mask128<T, N> a, Mask128<T, N> b) { + const Simd<T, N, 0> d; + return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b))); +} + +template <typename T, size_t N> +HWY_API Mask128<T, N> ExclusiveNeither(Mask128<T, N> a, Mask128<T, N> b) { + const Simd<T, N, 0> d; + return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b)))); +} + +// ================================================== SHIFTS + +// ------------------------------ ShiftLeft/ShiftRight (BroadcastSignBit) + +template <int kBits, typename T, size_t N> +HWY_API Vec128<T, N> ShiftLeft(Vec128<T, N> v) { + static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift"); + for (size_t i = 0; i < N; ++i) { + const auto shifted = static_cast<hwy::MakeUnsigned<T>>(v.raw[i]) << kBits; + v.raw[i] = static_cast<T>(shifted); + } + return v; +} + +template <int kBits, typename T, size_t N> +HWY_API Vec128<T, N> ShiftRight(Vec128<T, N> v) { + static_assert(0 <= kBits && kBits < 
sizeof(T) * 8, "Invalid shift"); +#if __cplusplus >= 202002L + // Signed right shift is now guaranteed to be arithmetic (rounding toward + // negative infinity, i.e. shifting in the sign bit). + for (size_t i = 0; i < N; ++i) { + v.raw[i] = static_cast<T>(v.raw[i] >> kBits); + } +#else + if (IsSigned<T>()) { + // Emulate arithmetic shift using only logical (unsigned) shifts, because + // signed shifts are still implementation-defined. + using TU = hwy::MakeUnsigned<T>; + for (size_t i = 0; i < N; ++i) { + const TU shifted = static_cast<TU>(static_cast<TU>(v.raw[i]) >> kBits); + const TU sign = v.raw[i] < 0 ? static_cast<TU>(~TU{0}) : 0; + const size_t sign_shift = + static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - kBits); + const TU upper = static_cast<TU>(sign << sign_shift); + v.raw[i] = static_cast<T>(shifted | upper); + } + } else { // T is unsigned + for (size_t i = 0; i < N; ++i) { + v.raw[i] = static_cast<T>(v.raw[i] >> kBits); + } + } +#endif + return v; +} + +// ------------------------------ RotateRight (ShiftRight) +template <int kBits, typename T, size_t N> +HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) { + constexpr size_t kSizeInBits = sizeof(T) * 8; + static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count"); + if (kBits == 0) return v; + return Or(ShiftRight<kBits>(v), + ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v)); +} + +// ------------------------------ ShiftLeftSame + +template <typename T, size_t N> +HWY_API Vec128<T, N> ShiftLeftSame(Vec128<T, N> v, int bits) { + for (size_t i = 0; i < N; ++i) { + const auto shifted = static_cast<hwy::MakeUnsigned<T>>(v.raw[i]) << bits; + v.raw[i] = static_cast<T>(shifted); + } + return v; +} + +template <typename T, size_t N> +HWY_API Vec128<T, N> ShiftRightSame(Vec128<T, N> v, int bits) { +#if __cplusplus >= 202002L + // Signed right shift is now guaranteed to be arithmetic (rounding toward + // negative infinity, i.e. shifting in the sign bit). 
+ for (size_t i = 0; i < N; ++i) { + v.raw[i] = static_cast<T>(v.raw[i] >> bits); + } +#else + if (IsSigned<T>()) { + // Emulate arithmetic shift using only logical (unsigned) shifts, because + // signed shifts are still implementation-defined. + using TU = hwy::MakeUnsigned<T>; + for (size_t i = 0; i < N; ++i) { + const TU shifted = static_cast<TU>(static_cast<TU>(v.raw[i]) >> bits); + const TU sign = v.raw[i] < 0 ? static_cast<TU>(~TU{0}) : 0; + const size_t sign_shift = + static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - bits); + const TU upper = static_cast<TU>(sign << sign_shift); + v.raw[i] = static_cast<T>(shifted | upper); + } + } else { + for (size_t i = 0; i < N; ++i) { + v.raw[i] = static_cast<T>(v.raw[i] >> bits); // unsigned, logical shift + } + } +#endif + return v; +} + +// ------------------------------ Shl + +template <typename T, size_t N> +HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, Vec128<T, N> bits) { + for (size_t i = 0; i < N; ++i) { + const auto shifted = static_cast<hwy::MakeUnsigned<T>>(v.raw[i]) + << bits.raw[i]; + v.raw[i] = static_cast<T>(shifted); + } + return v; +} + +template <typename T, size_t N> +HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, Vec128<T, N> bits) { +#if __cplusplus >= 202002L + // Signed right shift is now guaranteed to be arithmetic (rounding toward + // negative infinity, i.e. shifting in the sign bit). + for (size_t i = 0; i < N; ++i) { + v.raw[i] = static_cast<T>(v.raw[i] >> bits.raw[i]); + } +#else + if (IsSigned<T>()) { + // Emulate arithmetic shift using only logical (unsigned) shifts, because + // signed shifts are still implementation-defined. + using TU = hwy::MakeUnsigned<T>; + for (size_t i = 0; i < N; ++i) { + const TU shifted = + static_cast<TU>(static_cast<TU>(v.raw[i]) >> bits.raw[i]); + const TU sign = v.raw[i] < 0 ? 
static_cast<TU>(~TU{0}) : 0; + const size_t sign_shift = static_cast<size_t>( + static_cast<int>(sizeof(TU)) * 8 - 1 - bits.raw[i]); + const TU upper = static_cast<TU>(sign << sign_shift); + v.raw[i] = static_cast<T>(shifted | upper); + } + } else { // T is unsigned + for (size_t i = 0; i < N; ++i) { + v.raw[i] = static_cast<T>(v.raw[i] >> bits.raw[i]); + } + } +#endif + return v; +} + +// ================================================== ARITHMETIC + +// Tag dispatch instead of SFINAE for MSVC 2017 compatibility +namespace detail { + +template <typename T, size_t N> +HWY_INLINE Vec128<T, N> Add(hwy::NonFloatTag /*tag*/, Vec128<T, N> a, + Vec128<T, N> b) { + for (size_t i = 0; i < N; ++i) { + const uint64_t a64 = static_cast<uint64_t>(a.raw[i]); + const uint64_t b64 = static_cast<uint64_t>(b.raw[i]); + a.raw[i] = static_cast<T>((a64 + b64) & static_cast<uint64_t>(~T(0))); + } + return a; +} +template <typename T, size_t N> +HWY_INLINE Vec128<T, N> Sub(hwy::NonFloatTag /*tag*/, Vec128<T, N> a, + Vec128<T, N> b) { + for (size_t i = 0; i < N; ++i) { + const uint64_t a64 = static_cast<uint64_t>(a.raw[i]); + const uint64_t b64 = static_cast<uint64_t>(b.raw[i]); + a.raw[i] = static_cast<T>((a64 - b64) & static_cast<uint64_t>(~T(0))); + } + return a; +} + +template <typename T, size_t N> +HWY_INLINE Vec128<T, N> Add(hwy::FloatTag /*tag*/, Vec128<T, N> a, + Vec128<T, N> b) { + for (size_t i = 0; i < N; ++i) { + a.raw[i] += b.raw[i]; + } + return a; +} + +template <typename T, size_t N> +HWY_INLINE Vec128<T, N> Sub(hwy::FloatTag /*tag*/, Vec128<T, N> a, + Vec128<T, N> b) { + for (size_t i = 0; i < N; ++i) { + a.raw[i] -= b.raw[i]; + } + return a; +} + +} // namespace detail + +template <typename T, size_t N> +HWY_API Vec128<T, N> operator-(Vec128<T, N> a, Vec128<T, N> b) { + return detail::Sub(hwy::IsFloatTag<T>(), a, b); +} +template <typename T, size_t N> +HWY_API Vec128<T, N> operator+(Vec128<T, N> a, Vec128<T, N> b) { + return detail::Add(hwy::IsFloatTag<T>(), a, b); 
+} + +// ------------------------------ SumsOf8 + +template <size_t N> +HWY_API Vec128<uint64_t, (N + 7) / 8> SumsOf8(Vec128<uint8_t, N> v) { + Vec128<uint64_t, (N + 7) / 8> sums; + for (size_t i = 0; i < N; ++i) { + sums.raw[i / 8] += v.raw[i]; + } + return sums; +} + +// ------------------------------ SaturatedAdd +template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2)), + HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)> +HWY_API Vec128<T, N> SaturatedAdd(Vec128<T, N> a, Vec128<T, N> b) { + using TW = MakeSigned<MakeWide<T>>; + for (size_t i = 0; i < N; ++i) { + a.raw[i] = static_cast<T>(HWY_MIN( + HWY_MAX(hwy::LowestValue<T>(), static_cast<TW>(a.raw[i]) + b.raw[i]), + hwy::HighestValue<T>())); + } + return a; +} + +// ------------------------------ SaturatedSub +template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2)), + HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)> +HWY_API Vec128<T, N> SaturatedSub(Vec128<T, N> a, Vec128<T, N> b) { + using TW = MakeSigned<MakeWide<T>>; + for (size_t i = 0; i < N; ++i) { + a.raw[i] = static_cast<T>(HWY_MIN( + HWY_MAX(hwy::LowestValue<T>(), static_cast<TW>(a.raw[i]) - b.raw[i]), + hwy::HighestValue<T>())); + } + return a; +} + +// ------------------------------ AverageRound +// Returns the per-lane unsigned average of a and b, rounded up: (a + b + 1) / 2. +template <typename T, size_t N> +HWY_API Vec128<T, N> AverageRound(Vec128<T, N> a, Vec128<T, N> b) { + static_assert(!IsSigned<T>(), "Only for unsigned"); + for (size_t i = 0; i < N; ++i) { + // (a | b) - ((a ^ b) >> 1) equals (a + b + 1) / 2 but has no intermediate + // sum, so it cannot wrap for 32- or 64-bit lanes. + a.raw[i] = static_cast<T>((a.raw[i] | b.raw[i]) - + ((a.raw[i] ^ b.raw[i]) >> 1)); + } + return a; +} + +// ------------------------------ Abs + +// Tag dispatch instead of SFINAE for MSVC 2017 compatibility +namespace detail { + +template <typename T, size_t N> +HWY_INLINE Vec128<T, N> Abs(SignedTag /*tag*/, Vec128<T, N> a) { + for (size_t i = 0; i < N; ++i) { + const T s = a.raw[i]; + const T min = hwy::LimitsMin<T>(); + a.raw[i] = static_cast<T>((s >= 0 || s == min) ?
a.raw[i] : -s); + } + return a; +} + +template <typename T, size_t N> +HWY_INLINE Vec128<T, N> Abs(hwy::FloatTag /*tag*/, Vec128<T, N> v) { + for (size_t i = 0; i < N; ++i) { + v.raw[i] = std::abs(v.raw[i]); + } + return v; +} + +} // namespace detail + +template <typename T, size_t N> +HWY_API Vec128<T, N> Abs(Vec128<T, N> a) { + return detail::Abs(hwy::TypeTag<T>(), a); +} + +// ------------------------------ Min/Max + +// Tag dispatch instead of SFINAE for MSVC 2017 compatibility +namespace detail { + +template <typename T, size_t N> +HWY_INLINE Vec128<T, N> Min(hwy::NonFloatTag /*tag*/, Vec128<T, N> a, + Vec128<T, N> b) { + for (size_t i = 0; i < N; ++i) { + a.raw[i] = HWY_MIN(a.raw[i], b.raw[i]); + } + return a; +} +template <typename T, size_t N> +HWY_INLINE Vec128<T, N> Max(hwy::NonFloatTag /*tag*/, Vec128<T, N> a, + Vec128<T, N> b) { + for (size_t i = 0; i < N; ++i) { + a.raw[i] = HWY_MAX(a.raw[i], b.raw[i]); + } + return a; +} + +template <typename T, size_t N> +HWY_INLINE Vec128<T, N> Min(hwy::FloatTag /*tag*/, Vec128<T, N> a, + Vec128<T, N> b) { + for (size_t i = 0; i < N; ++i) { + if (std::isnan(a.raw[i])) { + a.raw[i] = b.raw[i]; + } else if (std::isnan(b.raw[i])) { + // no change + } else { + a.raw[i] = HWY_MIN(a.raw[i], b.raw[i]); + } + } + return a; +} +template <typename T, size_t N> +HWY_INLINE Vec128<T, N> Max(hwy::FloatTag /*tag*/, Vec128<T, N> a, + Vec128<T, N> b) { + for (size_t i = 0; i < N; ++i) { + if (std::isnan(a.raw[i])) { + a.raw[i] = b.raw[i]; + } else if (std::isnan(b.raw[i])) { + // no change + } else { + a.raw[i] = HWY_MAX(a.raw[i], b.raw[i]); + } + } + return a; +} + +} // namespace detail + +template <typename T, size_t N> +HWY_API Vec128<T, N> Min(Vec128<T, N> a, Vec128<T, N> b) { + return detail::Min(hwy::IsFloatTag<T>(), a, b); +} + +template <typename T, size_t N> +HWY_API Vec128<T, N> Max(Vec128<T, N> a, Vec128<T, N> b) { + return detail::Max(hwy::IsFloatTag<T>(), a, b); +} + +// ------------------------------ Neg + +// Tag 
dispatch instead of SFINAE for MSVC 2017 compatibility +namespace detail { + +template <typename T, size_t N> +HWY_API Vec128<T, N> Neg(hwy::NonFloatTag /*tag*/, Vec128<T, N> v) { + const DFromV<decltype(v)> d; + return Zero(d) - v; +} + +template <typename T, size_t N> +HWY_API Vec128<T, N> Neg(hwy::FloatTag /*tag*/, Vec128<T, N> v) { + const DFromV<decltype(v)> d; + return Xor(v, SignBit(d)); +} + +} // namespace detail + +template <typename T, size_t N> +HWY_API Vec128<T, N> Neg(Vec128<T, N> v) { + return detail::Neg(hwy::IsFloatTag<T>(), v); +} + +// ------------------------------ Mul/Div + +// Tag dispatch instead of SFINAE for MSVC 2017 compatibility +namespace detail { + +template <typename T, size_t N> +HWY_INLINE Vec128<T, N> Mul(hwy::FloatTag /*tag*/, Vec128<T, N> a, + Vec128<T, N> b) { + for (size_t i = 0; i < N; ++i) { + a.raw[i] *= b.raw[i]; + } + return a; +} + +template <typename T, size_t N> +HWY_INLINE Vec128<T, N> Mul(SignedTag /*tag*/, Vec128<T, N> a, Vec128<T, N> b) { + for (size_t i = 0; i < N; ++i) { + a.raw[i] = static_cast<T>(static_cast<uint64_t>(a.raw[i]) * + static_cast<uint64_t>(b.raw[i])); + } + return a; +} + +template <typename T, size_t N> +HWY_INLINE Vec128<T, N> Mul(UnsignedTag /*tag*/, Vec128<T, N> a, + Vec128<T, N> b) { + for (size_t i = 0; i < N; ++i) { + a.raw[i] = static_cast<T>(static_cast<uint64_t>(a.raw[i]) * + static_cast<uint64_t>(b.raw[i])); + } + return a; +} + +} // namespace detail + +// Per-target flags to prevent generic_ops-inl.h defining 8/64-bit operator*. 
+#ifdef HWY_NATIVE_MUL_8 +#undef HWY_NATIVE_MUL_8 +#else +#define HWY_NATIVE_MUL_8 +#endif +#ifdef HWY_NATIVE_MUL_64 +#undef HWY_NATIVE_MUL_64 +#else +#define HWY_NATIVE_MUL_64 +#endif + +template <typename T, size_t N> +HWY_API Vec128<T, N> operator*(Vec128<T, N> a, Vec128<T, N> b) { + return detail::Mul(hwy::TypeTag<T>(), a, b); +} + +template <typename T, size_t N> +HWY_API Vec128<T, N> operator/(Vec128<T, N> a, Vec128<T, N> b) { + for (size_t i = 0; i < N; ++i) { + a.raw[i] = (b.raw[i] == T{0}) ? 0 : a.raw[i] / b.raw[i]; + } + return a; +} + +// Returns the upper 16 bits of a * b in each lane. +template <size_t N> +HWY_API Vec128<int16_t, N> MulHigh(Vec128<int16_t, N> a, Vec128<int16_t, N> b) { + for (size_t i = 0; i < N; ++i) { + a.raw[i] = static_cast<int16_t>((int32_t{a.raw[i]} * b.raw[i]) >> 16); + } + return a; +} +template <size_t N> +HWY_API Vec128<uint16_t, N> MulHigh(Vec128<uint16_t, N> a, + Vec128<uint16_t, N> b) { + for (size_t i = 0; i < N; ++i) { + // Cast to uint32_t first to prevent overflow. Otherwise the result of + // uint16_t * uint16_t is in "int" which may overflow. In practice the + // result is the same but this way it is also defined. + a.raw[i] = static_cast<uint16_t>( + (static_cast<uint32_t>(a.raw[i]) * static_cast<uint32_t>(b.raw[i])) >> + 16); + } + return a; +} + +template <size_t N> +HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a, + Vec128<int16_t, N> b) { + for (size_t i = 0; i < N; ++i) { + a.raw[i] = static_cast<int16_t>((2 * a.raw[i] * b.raw[i] + 32768) >> 16); + } + return a; +} + +// Multiplies even lanes (0, 2 ..) and returns the double-wide result. 
+template <size_t N> +HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(Vec128<int32_t, N> a, + Vec128<int32_t, N> b) { + Vec128<int64_t, (N + 1) / 2> mul; + for (size_t i = 0; i < N; i += 2) { + const int64_t a64 = a.raw[i]; + mul.raw[i / 2] = a64 * b.raw[i]; + } + return mul; +} +template <size_t N> +HWY_API Vec128<uint64_t, (N + 1) / 2> MulEven(Vec128<uint32_t, N> a, + Vec128<uint32_t, N> b) { + Vec128<uint64_t, (N + 1) / 2> mul; + for (size_t i = 0; i < N; i += 2) { + const uint64_t a64 = a.raw[i]; + mul.raw[i / 2] = a64 * b.raw[i]; + } + return mul; +} + +template <size_t N> +HWY_API Vec128<int64_t, (N + 1) / 2> MulOdd(Vec128<int32_t, N> a, + Vec128<int32_t, N> b) { + Vec128<int64_t, (N + 1) / 2> mul; + for (size_t i = 0; i < N; i += 2) { + const int64_t a64 = a.raw[i + 1]; + mul.raw[i / 2] = a64 * b.raw[i + 1]; + } + return mul; +} +template <size_t N> +HWY_API Vec128<uint64_t, (N + 1) / 2> MulOdd(Vec128<uint32_t, N> a, + Vec128<uint32_t, N> b) { + Vec128<uint64_t, (N + 1) / 2> mul; + for (size_t i = 0; i < N; i += 2) { + const uint64_t a64 = a.raw[i + 1]; + mul.raw[i / 2] = a64 * b.raw[i + 1]; + } + return mul; +} + +template <size_t N> +HWY_API Vec128<float, N> ApproximateReciprocal(Vec128<float, N> v) { + for (size_t i = 0; i < N; ++i) { + // Zero inputs are allowed, but callers are responsible for replacing the + // return value with something else (typically using IfThenElse). This check + // avoids a ubsan error. The result is arbitrary. + v.raw[i] = (std::abs(v.raw[i]) == 0.0f) ? 
0.0f : 1.0f / v.raw[i]; + } + return v; +} + +template <size_t N> +HWY_API Vec128<float, N> AbsDiff(Vec128<float, N> a, Vec128<float, N> b) { + return Abs(a - b); +} + +// ------------------------------ Floating-point multiply-add variants + +template <typename T, size_t N> +HWY_API Vec128<T, N> MulAdd(Vec128<T, N> mul, Vec128<T, N> x, + Vec128<T, N> add) { + return mul * x + add; +} + +template <typename T, size_t N> +HWY_API Vec128<T, N> NegMulAdd(Vec128<T, N> mul, Vec128<T, N> x, + Vec128<T, N> add) { + return add - mul * x; +} + +template <typename T, size_t N> +HWY_API Vec128<T, N> MulSub(Vec128<T, N> mul, Vec128<T, N> x, + Vec128<T, N> sub) { + return mul * x - sub; +} + +template <typename T, size_t N> +HWY_API Vec128<T, N> NegMulSub(Vec128<T, N> mul, Vec128<T, N> x, + Vec128<T, N> sub) { + return Neg(mul) * x - sub; +} + +// ------------------------------ Floating-point square root + +template <size_t N> +HWY_API Vec128<float, N> ApproximateReciprocalSqrt(Vec128<float, N> v) { + for (size_t i = 0; i < N; ++i) { + const float half = v.raw[i] * 0.5f; + uint32_t bits; + CopySameSize(&v.raw[i], &bits); + // Initial guess based on log2(f) + bits = 0x5F3759DF - (bits >> 1); + CopySameSize(&bits, &v.raw[i]); + // One Newton-Raphson iteration + v.raw[i] = v.raw[i] * (1.5f - (half * v.raw[i] * v.raw[i])); + } + return v; +} + +template <typename T, size_t N> +HWY_API Vec128<T, N> Sqrt(Vec128<T, N> v) { + for (size_t i = 0; i < N; ++i) { + v.raw[i] = std::sqrt(v.raw[i]); + } + return v; +} + +// ------------------------------ Floating-point rounding + +template <typename T, size_t N> +HWY_API Vec128<T, N> Round(Vec128<T, N> v) { + using TI = MakeSigned<T>; + const Vec128<T, N> a = Abs(v); + for (size_t i = 0; i < N; ++i) { + if (!(a.raw[i] < MantissaEnd<T>())) { // Huge or NaN + continue; + } + const T bias = v.raw[i] < T(0.0) ? T(-0.5) : T(0.5); + const TI rounded = static_cast<TI>(v.raw[i] + bias); + if (rounded == 0) { + v.raw[i] = v.raw[i] < 0 ? 
T{-0} : T{0}; + continue; + } + const T rounded_f = static_cast<T>(rounded); + // Round to even + if ((rounded & 1) && std::abs(rounded_f - v.raw[i]) == T(0.5)) { + v.raw[i] = static_cast<T>(rounded - (v.raw[i] < T(0) ? -1 : 1)); + continue; + } + v.raw[i] = rounded_f; + } + return v; +} + +// Round-to-nearest even. +template <size_t N> +HWY_API Vec128<int32_t, N> NearestInt(Vec128<float, N> v) { + using T = float; + using TI = int32_t; + + const Vec128<float, N> abs = Abs(v); + Vec128<int32_t, N> ret; + for (size_t i = 0; i < N; ++i) { + const bool signbit = std::signbit(v.raw[i]); + + if (!(abs.raw[i] < MantissaEnd<T>())) { // Huge or NaN + // Check if too large to cast or NaN + if (!(abs.raw[i] <= static_cast<T>(LimitsMax<TI>()))) { + ret.raw[i] = signbit ? LimitsMin<TI>() : LimitsMax<TI>(); + continue; + } + ret.raw[i] = static_cast<TI>(v.raw[i]); + continue; + } + const T bias = v.raw[i] < T(0.0) ? T(-0.5) : T(0.5); + const TI rounded = static_cast<TI>(v.raw[i] + bias); + if (rounded == 0) { + ret.raw[i] = 0; + continue; + } + const T rounded_f = static_cast<T>(rounded); + // Round to even + if ((rounded & 1) && std::abs(rounded_f - v.raw[i]) == T(0.5)) { + ret.raw[i] = rounded - (signbit ? -1 : 1); + continue; + } + ret.raw[i] = rounded; + } + return ret; +} + +template <typename T, size_t N> +HWY_API Vec128<T, N> Trunc(Vec128<T, N> v) { + using TI = MakeSigned<T>; + const Vec128<T, N> abs = Abs(v); + for (size_t i = 0; i < N; ++i) { + if (!(abs.raw[i] <= MantissaEnd<T>())) { // Huge or NaN + continue; + } + const TI truncated = static_cast<TI>(v.raw[i]); + if (truncated == 0) { + v.raw[i] = v.raw[i] < 0 ? 
-T{0} : T{0}; + continue; + } + v.raw[i] = static_cast<T>(truncated); + } + return v; +} + +// Toward +infinity, aka ceiling +template <typename Float, size_t N> +Vec128<Float, N> Ceil(Vec128<Float, N> v) { + constexpr int kMantissaBits = MantissaBits<Float>(); + using Bits = MakeUnsigned<Float>; + const Bits kExponentMask = MaxExponentField<Float>(); + const Bits kMantissaMask = MantissaMask<Float>(); + const Bits kBias = kExponentMask / 2; + + for (size_t i = 0; i < N; ++i) { + const bool positive = v.raw[i] > Float(0.0); + + Bits bits; + CopySameSize(&v.raw[i], &bits); + + const int exponent = + static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias); + // Already an integer. + if (exponent >= kMantissaBits) continue; + // |v| <= 1 => 0 or 1. + if (exponent < 0) { + v.raw[i] = positive ? Float{1} : Float{-0.0}; + continue; + } + + const Bits mantissa_mask = kMantissaMask >> exponent; + // Already an integer + if ((bits & mantissa_mask) == 0) continue; + + // Clear fractional bits and round up + if (positive) bits += (kMantissaMask + 1) >> exponent; + bits &= ~mantissa_mask; + + CopySameSize(&bits, &v.raw[i]); + } + return v; +} + +// Toward -infinity, aka floor +template <typename Float, size_t N> +Vec128<Float, N> Floor(Vec128<Float, N> v) { + constexpr int kMantissaBits = MantissaBits<Float>(); + using Bits = MakeUnsigned<Float>; + const Bits kExponentMask = MaxExponentField<Float>(); + const Bits kMantissaMask = MantissaMask<Float>(); + const Bits kBias = kExponentMask / 2; + + for (size_t i = 0; i < N; ++i) { + const bool negative = v.raw[i] < Float(0.0); + + Bits bits; + CopySameSize(&v.raw[i], &bits); + + const int exponent = + static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias); + // Already an integer. + if (exponent >= kMantissaBits) continue; + // |v| <= 1 => -1 or 0. + if (exponent < 0) { + v.raw[i] = negative ? 
Float(-1.0) : Float(0.0); + continue; + } + + const Bits mantissa_mask = kMantissaMask >> exponent; + // Already an integer + if ((bits & mantissa_mask) == 0) continue; + + // Clear fractional bits and round down + if (negative) bits += (kMantissaMask + 1) >> exponent; + bits &= ~mantissa_mask; + + CopySameSize(&bits, &v.raw[i]); + } + return v; +} + +// ------------------------------ Floating-point classification + +template <typename T, size_t N> +HWY_API Mask128<T, N> IsNaN(Vec128<T, N> v) { + Mask128<T, N> ret; + for (size_t i = 0; i < N; ++i) { + // std::isnan returns false for 0x7F..FF in clang AVX3 builds, so DIY. + MakeUnsigned<T> bits; + CopySameSize(&v.raw[i], &bits); + bits += bits; + bits >>= 1; // clear sign bit + // NaN if all exponent bits are set and the mantissa is not zero. + ret.bits[i] = Mask128<T, N>::FromBool(bits > ExponentMask<T>()); + } + return ret; +} + +template <typename T, size_t N> +HWY_API Mask128<T, N> IsInf(Vec128<T, N> v) { + static_assert(IsFloat<T>(), "Only for float"); + const DFromV<decltype(v)> d; + const RebindToSigned<decltype(d)> di; + const VFromD<decltype(di)> vi = BitCast(di, v); + // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0. + return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2<T>()))); +} + +// Returns whether normal/subnormal/zero. +template <typename T, size_t N> +HWY_API Mask128<T, N> IsFinite(Vec128<T, N> v) { + static_assert(IsFloat<T>(), "Only for float"); + const DFromV<decltype(v)> d; + const RebindToUnsigned<decltype(d)> du; + const RebindToSigned<decltype(d)> di; // cheaper than unsigned comparison + using VI = VFromD<decltype(di)>; + using VU = VFromD<decltype(du)>; + const VU vu = BitCast(du, v); + // 'Shift left' to clear the sign bit, then right so we can compare with the + // max exponent (cannot compare with MaxExponentTimes2 directly because it is + // negative and non-negative floats would be greater). 
+ const VI exp = + BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu))); + return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>()))); +} + +// ================================================== COMPARE + +template <typename T, size_t N> +HWY_API Mask128<T, N> operator==(Vec128<T, N> a, Vec128<T, N> b) { + Mask128<T, N> m; + for (size_t i = 0; i < N; ++i) { + m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] == b.raw[i]); + } + return m; +} + +template <typename T, size_t N> +HWY_API Mask128<T, N> operator!=(Vec128<T, N> a, Vec128<T, N> b) { + Mask128<T, N> m; + for (size_t i = 0; i < N; ++i) { + m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] != b.raw[i]); + } + return m; +} + +template <typename T, size_t N> +HWY_API Mask128<T, N> TestBit(Vec128<T, N> v, Vec128<T, N> bit) { + static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported"); + return (v & bit) == bit; +} + +template <typename T, size_t N> +HWY_API Mask128<T, N> operator<(Vec128<T, N> a, Vec128<T, N> b) { + Mask128<T, N> m; + for (size_t i = 0; i < N; ++i) { + m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] < b.raw[i]); + } + return m; +} +template <typename T, size_t N> +HWY_API Mask128<T, N> operator>(Vec128<T, N> a, Vec128<T, N> b) { + Mask128<T, N> m; + for (size_t i = 0; i < N; ++i) { + m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] > b.raw[i]); + } + return m; +} + +template <typename T, size_t N> +HWY_API Mask128<T, N> operator<=(Vec128<T, N> a, Vec128<T, N> b) { + Mask128<T, N> m; + for (size_t i = 0; i < N; ++i) { + m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] <= b.raw[i]); + } + return m; +} +template <typename T, size_t N> +HWY_API Mask128<T, N> operator>=(Vec128<T, N> a, Vec128<T, N> b) { + Mask128<T, N> m; + for (size_t i = 0; i < N; ++i) { + m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] >= b.raw[i]); + } + return m; +} + +// ------------------------------ Lt128 + +// Only makes sense for full vectors of u64. 
+template <class D> +HWY_API MFromD<D> Lt128(D /* tag */, Vec128<uint64_t> a, Vec128<uint64_t> b) { + const bool lt = + (a.raw[1] < b.raw[1]) || (a.raw[1] == b.raw[1] && a.raw[0] < b.raw[0]); + Mask128<uint64_t> ret; + ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(lt); + return ret; +} + +template <class D> +HWY_API MFromD<D> Lt128Upper(D /* tag */, Vec128<uint64_t> a, + Vec128<uint64_t> b) { + const bool lt = a.raw[1] < b.raw[1]; + Mask128<uint64_t> ret; + ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(lt); + return ret; +} + +// ------------------------------ Eq128 + +// Only makes sense for full vectors of u64. +template <class D> +HWY_API MFromD<D> Eq128(D /* tag */, Vec128<uint64_t> a, Vec128<uint64_t> b) { + const bool eq = a.raw[1] == b.raw[1] && a.raw[0] == b.raw[0]; + Mask128<uint64_t> ret; + ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(eq); + return ret; +} + +template <class D> +HWY_API Mask128<uint64_t> Ne128(D /* tag */, Vec128<uint64_t> a, + Vec128<uint64_t> b) { + const bool ne = a.raw[1] != b.raw[1] || a.raw[0] != b.raw[0]; + Mask128<uint64_t> ret; + ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(ne); + return ret; +} + +template <class D> +HWY_API MFromD<D> Eq128Upper(D /* tag */, Vec128<uint64_t> a, + Vec128<uint64_t> b) { + const bool eq = a.raw[1] == b.raw[1]; + Mask128<uint64_t> ret; + ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(eq); + return ret; +} + +template <class D> +HWY_API MFromD<D> Ne128Upper(D /* tag */, Vec128<uint64_t> a, + Vec128<uint64_t> b) { + const bool ne = a.raw[1] != b.raw[1]; + Mask128<uint64_t> ret; + ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(ne); + return ret; +} + +// ------------------------------ Min128, Max128 (Lt128) + +template <class D> +HWY_API VFromD<D> Min128(D d, VFromD<D> a, VFromD<D> b) { + return IfThenElse(Lt128(d, a, b), a, b); +} + +template <class D> +HWY_API VFromD<D> Max128(D d, VFromD<D> a, VFromD<D> b) { + return IfThenElse(Lt128(d, b, 
a), a, b); +} + +template <class D> +HWY_API VFromD<D> Min128Upper(D d, VFromD<D> a, VFromD<D> b) { + return IfThenElse(Lt128Upper(d, a, b), a, b); +} + +template <class D> +HWY_API VFromD<D> Max128Upper(D d, VFromD<D> a, VFromD<D> b) { + return IfThenElse(Lt128Upper(d, b, a), a, b); +} + +// ================================================== MEMORY + +// ------------------------------ Load + +template <class D> +HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT aligned) { + VFromD<D> v; + CopyBytes<d.MaxBytes()>(aligned, v.raw); // copy from array + return v; +} + +template <class D> +HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d, + const TFromD<D>* HWY_RESTRICT p) { + return IfThenElseZero(m, LoadU(d, p)); +} + +template <class D> +HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d, + const TFromD<D>* HWY_RESTRICT p) { + return IfThenElse(m, LoadU(d, p), v); +} + +template <class D> +HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) { + return Load(d, p); +} + +// In some use cases, "load single lane" is sufficient; otherwise avoid this. +template <class D> +HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* HWY_RESTRICT aligned) { + return Load(d, aligned); +} + +// ------------------------------ Store + +template <class D> +HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) { + CopyBytes<d.MaxBytes()>(v.raw, aligned); // copy to array +} + +template <class D> +HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) { + Store(v, d, p); +} + +template <class D> +HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d, + TFromD<D>* HWY_RESTRICT p) { + for (size_t i = 0; i < MaxLanes(d); ++i) { + if (m.bits[i]) p[i] = v.raw[i]; + } +} + +// ------------------------------ LoadInterleaved2/3/4 + +// Per-target flag to prevent generic_ops-inl.h from defining LoadInterleaved2. +// We implement those here because scalar code is likely faster than emulation +// via shuffles. 
+#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED +#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED +#else +#define HWY_NATIVE_LOAD_STORE_INTERLEAVED +#endif + +template <class D, typename T = TFromD<D>> +HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned, + VFromD<D>& v0, VFromD<D>& v1) { + alignas(16) T buf0[MaxLanes(d)]; + alignas(16) T buf1[MaxLanes(d)]; + for (size_t i = 0; i < MaxLanes(d); ++i) { + buf0[i] = *unaligned++; + buf1[i] = *unaligned++; + } + v0 = Load(d, buf0); + v1 = Load(d, buf1); +} + +template <class D, typename T = TFromD<D>> +HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned, + VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) { + alignas(16) T buf0[MaxLanes(d)]; + alignas(16) T buf1[MaxLanes(d)]; + alignas(16) T buf2[MaxLanes(d)]; + for (size_t i = 0; i < MaxLanes(d); ++i) { + buf0[i] = *unaligned++; + buf1[i] = *unaligned++; + buf2[i] = *unaligned++; + } + v0 = Load(d, buf0); + v1 = Load(d, buf1); + v2 = Load(d, buf2); +} + +template <class D, typename T = TFromD<D>> +HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned, + VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2, + VFromD<D>& v3) { + alignas(16) T buf0[MaxLanes(d)]; + alignas(16) T buf1[MaxLanes(d)]; + alignas(16) T buf2[MaxLanes(d)]; + alignas(16) T buf3[MaxLanes(d)]; + for (size_t i = 0; i < MaxLanes(d); ++i) { + buf0[i] = *unaligned++; + buf1[i] = *unaligned++; + buf2[i] = *unaligned++; + buf3[i] = *unaligned++; + } + v0 = Load(d, buf0); + v1 = Load(d, buf1); + v2 = Load(d, buf2); + v3 = Load(d, buf3); +} + +// ------------------------------ StoreInterleaved2/3/4 + +template <class D> +HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d, + TFromD<D>* HWY_RESTRICT unaligned) { + for (size_t i = 0; i < MaxLanes(d); ++i) { + *unaligned++ = v0.raw[i]; + *unaligned++ = v1.raw[i]; + } +} + +template <class D> +HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d, + TFromD<D>* HWY_RESTRICT unaligned) { + for (size_t i = 0; i < 
MaxLanes(d); ++i) { + *unaligned++ = v0.raw[i]; + *unaligned++ = v1.raw[i]; + *unaligned++ = v2.raw[i]; + } +} + +template <class D> +HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, + VFromD<D> v3, D d, + TFromD<D>* HWY_RESTRICT unaligned) { + for (size_t i = 0; i < MaxLanes(d); ++i) { + *unaligned++ = v0.raw[i]; + *unaligned++ = v1.raw[i]; + *unaligned++ = v2.raw[i]; + *unaligned++ = v3.raw[i]; + } +} + +// ------------------------------ Stream +template <class D> +HWY_API void Stream(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) { + Store(v, d, aligned); +} + +// ------------------------------ Scatter + +template <class D, typename T = TFromD<D>, typename Offset> +HWY_API void ScatterOffset(VFromD<D> v, D d, T* base, + Vec128<Offset, HWY_MAX_LANES_D(D)> offset) { + static_assert(sizeof(T) == sizeof(Offset), "Index/lane size must match"); + for (size_t i = 0; i < MaxLanes(d); ++i) { + uint8_t* const base8 = reinterpret_cast<uint8_t*>(base) + offset.raw[i]; + CopyBytes<sizeof(T)>(&v.raw[i], base8); // copy to bytes + } +} + +template <class D, typename T = TFromD<D>, typename Index> +HWY_API void ScatterIndex(VFromD<D> v, D d, T* HWY_RESTRICT base, + Vec128<Index, HWY_MAX_LANES_D(D)> index) { + static_assert(sizeof(T) == sizeof(Index), "Index/lane size must match"); + for (size_t i = 0; i < MaxLanes(d); ++i) { + base[index.raw[i]] = v.raw[i]; + } +} + +// ------------------------------ Gather + +template <class D, typename T = TFromD<D>, typename Offset> +HWY_API VFromD<D> GatherOffset(D d, const T* base, + Vec128<Offset, HWY_MAX_LANES_D(D)> offset) { + static_assert(sizeof(T) == sizeof(Offset), "Index/lane size must match"); + VFromD<D> v; + for (size_t i = 0; i < MaxLanes(d); ++i) { + const uint8_t* base8 = + reinterpret_cast<const uint8_t*>(base) + offset.raw[i]; + CopyBytes<sizeof(T)>(base8, &v.raw[i]); // copy from bytes + } + return v; +} + +template <class D, typename T = TFromD<D>, typename Index> +HWY_API VFromD<D> 
GatherIndex(D d, const T* HWY_RESTRICT base, + Vec128<Index, HWY_MAX_LANES_D(D)> index) { + static_assert(sizeof(T) == sizeof(Index), "Index/lane size must match"); + VFromD<D> v; + for (size_t i = 0; i < MaxLanes(d); ++i) { + v.raw[i] = base[index.raw[i]]; + } + return v; +} + +// ================================================== CONVERT + +// ConvertTo and DemoteTo with floating-point input and integer output truncate +// (rounding toward zero). + +template <class DTo, typename TFrom, HWY_IF_NOT_SPECIAL_FLOAT(TFrom)> +HWY_API VFromD<DTo> PromoteTo(DTo d, Vec128<TFrom, HWY_MAX_LANES_D(DTo)> from) { + static_assert(sizeof(TFromD<DTo>) > sizeof(TFrom), "Not promoting"); + VFromD<DTo> ret; + for (size_t i = 0; i < MaxLanes(d); ++i) { + // For bits Y > X, floatX->floatY and intX->intY are always representable. + ret.raw[i] = static_cast<TFromD<DTo>>(from.raw[i]); + } + return ret; +} + +// MSVC 19.10 cannot deduce the argument type if HWY_IF_FLOAT(TFrom) is here, +// so we overload for TFrom=double and ToT={float,int32_t}. +template <class D, HWY_IF_F32_D(D)> +HWY_API VFromD<D> DemoteTo(D d, VFromD<Rebind<double, D>> from) { + VFromD<D> ret; + for (size_t i = 0; i < MaxLanes(d); ++i) { + // Prevent ubsan errors when converting float to narrower integer/float + if (std::isinf(from.raw[i]) || + std::fabs(from.raw[i]) > static_cast<double>(HighestValue<float>())) { + ret.raw[i] = std::signbit(from.raw[i]) ? LowestValue<float>() + : HighestValue<float>(); + continue; + } + ret.raw[i] = static_cast<float>(from.raw[i]); + } + return ret; +} +template <class D, HWY_IF_I32_D(D)> +HWY_API VFromD<D> DemoteTo(D d, VFromD<Rebind<double, D>> from) { + VFromD<D> ret; + for (size_t i = 0; i < MaxLanes(d); ++i) { + // Prevent ubsan errors when converting int32_t to narrower integer/int32_t + if (std::isinf(from.raw[i]) || + std::fabs(from.raw[i]) > static_cast<double>(HighestValue<int32_t>())) { + ret.raw[i] = std::signbit(from.raw[i]) ? 
LowestValue<int32_t>() + : HighestValue<int32_t>(); + continue; + } + ret.raw[i] = static_cast<int32_t>(from.raw[i]); + } + return ret; +} + +template <class DTo, typename TFrom, size_t N, HWY_IF_SIGNED(TFrom), + HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<DTo>)> +HWY_API VFromD<DTo> DemoteTo(DTo /* tag */, Vec128<TFrom, N> from) { + using TTo = TFromD<DTo>; + static_assert(sizeof(TTo) < sizeof(TFrom), "Not demoting"); + + VFromD<DTo> ret; + for (size_t i = 0; i < N; ++i) { + // Int to int: choose closest value in ToT to `from` (avoids UB) + from.raw[i] = + HWY_MIN(HWY_MAX(LimitsMin<TTo>(), from.raw[i]), LimitsMax<TTo>()); + ret.raw[i] = static_cast<TTo>(from.raw[i]); + } + return ret; +} + +template <class DTo, typename TFrom, size_t N, HWY_IF_UNSIGNED(TFrom), + HWY_IF_UNSIGNED_D(DTo)> +HWY_API VFromD<DTo> DemoteTo(DTo /* tag */, Vec128<TFrom, N> from) { + using TTo = TFromD<DTo>; + static_assert(sizeof(TTo) < sizeof(TFrom), "Not demoting"); + + VFromD<DTo> ret; + for (size_t i = 0; i < N; ++i) { + // Int to int: choose closest value in ToT to `from` (avoids UB) + from.raw[i] = HWY_MIN(from.raw[i], LimitsMax<TTo>()); + ret.raw[i] = static_cast<TTo>(from.raw[i]); + } + return ret; +} + +template <class DBF16, HWY_IF_BF16_D(DBF16), class VF32> +HWY_API VFromD<DBF16> ReorderDemote2To(DBF16 dbf16, VF32 a, VF32 b) { + const Repartition<uint32_t, decltype(dbf16)> du32; + const VFromD<decltype(du32)> b_in_lower = ShiftRight<16>(BitCast(du32, b)); + // Avoid OddEven - we want the upper half of `a` even on big-endian systems. 
+ const VFromD<decltype(du32)> a_mask = Set(du32, 0xFFFF0000); + return BitCast(dbf16, IfVecThenElse(a_mask, BitCast(du32, a), b_in_lower)); +} + +template <class DN, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<DN>), class V, + HWY_IF_SIGNED_V(V), HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2), + HWY_IF_LANES_D(DN, HWY_MAX_LANES_D(DFromV<V>) * 2)> +HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) { + const RepartitionToWide<decltype(dn)> dw; + const size_t NW = Lanes(dw); + using TN = TFromD<DN>; + const TN min = LimitsMin<TN>(); + const TN max = LimitsMax<TN>(); + VFromD<DN> ret; + for (size_t i = 0; i < NW; ++i) { + ret.raw[i] = static_cast<TN>(HWY_MIN(HWY_MAX(min, a.raw[i]), max)); + } + for (size_t i = 0; i < NW; ++i) { + ret.raw[NW + i] = static_cast<TN>(HWY_MIN(HWY_MAX(min, b.raw[i]), max)); + } + return ret; +} + +template <class DN, HWY_IF_UNSIGNED_D(DN), class V, HWY_IF_UNSIGNED_V(V), + HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2), + HWY_IF_LANES_D(DN, HWY_MAX_LANES_D(DFromV<V>) * 2)> +HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) { + const RepartitionToWide<decltype(dn)> dw; + const size_t NW = Lanes(dw); + using TN = TFromD<DN>; + const TN max = LimitsMax<TN>(); + VFromD<DN> ret; + for (size_t i = 0; i < NW; ++i) { + ret.raw[i] = static_cast<TN>(HWY_MIN(a.raw[i], max)); + } + for (size_t i = 0; i < NW; ++i) { + ret.raw[NW + i] = static_cast<TN>(HWY_MIN(b.raw[i], max)); + } + return ret; +} + +template <class DN, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<DN>), class V, + HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), + HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2), + HWY_IF_LANES_D(DN, HWY_MAX_LANES_D(DFromV<V>) * 2)> +HWY_API VFromD<DN> OrderedDemote2To(DN dn, V a, V b) { + return ReorderDemote2To(dn, a, b); +} + +template <class DN, HWY_IF_BF16_D(DN), class V, HWY_IF_F32_D(DFromV<V>), + HWY_IF_LANES_D(DN, HWY_MAX_LANES_D(DFromV<V>) * 2)> +HWY_API VFromD<DN> OrderedDemote2To(DN dn, V a, V b) { + const RebindToUnsigned<DFromV<decltype(a)>> du32; + const size_t NW = Lanes(du32); 
+ VFromD<Repartition<uint16_t, DN>> ret; + + const auto a_bits = BitCast(du32, a); + const auto b_bits = BitCast(du32, b); + + for (size_t i = 0; i < NW; ++i) { + ret.raw[i] = static_cast<uint16_t>(a_bits.raw[i] >> 16); + } + for (size_t i = 0; i < NW; ++i) { + ret.raw[NW + i] = static_cast<uint16_t>(b_bits.raw[i] >> 16); + } + return BitCast(dn, ret); +} + +namespace detail { + +HWY_INLINE void StoreU16ToF16(const uint16_t val, + hwy::float16_t* HWY_RESTRICT to) { + CopySameSize(&val, to); +} + +HWY_INLINE uint16_t U16FromF16(const hwy::float16_t* HWY_RESTRICT from) { + uint16_t bits16; + CopySameSize(from, &bits16); + return bits16; +} + +} // namespace detail + +template <class D, HWY_IF_F32_D(D), size_t N> +HWY_API VFromD<D> PromoteTo(D /* tag */, Vec128<float16_t, N> v) { + VFromD<D> ret; + for (size_t i = 0; i < N; ++i) { + const uint16_t bits16 = detail::U16FromF16(&v.raw[i]); + const uint32_t sign = static_cast<uint32_t>(bits16 >> 15); + const uint32_t biased_exp = (bits16 >> 10) & 0x1F; + const uint32_t mantissa = bits16 & 0x3FF; + + // Subnormal or zero + if (biased_exp == 0) { + const float subnormal = + (1.0f / 16384) * (static_cast<float>(mantissa) * (1.0f / 1024)); + ret.raw[i] = sign ? -subnormal : subnormal; + continue; + } + + // Normalized: convert the representation directly (faster than + // ldexp/tables). 
+ const uint32_t biased_exp32 = biased_exp + (127 - 15); + const uint32_t mantissa32 = mantissa << (23 - 10); + const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32; + CopySameSize(&bits32, &ret.raw[i]); + } + return ret; +} + +template <class D, HWY_IF_F32_D(D), size_t N> +HWY_API VFromD<D> PromoteTo(D /* tag */, Vec128<bfloat16_t, N> v) { + VFromD<D> ret; + for (size_t i = 0; i < N; ++i) { + ret.raw[i] = F32FromBF16(v.raw[i]); + } + return ret; +} + +template <class D, HWY_IF_F16_D(D), size_t N> +HWY_API VFromD<D> DemoteTo(D /* tag */, Vec128<float, N> v) { + VFromD<D> ret; + for (size_t i = 0; i < N; ++i) { + uint32_t bits32; + CopySameSize(&v.raw[i], &bits32); + const uint32_t sign = bits32 >> 31; + const uint32_t biased_exp32 = (bits32 >> 23) & 0xFF; + const uint32_t mantissa32 = bits32 & 0x7FFFFF; + + const int32_t exp = HWY_MIN(static_cast<int32_t>(biased_exp32) - 127, 15); + + // Tiny or zero => zero. + if (exp < -24) { + ZeroBytes<sizeof(uint16_t)>(&ret.raw[i]); + continue; + } + + uint32_t biased_exp16, mantissa16; + + // exp = [-24, -15] => subnormal + if (exp < -14) { + biased_exp16 = 0; + const uint32_t sub_exp = static_cast<uint32_t>(-14 - exp); + HWY_DASSERT(1 <= sub_exp && sub_exp < 11); + mantissa16 = static_cast<uint32_t>((1u << (10 - sub_exp)) + + (mantissa32 >> (13 + sub_exp))); + } else { + // exp = [-14, 15] + biased_exp16 = static_cast<uint32_t>(exp + 15); + HWY_DASSERT(1 <= biased_exp16 && biased_exp16 < 31); + mantissa16 = mantissa32 >> 13; + } + + HWY_DASSERT(mantissa16 < 1024); + const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16; + HWY_DASSERT(bits16 < 0x10000); + const uint16_t narrowed = static_cast<uint16_t>(bits16); // big-endian safe + detail::StoreU16ToF16(narrowed, &ret.raw[i]); + } + return ret; +} + +template <class D, HWY_IF_BF16_D(D), size_t N> +HWY_API VFromD<D> DemoteTo(D /* tag */, Vec128<float, N> v) { + VFromD<D> ret; + for (size_t i = 0; i < N; ++i) { + ret.raw[i] = 
BF16FromF32(v.raw[i]); + } + return ret; +} + +// Tag dispatch instead of SFINAE for MSVC 2017 compatibility +namespace detail { + +template <typename TFrom, typename DTo> +HWY_API VFromD<DTo> ConvertTo(hwy::FloatTag /*tag*/, DTo /*tag*/, + Vec128<TFrom, HWY_MAX_LANES_D(DTo)> from) { + using ToT = TFromD<DTo>; + static_assert(sizeof(ToT) == sizeof(TFrom), "Should have same size"); + VFromD<DTo> ret; + constexpr size_t N = HWY_MAX_LANES_D(DTo); + for (size_t i = 0; i < N; ++i) { + // float## -> int##: return closest representable value. We cannot exactly + // represent LimitsMax<ToT> in TFrom, so use double. + const double f = static_cast<double>(from.raw[i]); + if (std::isinf(from.raw[i]) || + std::fabs(f) > static_cast<double>(LimitsMax<ToT>())) { + ret.raw[i] = + std::signbit(from.raw[i]) ? LimitsMin<ToT>() : LimitsMax<ToT>(); + continue; + } + ret.raw[i] = static_cast<ToT>(from.raw[i]); + } + return ret; +} + +template <typename TFrom, typename DTo> +HWY_API VFromD<DTo> ConvertTo(hwy::NonFloatTag /*tag*/, DTo /* tag */, + Vec128<TFrom, HWY_MAX_LANES_D(DTo)> from) { + using ToT = TFromD<DTo>; + static_assert(sizeof(ToT) == sizeof(TFrom), "Should have same size"); + VFromD<DTo> ret; + constexpr size_t N = HWY_MAX_LANES_D(DTo); + for (size_t i = 0; i < N; ++i) { + // int## -> float##: no check needed + ret.raw[i] = static_cast<ToT>(from.raw[i]); + } + return ret; +} + +} // namespace detail + +template <class DTo, typename TFrom> +HWY_API VFromD<DTo> ConvertTo(DTo d, Vec128<TFrom, HWY_MAX_LANES_D(DTo)> from) { + return detail::ConvertTo(hwy::IsFloatTag<TFrom>(), d, from); +} + +template <size_t N> +HWY_API Vec128<uint8_t, N> U8FromU32(Vec128<uint32_t, N> v) { + return DemoteTo(Simd<uint8_t, N, 0>(), v); +} + +// ------------------------------ Truncations + +template <class D, HWY_IF_U8_D(D), size_t N> +HWY_API VFromD<D> TruncateTo(D /* tag */, Vec128<uint64_t, N> v) { + VFromD<D> ret; + for (size_t i = 0; i < N; ++i) { + ret.raw[i] = static_cast<uint8_t>(v.raw[i] & 
0xFF); + } + return ret; +} + +template <class D, HWY_IF_U16_D(D), size_t N> +HWY_API VFromD<D> TruncateTo(D /* tag */, Vec128<uint64_t, N> v) { + VFromD<D> ret; + for (size_t i = 0; i < N; ++i) { + ret.raw[i] = static_cast<uint16_t>(v.raw[i] & 0xFFFF); + } + return ret; +} + +template <class D, HWY_IF_U32_D(D), size_t N> +HWY_API VFromD<D> TruncateTo(D /* tag */, Vec128<uint64_t, N> v) { + VFromD<D> ret; + for (size_t i = 0; i < N; ++i) { + ret.raw[i] = static_cast<uint32_t>(v.raw[i] & 0xFFFFFFFFu); + } + return ret; +} + +template <class D, HWY_IF_U8_D(D), size_t N> +HWY_API VFromD<D> TruncateTo(D /* tag */, Vec128<uint32_t, N> v) { + VFromD<D> ret; + for (size_t i = 0; i < N; ++i) { + ret.raw[i] = static_cast<uint8_t>(v.raw[i] & 0xFF); + } + return ret; +} + +template <class D, HWY_IF_U16_D(D), size_t N> +HWY_API VFromD<D> TruncateTo(D /* tag */, Vec128<uint32_t, N> v) { + VFromD<D> ret; + for (size_t i = 0; i < N; ++i) { + ret.raw[i] = static_cast<uint16_t>(v.raw[i] & 0xFFFF); + } + return ret; +} + +template <class D, HWY_IF_U8_D(D), size_t N> +HWY_API VFromD<D> TruncateTo(D /* tag */, Vec128<uint16_t, N> v) { + VFromD<D> ret; + for (size_t i = 0; i < N; ++i) { + ret.raw[i] = static_cast<uint8_t>(v.raw[i] & 0xFF); + } + return ret; +} + +#ifdef HWY_NATIVE_ORDERED_TRUNCATE_2_TO +#undef HWY_NATIVE_ORDERED_TRUNCATE_2_TO +#else +#define HWY_NATIVE_ORDERED_TRUNCATE_2_TO +#endif + +template <class DN, HWY_IF_UNSIGNED_D(DN), class V, HWY_IF_UNSIGNED_V(V), + HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2), + HWY_IF_LANES_D(DN, HWY_MAX_LANES_D(DFromV<V>) * 2)> +HWY_API VFromD<DN> OrderedTruncate2To(DN dn, V a, V b) { + const RepartitionToWide<decltype(dn)> dw; + const size_t NW = Lanes(dw); + using TW = TFromD<decltype(dw)>; + using TN = TFromD<decltype(dn)>; + VFromD<DN> ret; + constexpr TW max_val{LimitsMax<TN>()}; + + for (size_t i = 0; i < NW; ++i) { + ret.raw[i] = static_cast<TN>(a.raw[i] & max_val); + } + for (size_t i = 0; i < NW; ++i) { + ret.raw[NW + i] = 
static_cast<TN>(b.raw[i] & max_val); + } + return ret; +} + +// ================================================== COMBINE + +template <typename T, size_t N> +HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) { + Vec128<T, N / 2> ret; + CopyBytes<N / 2 * sizeof(T)>(v.raw, ret.raw); + return ret; +} + +template <class D> +HWY_API VFromD<D> LowerHalf(D /* tag */, VFromD<Twice<D>> v) { + return LowerHalf(v); +} + +template <class D> +HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) { + VFromD<D> ret; + CopyBytes<d.MaxBytes()>(&v.raw[MaxLanes(d)], ret.raw); + return ret; +} + +template <class D> +HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> v) { + const Half<decltype(d)> dh; + VFromD<D> ret; // zero-initialized + CopyBytes<dh.MaxBytes()>(v.raw, ret.raw); + return ret; +} + +template <class D, class VH = VFromD<Half<D>>> +HWY_API VFromD<D> Combine(D d, VH hi_half, VH lo_half) { + const Half<decltype(d)> dh; + VFromD<D> ret; + CopyBytes<dh.MaxBytes()>(lo_half.raw, &ret.raw[0]); + CopyBytes<dh.MaxBytes()>(hi_half.raw, &ret.raw[MaxLanes(dh)]); + return ret; +} + +template <class D> +HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) { + const Half<decltype(d)> dh; + VFromD<D> ret; + CopyBytes<dh.MaxBytes()>(lo.raw, &ret.raw[0]); + CopyBytes<dh.MaxBytes()>(hi.raw, &ret.raw[MaxLanes(dh)]); + return ret; +} + +template <class D> +HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) { + const Half<decltype(d)> dh; + VFromD<D> ret; + CopyBytes<dh.MaxBytes()>(&lo.raw[MaxLanes(dh)], &ret.raw[0]); + CopyBytes<dh.MaxBytes()>(&hi.raw[MaxLanes(dh)], &ret.raw[MaxLanes(dh)]); + return ret; +} + +template <class D> +HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) { + const Half<decltype(d)> dh; + VFromD<D> ret; + CopyBytes<dh.MaxBytes()>(&lo.raw[MaxLanes(dh)], &ret.raw[0]); + CopyBytes<dh.MaxBytes()>(hi.raw, &ret.raw[MaxLanes(dh)]); + return ret; +} + +template <class D> +HWY_API VFromD<D> ConcatUpperLower(D d, 
VFromD<D> hi, VFromD<D> lo) { + const Half<decltype(d)> dh; + VFromD<D> ret; + CopyBytes<dh.MaxBytes()>(lo.raw, &ret.raw[0]); + CopyBytes<dh.MaxBytes()>(&hi.raw[MaxLanes(dh)], &ret.raw[MaxLanes(dh)]); + return ret; +} + +template <class D> +HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) { + const Half<decltype(d)> dh; + VFromD<D> ret; + for (size_t i = 0; i < MaxLanes(dh); ++i) { + ret.raw[i] = lo.raw[2 * i]; + } + for (size_t i = 0; i < MaxLanes(dh); ++i) { + ret.raw[MaxLanes(dh) + i] = hi.raw[2 * i]; + } + return ret; +} + +template <class D> +HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) { + const Half<decltype(d)> dh; + VFromD<D> ret; + for (size_t i = 0; i < MaxLanes(dh); ++i) { + ret.raw[i] = lo.raw[2 * i + 1]; + } + for (size_t i = 0; i < MaxLanes(dh); ++i) { + ret.raw[MaxLanes(dh) + i] = hi.raw[2 * i + 1]; + } + return ret; +} + +// ------------------------------ CombineShiftRightBytes +template <int kBytes, class D> +HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) { + VFromD<D> ret; + const uint8_t* HWY_RESTRICT lo8 = + reinterpret_cast<const uint8_t * HWY_RESTRICT>(lo.raw); + uint8_t* HWY_RESTRICT ret8 = + reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw); + CopyBytes<d.MaxBytes() - kBytes>(lo8 + kBytes, ret8); + CopyBytes<kBytes>(hi.raw, ret8 + d.MaxBytes() - kBytes); + return ret; +} + +// ------------------------------ ShiftLeftBytes + +template <int kBytes, class D> +HWY_API VFromD<D> ShiftLeftBytes(D d, VFromD<D> v) { + static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); + VFromD<D> ret; + uint8_t* HWY_RESTRICT ret8 = + reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw); + ZeroBytes<kBytes>(ret8); + CopyBytes<d.MaxBytes() - kBytes>(v.raw, ret8 + kBytes); + return ret; +} + +template <int kBytes, typename T, size_t N> +HWY_API Vec128<T, N> ShiftLeftBytes(Vec128<T, N> v) { + return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v); +} + +// ------------------------------ ShiftLeftLanes + 
+template <int kLanes, class D, typename T = TFromD<D>> +HWY_API VFromD<D> ShiftLeftLanes(D d, VFromD<D> v) { + const Repartition<uint8_t, decltype(d)> d8; + return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v))); +} + +template <int kLanes, typename T, size_t N> +HWY_API Vec128<T, N> ShiftLeftLanes(Vec128<T, N> v) { + return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v); +} + +// ------------------------------ ShiftRightBytes +template <int kBytes, class D> +HWY_API VFromD<D> ShiftRightBytes(D d, VFromD<D> v) { + static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); + VFromD<D> ret; + const uint8_t* HWY_RESTRICT v8 = + reinterpret_cast<const uint8_t * HWY_RESTRICT>(v.raw); + uint8_t* HWY_RESTRICT ret8 = + reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw); + CopyBytes<d.MaxBytes() - kBytes>(v8 + kBytes, ret8); + ZeroBytes<kBytes>(ret8 + d.MaxBytes() - kBytes); + return ret; +} + +// ------------------------------ ShiftRightLanes +template <int kLanes, class D> +HWY_API VFromD<D> ShiftRightLanes(D d, VFromD<D> v) { + const Repartition<uint8_t, decltype(d)> d8; + constexpr size_t kBytes = kLanes * sizeof(TFromD<D>); + return BitCast(d, ShiftRightBytes<kBytes>(d8, BitCast(d8, v))); +} + +// ================================================== SWIZZLE + +template <typename T, size_t N> +HWY_API T GetLane(Vec128<T, N> v) { + return v.raw[0]; +} + +template <typename T, size_t N> +HWY_API Vec128<T, N> InsertLane(Vec128<T, N> v, size_t i, T t) { + v.raw[i] = t; + return v; +} + +template <typename T, size_t N> +HWY_API T ExtractLane(Vec128<T, N> v, size_t i) { + return v.raw[i]; +} + +template <typename T, size_t N> +HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) { + for (size_t i = 0; i < N; i += 2) { + v.raw[i + 1] = v.raw[i]; + } + return v; +} + +template <typename T, size_t N> +HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) { + for (size_t i = 0; i < N; i += 2) { + v.raw[i] = v.raw[i + 1]; + } + return v; +} + +template <typename T, size_t N> 
+HWY_API Vec128<T, N> OddEven(Vec128<T, N> odd, Vec128<T, N> even) { + for (size_t i = 0; i < N; i += 2) { + odd.raw[i] = even.raw[i]; + } + return odd; +} + +template <typename T, size_t N> +HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) { + return even; +} + +// ------------------------------ SwapAdjacentBlocks +template <typename T, size_t N> +HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) { + return v; +} + +// ------------------------------ TableLookupLanes + +// Returned by SetTableIndices for use by TableLookupLanes. +template <typename T, size_t N> +struct Indices128 { + MakeSigned<T> raw[N]; +}; + +template <class D, typename TI, size_t N> +HWY_API Indices128<TFromD<D>, N> IndicesFromVec(D d, Vec128<TI, N> vec) { + static_assert(sizeof(TFromD<D>) == sizeof(TI), "Index/lane size must match"); + Indices128<TFromD<D>, N> ret; + CopyBytes<d.MaxBytes()>(vec.raw, ret.raw); + return ret; +} + +template <class D, typename TI> +HWY_API Indices128<TFromD<D>, HWY_MAX_LANES_D(D)> SetTableIndices( + D d, const TI* idx) { + return IndicesFromVec(d, LoadU(Rebind<TI, D>(), idx)); +} + +template <typename T, size_t N> +HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) { + Vec128<T, N> ret; + for (size_t i = 0; i < N; ++i) { + ret.raw[i] = v.raw[idx.raw[i]]; + } + return ret; +} + +template <typename T, size_t N> +HWY_API Vec128<T, N> TwoTablesLookupLanes(Vec128<T, N> a, Vec128<T, N> b, + Indices128<T, N> idx) { + using TI = MakeSigned<T>; + Vec128<T, N> ret; + constexpr TI kVecLaneIdxMask = static_cast<TI>(N - 1); + for (size_t i = 0; i < N; ++i) { + const auto src_idx = idx.raw[i]; + const auto masked_src_lane_idx = src_idx & kVecLaneIdxMask; + ret.raw[i] = (src_idx < static_cast<TI>(N)) ? 
a.raw[masked_src_lane_idx] + : b.raw[masked_src_lane_idx]; + } + return ret; +} + +// ------------------------------ ReverseBlocks +template <class D> +HWY_API VFromD<D> ReverseBlocks(D /* tag */, VFromD<D> v) { + return v; // Single block: no change +} + +// ------------------------------ Reverse + +template <class D> +HWY_API VFromD<D> Reverse(D d, VFromD<D> v) { + VFromD<D> ret; + for (size_t i = 0; i < MaxLanes(d); ++i) { + ret.raw[i] = v.raw[MaxLanes(d) - 1 - i]; + } + return ret; +} + +// Per-target flag to prevent generic_ops-inl.h defining 8-bit Reverse2/4/8. +#ifdef HWY_NATIVE_REVERSE2_8 +#undef HWY_NATIVE_REVERSE2_8 +#else +#define HWY_NATIVE_REVERSE2_8 +#endif + +template <class D> +HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) { + VFromD<D> ret; + for (size_t i = 0; i < MaxLanes(d); i += 2) { + ret.raw[i + 0] = v.raw[i + 1]; + ret.raw[i + 1] = v.raw[i + 0]; + } + return ret; +} + +template <class D> +HWY_API VFromD<D> Reverse4(D d, VFromD<D> v) { + VFromD<D> ret; + for (size_t i = 0; i < MaxLanes(d); i += 4) { + ret.raw[i + 0] = v.raw[i + 3]; + ret.raw[i + 1] = v.raw[i + 2]; + ret.raw[i + 2] = v.raw[i + 1]; + ret.raw[i + 3] = v.raw[i + 0]; + } + return ret; +} + +template <class D> +HWY_API VFromD<D> Reverse8(D d, VFromD<D> v) { + VFromD<D> ret; + for (size_t i = 0; i < MaxLanes(d); i += 8) { + ret.raw[i + 0] = v.raw[i + 7]; + ret.raw[i + 1] = v.raw[i + 6]; + ret.raw[i + 2] = v.raw[i + 5]; + ret.raw[i + 3] = v.raw[i + 4]; + ret.raw[i + 4] = v.raw[i + 3]; + ret.raw[i + 5] = v.raw[i + 2]; + ret.raw[i + 6] = v.raw[i + 1]; + ret.raw[i + 7] = v.raw[i + 0]; + } + return ret; +} + +// ================================================== BLOCKWISE + +// ------------------------------ Shuffle* + +// Swap 32-bit halves in 64-bit halves. 
+template <typename T, size_t N> +HWY_API Vec128<T, N> Shuffle2301(Vec128<T, N> v) { + static_assert(sizeof(T) == 4, "Only for 32-bit"); + static_assert(N == 2 || N == 4, "Does not make sense for N=1"); + return Reverse2(DFromV<decltype(v)>(), v); +} + +// Swap 64-bit halves +template <typename T> +HWY_API Vec128<T> Shuffle1032(Vec128<T> v) { + static_assert(sizeof(T) == 4, "Only for 32-bit"); + Vec128<T> ret; + ret.raw[3] = v.raw[1]; + ret.raw[2] = v.raw[0]; + ret.raw[1] = v.raw[3]; + ret.raw[0] = v.raw[2]; + return ret; +} +template <typename T> +HWY_API Vec128<T> Shuffle01(Vec128<T> v) { + static_assert(sizeof(T) == 8, "Only for 64-bit"); + return Reverse2(DFromV<decltype(v)>(), v); +} + +// Rotate right 32 bits +template <typename T> +HWY_API Vec128<T> Shuffle0321(Vec128<T> v) { + Vec128<T> ret; + ret.raw[3] = v.raw[0]; + ret.raw[2] = v.raw[3]; + ret.raw[1] = v.raw[2]; + ret.raw[0] = v.raw[1]; + return ret; +} + +// Rotate left 32 bits +template <typename T> +HWY_API Vec128<T> Shuffle2103(Vec128<T> v) { + Vec128<T> ret; + ret.raw[3] = v.raw[2]; + ret.raw[2] = v.raw[1]; + ret.raw[1] = v.raw[0]; + ret.raw[0] = v.raw[3]; + return ret; +} + +template <typename T> +HWY_API Vec128<T> Shuffle0123(Vec128<T> v) { + return Reverse4(DFromV<decltype(v)>(), v); +} + +// ------------------------------ Broadcast +template <int kLane, typename T, size_t N> +HWY_API Vec128<T, N> Broadcast(Vec128<T, N> v) { + for (size_t i = 0; i < N; ++i) { + v.raw[i] = v.raw[kLane]; + } + return v; +} + +// ------------------------------ TableLookupBytes, TableLookupBytesOr0 + +template <typename T, size_t N, typename TI, size_t NI> +HWY_API Vec128<TI, NI> TableLookupBytes(Vec128<T, N> v, + Vec128<TI, NI> indices) { + const uint8_t* HWY_RESTRICT v_bytes = + reinterpret_cast<const uint8_t * HWY_RESTRICT>(v.raw); + const uint8_t* HWY_RESTRICT idx_bytes = + reinterpret_cast<const uint8_t*>(indices.raw); + Vec128<TI, NI> ret; + uint8_t* HWY_RESTRICT ret_bytes = + reinterpret_cast<uint8_t * 
HWY_RESTRICT>(ret.raw); + for (size_t i = 0; i < NI * sizeof(TI); ++i) { + const size_t idx = idx_bytes[i]; + // Avoid out of bounds reads. + ret_bytes[i] = idx < sizeof(T) * N ? v_bytes[idx] : 0; + } + return ret; +} + +template <typename T, size_t N, typename TI, size_t NI> +HWY_API Vec128<TI, NI> TableLookupBytesOr0(Vec128<T, N> v, + Vec128<TI, NI> indices) { + // Same as TableLookupBytes, which already returns 0 if out of bounds. + return TableLookupBytes(v, indices); +} + +// ------------------------------ InterleaveLower/InterleaveUpper + +template <typename T, size_t N> +HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) { + Vec128<T, N> ret; + for (size_t i = 0; i < N / 2; ++i) { + ret.raw[2 * i + 0] = a.raw[i]; + ret.raw[2 * i + 1] = b.raw[i]; + } + return ret; +} + +// Additional overload for the optional tag. +template <class V> +HWY_API V InterleaveLower(DFromV<V> /* tag */, V a, V b) { + return InterleaveLower(a, b); +} + +template <class D> +HWY_API VFromD<D> InterleaveUpper(D d, VFromD<D> a, VFromD<D> b) { + const Half<decltype(d)> dh; + VFromD<D> ret; + for (size_t i = 0; i < MaxLanes(dh); ++i) { + ret.raw[2 * i + 0] = a.raw[MaxLanes(dh) + i]; + ret.raw[2 * i + 1] = b.raw[MaxLanes(dh) + i]; + } + return ret; +} + +// ------------------------------ ZipLower/ZipUpper (InterleaveLower) + +// Same as Interleave*, except that the return lanes are double-width integers; +// this is necessary because the single-lane scalar cannot return two values. 
+template <class V, class DW = RepartitionToWide<DFromV<V>>> +HWY_API VFromD<DW> ZipLower(V a, V b) { + return BitCast(DW(), InterleaveLower(a, b)); +} +template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>> +HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) { + return BitCast(dw, InterleaveLower(D(), a, b)); +} + +template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>> +HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) { + return BitCast(dw, InterleaveUpper(D(), a, b)); +} + +// ================================================== MASK + +template <class D> +HWY_API bool AllFalse(D d, MFromD<D> mask) { + typename MFromD<D>::Raw or_sum = 0; + for (size_t i = 0; i < MaxLanes(d); ++i) { + or_sum |= mask.bits[i]; + } + return or_sum == 0; +} + +template <class D> +HWY_API bool AllTrue(D d, MFromD<D> mask) { + constexpr uint64_t kAll = LimitsMax<typename MFromD<D>::Raw>(); + uint64_t and_sum = kAll; + for (size_t i = 0; i < MaxLanes(d); ++i) { + and_sum &= mask.bits[i]; + } + return and_sum == kAll; +} + +// `p` points to at least 8 readable bytes, not all of which need be valid. +template <class D> +HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) { + MFromD<D> m; + for (size_t i = 0; i < MaxLanes(d); ++i) { + const size_t bit = size_t{1} << (i & 7); + const size_t idx_byte = i >> 3; + m.bits[i] = MFromD<D>::FromBool((bits[idx_byte] & bit) != 0); + } + return m; +} + +// `p` points to at least 8 writable bytes. +template <class D> +HWY_API size_t StoreMaskBits(D d, MFromD<D> mask, uint8_t* bits) { + bits[0] = 0; + if (MaxLanes(d) > 8) bits[1] = 0; // MaxLanes(d) <= 16, so max two bytes + for (size_t i = 0; i < MaxLanes(d); ++i) { + const size_t bit = size_t{1} << (i & 7); + const size_t idx_byte = i >> 3; + if (mask.bits[i]) { + bits[idx_byte] = static_cast<uint8_t>(bits[idx_byte] | bit); + } + } + return MaxLanes(d) > 8 ? 
2 : 1; +} + +template <class D> +HWY_API size_t CountTrue(D d, MFromD<D> mask) { + size_t count = 0; + for (size_t i = 0; i < MaxLanes(d); ++i) { + count += mask.bits[i] != 0; + } + return count; +} + +template <class D> +HWY_API size_t FindKnownFirstTrue(D d, MFromD<D> mask) { + for (size_t i = 0; i < MaxLanes(d); ++i) { + if (mask.bits[i] != 0) return i; + } + HWY_DASSERT(false); + return 0; +} + +template <class D> +HWY_API intptr_t FindFirstTrue(D d, MFromD<D> mask) { + for (size_t i = 0; i < MaxLanes(d); ++i) { + if (mask.bits[i] != 0) return static_cast<intptr_t>(i); + } + return intptr_t{-1}; +} + +template <class D> +HWY_API size_t FindKnownLastTrue(D d, MFromD<D> mask) { + for (intptr_t i = static_cast<intptr_t>(MaxLanes(d) - 1); i >= 0; i--) { + if (mask.bits[i] != 0) return static_cast<size_t>(i); + } + HWY_DASSERT(false); + return 0; +} + +template <class D> +HWY_API intptr_t FindLastTrue(D d, MFromD<D> mask) { + for (intptr_t i = static_cast<intptr_t>(MaxLanes(d) - 1); i >= 0; i--) { + if (mask.bits[i] != 0) return i; + } + return intptr_t{-1}; +} + +// ------------------------------ Compress + +template <typename T> +struct CompressIsPartition { + enum { value = (sizeof(T) != 1) }; +}; + +template <typename T, size_t N> +HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) { + size_t count = 0; + Vec128<T, N> ret; + for (size_t i = 0; i < N; ++i) { + if (mask.bits[i]) { + ret.raw[count++] = v.raw[i]; + } + } + for (size_t i = 0; i < N; ++i) { + if (!mask.bits[i]) { + ret.raw[count++] = v.raw[i]; + } + } + HWY_DASSERT(count == N); + return ret; +} + +// ------------------------------ Expand + +// Could also just allow generic_ops-inl.h to implement these, but use our +// simple implementation below to ensure the test is correct. 
+#ifdef HWY_NATIVE_EXPAND +#undef HWY_NATIVE_EXPAND +#else +#define HWY_NATIVE_EXPAND +#endif + +template <typename T, size_t N> +HWY_API Vec128<T, N> Expand(Vec128<T, N> v, const Mask128<T, N> mask) { + size_t in_pos = 0; + Vec128<T, N> ret; + for (size_t i = 0; i < N; ++i) { + if (mask.bits[i]) { + ret.raw[i] = v.raw[in_pos++]; + } else { + ret.raw[i] = T(); // zero, also works for float16_t + } + } + return ret; +} + +// ------------------------------ LoadExpand + +template <class D> +HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d, + const TFromD<D>* HWY_RESTRICT unaligned) { + size_t in_pos = 0; + VFromD<D> ret; + for (size_t i = 0; i < Lanes(d); ++i) { + if (mask.bits[i]) { + ret.raw[i] = unaligned[in_pos++]; + } else { + ret.raw[i] = TFromD<D>(); // zero, also works for float16_t + } + } + return ret; +} + +// ------------------------------ CompressNot +template <typename T, size_t N> +HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) { + size_t count = 0; + Vec128<T, N> ret; + for (size_t i = 0; i < N; ++i) { + if (!mask.bits[i]) { + ret.raw[count++] = v.raw[i]; + } + } + for (size_t i = 0; i < N; ++i) { + if (mask.bits[i]) { + ret.raw[count++] = v.raw[i]; + } + } + HWY_DASSERT(count == N); + return ret; +} + +// ------------------------------ CompressBlocksNot +HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v, + Mask128<uint64_t> /* m */) { + return v; +} + +// ------------------------------ CompressBits +template <typename T, size_t N> +HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v, + const uint8_t* HWY_RESTRICT bits) { + return Compress(v, LoadMaskBits(Simd<T, N, 0>(), bits)); +} + +// ------------------------------ CompressStore + +// generic_ops-inl defines the 8-bit versions. 
+template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)> +HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> mask, D d, + TFromD<D>* HWY_RESTRICT unaligned) { + size_t count = 0; + for (size_t i = 0; i < MaxLanes(d); ++i) { + if (mask.bits[i]) { + unaligned[count++] = v.raw[i]; + } + } + return count; +} + +// ------------------------------ CompressBlendedStore +template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)> +HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> mask, D d, + TFromD<D>* HWY_RESTRICT unaligned) { + return CompressStore(v, mask, d, unaligned); +} + +// ------------------------------ CompressBitsStore +template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)> +HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits, + D d, TFromD<D>* HWY_RESTRICT unaligned) { + const MFromD<D> mask = LoadMaskBits(d, bits); + StoreU(Compress(v, mask), d, unaligned); + return CountTrue(d, mask); +} + +// ------------------------------ WidenMulPairwiseAdd + +template <class D, HWY_IF_F32_D(D), class VBF16> +HWY_API VFromD<D> WidenMulPairwiseAdd(D df32, VBF16 a, VBF16 b) { + const Rebind<uint32_t, decltype(df32)> du32; + using VU32 = VFromD<decltype(du32)>; + const VU32 odd = Set(du32, 0xFFFF0000u); // bfloat16 is the upper half of f32 + // Avoid ZipLower/Upper so this also works on big-endian systems. + const VU32 ae = ShiftLeft<16>(BitCast(du32, a)); + const VU32 ao = And(BitCast(du32, a), odd); + const VU32 be = ShiftLeft<16>(BitCast(du32, b)); + const VU32 bo = And(BitCast(du32, b), odd); + return Mul(BitCast(df32, ae), BitCast(df32, be)) + Mul(BitCast(df32, ao), BitCast(df32, bo)); +} + +template <class D, HWY_IF_I32_D(D), class VI16> +HWY_API VFromD<D> WidenMulPairwiseAdd(D d32, VI16 a, VI16 b) { + using VI32 = VFromD<decltype(d32)>; + // Manual sign extension requires two shifts for even lanes. 
+ const VI32 ae = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, a))); + const VI32 be = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, b))); + const VI32 ao = ShiftRight<16>(BitCast(d32, a)); + const VI32 bo = ShiftRight<16>(BitCast(d32, b)); + return Add(Mul(ae, be), Mul(ao, bo)); +} + +// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower) + +template <class D, HWY_IF_F32_D(D), size_t N, class VBF16> +HWY_API VFromD<D> ReorderWidenMulAccumulate(D df32, VBF16 a, VBF16 b, + const Vec128<float, N> sum0, + Vec128<float, N>& sum1) { + const Rebind<uint32_t, decltype(df32)> du32; + using VU32 = VFromD<decltype(du32)>; + const VU32 odd = Set(du32, 0xFFFF0000u); // bfloat16 is the upper half of f32 + // Avoid ZipLower/Upper so this also works on big-endian systems. + const VU32 ae = ShiftLeft<16>(BitCast(du32, a)); + const VU32 ao = And(BitCast(du32, a), odd); + const VU32 be = ShiftLeft<16>(BitCast(du32, b)); + const VU32 bo = And(BitCast(du32, b), odd); + sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1); + return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0); +} + +template <class D, HWY_IF_I32_D(D), size_t N, class VI16> +HWY_API VFromD<D> ReorderWidenMulAccumulate(D d32, VI16 a, VI16 b, + const Vec128<int32_t, N> sum0, + Vec128<int32_t, N>& sum1) { + using VI32 = VFromD<decltype(d32)>; + // Manual sign extension requires two shifts for even lanes. 
+ const VI32 ae = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, a))); + const VI32 be = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, b))); + const VI32 ao = ShiftRight<16>(BitCast(d32, a)); + const VI32 bo = ShiftRight<16>(BitCast(d32, b)); + sum1 = Add(Mul(ao, bo), sum1); + return Add(Mul(ae, be), sum0); +} + +// ------------------------------ RearrangeToOddPlusEven +template <class VW> +HWY_API VW RearrangeToOddPlusEven(VW sum0, VW sum1) { + return Add(sum0, sum1); +} + +// ================================================== REDUCTIONS + +template <class D, typename T = TFromD<D>> +HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) { + T sum = T{0}; + for (size_t i = 0; i < MaxLanes(d); ++i) { + sum += v.raw[i]; + } + return Set(d, sum); +} +template <class D, typename T = TFromD<D>> +HWY_API T ReduceSum(D d, VFromD<D> v) { + T sum = T{0}; + for (size_t i = 0; i < MaxLanes(d); ++i) { + sum += v.raw[i]; + } + return sum; +} +template <class D, typename T = TFromD<D>> +HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) { + T min = HighestValue<T>(); + for (size_t i = 0; i < MaxLanes(d); ++i) { + min = HWY_MIN(min, v.raw[i]); + } + return Set(d, min); +} +template <class D, typename T = TFromD<D>> +HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) { + T max = LowestValue<T>(); + for (size_t i = 0; i < MaxLanes(d); ++i) { + max = HWY_MAX(max, v.raw[i]); + } + return Set(d, max); +} + +// ================================================== OPS WITH DEPENDENCIES + +// ------------------------------ MulEven/Odd 64x64 (UpperHalf) + +HWY_INLINE Vec128<uint64_t> MulEven(Vec128<uint64_t> a, Vec128<uint64_t> b) { + alignas(16) uint64_t mul[2]; + mul[0] = Mul128(GetLane(a), GetLane(b), &mul[1]); + return Load(Full128<uint64_t>(), mul); +} + +HWY_INLINE Vec128<uint64_t> MulOdd(Vec128<uint64_t> a, Vec128<uint64_t> b) { + alignas(16) uint64_t mul[2]; + const Half<Full128<uint64_t>> d2; + mul[0] = + Mul128(GetLane(UpperHalf(d2, a)), GetLane(UpperHalf(d2, b)), &mul[1]); + return 
Load(Full128<uint64_t>(), mul); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); diff --git a/third_party/highway/hwy/ops/generic_ops-inl.h b/third_party/highway/hwy/ops/generic_ops-inl.h new file mode 100644 index 0000000000..74074e08fa --- /dev/null +++ b/third_party/highway/hwy/ops/generic_ops-inl.h @@ -0,0 +1,3190 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Target-independent types/functions defined after target-specific ops. + +#include "hwy/base.h" + +// Define detail::Shuffle1230 etc, but only when viewing the current header; +// normally this is included via highway.h, which includes ops/*.h. +#if HWY_IDE && !defined(HWY_HIGHWAY_INCLUDED) +#include "hwy/detect_targets.h" +#include "hwy/ops/emu128-inl.h" +#endif // HWY_IDE + +// Relies on the external include guard in highway.h. +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +// The lane type of a vector type, e.g. float for Vec<ScalableTag<float>>. +template <class V> +using LaneType = decltype(GetLane(V())); + +// Vector type, e.g. Vec128<float> for CappedTag<float, 4>. Useful as the return +// type of functions that do not take a vector argument, or as an argument type +// if the function only has a template argument for D, or for explicit type +// names instead of auto. This may be a built-in type. 
+template <class D> +using Vec = decltype(Zero(D())); + +// Mask type. Useful as the return type of functions that do not take a mask +// argument, or as an argument type if the function only has a template argument +// for D, or for explicit type names instead of auto. +template <class D> +using Mask = decltype(MaskFromVec(Zero(D()))); + +// Returns the closest value to v within [lo, hi]. +template <class V> +HWY_API V Clamp(const V v, const V lo, const V hi) { + return Min(Max(lo, v), hi); +} + +// CombineShiftRightBytes (and -Lanes) are not available for the scalar target, +// and RVV has its own implementation of -Lanes. +#if HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_RVV + +template <size_t kLanes, class D> +HWY_API VFromD<D> CombineShiftRightLanes(D d, VFromD<D> hi, VFromD<D> lo) { + constexpr size_t kBytes = kLanes * sizeof(TFromD<D>); + static_assert(kBytes < 16, "Shift count is per-block"); + return CombineShiftRightBytes<kBytes>(d, hi, lo); +} + +#endif + +// Returns lanes with the most significant bit set and all other bits zero. +template <class D> +HWY_API Vec<D> SignBit(D d) { + const RebindToUnsigned<decltype(d)> du; + return BitCast(d, Set(du, SignMask<TFromD<D>>())); +} + +// Returns quiet NaN. +template <class D> +HWY_API Vec<D> NaN(D d) { + const RebindToSigned<D> di; + // LimitsMax sets all exponent and mantissa bits to 1. The exponent plus + // mantissa MSB (to indicate quiet) would be sufficient. + return BitCast(d, Set(di, LimitsMax<TFromD<decltype(di)>>())); +} + +// Returns positive infinity. 
+template <class D> +HWY_API Vec<D> Inf(D d) { + const RebindToUnsigned<D> du; + using T = TFromD<D>; + using TU = TFromD<decltype(du)>; + const TU max_x2 = static_cast<TU>(MaxExponentTimes2<T>()); + return BitCast(d, Set(du, max_x2 >> 1)); +} + +// ------------------------------ ZeroExtendResizeBitCast + +// The implementation of detail::ZeroExtendResizeBitCast for the HWY_EMU128 +// target is in emu128-inl.h, and the implementation of +// detail::ZeroExtendResizeBitCast for the HWY_SCALAR target is in scalar-inl.h +#if HWY_TARGET != HWY_EMU128 && HWY_TARGET != HWY_SCALAR +namespace detail { + +#if HWY_HAVE_SCALABLE +template <size_t kFromVectSize, size_t kToVectSize, class DTo, class DFrom> +HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast( + hwy::SizeTag<kFromVectSize> /* from_size_tag */, + hwy::SizeTag<kToVectSize> /* to_size_tag */, DTo d_to, DFrom d_from, + VFromD<DFrom> v) { + using TFrom = TFromD<DFrom>; + using TTo = TFromD<DTo>; + using TResize = UnsignedFromSize<HWY_MIN(sizeof(TFrom), sizeof(TTo))>; + + const Repartition<TResize, decltype(d_from)> d_resize_from; + const Repartition<TResize, decltype(d_to)> d_resize_to; + return BitCast(d_to, IfThenElseZero(FirstN(d_resize_to, Lanes(d_resize_from)), + ResizeBitCast(d_resize_to, v))); +} +#else // target that uses fixed-size vectors +// Truncating or same-size resizing cast: same as ResizeBitCast +template <size_t kFromVectSize, size_t kToVectSize, class DTo, class DFrom, + HWY_IF_LANES_LE(kToVectSize, kFromVectSize)> +HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast( + hwy::SizeTag<kFromVectSize> /* from_size_tag */, + hwy::SizeTag<kToVectSize> /* to_size_tag */, DTo d_to, DFrom /*d_from*/, + VFromD<DFrom> v) { + return ResizeBitCast(d_to, v); +} + +// Resizing cast to vector that has twice the number of lanes of the source +// vector +template <size_t kFromVectSize, size_t kToVectSize, class DTo, class DFrom, + HWY_IF_LANES(kToVectSize, kFromVectSize * 2)> +HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast( + 
hwy::SizeTag<kFromVectSize> /* from_size_tag */, + hwy::SizeTag<kToVectSize> /* to_size_tag */, DTo d_to, DFrom d_from, + VFromD<DFrom> v) { + const Twice<decltype(d_from)> dt_from; + return BitCast(d_to, ZeroExtendVector(dt_from, v)); +} + +// Resizing cast to vector that has more than twice the number of lanes of the +// source vector +template <size_t kFromVectSize, size_t kToVectSize, class DTo, class DFrom, + HWY_IF_LANES_GT(kToVectSize, kFromVectSize * 2)> +HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast( + hwy::SizeTag<kFromVectSize> /* from_size_tag */, + hwy::SizeTag<kToVectSize> /* to_size_tag */, DTo d_to, DFrom /*d_from*/, + VFromD<DFrom> v) { + using TFrom = TFromD<DFrom>; + constexpr size_t kNumOfFromLanes = kFromVectSize / sizeof(TFrom); + const Repartition<TFrom, decltype(d_to)> d_resize_to; + return BitCast(d_to, IfThenElseZero(FirstN(d_resize_to, kNumOfFromLanes), + ResizeBitCast(d_resize_to, v))); +} +#endif // HWY_HAVE_SCALABLE + +} // namespace detail +#endif // HWY_TARGET != HWY_EMU128 && HWY_TARGET != HWY_SCALAR + +template <class DTo, class DFrom> +HWY_API VFromD<DTo> ZeroExtendResizeBitCast(DTo d_to, DFrom d_from, + VFromD<DFrom> v) { + return detail::ZeroExtendResizeBitCast(hwy::SizeTag<d_from.MaxBytes()>(), + hwy::SizeTag<d_to.MaxBytes()>(), d_to, + d_from, v); +} + +// ------------------------------ SafeFillN + +template <class D, typename T = TFromD<D>> +HWY_API void SafeFillN(const size_t num, const T value, D d, + T* HWY_RESTRICT to) { +#if HWY_MEM_OPS_MIGHT_FAULT + (void)d; + for (size_t i = 0; i < num; ++i) { + to[i] = value; + } +#else + BlendedStore(Set(d, value), FirstN(d, num), d, to); +#endif +} + +// ------------------------------ SafeCopyN + +template <class D, typename T = TFromD<D>> +HWY_API void SafeCopyN(const size_t num, D d, const T* HWY_RESTRICT from, + T* HWY_RESTRICT to) { +#if HWY_MEM_OPS_MIGHT_FAULT + (void)d; + for (size_t i = 0; i < num; ++i) { + to[i] = from[i]; + } +#else + const Mask<D> mask = FirstN(d, num); + 
BlendedStore(MaskedLoad(mask, d, from), mask, d, to); +#endif +} + +// ------------------------------ BitwiseIfThenElse +#if (defined(HWY_NATIVE_BITWISE_IF_THEN_ELSE) == defined(HWY_TARGET_TOGGLE)) +#ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE +#undef HWY_NATIVE_BITWISE_IF_THEN_ELSE +#else +#define HWY_NATIVE_BITWISE_IF_THEN_ELSE +#endif + +template <class V> +HWY_API V BitwiseIfThenElse(V mask, V yes, V no) { + return Or(And(mask, yes), AndNot(mask, no)); +} + +#endif // HWY_NATIVE_BITWISE_IF_THEN_ELSE + +// "Include guard": skip if native instructions are available. The generic +// implementation is currently shared between x86_* and wasm_*, and is too large +// to duplicate. + +#if HWY_IDE || \ + (defined(HWY_NATIVE_LOAD_STORE_INTERLEAVED) == defined(HWY_TARGET_TOGGLE)) +#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED +#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED +#else +#define HWY_NATIVE_LOAD_STORE_INTERLEAVED +#endif + +// ------------------------------ LoadInterleaved2 + +template <class D, HWY_IF_LANES_GT_D(D, 1)> +HWY_API void LoadInterleaved2(D d, const TFromD<D>* HWY_RESTRICT unaligned, + VFromD<D>& v0, VFromD<D>& v1) { + const VFromD<D> A = LoadU(d, unaligned); // v1[1] v0[1] v1[0] v0[0] + const VFromD<D> B = LoadU(d, unaligned + Lanes(d)); + v0 = ConcatEven(d, B, A); + v1 = ConcatOdd(d, B, A); +} + +template <class D, HWY_IF_LANES_D(D, 1)> +HWY_API void LoadInterleaved2(D d, const TFromD<D>* HWY_RESTRICT unaligned, + VFromD<D>& v0, VFromD<D>& v1) { + v0 = LoadU(d, unaligned + 0); + v1 = LoadU(d, unaligned + 1); +} + +// ------------------------------ LoadInterleaved3 (CombineShiftRightBytes) + +namespace detail { + +#if HWY_IDE +template <class V> +HWY_INLINE V ShuffleTwo1230(V a, V /* b */) { + return a; +} +template <class V> +HWY_INLINE V ShuffleTwo2301(V a, V /* b */) { + return a; +} +template <class V> +HWY_INLINE V ShuffleTwo3012(V a, V /* b */) { + return a; +} +#endif // HWY_IDE + +// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload. 
// Loads three consecutive vectors (A, B, C) starting at `unaligned`, each
// MaxLanes(d) elements after the previous one. No de-interleaving happens
// here; that is done by the LoadInterleaved3 overloads below.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_INLINE void LoadTransposedBlocks3(D d,
                                      const TFromD<D>* HWY_RESTRICT unaligned,
                                      VFromD<D>& A, VFromD<D>& B,
                                      VFromD<D>& C) {
  constexpr size_t kN = MaxLanes(d);
  A = LoadU(d, unaligned + 0 * kN);
  B = LoadU(d, unaligned + 1 * kN);
  C = LoadU(d, unaligned + 2 * kN);
}

}  // namespace detail

// 16 lanes per block. De-interleaves three vectors' worth of
// (v0[i], v1[i], v2[i]) triples into v0, v1 and v2. For each output, one byte
// shuffle per input vector gathers that output's lanes into their final
// positions, and the three partial results are merged with Xor3.
template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 16)>
HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
  const RebindToUnsigned<decltype(d)> du;
  using V = VFromD<D>;
  // Compact notation so these fit on one line: 12 := v1[2].
  V A;  // 05 24 14 04 23 13 03 22 12 02 21 11 01 20 10 00
  V B;  // 1a 0a 29 19 09 28 18 08 27 17 07 26 16 06 25 15
  V C;  // 2f 1f 0f 2e 1e 0e 2d 1d 0d 2c 1c 0c 2b 1b 0b 2a
  detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
  // Compress all lanes belonging to v0 into consecutive lanes.
  // Z marks output bytes that a given lookup does not supply. Presumably
  // TableLookupBytesOr0 returns zero there, so the non-zero bytes of the three
  // partial results are disjoint and Xor3 merges them without loss.
  constexpr uint8_t Z = 0x80;
  // Naming: kIdx_v<output><input>, e.g. kIdx_v0B selects v0's lanes from B.
  alignas(16) static constexpr uint8_t kIdx_v0A[16] = {
      0, 3, 6, 9, 12, 15, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z};
  alignas(16) static constexpr uint8_t kIdx_v0B[16] = {
      Z, Z, Z, Z, Z, Z, 2, 5, 8, 11, 14, Z, Z, Z, Z, Z};
  alignas(16) static constexpr uint8_t kIdx_v0C[16] = {
      Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 1, 4, 7, 10, 13};
  alignas(16) static constexpr uint8_t kIdx_v1A[16] = {
      1, 4, 7, 10, 13, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z};
  alignas(16) static constexpr uint8_t kIdx_v1B[16] = {
      Z, Z, Z, Z, Z, 0, 3, 6, 9, 12, 15, Z, Z, Z, Z, Z};
  alignas(16) static constexpr uint8_t kIdx_v1C[16] = {
      Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 2, 5, 8, 11, 14};
  alignas(16) static constexpr uint8_t kIdx_v2A[16] = {
      2, 5, 8, 11, 14, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z};
  alignas(16) static constexpr uint8_t kIdx_v2B[16] = {
      Z, Z, Z, Z, Z, 1, 4, 7, 10, 13, Z, Z, Z, Z, Z, Z};
  alignas(16) static constexpr uint8_t kIdx_v2C[16] = {
      Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0, 3, 6, 9, 12, 15};
  // Suffix L/M/U: contribution taken from A/B/C, which ends up in the
  // lower/middle/upper lanes of the output.
  const V v0L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v0A)));
  const V v0M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v0B)));
  const V v0U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v0C)));
  const V v1L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v1A)));
  const V v1M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v1B)));
  const V v1U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v1C)));
  const V v2L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v2A)));
  const V v2M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v2B)));
  const V v2U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v2C)));
  v0 = Xor3(v0L, v0M, v0U);
  v1 = Xor3(v1L, v1M, v1U);
  v2 = Xor3(v2L, v2M, v2U);
}

// 8-bit lanes x8
// Same approach as the 16-lane overload, with index tables for 8-lane vectors.
template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 8), HWY_IF_T_SIZE_D(D, 1)>
HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
  const RebindToUnsigned<decltype(d)> du;
  using V = VFromD<D>;
  V A;  // v1[2] v0[2] v2[1] v1[1] v0[1] v2[0] v1[0] v0[0]
  V B;  // v0[5] v2[4] v1[4] v0[4] v2[3] v1[3] v0[3] v2[2]
  V C;  // v2[7] v1[7] v0[7] v2[6] v1[6] v0[6] v2[5] v1[5]
  detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
  // Compress all lanes belonging to v0 into consecutive lanes.
  constexpr uint8_t Z = 0x80;
  // The arrays are declared with 16 elements but only the first 8 are listed;
  // the remaining elements are zero-initialized by aggregate initialization.
  alignas(16) static constexpr uint8_t kIdx_v0A[16] = {0, 3, 6, Z, Z, Z, Z, Z};
  alignas(16) static constexpr uint8_t kIdx_v0B[16] = {Z, Z, Z, 1, 4, 7, Z, Z};
  alignas(16) static constexpr uint8_t kIdx_v0C[16] = {Z, Z, Z, Z, Z, Z, 2, 5};
  alignas(16) static constexpr uint8_t kIdx_v1A[16] = {1, 4, 7, Z, Z, Z, Z, Z};
  alignas(16) static constexpr uint8_t kIdx_v1B[16] = {Z, Z, Z, 2, 5, Z, Z, Z};
  alignas(16) static constexpr uint8_t kIdx_v1C[16] = {Z, Z, Z, Z, Z, 0, 3, 6};
  alignas(16) static constexpr uint8_t kIdx_v2A[16] = {2, 5, Z, Z, Z, Z, Z, Z};
  alignas(16) static constexpr uint8_t kIdx_v2B[16] = {Z, Z, 0, 3, 6, Z, Z, Z};
  alignas(16) static constexpr uint8_t kIdx_v2C[16] = {Z, Z, Z, Z, Z, 1, 4, 7};
  const V v0L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v0A)));
  const V v0M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v0B)));
  const V v0U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v0C)));
  const V v1L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v1A)));
  const V v1M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v1B)));
  const V v1U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v1C)));
  const V v2L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v2A)));
  const V v2M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v2B)));
  const V v2U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v2C)));
  v0 = Xor3(v0L, v0M, v0U);
  v1 = Xor3(v1L, v1M, v1U);
  v2 = Xor3(v2L, v2M, v2U);
}

// 16-bit lanes x8
// Same approach again; the tables hold byte indices, so every 16-bit lane is
// selected by a pair of consecutive entries.
template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 8), HWY_IF_T_SIZE_D(D, 2)>
HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
  const RebindToUnsigned<decltype(d)> du;
  const Repartition<uint8_t, decltype(du)> du8;
  using V = VFromD<D>;
  V A;  // v1[2] v0[2] v2[1] v1[1] v0[1] v2[0] v1[0] v0[0]
  V B;  // v0[5] v2[4] v1[4] v0[4] v2[3] v1[3] v0[3] v2[2]
  V C;  // v2[7] v1[7] v0[7] v2[6] v1[6] v0[6] v2[5] v1[5]
  detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
  // Compress all lanes belonging to v0 into consecutive lanes. Same as above,
  // but each element of the array contains a byte index for a byte of a lane.
  constexpr uint8_t Z = 0x80;
  alignas(16) static constexpr uint8_t kIdx_v0A[16] = {
      0x00, 0x01, 0x06, 0x07, 0x0C, 0x0D, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z};
  alignas(16) static constexpr uint8_t kIdx_v0B[16] = {
      Z, Z, Z, Z, Z, Z, 0x02, 0x03, 0x08, 0x09, 0x0E, 0x0F, Z, Z, Z, Z};
  alignas(16) static constexpr uint8_t kIdx_v0C[16] = {
      Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0x04, 0x05, 0x0A, 0x0B};
  alignas(16) static constexpr uint8_t kIdx_v1A[16] = {
      0x02, 0x03, 0x08, 0x09, 0x0E, 0x0F, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z};
  alignas(16) static constexpr uint8_t kIdx_v1B[16] = {
      Z, Z, Z, Z, Z, Z, 0x04, 0x05, 0x0A, 0x0B, Z, Z, Z, Z, Z, Z};
  alignas(16) static constexpr uint8_t kIdx_v1C[16] = {
      Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0x00, 0x01, 0x06, 0x07, 0x0C, 0x0D};
  alignas(16) static constexpr uint8_t kIdx_v2A[16] = {
      0x04, 0x05, 0x0A, 0x0B, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z};
  alignas(16) static constexpr uint8_t kIdx_v2B[16] = {
      Z, Z, Z, Z, 0x00, 0x01, 0x06, 0x07, 0x0C, 0x0D, Z, Z, Z, Z, Z, Z};
  alignas(16) static constexpr uint8_t kIdx_v2C[16] = {
      Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0x02, 0x03, 0x08, 0x09, 0x0E, 0x0F};
  // Unlike the 8-bit overloads, the byte-index vector (not the lookup result)
  // is cast to the lane type of `d` before the lookup.
  const V v0L = TableLookupBytesOr0(A, BitCast(d, LoadDup128(du8, kIdx_v0A)));
  const V v0M = TableLookupBytesOr0(B, BitCast(d, LoadDup128(du8, kIdx_v0B)));
  const V v0U = TableLookupBytesOr0(C, BitCast(d, LoadDup128(du8, kIdx_v0C)));
  const V v1L = TableLookupBytesOr0(A, BitCast(d, LoadDup128(du8, kIdx_v1A)));
  const V v1M = TableLookupBytesOr0(B, BitCast(d, LoadDup128(du8, kIdx_v1B)));
  const V v1U = TableLookupBytesOr0(C, BitCast(d, LoadDup128(du8, kIdx_v1C)));
  const V v2L = TableLookupBytesOr0(A, BitCast(d, LoadDup128(du8, kIdx_v2A)));
  const V v2M = TableLookupBytesOr0(B, BitCast(d, LoadDup128(du8, kIdx_v2B)));
  const V v2U = TableLookupBytesOr0(C, BitCast(d, LoadDup128(du8, kIdx_v2C)));
  v0 = Xor3(v0L, v0M, v0U);
  v1 = Xor3(v1L, v1M, v1U);
  v2 = Xor3(v2L, v2M, v2U);
}

// 4 lanes per block: de-interleaves using OddEven blends followed by the
// two-input detail::ShuffleTwo* shuffles instead of byte table lookups.
template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 4)>
HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
  using V = VFromD<D>;
  V A;  // v0[1] v2[0] v1[0] v0[0]
  V B;  // v1[2] v0[2] v2[1] v1[1]
  V C;  // v2[3] v1[3] v0[3] v2[2]
  detail::LoadTransposedBlocks3(d, unaligned, A, B, C);

  // Temporaries are named after the lanes they hold (MSB on left); 02 := v0[2]
  // and xx marks a lane whose value is not needed.
  const V vxx_02_03_xx = OddEven(C, B);
  v0 = detail::ShuffleTwo1230(A, vxx_02_03_xx);

  // Shuffle2301 takes the upper/lower halves of the output from one input, so
  // we cannot just combine 13 and 10 with 12 and 11 (similar to v0/v2). Use
  // OddEven because it may have higher throughput than Shuffle.
  const V vxx_xx_10_11 = OddEven(A, B);
  const V v12_13_xx_xx = OddEven(B, C);
  v1 = detail::ShuffleTwo2301(vxx_xx_10_11, v12_13_xx_xx);

  const V vxx_20_21_xx = OddEven(B, A);
  v2 = detail::ShuffleTwo3012(vxx_20_21_xx, C);
}

// 2 lanes per block: two OddEven blends and one CombineShiftRightBytes by one
// lane (sizeof(TFromD<D>) bytes) suffice.
template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 2)>
HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
  VFromD<D> A;  // v1[0] v0[0]
  VFromD<D> B;  // v0[1] v2[0]
  VFromD<D> C;  // v2[1] v1[1]
  detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
  v0 = OddEven(B, A);
  v1 = CombineShiftRightBytes<sizeof(TFromD<D>)>(d, C, A);
  v2 = OddEven(C, B);
}

// Single lane: the three values are adjacent in memory, so load each directly.
template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 1)>
HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
  v0 = LoadU(d, unaligned + 0);
  v1 = LoadU(d, unaligned + 1);
  v2 = LoadU(d, unaligned + 2);
}

// ------------------------------ LoadInterleaved4

namespace detail {

// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
// Loads four consecutive vectors (vA..vD) starting at `unaligned`, each
// MaxLanes(d) elements after the previous one. No de-interleaving happens
// here; that is done by the LoadInterleaved4 overloads below.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_INLINE void LoadTransposedBlocks4(D d,
                                      const TFromD<D>* HWY_RESTRICT unaligned,
                                      VFromD<D>& vA, VFromD<D>& vB,
                                      VFromD<D>& vC, VFromD<D>& vD) {
  constexpr size_t kN = MaxLanes(d);
  vA = LoadU(d, unaligned + 0 * kN);
  vB = LoadU(d, unaligned + 1 * kN);
  vC = LoadU(d, unaligned + 2 * kN);
  vD = LoadU(d, unaligned + 3 * kN);
}

}  // namespace detail

// 16 lanes per block. De-interleaves four vectors' worth of
// (v0[i], v1[i], v2[i], v3[i]) 4-tuples into v0..v3 using three rounds of
// lane-granularity InterleaveLower/Upper followed by one round at 64-bit
// granularity.
template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 16)>
HWY_API void LoadInterleaved4(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
                              VFromD<D>& v3) {
  const Repartition<uint64_t, decltype(d)> d64;
  using V64 = VFromD<decltype(d64)>;
  using V = VFromD<D>;
  // 16 lanes per block; the lowest four blocks are at the bottom of vA..vD.
  // Here int[i] means the four interleaved values of the i-th 4-tuple and
  // int[3..0] indicates four consecutive 4-tuples (0 = least-significant).
  V vA;  // int[13..10] int[3..0]
  V vB;  // int[17..14] int[7..4]
  V vC;  // int[1b..18] int[b..8]
  V vD;  // int[1f..1c] int[f..c]
  detail::LoadTransposedBlocks4(d, unaligned, vA, vB, vC, vD);

  // For brevity, the comments only list the lower block (upper = lower + 0x10)
  const V v5140 = InterleaveLower(d, vA, vB);  // int[5,1,4,0]
  const V vd9c8 = InterleaveLower(d, vC, vD);  // int[d,9,c,8]
  const V v7362 = InterleaveUpper(d, vA, vB);  // int[7,3,6,2]
  const V vfbea = InterleaveUpper(d, vC, vD);  // int[f,b,e,a]

  const V v6420 = InterleaveLower(d, v5140, v7362);  // int[6,4,2,0]
  const V veca8 = InterleaveLower(d, vd9c8, vfbea);  // int[e,c,a,8]
  const V v7531 = InterleaveUpper(d, v5140, v7362);  // int[7,5,3,1]
  const V vfdb9 = InterleaveUpper(d, vd9c8, vfbea);  // int[f,d,b,9]

  // v10 := lanes of v1 and v0; v32 := lanes of v3 and v2. The results are
  // viewed as 64-bit lanes for the final interleave.
  const V64 v10L = BitCast(d64, InterleaveLower(d, v6420, v7531));  // v10[7..0]
  const V64 v10U = BitCast(d64, InterleaveLower(d, veca8, vfdb9));  // v10[f..8]
  const V64 v32L = BitCast(d64, InterleaveUpper(d, v6420, v7531));  // v32[7..0]
  const V64 v32U = BitCast(d64, InterleaveUpper(d, veca8, vfdb9));  // v32[f..8]

  v0 = BitCast(d, InterleaveLower(d64, v10L, v10U));
  v1 = BitCast(d, InterleaveUpper(d64, v10L, v10U));
  v2 = BitCast(d, InterleaveLower(d64, v32L, v32U));
  v3 = BitCast(d, InterleaveUpper(d64, v32L, v32U));
}

// 8 lanes per block. Two rounds of lane-granularity interleaves, then a final
// interleave at the granularity of the wider type TW chosen below.
template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 8)>
HWY_API void LoadInterleaved4(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
                              VFromD<D>& v3) {
  // In the last step, we interleave by half of the block size, which is usually
  // 8 bytes but half that for 8-bit x8 vectors.
  using TW = hwy::UnsignedFromSize<d.MaxBytes() == 8 ? 4 : 8>;
  const Repartition<TW, decltype(d)> dw;
  using VW = VFromD<decltype(dw)>;

  // (Comments are for 256-bit vectors.)
  // 8 lanes per block; the lowest four blocks are at the bottom of vA..vD.
  VFromD<D> vA;  // v3210[9]v3210[8] v3210[1]v3210[0]
  VFromD<D> vB;  // v3210[b]v3210[a] v3210[3]v3210[2]
  VFromD<D> vC;  // v3210[d]v3210[c] v3210[5]v3210[4]
  VFromD<D> vD;  // v3210[f]v3210[e] v3210[7]v3210[6]
  detail::LoadTransposedBlocks4(d, unaligned, vA, vB, vC, vD);

  const VFromD<D> va820 = InterleaveLower(d, vA, vB);  // v3210[a,8] v3210[2,0]
  const VFromD<D> vec64 = InterleaveLower(d, vC, vD);  // v3210[e,c] v3210[6,4]
  const VFromD<D> vb931 = InterleaveUpper(d, vA, vB);  // v3210[b,9] v3210[3,1]
  const VFromD<D> vfd75 = InterleaveUpper(d, vC, vD);  // v3210[f,d] v3210[7,5]

  const VW v10_b830 =  // v10[b..8] v10[3..0]
      BitCast(dw, InterleaveLower(d, va820, vb931));
  const VW v10_fc74 =  // v10[f..c] v10[7..4]
      BitCast(dw, InterleaveLower(d, vec64, vfd75));
  const VW v32_b830 =  // v32[b..8] v32[3..0]
      BitCast(dw, InterleaveUpper(d, va820, vb931));
  const VW v32_fc74 =  // v32[f..c] v32[7..4]
      BitCast(dw, InterleaveUpper(d, vec64, vfd75));

  v0 = BitCast(d, InterleaveLower(dw, v10_b830, v10_fc74));
  v1 = BitCast(d, InterleaveUpper(dw, v10_b830, v10_fc74));
  v2 = BitCast(d, InterleaveLower(dw, v32_b830, v32_fc74));
  v3 = BitCast(d, InterleaveUpper(dw, v32_b830, v32_fc74));
}

// 4 lanes per block: each loaded vector holds one complete 4-tuple per block,
// so two rounds of InterleaveLower/Upper complete the transpose.
template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 4)>
HWY_API void LoadInterleaved4(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
                              VFromD<D>& v3) {
  using V = VFromD<D>;
  V vA;  // v3210[4] v3210[0]
  V vB;  // v3210[5] v3210[1]
  V vC;  // v3210[6] v3210[2]
  V vD;  // v3210[7] v3210[3]
  detail::LoadTransposedBlocks4(d, unaligned, vA, vB, vC, vD);
  // Suffix e/o: holds the even/odd-indexed lanes of the named outputs.
  const V v10e = InterleaveLower(d, vA, vC);  // v1[6,4] v0[6,4] v1[2,0] v0[2,0]
  const V v10o = InterleaveLower(d, vB, vD);  // v1[7,5] v0[7,5] v1[3,1] v0[3,1]
  const V v32e = InterleaveUpper(d, vA, vC);  // v3[6,4] v2[6,4] v3[2,0] v2[2,0]
  const V v32o = InterleaveUpper(d, vB, vD);  // v3[7,5] v2[7,5] v3[3,1] v2[3,1]

  v0 = InterleaveLower(d, v10e, v10o);
  v1 = InterleaveUpper(d, v10e, v10o);
  v2 = InterleaveLower(d, v32e, v32o);
  v3 = InterleaveUpper(d, v32e, v32o);
}

// 2 lanes per block: a single round of interleaves, pairing vA with vC and vB
// with vD, produces the four outputs.
template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 2)>
HWY_API void LoadInterleaved4(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
                              VFromD<D>& v3) {
  VFromD<D> vA, vB, vC, vD;
  detail::LoadTransposedBlocks4(d, unaligned, vA, vB, vC, vD);
  v0 = InterleaveLower(d, vA, vC);
  v1 = InterleaveUpper(d, vA, vC);
  v2 = InterleaveLower(d, vB, vD);
  v3 = InterleaveUpper(d, vB, vD);
}

// Any T x1
// Single lane: the four values are adjacent in memory, so load each directly.
template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 1)>
HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
                              VFromD<D>& v3) {
  v0 = LoadU(d, unaligned + 0);
  v1 = LoadU(d, unaligned + 1);
  v2 = LoadU(d, unaligned + 2);
  v3 = LoadU(d, unaligned + 3);
}

// ------------------------------ StoreInterleaved2

namespace detail {

// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_INLINE void StoreTransposedBlocks2(VFromD<D> A, VFromD<D> B, D d, + TFromD<D>* HWY_RESTRICT unaligned) { + constexpr size_t kN = MaxLanes(d); + StoreU(A, d, unaligned + 0 * kN); + StoreU(B, d, unaligned + 1 * kN); +} + +} // namespace detail + +// >= 128 bit vector +template <class D, HWY_IF_V_SIZE_GT_D(D, 8)> +HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d, + TFromD<D>* HWY_RESTRICT unaligned) { + const auto v10L = InterleaveLower(d, v0, v1); // .. v1[0] v0[0] + const auto v10U = InterleaveUpper(d, v0, v1); // .. v1[kN/2] v0[kN/2] + detail::StoreTransposedBlocks2(v10L, v10U, d, unaligned); +} + +// <= 64 bits +template <class V, class D, HWY_IF_V_SIZE_LE_D(D, 8)> +HWY_API void StoreInterleaved2(V part0, V part1, D d, + TFromD<D>* HWY_RESTRICT unaligned) { + const Twice<decltype(d)> d2; + const auto v0 = ZeroExtendVector(d2, part0); + const auto v1 = ZeroExtendVector(d2, part1); + const auto v10 = InterleaveLower(d2, v0, v1); + StoreU(v10, d2, unaligned); +} + +// ------------------------------ StoreInterleaved3 (CombineShiftRightBytes, +// TableLookupBytes) + +namespace detail { + +// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload. 
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_INLINE void StoreTransposedBlocks3(VFromD<D> A, VFromD<D> B, VFromD<D> C, + D d, TFromD<D>* HWY_RESTRICT unaligned) { + constexpr size_t kN = MaxLanes(d); + StoreU(A, d, unaligned + 0 * kN); + StoreU(B, d, unaligned + 1 * kN); + StoreU(C, d, unaligned + 2 * kN); +} + +} // namespace detail + +// >= 128-bit vector, 8-bit lanes +template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_GT_D(D, 8)> +HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d, + TFromD<D>* HWY_RESTRICT unaligned) { + const RebindToUnsigned<decltype(d)> du; + using TU = TFromD<decltype(du)>; + const auto k5 = Set(du, TU{5}); + const auto k6 = Set(du, TU{6}); + + // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right): + // v0[5], v2[4],v1[4],v0[4] .. v2[0],v1[0],v0[0]. We're expanding v0 lanes + // to their place, with 0x80 so lanes to be filled from other vectors are 0 + // to enable blending by ORing together. + alignas(16) static constexpr uint8_t tbl_v0[16] = { + 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, // + 3, 0x80, 0x80, 4, 0x80, 0x80, 5}; + alignas(16) static constexpr uint8_t tbl_v1[16] = { + 0x80, 0, 0x80, 0x80, 1, 0x80, // + 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80}; + // The interleaved vectors will be named A, B, C; temporaries with suffix + // 0..2 indicate which input vector's lanes they hold. + const auto shuf_A0 = LoadDup128(du, tbl_v0); + const auto shuf_A1 = LoadDup128(du, tbl_v1); // cannot reuse shuf_A0 (has 5) + const auto shuf_A2 = CombineShiftRightBytes<15>(du, shuf_A1, shuf_A1); + const auto A0 = TableLookupBytesOr0(v0, shuf_A0); // 5..4..3..2..1..0 + const auto A1 = TableLookupBytesOr0(v1, shuf_A1); // ..4..3..2..1..0. + const auto A2 = TableLookupBytesOr0(v2, shuf_A2); // .4..3..2..1..0.. + const VFromD<D> A = BitCast(d, A0 | A1 | A2); + + // B: v1[10],v0[10], v2[9],v1[9],v0[9] .. , v2[6],v1[6],v0[6], v2[5],v1[5] + const auto shuf_B0 = shuf_A2 + k6; // .A..9..8..7..6.. 
+ const auto shuf_B1 = shuf_A0 + k5; // A..9..8..7..6..5 + const auto shuf_B2 = shuf_A1 + k5; // ..9..8..7..6..5. + const auto B0 = TableLookupBytesOr0(v0, shuf_B0); + const auto B1 = TableLookupBytesOr0(v1, shuf_B1); + const auto B2 = TableLookupBytesOr0(v2, shuf_B2); + const VFromD<D> B = BitCast(d, B0 | B1 | B2); + + // C: v2[15],v1[15],v0[15], v2[11],v1[11],v0[11], v2[10] + const auto shuf_C0 = shuf_B2 + k6; // ..F..E..D..C..B. + const auto shuf_C1 = shuf_B0 + k5; // .F..E..D..C..B.. + const auto shuf_C2 = shuf_B1 + k5; // F..E..D..C..B..A + const auto C0 = TableLookupBytesOr0(v0, shuf_C0); + const auto C1 = TableLookupBytesOr0(v1, shuf_C1); + const auto C2 = TableLookupBytesOr0(v2, shuf_C2); + const VFromD<D> C = BitCast(d, C0 | C1 | C2); + + detail::StoreTransposedBlocks3(A, B, C, d, unaligned); +} + +// >= 128-bit vector, 16-bit lanes +template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_V_SIZE_GT_D(D, 8)> +HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d, + TFromD<D>* HWY_RESTRICT unaligned) { + const Repartition<uint8_t, decltype(d)> du8; + const auto k2 = Set(du8, uint8_t{2 * sizeof(TFromD<D>)}); + const auto k3 = Set(du8, uint8_t{3 * sizeof(TFromD<D>)}); + + // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right): + // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. 0x80 so lanes to be + // filled from other vectors are 0 for blending. Note that these are byte + // indices for 16-bit lanes. + alignas(16) static constexpr uint8_t tbl_v1[16] = { + 0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80, + 2, 3, 0x80, 0x80, 0x80, 0x80, 4, 5}; + alignas(16) static constexpr uint8_t tbl_v2[16] = { + 0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80, + 0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80}; + + // The interleaved vectors will be named A, B, C; temporaries with suffix + // 0..2 indicate which input vector's lanes they hold. + const auto shuf_A1 = LoadDup128(du8, tbl_v1); // 2..1..0. 
+ // .2..1..0 + const auto shuf_A0 = CombineShiftRightBytes<2>(du8, shuf_A1, shuf_A1); + const auto shuf_A2 = LoadDup128(du8, tbl_v2); // ..1..0.. + + const auto A0 = TableLookupBytesOr0(v0, shuf_A0); + const auto A1 = TableLookupBytesOr0(v1, shuf_A1); + const auto A2 = TableLookupBytesOr0(v2, shuf_A2); + const VFromD<D> A = BitCast(d, A0 | A1 | A2); + + // B: v0[5] v2[4],v1[4],v0[4], v2[3],v1[3],v0[3], v2[2] + const auto shuf_B0 = shuf_A1 + k3; // 5..4..3. + const auto shuf_B1 = shuf_A2 + k3; // ..4..3.. + const auto shuf_B2 = shuf_A0 + k2; // .4..3..2 + const auto B0 = TableLookupBytesOr0(v0, shuf_B0); + const auto B1 = TableLookupBytesOr0(v1, shuf_B1); + const auto B2 = TableLookupBytesOr0(v2, shuf_B2); + const VFromD<D> B = BitCast(d, B0 | B1 | B2); + + // C: v2[7],v1[7],v0[7], v2[6],v1[6],v0[6], v2[5],v1[5] + const auto shuf_C0 = shuf_B1 + k3; // ..7..6.. + const auto shuf_C1 = shuf_B2 + k3; // .7..6..5 + const auto shuf_C2 = shuf_B0 + k2; // 7..6..5. + const auto C0 = TableLookupBytesOr0(v0, shuf_C0); + const auto C1 = TableLookupBytesOr0(v1, shuf_C1); + const auto C2 = TableLookupBytesOr0(v2, shuf_C2); + const VFromD<D> C = BitCast(d, C0 | C1 | C2); + + detail::StoreTransposedBlocks3(A, B, C, d, unaligned); +} + +// >= 128-bit vector, 32-bit lanes +template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_V_SIZE_GT_D(D, 8)> +HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d, + TFromD<D>* HWY_RESTRICT unaligned) { + const RepartitionToWide<decltype(d)> dw; + + const VFromD<D> v10_v00 = InterleaveLower(d, v0, v1); + const VFromD<D> v01_v20 = OddEven(v0, v2); + // A: v0[1], v2[0],v1[0],v0[0] (<- lane 0) + const VFromD<D> A = BitCast( + d, InterleaveLower(dw, BitCast(dw, v10_v00), BitCast(dw, v01_v20))); + + const VFromD<D> v1_321 = ShiftRightLanes<1>(d, v1); + const VFromD<D> v0_32 = ShiftRightLanes<2>(d, v0); + const VFromD<D> v21_v11 = OddEven(v2, v1_321); + const VFromD<D> v12_v02 = OddEven(v1_321, v0_32); + // B: v1[2],v0[2], v2[1],v1[1] + 
const VFromD<D> B = BitCast( + d, InterleaveLower(dw, BitCast(dw, v21_v11), BitCast(dw, v12_v02))); + + // Notation refers to the upper 2 lanes of the vector for InterleaveUpper. + const VFromD<D> v23_v13 = OddEven(v2, v1_321); + const VFromD<D> v03_v22 = OddEven(v0, v2); + // C: v2[3],v1[3],v0[3], v2[2] + const VFromD<D> C = BitCast( + d, InterleaveUpper(dw, BitCast(dw, v03_v22), BitCast(dw, v23_v13))); + + detail::StoreTransposedBlocks3(A, B, C, d, unaligned); +} + +// >= 128-bit vector, 64-bit lanes +template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_GT_D(D, 8)> +HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d, + TFromD<D>* HWY_RESTRICT unaligned) { + const VFromD<D> A = InterleaveLower(d, v0, v1); + const VFromD<D> B = OddEven(v0, v2); + const VFromD<D> C = InterleaveUpper(d, v1, v2); + detail::StoreTransposedBlocks3(A, B, C, d, unaligned); +} + +// 64-bit vector, 8-bit lanes +template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_D(D, 8)> +HWY_API void StoreInterleaved3(VFromD<D> part0, VFromD<D> part1, + VFromD<D> part2, D d, + TFromD<D>* HWY_RESTRICT unaligned) { + // Use full vectors for the shuffles and first result. + constexpr size_t kFullN = 16 / sizeof(TFromD<D>); + const Full128<uint8_t> du; + const Full128<TFromD<D>> d_full; + const auto k5 = Set(du, uint8_t{5}); + const auto k6 = Set(du, uint8_t{6}); + + const VFromD<decltype(d_full)> v0{part0.raw}; + const VFromD<decltype(d_full)> v1{part1.raw}; + const VFromD<decltype(d_full)> v2{part2.raw}; + + // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right): + // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. 0x80 so lanes to be + // filled from other vectors are 0 for blending. 
+ alignas(16) static constexpr uint8_t tbl_v0[16] = { + 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, // + 3, 0x80, 0x80, 4, 0x80, 0x80, 5}; + alignas(16) static constexpr uint8_t tbl_v1[16] = { + 0x80, 0, 0x80, 0x80, 1, 0x80, // + 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80}; + // The interleaved vectors will be named A, B, C; temporaries with suffix + // 0..2 indicate which input vector's lanes they hold. + const auto shuf_A0 = Load(du, tbl_v0); + const auto shuf_A1 = Load(du, tbl_v1); // cannot reuse shuf_A0 (5 in MSB) + const auto shuf_A2 = CombineShiftRightBytes<15>(du, shuf_A1, shuf_A1); + const auto A0 = TableLookupBytesOr0(v0, shuf_A0); // 5..4..3..2..1..0 + const auto A1 = TableLookupBytesOr0(v1, shuf_A1); // ..4..3..2..1..0. + const auto A2 = TableLookupBytesOr0(v2, shuf_A2); // .4..3..2..1..0.. + const auto A = BitCast(d_full, A0 | A1 | A2); + StoreU(A, d_full, unaligned + 0 * kFullN); + + // Second (HALF) vector: v2[7],v1[7],v0[7], v2[6],v1[6],v0[6], v2[5],v1[5] + const auto shuf_B0 = shuf_A2 + k6; // ..7..6.. + const auto shuf_B1 = shuf_A0 + k5; // .7..6..5 + const auto shuf_B2 = shuf_A1 + k5; // 7..6..5. 
+ const auto B0 = TableLookupBytesOr0(v0, shuf_B0); + const auto B1 = TableLookupBytesOr0(v1, shuf_B1); + const auto B2 = TableLookupBytesOr0(v2, shuf_B2); + const VFromD<D> B{BitCast(d_full, B0 | B1 | B2).raw}; + StoreU(B, d, unaligned + 1 * kFullN); +} + +// 64-bit vector, 16-bit lanes +template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_LANES_D(D, 4)> +HWY_API void StoreInterleaved3(VFromD<D> part0, VFromD<D> part1, + VFromD<D> part2, D dh, + TFromD<D>* HWY_RESTRICT unaligned) { + const Twice<D> d_full; + const Full128<uint8_t> du8; + const auto k2 = Set(du8, uint8_t{2 * sizeof(TFromD<D>)}); + const auto k3 = Set(du8, uint8_t{3 * sizeof(TFromD<D>)}); + + const VFromD<decltype(d_full)> v0{part0.raw}; + const VFromD<decltype(d_full)> v1{part1.raw}; + const VFromD<decltype(d_full)> v2{part2.raw}; + + // Interleave part (v0,v1,v2) to full (MSB on left, lane 0 on right): + // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. We're expanding v0 lanes + // to their place, with 0x80 so lanes to be filled from other vectors are 0 + // to enable blending by ORing together. + alignas(16) static constexpr uint8_t tbl_v1[16] = { + 0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80, + 2, 3, 0x80, 0x80, 0x80, 0x80, 4, 5}; + alignas(16) static constexpr uint8_t tbl_v2[16] = { + 0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80, + 0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80}; + + // The interleaved vectors will be named A, B; temporaries with suffix + // 0..2 indicate which input vector's lanes they hold. + const auto shuf_A1 = Load(du8, tbl_v1); // 2..1..0. + // .2..1..0 + const auto shuf_A0 = CombineShiftRightBytes<2>(du8, shuf_A1, shuf_A1); + const auto shuf_A2 = Load(du8, tbl_v2); // ..1..0.. 
+ + const auto A0 = TableLookupBytesOr0(v0, shuf_A0); + const auto A1 = TableLookupBytesOr0(v1, shuf_A1); + const auto A2 = TableLookupBytesOr0(v2, shuf_A2); + const VFromD<decltype(d_full)> A = BitCast(d_full, A0 | A1 | A2); + StoreU(A, d_full, unaligned); + + // Second (HALF) vector: v2[3],v1[3],v0[3], v2[2] + const auto shuf_B0 = shuf_A1 + k3; // ..3. + const auto shuf_B1 = shuf_A2 + k3; // .3.. + const auto shuf_B2 = shuf_A0 + k2; // 3..2 + const auto B0 = TableLookupBytesOr0(v0, shuf_B0); + const auto B1 = TableLookupBytesOr0(v1, shuf_B1); + const auto B2 = TableLookupBytesOr0(v2, shuf_B2); + const VFromD<decltype(d_full)> B = BitCast(d_full, B0 | B1 | B2); + StoreU(VFromD<D>{B.raw}, dh, unaligned + MaxLanes(d_full)); +} + +// 64-bit vector, 32-bit lanes +template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_LANES_D(D, 2)> +HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d, + TFromD<D>* HWY_RESTRICT unaligned) { + // (same code as 128-bit vector, 64-bit lanes) + const VFromD<D> v10_v00 = InterleaveLower(d, v0, v1); + const VFromD<D> v01_v20 = OddEven(v0, v2); + const VFromD<D> v21_v11 = InterleaveUpper(d, v1, v2); + constexpr size_t kN = MaxLanes(d); + StoreU(v10_v00, d, unaligned + 0 * kN); + StoreU(v01_v20, d, unaligned + 1 * kN); + StoreU(v21_v11, d, unaligned + 2 * kN); +} + +// 64-bit lanes are handled by the N=1 case below. + +// <= 32-bit vector, 8-bit lanes +template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_LE_D(D, 4), + HWY_IF_LANES_GT_D(D, 1)> +HWY_API void StoreInterleaved3(VFromD<D> part0, VFromD<D> part1, + VFromD<D> part2, D d, + TFromD<D>* HWY_RESTRICT unaligned) { + // Use full vectors for the shuffles and result. + const Full128<uint8_t> du; + const Full128<TFromD<D>> d_full; + + const VFromD<decltype(d_full)> v0{part0.raw}; + const VFromD<decltype(d_full)> v1{part1.raw}; + const VFromD<decltype(d_full)> v2{part2.raw}; + + // Interleave (v0,v1,v2). 
We're expanding v0 lanes to their place, with 0x80 + // so lanes to be filled from other vectors are 0 to enable blending by ORing + // together. + alignas(16) static constexpr uint8_t tbl_v0[16] = { + 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, + 0x80, 3, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; + // The interleaved vector will be named A; temporaries with suffix + // 0..2 indicate which input vector's lanes they hold. + const auto shuf_A0 = Load(du, tbl_v0); + const auto shuf_A1 = CombineShiftRightBytes<15>(du, shuf_A0, shuf_A0); + const auto shuf_A2 = CombineShiftRightBytes<14>(du, shuf_A0, shuf_A0); + const auto A0 = TableLookupBytesOr0(v0, shuf_A0); // ......3..2..1..0 + const auto A1 = TableLookupBytesOr0(v1, shuf_A1); // .....3..2..1..0. + const auto A2 = TableLookupBytesOr0(v2, shuf_A2); // ....3..2..1..0.. + const VFromD<decltype(d_full)> A = BitCast(d_full, A0 | A1 | A2); + alignas(16) TFromD<D> buf[MaxLanes(d_full)]; + StoreU(A, d_full, buf); + CopyBytes<d.MaxBytes() * 3>(buf, unaligned); +} + +// 32-bit vector, 16-bit lanes +template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_LANES_D(D, 2)> +HWY_API void StoreInterleaved3(VFromD<D> part0, VFromD<D> part1, + VFromD<D> part2, D d, + TFromD<D>* HWY_RESTRICT unaligned) { + // Use full vectors for the shuffles and result. + const Full128<uint8_t> du8; + const Full128<TFromD<D>> d_full; + + const VFromD<decltype(d_full)> v0{part0.raw}; + const VFromD<decltype(d_full)> v1{part1.raw}; + const VFromD<decltype(d_full)> v2{part2.raw}; + + // Interleave (v0,v1,v2). We're expanding v0 lanes to their place, with 0x80 + // so lanes to be filled from other vectors are 0 to enable blending by ORing + // together. + alignas(16) static constexpr uint8_t tbl_v2[16] = { + 0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80, + 0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80}; + // The interleaved vector will be named A; temporaries with suffix + // 0..2 indicate which input vector's lanes they hold. + const auto shuf_A2 = // ..1..0.. 
+ Load(du8, tbl_v2); + const auto shuf_A1 = // ...1..0. + CombineShiftRightBytes<2>(du8, shuf_A2, shuf_A2); + const auto shuf_A0 = // ....1..0 + CombineShiftRightBytes<4>(du8, shuf_A2, shuf_A2); + const auto A0 = TableLookupBytesOr0(v0, shuf_A0); // ..1..0 + const auto A1 = TableLookupBytesOr0(v1, shuf_A1); // .1..0. + const auto A2 = TableLookupBytesOr0(v2, shuf_A2); // 1..0.. + const auto A = BitCast(d_full, A0 | A1 | A2); + alignas(16) TFromD<D> buf[MaxLanes(d_full)]; + StoreU(A, d_full, buf); + CopyBytes<d.MaxBytes() * 3>(buf, unaligned); +} + +// Single-element vector, any lane size: just store directly +template <class D, HWY_IF_LANES_D(D, 1)> +HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d, + TFromD<D>* HWY_RESTRICT unaligned) { + StoreU(v0, d, unaligned + 0); + StoreU(v1, d, unaligned + 1); + StoreU(v2, d, unaligned + 2); +} + +// ------------------------------ StoreInterleaved4 + +namespace detail { + +// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload. +template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_INLINE void StoreTransposedBlocks4(VFromD<D> vA, VFromD<D> vB, VFromD<D> vC, + VFromD<D> vD, D d, + TFromD<D>* HWY_RESTRICT unaligned) { + constexpr size_t kN = MaxLanes(d); + StoreU(vA, d, unaligned + 0 * kN); + StoreU(vB, d, unaligned + 1 * kN); + StoreU(vC, d, unaligned + 2 * kN); + StoreU(vD, d, unaligned + 3 * kN); +} + +} // namespace detail + +// >= 128-bit vector, 8..32-bit lanes +template <class D, HWY_IF_NOT_T_SIZE_D(D, 8), HWY_IF_V_SIZE_GT_D(D, 8)> +HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, + VFromD<D> v3, D d, + TFromD<D>* HWY_RESTRICT unaligned) { + const RepartitionToWide<decltype(d)> dw; + const auto v10L = ZipLower(dw, v0, v1); // .. v1[0] v0[0] + const auto v32L = ZipLower(dw, v2, v3); + const auto v10U = ZipUpper(dw, v0, v1); + const auto v32U = ZipUpper(dw, v2, v3); + // The interleaved vectors are vA, vB, vC, vD. 
+ const VFromD<D> vA = BitCast(d, InterleaveLower(dw, v10L, v32L)); // 3210 + const VFromD<D> vB = BitCast(d, InterleaveUpper(dw, v10L, v32L)); + const VFromD<D> vC = BitCast(d, InterleaveLower(dw, v10U, v32U)); + const VFromD<D> vD = BitCast(d, InterleaveUpper(dw, v10U, v32U)); + detail::StoreTransposedBlocks4(vA, vB, vC, vD, d, unaligned); +} + +// >= 128-bit vector, 64-bit lanes +template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_GT_D(D, 8)> +HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, + VFromD<D> v3, D d, + TFromD<D>* HWY_RESTRICT unaligned) { + // The interleaved vectors are vA, vB, vC, vD. + const VFromD<D> vA = InterleaveLower(d, v0, v1); // v1[0] v0[0] + const VFromD<D> vB = InterleaveLower(d, v2, v3); + const VFromD<D> vC = InterleaveUpper(d, v0, v1); + const VFromD<D> vD = InterleaveUpper(d, v2, v3); + detail::StoreTransposedBlocks4(vA, vB, vC, vD, d, unaligned); +} + +// 64-bit vector, 8..32-bit lanes +template <class D, HWY_IF_NOT_T_SIZE_D(D, 8), HWY_IF_V_SIZE_D(D, 8)> +HWY_API void StoreInterleaved4(VFromD<D> part0, VFromD<D> part1, + VFromD<D> part2, VFromD<D> part3, D /* tag */, + TFromD<D>* HWY_RESTRICT unaligned) { + // Use full vectors to reduce the number of stores. 
+ const Full128<TFromD<D>> d_full; + const RepartitionToWide<decltype(d_full)> dw; + const VFromD<decltype(d_full)> v0{part0.raw}; + const VFromD<decltype(d_full)> v1{part1.raw}; + const VFromD<decltype(d_full)> v2{part2.raw}; + const VFromD<decltype(d_full)> v3{part3.raw}; + const auto v10 = ZipLower(dw, v0, v1); // v1[0] v0[0] + const auto v32 = ZipLower(dw, v2, v3); + const auto A = BitCast(d_full, InterleaveLower(dw, v10, v32)); + const auto B = BitCast(d_full, InterleaveUpper(dw, v10, v32)); + StoreU(A, d_full, unaligned); + StoreU(B, d_full, unaligned + MaxLanes(d_full)); +} + +// 64-bit vector, 64-bit lane +template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_LANES_D(D, 1)> +HWY_API void StoreInterleaved4(VFromD<D> part0, VFromD<D> part1, + VFromD<D> part2, VFromD<D> part3, D /* tag */, + TFromD<D>* HWY_RESTRICT unaligned) { + // Use full vectors to reduce the number of stores. + const Full128<TFromD<D>> d_full; + const VFromD<decltype(d_full)> v0{part0.raw}; + const VFromD<decltype(d_full)> v1{part1.raw}; + const VFromD<decltype(d_full)> v2{part2.raw}; + const VFromD<decltype(d_full)> v3{part3.raw}; + const auto A = InterleaveLower(d_full, v0, v1); // v1[0] v0[0] + const auto B = InterleaveLower(d_full, v2, v3); + StoreU(A, d_full, unaligned); + StoreU(B, d_full, unaligned + MaxLanes(d_full)); +} + +// <= 32-bit vectors +template <class D, HWY_IF_V_SIZE_LE_D(D, 4)> +HWY_API void StoreInterleaved4(VFromD<D> part0, VFromD<D> part1, + VFromD<D> part2, VFromD<D> part3, D d, + TFromD<D>* HWY_RESTRICT unaligned) { + // Use full vectors to reduce the number of stores. + const Full128<TFromD<D>> d_full; + const RepartitionToWide<decltype(d_full)> dw; + const VFromD<decltype(d_full)> v0{part0.raw}; + const VFromD<decltype(d_full)> v1{part1.raw}; + const VFromD<decltype(d_full)> v2{part2.raw}; + const VFromD<decltype(d_full)> v3{part3.raw}; + const auto v10 = ZipLower(dw, v0, v1); // .. 
v1[0] v0[0] + const auto v32 = ZipLower(dw, v2, v3); + const auto v3210 = BitCast(d_full, InterleaveLower(dw, v10, v32)); + alignas(16) TFromD<D> buf[MaxLanes(d_full)]; + StoreU(v3210, d_full, buf); + CopyBytes<d.MaxBytes() * 4>(buf, unaligned); +} + +#endif // HWY_NATIVE_LOAD_STORE_INTERLEAVED + +// ------------------------------ Integer AbsDiff and SumsOf8AbsDiff + +#if (defined(HWY_NATIVE_INTEGER_ABS_DIFF) == defined(HWY_TARGET_TOGGLE)) +#ifdef HWY_NATIVE_INTEGER_ABS_DIFF +#undef HWY_NATIVE_INTEGER_ABS_DIFF +#else +#define HWY_NATIVE_INTEGER_ABS_DIFF +#endif + +template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)> +HWY_API V AbsDiff(V a, V b) { + return Sub(Max(a, b), Min(a, b)); +} + +#endif // HWY_NATIVE_INTEGER_ABS_DIFF + +#if (defined(HWY_NATIVE_SUMS_OF_8_ABS_DIFF) == defined(HWY_TARGET_TOGGLE)) +#ifdef HWY_NATIVE_SUMS_OF_8_ABS_DIFF +#undef HWY_NATIVE_SUMS_OF_8_ABS_DIFF +#else +#define HWY_NATIVE_SUMS_OF_8_ABS_DIFF +#endif + +template <class V, HWY_IF_U8_D(DFromV<V>), + HWY_IF_V_SIZE_GT_D(DFromV<V>, (HWY_TARGET == HWY_SCALAR ? 
0 : 4))> +HWY_API Vec<Repartition<uint64_t, DFromV<V>>> SumsOf8AbsDiff(V a, V b) { + return SumsOf8(AbsDiff(a, b)); +} + +#endif // HWY_NATIVE_SUMS_OF_8_ABS_DIFF + +// ------------------------------ SaturatedAdd/SaturatedSub for UI32/UI64 + +#if (defined(HWY_NATIVE_I32_SATURATED_ADDSUB) == defined(HWY_TARGET_TOGGLE)) +#ifdef HWY_NATIVE_I32_SATURATED_ADDSUB +#undef HWY_NATIVE_I32_SATURATED_ADDSUB +#else +#define HWY_NATIVE_I32_SATURATED_ADDSUB +#endif + +template <class V, HWY_IF_I32_D(DFromV<V>)> +HWY_API V SaturatedAdd(V a, V b) { + const DFromV<decltype(a)> d; + const auto sum = Add(a, b); + const auto overflow_mask = + MaskFromVec(BroadcastSignBit(AndNot(Xor(a, b), Xor(a, sum)))); + const auto overflow_result = + Xor(BroadcastSignBit(a), Set(d, LimitsMax<int32_t>())); + return IfThenElse(overflow_mask, overflow_result, sum); +} + +template <class V, HWY_IF_I32_D(DFromV<V>)> +HWY_API V SaturatedSub(V a, V b) { + const DFromV<decltype(a)> d; + const auto diff = Sub(a, b); + const auto overflow_mask = + MaskFromVec(BroadcastSignBit(And(Xor(a, b), Xor(a, diff)))); + const auto overflow_result = + Xor(BroadcastSignBit(a), Set(d, LimitsMax<int32_t>())); + return IfThenElse(overflow_mask, overflow_result, diff); +} + +#endif // HWY_NATIVE_I32_SATURATED_ADDSUB + +#if (defined(HWY_NATIVE_I64_SATURATED_ADDSUB) == defined(HWY_TARGET_TOGGLE)) +#ifdef HWY_NATIVE_I64_SATURATED_ADDSUB +#undef HWY_NATIVE_I64_SATURATED_ADDSUB +#else +#define HWY_NATIVE_I64_SATURATED_ADDSUB +#endif + +template <class V, HWY_IF_I64_D(DFromV<V>)> +HWY_API V SaturatedAdd(V a, V b) { + const DFromV<decltype(a)> d; + const auto sum = Add(a, b); + const auto overflow_mask = + MaskFromVec(BroadcastSignBit(AndNot(Xor(a, b), Xor(a, sum)))); + const auto overflow_result = + Xor(BroadcastSignBit(a), Set(d, LimitsMax<int64_t>())); + return IfThenElse(overflow_mask, overflow_result, sum); +} + +template <class V, HWY_IF_I64_D(DFromV<V>)> +HWY_API V SaturatedSub(V a, V b) { + const DFromV<decltype(a)> d; + 
const auto diff = Sub(a, b); + const auto overflow_mask = + MaskFromVec(BroadcastSignBit(And(Xor(a, b), Xor(a, diff)))); + const auto overflow_result = + Xor(BroadcastSignBit(a), Set(d, LimitsMax<int64_t>())); + return IfThenElse(overflow_mask, overflow_result, diff); +} + +#endif // HWY_NATIVE_I64_SATURATED_ADDSUB + +#if (defined(HWY_NATIVE_U32_SATURATED_ADDSUB) == defined(HWY_TARGET_TOGGLE)) +#ifdef HWY_NATIVE_U32_SATURATED_ADDSUB +#undef HWY_NATIVE_U32_SATURATED_ADDSUB +#else +#define HWY_NATIVE_U32_SATURATED_ADDSUB +#endif + +template <class V, HWY_IF_U32_D(DFromV<V>)> +HWY_API V SaturatedAdd(V a, V b) { + return Add(a, Min(b, Not(a))); +} + +template <class V, HWY_IF_U32_D(DFromV<V>)> +HWY_API V SaturatedSub(V a, V b) { + return Sub(a, Min(a, b)); +} + +#endif // HWY_NATIVE_U32_SATURATED_ADDSUB + +#if (defined(HWY_NATIVE_U64_SATURATED_ADDSUB) == defined(HWY_TARGET_TOGGLE)) +#ifdef HWY_NATIVE_U64_SATURATED_ADDSUB +#undef HWY_NATIVE_U64_SATURATED_ADDSUB +#else +#define HWY_NATIVE_U64_SATURATED_ADDSUB +#endif + +template <class V, HWY_IF_U64_D(DFromV<V>)> +HWY_API V SaturatedAdd(V a, V b) { + return Add(a, Min(b, Not(a))); +} + +template <class V, HWY_IF_U64_D(DFromV<V>)> +HWY_API V SaturatedSub(V a, V b) { + return Sub(a, Min(a, b)); +} + +#endif // HWY_NATIVE_U64_SATURATED_ADDSUB + +// ------------------------------ Unsigned to signed demotions + +template <class DN, HWY_IF_SIGNED_D(DN), class V, HWY_IF_UNSIGNED_V(V), + class V2 = VFromD<Rebind<TFromV<V>, DN>>, + hwy::EnableIf<(sizeof(TFromD<DN>) < sizeof(TFromV<V>))>* = nullptr, + HWY_IF_LANES_D(DFromV<V>, HWY_MAX_LANES_D(DFromV<V2>))> +HWY_API VFromD<DN> DemoteTo(DN dn, V v) { + const DFromV<decltype(v)> d; + const RebindToSigned<decltype(d)> di; + const RebindToUnsigned<decltype(dn)> dn_u; + + // First, do a signed to signed demotion. This will convert any values + // that are greater than hwy::HighestValue<MakeSigned<TFromV<V>>>() to a + // negative value. 
+ const auto i2i_demote_result = DemoteTo(dn, BitCast(di, v)); + + // Second, convert any negative values to hwy::HighestValue<TFromD<DN>>() + // using an unsigned Min operation. + const auto max_signed_val = Set(dn, hwy::HighestValue<TFromD<DN>>()); + + return BitCast( + dn, Min(BitCast(dn_u, i2i_demote_result), BitCast(dn_u, max_signed_val))); +} + +#if HWY_TARGET != HWY_SCALAR || HWY_IDE +template <class DN, HWY_IF_SIGNED_D(DN), class V, HWY_IF_UNSIGNED_V(V), + class V2 = VFromD<Repartition<TFromV<V>, DN>>, + HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2), + HWY_IF_LANES_D(DFromV<V>, HWY_MAX_LANES_D(DFromV<V2>))> +HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) { + const DFromV<decltype(a)> d; + const RebindToSigned<decltype(d)> di; + const RebindToUnsigned<decltype(dn)> dn_u; + + // First, do a signed to signed demotion. This will convert any values + // that are greater than hwy::HighestValue<MakeSigned<TFromV<V>>>() to a + // negative value. + const auto i2i_demote_result = + ReorderDemote2To(dn, BitCast(di, a), BitCast(di, b)); + + // Second, convert any negative values to hwy::HighestValue<TFromD<DN>>() + // using an unsigned Min operation. 
+ const auto max_signed_val = Set(dn, hwy::HighestValue<TFromD<DN>>()); + + return BitCast( + dn, Min(BitCast(dn_u, i2i_demote_result), BitCast(dn_u, max_signed_val))); +} +#endif + +// ------------------------------ OrderedTruncate2To + +#if HWY_IDE || \ + (defined(HWY_NATIVE_ORDERED_TRUNCATE_2_TO) == defined(HWY_TARGET_TOGGLE)) + +#ifdef HWY_NATIVE_ORDERED_TRUNCATE_2_TO +#undef HWY_NATIVE_ORDERED_TRUNCATE_2_TO +#else +#define HWY_NATIVE_ORDERED_TRUNCATE_2_TO +#endif + +// (Must come after HWY_TARGET_TOGGLE, else we don't reset it for scalar) +#if HWY_TARGET != HWY_SCALAR || HWY_IDE +template <class DN, HWY_IF_UNSIGNED_D(DN), class V, HWY_IF_UNSIGNED_V(V), + HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2), + HWY_IF_LANES_D(DFromV<VFromD<DN>>, HWY_MAX_LANES_D(DFromV<V>) * 2)> +HWY_API VFromD<DN> OrderedTruncate2To(DN dn, V a, V b) { + return ConcatEven(dn, BitCast(dn, b), BitCast(dn, a)); +} +#endif // HWY_TARGET != HWY_SCALAR +#endif // HWY_NATIVE_ORDERED_TRUNCATE_2_TO + +// -------------------- LeadingZeroCount, TrailingZeroCount, HighestSetBitIndex + +#if (defined(HWY_NATIVE_LEADING_ZERO_COUNT) == defined(HWY_TARGET_TOGGLE)) +#ifdef HWY_NATIVE_LEADING_ZERO_COUNT +#undef HWY_NATIVE_LEADING_ZERO_COUNT +#else +#define HWY_NATIVE_LEADING_ZERO_COUNT +#endif + +namespace detail { + +template <class D, HWY_IF_U32_D(D)> +HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) { + const RebindToFloat<decltype(d)> df; +#if HWY_TARGET > HWY_AVX3 && HWY_TARGET <= HWY_SSE2 + const RebindToSigned<decltype(d)> di; + const Repartition<int16_t, decltype(d)> di16; + + // On SSE2/SSSE3/SSE4/AVX2, do an int32_t to float conversion, followed + // by a unsigned right shift of the uint32_t bit representation of the + // floating point values by 23, followed by an int16_t Min + // operation as we are only interested in the biased exponent that would + // result from a uint32_t to float conversion. 
+ + // An int32_t to float vector conversion is also much more efficient on + // SSE2/SSSE3/SSE4/AVX2 than an uint32_t vector to float vector conversion + // as an uint32_t vector to float vector conversion on SSE2/SSSE3/SSE4/AVX2 + // requires multiple instructions whereas an int32_t to float vector + // conversion can be carried out using a single instruction on + // SSE2/SSSE3/SSE4/AVX2. + + const auto f32_bits = BitCast(d, ConvertTo(df, BitCast(di, v))); + return BitCast(d, Min(BitCast(di16, ShiftRight<23>(f32_bits)), + BitCast(di16, Set(d, 158)))); +#else + const auto f32_bits = BitCast(d, ConvertTo(df, v)); + return BitCast(d, ShiftRight<23>(f32_bits)); +#endif +} + +template <class V, HWY_IF_U32_D(DFromV<V>)> +HWY_INLINE V I32RangeU32ToF32BiasedExp(V v) { + // I32RangeU32ToF32BiasedExp is similar to UIntToF32BiasedExp, but + // I32RangeU32ToF32BiasedExp assumes that v[i] is between 0 and 2147483647. + const DFromV<decltype(v)> d; + const RebindToFloat<decltype(d)> df; +#if HWY_TARGET > HWY_AVX3 && HWY_TARGET <= HWY_SSE2 + const RebindToSigned<decltype(d)> d_src; +#else + const RebindToUnsigned<decltype(d)> d_src; +#endif + const auto f32_bits = BitCast(d, ConvertTo(df, BitCast(d_src, v))); + return ShiftRight<23>(f32_bits); +} + +template <class D, HWY_IF_U16_D(D), HWY_IF_LANES_LE_D(D, HWY_MAX_BYTES / 4)> +HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) { + const Rebind<uint32_t, decltype(d)> du32; + const auto f32_biased_exp_as_u32 = + I32RangeU32ToF32BiasedExp(PromoteTo(du32, v)); + return TruncateTo(d, f32_biased_exp_as_u32); +} + +#if HWY_TARGET != HWY_SCALAR +template <class D, HWY_IF_U16_D(D), HWY_IF_LANES_GT_D(D, HWY_MAX_BYTES / 4)> +HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) { + const Half<decltype(d)> dh; + const Rebind<uint32_t, decltype(dh)> du32; + + const auto lo_u32 = PromoteTo(du32, LowerHalf(dh, v)); + const auto hi_u32 = PromoteTo(du32, UpperHalf(dh, v)); + + const auto lo_f32_biased_exp_as_u32 = 
I32RangeU32ToF32BiasedExp(lo_u32); + const auto hi_f32_biased_exp_as_u32 = I32RangeU32ToF32BiasedExp(hi_u32); +#if HWY_TARGET <= HWY_SSE2 + const RebindToSigned<decltype(du32)> di32; + const RebindToSigned<decltype(d)> di; + return BitCast(d, + OrderedDemote2To(di, BitCast(di32, lo_f32_biased_exp_as_u32), + BitCast(di32, hi_f32_biased_exp_as_u32))); +#else + return OrderedTruncate2To(d, lo_f32_biased_exp_as_u32, + hi_f32_biased_exp_as_u32); +#endif +} +#endif // HWY_TARGET != HWY_SCALAR + +template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_LE_D(D, HWY_MAX_BYTES / 4)> +HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) { + const Rebind<uint32_t, decltype(d)> du32; + const auto f32_biased_exp_as_u32 = + I32RangeU32ToF32BiasedExp(PromoteTo(du32, v)); + return U8FromU32(f32_biased_exp_as_u32); +} + +#if HWY_TARGET != HWY_SCALAR +template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_GT_D(D, HWY_MAX_BYTES / 4), + HWY_IF_LANES_LE_D(D, HWY_MAX_BYTES / 2)> +HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) { + const Half<decltype(d)> dh; + const Rebind<uint32_t, decltype(dh)> du32; + const Repartition<uint16_t, decltype(du32)> du16; + + const auto lo_u32 = PromoteTo(du32, LowerHalf(dh, v)); + const auto hi_u32 = PromoteTo(du32, UpperHalf(dh, v)); + + const auto lo_f32_biased_exp_as_u32 = I32RangeU32ToF32BiasedExp(lo_u32); + const auto hi_f32_biased_exp_as_u32 = I32RangeU32ToF32BiasedExp(hi_u32); + +#if HWY_TARGET <= HWY_SSE2 + const RebindToSigned<decltype(du32)> di32; + const RebindToSigned<decltype(du16)> di16; + const auto f32_biased_exp_as_i16 = + OrderedDemote2To(di16, BitCast(di32, lo_f32_biased_exp_as_u32), + BitCast(di32, hi_f32_biased_exp_as_u32)); + return DemoteTo(d, f32_biased_exp_as_i16); +#else + const auto f32_biased_exp_as_u16 = OrderedTruncate2To( + du16, lo_f32_biased_exp_as_u32, hi_f32_biased_exp_as_u32); + return TruncateTo(d, f32_biased_exp_as_u16); +#endif +} + +template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_GT_D(D, HWY_MAX_BYTES / 2)> 
+HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) { + const Half<decltype(d)> dh; + const Half<decltype(dh)> dq; + const Rebind<uint32_t, decltype(dq)> du32; + const Repartition<uint16_t, decltype(du32)> du16; + + const auto lo_half = LowerHalf(dh, v); + const auto hi_half = UpperHalf(dh, v); + + const auto u32_q0 = PromoteTo(du32, LowerHalf(dq, lo_half)); + const auto u32_q1 = PromoteTo(du32, UpperHalf(dq, lo_half)); + const auto u32_q2 = PromoteTo(du32, LowerHalf(dq, hi_half)); + const auto u32_q3 = PromoteTo(du32, UpperHalf(dq, hi_half)); + + const auto f32_biased_exp_as_u32_q0 = I32RangeU32ToF32BiasedExp(u32_q0); + const auto f32_biased_exp_as_u32_q1 = I32RangeU32ToF32BiasedExp(u32_q1); + const auto f32_biased_exp_as_u32_q2 = I32RangeU32ToF32BiasedExp(u32_q2); + const auto f32_biased_exp_as_u32_q3 = I32RangeU32ToF32BiasedExp(u32_q3); + +#if HWY_TARGET <= HWY_SSE2 + const RebindToSigned<decltype(du32)> di32; + const RebindToSigned<decltype(du16)> di16; + + const auto lo_f32_biased_exp_as_i16 = + OrderedDemote2To(di16, BitCast(di32, f32_biased_exp_as_u32_q0), + BitCast(di32, f32_biased_exp_as_u32_q1)); + const auto hi_f32_biased_exp_as_i16 = + OrderedDemote2To(di16, BitCast(di32, f32_biased_exp_as_u32_q2), + BitCast(di32, f32_biased_exp_as_u32_q3)); + return OrderedDemote2To(d, lo_f32_biased_exp_as_i16, + hi_f32_biased_exp_as_i16); +#else + const auto lo_f32_biased_exp_as_u16 = OrderedTruncate2To( + du16, f32_biased_exp_as_u32_q0, f32_biased_exp_as_u32_q1); + const auto hi_f32_biased_exp_as_u16 = OrderedTruncate2To( + du16, f32_biased_exp_as_u32_q2, f32_biased_exp_as_u32_q3); + return OrderedTruncate2To(d, lo_f32_biased_exp_as_u16, + hi_f32_biased_exp_as_u16); +#endif +} +#endif // HWY_TARGET != HWY_SCALAR + +#if HWY_TARGET == HWY_SCALAR +template <class D> +using F32ExpLzcntMinMaxRepartition = RebindToUnsigned<D>; +#elif HWY_TARGET >= HWY_SSSE3 && HWY_TARGET <= HWY_SSE2 +template <class D> +using F32ExpLzcntMinMaxRepartition = Repartition<uint8_t, D>; 
+#else +template <class D> +using F32ExpLzcntMinMaxRepartition = + Repartition<UnsignedFromSize<HWY_MIN(sizeof(TFromD<D>), 4)>, D>; +#endif + +template <class V> +using F32ExpLzcntMinMaxCmpV = VFromD<F32ExpLzcntMinMaxRepartition<DFromV<V>>>; + +template <class V> +HWY_INLINE F32ExpLzcntMinMaxCmpV<V> F32ExpLzcntMinMaxBitCast(V v) { + const DFromV<decltype(v)> d; + const F32ExpLzcntMinMaxRepartition<decltype(d)> d2; + return BitCast(d2, v); +} + +template <class D, HWY_IF_U64_D(D)> +HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) { +#if HWY_TARGET == HWY_SCALAR + const uint64_t u64_val = GetLane(v); + const float f32_val = static_cast<float>(u64_val); + uint32_t f32_bits; + CopySameSize(&f32_val, &f32_bits); + return Set(d, static_cast<uint64_t>(f32_bits >> 23)); +#else + const Repartition<uint32_t, decltype(d)> du32; + const auto f32_biased_exp = UIntToF32BiasedExp(du32, BitCast(du32, v)); + const auto f32_biased_exp_adj = + IfThenZeroElse(Eq(f32_biased_exp, Zero(du32)), + BitCast(du32, Set(d, 0x0000002000000000u))); + const auto adj_f32_biased_exp = Add(f32_biased_exp, f32_biased_exp_adj); + + return ShiftRight<32>(BitCast( + d, Max(F32ExpLzcntMinMaxBitCast(adj_f32_biased_exp), + F32ExpLzcntMinMaxBitCast(Reverse2(du32, adj_f32_biased_exp))))); +#endif +} + +template <class V, HWY_IF_UNSIGNED_V(V)> +HWY_INLINE V UIntToF32BiasedExp(V v) { + const DFromV<decltype(v)> d; + return UIntToF32BiasedExp(d, v); +} + +template <class V, HWY_IF_UNSIGNED_V(V), + HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))> +HWY_INLINE V NormalizeForUIntTruncConvToF32(V v) { + return v; +} + +template <class V, HWY_IF_UNSIGNED_V(V), + HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 4) | (1 << 8))> +HWY_INLINE V NormalizeForUIntTruncConvToF32(V v) { + // If v[i] >= 16777216 is true, make sure that the bit at + // HighestSetBitIndex(v[i]) - 24 is zeroed out to ensure that any inexact + // conversion to single-precision floating point is rounded down. 
+ + // This zeroing-out can be accomplished through the AndNot operation below. + return AndNot(ShiftRight<24>(v), v); +} + +} // namespace detail + +template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)> +HWY_API V HighestSetBitIndex(V v) { + const DFromV<decltype(v)> d; + const RebindToUnsigned<decltype(d)> du; + using TU = TFromD<decltype(du)>; + + const auto f32_biased_exp = detail::UIntToF32BiasedExp( + detail::NormalizeForUIntTruncConvToF32(BitCast(du, v))); + return BitCast(d, Sub(f32_biased_exp, Set(du, TU{127}))); +} + +template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)> +HWY_API V LeadingZeroCount(V v) { + const DFromV<decltype(v)> d; + const RebindToUnsigned<decltype(d)> du; + using TU = TFromD<decltype(du)>; + + constexpr TU kNumOfBitsInT{sizeof(TU) * 8}; + const auto f32_biased_exp = detail::UIntToF32BiasedExp( + detail::NormalizeForUIntTruncConvToF32(BitCast(du, v))); + const auto lz_count = Sub(Set(du, TU{kNumOfBitsInT + 126}), f32_biased_exp); + + return BitCast(d, + Min(detail::F32ExpLzcntMinMaxBitCast(lz_count), + detail::F32ExpLzcntMinMaxBitCast(Set(du, kNumOfBitsInT)))); +} + +template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)> +HWY_API V TrailingZeroCount(V v) { + const DFromV<decltype(v)> d; + const RebindToUnsigned<decltype(d)> du; + const RebindToSigned<decltype(d)> di; + using TU = TFromD<decltype(du)>; + + const auto vi = BitCast(di, v); + const auto lowest_bit = BitCast(du, And(vi, Neg(vi))); + + constexpr TU kNumOfBitsInT{sizeof(TU) * 8}; + const auto f32_biased_exp = detail::UIntToF32BiasedExp(lowest_bit); + const auto tz_count = Sub(f32_biased_exp, Set(du, TU{127})); + + return BitCast(d, + Min(detail::F32ExpLzcntMinMaxBitCast(tz_count), + detail::F32ExpLzcntMinMaxBitCast(Set(du, kNumOfBitsInT)))); +} +#endif // HWY_NATIVE_LEADING_ZERO_COUNT + +// ------------------------------ AESRound + +// Cannot implement on scalar: need at least 16 bytes for TableLookupBytes. 
#if HWY_TARGET != HWY_SCALAR || HWY_IDE

// Define for white-box testing, even if native instructions are available.
namespace detail {

// Constant-time: computes inverse in GF(2^4) based on "Accelerating AES with
// Vector Permute Instructions" and the accompanying assembly language
// implementation: https://crypto.stanford.edu/vpaes/vpaes.tgz. See also Botan:
// https://botan.randombit.net/doxygen/aes__vperm_8cpp_source.html .
//
// A brute-force 256 byte table lookup can also be made constant-time, and
// possibly competitive on NEON, but this is more performance-portable
// especially for x86 and large vectors.

// Shared core of SubBytes and InvSubBytes (both call it below).
//
// `state`: bytes to process. Taken by value; it is overwritten with its
//   basis-changed form before the inversion step.
// `affine_tblL`, `affine_tblU`: byte tables indexed by the two inversion
//   outputs (`outL`, `outU`) via TableLookupBytesOr0. Both callers pass
//   16-byte tables loaded with LoadDup128.
// Returns the XOR of the two affine-table lookups. Any constant bias (such as
//   the 0x63 in SubBytes) is left to the caller.
template <class V>  // u8
HWY_INLINE V SubBytesMulInverseAndAffineLookup(V state, V affine_tblL,
                                               V affine_tblU) {
  const DFromV<V> du;
  // Selects the low nibble of each byte; used for both nibble splits below.
  const auto mask = Set(du, uint8_t{0xF});

  // Change polynomial basis to GF(2^4)
  {
    // One 16-entry table per nibble: basisL is indexed by the low nibble and
    // basisU by the high nibble; the two lookups are XORed to form the new
    // `state`.
    alignas(16) static constexpr uint8_t basisL[16] = {
        0x00, 0x70, 0x2A, 0x5A, 0x98, 0xE8, 0xB2, 0xC2,
        0x08, 0x78, 0x22, 0x52, 0x90, 0xE0, 0xBA, 0xCA};
    alignas(16) static constexpr uint8_t basisU[16] = {
        0x00, 0x4D, 0x7C, 0x31, 0x7D, 0x30, 0x01, 0x4C,
        0x81, 0xCC, 0xFD, 0xB0, 0xFC, 0xB1, 0x80, 0xCD};
    const auto sL = And(state, mask);
    const auto sU = ShiftRight<4>(state);  // byte shift => upper bits are zero
    const auto gf4L = TableLookupBytes(LoadDup128(du, basisL), sL);
    const auto gf4U = TableLookupBytes(LoadDup128(du, basisU), sU);
    state = Xor(gf4L, gf4U);
  }

  // Inversion in GF(2^4). Elements 0 represent "infinity" (division by 0) and
  // cause TableLookupBytesOr0 to return 0.
  // NOTE(review): index 0 of both tables below holds 0x80, presumably the
  // "infinity" marker referred to above - confirm against the vpaes reference.
  alignas(16) static constexpr uint8_t kZetaInv[16] = {
      0x80, 7, 11, 15, 6, 10, 4, 1, 9, 8, 5, 2, 12, 14, 13, 3};
  alignas(16) static constexpr uint8_t kInv[16] = {
      0x80, 1, 8, 13, 15, 6, 5, 14, 2, 12, 11, 10, 9, 3, 7, 4};
  // kInv is loaded once and reused for four lookups (invU, invX, outL, outU);
  // only invL uses kZetaInv.
  const auto tbl = LoadDup128(du, kInv);
  const auto sL = And(state, mask);      // L=low nibble, U=upper
  const auto sU = ShiftRight<4>(state);  // byte shift => upper bits are zero
  const auto sX = Xor(sU, sL);
  // These three lookups use plain TableLookupBytes; the two lookups that
  // consume their XORs use the Or0 variant. Keep each call as written.
  const auto invL = TableLookupBytes(LoadDup128(du, kZetaInv), sL);
  const auto invU = TableLookupBytes(tbl, sU);
  const auto invX = TableLookupBytes(tbl, sX);
  const auto outL = Xor(sX, TableLookupBytesOr0(tbl, Xor(invL, invU)));
  const auto outU = Xor(sU, TableLookupBytesOr0(tbl, Xor(invL, invX)));

  // Caller-supplied output transform: one table per inversion output, XORed.
  const auto affL = TableLookupBytesOr0(affine_tblL, outL);
  const auto affU = TableLookupBytesOr0(affine_tblU, outU);
  return Xor(affL, affU);
}

// Runs the shared inversion core with the kAffineL/kAffineU tables, then XORs
// every byte with 0x63.
template <class V>  // u8
HWY_INLINE V SubBytes(V state) {
  const DFromV<V> du;
  // Linear skew (cannot bake 0x63 bias into the table because out* indices
  // may have the infinity flag set).
  alignas(16) static constexpr uint8_t kAffineL[16] = {
      0x00, 0xC7, 0xBD, 0x6F, 0x17, 0x6D, 0xD2, 0xD0,
      0x78, 0xA8, 0x02, 0xC5, 0x7A, 0xBF, 0xAA, 0x15};
  alignas(16) static constexpr uint8_t kAffineU[16] = {
      0x00, 0x6A, 0xBB, 0x5F, 0xA5, 0x74, 0xE4, 0xCF,
      0xFA, 0x35, 0x2B, 0x41, 0xD1, 0x90, 0x1E, 0x8E};
  return Xor(SubBytesMulInverseAndAffineLookup(state, LoadDup128(du, kAffineL),
                                               LoadDup128(du, kAffineU)),
             Set(du, uint8_t{0x63}));
}

// Counterpart of SubBytes: transforms `state` first (per-byte rotations XORed
// with 0x05), then runs the shared inversion core with the
// kGF2P4InvToGF2P8Inv* tables. No constant is XORed in afterwards.
template <class V>  // u8
HWY_INLINE V InvSubBytes(V state) {
  const DFromV<V> du;
  alignas(16) static constexpr uint8_t kGF2P4InvToGF2P8InvL[16]{
      0x00, 0x40, 0xF9, 0x7E, 0x53, 0xEA, 0x87, 0x13,
      0x2D, 0x3E, 0x94, 0xD4, 0xB9, 0x6D, 0xAA, 0xC7};
  alignas(16) static constexpr uint8_t kGF2P4InvToGF2P8InvU[16]{
      0x00, 0x1D, 0x44, 0x93, 0x0F, 0x56, 0xD7, 0x12,
      0x9C, 0x8E, 0xC5, 0xD8, 0x59, 0x81, 0x4B, 0xCA};

  // Apply the inverse affine transformation
  // With 8-bit lanes (see the `// u8` note), each Or(ShiftLeft<k>,
  // ShiftRight<8 - k>) pair is a per-byte rotate left by k. The rotations by
  // 1, 3 and 6 are XORed together and then with the constant 0x05.
  const auto b = Xor(Xor3(Or(ShiftLeft<1>(state), ShiftRight<7>(state)),
                          Or(ShiftLeft<3>(state), ShiftRight<5>(state)),
                          Or(ShiftLeft<6>(state), ShiftRight<2>(state))),
                     Set(du, uint8_t{0x05}));

  // The GF(2^8) multiplicative inverse is computed as follows:
  // - Changing the polynomial basis to GF(2^4)
  // - Computing the GF(2^4) multiplicative inverse
  // - Converting the GF(2^4) multiplicative inverse to the GF(2^8)
  //   multiplicative inverse through table lookups using the
  //   kGF2P4InvToGF2P8InvL and kGF2P4InvToGF2P8InvU tables
  return SubBytesMulInverseAndAffineLookup(
      b, LoadDup128(du, kGF2P4InvToGF2P8InvL),
      LoadDup128(du, kGF2P4InvToGF2P8InvU));
}

}  // namespace detail

#endif  // HWY_TARGET != HWY_SCALAR || HWY_IDE

// "Include guard": skip if native AES instructions are available.
+#if (defined(HWY_NATIVE_AES) == defined(HWY_TARGET_TOGGLE)) +#ifdef HWY_NATIVE_AES +#undef HWY_NATIVE_AES +#else +#define HWY_NATIVE_AES +#endif + +// (Must come after HWY_TARGET_TOGGLE, else we don't reset it for scalar) +#if HWY_TARGET != HWY_SCALAR + +namespace detail { + +template <class V> // u8 +HWY_API V ShiftRows(const V state) { + const DFromV<V> du; + alignas(16) static constexpr uint8_t kShiftRow[16] = { + 0, 5, 10, 15, // transposed: state is column major + 4, 9, 14, 3, // + 8, 13, 2, 7, // + 12, 1, 6, 11}; + const auto shift_row = LoadDup128(du, kShiftRow); + return TableLookupBytes(state, shift_row); +} + +template <class V> // u8 +HWY_API V InvShiftRows(const V state) { + const DFromV<V> du; + alignas(16) static constexpr uint8_t kShiftRow[16] = { + 0, 13, 10, 7, // transposed: state is column major + 4, 1, 14, 11, // + 8, 5, 2, 15, // + 12, 9, 6, 3}; + const auto shift_row = LoadDup128(du, kShiftRow); + return TableLookupBytes(state, shift_row); +} + +template <class V> // u8 +HWY_API V GF2P8Mod11BMulBy2(V v) { + const DFromV<V> du; + const RebindToSigned<decltype(du)> di; // can only do signed comparisons + const auto msb = Lt(BitCast(di, v), Zero(di)); + const auto overflow = BitCast(du, IfThenElseZero(msb, Set(di, int8_t{0x1B}))); + return Xor(Add(v, v), overflow); // = v*2 in GF(2^8). +} + +template <class V> // u8 +HWY_API V MixColumns(const V state) { + const DFromV<V> du; + // For each column, the rows are the sum of GF(2^8) matrix multiplication by: + // 2 3 1 1 // Let s := state*1, d := state*2, t := state*3. + // 1 2 3 1 // d are on diagonal, no permutation needed. + // 1 1 2 3 // t1230 indicates column indices of threes for the 4 rows. + // 3 1 1 2 // We also need to compute s2301 and s3012 (=1230 o 2301). 
+ alignas(16) static constexpr uint8_t k2301[16] = { + 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; + alignas(16) static constexpr uint8_t k1230[16] = { + 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12}; + const auto d = GF2P8Mod11BMulBy2(state); // = state*2 in GF(2^8). + const auto s2301 = TableLookupBytes(state, LoadDup128(du, k2301)); + const auto d_s2301 = Xor(d, s2301); + const auto t_s2301 = Xor(state, d_s2301); // t(s*3) = XOR-sum {s, d(s*2)} + const auto t1230_s3012 = TableLookupBytes(t_s2301, LoadDup128(du, k1230)); + return Xor(d_s2301, t1230_s3012); // XOR-sum of 4 terms +} + +template <class V> // u8 +HWY_API V InvMixColumns(const V state) { + const DFromV<V> du; + // For each column, the rows are the sum of GF(2^8) matrix multiplication by: + // 14 11 13 9 + // 9 14 11 13 + // 13 9 14 11 + // 11 13 9 14 + alignas(16) static constexpr uint8_t k2301[16] = { + 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; + alignas(16) static constexpr uint8_t k1230[16] = { + 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12}; + const auto v1230 = LoadDup128(du, k1230); + + const auto sx2 = GF2P8Mod11BMulBy2(state); /* = state*2 in GF(2^8) */ + const auto sx4 = GF2P8Mod11BMulBy2(sx2); /* = state*4 in GF(2^8) */ + const auto sx8 = GF2P8Mod11BMulBy2(sx4); /* = state*8 in GF(2^8) */ + const auto sx9 = Xor(sx8, state); /* = state*9 in GF(2^8) */ + const auto sx11 = Xor(sx9, sx2); /* = state*11 in GF(2^8) */ + const auto sx13 = Xor(sx9, sx4); /* = state*13 in GF(2^8) */ + const auto sx14 = Xor3(sx8, sx4, sx2); /* = state*14 in GF(2^8) */ + + const auto sx13_0123_sx9_1230 = Xor(sx13, TableLookupBytes(sx9, v1230)); + const auto sx14_0123_sx11_1230 = Xor(sx14, TableLookupBytes(sx11, v1230)); + const auto sx13_2301_sx9_3012 = + TableLookupBytes(sx13_0123_sx9_1230, LoadDup128(du, k2301)); + return Xor(sx14_0123_sx11_1230, sx13_2301_sx9_3012); +} + +} // namespace detail + +template <class V> // u8 +HWY_API V AESRound(V state, const V round_key) { + // 
Intel docs swap the first two steps, but it does not matter because + // ShiftRows is a permutation and SubBytes is independent of lane index. + state = detail::SubBytes(state); + state = detail::ShiftRows(state); + state = detail::MixColumns(state); + state = Xor(state, round_key); // AddRoundKey + return state; +} + +template <class V> // u8 +HWY_API V AESLastRound(V state, const V round_key) { + // LIke AESRound, but without MixColumns. + state = detail::SubBytes(state); + state = detail::ShiftRows(state); + state = Xor(state, round_key); // AddRoundKey + return state; +} + +template <class V> +HWY_API V AESInvMixColumns(V state) { + return detail::InvMixColumns(state); +} + +template <class V> // u8 +HWY_API V AESRoundInv(V state, const V round_key) { + state = detail::InvSubBytes(state); + state = detail::InvShiftRows(state); + state = detail::InvMixColumns(state); + state = Xor(state, round_key); // AddRoundKey + return state; +} + +template <class V> // u8 +HWY_API V AESLastRoundInv(V state, const V round_key) { + // Like AESRoundInv, but without InvMixColumns. + state = detail::InvSubBytes(state); + state = detail::InvShiftRows(state); + state = Xor(state, round_key); // AddRoundKey + return state; +} + +template <uint8_t kRcon, class V, HWY_IF_U8_D(DFromV<V>)> +HWY_API V AESKeyGenAssist(V v) { + alignas(16) static constexpr uint8_t kRconXorMask[16] = { + 0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0, 0, 0, kRcon, 0, 0, 0}; + alignas(16) static constexpr uint8_t kRotWordShuffle[16] = { + 4, 5, 6, 7, 5, 6, 7, 4, 12, 13, 14, 15, 13, 14, 15, 12}; + const DFromV<decltype(v)> d; + const auto sub_word_result = detail::SubBytes(v); + const auto rot_word_result = + TableLookupBytes(sub_word_result, LoadDup128(d, kRotWordShuffle)); + return Xor(rot_word_result, LoadDup128(d, kRconXorMask)); +} + +// Constant-time implementation inspired by +// https://www.bearssl.org/constanttime.html, but about half the cost because we +// use 64x64 multiplies and 128-bit XORs. 
+template <class V> +HWY_API V CLMulLower(V a, V b) { + const DFromV<V> d; + static_assert(IsSame<TFromD<decltype(d)>, uint64_t>(), "V must be u64"); + const auto k1 = Set(d, 0x1111111111111111ULL); + const auto k2 = Set(d, 0x2222222222222222ULL); + const auto k4 = Set(d, 0x4444444444444444ULL); + const auto k8 = Set(d, 0x8888888888888888ULL); + const auto a0 = And(a, k1); + const auto a1 = And(a, k2); + const auto a2 = And(a, k4); + const auto a3 = And(a, k8); + const auto b0 = And(b, k1); + const auto b1 = And(b, k2); + const auto b2 = And(b, k4); + const auto b3 = And(b, k8); + + auto m0 = Xor(MulEven(a0, b0), MulEven(a1, b3)); + auto m1 = Xor(MulEven(a0, b1), MulEven(a1, b0)); + auto m2 = Xor(MulEven(a0, b2), MulEven(a1, b1)); + auto m3 = Xor(MulEven(a0, b3), MulEven(a1, b2)); + m0 = Xor(m0, Xor(MulEven(a2, b2), MulEven(a3, b1))); + m1 = Xor(m1, Xor(MulEven(a2, b3), MulEven(a3, b2))); + m2 = Xor(m2, Xor(MulEven(a2, b0), MulEven(a3, b3))); + m3 = Xor(m3, Xor(MulEven(a2, b1), MulEven(a3, b0))); + return Or(Or(And(m0, k1), And(m1, k2)), Or(And(m2, k4), And(m3, k8))); +} + +template <class V> +HWY_API V CLMulUpper(V a, V b) { + const DFromV<V> d; + static_assert(IsSame<TFromD<decltype(d)>, uint64_t>(), "V must be u64"); + const auto k1 = Set(d, 0x1111111111111111ULL); + const auto k2 = Set(d, 0x2222222222222222ULL); + const auto k4 = Set(d, 0x4444444444444444ULL); + const auto k8 = Set(d, 0x8888888888888888ULL); + const auto a0 = And(a, k1); + const auto a1 = And(a, k2); + const auto a2 = And(a, k4); + const auto a3 = And(a, k8); + const auto b0 = And(b, k1); + const auto b1 = And(b, k2); + const auto b2 = And(b, k4); + const auto b3 = And(b, k8); + + auto m0 = Xor(MulOdd(a0, b0), MulOdd(a1, b3)); + auto m1 = Xor(MulOdd(a0, b1), MulOdd(a1, b0)); + auto m2 = Xor(MulOdd(a0, b2), MulOdd(a1, b1)); + auto m3 = Xor(MulOdd(a0, b3), MulOdd(a1, b2)); + m0 = Xor(m0, Xor(MulOdd(a2, b2), MulOdd(a3, b1))); + m1 = Xor(m1, Xor(MulOdd(a2, b3), MulOdd(a3, b2))); + m2 = Xor(m2, 
Xor(MulOdd(a2, b0), MulOdd(a3, b3))); + m3 = Xor(m3, Xor(MulOdd(a2, b1), MulOdd(a3, b0))); + return Or(Or(And(m0, k1), And(m1, k2)), Or(And(m2, k4), And(m3, k8))); +} + +#endif // HWY_NATIVE_AES +#endif // HWY_TARGET != HWY_SCALAR + +// ------------------------------ PopulationCount + +// "Include guard": skip if native POPCNT-related instructions are available. +#if (defined(HWY_NATIVE_POPCNT) == defined(HWY_TARGET_TOGGLE)) +#ifdef HWY_NATIVE_POPCNT +#undef HWY_NATIVE_POPCNT +#else +#define HWY_NATIVE_POPCNT +#endif + +// This overload requires vectors to be at least 16 bytes, which is the case +// for LMUL >= 2. +#undef HWY_IF_POPCNT +#if HWY_TARGET == HWY_RVV +#define HWY_IF_POPCNT(D) \ + hwy::EnableIf<D().Pow2() >= 1 && D().MaxLanes() >= 16>* = nullptr +#else +// Other targets only have these two overloads which are mutually exclusive, so +// no further conditions are required. +#define HWY_IF_POPCNT(D) void* = nullptr +#endif // HWY_TARGET == HWY_RVV + +template <class V, class D = DFromV<V>, HWY_IF_U8_D(D), + HWY_IF_V_SIZE_GT_D(D, 8), HWY_IF_POPCNT(D)> +HWY_API V PopulationCount(V v) { + const D d; + HWY_ALIGN constexpr uint8_t kLookup[16] = { + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, + }; + const auto lo = And(v, Set(d, uint8_t{0xF})); + const auto hi = ShiftRight<4>(v); + const auto lookup = LoadDup128(d, kLookup); + return Add(TableLookupBytes(lookup, hi), TableLookupBytes(lookup, lo)); +} + +// RVV has a specialization that avoids the Set(). +#if HWY_TARGET != HWY_RVV +// Slower fallback for capped vectors. 
+template <class V, class D = DFromV<V>, HWY_IF_U8_D(D), + HWY_IF_V_SIZE_LE_D(D, 8)> +HWY_API V PopulationCount(V v) { + const D d; + // See https://arxiv.org/pdf/1611.07612.pdf, Figure 3 + const V k33 = Set(d, uint8_t{0x33}); + v = Sub(v, And(ShiftRight<1>(v), Set(d, uint8_t{0x55}))); + v = Add(And(ShiftRight<2>(v), k33), And(v, k33)); + return And(Add(v, ShiftRight<4>(v)), Set(d, uint8_t{0x0F})); +} +#endif // HWY_TARGET != HWY_RVV + +template <class V, class D = DFromV<V>, HWY_IF_U16_D(D)> +HWY_API V PopulationCount(V v) { + const D d; + const Repartition<uint8_t, decltype(d)> d8; + const auto vals = BitCast(d, PopulationCount(BitCast(d8, v))); + return Add(ShiftRight<8>(vals), And(vals, Set(d, uint16_t{0xFF}))); +} + +template <class V, class D = DFromV<V>, HWY_IF_U32_D(D)> +HWY_API V PopulationCount(V v) { + const D d; + Repartition<uint16_t, decltype(d)> d16; + auto vals = BitCast(d, PopulationCount(BitCast(d16, v))); + return Add(ShiftRight<16>(vals), And(vals, Set(d, uint32_t{0xFF}))); +} + +#if HWY_HAVE_INTEGER64 +template <class V, class D = DFromV<V>, HWY_IF_U64_D(D)> +HWY_API V PopulationCount(V v) { + const D d; + Repartition<uint32_t, decltype(d)> d32; + auto vals = BitCast(d, PopulationCount(BitCast(d32, v))); + return Add(ShiftRight<32>(vals), And(vals, Set(d, 0xFFULL))); +} +#endif + +#endif // HWY_NATIVE_POPCNT + +// ------------------------------ 8-bit multiplication + +// "Include guard": skip if native 8-bit mul instructions are available. 
+#if (defined(HWY_NATIVE_MUL_8) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE +#ifdef HWY_NATIVE_MUL_8 +#undef HWY_NATIVE_MUL_8 +#else +#define HWY_NATIVE_MUL_8 +#endif + +// 8 bit and fits in wider reg: promote +template <class V, HWY_IF_T_SIZE_V(V, 1), + HWY_IF_V_SIZE_LE_V(V, HWY_MAX_BYTES / 2)> +HWY_API V operator*(const V a, const V b) { + const DFromV<decltype(a)> d; + const Rebind<MakeWide<TFromV<V>>, decltype(d)> dw; + const RebindToUnsigned<decltype(d)> du; // TruncateTo result + const RebindToUnsigned<decltype(dw)> dwu; // TruncateTo input + const VFromD<decltype(dw)> mul = PromoteTo(dw, a) * PromoteTo(dw, b); + // TruncateTo is cheaper than ConcatEven. + return BitCast(d, TruncateTo(du, BitCast(dwu, mul))); +} + +// 8 bit full reg: promote halves +template <class V, HWY_IF_T_SIZE_V(V, 1), + HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)> +HWY_API V operator*(const V a, const V b) { + const DFromV<decltype(a)> d; + const Half<decltype(d)> dh; + const Twice<RepartitionToWide<decltype(dh)>> dw; + const VFromD<decltype(dw)> a0 = PromoteTo(dw, LowerHalf(dh, a)); + const VFromD<decltype(dw)> a1 = PromoteTo(dw, UpperHalf(dh, a)); + const VFromD<decltype(dw)> b0 = PromoteTo(dw, LowerHalf(dh, b)); + const VFromD<decltype(dw)> b1 = PromoteTo(dw, UpperHalf(dh, b)); + const VFromD<decltype(dw)> m0 = a0 * b0; + const VFromD<decltype(dw)> m1 = a1 * b1; + return ConcatEven(d, BitCast(d, m1), BitCast(d, m0)); +} + +#endif // HWY_NATIVE_MUL_8 + +// ------------------------------ 64-bit multiplication + +// "Include guard": skip if native 64-bit mul instructions are available. 
+#if (defined(HWY_NATIVE_MUL_64) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE +#ifdef HWY_NATIVE_MUL_64 +#undef HWY_NATIVE_MUL_64 +#else +#define HWY_NATIVE_MUL_64 +#endif + +// Single-lane i64 or u64 +template <class V, HWY_IF_T_SIZE_V(V, 8), HWY_IF_V_SIZE_V(V, 8), + HWY_IF_NOT_FLOAT_V(V)> +HWY_API V operator*(V x, V y) { + const DFromV<V> d; + using T = TFromD<decltype(d)>; + using TU = MakeUnsigned<T>; + const TU xu = static_cast<TU>(GetLane(x)); + const TU yu = static_cast<TU>(GetLane(y)); + return Set(d, static_cast<T>(xu * yu)); +} + +template <class V, class D64 = DFromV<V>, HWY_IF_U64_D(D64), + HWY_IF_V_SIZE_GT_D(D64, 8)> +HWY_API V operator*(V x, V y) { + RepartitionToNarrow<D64> d32; + auto x32 = BitCast(d32, x); + auto y32 = BitCast(d32, y); + auto lolo = BitCast(d32, MulEven(x32, y32)); + auto lohi = BitCast(d32, MulEven(x32, BitCast(d32, ShiftRight<32>(y)))); + auto hilo = BitCast(d32, MulEven(BitCast(d32, ShiftRight<32>(x)), y32)); + auto hi = BitCast(d32, ShiftLeft<32>(BitCast(D64{}, lohi + hilo))); + return BitCast(D64{}, lolo + hi); +} +template <class V, class DI64 = DFromV<V>, HWY_IF_I64_D(DI64), + HWY_IF_V_SIZE_GT_D(DI64, 8)> +HWY_API V operator*(V x, V y) { + RebindToUnsigned<DI64> du64; + return BitCast(DI64{}, BitCast(du64, x) * BitCast(du64, y)); +} + +#endif // HWY_NATIVE_MUL_64 + +// ------------------------------ MulAdd / NegMulAdd + +// "Include guard": skip if native int MulAdd instructions are available. 
+#if (defined(HWY_NATIVE_INT_FMA) == defined(HWY_TARGET_TOGGLE)) +#ifdef HWY_NATIVE_INT_FMA +#undef HWY_NATIVE_INT_FMA +#else +#define HWY_NATIVE_INT_FMA +#endif + +template <class V, HWY_IF_NOT_FLOAT_V(V)> +HWY_API V MulAdd(V mul, V x, V add) { + return Add(Mul(mul, x), add); +} + +template <class V, HWY_IF_NOT_FLOAT_V(V)> +HWY_API V NegMulAdd(V mul, V x, V add) { + return Sub(add, Mul(mul, x)); +} + +#endif // HWY_NATIVE_INT_FMA + +// ------------------------------ Compress* + +// "Include guard": skip if native 8-bit compress instructions are available. +#if (defined(HWY_NATIVE_COMPRESS8) == defined(HWY_TARGET_TOGGLE)) +#ifdef HWY_NATIVE_COMPRESS8 +#undef HWY_NATIVE_COMPRESS8 +#else +#define HWY_NATIVE_COMPRESS8 +#endif + +template <class V, class D, typename T, HWY_IF_T_SIZE(T, 1)> +HWY_API size_t CompressBitsStore(V v, const uint8_t* HWY_RESTRICT bits, D d, + T* unaligned) { + HWY_ALIGN T lanes[MaxLanes(d)]; + Store(v, d, lanes); + + const Simd<T, HWY_MIN(MaxLanes(d), 8), 0> d8; + T* HWY_RESTRICT pos = unaligned; + + HWY_ALIGN constexpr T table[2048] = { + 0, 1, 2, 3, 4, 5, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, // + 1, 0, 2, 3, 4, 5, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, // + 2, 0, 1, 3, 4, 5, 6, 7, /**/ 0, 2, 1, 3, 4, 5, 6, 7, // + 1, 2, 0, 3, 4, 5, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, // + 3, 0, 1, 2, 4, 5, 6, 7, /**/ 0, 3, 1, 2, 4, 5, 6, 7, // + 1, 3, 0, 2, 4, 5, 6, 7, /**/ 0, 1, 3, 2, 4, 5, 6, 7, // + 2, 3, 0, 1, 4, 5, 6, 7, /**/ 0, 2, 3, 1, 4, 5, 6, 7, // + 1, 2, 3, 0, 4, 5, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, // + 4, 0, 1, 2, 3, 5, 6, 7, /**/ 0, 4, 1, 2, 3, 5, 6, 7, // + 1, 4, 0, 2, 3, 5, 6, 7, /**/ 0, 1, 4, 2, 3, 5, 6, 7, // + 2, 4, 0, 1, 3, 5, 6, 7, /**/ 0, 2, 4, 1, 3, 5, 6, 7, // + 1, 2, 4, 0, 3, 5, 6, 7, /**/ 0, 1, 2, 4, 3, 5, 6, 7, // + 3, 4, 0, 1, 2, 5, 6, 7, /**/ 0, 3, 4, 1, 2, 5, 6, 7, // + 1, 3, 4, 0, 2, 5, 6, 7, /**/ 0, 1, 3, 4, 2, 5, 6, 7, // + 2, 3, 4, 0, 1, 5, 6, 7, /**/ 0, 2, 3, 4, 1, 5, 6, 7, // + 1, 2, 3, 4, 0, 5, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 
7, // + 5, 0, 1, 2, 3, 4, 6, 7, /**/ 0, 5, 1, 2, 3, 4, 6, 7, // + 1, 5, 0, 2, 3, 4, 6, 7, /**/ 0, 1, 5, 2, 3, 4, 6, 7, // + 2, 5, 0, 1, 3, 4, 6, 7, /**/ 0, 2, 5, 1, 3, 4, 6, 7, // + 1, 2, 5, 0, 3, 4, 6, 7, /**/ 0, 1, 2, 5, 3, 4, 6, 7, // + 3, 5, 0, 1, 2, 4, 6, 7, /**/ 0, 3, 5, 1, 2, 4, 6, 7, // + 1, 3, 5, 0, 2, 4, 6, 7, /**/ 0, 1, 3, 5, 2, 4, 6, 7, // + 2, 3, 5, 0, 1, 4, 6, 7, /**/ 0, 2, 3, 5, 1, 4, 6, 7, // + 1, 2, 3, 5, 0, 4, 6, 7, /**/ 0, 1, 2, 3, 5, 4, 6, 7, // + 4, 5, 0, 1, 2, 3, 6, 7, /**/ 0, 4, 5, 1, 2, 3, 6, 7, // + 1, 4, 5, 0, 2, 3, 6, 7, /**/ 0, 1, 4, 5, 2, 3, 6, 7, // + 2, 4, 5, 0, 1, 3, 6, 7, /**/ 0, 2, 4, 5, 1, 3, 6, 7, // + 1, 2, 4, 5, 0, 3, 6, 7, /**/ 0, 1, 2, 4, 5, 3, 6, 7, // + 3, 4, 5, 0, 1, 2, 6, 7, /**/ 0, 3, 4, 5, 1, 2, 6, 7, // + 1, 3, 4, 5, 0, 2, 6, 7, /**/ 0, 1, 3, 4, 5, 2, 6, 7, // + 2, 3, 4, 5, 0, 1, 6, 7, /**/ 0, 2, 3, 4, 5, 1, 6, 7, // + 1, 2, 3, 4, 5, 0, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, // + 6, 0, 1, 2, 3, 4, 5, 7, /**/ 0, 6, 1, 2, 3, 4, 5, 7, // + 1, 6, 0, 2, 3, 4, 5, 7, /**/ 0, 1, 6, 2, 3, 4, 5, 7, // + 2, 6, 0, 1, 3, 4, 5, 7, /**/ 0, 2, 6, 1, 3, 4, 5, 7, // + 1, 2, 6, 0, 3, 4, 5, 7, /**/ 0, 1, 2, 6, 3, 4, 5, 7, // + 3, 6, 0, 1, 2, 4, 5, 7, /**/ 0, 3, 6, 1, 2, 4, 5, 7, // + 1, 3, 6, 0, 2, 4, 5, 7, /**/ 0, 1, 3, 6, 2, 4, 5, 7, // + 2, 3, 6, 0, 1, 4, 5, 7, /**/ 0, 2, 3, 6, 1, 4, 5, 7, // + 1, 2, 3, 6, 0, 4, 5, 7, /**/ 0, 1, 2, 3, 6, 4, 5, 7, // + 4, 6, 0, 1, 2, 3, 5, 7, /**/ 0, 4, 6, 1, 2, 3, 5, 7, // + 1, 4, 6, 0, 2, 3, 5, 7, /**/ 0, 1, 4, 6, 2, 3, 5, 7, // + 2, 4, 6, 0, 1, 3, 5, 7, /**/ 0, 2, 4, 6, 1, 3, 5, 7, // + 1, 2, 4, 6, 0, 3, 5, 7, /**/ 0, 1, 2, 4, 6, 3, 5, 7, // + 3, 4, 6, 0, 1, 2, 5, 7, /**/ 0, 3, 4, 6, 1, 2, 5, 7, // + 1, 3, 4, 6, 0, 2, 5, 7, /**/ 0, 1, 3, 4, 6, 2, 5, 7, // + 2, 3, 4, 6, 0, 1, 5, 7, /**/ 0, 2, 3, 4, 6, 1, 5, 7, // + 1, 2, 3, 4, 6, 0, 5, 7, /**/ 0, 1, 2, 3, 4, 6, 5, 7, // + 5, 6, 0, 1, 2, 3, 4, 7, /**/ 0, 5, 6, 1, 2, 3, 4, 7, // + 1, 5, 6, 0, 2, 3, 4, 7, /**/ 0, 1, 5, 6, 2, 3, 4, 7, // + 2, 5, 6, 0, 1, 3, 
4, 7, /**/ 0, 2, 5, 6, 1, 3, 4, 7, // + 1, 2, 5, 6, 0, 3, 4, 7, /**/ 0, 1, 2, 5, 6, 3, 4, 7, // + 3, 5, 6, 0, 1, 2, 4, 7, /**/ 0, 3, 5, 6, 1, 2, 4, 7, // + 1, 3, 5, 6, 0, 2, 4, 7, /**/ 0, 1, 3, 5, 6, 2, 4, 7, // + 2, 3, 5, 6, 0, 1, 4, 7, /**/ 0, 2, 3, 5, 6, 1, 4, 7, // + 1, 2, 3, 5, 6, 0, 4, 7, /**/ 0, 1, 2, 3, 5, 6, 4, 7, // + 4, 5, 6, 0, 1, 2, 3, 7, /**/ 0, 4, 5, 6, 1, 2, 3, 7, // + 1, 4, 5, 6, 0, 2, 3, 7, /**/ 0, 1, 4, 5, 6, 2, 3, 7, // + 2, 4, 5, 6, 0, 1, 3, 7, /**/ 0, 2, 4, 5, 6, 1, 3, 7, // + 1, 2, 4, 5, 6, 0, 3, 7, /**/ 0, 1, 2, 4, 5, 6, 3, 7, // + 3, 4, 5, 6, 0, 1, 2, 7, /**/ 0, 3, 4, 5, 6, 1, 2, 7, // + 1, 3, 4, 5, 6, 0, 2, 7, /**/ 0, 1, 3, 4, 5, 6, 2, 7, // + 2, 3, 4, 5, 6, 0, 1, 7, /**/ 0, 2, 3, 4, 5, 6, 1, 7, // + 1, 2, 3, 4, 5, 6, 0, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, // + 7, 0, 1, 2, 3, 4, 5, 6, /**/ 0, 7, 1, 2, 3, 4, 5, 6, // + 1, 7, 0, 2, 3, 4, 5, 6, /**/ 0, 1, 7, 2, 3, 4, 5, 6, // + 2, 7, 0, 1, 3, 4, 5, 6, /**/ 0, 2, 7, 1, 3, 4, 5, 6, // + 1, 2, 7, 0, 3, 4, 5, 6, /**/ 0, 1, 2, 7, 3, 4, 5, 6, // + 3, 7, 0, 1, 2, 4, 5, 6, /**/ 0, 3, 7, 1, 2, 4, 5, 6, // + 1, 3, 7, 0, 2, 4, 5, 6, /**/ 0, 1, 3, 7, 2, 4, 5, 6, // + 2, 3, 7, 0, 1, 4, 5, 6, /**/ 0, 2, 3, 7, 1, 4, 5, 6, // + 1, 2, 3, 7, 0, 4, 5, 6, /**/ 0, 1, 2, 3, 7, 4, 5, 6, // + 4, 7, 0, 1, 2, 3, 5, 6, /**/ 0, 4, 7, 1, 2, 3, 5, 6, // + 1, 4, 7, 0, 2, 3, 5, 6, /**/ 0, 1, 4, 7, 2, 3, 5, 6, // + 2, 4, 7, 0, 1, 3, 5, 6, /**/ 0, 2, 4, 7, 1, 3, 5, 6, // + 1, 2, 4, 7, 0, 3, 5, 6, /**/ 0, 1, 2, 4, 7, 3, 5, 6, // + 3, 4, 7, 0, 1, 2, 5, 6, /**/ 0, 3, 4, 7, 1, 2, 5, 6, // + 1, 3, 4, 7, 0, 2, 5, 6, /**/ 0, 1, 3, 4, 7, 2, 5, 6, // + 2, 3, 4, 7, 0, 1, 5, 6, /**/ 0, 2, 3, 4, 7, 1, 5, 6, // + 1, 2, 3, 4, 7, 0, 5, 6, /**/ 0, 1, 2, 3, 4, 7, 5, 6, // + 5, 7, 0, 1, 2, 3, 4, 6, /**/ 0, 5, 7, 1, 2, 3, 4, 6, // + 1, 5, 7, 0, 2, 3, 4, 6, /**/ 0, 1, 5, 7, 2, 3, 4, 6, // + 2, 5, 7, 0, 1, 3, 4, 6, /**/ 0, 2, 5, 7, 1, 3, 4, 6, // + 1, 2, 5, 7, 0, 3, 4, 6, /**/ 0, 1, 2, 5, 7, 3, 4, 6, // + 3, 5, 7, 0, 1, 2, 4, 6, /**/ 0, 3, 5, 7, 1, 
2, 4, 6, // + 1, 3, 5, 7, 0, 2, 4, 6, /**/ 0, 1, 3, 5, 7, 2, 4, 6, // + 2, 3, 5, 7, 0, 1, 4, 6, /**/ 0, 2, 3, 5, 7, 1, 4, 6, // + 1, 2, 3, 5, 7, 0, 4, 6, /**/ 0, 1, 2, 3, 5, 7, 4, 6, // + 4, 5, 7, 0, 1, 2, 3, 6, /**/ 0, 4, 5, 7, 1, 2, 3, 6, // + 1, 4, 5, 7, 0, 2, 3, 6, /**/ 0, 1, 4, 5, 7, 2, 3, 6, // + 2, 4, 5, 7, 0, 1, 3, 6, /**/ 0, 2, 4, 5, 7, 1, 3, 6, // + 1, 2, 4, 5, 7, 0, 3, 6, /**/ 0, 1, 2, 4, 5, 7, 3, 6, // + 3, 4, 5, 7, 0, 1, 2, 6, /**/ 0, 3, 4, 5, 7, 1, 2, 6, // + 1, 3, 4, 5, 7, 0, 2, 6, /**/ 0, 1, 3, 4, 5, 7, 2, 6, // + 2, 3, 4, 5, 7, 0, 1, 6, /**/ 0, 2, 3, 4, 5, 7, 1, 6, // + 1, 2, 3, 4, 5, 7, 0, 6, /**/ 0, 1, 2, 3, 4, 5, 7, 6, // + 6, 7, 0, 1, 2, 3, 4, 5, /**/ 0, 6, 7, 1, 2, 3, 4, 5, // + 1, 6, 7, 0, 2, 3, 4, 5, /**/ 0, 1, 6, 7, 2, 3, 4, 5, // + 2, 6, 7, 0, 1, 3, 4, 5, /**/ 0, 2, 6, 7, 1, 3, 4, 5, // + 1, 2, 6, 7, 0, 3, 4, 5, /**/ 0, 1, 2, 6, 7, 3, 4, 5, // + 3, 6, 7, 0, 1, 2, 4, 5, /**/ 0, 3, 6, 7, 1, 2, 4, 5, // + 1, 3, 6, 7, 0, 2, 4, 5, /**/ 0, 1, 3, 6, 7, 2, 4, 5, // + 2, 3, 6, 7, 0, 1, 4, 5, /**/ 0, 2, 3, 6, 7, 1, 4, 5, // + 1, 2, 3, 6, 7, 0, 4, 5, /**/ 0, 1, 2, 3, 6, 7, 4, 5, // + 4, 6, 7, 0, 1, 2, 3, 5, /**/ 0, 4, 6, 7, 1, 2, 3, 5, // + 1, 4, 6, 7, 0, 2, 3, 5, /**/ 0, 1, 4, 6, 7, 2, 3, 5, // + 2, 4, 6, 7, 0, 1, 3, 5, /**/ 0, 2, 4, 6, 7, 1, 3, 5, // + 1, 2, 4, 6, 7, 0, 3, 5, /**/ 0, 1, 2, 4, 6, 7, 3, 5, // + 3, 4, 6, 7, 0, 1, 2, 5, /**/ 0, 3, 4, 6, 7, 1, 2, 5, // + 1, 3, 4, 6, 7, 0, 2, 5, /**/ 0, 1, 3, 4, 6, 7, 2, 5, // + 2, 3, 4, 6, 7, 0, 1, 5, /**/ 0, 2, 3, 4, 6, 7, 1, 5, // + 1, 2, 3, 4, 6, 7, 0, 5, /**/ 0, 1, 2, 3, 4, 6, 7, 5, // + 5, 6, 7, 0, 1, 2, 3, 4, /**/ 0, 5, 6, 7, 1, 2, 3, 4, // + 1, 5, 6, 7, 0, 2, 3, 4, /**/ 0, 1, 5, 6, 7, 2, 3, 4, // + 2, 5, 6, 7, 0, 1, 3, 4, /**/ 0, 2, 5, 6, 7, 1, 3, 4, // + 1, 2, 5, 6, 7, 0, 3, 4, /**/ 0, 1, 2, 5, 6, 7, 3, 4, // + 3, 5, 6, 7, 0, 1, 2, 4, /**/ 0, 3, 5, 6, 7, 1, 2, 4, // + 1, 3, 5, 6, 7, 0, 2, 4, /**/ 0, 1, 3, 5, 6, 7, 2, 4, // + 2, 3, 5, 6, 7, 0, 1, 4, /**/ 0, 2, 3, 5, 6, 7, 1, 4, // + 1, 2, 3, 5, 
6, 7, 0, 4, /**/ 0, 1, 2, 3, 5, 6, 7, 4, // + 4, 5, 6, 7, 0, 1, 2, 3, /**/ 0, 4, 5, 6, 7, 1, 2, 3, // + 1, 4, 5, 6, 7, 0, 2, 3, /**/ 0, 1, 4, 5, 6, 7, 2, 3, // + 2, 4, 5, 6, 7, 0, 1, 3, /**/ 0, 2, 4, 5, 6, 7, 1, 3, // + 1, 2, 4, 5, 6, 7, 0, 3, /**/ 0, 1, 2, 4, 5, 6, 7, 3, // + 3, 4, 5, 6, 7, 0, 1, 2, /**/ 0, 3, 4, 5, 6, 7, 1, 2, // + 1, 3, 4, 5, 6, 7, 0, 2, /**/ 0, 1, 3, 4, 5, 6, 7, 2, // + 2, 3, 4, 5, 6, 7, 0, 1, /**/ 0, 2, 3, 4, 5, 6, 7, 1, // + 1, 2, 3, 4, 5, 6, 7, 0, /**/ 0, 1, 2, 3, 4, 5, 6, 7}; + + for (size_t i = 0; i < Lanes(d); i += 8) { + // Each byte worth of bits is the index of one of 256 8-byte ranges, and its + // population count determines how far to advance the write position. + const size_t bits8 = bits[i / 8]; + const auto indices = Load(d8, table + bits8 * 8); + const auto compressed = TableLookupBytes(LoadU(d8, lanes + i), indices); + StoreU(compressed, d8, pos); + pos += PopCount(bits8); + } + return static_cast<size_t>(pos - unaligned); +} + +template <class V, class M, class D, typename T, HWY_IF_T_SIZE(T, 1)> +HWY_API size_t CompressStore(V v, M mask, D d, T* HWY_RESTRICT unaligned) { + uint8_t bits[HWY_MAX(size_t{8}, MaxLanes(d) / 8)]; + (void)StoreMaskBits(d, mask, bits); + return CompressBitsStore(v, bits, d, unaligned); +} + +template <class V, class M, class D, typename T, HWY_IF_T_SIZE(T, 1)> +HWY_API size_t CompressBlendedStore(V v, M mask, D d, + T* HWY_RESTRICT unaligned) { + HWY_ALIGN T buf[MaxLanes(d)]; + const size_t bytes = CompressStore(v, mask, d, buf); + BlendedStore(Load(d, buf), FirstN(d, bytes), d, unaligned); + return bytes; +} + +// For reasons unknown, HWY_IF_T_SIZE_V is a compile error in SVE. 
+template <class V, class M, typename T = TFromV<V>, HWY_IF_T_SIZE(T, 1)> +HWY_API V Compress(V v, const M mask) { + const DFromV<V> d; + HWY_ALIGN T lanes[MaxLanes(d)]; + (void)CompressStore(v, mask, d, lanes); + return Load(d, lanes); +} + +template <class V, typename T = TFromV<V>, HWY_IF_T_SIZE(T, 1)> +HWY_API V CompressBits(V v, const uint8_t* HWY_RESTRICT bits) { + const DFromV<V> d; + HWY_ALIGN T lanes[MaxLanes(d)]; + (void)CompressBitsStore(v, bits, d, lanes); + return Load(d, lanes); +} + +template <class V, class M, typename T = TFromV<V>, HWY_IF_T_SIZE(T, 1)> +HWY_API V CompressNot(V v, M mask) { + return Compress(v, Not(mask)); +} + +#endif // HWY_NATIVE_COMPRESS8 + +// ------------------------------ Expand + +// "Include guard": skip if native 8/16-bit Expand/LoadExpand are available. +// Note that this generic implementation assumes <= 128 bit fixed vectors; +// the SVE and RVV targets provide their own native implementations. +#if (defined(HWY_NATIVE_EXPAND) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE +#ifdef HWY_NATIVE_EXPAND +#undef HWY_NATIVE_EXPAND +#else +#define HWY_NATIVE_EXPAND +#endif + +namespace detail { + +#if HWY_IDE +template <class M> +HWY_INLINE uint64_t BitsFromMask(M /* mask */) { + return 0; +} +#endif // HWY_IDE + +template <size_t N> +HWY_INLINE Vec128<uint8_t, N> IndicesForExpandFromBits(uint64_t mask_bits) { + static_assert(N <= 8, "Should only be called for half-vectors"); + const Simd<uint8_t, N, 0> du8; + HWY_DASSERT(mask_bits < 0x100); + alignas(16) static constexpr uint8_t table[2048] = { + // PrintExpand8x8Tables + 128, 128, 128, 128, 128, 128, 128, 128, // + 0, 128, 128, 128, 128, 128, 128, 128, // + 128, 0, 128, 128, 128, 128, 128, 128, // + 0, 1, 128, 128, 128, 128, 128, 128, // + 128, 128, 0, 128, 128, 128, 128, 128, // + 0, 128, 1, 128, 128, 128, 128, 128, // + 128, 0, 1, 128, 128, 128, 128, 128, // + 0, 1, 2, 128, 128, 128, 128, 128, // + 128, 128, 128, 0, 128, 128, 128, 128, // + 0, 128, 128, 1, 128, 128, 128, 128, 
// + 128, 0, 128, 1, 128, 128, 128, 128, // + 0, 1, 128, 2, 128, 128, 128, 128, // + 128, 128, 0, 1, 128, 128, 128, 128, // + 0, 128, 1, 2, 128, 128, 128, 128, // + 128, 0, 1, 2, 128, 128, 128, 128, // + 0, 1, 2, 3, 128, 128, 128, 128, // + 128, 128, 128, 128, 0, 128, 128, 128, // + 0, 128, 128, 128, 1, 128, 128, 128, // + 128, 0, 128, 128, 1, 128, 128, 128, // + 0, 1, 128, 128, 2, 128, 128, 128, // + 128, 128, 0, 128, 1, 128, 128, 128, // + 0, 128, 1, 128, 2, 128, 128, 128, // + 128, 0, 1, 128, 2, 128, 128, 128, // + 0, 1, 2, 128, 3, 128, 128, 128, // + 128, 128, 128, 0, 1, 128, 128, 128, // + 0, 128, 128, 1, 2, 128, 128, 128, // + 128, 0, 128, 1, 2, 128, 128, 128, // + 0, 1, 128, 2, 3, 128, 128, 128, // + 128, 128, 0, 1, 2, 128, 128, 128, // + 0, 128, 1, 2, 3, 128, 128, 128, // + 128, 0, 1, 2, 3, 128, 128, 128, // + 0, 1, 2, 3, 4, 128, 128, 128, // + 128, 128, 128, 128, 128, 0, 128, 128, // + 0, 128, 128, 128, 128, 1, 128, 128, // + 128, 0, 128, 128, 128, 1, 128, 128, // + 0, 1, 128, 128, 128, 2, 128, 128, // + 128, 128, 0, 128, 128, 1, 128, 128, // + 0, 128, 1, 128, 128, 2, 128, 128, // + 128, 0, 1, 128, 128, 2, 128, 128, // + 0, 1, 2, 128, 128, 3, 128, 128, // + 128, 128, 128, 0, 128, 1, 128, 128, // + 0, 128, 128, 1, 128, 2, 128, 128, // + 128, 0, 128, 1, 128, 2, 128, 128, // + 0, 1, 128, 2, 128, 3, 128, 128, // + 128, 128, 0, 1, 128, 2, 128, 128, // + 0, 128, 1, 2, 128, 3, 128, 128, // + 128, 0, 1, 2, 128, 3, 128, 128, // + 0, 1, 2, 3, 128, 4, 128, 128, // + 128, 128, 128, 128, 0, 1, 128, 128, // + 0, 128, 128, 128, 1, 2, 128, 128, // + 128, 0, 128, 128, 1, 2, 128, 128, // + 0, 1, 128, 128, 2, 3, 128, 128, // + 128, 128, 0, 128, 1, 2, 128, 128, // + 0, 128, 1, 128, 2, 3, 128, 128, // + 128, 0, 1, 128, 2, 3, 128, 128, // + 0, 1, 2, 128, 3, 4, 128, 128, // + 128, 128, 128, 0, 1, 2, 128, 128, // + 0, 128, 128, 1, 2, 3, 128, 128, // + 128, 0, 128, 1, 2, 3, 128, 128, // + 0, 1, 128, 2, 3, 4, 128, 128, // + 128, 128, 0, 1, 2, 3, 128, 128, // + 0, 128, 1, 2, 3, 4, 
128, 128, // + 128, 0, 1, 2, 3, 4, 128, 128, // + 0, 1, 2, 3, 4, 5, 128, 128, // + 128, 128, 128, 128, 128, 128, 0, 128, // + 0, 128, 128, 128, 128, 128, 1, 128, // + 128, 0, 128, 128, 128, 128, 1, 128, // + 0, 1, 128, 128, 128, 128, 2, 128, // + 128, 128, 0, 128, 128, 128, 1, 128, // + 0, 128, 1, 128, 128, 128, 2, 128, // + 128, 0, 1, 128, 128, 128, 2, 128, // + 0, 1, 2, 128, 128, 128, 3, 128, // + 128, 128, 128, 0, 128, 128, 1, 128, // + 0, 128, 128, 1, 128, 128, 2, 128, // + 128, 0, 128, 1, 128, 128, 2, 128, // + 0, 1, 128, 2, 128, 128, 3, 128, // + 128, 128, 0, 1, 128, 128, 2, 128, // + 0, 128, 1, 2, 128, 128, 3, 128, // + 128, 0, 1, 2, 128, 128, 3, 128, // + 0, 1, 2, 3, 128, 128, 4, 128, // + 128, 128, 128, 128, 0, 128, 1, 128, // + 0, 128, 128, 128, 1, 128, 2, 128, // + 128, 0, 128, 128, 1, 128, 2, 128, // + 0, 1, 128, 128, 2, 128, 3, 128, // + 128, 128, 0, 128, 1, 128, 2, 128, // + 0, 128, 1, 128, 2, 128, 3, 128, // + 128, 0, 1, 128, 2, 128, 3, 128, // + 0, 1, 2, 128, 3, 128, 4, 128, // + 128, 128, 128, 0, 1, 128, 2, 128, // + 0, 128, 128, 1, 2, 128, 3, 128, // + 128, 0, 128, 1, 2, 128, 3, 128, // + 0, 1, 128, 2, 3, 128, 4, 128, // + 128, 128, 0, 1, 2, 128, 3, 128, // + 0, 128, 1, 2, 3, 128, 4, 128, // + 128, 0, 1, 2, 3, 128, 4, 128, // + 0, 1, 2, 3, 4, 128, 5, 128, // + 128, 128, 128, 128, 128, 0, 1, 128, // + 0, 128, 128, 128, 128, 1, 2, 128, // + 128, 0, 128, 128, 128, 1, 2, 128, // + 0, 1, 128, 128, 128, 2, 3, 128, // + 128, 128, 0, 128, 128, 1, 2, 128, // + 0, 128, 1, 128, 128, 2, 3, 128, // + 128, 0, 1, 128, 128, 2, 3, 128, // + 0, 1, 2, 128, 128, 3, 4, 128, // + 128, 128, 128, 0, 128, 1, 2, 128, // + 0, 128, 128, 1, 128, 2, 3, 128, // + 128, 0, 128, 1, 128, 2, 3, 128, // + 0, 1, 128, 2, 128, 3, 4, 128, // + 128, 128, 0, 1, 128, 2, 3, 128, // + 0, 128, 1, 2, 128, 3, 4, 128, // + 128, 0, 1, 2, 128, 3, 4, 128, // + 0, 1, 2, 3, 128, 4, 5, 128, // + 128, 128, 128, 128, 0, 1, 2, 128, // + 0, 128, 128, 128, 1, 2, 3, 128, // + 128, 0, 128, 128, 1, 2, 3, 128, 
// + 0, 1, 128, 128, 2, 3, 4, 128, // + 128, 128, 0, 128, 1, 2, 3, 128, // + 0, 128, 1, 128, 2, 3, 4, 128, // + 128, 0, 1, 128, 2, 3, 4, 128, // + 0, 1, 2, 128, 3, 4, 5, 128, // + 128, 128, 128, 0, 1, 2, 3, 128, // + 0, 128, 128, 1, 2, 3, 4, 128, // + 128, 0, 128, 1, 2, 3, 4, 128, // + 0, 1, 128, 2, 3, 4, 5, 128, // + 128, 128, 0, 1, 2, 3, 4, 128, // + 0, 128, 1, 2, 3, 4, 5, 128, // + 128, 0, 1, 2, 3, 4, 5, 128, // + 0, 1, 2, 3, 4, 5, 6, 128, // + 128, 128, 128, 128, 128, 128, 128, 0, // + 0, 128, 128, 128, 128, 128, 128, 1, // + 128, 0, 128, 128, 128, 128, 128, 1, // + 0, 1, 128, 128, 128, 128, 128, 2, // + 128, 128, 0, 128, 128, 128, 128, 1, // + 0, 128, 1, 128, 128, 128, 128, 2, // + 128, 0, 1, 128, 128, 128, 128, 2, // + 0, 1, 2, 128, 128, 128, 128, 3, // + 128, 128, 128, 0, 128, 128, 128, 1, // + 0, 128, 128, 1, 128, 128, 128, 2, // + 128, 0, 128, 1, 128, 128, 128, 2, // + 0, 1, 128, 2, 128, 128, 128, 3, // + 128, 128, 0, 1, 128, 128, 128, 2, // + 0, 128, 1, 2, 128, 128, 128, 3, // + 128, 0, 1, 2, 128, 128, 128, 3, // + 0, 1, 2, 3, 128, 128, 128, 4, // + 128, 128, 128, 128, 0, 128, 128, 1, // + 0, 128, 128, 128, 1, 128, 128, 2, // + 128, 0, 128, 128, 1, 128, 128, 2, // + 0, 1, 128, 128, 2, 128, 128, 3, // + 128, 128, 0, 128, 1, 128, 128, 2, // + 0, 128, 1, 128, 2, 128, 128, 3, // + 128, 0, 1, 128, 2, 128, 128, 3, // + 0, 1, 2, 128, 3, 128, 128, 4, // + 128, 128, 128, 0, 1, 128, 128, 2, // + 0, 128, 128, 1, 2, 128, 128, 3, // + 128, 0, 128, 1, 2, 128, 128, 3, // + 0, 1, 128, 2, 3, 128, 128, 4, // + 128, 128, 0, 1, 2, 128, 128, 3, // + 0, 128, 1, 2, 3, 128, 128, 4, // + 128, 0, 1, 2, 3, 128, 128, 4, // + 0, 1, 2, 3, 4, 128, 128, 5, // + 128, 128, 128, 128, 128, 0, 128, 1, // + 0, 128, 128, 128, 128, 1, 128, 2, // + 128, 0, 128, 128, 128, 1, 128, 2, // + 0, 1, 128, 128, 128, 2, 128, 3, // + 128, 128, 0, 128, 128, 1, 128, 2, // + 0, 128, 1, 128, 128, 2, 128, 3, // + 128, 0, 1, 128, 128, 2, 128, 3, // + 0, 1, 2, 128, 128, 3, 128, 4, // + 128, 128, 128, 0, 128, 1, 
128, 2, // + 0, 128, 128, 1, 128, 2, 128, 3, // + 128, 0, 128, 1, 128, 2, 128, 3, // + 0, 1, 128, 2, 128, 3, 128, 4, // + 128, 128, 0, 1, 128, 2, 128, 3, // + 0, 128, 1, 2, 128, 3, 128, 4, // + 128, 0, 1, 2, 128, 3, 128, 4, // + 0, 1, 2, 3, 128, 4, 128, 5, // + 128, 128, 128, 128, 0, 1, 128, 2, // + 0, 128, 128, 128, 1, 2, 128, 3, // + 128, 0, 128, 128, 1, 2, 128, 3, // + 0, 1, 128, 128, 2, 3, 128, 4, // + 128, 128, 0, 128, 1, 2, 128, 3, // + 0, 128, 1, 128, 2, 3, 128, 4, // + 128, 0, 1, 128, 2, 3, 128, 4, // + 0, 1, 2, 128, 3, 4, 128, 5, // + 128, 128, 128, 0, 1, 2, 128, 3, // + 0, 128, 128, 1, 2, 3, 128, 4, // + 128, 0, 128, 1, 2, 3, 128, 4, // + 0, 1, 128, 2, 3, 4, 128, 5, // + 128, 128, 0, 1, 2, 3, 128, 4, // + 0, 128, 1, 2, 3, 4, 128, 5, // + 128, 0, 1, 2, 3, 4, 128, 5, // + 0, 1, 2, 3, 4, 5, 128, 6, // + 128, 128, 128, 128, 128, 128, 0, 1, // + 0, 128, 128, 128, 128, 128, 1, 2, // + 128, 0, 128, 128, 128, 128, 1, 2, // + 0, 1, 128, 128, 128, 128, 2, 3, // + 128, 128, 0, 128, 128, 128, 1, 2, // + 0, 128, 1, 128, 128, 128, 2, 3, // + 128, 0, 1, 128, 128, 128, 2, 3, // + 0, 1, 2, 128, 128, 128, 3, 4, // + 128, 128, 128, 0, 128, 128, 1, 2, // + 0, 128, 128, 1, 128, 128, 2, 3, // + 128, 0, 128, 1, 128, 128, 2, 3, // + 0, 1, 128, 2, 128, 128, 3, 4, // + 128, 128, 0, 1, 128, 128, 2, 3, // + 0, 128, 1, 2, 128, 128, 3, 4, // + 128, 0, 1, 2, 128, 128, 3, 4, // + 0, 1, 2, 3, 128, 128, 4, 5, // + 128, 128, 128, 128, 0, 128, 1, 2, // + 0, 128, 128, 128, 1, 128, 2, 3, // + 128, 0, 128, 128, 1, 128, 2, 3, // + 0, 1, 128, 128, 2, 128, 3, 4, // + 128, 128, 0, 128, 1, 128, 2, 3, // + 0, 128, 1, 128, 2, 128, 3, 4, // + 128, 0, 1, 128, 2, 128, 3, 4, // + 0, 1, 2, 128, 3, 128, 4, 5, // + 128, 128, 128, 0, 1, 128, 2, 3, // + 0, 128, 128, 1, 2, 128, 3, 4, // + 128, 0, 128, 1, 2, 128, 3, 4, // + 0, 1, 128, 2, 3, 128, 4, 5, // + 128, 128, 0, 1, 2, 128, 3, 4, // + 0, 128, 1, 2, 3, 128, 4, 5, // + 128, 0, 1, 2, 3, 128, 4, 5, // + 0, 1, 2, 3, 4, 128, 5, 6, // + 128, 128, 128, 128, 128, 
0, 1, 2, // + 0, 128, 128, 128, 128, 1, 2, 3, // + 128, 0, 128, 128, 128, 1, 2, 3, // + 0, 1, 128, 128, 128, 2, 3, 4, // + 128, 128, 0, 128, 128, 1, 2, 3, // + 0, 128, 1, 128, 128, 2, 3, 4, // + 128, 0, 1, 128, 128, 2, 3, 4, // + 0, 1, 2, 128, 128, 3, 4, 5, // + 128, 128, 128, 0, 128, 1, 2, 3, // + 0, 128, 128, 1, 128, 2, 3, 4, // + 128, 0, 128, 1, 128, 2, 3, 4, // + 0, 1, 128, 2, 128, 3, 4, 5, // + 128, 128, 0, 1, 128, 2, 3, 4, // + 0, 128, 1, 2, 128, 3, 4, 5, // + 128, 0, 1, 2, 128, 3, 4, 5, // + 0, 1, 2, 3, 128, 4, 5, 6, // + 128, 128, 128, 128, 0, 1, 2, 3, // + 0, 128, 128, 128, 1, 2, 3, 4, // + 128, 0, 128, 128, 1, 2, 3, 4, // + 0, 1, 128, 128, 2, 3, 4, 5, // + 128, 128, 0, 128, 1, 2, 3, 4, // + 0, 128, 1, 128, 2, 3, 4, 5, // + 128, 0, 1, 128, 2, 3, 4, 5, // + 0, 1, 2, 128, 3, 4, 5, 6, // + 128, 128, 128, 0, 1, 2, 3, 4, // + 0, 128, 128, 1, 2, 3, 4, 5, // + 128, 0, 128, 1, 2, 3, 4, 5, // + 0, 1, 128, 2, 3, 4, 5, 6, // + 128, 128, 0, 1, 2, 3, 4, 5, // + 0, 128, 1, 2, 3, 4, 5, 6, // + 128, 0, 1, 2, 3, 4, 5, 6, // + 0, 1, 2, 3, 4, 5, 6, 7}; + return LoadU(du8, table + mask_bits * 8); +} + +} // namespace detail + +// Half vector of bytes: one table lookup +template <typename T, size_t N, HWY_IF_T_SIZE(T, 1), HWY_IF_V_SIZE_LE(T, N, 8)> +HWY_API Vec128<T, N> Expand(Vec128<T, N> v, Mask128<T, N> mask) { + const DFromV<decltype(v)> d; + + const uint64_t mask_bits = detail::BitsFromMask(mask); + const Vec128<uint8_t, N> indices = + detail::IndicesForExpandFromBits<N>(mask_bits); + return BitCast(d, TableLookupBytesOr0(v, indices)); +} + +// Full vector of bytes: two table lookups +template <typename T, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec128<T> Expand(Vec128<T> v, Mask128<T> mask) { + const Full128<T> d; + const RebindToUnsigned<decltype(d)> du; + const Half<decltype(du)> duh; + const Vec128<uint8_t> vu = BitCast(du, v); + + const uint64_t mask_bits = detail::BitsFromMask(mask); + const uint64_t maskL = mask_bits & 0xFF; + const uint64_t maskH = mask_bits >> 8; + + // We 
want to skip past the v bytes already consumed by idxL. There is no + // instruction for shift-reg by variable bytes. Storing v itself would work + // but would involve a store-load forwarding stall. We instead shuffle using + // loaded indices. multishift_epi64_epi8 would also help, but if we have that, + // we probably also have native 8-bit Expand. + alignas(16) static constexpr uint8_t iota[32] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}; + const VFromD<decltype(du)> shift = LoadU(du, iota + PopCount(maskL)); + const VFromD<decltype(duh)> vL = LowerHalf(duh, vu); + const VFromD<decltype(duh)> vH = + LowerHalf(duh, TableLookupBytesOr0(vu, shift)); + + const VFromD<decltype(duh)> idxL = detail::IndicesForExpandFromBits<8>(maskL); + const VFromD<decltype(duh)> idxH = detail::IndicesForExpandFromBits<8>(maskH); + + const VFromD<decltype(duh)> expandL = TableLookupBytesOr0(vL, idxL); + const VFromD<decltype(duh)> expandH = TableLookupBytesOr0(vH, idxH); + return BitCast(d, Combine(du, expandH, expandL)); +} + +template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)> +HWY_API Vec128<T, N> Expand(Vec128<T, N> v, Mask128<T, N> mask) { + const DFromV<decltype(v)> d; + const RebindToUnsigned<decltype(d)> du; + + const Rebind<uint8_t, decltype(d)> du8; + const uint64_t mask_bits = detail::BitsFromMask(mask); + + // Storing as 8-bit reduces table size from 4 KiB to 2 KiB. We cannot apply + // the nibble trick used below because not all indices fit within one lane. 
+ alignas(16) static constexpr uint8_t table[2048] = { + // PrintExpand16x8ByteTables + 128, 128, 128, 128, 128, 128, 128, 128, // + 0, 128, 128, 128, 128, 128, 128, 128, // + 128, 0, 128, 128, 128, 128, 128, 128, // + 0, 2, 128, 128, 128, 128, 128, 128, // + 128, 128, 0, 128, 128, 128, 128, 128, // + 0, 128, 2, 128, 128, 128, 128, 128, // + 128, 0, 2, 128, 128, 128, 128, 128, // + 0, 2, 4, 128, 128, 128, 128, 128, // + 128, 128, 128, 0, 128, 128, 128, 128, // + 0, 128, 128, 2, 128, 128, 128, 128, // + 128, 0, 128, 2, 128, 128, 128, 128, // + 0, 2, 128, 4, 128, 128, 128, 128, // + 128, 128, 0, 2, 128, 128, 128, 128, // + 0, 128, 2, 4, 128, 128, 128, 128, // + 128, 0, 2, 4, 128, 128, 128, 128, // + 0, 2, 4, 6, 128, 128, 128, 128, // + 128, 128, 128, 128, 0, 128, 128, 128, // + 0, 128, 128, 128, 2, 128, 128, 128, // + 128, 0, 128, 128, 2, 128, 128, 128, // + 0, 2, 128, 128, 4, 128, 128, 128, // + 128, 128, 0, 128, 2, 128, 128, 128, // + 0, 128, 2, 128, 4, 128, 128, 128, // + 128, 0, 2, 128, 4, 128, 128, 128, // + 0, 2, 4, 128, 6, 128, 128, 128, // + 128, 128, 128, 0, 2, 128, 128, 128, // + 0, 128, 128, 2, 4, 128, 128, 128, // + 128, 0, 128, 2, 4, 128, 128, 128, // + 0, 2, 128, 4, 6, 128, 128, 128, // + 128, 128, 0, 2, 4, 128, 128, 128, // + 0, 128, 2, 4, 6, 128, 128, 128, // + 128, 0, 2, 4, 6, 128, 128, 128, // + 0, 2, 4, 6, 8, 128, 128, 128, // + 128, 128, 128, 128, 128, 0, 128, 128, // + 0, 128, 128, 128, 128, 2, 128, 128, // + 128, 0, 128, 128, 128, 2, 128, 128, // + 0, 2, 128, 128, 128, 4, 128, 128, // + 128, 128, 0, 128, 128, 2, 128, 128, // + 0, 128, 2, 128, 128, 4, 128, 128, // + 128, 0, 2, 128, 128, 4, 128, 128, // + 0, 2, 4, 128, 128, 6, 128, 128, // + 128, 128, 128, 0, 128, 2, 128, 128, // + 0, 128, 128, 2, 128, 4, 128, 128, // + 128, 0, 128, 2, 128, 4, 128, 128, // + 0, 2, 128, 4, 128, 6, 128, 128, // + 128, 128, 0, 2, 128, 4, 128, 128, // + 0, 128, 2, 4, 128, 6, 128, 128, // + 128, 0, 2, 4, 128, 6, 128, 128, // + 0, 2, 4, 6, 128, 8, 128, 128, // + 128, 
128, 128, 128, 0, 2, 128, 128, // + 0, 128, 128, 128, 2, 4, 128, 128, // + 128, 0, 128, 128, 2, 4, 128, 128, // + 0, 2, 128, 128, 4, 6, 128, 128, // + 128, 128, 0, 128, 2, 4, 128, 128, // + 0, 128, 2, 128, 4, 6, 128, 128, // + 128, 0, 2, 128, 4, 6, 128, 128, // + 0, 2, 4, 128, 6, 8, 128, 128, // + 128, 128, 128, 0, 2, 4, 128, 128, // + 0, 128, 128, 2, 4, 6, 128, 128, // + 128, 0, 128, 2, 4, 6, 128, 128, // + 0, 2, 128, 4, 6, 8, 128, 128, // + 128, 128, 0, 2, 4, 6, 128, 128, // + 0, 128, 2, 4, 6, 8, 128, 128, // + 128, 0, 2, 4, 6, 8, 128, 128, // + 0, 2, 4, 6, 8, 10, 128, 128, // + 128, 128, 128, 128, 128, 128, 0, 128, // + 0, 128, 128, 128, 128, 128, 2, 128, // + 128, 0, 128, 128, 128, 128, 2, 128, // + 0, 2, 128, 128, 128, 128, 4, 128, // + 128, 128, 0, 128, 128, 128, 2, 128, // + 0, 128, 2, 128, 128, 128, 4, 128, // + 128, 0, 2, 128, 128, 128, 4, 128, // + 0, 2, 4, 128, 128, 128, 6, 128, // + 128, 128, 128, 0, 128, 128, 2, 128, // + 0, 128, 128, 2, 128, 128, 4, 128, // + 128, 0, 128, 2, 128, 128, 4, 128, // + 0, 2, 128, 4, 128, 128, 6, 128, // + 128, 128, 0, 2, 128, 128, 4, 128, // + 0, 128, 2, 4, 128, 128, 6, 128, // + 128, 0, 2, 4, 128, 128, 6, 128, // + 0, 2, 4, 6, 128, 128, 8, 128, // + 128, 128, 128, 128, 0, 128, 2, 128, // + 0, 128, 128, 128, 2, 128, 4, 128, // + 128, 0, 128, 128, 2, 128, 4, 128, // + 0, 2, 128, 128, 4, 128, 6, 128, // + 128, 128, 0, 128, 2, 128, 4, 128, // + 0, 128, 2, 128, 4, 128, 6, 128, // + 128, 0, 2, 128, 4, 128, 6, 128, // + 0, 2, 4, 128, 6, 128, 8, 128, // + 128, 128, 128, 0, 2, 128, 4, 128, // + 0, 128, 128, 2, 4, 128, 6, 128, // + 128, 0, 128, 2, 4, 128, 6, 128, // + 0, 2, 128, 4, 6, 128, 8, 128, // + 128, 128, 0, 2, 4, 128, 6, 128, // + 0, 128, 2, 4, 6, 128, 8, 128, // + 128, 0, 2, 4, 6, 128, 8, 128, // + 0, 2, 4, 6, 8, 128, 10, 128, // + 128, 128, 128, 128, 128, 0, 2, 128, // + 0, 128, 128, 128, 128, 2, 4, 128, // + 128, 0, 128, 128, 128, 2, 4, 128, // + 0, 2, 128, 128, 128, 4, 6, 128, // + 128, 128, 0, 128, 128, 2, 4, 128, // + 
0, 128, 2, 128, 128, 4, 6, 128, // + 128, 0, 2, 128, 128, 4, 6, 128, // + 0, 2, 4, 128, 128, 6, 8, 128, // + 128, 128, 128, 0, 128, 2, 4, 128, // + 0, 128, 128, 2, 128, 4, 6, 128, // + 128, 0, 128, 2, 128, 4, 6, 128, // + 0, 2, 128, 4, 128, 6, 8, 128, // + 128, 128, 0, 2, 128, 4, 6, 128, // + 0, 128, 2, 4, 128, 6, 8, 128, // + 128, 0, 2, 4, 128, 6, 8, 128, // + 0, 2, 4, 6, 128, 8, 10, 128, // + 128, 128, 128, 128, 0, 2, 4, 128, // + 0, 128, 128, 128, 2, 4, 6, 128, // + 128, 0, 128, 128, 2, 4, 6, 128, // + 0, 2, 128, 128, 4, 6, 8, 128, // + 128, 128, 0, 128, 2, 4, 6, 128, // + 0, 128, 2, 128, 4, 6, 8, 128, // + 128, 0, 2, 128, 4, 6, 8, 128, // + 0, 2, 4, 128, 6, 8, 10, 128, // + 128, 128, 128, 0, 2, 4, 6, 128, // + 0, 128, 128, 2, 4, 6, 8, 128, // + 128, 0, 128, 2, 4, 6, 8, 128, // + 0, 2, 128, 4, 6, 8, 10, 128, // + 128, 128, 0, 2, 4, 6, 8, 128, // + 0, 128, 2, 4, 6, 8, 10, 128, // + 128, 0, 2, 4, 6, 8, 10, 128, // + 0, 2, 4, 6, 8, 10, 12, 128, // + 128, 128, 128, 128, 128, 128, 128, 0, // + 0, 128, 128, 128, 128, 128, 128, 2, // + 128, 0, 128, 128, 128, 128, 128, 2, // + 0, 2, 128, 128, 128, 128, 128, 4, // + 128, 128, 0, 128, 128, 128, 128, 2, // + 0, 128, 2, 128, 128, 128, 128, 4, // + 128, 0, 2, 128, 128, 128, 128, 4, // + 0, 2, 4, 128, 128, 128, 128, 6, // + 128, 128, 128, 0, 128, 128, 128, 2, // + 0, 128, 128, 2, 128, 128, 128, 4, // + 128, 0, 128, 2, 128, 128, 128, 4, // + 0, 2, 128, 4, 128, 128, 128, 6, // + 128, 128, 0, 2, 128, 128, 128, 4, // + 0, 128, 2, 4, 128, 128, 128, 6, // + 128, 0, 2, 4, 128, 128, 128, 6, // + 0, 2, 4, 6, 128, 128, 128, 8, // + 128, 128, 128, 128, 0, 128, 128, 2, // + 0, 128, 128, 128, 2, 128, 128, 4, // + 128, 0, 128, 128, 2, 128, 128, 4, // + 0, 2, 128, 128, 4, 128, 128, 6, // + 128, 128, 0, 128, 2, 128, 128, 4, // + 0, 128, 2, 128, 4, 128, 128, 6, // + 128, 0, 2, 128, 4, 128, 128, 6, // + 0, 2, 4, 128, 6, 128, 128, 8, // + 128, 128, 128, 0, 2, 128, 128, 4, // + 0, 128, 128, 2, 4, 128, 128, 6, // + 128, 0, 128, 2, 4, 128, 128, 6, 
// + 0, 2, 128, 4, 6, 128, 128, 8, // + 128, 128, 0, 2, 4, 128, 128, 6, // + 0, 128, 2, 4, 6, 128, 128, 8, // + 128, 0, 2, 4, 6, 128, 128, 8, // + 0, 2, 4, 6, 8, 128, 128, 10, // + 128, 128, 128, 128, 128, 0, 128, 2, // + 0, 128, 128, 128, 128, 2, 128, 4, // + 128, 0, 128, 128, 128, 2, 128, 4, // + 0, 2, 128, 128, 128, 4, 128, 6, // + 128, 128, 0, 128, 128, 2, 128, 4, // + 0, 128, 2, 128, 128, 4, 128, 6, // + 128, 0, 2, 128, 128, 4, 128, 6, // + 0, 2, 4, 128, 128, 6, 128, 8, // + 128, 128, 128, 0, 128, 2, 128, 4, // + 0, 128, 128, 2, 128, 4, 128, 6, // + 128, 0, 128, 2, 128, 4, 128, 6, // + 0, 2, 128, 4, 128, 6, 128, 8, // + 128, 128, 0, 2, 128, 4, 128, 6, // + 0, 128, 2, 4, 128, 6, 128, 8, // + 128, 0, 2, 4, 128, 6, 128, 8, // + 0, 2, 4, 6, 128, 8, 128, 10, // + 128, 128, 128, 128, 0, 2, 128, 4, // + 0, 128, 128, 128, 2, 4, 128, 6, // + 128, 0, 128, 128, 2, 4, 128, 6, // + 0, 2, 128, 128, 4, 6, 128, 8, // + 128, 128, 0, 128, 2, 4, 128, 6, // + 0, 128, 2, 128, 4, 6, 128, 8, // + 128, 0, 2, 128, 4, 6, 128, 8, // + 0, 2, 4, 128, 6, 8, 128, 10, // + 128, 128, 128, 0, 2, 4, 128, 6, // + 0, 128, 128, 2, 4, 6, 128, 8, // + 128, 0, 128, 2, 4, 6, 128, 8, // + 0, 2, 128, 4, 6, 8, 128, 10, // + 128, 128, 0, 2, 4, 6, 128, 8, // + 0, 128, 2, 4, 6, 8, 128, 10, // + 128, 0, 2, 4, 6, 8, 128, 10, // + 0, 2, 4, 6, 8, 10, 128, 12, // + 128, 128, 128, 128, 128, 128, 0, 2, // + 0, 128, 128, 128, 128, 128, 2, 4, // + 128, 0, 128, 128, 128, 128, 2, 4, // + 0, 2, 128, 128, 128, 128, 4, 6, // + 128, 128, 0, 128, 128, 128, 2, 4, // + 0, 128, 2, 128, 128, 128, 4, 6, // + 128, 0, 2, 128, 128, 128, 4, 6, // + 0, 2, 4, 128, 128, 128, 6, 8, // + 128, 128, 128, 0, 128, 128, 2, 4, // + 0, 128, 128, 2, 128, 128, 4, 6, // + 128, 0, 128, 2, 128, 128, 4, 6, // + 0, 2, 128, 4, 128, 128, 6, 8, // + 128, 128, 0, 2, 128, 128, 4, 6, // + 0, 128, 2, 4, 128, 128, 6, 8, // + 128, 0, 2, 4, 128, 128, 6, 8, // + 0, 2, 4, 6, 128, 128, 8, 10, // + 128, 128, 128, 128, 0, 128, 2, 4, // + 0, 128, 128, 128, 2, 128, 
4, 6, // + 128, 0, 128, 128, 2, 128, 4, 6, // + 0, 2, 128, 128, 4, 128, 6, 8, // + 128, 128, 0, 128, 2, 128, 4, 6, // + 0, 128, 2, 128, 4, 128, 6, 8, // + 128, 0, 2, 128, 4, 128, 6, 8, // + 0, 2, 4, 128, 6, 128, 8, 10, // + 128, 128, 128, 0, 2, 128, 4, 6, // + 0, 128, 128, 2, 4, 128, 6, 8, // + 128, 0, 128, 2, 4, 128, 6, 8, // + 0, 2, 128, 4, 6, 128, 8, 10, // + 128, 128, 0, 2, 4, 128, 6, 8, // + 0, 128, 2, 4, 6, 128, 8, 10, // + 128, 0, 2, 4, 6, 128, 8, 10, // + 0, 2, 4, 6, 8, 128, 10, 12, // + 128, 128, 128, 128, 128, 0, 2, 4, // + 0, 128, 128, 128, 128, 2, 4, 6, // + 128, 0, 128, 128, 128, 2, 4, 6, // + 0, 2, 128, 128, 128, 4, 6, 8, // + 128, 128, 0, 128, 128, 2, 4, 6, // + 0, 128, 2, 128, 128, 4, 6, 8, // + 128, 0, 2, 128, 128, 4, 6, 8, // + 0, 2, 4, 128, 128, 6, 8, 10, // + 128, 128, 128, 0, 128, 2, 4, 6, // + 0, 128, 128, 2, 128, 4, 6, 8, // + 128, 0, 128, 2, 128, 4, 6, 8, // + 0, 2, 128, 4, 128, 6, 8, 10, // + 128, 128, 0, 2, 128, 4, 6, 8, // + 0, 128, 2, 4, 128, 6, 8, 10, // + 128, 0, 2, 4, 128, 6, 8, 10, // + 0, 2, 4, 6, 128, 8, 10, 12, // + 128, 128, 128, 128, 0, 2, 4, 6, // + 0, 128, 128, 128, 2, 4, 6, 8, // + 128, 0, 128, 128, 2, 4, 6, 8, // + 0, 2, 128, 128, 4, 6, 8, 10, // + 128, 128, 0, 128, 2, 4, 6, 8, // + 0, 128, 2, 128, 4, 6, 8, 10, // + 128, 0, 2, 128, 4, 6, 8, 10, // + 0, 2, 4, 128, 6, 8, 10, 12, // + 128, 128, 128, 0, 2, 4, 6, 8, // + 0, 128, 128, 2, 4, 6, 8, 10, // + 128, 0, 128, 2, 4, 6, 8, 10, // + 0, 2, 128, 4, 6, 8, 10, 12, // + 128, 128, 0, 2, 4, 6, 8, 10, // + 0, 128, 2, 4, 6, 8, 10, 12, // + 128, 0, 2, 4, 6, 8, 10, 12, // + 0, 2, 4, 6, 8, 10, 12, 14}; + // Extend to double length because InterleaveLower will only use the (valid) + // lower half, and we want N u16. 
+ const Twice<decltype(du8)> du8x2; + const Vec128<uint8_t, 2 * N> indices8 = + ZeroExtendVector(du8x2, Load(du8, table + mask_bits * 8)); + const Vec128<uint16_t, N> indices16 = + BitCast(du, InterleaveLower(du8x2, indices8, indices8)); + // TableLookupBytesOr0 operates on bytes. To convert u16 lane indices to byte + // indices, add 0 to even and 1 to odd byte lanes. + const Vec128<uint16_t, N> byte_indices = Add(indices16, Set(du, 0x0100)); + return BitCast(d, TableLookupBytesOr0(v, byte_indices)); +} + +template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec128<T, N> Expand(Vec128<T, N> v, Mask128<T, N> mask) { + const DFromV<decltype(v)> d; + const RebindToUnsigned<decltype(d)> du; + + const uint64_t mask_bits = detail::BitsFromMask(mask); + + alignas(16) static constexpr uint32_t packed_array[16] = { + // PrintExpand64x4Nibble - same for 32x4. + 0x0000ffff, 0x0000fff0, 0x0000ff0f, 0x0000ff10, 0x0000f0ff, 0x0000f1f0, + 0x0000f10f, 0x0000f210, 0x00000fff, 0x00001ff0, 0x00001f0f, 0x00002f10, + 0x000010ff, 0x000021f0, 0x0000210f, 0x00003210}; + + // For lane i, shift the i-th 4-bit index down to bits [0, 2). + const Vec128<uint32_t, N> packed = Set(du, packed_array[mask_bits]); + alignas(16) static constexpr uint32_t shifts[4] = {0, 4, 8, 12}; + Vec128<uint32_t, N> indices = packed >> Load(du, shifts); + // AVX2 _mm256_permutexvar_epi32 will ignore upper bits, but IndicesFromVec + // checks bounds, so clear the upper bits. + indices = And(indices, Set(du, N - 1)); + const Vec128<uint32_t, N> expand = + TableLookupLanes(BitCast(du, v), IndicesFromVec(du, indices)); + // TableLookupLanes cannot also zero masked-off lanes, so do that now. + return IfThenElseZero(mask, BitCast(d, expand)); +} + +template <typename T, HWY_IF_T_SIZE(T, 8)> +HWY_API Vec128<T> Expand(Vec128<T> v, Mask128<T> mask) { + // Same as Compress, just zero out the mask=false lanes. 
+ return IfThenElseZero(mask, Compress(v, mask)); +} + +// For single-element vectors, this is at least as fast as native. +template <typename T> +HWY_API Vec128<T, 1> Expand(Vec128<T, 1> v, Mask128<T, 1> mask) { + return IfThenElseZero(mask, v); +} + +// ------------------------------ LoadExpand +template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d, + const TFromD<D>* HWY_RESTRICT unaligned) { + return Expand(LoadU(d, unaligned), mask); +} + +#endif // HWY_NATIVE_EXPAND + +// ------------------------------ TwoTablesLookupLanes + +template <class D> +using IndicesFromD = decltype(IndicesFromVec(D(), Zero(RebindToUnsigned<D>()))); + +// RVV/SVE have their own implementations of +// TwoTablesLookupLanes(D d, VFromD<D> a, VFromD<D> b, IndicesFromD<D> idx) +#if HWY_TARGET != HWY_RVV && HWY_TARGET != HWY_SVE && \ + HWY_TARGET != HWY_SVE2 && HWY_TARGET != HWY_SVE_256 && \ + HWY_TARGET != HWY_SVE2_128 +template <class D> +HWY_API VFromD<D> TwoTablesLookupLanes(D /*d*/, VFromD<D> a, VFromD<D> b, + IndicesFromD<D> idx) { + return TwoTablesLookupLanes(a, b, idx); +} +#endif + +// ------------------------------ Reverse2, Reverse4, Reverse8 (8-bit) + +#if (defined(HWY_NATIVE_REVERSE2_8) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE +#ifdef HWY_NATIVE_REVERSE2_8 +#undef HWY_NATIVE_REVERSE2_8 +#else +#define HWY_NATIVE_REVERSE2_8 +#endif + +#undef HWY_PREFER_ROTATE +// Platforms on which RotateRight is likely faster than TableLookupBytes. +// RVV and SVE anyway have their own implementation of this. +#if HWY_TARGET == HWY_SSE2 || HWY_TARGET <= HWY_AVX3 || \ + HWY_TARGET == HWY_WASM || HWY_TARGET == HWY_PPC8 +#define HWY_PREFER_ROTATE 1 +#else +#define HWY_PREFER_ROTATE 0 +#endif + +template <class D, HWY_IF_T_SIZE_D(D, 1)> +HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) { + // Exclude AVX3 because its 16-bit RotateRight is actually 3 instructions. 
+#if HWY_PREFER_ROTATE && HWY_TARGET > HWY_AVX3 + const Repartition<uint16_t, decltype(d)> du16; + return BitCast(d, RotateRight<8>(BitCast(du16, v))); +#else + alignas(16) static constexpr TFromD<D> kShuffle[16] = { + 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; + return TableLookupBytes(v, LoadDup128(d, kShuffle)); +#endif +} + +template <class D, HWY_IF_T_SIZE_D(D, 1)> +HWY_API VFromD<D> Reverse4(D d, VFromD<D> v) { +#if HWY_PREFER_ROTATE + const Repartition<uint16_t, decltype(d)> du16; + return BitCast(d, Reverse2(du16, BitCast(du16, Reverse2(d, v)))); +#else + alignas(16) static constexpr uint8_t kShuffle[16] = { + 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12}; + const Repartition<uint8_t, decltype(d)> du8; + return TableLookupBytes(v, BitCast(d, LoadDup128(du8, kShuffle))); +#endif +} + +template <class D, HWY_IF_T_SIZE_D(D, 1)> +HWY_API VFromD<D> Reverse8(D d, VFromD<D> v) { +#if HWY_PREFER_ROTATE + const Repartition<uint32_t, D> du32; + return BitCast(d, Reverse2(du32, BitCast(du32, Reverse4(d, v)))); +#else + alignas(16) static constexpr uint8_t kShuffle[16] = { + 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8}; + const Repartition<uint8_t, decltype(d)> du8; + return TableLookupBytes(v, BitCast(d, LoadDup128(du8, kShuffle))); +#endif +} + +#endif // HWY_NATIVE_REVERSE2_8 + +// ------------------------------ ReverseLaneBytes + +#if (defined(HWY_NATIVE_REVERSE_LANE_BYTES) == defined(HWY_TARGET_TOGGLE)) +#ifdef HWY_NATIVE_REVERSE_LANE_BYTES +#undef HWY_NATIVE_REVERSE_LANE_BYTES +#else +#define HWY_NATIVE_REVERSE_LANE_BYTES +#endif + +template <class V, HWY_IF_T_SIZE_V(V, 2)> +HWY_API V ReverseLaneBytes(V v) { + const DFromV<V> d; + const Repartition<uint8_t, decltype(d)> du8; + return BitCast(d, Reverse2(du8, BitCast(du8, v))); +} + +template <class V, HWY_IF_T_SIZE_V(V, 4)> +HWY_API V ReverseLaneBytes(V v) { + const DFromV<V> d; + const Repartition<uint8_t, decltype(d)> du8; + return BitCast(d, Reverse4(du8, BitCast(du8, v))); +} + 
+// 8-byte lanes: reverses the byte order within each lane by reinterpreting
+// the vector as bytes and reversing every group of eight.
+template <class V, HWY_IF_T_SIZE_V(V, 8)>
+HWY_API V ReverseLaneBytes(V v) {
+  const DFromV<V> d;
+  const Repartition<uint8_t, decltype(d)> du8;
+  const auto bytes = BitCast(du8, v);
+  const auto reversed = Reverse8(du8, bytes);
+  return BitCast(d, reversed);
+}
+
+#endif  // HWY_NATIVE_REVERSE_LANE_BYTES
+
+// ------------------------------ ReverseBits
+
+// On the targets listed below, 8-bit shifts are emulated via 16-bit shifts,
+// which requires at least two byte lanes so that the vector can be BitCast to
+// 16-bit. Highway's own 8-bit shifts are avoided because their extra masking
+// is already performed by UI8ReverseBitsStep. AVX3_DL/AVX3_ZEN4 have GFNI and
+// implement ReverseBits with it, so this code is unused there.
+#undef HWY_REVERSE_BITS_MIN_BYTES
+#if ((HWY_TARGET >= HWY_AVX3 && HWY_TARGET <= HWY_SSE2) || \
+     HWY_TARGET == HWY_WASM || HWY_TARGET == HWY_WASM_EMU256)
+#define HWY_REVERSE_BITS_MIN_BYTES 2
+#else
+#define HWY_REVERSE_BITS_MIN_BYTES 1
+#endif
+
+#if (defined(HWY_NATIVE_REVERSE_BITS_UI8) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_REVERSE_BITS_UI8
+#undef HWY_NATIVE_REVERSE_BITS_UI8
+#else
+#define HWY_NATIVE_REVERSE_BITS_UI8
+#endif
+
+namespace detail {
+
+// One swap step of the byte-wise bit reversal: exchanges adjacent groups of
+// kShiftAmt bits. kShrResultMask selects the bit positions that receive the
+// right-shifted value; all other positions receive the left-shifted value.
+template <int kShiftAmt, int kShrResultMask, class V,
+          HWY_IF_V_SIZE_GT_D(DFromV<V>, HWY_REVERSE_BITS_MIN_BYTES - 1)>
+HWY_INLINE V UI8ReverseBitsStep(V v) {
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;
+#if HWY_REVERSE_BITS_MIN_BYTES == 2
+  const Repartition<uint16_t, decltype(d)> d_shift;
+#else
+  const RebindToUnsigned<decltype(d)> d_shift;
+#endif
+
+  // Shift in the (possibly wider) lane type, then view the results as V again.
+  const auto shift_input = BitCast(d_shift, v);
+  const auto take_from_shr =
+      BitCast(d, Set(du, static_cast<uint8_t>(kShrResultMask)));
+  const auto moved_down = BitCast(d, ShiftRight<kShiftAmt>(shift_input));
+  const auto moved_up = BitCast(d, ShiftLeft<kShiftAmt>(shift_input));
+
+  const auto low_part = And(moved_down, take_from_shr);
+  const auto high_part = AndNot(take_from_shr, moved_up);
+  return Or(low_part, high_part);
+}
+
+#if HWY_REVERSE_BITS_MIN_BYTES == 2
+// Single-byte vectors cannot be viewed as 16-bit lanes, so run the step on a
+// full Vec128<uint8_t> holding the same raw register and re-wrap the result.
+template <int kShiftAmt, int kShrResultMask, class V,
+          HWY_IF_V_SIZE_D(DFromV<V>, 1)>
+HWY_INLINE V UI8ReverseBitsStep(V v) {
+  const Vec128<uint8_t> full{v.raw};
+  return V{UI8ReverseBitsStep<kShiftAmt, kShrResultMask>(full).raw};
+}
+#endif
+
+}  // namespace detail
+
+// 8-bit lanes: swap adjacent bits, then bit pairs, then nibbles.
+template <class V, HWY_IF_T_SIZE_V(V, 1)>
+HWY_API V ReverseBits(V v) {
+  return detail::UI8ReverseBitsStep<4, 0x0F>(
+      detail::UI8ReverseBitsStep<2, 0x33>(
+          detail::UI8ReverseBitsStep<1, 0x55>(v)));
+}
+
+#endif  // HWY_NATIVE_REVERSE_BITS_UI8
+
+#if (defined(HWY_NATIVE_REVERSE_BITS_UI16_32_64) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_REVERSE_BITS_UI16_32_64
+#undef HWY_NATIVE_REVERSE_BITS_UI16_32_64
+#else
+#define HWY_NATIVE_REVERSE_BITS_UI16_32_64
+#endif
+
+// 16/32/64-bit integer lanes: reverse the bits inside every byte, then reverse
+// the order of the bytes inside every lane.
+template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 2) | (1 << 4) | (1 << 8)),
+          HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
+HWY_API V ReverseBits(V v) {
+  const DFromV<decltype(v)> d;
+  const Repartition<uint8_t, decltype(d)> du8;
+  const auto bits_reversed_per_byte = ReverseBits(BitCast(du8, v));
+  return ReverseLaneBytes(BitCast(d, bits_reversed_per_byte));
+}
+#endif  // HWY_NATIVE_REVERSE_BITS_UI16_32_64
+
+// ================================================== Operator wrapper
+
+// SVE* and RVV currently cannot define operators and have already defined
+// (only) the corresponding functions such as Add.
+#if (defined(HWY_NATIVE_OPERATOR_REPLACEMENTS) == defined(HWY_TARGET_TOGGLE)) +#ifdef HWY_NATIVE_OPERATOR_REPLACEMENTS +#undef HWY_NATIVE_OPERATOR_REPLACEMENTS +#else +#define HWY_NATIVE_OPERATOR_REPLACEMENTS +#endif + +template <class V> +HWY_API V Add(V a, V b) { + return a + b; +} +template <class V> +HWY_API V Sub(V a, V b) { + return a - b; +} + +template <class V> +HWY_API V Mul(V a, V b) { + return a * b; +} +template <class V> +HWY_API V Div(V a, V b) { + return a / b; +} + +template <class V> +V Shl(V a, V b) { + return a << b; +} +template <class V> +V Shr(V a, V b) { + return a >> b; +} + +template <class V> +HWY_API auto Eq(V a, V b) -> decltype(a == b) { + return a == b; +} +template <class V> +HWY_API auto Ne(V a, V b) -> decltype(a == b) { + return a != b; +} +template <class V> +HWY_API auto Lt(V a, V b) -> decltype(a == b) { + return a < b; +} + +template <class V> +HWY_API auto Gt(V a, V b) -> decltype(a == b) { + return a > b; +} +template <class V> +HWY_API auto Ge(V a, V b) -> decltype(a == b) { + return a >= b; +} + +template <class V> +HWY_API auto Le(V a, V b) -> decltype(a == b) { + return a <= b; +} + +#endif // HWY_NATIVE_OPERATOR_REPLACEMENTS + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); diff --git a/third_party/highway/hwy/ops/ppc_vsx-inl.h b/third_party/highway/hwy/ops/ppc_vsx-inl.h new file mode 100644 index 0000000000..49b1b6525f --- /dev/null +++ b/third_party/highway/hwy/ops/ppc_vsx-inl.h @@ -0,0 +1,4920 @@ +// Copyright 2023 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// 128-bit vectors for VSX +// External include guard in highway.h - see comment there. + +#pragma push_macro("vector") +#pragma push_macro("pixel") +#pragma push_macro("bool") + +#undef vector +#undef pixel +#undef bool + +#include <altivec.h> + +#pragma pop_macro("vector") +#pragma pop_macro("pixel") +#pragma pop_macro("bool") + +#include <string.h> // memcpy + +#include "hwy/ops/shared-inl.h" + +// clang's altivec.h gates some intrinsics behind #ifdef __POWER10_VECTOR__. +// This means we can only use POWER10-specific intrinsics in static dispatch +// mode (where the -mpower10-vector compiler flag is passed). Same for PPC9. +// On other compilers, the usual target check is sufficient. 
+// HWY_PPC_HAVE_9 is 1 if the target is PPC9 or better and, when compiling with
+// clang, __POWER9_VECTOR__ is also defined; otherwise 0.
+#if HWY_TARGET <= HWY_PPC9 && \
+    (!HWY_COMPILER_CLANG || defined(__POWER9_VECTOR__))
+#define HWY_PPC_HAVE_9 1
+#else
+#define HWY_PPC_HAVE_9 0
+#endif
+
+// HWY_PPC_HAVE_10: same rule for PPC10 and __POWER10_VECTOR__.
+#if HWY_TARGET <= HWY_PPC10 && \
+    (!HWY_COMPILER_CLANG || defined(__POWER10_VECTOR__))
+#define HWY_PPC_HAVE_10 1
+#else
+#define HWY_PPC_HAVE_10 0
+#endif
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+namespace detail {
+
+// Maps a lane type T to the raw vector types used to store it. Only the
+// specializations below are defined.
+template <typename T>
+struct Raw128;
+
+// Each Raw128 specialization defines the following typedefs:
+// - type:
+//   the backing Altivec/VSX raw vector type of the Vec128<T, N> type
+// - RawBoolVec:
+//   the backing Altivec/VSX raw __bool vector type of the Mask128<T, N> type
+// - RawT:
+//   the lane type for intrinsics, in particular vec_splat
+// - AlignedRawVec:
+//   the 128-bit GCC/Clang vector type for aligned loads/stores
+// - UnalignedRawVec:
+//   the 128-bit GCC/Clang vector type for unaligned loads/stores
+#define HWY_VSX_RAW128(LANE_TYPE, RAW_VECT_LANE_TYPE, RAW_BOOL_VECT_LANE_TYPE) \
+  template <>                                                                  \
+  struct Raw128<LANE_TYPE> {                                                   \
+    using type = __vector RAW_VECT_LANE_TYPE;                                  \
+    using RawBoolVec = __vector __bool RAW_BOOL_VECT_LANE_TYPE;                \
+    using RawT = RAW_VECT_LANE_TYPE;                                           \
+    typedef LANE_TYPE AlignedRawVec                                            \
+        __attribute__((__vector_size__(16), __aligned__(16), __may_alias__));  \
+    typedef LANE_TYPE UnalignedRawVec __attribute__((                          \
+        __vector_size__(16), __aligned__(alignof(LANE_TYPE)), __may_alias__)); \
+  };
+
+// Arguments: lane type, raw vector element type, __bool vector element type.
+// Note that float/double use int/long long for their __bool vector.
+HWY_VSX_RAW128(int8_t, signed char, char)
+HWY_VSX_RAW128(uint8_t, unsigned char, char)
+HWY_VSX_RAW128(int16_t, signed short, short)     // NOLINT(runtime/int)
+HWY_VSX_RAW128(uint16_t, unsigned short, short)  // NOLINT(runtime/int)
+HWY_VSX_RAW128(int32_t, signed int, int)
+HWY_VSX_RAW128(uint32_t, unsigned int, int)
+HWY_VSX_RAW128(int64_t, signed long long, long long)     // NOLINT(runtime/int)
+HWY_VSX_RAW128(uint64_t, unsigned long long, long long)  // NOLINT(runtime/int)
+HWY_VSX_RAW128(float, float, int)
+HWY_VSX_RAW128(double, double, long long)  // NOLINT(runtime/int)
+
+// bfloat16_t and float16_t reuse the uint16_t raw types.
+template <>
+struct Raw128<bfloat16_t> : public Raw128<uint16_t> {};
+
+template <>
+struct Raw128<float16_t> : public Raw128<uint16_t> {};
+
+#undef HWY_VSX_RAW128
+
+}  // namespace detail
+
+// Vector of up to 16 bytes: N lanes of type T (default N fills all 16 bytes),
+// stored in the raw vector type given by detail::Raw128<T>.
+template <typename T, size_t N = 16 / sizeof(T)>
+class Vec128 {
+  using Raw = typename detail::Raw128<T>::type;
+
+ public:
+  using PrivateT = T;                     // only for DFromV
+  static constexpr size_t kPrivateN = N;  // only for DFromV
+
+  // Compound assignment. Only usable if there is a corresponding non-member
+  // binary operator overload. For example, only f32 and f64 support division.
+  HWY_INLINE Vec128& operator*=(const Vec128 other) {
+    return *this = (*this * other);
+  }
+  HWY_INLINE Vec128& operator/=(const Vec128 other) {
+    return *this = (*this / other);
+  }
+  HWY_INLINE Vec128& operator+=(const Vec128 other) {
+    return *this = (*this + other);
+  }
+  HWY_INLINE Vec128& operator-=(const Vec128 other) {
+    return *this = (*this - other);
+  }
+  HWY_INLINE Vec128& operator&=(const Vec128 other) {
+    return *this = (*this & other);
+  }
+  HWY_INLINE Vec128& operator|=(const Vec128 other) {
+    return *this = (*this | other);
+  }
+  HWY_INLINE Vec128& operator^=(const Vec128 other) {
+    return *this = (*this ^ other);
+  }
+
+  // Underlying Altivec/VSX register value; public so that free functions can
+  // pass it to intrinsics.
+  Raw raw;
+};
+
+// Aliases for 8-, 4- and 2-byte partial vectors of lane type T.
+template <typename T>
+using Vec64 = Vec128<T, 8 / sizeof(T)>;
+
+template <typename T>
+using Vec32 = Vec128<T, 4 / sizeof(T)>;
+
+template <typename T>
+using Vec16 = Vec128<T, 2 / sizeof(T)>;
+
+// FF..FF or 0.
+// Mask for N lanes of type T, stored in the raw __bool vector type given by
+// detail::Raw128<T>.
+template <typename T, size_t N = 16 / sizeof(T)>
+struct Mask128 {
+  typename detail::Raw128<T>::RawBoolVec raw;
+
+  using PrivateT = T;                     // only for DFromM
+  static constexpr size_t kPrivateN = N;  // only for DFromM
+};
+
+// Tag type (Simd<T, N, 0>) recovered from a vector type V.
+template <class V>
+using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;
+
+// Tag type (Simd<T, N, 0>) recovered from a mask type M.
+template <class M>
+using DFromM = Simd<typename M::PrivateT, M::kPrivateN, 0>;
+
+// Lane type of a vector type V.
+template <class V>
+using TFromV = typename V::PrivateT;
+
+// ------------------------------ Zero
+
+// Returns an all-zero vector/part.
+template <class D, typename T = TFromD<D>>
+HWY_API Vec128<T, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
+  // There is no vec_splats for 64-bit, so we cannot rely on casting the 0
+  // argument in order to select the correct overload. We instead cast the
+  // return vector type; see also the comment in BitCast.
+  return Vec128<T, HWY_MAX_LANES_D(D)>{
+      reinterpret_cast<typename detail::Raw128<T>::type>(vec_splats(0))};
+}
+
+// Vector type for tag D, defined as whatever Zero(D()) returns.
+template <class D>
+using VFromD = decltype(Zero(D()));
+
+// ------------------------------ Tuple (VFromD)
+#include "hwy/ops/tuple-inl.h"
+
+// ------------------------------ BitCast
+
+// Reinterprets the bits of v as a vector of tag D. The parameter type requires
+// v to have the lane count obtained by repartitioning D to FromT.
+template <class D, typename FromT>
+HWY_API VFromD<D> BitCast(D /*d*/,
+                          Vec128<FromT, Repartition<FromT, D>().MaxLanes()> v) {
+  // C-style casts are not sufficient when compiling with
+  // -fno-lax-vector-conversions, which will be the future default in Clang,
+  // but reinterpret_cast is.
+  return VFromD<D>{
+      reinterpret_cast<typename detail::Raw128<TFromD<D>>::type>(v.raw)};
+}
+
+// ------------------------------ ResizeBitCast
+
+// Like BitCast, but accepts any vector type FromV; only v.raw is reinterpreted.
+template <class D, typename FromV>
+HWY_API VFromD<D> ResizeBitCast(D /*d*/, FromV v) {
+  // C-style casts are not sufficient when compiling with
+  // -fno-lax-vector-conversions, which will be the future default in Clang,
+  // but reinterpret_cast is.
+  return VFromD<D>{
+      reinterpret_cast<typename detail::Raw128<TFromD<D>>::type>(v.raw)};
+}
+
+// ------------------------------ Set
+
+// Returns a vector/part with all lanes set to "t".
+template <class D, HWY_IF_NOT_SPECIAL_FLOAT(TFromD<D>)>
+HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
+  // Cast to the intrinsic's lane type so vec_splats selects the right overload.
+  using RawLane = typename detail::Raw128<TFromD<D>>::RawT;
+  return VFromD<D>{vec_splats(static_cast<RawLane>(t))};
+}
+
+// Returns a vector with uninitialized elements (zero-initialized when
+// HWY_COMPILER_GCC_ACTUAL is set, see below).
+template <class D>
+HWY_API VFromD<D> Undefined(D d) {
+#if HWY_COMPILER_GCC_ACTUAL
+  // Suppressing maybe-uninitialized both here and at the caller does not work,
+  // so initialize.
+  return Zero(d);
+#else
+  HWY_DIAGNOSTICS(push)
+  HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
+  typename detail::Raw128<TFromD<D>>::type raw;
+  return VFromD<decltype(d)>{raw};
+  HWY_DIAGNOSTICS(pop)
+#endif
+}
+
+// ------------------------------ GetLane
+
+// Gets the single value stored in a vector/part.
+
+// For vectors with more than one lane, this is lane 0 (v.raw[0]).
+template <typename T, size_t N>
+HWY_API T GetLane(Vec128<T, N> v) {
+  return static_cast<T>(v.raw[0]);
+}
+
+// ================================================== LOGICAL
+
+// ------------------------------ And
+
+// Bitwise a & b. Operands are cast to the unsigned lane type of the same size
+// before calling vec_and, then cast back.
+template <typename T, size_t N>
+HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) {
+  const DFromV<decltype(a)> d;
+  const RebindToUnsigned<decltype(d)> du;
+  using VU = VFromD<decltype(du)>;
+  return BitCast(d, VU{vec_and(BitCast(du, a).raw, BitCast(du, b).raw)});
+}
+
+// ------------------------------ AndNot
+
+// Returns ~not_mask & mask.
+// Note the swapped argument order in the vec_andc call: mask is passed first,
+// not_mask second.
+template <typename T, size_t N>
+HWY_API Vec128<T, N> AndNot(Vec128<T, N> not_mask, Vec128<T, N> mask) {
+  const DFromV<decltype(mask)> d;
+  const RebindToUnsigned<decltype(d)> du;
+  using VU = VFromD<decltype(du)>;
+  return BitCast(
+      d, VU{vec_andc(BitCast(du, mask).raw, BitCast(du, not_mask).raw)});
+}
+
+// ------------------------------ Or
+
+// Bitwise a | b, computed on the unsigned view of the operands.
+template <typename T, size_t N>
+HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) {
+  const DFromV<decltype(a)> d;
+  const RebindToUnsigned<decltype(d)> du;
+  using VU = VFromD<decltype(du)>;
+  return BitCast(d, VU{vec_or(BitCast(du, a).raw, BitCast(du, b).raw)});
+}
+
+// ------------------------------ Xor
+
+// Bitwise a ^ b, computed on the unsigned view of the operands.
+template <typename T, size_t N>
+HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) {
+  const DFromV<decltype(a)> d;
+  const RebindToUnsigned<decltype(d)> du;
+  using VU = VFromD<decltype(du)>;
+  return BitCast(d, VU{vec_xor(BitCast(du, a).raw, BitCast(du, b).raw)});
+}
+
+// ------------------------------ Not
+// Bitwise ~v, implemented as vec_nor of v with itself.
+template <typename T, size_t N>
+HWY_API Vec128<T, N> Not(Vec128<T, N> v) {
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;
+  using VU = VFromD<decltype(du)>;
+  return BitCast(d, VU{vec_nor(BitCast(du, v).raw, BitCast(du, v).raw)});
+}
+
+// ------------------------------ IsConstantRawAltivecVect
+namespace detail {
+
+// The SizeTag overloads return true if __builtin_constant_p holds for every
+// element of the 16-byte raw vector v: 16, 8, 4 or 2 elements for lane sizes
+// of 1, 2, 4 or 8 bytes respectively.
+
+template <class RawV>
+static HWY_INLINE bool IsConstantRawAltivecVect(
+    hwy::SizeTag<1> /* lane_size_tag */, RawV v) {
+  return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]) &&
+         __builtin_constant_p(v[2]) && __builtin_constant_p(v[3]) &&
+         __builtin_constant_p(v[4]) && __builtin_constant_p(v[5]) &&
+         __builtin_constant_p(v[6]) && __builtin_constant_p(v[7]) &&
+         __builtin_constant_p(v[8]) && __builtin_constant_p(v[9]) &&
+         __builtin_constant_p(v[10]) && __builtin_constant_p(v[11]) &&
+         __builtin_constant_p(v[12]) && __builtin_constant_p(v[13]) &&
+         __builtin_constant_p(v[14]) && __builtin_constant_p(v[15]);
+}
+
+template <class RawV>
+static HWY_INLINE bool IsConstantRawAltivecVect(
+    hwy::SizeTag<2> /* lane_size_tag */, RawV v) {
+  return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]) &&
+         __builtin_constant_p(v[2]) && __builtin_constant_p(v[3]) &&
+         __builtin_constant_p(v[4]) && __builtin_constant_p(v[5]) &&
+         __builtin_constant_p(v[6]) && __builtin_constant_p(v[7]);
+}
+
+template <class RawV>
+static HWY_INLINE bool IsConstantRawAltivecVect(
+    hwy::SizeTag<4> /* lane_size_tag */, RawV v) {
+  return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]) &&
+         __builtin_constant_p(v[2]) && __builtin_constant_p(v[3]);
+}
+
+template <class RawV>
+static HWY_INLINE bool IsConstantRawAltivecVect(
+    hwy::SizeTag<8> /* lane_size_tag */, RawV v) {
+  return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]);
+}
+
+// Entry point: dispatches on the element size of v.
+template <class RawV>
+static HWY_INLINE bool IsConstantRawAltivecVect(RawV v) {
+  return IsConstantRawAltivecVect(hwy::SizeTag<sizeof(decltype(v[0]))>(), v);
+}
+
+}  // namespace detail
+
+// ------------------------------ TernaryLogic
+#if HWY_PPC_HAVE_10
+namespace detail {
+
+// NOTE: the kTernLogOp bits of the PPC10 TernaryLogic operation are in reverse
+// order of the kTernLogOp bits of AVX3
+// _mm_ternarylogic_epi64(a, b, c, kTernLogOp)
+//
+// Applies the three-input bitwise function selected by kTernLogOp to a, b and
+// c, operating on their unsigned views.
+template <uint8_t kTernLogOp, class V>
+HWY_INLINE V TernaryLogic(V a, V b, V c) {
+  const DFromV<decltype(a)> d;
+  const RebindToUnsigned<decltype(d)> du;
+  using VU = VFromD<decltype(du)>;
+  const auto a_raw = BitCast(du, a).raw;
+  const auto b_raw = BitCast(du, b).raw;
+  const auto c_raw = BitCast(du, c).raw;
+
+#if HWY_COMPILER_GCC_ACTUAL
+  // Use inline assembly on GCC to work around GCC compiler bug
+  typename detail::Raw128<TFromV<VU>>::type raw_ternlog_result;
+  __asm__("xxeval %x0,%x1,%x2,%x3,%4"
+          : "=wa"(raw_ternlog_result)
+          : "wa"(a_raw), "wa"(b_raw), "wa"(c_raw), "n"(kTernLogOp)
+          :);
+#else
+  const auto raw_ternlog_result =
+      vec_ternarylogic(a_raw, b_raw, c_raw, kTernLogOp);
+#endif
+
+  return BitCast(d, VU{raw_ternlog_result});
+}
+
+}  // namespace detail
+#endif  // HWY_PPC_HAVE_10
+
+// ------------------------------ Xor3
+// Returns x1 ^ x2 ^ x3. With HWY_PPC_HAVE_10 this is one TernaryLogic<0x69>;
+// otherwise two Xor.
+template <typename T, size_t N>
+HWY_API Vec128<T, N> Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) {
+#if HWY_PPC_HAVE_10
+#if defined(__OPTIMIZE__)
+  // If at least two inputs are compile-time constants, use plain Xor instead
+  // (presumably so the compiler can fold the constant operands).
+  if (static_cast<int>(detail::IsConstantRawAltivecVect(x1.raw)) +
+          static_cast<int>(detail::IsConstantRawAltivecVect(x2.raw)) +
+          static_cast<int>(detail::IsConstantRawAltivecVect(x3.raw)) >=
+      2) {
+    return Xor(x1, Xor(x2, x3));
+  } else  // NOLINT
+#endif
+  {
+    return detail::TernaryLogic<0x69>(x1, x2, x3);
+  }
+#else
+  return Xor(x1, Xor(x2, x3));
+#endif
+}
+
+// ------------------------------ Or3
+// Returns o1 | o2 | o3. With HWY_PPC_HAVE_10 this is one TernaryLogic<0x7F>;
+// otherwise two Or.
+template <typename T, size_t N>
+HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
+#if HWY_PPC_HAVE_10
+#if defined(__OPTIMIZE__)
+  // Same constant-input shortcut as in Xor3.
+  if (static_cast<int>(detail::IsConstantRawAltivecVect(o1.raw)) +
+          static_cast<int>(detail::IsConstantRawAltivecVect(o2.raw)) +
+          static_cast<int>(detail::IsConstantRawAltivecVect(o3.raw)) >=
+      2) {
+    return Or(o1, Or(o2, o3));
+  } else  // NOLINT
+#endif
+  {
+    return detail::TernaryLogic<0x7F>(o1, o2, o3);
+  }
+#else
+  return Or(o1, Or(o2, o3));
+#endif
+}
+
+// ------------------------------ OrAnd
+// Returns o | (a1 & a2). With HWY_PPC_HAVE_10 this is one TernaryLogic<0x1F>;
+// otherwise Or of And.
+template <typename T, size_t N>
+HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
+#if HWY_PPC_HAVE_10
+#if defined(__OPTIMIZE__)
+  // Only take the two-instruction path when both And operands are constants.
+  if (detail::IsConstantRawAltivecVect(a1.raw) &&
+      detail::IsConstantRawAltivecVect(a2.raw)) {
+    return Or(o, And(a1, a2));
+  } else  // NOLINT
+#endif
+  {
+    return detail::TernaryLogic<0x1F>(o, a1, a2);
+  }
+#else
+  return Or(o, And(a1, a2));
+#endif
+}
+
+// ------------------------------ IfVecThenElse
+// Bitwise select via vec_sel(no, yes, mask), computed on the unsigned views of
+// the operands.
+template <typename T, size_t N>
+HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
+                                   Vec128<T, N> no) {
+  const DFromV<decltype(yes)> d;
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(
+      d, VFromD<decltype(du)>{vec_sel(BitCast(du, no).raw, BitCast(du, yes).raw,
+                                      BitCast(du, mask).raw)});
+}
+
+// ------------------------------ BitwiseIfThenElse
+
+#ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE
+#undef HWY_NATIVE_BITWISE_IF_THEN_ELSE
+#else
+#define HWY_NATIVE_BITWISE_IF_THEN_ELSE
+#endif
+
+// Same operation as IfVecThenElse; forwards to it.
+template <class V>
+HWY_API V BitwiseIfThenElse(V mask, V yes, V no) {
+  return IfVecThenElse(mask, yes, no);
+}
+
+// ------------------------------ Operator overloads (internal-only if float)
+
+// operator&, operator| and operator^ forward to And, Or and Xor.
+
+template <typename T, size_t N>
+HWY_API Vec128<T, N> operator&(Vec128<T, N> a, Vec128<T, N> b) {
+  return And(a, b);
+}
+
+template <typename T, size_t N>
+HWY_API Vec128<T, N> operator|(Vec128<T, N> a, Vec128<T, N> b) {
+  return Or(a, b);
+}
+
+template <typename T, size_t N>
+HWY_API Vec128<T, N> operator^(Vec128<T, N> a, Vec128<T, N> b) {
+  return Xor(a, b);
+}
+
+// ================================================== SIGN
+
+// ------------------------------ Neg
+
+// Returns -v via vec_neg.
+// NOTE(review): declared HWY_INLINE whereas neighbouring ops use HWY_API;
+// confirm whether that is intentional.
+template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
+HWY_INLINE Vec128<T, N> Neg(Vec128<T, N> v) {
+  return Vec128<T, N>{vec_neg(v.raw)};
+}
+
+// ------------------------------ Abs
+
+// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
+template <class T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)> +HWY_API Vec128<T, N> Abs(Vec128<T, N> v) { + return Vec128<T, N>{vec_abs(v.raw)}; +} + +// ------------------------------ CopySign + +template <size_t N> +HWY_API Vec128<float, N> CopySign(Vec128<float, N> magn, + Vec128<float, N> sign) { + // Work around compiler bugs that are there with vec_cpsgn on older versions + // of GCC/Clang +#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1200 + return Vec128<float, N>{__builtin_vec_copysign(magn.raw, sign.raw)}; +#elif HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200 && \ + HWY_HAS_BUILTIN(__builtin_vsx_xvcpsgnsp) + return Vec128<float, N>{__builtin_vsx_xvcpsgnsp(magn.raw, sign.raw)}; +#else + return Vec128<float, N>{vec_cpsgn(sign.raw, magn.raw)}; +#endif +} + +template <size_t N> +HWY_API Vec128<double, N> CopySign(Vec128<double, N> magn, + Vec128<double, N> sign) { + // Work around compiler bugs that are there with vec_cpsgn on older versions + // of GCC/Clang +#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1200 + return Vec128<double, N>{__builtin_vec_copysign(magn.raw, sign.raw)}; +#elif HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200 && \ + HWY_HAS_BUILTIN(__builtin_vsx_xvcpsgndp) + return Vec128<double, N>{__builtin_vsx_xvcpsgndp(magn.raw, sign.raw)}; +#else + return Vec128<double, N>{vec_cpsgn(sign.raw, magn.raw)}; +#endif +} + +template <typename T, size_t N> +HWY_API Vec128<T, N> CopySignToAbs(Vec128<T, N> abs, Vec128<T, N> sign) { + // PPC8 can also handle abs < 0, so no extra action needed. + static_assert(IsFloat<T>(), "Only makes sense for floating-point"); + return CopySign(abs, sign); +} + +// ================================================== MEMORY (1) + +// Note: type punning is safe because the types are tagged with may_alias. 
+// (https://godbolt.org/z/fqrWjfjsP) + +// ------------------------------ Load + +template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>> +HWY_API Vec128<T> Load(D /* tag */, const T* HWY_RESTRICT aligned) { + using LoadRaw = typename detail::Raw128<T>::AlignedRawVec; + const LoadRaw* HWY_RESTRICT p = reinterpret_cast<const LoadRaw*>(aligned); + using ResultRaw = typename detail::Raw128<T>::type; + return Vec128<T>{reinterpret_cast<ResultRaw>(*p)}; +} + +// Any <= 64 bit +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T = TFromD<D>> +HWY_API VFromD<D> Load(D d, const T* HWY_RESTRICT p) { + using BitsT = UnsignedFromSize<d.MaxBytes()>; + + BitsT bits; + const Repartition<BitsT, decltype(d)> d_bits; + CopyBytes<d.MaxBytes()>(p, &bits); + return BitCast(d, Set(d_bits, bits)); +} + +// ================================================== MASK + +// ------------------------------ Mask + +// Mask and Vec are both backed by vector types (true = FF..FF). +template <typename T, size_t N> +HWY_API Mask128<T, N> MaskFromVec(Vec128<T, N> v) { + using Raw = typename detail::Raw128<T>::RawBoolVec; + return Mask128<T, N>{reinterpret_cast<Raw>(v.raw)}; +} + +template <class D> +using MFromD = decltype(MaskFromVec(VFromD<D>())); + +template <typename T, size_t N> +HWY_API Vec128<T, N> VecFromMask(Mask128<T, N> v) { + return Vec128<T, N>{ + reinterpret_cast<typename detail::Raw128<T>::type>(v.raw)}; +} + +template <class D> +HWY_API VFromD<D> VecFromMask(D /* tag */, MFromD<D> v) { + return VFromD<D>{ + reinterpret_cast<typename detail::Raw128<TFromD<D>>::type>(v.raw)}; +} + +// mask ? yes : no +template <typename T, size_t N> +HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes, + Vec128<T, N> no) { + const DFromV<decltype(yes)> d; + const RebindToUnsigned<decltype(d)> du; + return BitCast(d, VFromD<decltype(du)>{vec_sel( + BitCast(du, no).raw, BitCast(du, yes).raw, mask.raw)}); +} + +// mask ? 
yes : 0 +template <typename T, size_t N> +HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) { + const DFromV<decltype(yes)> d; + const RebindToUnsigned<decltype(d)> du; + return BitCast(d, + VFromD<decltype(du)>{vec_and(BitCast(du, yes).raw, mask.raw)}); +} + +// mask ? 0 : no +template <typename T, size_t N> +HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) { + const DFromV<decltype(no)> d; + const RebindToUnsigned<decltype(d)> du; + return BitCast(d, + VFromD<decltype(du)>{vec_andc(BitCast(du, no).raw, mask.raw)}); +} + +// ------------------------------ Mask logical + +template <typename T, size_t N> +HWY_API Mask128<T, N> Not(Mask128<T, N> m) { + return Mask128<T, N>{vec_nor(m.raw, m.raw)}; +} + +template <typename T, size_t N> +HWY_API Mask128<T, N> And(Mask128<T, N> a, Mask128<T, N> b) { + return Mask128<T, N>{vec_and(a.raw, b.raw)}; +} + +template <typename T, size_t N> +HWY_API Mask128<T, N> AndNot(Mask128<T, N> a, Mask128<T, N> b) { + return Mask128<T, N>{vec_andc(b.raw, a.raw)}; +} + +template <typename T, size_t N> +HWY_API Mask128<T, N> Or(Mask128<T, N> a, Mask128<T, N> b) { + return Mask128<T, N>{vec_or(a.raw, b.raw)}; +} + +template <typename T, size_t N> +HWY_API Mask128<T, N> Xor(Mask128<T, N> a, Mask128<T, N> b) { + return Mask128<T, N>{vec_xor(a.raw, b.raw)}; +} + +template <typename T, size_t N> +HWY_API Mask128<T, N> ExclusiveNeither(Mask128<T, N> a, Mask128<T, N> b) { + return Mask128<T, N>{vec_nor(a.raw, b.raw)}; +} + +// ------------------------------ BroadcastSignBit + +template <size_t N> +HWY_API Vec128<int8_t, N> BroadcastSignBit(Vec128<int8_t, N> v) { + return Vec128<int8_t, N>{ + vec_sra(v.raw, vec_splats(static_cast<unsigned char>(7)))}; +} + +template <size_t N> +HWY_API Vec128<int16_t, N> BroadcastSignBit(Vec128<int16_t, N> v) { + return Vec128<int16_t, N>{ + vec_sra(v.raw, vec_splats(static_cast<unsigned short>(15)))}; +} + +template <size_t N> +HWY_API Vec128<int32_t, N> 
BroadcastSignBit(Vec128<int32_t, N> v) { + return Vec128<int32_t, N>{vec_sra(v.raw, vec_splats(31u))}; +} + +template <size_t N> +HWY_API Vec128<int64_t, N> BroadcastSignBit(Vec128<int64_t, N> v) { + return Vec128<int64_t, N>{vec_sra(v.raw, vec_splats(63ULL))}; +} + +// ------------------------------ ShiftLeftSame + +template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)> +HWY_API Vec128<T, N> ShiftLeftSame(Vec128<T, N> v, const int bits) { + using TU = typename detail::Raw128<MakeUnsigned<T>>::RawT; + return Vec128<T, N>{vec_sl(v.raw, vec_splats(static_cast<TU>(bits)))}; +} + +// ------------------------------ ShiftRightSame + +template <typename T, size_t N, HWY_IF_UNSIGNED(T)> +HWY_API Vec128<T, N> ShiftRightSame(Vec128<T, N> v, const int bits) { + using TU = typename detail::Raw128<MakeUnsigned<T>>::RawT; + return Vec128<T, N>{vec_sr(v.raw, vec_splats(static_cast<TU>(bits)))}; +} + +template <typename T, size_t N, HWY_IF_SIGNED(T)> +HWY_API Vec128<T, N> ShiftRightSame(Vec128<T, N> v, const int bits) { + using TU = typename detail::Raw128<MakeUnsigned<T>>::RawT; + return Vec128<T, N>{vec_sra(v.raw, vec_splats(static_cast<TU>(bits)))}; +} + +// ------------------------------ ShiftLeft + +template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)> +HWY_API Vec128<T, N> ShiftLeft(Vec128<T, N> v) { + static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift"); + return ShiftLeftSame(v, kBits); +} + +// ------------------------------ ShiftRight + +template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)> +HWY_API Vec128<T, N> ShiftRight(Vec128<T, N> v) { + static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift"); + return ShiftRightSame(v, kBits); +} + +// ================================================== SWIZZLE (1) + +// ------------------------------ TableLookupBytes +template <typename T, size_t N, typename TI, size_t NI> +HWY_API Vec128<TI, NI> TableLookupBytes(Vec128<T, N> bytes, + Vec128<TI, NI> 
from) { + const Repartition<uint8_t, DFromV<decltype(from)>> du8_from; + return Vec128<TI, NI>{reinterpret_cast<typename detail::Raw128<TI>::type>( + vec_perm(bytes.raw, bytes.raw, BitCast(du8_from, from).raw))}; +} + +// ------------------------------ TableLookupBytesOr0 +// For all vector widths; Altivec/VSX needs zero out +template <class V, class VI> +HWY_API VI TableLookupBytesOr0(const V bytes, const VI from) { + const DFromV<VI> di; + Repartition<int8_t, decltype(di)> di8; + const VI zeroOutMask = BitCast(di, BroadcastSignBit(BitCast(di8, from))); + return AndNot(zeroOutMask, TableLookupBytes(bytes, from)); +} + +// ------------------------------ Reverse +template <class D, typename T = TFromD<D>, HWY_IF_LANES_GT_D(D, 1)> +HWY_API Vec128<T> Reverse(D /* tag */, Vec128<T> v) { + return Vec128<T>{vec_reve(v.raw)}; +} + +// ------------------------------ Shuffles (Reverse) + +// Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant). +// Shuffle0321 rotates one lane to the right (the previous least-significant +// lane is now most-significant). These could also be implemented via +// CombineShiftRightBytes but the shuffle_abcd notation is more convenient. + +// Swap 32-bit halves in 64-bit halves. +template <typename T, size_t N> +HWY_API Vec128<T, N> Shuffle2301(Vec128<T, N> v) { + static_assert(sizeof(T) == 4, "Only for 32-bit lanes"); + static_assert(N == 2 || N == 4, "Does not make sense for N=1"); + const __vector unsigned char kShuffle = {4, 5, 6, 7, 0, 1, 2, 3, + 12, 13, 14, 15, 8, 9, 10, 11}; + return Vec128<T, N>{vec_perm(v.raw, v.raw, kShuffle)}; +} + +// These are used by generic_ops-inl to implement LoadInterleaved3. As with +// Intel's shuffle* intrinsics and InterleaveLower, the lower half of the output +// comes from the first argument. 
+namespace detail { + +template <typename T, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec32<T> ShuffleTwo2301(Vec32<T> a, Vec32<T> b) { + const __vector unsigned char kShuffle16 = {1, 0, 19, 18}; + return Vec32<T>{vec_perm(a.raw, b.raw, kShuffle16)}; +} +template <typename T, HWY_IF_T_SIZE(T, 2)> +HWY_API Vec64<T> ShuffleTwo2301(Vec64<T> a, Vec64<T> b) { + const __vector unsigned char kShuffle = {2, 3, 0, 1, 22, 23, 20, 21}; + return Vec64<T>{vec_perm(a.raw, b.raw, kShuffle)}; +} +template <typename T, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec128<T> ShuffleTwo2301(Vec128<T> a, Vec128<T> b) { + const __vector unsigned char kShuffle = {4, 5, 6, 7, 0, 1, 2, 3, + 28, 29, 30, 31, 24, 25, 26, 27}; + return Vec128<T>{vec_perm(a.raw, b.raw, kShuffle)}; +} + +template <typename T, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec32<T> ShuffleTwo1230(Vec32<T> a, Vec32<T> b) { + const __vector unsigned char kShuffle = {0, 3, 18, 17}; + return Vec32<T>{vec_perm(a.raw, b.raw, kShuffle)}; +} +template <typename T, HWY_IF_T_SIZE(T, 2)> +HWY_API Vec64<T> ShuffleTwo1230(Vec64<T> a, Vec64<T> b) { + const __vector unsigned char kShuffle = {0, 1, 6, 7, 20, 21, 18, 19}; + return Vec64<T>{vec_perm(a.raw, b.raw, kShuffle)}; +} +template <typename T, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec128<T> ShuffleTwo1230(Vec128<T> a, Vec128<T> b) { + const __vector unsigned char kShuffle = {0, 1, 2, 3, 12, 13, 14, 15, + 24, 25, 26, 27, 20, 21, 22, 23}; + return Vec128<T>{vec_perm(a.raw, b.raw, kShuffle)}; +} + +template <typename T, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec32<T> ShuffleTwo3012(Vec32<T> a, Vec32<T> b) { + const __vector unsigned char kShuffle = {2, 1, 16, 19}; + return Vec32<T>{vec_perm(a.raw, b.raw, kShuffle)}; +} +template <typename T, HWY_IF_T_SIZE(T, 2)> +HWY_API Vec64<T> ShuffleTwo3012(Vec64<T> a, Vec64<T> b) { + const __vector unsigned char kShuffle = {4, 5, 2, 3, 16, 17, 22, 23}; + return Vec64<T>{vec_perm(a.raw, b.raw, kShuffle)}; +} +template <typename T, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec128<T> ShuffleTwo3012(Vec128<T> a, 
Vec128<T> b) { + const __vector unsigned char kShuffle = {8, 9, 10, 11, 4, 5, 6, 7, + 16, 17, 18, 19, 28, 29, 30, 31}; + return Vec128<T>{vec_perm(a.raw, b.raw, kShuffle)}; +} + +} // namespace detail + +// Swap 64-bit halves +template <class T, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec128<T> Shuffle1032(Vec128<T> v) { + const Full128<T> d; + const Full128<uint64_t> du64; + return BitCast(d, Reverse(du64, BitCast(du64, v))); +} +template <class T, HWY_IF_T_SIZE(T, 8)> +HWY_API Vec128<T> Shuffle01(Vec128<T> v) { + return Reverse(Full128<T>(), v); +} + +// Rotate right 32 bits +template <class T, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec128<T> Shuffle0321(Vec128<T> v) { +#if HWY_IS_LITTLE_ENDIAN + return Vec128<T>{vec_sld(v.raw, v.raw, 12)}; +#else + return Vec128<T>{vec_sld(v.raw, v.raw, 4)}; +#endif +} +// Rotate left 32 bits +template <class T, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec128<T> Shuffle2103(Vec128<T> v) { +#if HWY_IS_LITTLE_ENDIAN + return Vec128<T>{vec_sld(v.raw, v.raw, 4)}; +#else + return Vec128<T>{vec_sld(v.raw, v.raw, 12)}; +#endif +} + +template <class T, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec128<T> Shuffle0123(Vec128<T> v) { + return Reverse(Full128<T>(), v); +} + +// ================================================== COMPARE + +// Comparisons fill a lane with 1-bits if the condition is true, else 0. 
+ +template <class DTo, typename TFrom, size_t NFrom> +HWY_API MFromD<DTo> RebindMask(DTo /*dto*/, Mask128<TFrom, NFrom> m) { + static_assert(sizeof(TFrom) == sizeof(TFromD<DTo>), "Must have same size"); + return MFromD<DTo>{m.raw}; +} + +template <typename T, size_t N> +HWY_API Mask128<T, N> TestBit(Vec128<T, N> v, Vec128<T, N> bit) { + static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported"); + return (v & bit) == bit; +} + +// ------------------------------ Equality + +template <typename T, size_t N> +HWY_API Mask128<T, N> operator==(Vec128<T, N> a, Vec128<T, N> b) { + return Mask128<T, N>{vec_cmpeq(a.raw, b.raw)}; +} + +// ------------------------------ Inequality + +// This cannot have T as a template argument, otherwise it is not more +// specialized than rewritten operator== in C++20, leading to compile +// errors: https://gcc.godbolt.org/z/xsrPhPvPT. +template <size_t N> +HWY_API Mask128<uint8_t, N> operator!=(Vec128<uint8_t, N> a, + Vec128<uint8_t, N> b) { +#if HWY_PPC_HAVE_9 + return Mask128<uint8_t, N>{vec_cmpne(a.raw, b.raw)}; +#else + return Not(a == b); +#endif +} +template <size_t N> +HWY_API Mask128<uint16_t, N> operator!=(Vec128<uint16_t, N> a, + Vec128<uint16_t, N> b) { +#if HWY_PPC_HAVE_9 + return Mask128<uint16_t, N>{vec_cmpne(a.raw, b.raw)}; +#else + return Not(a == b); +#endif +} +template <size_t N> +HWY_API Mask128<uint32_t, N> operator!=(Vec128<uint32_t, N> a, + Vec128<uint32_t, N> b) { +#if HWY_PPC_HAVE_9 + return Mask128<uint32_t, N>{vec_cmpne(a.raw, b.raw)}; +#else + return Not(a == b); +#endif +} +template <size_t N> +HWY_API Mask128<uint64_t, N> operator!=(Vec128<uint64_t, N> a, + Vec128<uint64_t, N> b) { + return Not(a == b); +} +template <size_t N> +HWY_API Mask128<int8_t, N> operator!=(Vec128<int8_t, N> a, + Vec128<int8_t, N> b) { +#if HWY_PPC_HAVE_9 + return Mask128<int8_t, N>{vec_cmpne(a.raw, b.raw)}; +#else + return Not(a == b); +#endif +} +template <size_t N> +HWY_API Mask128<int16_t, N> operator!=(Vec128<int16_t, N> 
a, + Vec128<int16_t, N> b) { +#if HWY_PPC_HAVE_9 + return Mask128<int16_t, N>{vec_cmpne(a.raw, b.raw)}; +#else + return Not(a == b); +#endif +} +template <size_t N> +HWY_API Mask128<int32_t, N> operator!=(Vec128<int32_t, N> a, + Vec128<int32_t, N> b) { +#if HWY_PPC_HAVE_9 + return Mask128<int32_t, N>{vec_cmpne(a.raw, b.raw)}; +#else + return Not(a == b); +#endif +} +template <size_t N> +HWY_API Mask128<int64_t, N> operator!=(Vec128<int64_t, N> a, + Vec128<int64_t, N> b) { + return Not(a == b); +} + +template <size_t N> +HWY_API Mask128<float, N> operator!=(Vec128<float, N> a, Vec128<float, N> b) { + return Not(a == b); +} + +template <size_t N> +HWY_API Mask128<double, N> operator!=(Vec128<double, N> a, + Vec128<double, N> b) { + return Not(a == b); +} + +// ------------------------------ Strict inequality + +template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)> +HWY_INLINE Mask128<T, N> operator>(Vec128<T, N> a, Vec128<T, N> b) { + return Mask128<T, N>{vec_cmpgt(a.raw, b.raw)}; +} + +// ------------------------------ Weak inequality + +template <typename T, size_t N, HWY_IF_FLOAT(T)> +HWY_API Mask128<T, N> operator>=(Vec128<T, N> a, Vec128<T, N> b) { + return Mask128<T, N>{vec_cmpge(a.raw, b.raw)}; +} + +template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)> +HWY_API Mask128<T, N> operator>=(Vec128<T, N> a, Vec128<T, N> b) { + return Not(b > a); +} + +// ------------------------------ Reversed comparisons + +template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)> +HWY_API Mask128<T, N> operator<(Vec128<T, N> a, Vec128<T, N> b) { + return b > a; +} + +template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)> +HWY_API Mask128<T, N> operator<=(Vec128<T, N> a, Vec128<T, N> b) { + return b >= a; +} + +// ================================================== MEMORY (2) + +// ------------------------------ Load +template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>> +HWY_API Vec128<T> LoadU(D /* tag */, const T* HWY_RESTRICT p) { + using 
LoadRaw = typename detail::Raw128<T>::UnalignedRawVec; + const LoadRaw* HWY_RESTRICT praw = reinterpret_cast<const LoadRaw*>(p); + using ResultRaw = typename detail::Raw128<T>::type; + return Vec128<T>{reinterpret_cast<ResultRaw>(*praw)}; +} + +// For < 128 bit, LoadU == Load. +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T = TFromD<D>> +HWY_API VFromD<D> LoadU(D d, const T* HWY_RESTRICT p) { + return Load(d, p); +} + +// 128-bit SIMD => nothing to duplicate, same as an unaligned load. +template <class D, typename T = TFromD<D>> +HWY_API VFromD<D> LoadDup128(D d, const T* HWY_RESTRICT p) { + return LoadU(d, p); +} + +// Returns a vector with lane i=[0, N) set to "first" + i. +namespace detail { + +template <class D, HWY_IF_T_SIZE_D(D, 1)> +HWY_INLINE VFromD<D> Iota0(D d) { + constexpr __vector unsigned char kU8Iota0 = {0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15}; + return BitCast(d, VFromD<RebindToUnsigned<D>>{kU8Iota0}); +} + +template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_NOT_SPECIAL_FLOAT_D(D)> +HWY_INLINE VFromD<D> Iota0(D d) { + constexpr __vector unsigned short kU16Iota0 = {0, 1, 2, 3, 4, 5, 6, 7}; + return BitCast(d, VFromD<RebindToUnsigned<D>>{kU16Iota0}); +} + +template <class D, HWY_IF_UI32_D(D)> +HWY_INLINE VFromD<D> Iota0(D d) { + constexpr __vector unsigned int kU32Iota0 = {0, 1, 2, 3}; + return BitCast(d, VFromD<RebindToUnsigned<D>>{kU32Iota0}); +} + +template <class D, HWY_IF_UI64_D(D)> +HWY_INLINE VFromD<D> Iota0(D d) { + constexpr __vector unsigned long long kU64Iota0 = {0, 1}; + return BitCast(d, VFromD<RebindToUnsigned<D>>{kU64Iota0}); +} + +template <class D, HWY_IF_F32_D(D)> +HWY_INLINE VFromD<D> Iota0(D /*d*/) { + constexpr __vector float kF32Iota0 = {0.0f, 1.0f, 2.0f, 3.0f}; + return VFromD<D>{kF32Iota0}; +} + +template <class D, HWY_IF_F64_D(D)> +HWY_INLINE VFromD<D> Iota0(D /*d*/) { + constexpr __vector double kF64Iota0 = {0.0, 1.0}; + return VFromD<D>{kF64Iota0}; +} + +} // namespace detail + +template <class D, 
typename T2> +HWY_API VFromD<D> Iota(D d, const T2 first) { + return detail::Iota0(d) + Set(d, static_cast<TFromD<D>>(first)); +} + +// ------------------------------ FirstN (Iota, Lt) + +template <class D> +HWY_API MFromD<D> FirstN(D d, size_t num) { + const RebindToUnsigned<decltype(d)> du; + using TU = TFromD<decltype(du)>; + return RebindMask(d, Iota(du, 0) < Set(du, static_cast<TU>(num))); +} + +// ------------------------------ MaskedLoad +template <class D, typename T = TFromD<D>> +HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d, const T* HWY_RESTRICT p) { + return IfThenElseZero(m, LoadU(d, p)); +} + +// ------------------------------ MaskedLoadOr +template <class D, typename T = TFromD<D>> +HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d, + const T* HWY_RESTRICT p) { + return IfThenElse(m, LoadU(d, p), v); +} + +// ------------------------------ Store + +template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>> +HWY_API void Store(Vec128<T> v, D /* tag */, T* HWY_RESTRICT aligned) { + using StoreRaw = typename detail::Raw128<T>::AlignedRawVec; + *reinterpret_cast<StoreRaw*>(aligned) = reinterpret_cast<StoreRaw>(v.raw); +} + +template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>> +HWY_API void StoreU(Vec128<T> v, D /* tag */, T* HWY_RESTRICT p) { + using StoreRaw = typename detail::Raw128<T>::UnalignedRawVec; + *reinterpret_cast<StoreRaw*>(p) = reinterpret_cast<StoreRaw>(v.raw); +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T = TFromD<D>> +HWY_API void Store(VFromD<D> v, D d, T* HWY_RESTRICT p) { + using BitsT = UnsignedFromSize<d.MaxBytes()>; + + const Repartition<BitsT, decltype(d)> d_bits; + const BitsT bits = GetLane(BitCast(d_bits, v)); + CopyBytes<d.MaxBytes()>(&bits, p); +} + +// For < 128 bit, StoreU == Store. 
+template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T = TFromD<D>> +HWY_API void StoreU(VFromD<D> v, D d, T* HWY_RESTRICT p) { + Store(v, d, p); +} + +// ------------------------------ BlendedStore + +template <class D> +HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d, + TFromD<D>* HWY_RESTRICT p) { + const RebindToSigned<decltype(d)> di; // for testing mask if T=bfloat16_t. + using TI = TFromD<decltype(di)>; + alignas(16) TI buf[MaxLanes(d)]; + alignas(16) TI mask[MaxLanes(d)]; + Store(BitCast(di, v), di, buf); + Store(BitCast(di, VecFromMask(d, m)), di, mask); + for (size_t i = 0; i < MaxLanes(d); ++i) { + if (mask[i]) { + CopySameSize(buf + i, p + i); + } + } +} + +// ================================================== ARITHMETIC + +// ------------------------------ Addition + +template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)> +HWY_API Vec128<T, N> operator+(Vec128<T, N> a, Vec128<T, N> b) { + return Vec128<T, N>{vec_add(a.raw, b.raw)}; +} + +// ------------------------------ Subtraction + +template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)> +HWY_API Vec128<T, N> operator-(Vec128<T, N> a, Vec128<T, N> b) { + return Vec128<T, N>{vec_sub(a.raw, b.raw)}; +} + +// ------------------------------ SumsOf8 +namespace detail { + +// Casts nominally uint32_t result to D. 
+template <class D> +HWY_INLINE VFromD<D> AltivecVsum4ubs(D d, __vector unsigned char a, + __vector unsigned int b) { + const Repartition<uint32_t, D> du32; +#ifdef __OPTIMIZE__ + if (IsConstantRawAltivecVect(a) && IsConstantRawAltivecVect(b)) { + const uint64_t sum0 = + static_cast<uint64_t>(a[0]) + static_cast<uint64_t>(a[1]) + + static_cast<uint64_t>(a[2]) + static_cast<uint64_t>(a[3]) + + static_cast<uint64_t>(b[0]); + const uint64_t sum1 = + static_cast<uint64_t>(a[4]) + static_cast<uint64_t>(a[5]) + + static_cast<uint64_t>(a[6]) + static_cast<uint64_t>(a[7]) + + static_cast<uint64_t>(b[1]); + const uint64_t sum2 = + static_cast<uint64_t>(a[8]) + static_cast<uint64_t>(a[9]) + + static_cast<uint64_t>(a[10]) + static_cast<uint64_t>(a[11]) + + static_cast<uint64_t>(b[2]); + const uint64_t sum3 = + static_cast<uint64_t>(a[12]) + static_cast<uint64_t>(a[13]) + + static_cast<uint64_t>(a[14]) + static_cast<uint64_t>(a[15]) + + static_cast<uint64_t>(b[3]); + return BitCast( + d, + VFromD<decltype(du32)>{(__vector unsigned int){ + static_cast<unsigned int>(sum0 <= 0xFFFFFFFFu ? sum0 : 0xFFFFFFFFu), + static_cast<unsigned int>(sum1 <= 0xFFFFFFFFu ? sum1 : 0xFFFFFFFFu), + static_cast<unsigned int>(sum2 <= 0xFFFFFFFFu ? sum2 : 0xFFFFFFFFu), + static_cast<unsigned int>(sum3 <= 0xFFFFFFFFu ? sum3 + : 0xFFFFFFFFu)}}); + } else // NOLINT +#endif + { + return BitCast(d, VFromD<decltype(du32)>{vec_vsum4ubs(a, b)}); + } +} + +// Casts nominally int32_t result to D. 
+template <class D> +HWY_INLINE VFromD<D> AltivecVsum2sws(D d, __vector signed int a, + __vector signed int b) { + const Repartition<int32_t, D> di32; +#ifdef __OPTIMIZE__ + const Repartition<uint64_t, D> du64; + constexpr int kDestLaneOffset = HWY_IS_BIG_ENDIAN; + if (IsConstantRawAltivecVect(a) && __builtin_constant_p(b[kDestLaneOffset]) && + __builtin_constant_p(b[kDestLaneOffset + 2])) { + const int64_t sum0 = static_cast<int64_t>(a[0]) + + static_cast<int64_t>(a[1]) + + static_cast<int64_t>(b[kDestLaneOffset]); + const int64_t sum1 = static_cast<int64_t>(a[2]) + + static_cast<int64_t>(a[3]) + + static_cast<int64_t>(b[kDestLaneOffset + 2]); + const int32_t sign0 = static_cast<int32_t>(sum0 >> 63); + const int32_t sign1 = static_cast<int32_t>(sum1 >> 63); + return BitCast(d, VFromD<decltype(du64)>{(__vector unsigned long long){ + (sign0 == (sum0 >> 31)) + ? static_cast<uint32_t>(sum0) + : static_cast<uint32_t>(sign0 ^ 0x7FFFFFFF), + (sign1 == (sum1 >> 31)) + ? static_cast<uint32_t>(sum1) + : static_cast<uint32_t>(sign1 ^ 0x7FFFFFFF)}}); + } else // NOLINT +#endif + { + __vector signed int sum; + + // Inline assembly is used for vsum2sws to avoid unnecessary shuffling + // on little-endian PowerPC targets as the result of the vsum2sws + // instruction will already be in the correct lanes on little-endian + // PowerPC targets. + __asm__("vsum2sws %0,%1,%2" : "=v"(sum) : "v"(a), "v"(b)); + + return BitCast(d, VFromD<decltype(di32)>{sum}); + } +} + +} // namespace detail + +template <size_t N> +HWY_API Vec128<uint64_t, N / 8> SumsOf8(Vec128<uint8_t, N> v) { + const Repartition<uint64_t, DFromV<decltype(v)>> du64; + const Repartition<int32_t, decltype(du64)> di32; + const RebindToUnsigned<decltype(di32)> du32; + + return detail::AltivecVsum2sws( + du64, detail::AltivecVsum4ubs(di32, v.raw, Zero(du32).raw).raw, + Zero(di32).raw); +} + +// ------------------------------ SaturatedAdd + +// Returns a + b clamped to the destination range. 
+ +#ifdef HWY_NATIVE_I32_SATURATED_ADDSUB +#undef HWY_NATIVE_I32_SATURATED_ADDSUB +#else +#define HWY_NATIVE_I32_SATURATED_ADDSUB +#endif + +#ifdef HWY_NATIVE_U32_SATURATED_ADDSUB +#undef HWY_NATIVE_U32_SATURATED_ADDSUB +#else +#define HWY_NATIVE_U32_SATURATED_ADDSUB +#endif + +template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T), + HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))> +HWY_API Vec128<T, N> SaturatedAdd(Vec128<T, N> a, Vec128<T, N> b) { + return Vec128<T, N>{vec_adds(a.raw, b.raw)}; +} + +#if HWY_PPC_HAVE_10 + +#ifdef HWY_NATIVE_I64_SATURATED_ADDSUB +#undef HWY_NATIVE_I64_SATURATED_ADDSUB +#else +#define HWY_NATIVE_I64_SATURATED_ADDSUB +#endif + +template <class V, HWY_IF_I64_D(DFromV<V>)> +HWY_API V SaturatedAdd(V a, V b) { + const DFromV<decltype(a)> d; + const auto sum = Add(a, b); + const auto overflow_mask = + MaskFromVec(BroadcastSignBit(detail::TernaryLogic<0x42>(a, b, sum))); + const auto overflow_result = + Xor(BroadcastSignBit(a), Set(d, LimitsMax<int64_t>())); + return IfThenElse(overflow_mask, overflow_result, sum); +} + +#endif // HWY_PPC_HAVE_10 + +// ------------------------------ SaturatedSub + +// Returns a - b clamped to the destination range. 
+ +template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T), + HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))> +HWY_API Vec128<T, N> SaturatedSub(Vec128<T, N> a, Vec128<T, N> b) { + return Vec128<T, N>{vec_subs(a.raw, b.raw)}; +} + +#if HWY_PPC_HAVE_10 + +template <class V, HWY_IF_I64_D(DFromV<V>)> +HWY_API V SaturatedSub(V a, V b) { + const DFromV<decltype(a)> d; + const auto diff = Sub(a, b); + const auto overflow_mask = + MaskFromVec(BroadcastSignBit(detail::TernaryLogic<0x18>(a, b, diff))); + const auto overflow_result = + Xor(BroadcastSignBit(a), Set(d, LimitsMax<int64_t>())); + return IfThenElse(overflow_mask, overflow_result, diff); +} + +#endif // HWY_PPC_HAVE_10 + +// ------------------------------ AverageRound + +// Returns (a + b + 1) / 2 + +template <typename T, size_t N, HWY_IF_UNSIGNED(T), + HWY_IF_T_SIZE_ONE_OF(T, 0x6)> +HWY_API Vec128<T, N> AverageRound(Vec128<T, N> a, Vec128<T, N> b) { + return Vec128<T, N>{vec_avg(a.raw, b.raw)}; +} + +// ------------------------------ Multiplication + +// Per-target flags to prevent generic_ops-inl.h defining 8/64-bit operator*. +#ifdef HWY_NATIVE_MUL_8 +#undef HWY_NATIVE_MUL_8 +#else +#define HWY_NATIVE_MUL_8 +#endif +#ifdef HWY_NATIVE_MUL_64 +#undef HWY_NATIVE_MUL_64 +#else +#define HWY_NATIVE_MUL_64 +#endif + +template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)> +HWY_API Vec128<T, N> operator*(Vec128<T, N> a, Vec128<T, N> b) { + return Vec128<T, N>{a.raw * b.raw}; +} + +// Returns the upper 16 bits of a * b in each lane. 
+template <typename T, size_t N, HWY_IF_T_SIZE(T, 2), HWY_IF_NOT_FLOAT(T)> +HWY_API Vec128<T, N> MulHigh(Vec128<T, N> a, Vec128<T, N> b) { + const DFromV<decltype(a)> d; + const RepartitionToWide<decltype(d)> dw; + const VFromD<decltype(dw)> p1{vec_mule(a.raw, b.raw)}; + const VFromD<decltype(dw)> p2{vec_mulo(a.raw, b.raw)}; +#if HWY_IS_LITTLE_ENDIAN + const __vector unsigned char kShuffle = {2, 3, 18, 19, 6, 7, 22, 23, + 10, 11, 26, 27, 14, 15, 30, 31}; +#else + const __vector unsigned char kShuffle = {0, 1, 16, 17, 4, 5, 20, 21, + 8, 9, 24, 25, 12, 13, 28, 29}; +#endif + return BitCast(d, VFromD<decltype(dw)>{vec_perm(p1.raw, p2.raw, kShuffle)}); +} + +template <size_t N> +HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a, + Vec128<int16_t, N> b) { + const Vec128<int16_t> zero = Zero(Full128<int16_t>()); + return Vec128<int16_t, N>{vec_mradds(a.raw, b.raw, zero.raw)}; +} + +// Multiplies even lanes (0, 2 ..) and places the double-wide result into +// even and the upper half into its odd neighbor lane. 
+template <typename T, size_t N, HWY_IF_T_SIZE(T, 4), HWY_IF_NOT_FLOAT(T)> +HWY_API Vec128<MakeWide<T>, (N + 1) / 2> MulEven(Vec128<T, N> a, + Vec128<T, N> b) { + return Vec128<MakeWide<T>, (N + 1) / 2>{vec_mule(a.raw, b.raw)}; +} + +// ------------------------------ RotateRight +template <int kBits, typename T, size_t N> +HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) { + const DFromV<decltype(v)> d; + constexpr size_t kSizeInBits = sizeof(T) * 8; + static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count"); + if (kBits == 0) return v; + return Vec128<T, N>{vec_rl(v.raw, Set(d, kSizeInBits - kBits).raw)}; +} + +// ------------------------------ ZeroIfNegative (BroadcastSignBit) +template <typename T, size_t N> +HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) { + static_assert(IsFloat<T>(), "Only works for float"); + const DFromV<decltype(v)> d; + const RebindToSigned<decltype(d)> di; + const auto mask = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v)))); + return IfThenElse(mask, Zero(d), v); +} + +// ------------------------------ IfNegativeThenElse +template <typename T, size_t N> +HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes, + Vec128<T, N> no) { + static_assert(IsSigned<T>(), "Only works for signed/float"); + + const DFromV<decltype(v)> d; + const RebindToSigned<decltype(d)> di; + return IfThenElse(MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v)))), + yes, no); +} + +// Absolute value of difference. 
+template <size_t N> +HWY_API Vec128<float, N> AbsDiff(Vec128<float, N> a, Vec128<float, N> b) { + return Abs(a - b); +} + +// ------------------------------ Floating-point multiply-add variants + +// Returns mul * x + add +template <typename T, size_t N, HWY_IF_FLOAT(T)> +HWY_API Vec128<T, N> MulAdd(Vec128<T, N> mul, Vec128<T, N> x, + Vec128<T, N> add) { + return Vec128<T, N>{vec_madd(mul.raw, x.raw, add.raw)}; +} + +// Returns add - mul * x +template <typename T, size_t N, HWY_IF_FLOAT(T)> +HWY_API Vec128<T, N> NegMulAdd(Vec128<T, N> mul, Vec128<T, N> x, + Vec128<T, N> add) { + // NOTE: the vec_nmsub operation below computes -(mul * x - add), + // which is equivalent to add - mul * x in the round-to-nearest + // and round-towards-zero rounding modes + return Vec128<T, N>{vec_nmsub(mul.raw, x.raw, add.raw)}; +} + +// Returns mul * x - sub +template <typename T, size_t N, HWY_IF_FLOAT(T)> +HWY_API Vec128<T, N> MulSub(Vec128<T, N> mul, Vec128<T, N> x, + Vec128<T, N> sub) { + return Vec128<T, N>{vec_msub(mul.raw, x.raw, sub.raw)}; +} + +// Returns -mul * x - sub +template <typename T, size_t N, HWY_IF_FLOAT(T)> +HWY_API Vec128<T, N> NegMulSub(Vec128<T, N> mul, Vec128<T, N> x, + Vec128<T, N> sub) { + // NOTE: The vec_nmadd operation below computes -(mul * x + sub), + // which is equivalent to -mul * x - sub in the round-to-nearest + // and round-towards-zero rounding modes + return Vec128<T, N>{vec_nmadd(mul.raw, x.raw, sub.raw)}; +} + +// ------------------------------ Floating-point div +// Approximate reciprocal +template <size_t N> +HWY_API Vec128<float, N> ApproximateReciprocal(Vec128<float, N> v) { + return Vec128<float, N>{vec_re(v.raw)}; +} + +template <typename T, size_t N, HWY_IF_FLOAT(T)> +HWY_API Vec128<T, N> operator/(Vec128<T, N> a, Vec128<T, N> b) { + return Vec128<T, N>{vec_div(a.raw, b.raw)}; +} + +// ------------------------------ Floating-point square root + +// Approximate reciprocal square root +template <size_t N> +HWY_API Vec128<float, N> 
ApproximateReciprocalSqrt(Vec128<float, N> v) { + return Vec128<float, N>{vec_rsqrte(v.raw)}; +} + +// Full precision square root +template <class T, size_t N, HWY_IF_FLOAT(T)> +HWY_API Vec128<T, N> Sqrt(Vec128<T, N> v) { + return Vec128<T, N>{vec_sqrt(v.raw)}; +} + +// ------------------------------ Min (Gt, IfThenElse) + +template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)> +HWY_API Vec128<T, N> Min(Vec128<T, N> a, Vec128<T, N> b) { + return Vec128<T, N>{vec_min(a.raw, b.raw)}; +} + +// ------------------------------ Max (Gt, IfThenElse) + +template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)> +HWY_API Vec128<T, N> Max(Vec128<T, N> a, Vec128<T, N> b) { + return Vec128<T, N>{vec_max(a.raw, b.raw)}; +} + +// ------------------------------- Integer AbsDiff for PPC9/PPC10 + +#if HWY_PPC_HAVE_9 +#ifdef HWY_NATIVE_INTEGER_ABS_DIFF +#undef HWY_NATIVE_INTEGER_ABS_DIFF +#else +#define HWY_NATIVE_INTEGER_ABS_DIFF +#endif + +template <class V, HWY_IF_UNSIGNED_V(V), + HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) | (1 << 4))> +HWY_API V AbsDiff(const V a, const V b) { + return V{vec_absd(a.raw, b.raw)}; +} + +template <class V, HWY_IF_U64_D(DFromV<V>)> +HWY_API V AbsDiff(const V a, const V b) { + return Sub(Max(a, b), Min(a, b)); +} + +template <class V, HWY_IF_SIGNED_V(V)> +HWY_API V AbsDiff(const V a, const V b) { + return Sub(Max(a, b), Min(a, b)); +} + +#endif // HWY_PPC_HAVE_9 + +// ================================================== MEMORY (3) + +// ------------------------------ Non-temporal stores + +template <class D> +HWY_API void Stream(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) { + __builtin_prefetch(aligned, 1, 0); + Store(v, d, aligned); +} + +// ------------------------------ Scatter + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), typename T = TFromD<D>, class VI> +HWY_API void ScatterOffset(VFromD<D> v, D d, T* HWY_RESTRICT base, VI offset) { + using TI = TFromV<VI>; + static_assert(sizeof(T) == sizeof(TI), "Index/lane size must 
match"); + + alignas(16) T lanes[MaxLanes(d)]; + Store(v, d, lanes); + + alignas(16) TI offset_lanes[MaxLanes(d)]; + Store(offset, Rebind<TI, decltype(d)>(), offset_lanes); + + uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base); + for (size_t i = 0; i < MaxLanes(d); ++i) { + CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]); + } +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), typename T = TFromD<D>, class VI> +HWY_API void ScatterIndex(VFromD<D> v, D d, T* HWY_RESTRICT base, VI index) { + using TI = TFromV<VI>; + static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match"); + + alignas(16) T lanes[MaxLanes(d)]; + Store(v, d, lanes); + + alignas(16) TI index_lanes[MaxLanes(d)]; + Store(index, Rebind<TI, decltype(d)>(), index_lanes); + + for (size_t i = 0; i < MaxLanes(d); ++i) { + base[index_lanes[i]] = lanes[i]; + } +} + +// ------------------------------ Gather (Load/Store) + +template <class D, typename T = TFromD<D>, class VI> +HWY_API VFromD<D> GatherOffset(D d, const T* HWY_RESTRICT base, VI offset) { + using TI = TFromV<VI>; + static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match"); + + alignas(16) TI offset_lanes[MaxLanes(d)]; + Store(offset, Rebind<TI, decltype(d)>(), offset_lanes); + + alignas(16) T lanes[MaxLanes(d)]; + const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base); + for (size_t i = 0; i < MaxLanes(d); ++i) { + CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]); + } + return Load(d, lanes); +} + +template <class D, typename T = TFromD<D>, class VI> +HWY_API VFromD<D> GatherIndex(D d, const T* HWY_RESTRICT base, VI index) { + using TI = TFromV<VI>; + static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match"); + + alignas(16) TI index_lanes[MaxLanes(d)]; + Store(index, Rebind<TI, decltype(d)>(), index_lanes); + + alignas(16) T lanes[MaxLanes(d)]; + for (size_t i = 0; i < MaxLanes(d); ++i) { + lanes[i] = base[index_lanes[i]]; + } + return Load(d, lanes); +} + +// 
================================================== SWIZZLE (2) + +// ------------------------------ LowerHalf + +// Returns upper/lower half of a vector. +template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> +HWY_API VFromD<D> LowerHalf(D /* tag */, VFromD<Twice<D>> v) { + return VFromD<D>{v.raw}; +} +template <typename T, size_t N> +HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) { + return Vec128<T, N / 2>{v.raw}; +} + +// ------------------------------ ShiftLeftBytes + +// NOTE: The ShiftLeftBytes operation moves the elements of v to the right +// by kBytes bytes and zeroes out the first kBytes bytes of v on both +// little-endian and big-endian PPC targets +// (same behavior as the HWY_EMU128 ShiftLeftBytes operation on both +// little-endian and big-endian targets) + +template <int kBytes, class D> +HWY_API VFromD<D> ShiftLeftBytes(D d, VFromD<D> v) { + static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); + if (kBytes == 0) return v; + const auto zeros = Zero(d); +#if HWY_IS_LITTLE_ENDIAN + return VFromD<D>{vec_sld(v.raw, zeros.raw, kBytes)}; +#else + return VFromD<D>{vec_sld(zeros.raw, v.raw, (-kBytes) & 15)}; +#endif +} + +template <int kBytes, typename T, size_t N> +HWY_API Vec128<T, N> ShiftLeftBytes(Vec128<T, N> v) { + return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v); +} + +// ------------------------------ ShiftLeftLanes + +// NOTE: The ShiftLeftLanes operation moves the elements of v to the right +// by kLanes lanes and zeroes out the first kLanes lanes of v on both +// little-endian and big-endian PPC targets +// (same behavior as the HWY_EMU128 ShiftLeftLanes operation on both +// little-endian and big-endian targets) + +template <int kLanes, class D, typename T = TFromD<D>> +HWY_API VFromD<D> ShiftLeftLanes(D d, VFromD<D> v) { + const Repartition<uint8_t, decltype(d)> d8; + return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v))); +} + +template <int kLanes, typename T, size_t N> +HWY_API Vec128<T, N> ShiftLeftLanes(Vec128<T, 
N> v) { + return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v); +} + +// ------------------------------ ShiftRightBytes + +// NOTE: The ShiftRightBytes operation moves the elements of v to the left +// by kBytes bytes and zeroes out the last kBytes bytes of v on both +// little-endian and big-endian PPC targets +// (same behavior as the HWY_EMU128 ShiftRightBytes operation on both +// little-endian and big-endian targets) + +template <int kBytes, class D> +HWY_API VFromD<D> ShiftRightBytes(D d, VFromD<D> v) { + static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); + if (kBytes == 0) return v; + + // For partial vectors, clear upper lanes so we shift in zeros. + if (d.MaxBytes() != 16) { + const Full128<TFromD<D>> dfull; + VFromD<decltype(dfull)> vfull{v.raw}; + v = VFromD<D>{IfThenElseZero(FirstN(dfull, MaxLanes(d)), vfull).raw}; + } + + const auto zeros = Zero(d); +#if HWY_IS_LITTLE_ENDIAN + return VFromD<D>{vec_sld(zeros.raw, v.raw, (-kBytes) & 15)}; +#else + return VFromD<D>{vec_sld(v.raw, zeros.raw, kBytes)}; +#endif +} + +// ------------------------------ ShiftRightLanes + +// NOTE: The ShiftRightLanes operation moves the elements of v to the left +// by kLanes lanes and zeroes out the last kLanes lanes of v on both +// little-endian and big-endian PPC targets +// (same behavior as the HWY_EMU128 ShiftRightLanes operation on both +// little-endian and big-endian targets) + +template <int kLanes, class D> +HWY_API VFromD<D> ShiftRightLanes(D d, VFromD<D> v) { + const Repartition<uint8_t, decltype(d)> d8; + constexpr size_t kBytes = kLanes * sizeof(TFromD<D>); + return BitCast(d, ShiftRightBytes<kBytes>(d8, BitCast(d8, v))); +} + +// ------------------------------ UpperHalf (ShiftRightBytes) + +template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> +HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) { + return LowerHalf(d, ShiftRightBytes<d.MaxBytes()>(Twice<D>(), v)); +} + +// ------------------------------ ExtractLane (UpperHalf) + +template <typename T, 
size_t N> +HWY_API T ExtractLane(Vec128<T, N> v, size_t i) { + return static_cast<T>(v.raw[i]); +} + +// ------------------------------ InsertLane (UpperHalf) + +template <typename T, size_t N> +HWY_API Vec128<T, N> InsertLane(Vec128<T, N> v, size_t i, T t) { + typename detail::Raw128<T>::type raw_result = v.raw; + raw_result[i] = t; + return Vec128<T, N>{raw_result}; +} + +// ------------------------------ CombineShiftRightBytes + +// NOTE: The CombineShiftRightBytes operation below moves the elements of lo to +// the left by kBytes bytes and moves the elements of hi right by (d.MaxBytes() +// - kBytes) bytes on both little-endian and big-endian PPC targets. + +template <int kBytes, class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>> +HWY_API Vec128<T> CombineShiftRightBytes(D /*d*/, Vec128<T> hi, Vec128<T> lo) { + constexpr size_t kSize = 16; + static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid"); +#if HWY_IS_LITTLE_ENDIAN + return Vec128<T>{vec_sld(hi.raw, lo.raw, (-kBytes) & 15)}; +#else + return Vec128<T>{vec_sld(lo.raw, hi.raw, kBytes)}; +#endif +} + +template <int kBytes, class D, HWY_IF_V_SIZE_LE_D(D, 8)> +HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) { + constexpr size_t kSize = d.MaxBytes(); + static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid"); + const Repartition<uint8_t, decltype(d)> d8; + using V8 = Vec128<uint8_t>; + const DFromV<V8> dfull8; + const Repartition<TFromD<D>, decltype(dfull8)> dfull; + const V8 hi8{BitCast(d8, hi).raw}; + // Move into most-significant bytes + const V8 lo8 = ShiftLeftBytes<16 - kSize>(V8{BitCast(d8, lo).raw}); + const V8 r = CombineShiftRightBytes<16 - kSize + kBytes>(dfull8, hi8, lo8); + return VFromD<D>{BitCast(dfull, r).raw}; +} + +// ------------------------------ Broadcast/splat any lane + +template <int kLane, typename T, size_t N> +HWY_API Vec128<T, N> Broadcast(Vec128<T, N> v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); + return Vec128<T, 
N>{vec_splat(v.raw, kLane)}; +} + +// ------------------------------ TableLookupLanes (Shuffle01) + +// Returned by SetTableIndices/IndicesFromVec for use by TableLookupLanes. +template <typename T, size_t N = 16 / sizeof(T)> +struct Indices128 { + __vector unsigned char raw; +}; + +namespace detail { + +template <class D, HWY_IF_T_SIZE_D(D, 1)> +HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes( + D d) { + const Repartition<uint8_t, decltype(d)> d8; + return Iota(d8, 0); +} + +template <class D, HWY_IF_T_SIZE_D(D, 2)> +HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes( + D d) { + const Repartition<uint8_t, decltype(d)> d8; +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + constexpr __vector unsigned char kBroadcastLaneBytes = { + 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}; +#else + constexpr __vector unsigned char kBroadcastLaneBytes = { + 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}; +#endif + return VFromD<decltype(d8)>{kBroadcastLaneBytes}; +} + +template <class D, HWY_IF_T_SIZE_D(D, 4)> +HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes( + D d) { + const Repartition<uint8_t, decltype(d)> d8; +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + constexpr __vector unsigned char kBroadcastLaneBytes = { + 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12}; +#else + constexpr __vector unsigned char kBroadcastLaneBytes = { + 3, 3, 3, 3, 7, 7, 7, 7, 11, 11, 11, 11, 15, 15, 15, 15}; +#endif + return VFromD<decltype(d8)>{kBroadcastLaneBytes}; +} + +template <class D, HWY_IF_T_SIZE_D(D, 8)> +HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes( + D d) { + const Repartition<uint8_t, decltype(d)> d8; +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + constexpr __vector unsigned char kBroadcastLaneBytes = { + 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8}; +#else + constexpr __vector unsigned char kBroadcastLaneBytes = { + 7, 7, 7, 7, 7, 7, 7, 7, 15, 15, 15, 
15, 15, 15, 15, 15}; +#endif + return VFromD<decltype(d8)>{kBroadcastLaneBytes}; +} + +template <class D, HWY_IF_T_SIZE_D(D, 1)> +HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) { + const Repartition<uint8_t, decltype(d)> d8; + return Zero(d8); +} + +template <class D, HWY_IF_T_SIZE_D(D, 2)> +HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) { + const Repartition<uint8_t, decltype(d)> d8; + constexpr __vector unsigned char kByteOffsets = {0, 1, 0, 1, 0, 1, 0, 1, + 0, 1, 0, 1, 0, 1, 0, 1}; + return VFromD<decltype(d8)>{kByteOffsets}; +} + +template <class D, HWY_IF_T_SIZE_D(D, 4)> +HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) { + const Repartition<uint8_t, decltype(d)> d8; + constexpr __vector unsigned char kByteOffsets = {0, 1, 2, 3, 0, 1, 2, 3, + 0, 1, 2, 3, 0, 1, 2, 3}; + return VFromD<decltype(d8)>{kByteOffsets}; +} + +template <class D, HWY_IF_T_SIZE_D(D, 8)> +HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) { + const Repartition<uint8_t, decltype(d)> d8; + constexpr __vector unsigned char kByteOffsets = {0, 1, 2, 3, 4, 5, 6, 7, + 0, 1, 2, 3, 4, 5, 6, 7}; + return VFromD<decltype(d8)>{kByteOffsets}; +} + +} // namespace detail + +template <class D, typename TI, HWY_IF_T_SIZE_D(D, 1)> +HWY_API Indices128<TFromD<D>, MaxLanes(D())> IndicesFromVec( + D d, Vec128<TI, MaxLanes(D())> vec) { + using T = TFromD<D>; + static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); +#if HWY_IS_DEBUG_BUILD + const RebindToUnsigned<decltype(d)> du; + using TU = TFromD<decltype(du)>; + HWY_DASSERT(AllTrue( + du, Lt(BitCast(du, vec), Set(du, static_cast<TU>(MaxLanes(d) * 2))))); +#endif + + const Repartition<uint8_t, decltype(d)> d8; + return Indices128<TFromD<D>, MaxLanes(D())>{BitCast(d8, vec).raw}; +} + +template <class D, typename TI, + HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 2) | (1 << 4) | (1 << 8))> +HWY_API Indices128<TFromD<D>, MaxLanes(D())> IndicesFromVec( + D d, 
Vec128<TI, MaxLanes(D())> vec) { + using T = TFromD<D>; + static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); +#if HWY_IS_DEBUG_BUILD + const RebindToUnsigned<decltype(d)> du; + using TU = TFromD<decltype(du)>; + HWY_DASSERT(AllTrue( + du, Lt(BitCast(du, vec), Set(du, static_cast<TU>(MaxLanes(d) * 2))))); +#endif + + const Repartition<uint8_t, decltype(d)> d8; + using V8 = VFromD<decltype(d8)>; + + // Broadcast each lane index to all bytes of T and shift to bytes + const V8 lane_indices = TableLookupBytes( + BitCast(d8, vec), detail::IndicesFromVecBroadcastLaneBytes(d)); + constexpr int kIndexShiftAmt = static_cast<int>(FloorLog2(sizeof(T))); + const V8 byte_indices = ShiftLeft<kIndexShiftAmt>(lane_indices); + const V8 sum = Add(byte_indices, detail::IndicesFromVecByteOffsets(d)); + return Indices128<TFromD<D>, MaxLanes(D())>{sum.raw}; +} + +template <class D, typename TI> +HWY_API Indices128<TFromD<D>, HWY_MAX_LANES_D(D)> SetTableIndices( + D d, const TI* idx) { + const Rebind<TI, decltype(d)> di; + return IndicesFromVec(d, LoadU(di, idx)); +} + +template <typename T, size_t N> +HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) { + const DFromV<decltype(v)> d; + const Repartition<uint8_t, decltype(d)> d8; + return BitCast(d, TableLookupBytes(v, VFromD<decltype(d8)>{idx.raw})); +} + +// Single lane: no change +template <typename T> +HWY_API Vec128<T, 1> TableLookupLanes(Vec128<T, 1> v, + Indices128<T, 1> /* idx */) { + return v; +} + +template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)> +HWY_API Vec128<T, N> TwoTablesLookupLanes(Vec128<T, N> a, Vec128<T, N> b, + Indices128<T, N> idx) { + const DFromV<decltype(a)> d; + const Twice<decltype(d)> dt; + const Repartition<uint8_t, decltype(dt)> dt_u8; +// TableLookupLanes currently requires table and index vectors to be the same +// size, though a half-length index vector would be sufficient here. 
+#if HWY_IS_MSAN + const Vec128<T, N> idx_vec{idx.raw}; + const Indices128<T, N * 2> idx2{Combine(dt, idx_vec, idx_vec).raw}; +#else + // We only keep LowerHalf of the result, which is valid in idx. + const Indices128<T, N * 2> idx2{idx.raw}; +#endif + return LowerHalf( + d, TableLookupBytes(Combine(dt, b, a), + BitCast(dt, VFromD<decltype(dt_u8)>{idx2.raw}))); +} + +template <typename T> +HWY_API Vec128<T> TwoTablesLookupLanes(Vec128<T> a, Vec128<T> b, + Indices128<T> idx) { + return Vec128<T>{vec_perm(a.raw, b.raw, idx.raw)}; +} + +// ------------------------------ ReverseBlocks + +// Single block: no change +template <class D> +HWY_API VFromD<D> ReverseBlocks(D /* tag */, VFromD<D> v) { + return v; +} + +// ------------------------------ Reverse (Shuffle0123, Shuffle2301) + +// Single lane: no change +template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 1)> +HWY_API Vec128<T, 1> Reverse(D /* tag */, Vec128<T, 1> v) { + return v; +} + +// 32-bit x2: shuffle +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec64<T> Reverse(D /* tag */, Vec64<T> v) { + return Vec64<T>{Shuffle2301(Vec128<T>{v.raw}).raw}; +} + +// 16-bit x4: shuffle +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)> +HWY_API Vec64<T> Reverse(D /* tag */, Vec64<T> v) { + const __vector unsigned char kShuffle = {6, 7, 4, 5, 2, 3, 0, 1, + 14, 15, 12, 13, 10, 11, 8, 9}; + return Vec64<T>{vec_perm(v.raw, v.raw, kShuffle)}; +} + +// 16-bit x2: rotate bytes +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)> +HWY_API Vec32<T> Reverse(D d, Vec32<T> v) { + const RepartitionToWide<RebindToUnsigned<decltype(d)>> du32; + return BitCast(d, RotateRight<16>(Reverse(du32, BitCast(du32, v)))); +} + +// ------------------------------- ReverseLaneBytes + +#if HWY_PPC_HAVE_9 && \ + (HWY_COMPILER_GCC_ACTUAL >= 710 || HWY_COMPILER_CLANG >= 400) + +// Per-target flag to prevent generic_ops-inl.h defining 8-bit ReverseLaneBytes. 
+#ifdef HWY_NATIVE_REVERSE_LANE_BYTES +#undef HWY_NATIVE_REVERSE_LANE_BYTES +#else +#define HWY_NATIVE_REVERSE_LANE_BYTES +#endif + +template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), + HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 2) | (1 << 4) | (1 << 8))> +HWY_API V ReverseLaneBytes(V v) { + return V{vec_revb(v.raw)}; +} + +// Per-target flag to prevent generic_ops-inl.h defining 8-bit Reverse2/4/8. +#ifdef HWY_NATIVE_REVERSE2_8 +#undef HWY_NATIVE_REVERSE2_8 +#else +#define HWY_NATIVE_REVERSE2_8 +#endif + +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)> +HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) { + const Repartition<uint16_t, decltype(d)> du16; + return BitCast(d, ReverseLaneBytes(BitCast(du16, v))); +} + +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)> +HWY_API VFromD<D> Reverse4(D d, VFromD<D> v) { + const Repartition<uint32_t, decltype(d)> du32; + return BitCast(d, ReverseLaneBytes(BitCast(du32, v))); +} + +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)> +HWY_API VFromD<D> Reverse8(D d, VFromD<D> v) { + const Repartition<uint64_t, decltype(d)> du64; + return BitCast(d, ReverseLaneBytes(BitCast(du64, v))); +} + +#endif // HWY_PPC_HAVE_9 + +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec16<T> Reverse(D d, Vec16<T> v) { + return Reverse2(d, v); +} + +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec32<T> Reverse(D d, Vec32<T> v) { + return Reverse4(d, v); +} + +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec64<T> Reverse(D d, Vec64<T> v) { + return Reverse8(d, v); +} + +// ------------------------------ Reverse2 + +// Single lane: no change +template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 1)> +HWY_API Vec128<T, 1> Reverse2(D /* tag */, Vec128<T, 1> v) { + return v; +} + +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)> +HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) { + const Repartition<uint32_t, decltype(d)> 
du32; + return BitCast(d, RotateRight<16>(BitCast(du32, v))); +} + +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)> +HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) { + const Repartition<uint64_t, decltype(d)> du64; + return BitCast(d, RotateRight<32>(BitCast(du64, v))); +} + +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)> +HWY_API VFromD<D> Reverse2(D /* tag */, VFromD<D> v) { + return Shuffle01(v); +} + +// ------------------------------ Reverse4 + +template <class D, HWY_IF_T_SIZE_D(D, 2)> +HWY_API VFromD<D> Reverse4(D /*d*/, VFromD<D> v) { + const __vector unsigned char kShuffle = {6, 7, 4, 5, 2, 3, 0, 1, + 14, 15, 12, 13, 10, 11, 8, 9}; + return VFromD<D>{vec_perm(v.raw, v.raw, kShuffle)}; +} + +template <class D, HWY_IF_T_SIZE_D(D, 4)> +HWY_API VFromD<D> Reverse4(D d, VFromD<D> v) { + return Reverse(d, v); +} + +template <class D, HWY_IF_T_SIZE_D(D, 8)> +HWY_API VFromD<D> Reverse4(D /* tag */, VFromD<D> /* v */) { + HWY_ASSERT(0); // don't have 4 u64 lanes +} + +// ------------------------------ Reverse8 + +template <class D, HWY_IF_T_SIZE_D(D, 2)> +HWY_API VFromD<D> Reverse8(D d, VFromD<D> v) { + return Reverse(d, v); +} + +template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))> +HWY_API VFromD<D> Reverse8(D /* tag */, VFromD<D> /* v */) { + HWY_ASSERT(0); // don't have 8 lanes if larger than 16-bit +} + +// ------------------------------ InterleaveLower + +// Interleaves lanes from halves of the 128-bit blocks of "a" (which provides +// the least-significant lane) and "b". To concatenate two half-width integers +// into one, use ZipLower/Upper instead (also works with scalar). 
+ +template <typename T, size_t N> +HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) { + return Vec128<T, N>{vec_mergeh(a.raw, b.raw)}; +} + +// Additional overload for the optional tag +template <class D> +HWY_API VFromD<D> InterleaveLower(D /* tag */, VFromD<D> a, VFromD<D> b) { + return InterleaveLower(a, b); +} + +// ------------------------------ InterleaveUpper (UpperHalf) + +// Full +template <class D, typename T = TFromD<D>> +HWY_API Vec128<T> InterleaveUpper(D /* tag */, Vec128<T> a, Vec128<T> b) { + return Vec128<T>{vec_mergel(a.raw, b.raw)}; +} + +// Partial +template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> +HWY_API VFromD<D> InterleaveUpper(D d, VFromD<D> a, VFromD<D> b) { + const Half<decltype(d)> d2; + return InterleaveLower(d, VFromD<D>{UpperHalf(d2, a).raw}, + VFromD<D>{UpperHalf(d2, b).raw}); +} + +// ------------------------------ ZipLower/ZipUpper (InterleaveLower) + +// Same as Interleave*, except that the return lanes are double-width integers; +// this is necessary because the single-lane scalar cannot return two values. +template <class V, class DW = RepartitionToWide<DFromV<V>>> +HWY_API VFromD<DW> ZipLower(V a, V b) { + return BitCast(DW(), InterleaveLower(a, b)); +} +template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>> +HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) { + return BitCast(dw, InterleaveLower(D(), a, b)); +} + +template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>> +HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) { + return BitCast(dw, InterleaveUpper(D(), a, b)); +} + +// ================================================== COMBINE + +// ------------------------------ Combine (InterleaveLower) + +// N = N/2 + N/2 (upper half undefined) +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), class VH = VFromD<Half<D>>> +HWY_API VFromD<D> Combine(D d, VH hi_half, VH lo_half) { + const Half<decltype(d)> dh; + // Treat half-width input as one lane, and expand to two lanes. 
+ using VU = Vec128<UnsignedFromSize<dh.MaxBytes()>, 2>; + using Raw = typename detail::Raw128<TFromV<VU>>::type; + const VU lo{reinterpret_cast<Raw>(lo_half.raw)}; + const VU hi{reinterpret_cast<Raw>(hi_half.raw)}; + return BitCast(d, InterleaveLower(lo, hi)); +} + +// ------------------------------ ZeroExtendVector (Combine, IfThenElseZero) + +template <class D> +HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> lo) { + const Half<D> dh; + return IfThenElseZero(FirstN(d, MaxLanes(dh)), VFromD<D>{lo.raw}); +} + +// ------------------------------ Concat full (InterleaveLower) + +// hiH,hiL loH,loL |-> hiL,loL (= lower halves) +template <class D, typename T = TFromD<D>> +HWY_API Vec128<T> ConcatLowerLower(D d, Vec128<T> hi, Vec128<T> lo) { + const Repartition<uint64_t, decltype(d)> d64; + return BitCast(d, InterleaveLower(BitCast(d64, lo), BitCast(d64, hi))); +} + +// hiH,hiL loH,loL |-> hiH,loH (= upper halves) +template <class D, typename T = TFromD<D>> +HWY_API Vec128<T> ConcatUpperUpper(D d, Vec128<T> hi, Vec128<T> lo) { + const Repartition<uint64_t, decltype(d)> d64; + return BitCast(d, InterleaveUpper(d64, BitCast(d64, lo), BitCast(d64, hi))); +} + +// hiH,hiL loH,loL |-> hiL,loH (= inner halves) +template <class D, typename T = TFromD<D>> +HWY_API Vec128<T> ConcatLowerUpper(D d, Vec128<T> hi, Vec128<T> lo) { + return CombineShiftRightBytes<8>(d, hi, lo); +} + +// hiH,hiL loH,loL |-> hiH,loL (= outer halves) +template <class D, typename T = TFromD<D>> +HWY_API Vec128<T> ConcatUpperLower(D /*d*/, Vec128<T> hi, Vec128<T> lo) { + const __vector unsigned char kShuffle = {0, 1, 2, 3, 4, 5, 6, 7, + 24, 25, 26, 27, 28, 29, 30, 31}; + return Vec128<T>{vec_perm(lo.raw, hi.raw, kShuffle)}; +} + +// ------------------------------ Concat partial (Combine, LowerHalf) + +template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> +HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) { + const Half<decltype(d)> d2; + return Combine(d, LowerHalf(d2, hi), LowerHalf(d2, 
lo)); +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> +HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) { + const Half<decltype(d)> d2; + return Combine(d, UpperHalf(d2, hi), UpperHalf(d2, lo)); +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> +HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) { + const Half<decltype(d)> d2; + return Combine(d, LowerHalf(d2, hi), UpperHalf(d2, lo)); +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> +HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) { + const Half<decltype(d)> d2; + return Combine(d, UpperHalf(d2, hi), LowerHalf(d2, lo)); +} + +// ------------------------------ TruncateTo + +template <class D, typename FromT, HWY_IF_UNSIGNED_D(D), HWY_IF_UNSIGNED(FromT), + hwy::EnableIf<(sizeof(FromT) >= sizeof(TFromD<D>) * 2)>* = nullptr, + HWY_IF_LANES_D(D, 1)> +HWY_API VFromD<D> TruncateTo(D /* tag */, Vec128<FromT, 1> v) { + using Raw = typename detail::Raw128<TFromD<D>>::type; +#if HWY_IS_LITTLE_ENDIAN + return VFromD<D>{reinterpret_cast<Raw>(v.raw)}; +#else + return VFromD<D>{reinterpret_cast<Raw>( + vec_sld(v.raw, v.raw, sizeof(FromT) - sizeof(TFromD<D>)))}; +#endif +} + +namespace detail { + +template <class D, typename FromT, HWY_IF_UNSIGNED_D(D), HWY_IF_UNSIGNED(FromT), + HWY_IF_T_SIZE(FromT, sizeof(TFromD<D>) * 2), HWY_IF_LANES_GT_D(D, 1)> +HWY_API VFromD<D> Truncate2To( + D /* tag */, Vec128<FromT, Repartition<FromT, D>().MaxLanes()> lo, + Vec128<FromT, Repartition<FromT, D>().MaxLanes()> hi) { + return VFromD<D>{vec_pack(lo.raw, hi.raw)}; +} + +} // namespace detail + +template <class D, typename FromT, HWY_IF_UNSIGNED_D(D), HWY_IF_UNSIGNED(FromT), + HWY_IF_T_SIZE(FromT, sizeof(TFromD<D>) * 2), HWY_IF_LANES_GT_D(D, 1)> +HWY_API VFromD<D> TruncateTo(D /* d */, + Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) { + return VFromD<D>{vec_pack(v.raw, v.raw)}; +} + +template <class D, typename FromT, HWY_IF_UNSIGNED_D(D), HWY_IF_UNSIGNED(FromT), + 
hwy::EnableIf<(sizeof(FromT) >= sizeof(TFromD<D>) * 4)>* = nullptr, + HWY_IF_LANES_GT_D(D, 1)> +HWY_API VFromD<D> TruncateTo(D d, + Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) { + const Rebind<MakeNarrow<FromT>, decltype(d)> d2; + return TruncateTo(d, TruncateTo(d2, v)); +} + +// ------------------------------ ConcatOdd (TruncateTo) + +// 8-bit full +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec128<T> ConcatOdd(D d, Vec128<T> hi, Vec128<T> lo) { + const Repartition<uint16_t, decltype(d)> dw; + const RebindToUnsigned<decltype(d)> du; +#if HWY_IS_LITTLE_ENDIAN + // Right-shift 8 bits per u16 so we can pack. + const Vec128<uint16_t> uH = ShiftRight<8>(BitCast(dw, hi)); + const Vec128<uint16_t> uL = ShiftRight<8>(BitCast(dw, lo)); +#else + const Vec128<uint16_t> uH = BitCast(dw, hi); + const Vec128<uint16_t> uL = BitCast(dw, lo); +#endif + return BitCast(d, detail::Truncate2To(du, uL, uH)); +} + +// 8-bit x8 +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec64<T> ConcatOdd(D /*d*/, Vec64<T> hi, Vec64<T> lo) { + // Don't care about upper half, no need to zero. + const __vector unsigned char kCompactOddU8 = {1, 3, 5, 7, 17, 19, 21, 23}; + return Vec64<T>{vec_perm(lo.raw, hi.raw, kCompactOddU8)}; +} + +// 8-bit x4 +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec32<T> ConcatOdd(D /*d*/, Vec32<T> hi, Vec32<T> lo) { + // Don't care about upper half, no need to zero. 
+ const __vector unsigned char kCompactOddU8 = {1, 3, 17, 19}; + return Vec32<T>{vec_perm(lo.raw, hi.raw, kCompactOddU8)}; +} + +// 16-bit full +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)> +HWY_API Vec128<T> ConcatOdd(D d, Vec128<T> hi, Vec128<T> lo) { + const Repartition<uint32_t, decltype(d)> dw; + const RebindToUnsigned<decltype(d)> du; +#if HWY_IS_LITTLE_ENDIAN + const Vec128<uint32_t> uH = ShiftRight<16>(BitCast(dw, hi)); + const Vec128<uint32_t> uL = ShiftRight<16>(BitCast(dw, lo)); +#else + const Vec128<uint32_t> uH = BitCast(dw, hi); + const Vec128<uint32_t> uL = BitCast(dw, lo); +#endif + return BitCast(d, detail::Truncate2To(du, uL, uH)); +} + +// 16-bit x4 +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)> +HWY_API Vec64<T> ConcatOdd(D /*d*/, Vec64<T> hi, Vec64<T> lo) { + // Don't care about upper half, no need to zero. + const __vector unsigned char kCompactOddU16 = {2, 3, 6, 7, 18, 19, 22, 23}; + return Vec64<T>{vec_perm(lo.raw, hi.raw, kCompactOddU16)}; +} + +// 32-bit full +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec128<T> ConcatOdd(D d, Vec128<T> hi, Vec128<T> lo) { +#if HWY_IS_LITTLE_ENDIAN + (void)d; + const __vector unsigned char kShuffle = {4, 5, 6, 7, 12, 13, 14, 15, + 20, 21, 22, 23, 28, 29, 30, 31}; + return Vec128<T>{vec_perm(lo.raw, hi.raw, kShuffle)}; +#else + const RebindToUnsigned<decltype(d)> du; + const Repartition<uint64_t, decltype(d)> dw; + return BitCast(d, detail::Truncate2To(du, BitCast(dw, lo), BitCast(dw, hi))); +#endif +} + +// Any type x2 +template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 2)> +HWY_API Vec128<T, 2> ConcatOdd(D d, Vec128<T, 2> hi, Vec128<T, 2> lo) { + return InterleaveUpper(d, lo, hi); +} + +// ------------------------------ ConcatEven (TruncateTo) + +// 8-bit full +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec128<T> ConcatEven(D d, Vec128<T> hi, Vec128<T> lo) { + const Repartition<uint16_t, decltype(d)> 
dw; + const RebindToUnsigned<decltype(d)> du; +#if HWY_IS_LITTLE_ENDIAN + const Vec128<uint16_t> uH = BitCast(dw, hi); + const Vec128<uint16_t> uL = BitCast(dw, lo); +#else + // Right-shift 8 bits per u16 so we can pack. + const Vec128<uint16_t> uH = ShiftRight<8>(BitCast(dw, hi)); + const Vec128<uint16_t> uL = ShiftRight<8>(BitCast(dw, lo)); +#endif + return BitCast(d, detail::Truncate2To(du, uL, uH)); +} + +// 8-bit x8 +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec64<T> ConcatEven(D /*d*/, Vec64<T> hi, Vec64<T> lo) { + // Don't care about upper half, no need to zero. + const __vector unsigned char kCompactEvenU8 = {0, 2, 4, 6, 16, 18, 20, 22}; + return Vec64<T>{vec_perm(lo.raw, hi.raw, kCompactEvenU8)}; +} + +// 8-bit x4 +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec32<T> ConcatEven(D /*d*/, Vec32<T> hi, Vec32<T> lo) { + // Don't care about upper half, no need to zero. + const __vector unsigned char kCompactEvenU8 = {0, 2, 16, 18}; + return Vec32<T>{vec_perm(lo.raw, hi.raw, kCompactEvenU8)}; +} + +// 16-bit full +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)> +HWY_API Vec128<T> ConcatEven(D d, Vec128<T> hi, Vec128<T> lo) { + // Isolate lower 16 bits per u32 so we can pack. + const Repartition<uint32_t, decltype(d)> dw; + const RebindToUnsigned<decltype(d)> du; +#if HWY_IS_LITTLE_ENDIAN + const Vec128<uint32_t> uH = BitCast(dw, hi); + const Vec128<uint32_t> uL = BitCast(dw, lo); +#else + const Vec128<uint32_t> uH = ShiftRight<16>(BitCast(dw, hi)); + const Vec128<uint32_t> uL = ShiftRight<16>(BitCast(dw, lo)); +#endif + return BitCast(d, detail::Truncate2To(du, uL, uH)); +} + +// 16-bit x4 +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)> +HWY_API Vec64<T> ConcatEven(D /*d*/, Vec64<T> hi, Vec64<T> lo) { + // Don't care about upper half, no need to zero. 
+ const __vector unsigned char kCompactEvenU16 = {0, 1, 4, 5, 16, 17, 20, 21}; + return Vec64<T>{vec_perm(lo.raw, hi.raw, kCompactEvenU16)}; +} + +// 32-bit full +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec128<T> ConcatEven(D d, Vec128<T> hi, Vec128<T> lo) { +#if HWY_IS_LITTLE_ENDIAN + const Repartition<uint64_t, decltype(d)> dw; + const RebindToUnsigned<decltype(d)> du; + return BitCast(d, detail::Truncate2To(du, BitCast(dw, lo), BitCast(dw, hi))); +#else + (void)d; + constexpr __vector unsigned char kShuffle = {0, 1, 2, 3, 8, 9, 10, 11, + 16, 17, 18, 19, 24, 25, 26, 27}; + return Vec128<T>{vec_perm(lo.raw, hi.raw, kShuffle)}; +#endif +} + +// Any T x2 +template <typename D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 2)> +HWY_API Vec128<T, 2> ConcatEven(D d, Vec128<T, 2> hi, Vec128<T, 2> lo) { + return InterleaveLower(d, lo, hi); +} + +// ------------------------------ OrderedTruncate2To (ConcatEven, ConcatOdd) +#ifdef HWY_NATIVE_ORDERED_TRUNCATE_2_TO +#undef HWY_NATIVE_ORDERED_TRUNCATE_2_TO +#else +#define HWY_NATIVE_ORDERED_TRUNCATE_2_TO +#endif + +template <class D, HWY_IF_UNSIGNED_D(D), class V, HWY_IF_UNSIGNED_V(V), + HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2), + HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2)> +HWY_API VFromD<D> OrderedTruncate2To(D d, V a, V b) { +#if HWY_IS_LITTLE_ENDIAN + return ConcatEven(d, BitCast(d, b), BitCast(d, a)); +#else + return ConcatOdd(d, BitCast(d, b), BitCast(d, a)); +#endif +} + +// ------------------------------ DupEven (InterleaveLower) + +template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) { + return Vec128<T, N>{vec_mergee(v.raw, v.raw)}; +} + +template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)> +HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) { + return InterleaveLower(DFromV<decltype(v)>(), v, v); +} + +// ------------------------------ DupOdd (InterleaveUpper) + +template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec128<T, N> 
DupOdd(Vec128<T, N> v) { + return Vec128<T, N>{vec_mergeo(v.raw, v.raw)}; +} + +template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)> +HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) { + return InterleaveUpper(DFromV<decltype(v)>(), v, v); +} + +// ------------------------------ OddEven (IfThenElse) + +template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)> +HWY_INLINE Vec128<T, N> OddEven(Vec128<T, N> a, Vec128<T, N> b) { + const DFromV<decltype(a)> d; + const __vector unsigned char mask = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, + 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0}; + return IfVecThenElse(BitCast(d, Vec128<uint8_t, N>{mask}), b, a); +} + +template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)> +HWY_INLINE Vec128<T, N> OddEven(Vec128<T, N> a, Vec128<T, N> b) { + const DFromV<decltype(a)> d; + const __vector unsigned char mask = {0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, + 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0}; + return IfVecThenElse(BitCast(d, Vec128<uint8_t, N * 2>{mask}), b, a); +} + +template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)> +HWY_INLINE Vec128<T, N> OddEven(Vec128<T, N> a, Vec128<T, N> b) { + const DFromV<decltype(a)> d; + const __vector unsigned char mask = {0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, 0, + 0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, 0}; + return IfVecThenElse(BitCast(d, Vec128<uint8_t, N * 4>{mask}), b, a); +} + +template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)> +HWY_INLINE Vec128<T, N> OddEven(Vec128<T, N> a, Vec128<T, N> b) { + // Same as ConcatUpperLower for full vectors; do not call that because this + // is more efficient for 64x1 vectors. 
+ const DFromV<decltype(a)> d; + const __vector unsigned char mask = { + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, 0, 0, 0, 0, 0}; + return IfVecThenElse(BitCast(d, Vec128<uint8_t, N * 8>{mask}), b, a); +} + +// ------------------------------ OddEvenBlocks +template <typename T, size_t N> +HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) { + return even; +} + +// ------------------------------ SwapAdjacentBlocks + +template <typename T, size_t N> +HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) { + return v; +} + +// ------------------------------ Shl + +namespace detail { +template <typename T, size_t N> +HWY_API Vec128<T, N> Shl(hwy::UnsignedTag /*tag*/, Vec128<T, N> v, + Vec128<T, N> bits) { + return Vec128<T, N>{vec_sl(v.raw, bits.raw)}; +} + +// Signed left shift is the same as unsigned. +template <typename T, size_t N> +HWY_API Vec128<T, N> Shl(hwy::SignedTag /*tag*/, Vec128<T, N> v, + Vec128<T, N> bits) { + const DFromV<decltype(v)> di; + const RebindToUnsigned<decltype(di)> du; + return BitCast(di, + Shl(hwy::UnsignedTag(), BitCast(du, v), BitCast(du, bits))); +} + +} // namespace detail + +template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)> +HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, Vec128<T, N> bits) { + return detail::Shl(hwy::TypeTag<T>(), v, bits); +} + +// ------------------------------ Shr + +namespace detail { +template <typename T, size_t N> +HWY_API Vec128<T, N> Shr(hwy::UnsignedTag /*tag*/, Vec128<T, N> v, + Vec128<T, N> bits) { + return Vec128<T, N>{vec_sr(v.raw, bits.raw)}; +} + +template <typename T, size_t N> +HWY_API Vec128<T, N> Shr(hwy::SignedTag /*tag*/, Vec128<T, N> v, + Vec128<T, N> bits) { + const DFromV<decltype(v)> di; + const RebindToUnsigned<decltype(di)> du; + return Vec128<T, N>{vec_sra(v.raw, BitCast(du, bits).raw)}; +} + +} // namespace detail + +template <typename T, size_t N> +HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, Vec128<T, N> bits) { + return 
detail::Shr(hwy::TypeTag<T>(), v, bits); +} + +// ------------------------------ MulEven/Odd 64x64 (UpperHalf) + +HWY_INLINE Vec128<uint64_t> MulEven(Vec128<uint64_t> a, Vec128<uint64_t> b) { +#if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__) + using VU64 = __vector unsigned long long; + const VU64 mul128_result = reinterpret_cast<VU64>(vec_mule(a.raw, b.raw)); +#if HWY_IS_LITTLE_ENDIAN + return Vec128<uint64_t>{mul128_result}; +#else + // Need to swap the two halves of mul128_result on big-endian targets as + // the upper 64 bits of the product are in lane 0 of mul128_result and + // the lower 64 bits of the product are in lane 1 of mul128_result + return Vec128<uint64_t>{vec_sld(mul128_result, mul128_result, 8)}; +#endif +#else + alignas(16) uint64_t mul[2]; + mul[0] = Mul128(GetLane(a), GetLane(b), &mul[1]); + return Load(Full128<uint64_t>(), mul); +#endif +} + +HWY_INLINE Vec128<uint64_t> MulOdd(Vec128<uint64_t> a, Vec128<uint64_t> b) { +#if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__) + using VU64 = __vector unsigned long long; + const VU64 mul128_result = reinterpret_cast<VU64>(vec_mulo(a.raw, b.raw)); +#if HWY_IS_LITTLE_ENDIAN + return Vec128<uint64_t>{mul128_result}; +#else + // Need to swap the two halves of mul128_result on big-endian targets as + // the upper 64 bits of the product are in lane 0 of mul128_result and + // the lower 64 bits of the product are in lane 1 of mul128_result + return Vec128<uint64_t>{vec_sld(mul128_result, mul128_result, 8)}; +#endif +#else + alignas(16) uint64_t mul[2]; + const Full64<uint64_t> d2; + mul[0] = + Mul128(GetLane(UpperHalf(d2, a)), GetLane(UpperHalf(d2, b)), &mul[1]); + return Load(Full128<uint64_t>(), mul); +#endif +} + +// ------------------------------ WidenMulPairwiseAdd + +template <class D32, HWY_IF_F32_D(D32), + class V16 = VFromD<Repartition<bfloat16_t, D32>>> +HWY_API VFromD<D32> WidenMulPairwiseAdd(D32 df32, V16 a, V16 b) { + const RebindToUnsigned<decltype(df32)> du32; + // Lane order within sum0/1 is 
undefined, hence we can avoid the + // longer-latency lane-crossing PromoteTo. Using shift/and instead of Zip + // leads to the odd/even order that RearrangeToOddPlusEven prefers. + using VU32 = VFromD<decltype(du32)>; + const VU32 odd = Set(du32, 0xFFFF0000u); + const VU32 ae = ShiftLeft<16>(BitCast(du32, a)); + const VU32 ao = And(BitCast(du32, a), odd); + const VU32 be = ShiftLeft<16>(BitCast(du32, b)); + const VU32 bo = And(BitCast(du32, b), odd); + return Mul(BitCast(df32, ae), BitCast(df32, be)) + Mul(BitCast(df32, ao), BitCast(df32, bo)); +} + +// Even if N=1, the input is always at least 2 lanes, hence vec_msum is safe. +template <class D32, HWY_IF_I32_D(D32), + class V16 = VFromD<RepartitionToNarrow<D32>>> +HWY_API VFromD<D32> WidenMulPairwiseAdd(D32 /* tag */, V16 a, V16 b) { + return VFromD<D32>{a * b}; +} + +// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower) + +template <class D32, HWY_IF_F32_D(D32), + class V16 = VFromD<Repartition<bfloat16_t, D32>>> +HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 df32, V16 a, V16 b, + VFromD<D32> sum0, + VFromD<D32>& sum1) { + const RebindToUnsigned<decltype(df32)> du32; + // Lane order within sum0/1 is undefined, hence we can avoid the + // longer-latency lane-crossing PromoteTo. Using shift/and instead of Zip + // leads to the odd/even order that RearrangeToOddPlusEven prefers. + using VU32 = VFromD<decltype(du32)>; + const VU32 odd = Set(du32, 0xFFFF0000u); + const VU32 ae = ShiftLeft<16>(BitCast(du32, a)); + const VU32 ao = And(BitCast(du32, a), odd); + const VU32 be = ShiftLeft<16>(BitCast(du32, b)); + const VU32 bo = And(BitCast(du32, b), odd); + sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1); + return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0); +} + +// Even if N=1, the input is always at least 2 lanes, hence vec_msum is safe. 
+template <class D32, HWY_IF_I32_D(D32), + class V16 = VFromD<RepartitionToNarrow<D32>>> +HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 /* tag */, V16 a, V16 b, + VFromD<D32> sum0, + VFromD<D32>& /*sum1*/) { + return VFromD<D32>{vec_msum(a.raw, b.raw, sum0.raw)}; +} + +// ------------------------------ RearrangeToOddPlusEven +template <size_t N> +HWY_API Vec128<int32_t, N> RearrangeToOddPlusEven(Vec128<int32_t, N> sum0, + Vec128<int32_t, N> /*sum1*/) { + return sum0; // invariant already holds +} + +template <class VW> +HWY_API VW RearrangeToOddPlusEven(const VW sum0, const VW sum1) { + return Add(sum0, sum1); +} + +// ================================================== CONVERT + +// ------------------------------ Promotions (part w/ narrow lanes -> full) + +// Unsigned to signed/unsigned: zero-extend. +template <class D, typename FromT, HWY_IF_T_SIZE_D(D, 2 * sizeof(FromT)), + HWY_IF_NOT_FLOAT_D(D), HWY_IF_UNSIGNED(FromT)> +HWY_API VFromD<D> PromoteTo(D /* d */, + Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) { + // First pretend the input has twice the lanes - the upper half will be + // ignored by ZipLower. + const Rebind<FromT, Twice<D>> d2; + const VFromD<decltype(d2)> twice{v.raw}; + // Then cast to narrow as expected by ZipLower, in case the sign of FromT + // differs from that of D. + const RepartitionToNarrow<D> dn; + +#if HWY_IS_LITTLE_ENDIAN + return ZipLower(BitCast(dn, twice), Zero(dn)); +#else + return ZipLower(Zero(dn), BitCast(dn, twice)); +#endif +} + +// Signed: replicate sign bit. +template <class D, typename FromT, HWY_IF_T_SIZE_D(D, 2 * sizeof(FromT)), + HWY_IF_NOT_FLOAT_D(D), HWY_IF_SIGNED(FromT)> +HWY_API VFromD<D> PromoteTo(D /* d */, + Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) { + using Raw = typename detail::Raw128<TFromD<D>>::type; + return VFromD<D>{reinterpret_cast<Raw>(vec_unpackh(v.raw))}; +} + +// 8-bit to 32-bit: First, promote to 16-bit, and then convert to 32-bit. 
+template <class D, typename FromT, HWY_IF_T_SIZE_D(D, 4), HWY_IF_NOT_FLOAT_D(D), + HWY_IF_T_SIZE(FromT, 1)> +HWY_API VFromD<D> PromoteTo(D d32, + Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) { + const DFromV<decltype(v)> d8; + const Rebind<MakeWide<FromT>, decltype(d8)> d16; + return PromoteTo(d32, PromoteTo(d16, v)); +} + +// 8-bit or 16-bit to 64-bit: First, promote to MakeWide<FromT>, and then +// convert to 64-bit. +template <class D, typename FromT, HWY_IF_T_SIZE_D(D, 8), HWY_IF_NOT_FLOAT_D(D), + HWY_IF_NOT_FLOAT_NOR_SPECIAL(FromT), + HWY_IF_T_SIZE_ONE_OF(FromT, (1 << 1) | (1 << 2))> +HWY_API VFromD<D> PromoteTo(D d64, + Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) { + const Rebind<MakeWide<FromT>, decltype(d64)> dw; + return PromoteTo(d64, PromoteTo(dw, v)); +} + +// Workaround for origin tracking bug in Clang msan prior to 11.0 +// (spurious "uninitialized memory" for TestF16 with "ORIGIN: invalid") +#if HWY_IS_MSAN && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 1100) +#define HWY_INLINE_F16 HWY_NOINLINE +#else +#define HWY_INLINE_F16 HWY_INLINE +#endif +template <class D, HWY_IF_F32_D(D)> +HWY_INLINE_F16 VFromD<D> PromoteTo(D df32, VFromD<Rebind<float16_t, D>> v) { +#if HWY_PPC_HAVE_9 + (void)df32; + return VFromD<D>{vec_extract_fp32_from_shorth(v.raw)}; +#else + const RebindToSigned<decltype(df32)> di32; + const RebindToUnsigned<decltype(df32)> du32; + // Expand to u32 so we can shift. 
+ const auto bits16 = PromoteTo(du32, VFromD<Rebind<uint16_t, D>>{v.raw}); + const auto sign = ShiftRight<15>(bits16); + const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F); + const auto mantissa = bits16 & Set(du32, 0x3FF); + const auto subnormal = + BitCast(du32, ConvertTo(df32, BitCast(di32, mantissa)) * + Set(df32, 1.0f / 16384 / 1024)); + + const auto biased_exp32 = biased_exp + Set(du32, 127 - 15); + const auto mantissa32 = ShiftLeft<23 - 10>(mantissa); + const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32; + const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal); + return BitCast(df32, ShiftLeft<31>(sign) | bits32); +#endif +} + +template <class D, HWY_IF_F32_D(D)> +HWY_API VFromD<D> PromoteTo(D df32, VFromD<Rebind<bfloat16_t, D>> v) { + const Rebind<uint16_t, decltype(df32)> du16; + const RebindToSigned<decltype(df32)> di32; + return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v)))); +} + +template <class D, HWY_IF_F64_D(D)> +HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<float, D>> v) { + const __vector float raw_v = InterleaveLower(v, v).raw; +#if HWY_IS_LITTLE_ENDIAN + return VFromD<D>{vec_doubleo(raw_v)}; +#else + return VFromD<D>{vec_doublee(raw_v)}; +#endif +} + +template <class D, HWY_IF_F64_D(D)> +HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) { + const __vector signed int raw_v = InterleaveLower(v, v).raw; +#if HWY_IS_LITTLE_ENDIAN + return VFromD<D>{vec_doubleo(raw_v)}; +#else + return VFromD<D>{vec_doublee(raw_v)}; +#endif +} + +// ------------------------------ Demotions (full -> part w/ narrow lanes) + +template <class D, typename FromT, HWY_IF_UNSIGNED_D(D), + HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)), + HWY_IF_SIGNED(FromT), HWY_IF_T_SIZE(FromT, sizeof(TFromD<D>) * 2)> +HWY_API VFromD<D> DemoteTo(D /* tag */, + Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) { + return VFromD<D>{vec_packsu(v.raw, v.raw)}; +} + +template <class D, typename 
FromT, HWY_IF_SIGNED_D(D), HWY_IF_SIGNED(FromT), + HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)), + HWY_IF_T_SIZE(FromT, sizeof(TFromD<D>) * 2)> +HWY_API VFromD<D> DemoteTo(D /* tag */, + Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) { + return VFromD<D>{vec_packs(v.raw, v.raw)}; +} + +template <class D, typename FromT, HWY_IF_UNSIGNED_D(D), HWY_IF_UNSIGNED(FromT), + HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)), + HWY_IF_T_SIZE(FromT, sizeof(TFromD<D>) * 2)> +HWY_API VFromD<D> DemoteTo(D /* tag */, + Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) { + return VFromD<D>{vec_packs(v.raw, v.raw)}; +} + +template <class D, class FromT, HWY_IF_SIGNED_D(D), HWY_IF_SIGNED(FromT), + HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)), + hwy::EnableIf<(sizeof(FromT) >= sizeof(TFromD<D>) * 4)>* = nullptr> +HWY_API VFromD<D> DemoteTo(D d, + Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) { + const Rebind<MakeNarrow<FromT>, D> d2; + return DemoteTo(d, DemoteTo(d2, v)); +} + +template <class D, class FromT, HWY_IF_UNSIGNED_D(D), HWY_IF_UNSIGNED(FromT), + HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)), + hwy::EnableIf<(sizeof(FromT) >= sizeof(TFromD<D>) * 4)>* = nullptr> +HWY_API VFromD<D> DemoteTo(D d, + Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) { + const Rebind<MakeNarrow<FromT>, D> d2; + return DemoteTo(d, DemoteTo(d2, v)); +} + +template <class D, class FromT, HWY_IF_UNSIGNED_D(D), HWY_IF_SIGNED(FromT), + HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)), + hwy::EnableIf<(sizeof(FromT) >= sizeof(TFromD<D>) * 4)>* = nullptr> +HWY_API VFromD<D> DemoteTo(D d, + Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) { + const Rebind<MakeUnsigned<MakeNarrow<FromT>>, D> d2; + return DemoteTo(d, DemoteTo(d2, v)); +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F16_D(D)> +HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<float, D>> v) { +#if HWY_PPC_HAVE_9 && HWY_COMPILER_GCC_ACTUAL + // Do not use vec_pack_to_short_fp32 on clang as there is a bug in the 
clang + // version of vec_pack_to_short_fp32 + (void)df16; + return VFromD<D>{vec_pack_to_short_fp32(v.raw, v.raw)}; +#else + const Rebind<uint32_t, decltype(df16)> du; + const RebindToUnsigned<decltype(df16)> du16; +#if HWY_PPC_HAVE_9 && HWY_HAS_BUILTIN(__builtin_vsx_xvcvsphp) + // Work around bug in the clang implementation of vec_pack_to_short_fp32 + // by using the __builtin_vsx_xvcvsphp builtin on PPC9/PPC10 targets + // if the __builtin_vsx_xvcvsphp intrinsic is available + const VFromD<decltype(du)> bits16{ + reinterpret_cast<__vector unsigned int>(__builtin_vsx_xvcvsphp(v.raw))}; +#else + const RebindToSigned<decltype(du)> di; + const auto bits32 = BitCast(du, v); + const auto sign = ShiftRight<31>(bits32); + const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF); + const auto mantissa32 = bits32 & Set(du, 0x7FFFFF); + + const auto k15 = Set(di, 15); + const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15); + const auto is_tiny = exp < Set(di, -24); + + const auto is_subnormal = exp < Set(di, -14); + const auto biased_exp16 = + BitCast(du, IfThenZeroElse(is_subnormal, exp + k15)); + const auto sub_exp = BitCast(du, Set(di, -14) - exp); // [1, 11) + const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) + + (mantissa32 >> (Set(du, 13) + sub_exp)); + const auto mantissa16 = IfThenElse(RebindMask(du, is_subnormal), sub_m, + ShiftRight<13>(mantissa32)); // <1024 + + const auto sign16 = ShiftLeft<15>(sign); + const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16; + const auto bits16 = IfThenZeroElse(RebindMask(du, is_tiny), normal16); +#endif // HWY_PPC_HAVE_9 && HWY_HAS_BUILTIN(__builtin_vsx_xvcvsphp) + return BitCast(df16, TruncateTo(du16, bits16)); +#endif // HWY_PPC_HAVE_9 && HWY_COMPILER_GCC_ACTUAL +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_BF16_D(D)> +HWY_API VFromD<D> DemoteTo(D dbf16, VFromD<Rebind<float, D>> v) { + const Rebind<uint32_t, decltype(dbf16)> du32; // for logical shift right + const 
Rebind<uint16_t, decltype(dbf16)> du16; + const auto bits_in_32 = ShiftRight<16>(BitCast(du32, v)); + return BitCast(dbf16, TruncateTo(du16, bits_in_32)); +} + +template <class D, HWY_IF_BF16_D(D), class V32 = VFromD<Repartition<float, D>>> +HWY_API VFromD<D> ReorderDemote2To(D dbf16, V32 a, V32 b) { + const RebindToUnsigned<decltype(dbf16)> du16; + const Repartition<uint32_t, decltype(dbf16)> du32; +#if HWY_IS_LITTLE_ENDIAN + const auto a_in_odd = a; + const auto b_in_even = ShiftRight<16>(BitCast(du32, b)); +#else + const auto a_in_odd = ShiftRight<16>(BitCast(du32, a)); + const auto b_in_even = b; +#endif + return BitCast(dbf16, + OddEven(BitCast(du16, a_in_odd), BitCast(du16, b_in_even))); +} + +// Specializations for partial vectors because vec_packs sets lanes above 2*N. +template <class DN, typename V, HWY_IF_V_SIZE_LE_D(DN, 4), HWY_IF_SIGNED_D(DN), + HWY_IF_SIGNED_V(V), + HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)), + HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)> +HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) { + const DFromV<decltype(a)> d; + const Twice<decltype(d)> dt; + return DemoteTo(dn, Combine(dt, b, a)); +} +template <class DN, typename V, HWY_IF_V_SIZE_D(DN, 8), HWY_IF_SIGNED_D(DN), + HWY_IF_SIGNED_V(V), + HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)), + HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)> +HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) { + const Twice<decltype(dn)> dn_full; + const Repartition<uint32_t, decltype(dn_full)> du32_full; + + const VFromD<decltype(dn_full)> v_full{vec_packs(a.raw, b.raw)}; + const auto vu32_full = BitCast(du32_full, v_full); + return LowerHalf( + BitCast(dn_full, ConcatEven(du32_full, vu32_full, vu32_full))); +} +template <class DN, typename V, HWY_IF_V_SIZE_D(DN, 16), HWY_IF_SIGNED_D(DN), + HWY_IF_SIGNED_V(V), + HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)), + HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)> +HWY_API VFromD<DN> ReorderDemote2To(DN /*dn*/, V a, V 
b) { + return VFromD<DN>{vec_packs(a.raw, b.raw)}; +} + +template <class DN, typename V, HWY_IF_V_SIZE_LE_D(DN, 4), + HWY_IF_UNSIGNED_D(DN), HWY_IF_SIGNED_V(V), + HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)), + HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)> +HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) { + const DFromV<decltype(a)> d; + const Twice<decltype(d)> dt; + return DemoteTo(dn, Combine(dt, b, a)); +} +template <class DN, typename V, HWY_IF_V_SIZE_D(DN, 8), HWY_IF_UNSIGNED_D(DN), + HWY_IF_SIGNED_V(V), + HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)), + HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)> +HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) { + const Twice<decltype(dn)> dn_full; + const Repartition<uint32_t, decltype(dn_full)> du32_full; + + const VFromD<decltype(dn_full)> v_full{vec_packsu(a.raw, b.raw)}; + const auto vu32_full = BitCast(du32_full, v_full); + return LowerHalf( + BitCast(dn_full, ConcatEven(du32_full, vu32_full, vu32_full))); +} +template <class DN, typename V, HWY_IF_V_SIZE_D(DN, 16), HWY_IF_UNSIGNED_D(DN), + HWY_IF_SIGNED_V(V), + HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)), + HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)> +HWY_API VFromD<DN> ReorderDemote2To(DN /*dn*/, V a, V b) { + return VFromD<DN>{vec_packsu(a.raw, b.raw)}; +} + +template <class DN, typename V, HWY_IF_V_SIZE_LE_D(DN, 4), + HWY_IF_UNSIGNED_D(DN), HWY_IF_UNSIGNED_V(V), + HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)), + HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)> +HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) { + const DFromV<decltype(a)> d; + const Twice<decltype(d)> dt; + return DemoteTo(dn, Combine(dt, b, a)); +} +template <class DN, typename V, HWY_IF_V_SIZE_D(DN, 8), HWY_IF_UNSIGNED_D(DN), + HWY_IF_UNSIGNED_V(V), + HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)), + HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)> +HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) { + const 
Twice<decltype(dn)> dn_full; + const Repartition<uint32_t, decltype(dn_full)> du32_full; + + const VFromD<decltype(dn_full)> v_full{vec_packs(a.raw, b.raw)}; + const auto vu32_full = BitCast(du32_full, v_full); + return LowerHalf( + BitCast(dn_full, ConcatEven(du32_full, vu32_full, vu32_full))); +} +template <class DN, typename V, HWY_IF_V_SIZE_D(DN, 16), HWY_IF_UNSIGNED_D(DN), + HWY_IF_UNSIGNED_V(V), + HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)), + HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)> +HWY_API VFromD<DN> ReorderDemote2To(DN /*dn*/, V a, V b) { + return VFromD<DN>{vec_packs(a.raw, b.raw)}; +} + +template <class D, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>), class V, + HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), + HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2), + HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2)> +HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) { + return ReorderDemote2To(d, a, b); +} + +template <class D, HWY_IF_BF16_D(D), class V32 = VFromD<Repartition<float, D>>> +HWY_API VFromD<D> OrderedDemote2To(D dbf16, V32 a, V32 b) { + const RebindToUnsigned<decltype(dbf16)> du16; +#if HWY_IS_LITTLE_ENDIAN + return BitCast(dbf16, ConcatOdd(du16, BitCast(du16, b), BitCast(du16, a))); +#else + return BitCast(dbf16, ConcatEven(du16, BitCast(du16, b), BitCast(du16, a))); +#endif +} + +template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)> +HWY_API Vec32<float> DemoteTo(D /* tag */, Vec64<double> v) { + return Vec32<float>{vec_floate(v.raw)}; +} + +template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)> +HWY_API Vec64<float> DemoteTo(D d, Vec128<double> v) { +#if HWY_IS_LITTLE_ENDIAN + const Vec128<float> f64_to_f32{vec_floate(v.raw)}; +#else + const Vec128<float> f64_to_f32{vec_floato(v.raw)}; +#endif + + const RebindToUnsigned<D> du; + const Rebind<uint64_t, D> du64; + return Vec64<float>{ + BitCast(d, TruncateTo(du, BitCast(du64, f64_to_f32))).raw}; +} + +template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I32_D(D)> +HWY_API Vec32<int32_t> 
DemoteTo(D /* tag */, Vec64<double> v) { + return Vec32<int32_t>{vec_signede(v.raw)}; +} + +template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I32_D(D)> +HWY_API Vec64<int32_t> DemoteTo(D /* tag */, Vec128<double> v) { +#if HWY_IS_LITTLE_ENDIAN + const Vec128<int32_t> f64_to_i32{vec_signede(v.raw)}; +#else + const Vec128<int32_t> f64_to_i32{vec_signedo(v.raw)}; +#endif + + const Rebind<int64_t, D> di64; + const Vec128<int64_t> vi64 = BitCast(di64, f64_to_i32); + return Vec64<int32_t>{vec_pack(vi64.raw, vi64.raw)}; +} + +// For already range-limited input [0, 255]. +template <size_t N> +HWY_API Vec128<uint8_t, N> U8FromU32(Vec128<uint32_t, N> v) { + const Rebind<uint16_t, DFromV<decltype(v)>> du16; + const Rebind<uint8_t, decltype(du16)> du8; + return TruncateTo(du8, TruncateTo(du16, v)); +} +// ------------------------------ Integer <=> fp (ShiftRight, OddEven) + +// Note: altivec.h vec_ct* currently contain C casts which triggers +// -Wdeprecate-lax-vec-conv-all warnings, so disable them. + +template <class D, typename FromT, HWY_IF_F32_D(D), HWY_IF_NOT_FLOAT(FromT), + HWY_IF_T_SIZE_D(D, sizeof(FromT))> +HWY_API VFromD<D> ConvertTo(D /* tag */, + Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) { + HWY_DIAGNOSTICS(push) +#if HWY_COMPILER_CLANG + HWY_DIAGNOSTICS_OFF(disable : 5219, ignored "-Wdeprecate-lax-vec-conv-all") +#endif + return VFromD<D>{vec_ctf(v.raw, 0)}; + HWY_DIAGNOSTICS(pop) +} + +template <class D, typename FromT, HWY_IF_F64_D(D), HWY_IF_NOT_FLOAT(FromT), + HWY_IF_T_SIZE_D(D, sizeof(FromT))> +HWY_API VFromD<D> ConvertTo(D /* tag */, + Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) { + return VFromD<D>{vec_double(v.raw)}; +} + +// Truncates (rounds toward zero). 
+template <class D, typename FromT, HWY_IF_SIGNED_D(D), HWY_IF_FLOAT(FromT), + HWY_IF_T_SIZE_D(D, sizeof(FromT))> +HWY_API VFromD<D> ConvertTo(D /* tag */, + Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) { + HWY_DIAGNOSTICS(push) +#if HWY_COMPILER_CLANG + HWY_DIAGNOSTICS_OFF(disable : 5219, ignored "-Wdeprecate-lax-vec-conv-all") +#endif + return VFromD<D>{vec_cts(v.raw, 0)}; + HWY_DIAGNOSTICS(pop) +} + +template <class D, typename FromT, HWY_IF_UNSIGNED_D(D), HWY_IF_FLOAT(FromT), + HWY_IF_T_SIZE_D(D, sizeof(FromT))> +HWY_API VFromD<D> ConvertTo(D /* tag */, + Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) { + HWY_DIAGNOSTICS(push) +#if HWY_COMPILER_CLANG + HWY_DIAGNOSTICS_OFF(disable : 5219, ignored "-Wdeprecate-lax-vec-conv-all") +#endif + return VFromD<D>{vec_ctu(v.raw, 0)}; + HWY_DIAGNOSTICS(pop) +} + +template <size_t N> +HWY_API Vec128<int32_t, N> NearestInt(Vec128<float, N> v) { + HWY_DIAGNOSTICS(push) +#if HWY_COMPILER_CLANG + HWY_DIAGNOSTICS_OFF(disable : 5219, ignored "-Wdeprecate-lax-vec-conv-all") +#endif + return Vec128<int32_t, N>{vec_cts(vec_round(v.raw), 0)}; + HWY_DIAGNOSTICS(pop) +} + +// ------------------------------ Floating-point rounding (ConvertTo) + +// Toward nearest integer, ties to even +template <size_t N> +HWY_API Vec128<float, N> Round(Vec128<float, N> v) { + return Vec128<float, N>{vec_round(v.raw)}; +} + +template <size_t N> +HWY_API Vec128<double, N> Round(Vec128<double, N> v) { + return Vec128<double, N>{vec_rint(v.raw)}; +} + +// Toward zero, aka truncate +template <typename T, size_t N, HWY_IF_FLOAT(T)> +HWY_API Vec128<T, N> Trunc(Vec128<T, N> v) { + return Vec128<T, N>{vec_trunc(v.raw)}; +} + +// Toward +infinity, aka ceiling +template <typename T, size_t N, HWY_IF_FLOAT(T)> +HWY_API Vec128<T, N> Ceil(Vec128<T, N> v) { + return Vec128<T, N>{vec_ceil(v.raw)}; +} + +// Toward -infinity, aka floor +template <typename T, size_t N, HWY_IF_FLOAT(T)> +HWY_API Vec128<T, N> Floor(Vec128<T, N> v) { + return Vec128<T, 
N>{vec_floor(v.raw)}; +} + +// ------------------------------ Floating-point classification + +template <typename T, size_t N> +HWY_API Mask128<T, N> IsNaN(Vec128<T, N> v) { + static_assert(IsFloat<T>(), "Only for float"); + return v != v; +} + +template <typename T, size_t N> +HWY_API Mask128<T, N> IsInf(Vec128<T, N> v) { + static_assert(IsFloat<T>(), "Only for float"); + using TU = MakeUnsigned<T>; + const DFromV<decltype(v)> d; + const RebindToUnsigned<decltype(d)> du; + const VFromD<decltype(du)> vu = BitCast(du, v); + // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0. + return RebindMask( + d, + Eq(Add(vu, vu), Set(du, static_cast<TU>(hwy::MaxExponentTimes2<T>())))); +} + +// Returns whether normal/subnormal/zero. +template <typename T, size_t N> +HWY_API Mask128<T, N> IsFinite(Vec128<T, N> v) { + static_assert(IsFloat<T>(), "Only for float"); + using TU = MakeUnsigned<T>; + const DFromV<decltype(v)> d; + const RebindToUnsigned<decltype(d)> du; + const VFromD<decltype(du)> vu = BitCast(du, v); + // 'Shift left' to clear the sign bit, check for exponent<max. + return RebindMask( + d, + Lt(Add(vu, vu), Set(du, static_cast<TU>(hwy::MaxExponentTimes2<T>())))); +} + +// ================================================== CRYPTO + +#if !defined(HWY_DISABLE_PPC8_CRYPTO) + +// Per-target flag to prevent generic_ops-inl.h from defining AESRound. 
+#ifdef HWY_NATIVE_AES +#undef HWY_NATIVE_AES +#else +#define HWY_NATIVE_AES +#endif + +namespace detail { +#if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1600 +using CipherTag = Full128<uint64_t>; +#else +using CipherTag = Full128<uint8_t>; +#endif // !HWY_COMPILER_CLANG +using CipherVec = VFromD<CipherTag>; +} // namespace detail + +HWY_API Vec128<uint8_t> AESRound(Vec128<uint8_t> state, + Vec128<uint8_t> round_key) { + const detail::CipherTag dc; + const Full128<uint8_t> du8; +#if HWY_IS_LITTLE_ENDIAN + return Reverse(du8, + BitCast(du8, detail::CipherVec{vec_cipher_be( + BitCast(dc, Reverse(du8, state)).raw, + BitCast(dc, Reverse(du8, round_key)).raw)})); +#else + return BitCast(du8, detail::CipherVec{vec_cipher_be( + BitCast(dc, state).raw, BitCast(dc, round_key).raw)}); +#endif +} + +HWY_API Vec128<uint8_t> AESLastRound(Vec128<uint8_t> state, + Vec128<uint8_t> round_key) { + const detail::CipherTag dc; + const Full128<uint8_t> du8; +#if HWY_IS_LITTLE_ENDIAN + return Reverse(du8, + BitCast(du8, detail::CipherVec{vec_cipherlast_be( + BitCast(dc, Reverse(du8, state)).raw, + BitCast(dc, Reverse(du8, round_key)).raw)})); +#else + return BitCast(du8, detail::CipherVec{vec_cipherlast_be( + BitCast(dc, state).raw, BitCast(dc, round_key).raw)}); +#endif +} + +HWY_API Vec128<uint8_t> AESRoundInv(Vec128<uint8_t> state, + Vec128<uint8_t> round_key) { + const detail::CipherTag dc; + const Full128<uint8_t> du8; +#if HWY_IS_LITTLE_ENDIAN + return Xor(Reverse(du8, BitCast(du8, detail::CipherVec{vec_ncipher_be( + BitCast(dc, Reverse(du8, state)).raw, + Zero(dc).raw)})), + round_key); +#else + return Xor(BitCast(du8, detail::CipherVec{vec_ncipher_be( + BitCast(dc, state).raw, Zero(dc).raw)}), + round_key); +#endif +} + +HWY_API Vec128<uint8_t> AESLastRoundInv(Vec128<uint8_t> state, + Vec128<uint8_t> round_key) { + const detail::CipherTag dc; + const Full128<uint8_t> du8; +#if HWY_IS_LITTLE_ENDIAN + return Reverse(du8, + BitCast(du8, detail::CipherVec{vec_ncipherlast_be( + 
BitCast(dc, Reverse(du8, state)).raw, + BitCast(dc, Reverse(du8, round_key)).raw)})); +#else + return BitCast(du8, detail::CipherVec{vec_ncipherlast_be( + BitCast(dc, state).raw, BitCast(dc, round_key).raw)}); +#endif +} + +HWY_API Vec128<uint8_t> AESInvMixColumns(Vec128<uint8_t> state) { + const Full128<uint8_t> du8; + const auto zero = Zero(du8); + + // PPC8/PPC9/PPC10 does not have a single instruction for the AES + // InvMixColumns operation like ARM Crypto, SVE2 Crypto, or AES-NI do. + + // The AESInvMixColumns operation can be carried out on PPC8/PPC9/PPC10 + // by doing an AESLastRound operation with a zero round_key followed by an + // AESRoundInv operation with a zero round_key. + return AESRoundInv(AESLastRound(state, zero), zero); +} + +template <uint8_t kRcon> +HWY_API Vec128<uint8_t> AESKeyGenAssist(Vec128<uint8_t> v) { + constexpr __vector unsigned char kRconXorMask = {0, 0, 0, 0, kRcon, 0, 0, 0, + 0, 0, 0, 0, kRcon, 0, 0, 0}; + constexpr __vector unsigned char kRotWordShuffle = { + 4, 5, 6, 7, 5, 6, 7, 4, 12, 13, 14, 15, 13, 14, 15, 12}; + const detail::CipherTag dc; + const Full128<uint8_t> du8; + const auto sub_word_result = + BitCast(du8, detail::CipherVec{vec_sbox_be(BitCast(dc, v).raw)}); + const auto rot_word_result = + TableLookupBytes(sub_word_result, Vec128<uint8_t>{kRotWordShuffle}); + return Xor(rot_word_result, Vec128<uint8_t>{kRconXorMask}); +} + +template <size_t N> +HWY_API Vec128<uint64_t, N> CLMulLower(Vec128<uint64_t, N> a, + Vec128<uint64_t, N> b) { + // NOTE: Lane 1 of both a and b need to be zeroed out for the + // vec_pmsum_be operation below as the vec_pmsum_be operation + // does a carryless multiplication of each 64-bit half and then + // adds the two halves using an bitwise XOR operation. 
+ + const DFromV<decltype(a)> d; + const auto zero = Zero(d); + + using VU64 = __vector unsigned long long; + const VU64 pmsum_result = reinterpret_cast<VU64>( + vec_pmsum_be(InterleaveLower(a, zero).raw, InterleaveLower(b, zero).raw)); + +#if HWY_IS_LITTLE_ENDIAN + return Vec128<uint64_t, N>{pmsum_result}; +#else + // Need to swap the two halves of pmsum_result on big-endian targets as + // the upper 64 bits of the carryless multiplication result are in lane 0 of + // pmsum_result and the lower 64 bits of the carryless multiplication result + // are in lane 1 of mul128_result + return Vec128<uint64_t, N>{vec_sld(pmsum_result, pmsum_result, 8)}; +#endif +} + +template <size_t N> +HWY_API Vec128<uint64_t, N> CLMulUpper(Vec128<uint64_t, N> a, + Vec128<uint64_t, N> b) { + // NOTE: Lane 0 of both a and b need to be zeroed out for the + // vec_pmsum_be operation below as the vec_pmsum_be operation + // does a carryless multiplication of each 64-bit half and then + // adds the two halves using an bitwise XOR operation. 
+ + const DFromV<decltype(a)> d; + const auto zero = Zero(d); + + using VU64 = __vector unsigned long long; + const VU64 pmsum_result = reinterpret_cast<VU64>( + vec_pmsum_be(vec_mergel(zero.raw, a.raw), vec_mergel(zero.raw, b.raw))); + +#if HWY_IS_LITTLE_ENDIAN + return Vec128<uint64_t, N>{pmsum_result}; +#else + // Need to swap the two halves of pmsum_result on big-endian targets as + // the upper 64 bits of the carryless multiplication result are in lane 0 of + // pmsum_result and the lower 64 bits of the carryless multiplication result + // are in lane 1 of mul128_result + return Vec128<uint64_t, N>{vec_sld(pmsum_result, pmsum_result, 8)}; +#endif +} + +#endif // !defined(HWY_DISABLE_PPC8_CRYPTO) + +// ================================================== MISC + +// ------------------------------ LoadMaskBits (TestBit) + +namespace detail { + +template <class D, HWY_IF_T_SIZE_D(D, 1)> +HWY_INLINE MFromD<D> LoadMaskBits128(D /*d*/, uint64_t mask_bits) { +#if HWY_PPC_HAVE_10 + const Vec128<uint8_t> mask_vec{vec_genbm(mask_bits)}; + +#if HWY_IS_LITTLE_ENDIAN + return MFromD<D>{MaskFromVec(mask_vec).raw}; +#else + return MFromD<D>{MaskFromVec(Reverse(Full128<uint8_t>(), mask_vec)).raw}; +#endif // HWY_IS_LITTLE_ENDIAN + +#else // PPC9 or earlier + const Full128<uint8_t> du8; + const Full128<uint16_t> du16; + const Vec128<uint8_t> vbits = + BitCast(du8, Set(du16, static_cast<uint16_t>(mask_bits))); + + // Replicate bytes 8x such that each byte contains the bit that governs it. 
+#if HWY_IS_LITTLE_ENDIAN + const __vector unsigned char kRep8 = {0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1}; +#else + const __vector unsigned char kRep8 = {1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0}; +#endif // HWY_IS_LITTLE_ENDIAN + + const Vec128<uint8_t> rep8{vec_perm(vbits.raw, vbits.raw, kRep8)}; + const __vector unsigned char kBit = {1, 2, 4, 8, 16, 32, 64, 128, + 1, 2, 4, 8, 16, 32, 64, 128}; + return MFromD<D>{TestBit(rep8, Vec128<uint8_t>{kBit}).raw}; +#endif // HWY_PPC_HAVE_10 +} + +template <class D, HWY_IF_T_SIZE_D(D, 2)> +HWY_INLINE MFromD<D> LoadMaskBits128(D /*d*/, uint64_t mask_bits) { +#if HWY_PPC_HAVE_10 + const Vec128<uint16_t> mask_vec{vec_genhm(mask_bits)}; + +#if HWY_IS_LITTLE_ENDIAN + return MFromD<D>{MaskFromVec(mask_vec).raw}; +#else + return MFromD<D>{MaskFromVec(Reverse(Full128<uint16_t>(), mask_vec)).raw}; +#endif // HWY_IS_LITTLE_ENDIAN + +#else // PPC9 or earlier + const __vector unsigned short kBit = {1, 2, 4, 8, 16, 32, 64, 128}; + const auto vmask_bits = + Set(Full128<uint16_t>(), static_cast<uint16_t>(mask_bits)); + return MFromD<D>{TestBit(vmask_bits, Vec128<uint16_t>{kBit}).raw}; +#endif // HWY_PPC_HAVE_10 +} + +template <class D, HWY_IF_T_SIZE_D(D, 4)> +HWY_INLINE MFromD<D> LoadMaskBits128(D /*d*/, uint64_t mask_bits) { +#if HWY_PPC_HAVE_10 + const Vec128<uint32_t> mask_vec{vec_genwm(mask_bits)}; + +#if HWY_IS_LITTLE_ENDIAN + return MFromD<D>{MaskFromVec(mask_vec).raw}; +#else + return MFromD<D>{MaskFromVec(Reverse(Full128<uint32_t>(), mask_vec)).raw}; +#endif // HWY_IS_LITTLE_ENDIAN + +#else // PPC9 or earlier + const __vector unsigned int kBit = {1, 2, 4, 8}; + const auto vmask_bits = + Set(Full128<uint32_t>(), static_cast<uint32_t>(mask_bits)); + return MFromD<D>{TestBit(vmask_bits, Vec128<uint32_t>{kBit}).raw}; +#endif // HWY_PPC_HAVE_10 +} + +template <class D, HWY_IF_T_SIZE_D(D, 8)> +HWY_INLINE MFromD<D> LoadMaskBits128(D /*d*/, uint64_t mask_bits) { +#if HWY_PPC_HAVE_10 + const Vec128<uint64_t> 
mask_vec{vec_gendm(mask_bits)}; + +#if HWY_IS_LITTLE_ENDIAN + return MFromD<D>{MaskFromVec(mask_vec).raw}; +#else + return MFromD<D>{MaskFromVec(Reverse(Full128<uint64_t>(), mask_vec)).raw}; +#endif // HWY_IS_LITTLE_ENDIAN + +#else // PPC9 or earlier + const __vector unsigned long long kBit = {1, 2}; + const auto vmask_bits = + Set(Full128<uint64_t>(), static_cast<uint64_t>(mask_bits)); + return MFromD<D>{TestBit(vmask_bits, Vec128<uint64_t>{kBit}).raw}; +#endif // HWY_PPC_HAVE_10 +} + +} // namespace detail + +// `p` points to at least 8 readable bytes, not all of which need be valid. +template <class D, HWY_IF_LANES_LE_D(D, 8)> +HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) { + // If there are 8 or fewer lanes, simply convert bits[0] to a uint64_t + uint64_t mask_bits = bits[0]; + + constexpr size_t kN = MaxLanes(d); + if (kN < 8) mask_bits &= (1u << kN) - 1; + + return detail::LoadMaskBits128(d, mask_bits); +} + +template <class D, HWY_IF_LANES_D(D, 16)> +HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) { + // First, copy the mask bits to a uint16_t as there as there are at most + // 16 lanes in a vector. + + // Copying the mask bits to a uint16_t first will also ensure that the + // mask bits are loaded into the lower 16 bits on big-endian PPC targets. + uint16_t u16_mask_bits; + CopyBytes<sizeof(uint16_t)>(bits, &u16_mask_bits); + +#if HWY_IS_LITTLE_ENDIAN + return detail::LoadMaskBits128(d, u16_mask_bits); +#else + // On big-endian targets, u16_mask_bits need to be byte swapped as bits + // contains the mask bits in little-endian byte order + + // GCC/Clang will optimize the load of u16_mask_bits and byte swap to a + // single lhbrx instruction on big-endian PPC targets when optimizations + // are enabled. 
+#if HWY_HAS_BUILTIN(__builtin_bswap16) + return detail::LoadMaskBits128(d, __builtin_bswap16(u16_mask_bits)); +#else + return detail::LoadMaskBits128( + d, static_cast<uint16_t>((u16_mask_bits << 8) | (u16_mask_bits >> 8))); +#endif +#endif +} + +template <typename T> +struct CompressIsPartition { + // generic_ops-inl does not guarantee IsPartition for 8-bit. + enum { value = (sizeof(T) != 1) }; +}; + +// ------------------------------ StoreMaskBits + +namespace detail { + +#if !HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN +// fallback for missing vec_extractm +template <size_t N> +HWY_INLINE uint64_t ExtractSignBits(Vec128<uint8_t, N> sign_bits, + __vector unsigned char bit_shuffle) { + // clang POWER8 and 9 targets appear to differ in their return type of + // vec_vbpermq: unsigned or signed, so cast to avoid a warning. + using VU64 = detail::Raw128<uint64_t>::type; + const Vec128<uint64_t> extracted{ + reinterpret_cast<VU64>(vec_vbpermq(sign_bits.raw, bit_shuffle))}; + return extracted.raw[HWY_IS_LITTLE_ENDIAN]; +} + +#endif // !HWY_PPC_HAVE_10 + +template <typename T, size_t N> +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, Mask128<T, N> mask) { + const DFromM<decltype(mask)> d; + const Repartition<uint8_t, decltype(d)> du8; + const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask)); +#if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN + return static_cast<uint64_t>(vec_extractm(sign_bits.raw)); +#else + const __vector unsigned char kBitShuffle = {120, 112, 104, 96, 88, 80, 72, 64, + 56, 48, 40, 32, 24, 16, 8, 0}; + return ExtractSignBits(sign_bits, kBitShuffle); +#endif // HWY_PPC_HAVE_10 +} + +template <typename T, size_t N> +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, Mask128<T, N> mask) { + const DFromM<decltype(mask)> d; + const Repartition<uint8_t, decltype(d)> du8; + const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask)); + +#if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN + const RebindToUnsigned<decltype(d)> 
du; + return static_cast<uint64_t>(vec_extractm(BitCast(du, sign_bits).raw)); +#else +#if HWY_IS_LITTLE_ENDIAN + const __vector unsigned char kBitShuffle = { + 112, 96, 80, 64, 48, 32, 16, 0, 128, 128, 128, 128, 128, 128, 128, 128}; +#else + const __vector unsigned char kBitShuffle = { + 128, 128, 128, 128, 128, 128, 128, 128, 112, 96, 80, 64, 48, 32, 16, 0}; +#endif + return ExtractSignBits(sign_bits, kBitShuffle); +#endif // HWY_PPC_HAVE_10 +} + +template <typename T, size_t N> +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, Mask128<T, N> mask) { + const DFromM<decltype(mask)> d; + const Repartition<uint8_t, decltype(d)> du8; + const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask)); +#if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN + const RebindToUnsigned<decltype(d)> du; + return static_cast<uint64_t>(vec_extractm(BitCast(du, sign_bits).raw)); +#else +#if HWY_IS_LITTLE_ENDIAN + const __vector unsigned char kBitShuffle = {96, 64, 32, 0, 128, 128, + 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128}; +#else + const __vector unsigned char kBitShuffle = {128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, + 96, 64, 32, 0}; +#endif + return ExtractSignBits(sign_bits, kBitShuffle); +#endif // HWY_PPC_HAVE_10 +} + +template <typename T, size_t N> +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, Mask128<T, N> mask) { + const DFromM<decltype(mask)> d; + const Repartition<uint8_t, decltype(d)> du8; + const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask)); +#if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN + const RebindToUnsigned<decltype(d)> du; + return static_cast<uint64_t>(vec_extractm(BitCast(du, sign_bits).raw)); +#else +#if HWY_IS_LITTLE_ENDIAN + const __vector unsigned char kBitShuffle = {64, 0, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128}; +#else + const __vector unsigned char kBitShuffle = {128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, + 128, 128, 64, 0}; 
+#endif + return ExtractSignBits(sign_bits, kBitShuffle); +#endif // HWY_PPC_HAVE_10 +} + +// Returns the lowest N of the mask bits. +template <typename T, size_t N> +constexpr uint64_t OnlyActive(uint64_t mask_bits) { + return ((N * sizeof(T)) == 16) ? mask_bits : mask_bits & ((1ull << N) - 1); +} + +template <typename T, size_t N> +HWY_INLINE uint64_t BitsFromMask(Mask128<T, N> mask) { + return OnlyActive<T, N>(BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask)); +} + +} // namespace detail + +// `p` points to at least 8 writable bytes. +template <class D, HWY_IF_LANES_LE_D(D, 8)> +HWY_API size_t StoreMaskBits(D /*d*/, MFromD<D> mask, uint8_t* bits) { + // For vectors with 8 or fewer lanes, simply cast the result of BitsFromMask + // to an uint8_t and store the result in bits[0]. + bits[0] = static_cast<uint8_t>(detail::BitsFromMask(mask)); + return sizeof(uint8_t); +} + +template <class D, HWY_IF_LANES_D(D, 16)> +HWY_API size_t StoreMaskBits(D /*d*/, MFromD<D> mask, uint8_t* bits) { + const auto mask_bits = detail::BitsFromMask(mask); + + // First convert mask_bits to a uint16_t as we only want to store + // the lower 16 bits of mask_bits as there are 16 lanes in mask. + + // Converting mask_bits to a uint16_t first will also ensure that + // the lower 16 bits of mask_bits are stored instead of the upper 16 bits + // of mask_bits on big-endian PPC targets. +#if HWY_IS_LITTLE_ENDIAN + const uint16_t u16_mask_bits = static_cast<uint16_t>(mask_bits); +#else + // On big-endian targets, the bytes of mask_bits need to be swapped + // as StoreMaskBits expects the mask bits to be stored in little-endian + // byte order. 
+ + // GCC will also optimize the byte swap and CopyBytes operations below + // to a single sthbrx instruction when optimizations are enabled on + // big-endian PPC targets +#if HWY_HAS_BUILTIN(__builtin_bswap16) + const uint16_t u16_mask_bits = + __builtin_bswap16(static_cast<uint16_t>(mask_bits)); +#else + const uint16_t u16_mask_bits = static_cast<uint16_t>( + (mask_bits << 8) | (static_cast<uint16_t>(mask_bits) >> 8)); +#endif +#endif + + CopyBytes<sizeof(uint16_t)>(&u16_mask_bits, bits); + return sizeof(uint16_t); +} + +// ------------------------------ Mask testing + +template <class D, HWY_IF_V_SIZE_D(D, 16)> +HWY_API bool AllFalse(D d, MFromD<D> mask) { + const RebindToUnsigned<decltype(d)> du; + return static_cast<bool>(vec_all_eq(RebindMask(du, mask).raw, Zero(du).raw)); +} + +template <class D, HWY_IF_V_SIZE_D(D, 16)> +HWY_API bool AllTrue(D d, MFromD<D> mask) { + const RebindToUnsigned<decltype(d)> du; + using TU = TFromD<decltype(du)>; + return static_cast<bool>( + vec_all_eq(RebindMask(du, mask).raw, Set(du, hwy::LimitsMax<TU>()).raw)); +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> +HWY_API bool AllFalse(D d, MFromD<D> mask) { + const Full128<TFromD<D>> d_full; + constexpr size_t kN = MaxLanes(d); + return AllFalse(d_full, MFromD<decltype(d_full)>{ + vec_and(mask.raw, FirstN(d_full, kN).raw)}); +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> +HWY_API bool AllTrue(D d, MFromD<D> mask) { + const Full128<TFromD<D>> d_full; + constexpr size_t kN = MaxLanes(d); + return AllTrue(d_full, MFromD<decltype(d_full)>{ + vec_or(mask.raw, Not(FirstN(d_full, kN)).raw)}); +} + +template <class D> +HWY_API size_t CountTrue(D /* tag */, MFromD<D> mask) { + return PopCount(detail::BitsFromMask(mask)); +} + +template <class D> +HWY_API size_t FindKnownFirstTrue(D /* tag */, MFromD<D> mask) { + return Num0BitsBelowLS1Bit_Nonzero64(detail::BitsFromMask(mask)); +} + +template <class D> +HWY_API intptr_t FindFirstTrue(D /* tag */, MFromD<D> mask) { + const uint64_t 
mask_bits = detail::BitsFromMask(mask); + return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero64(mask_bits)) : -1; +} + +template <class D> +HWY_API size_t FindKnownLastTrue(D /* tag */, MFromD<D> mask) { + return 63 - Num0BitsAboveMS1Bit_Nonzero64(detail::BitsFromMask(mask)); +} + +template <class D> +HWY_API intptr_t FindLastTrue(D /* tag */, MFromD<D> mask) { + const uint64_t mask_bits = detail::BitsFromMask(mask); + return mask_bits ? intptr_t(63 - Num0BitsAboveMS1Bit_Nonzero64(mask_bits)) + : -1; +} + +// ------------------------------ Compress, CompressBits + +namespace detail { + +// Also works for N < 8 because the first 16 4-tuples only reference bytes 0-6. +template <class D, HWY_IF_T_SIZE_D(D, 2)> +HWY_INLINE VFromD<D> IndicesFromBits128(D d, uint64_t mask_bits) { + HWY_DASSERT(mask_bits < 256); + const Rebind<uint8_t, decltype(d)> d8; + const Twice<decltype(d8)> d8t; + const RebindToUnsigned<decltype(d)> du; + + // To reduce cache footprint, store lane indices and convert to byte indices + // (2*lane + 0..1), with the doubling baked into the table. It's not clear + // that the additional cost of unpacking nibbles is worthwhile. 
+ alignas(16) static constexpr uint8_t table[2048] = { + // PrintCompress16x8Tables + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // + 2, 0, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // + 4, 0, 2, 6, 8, 10, 12, 14, /**/ 0, 4, 2, 6, 8, 10, 12, 14, // + 2, 4, 0, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // + 6, 0, 2, 4, 8, 10, 12, 14, /**/ 0, 6, 2, 4, 8, 10, 12, 14, // + 2, 6, 0, 4, 8, 10, 12, 14, /**/ 0, 2, 6, 4, 8, 10, 12, 14, // + 4, 6, 0, 2, 8, 10, 12, 14, /**/ 0, 4, 6, 2, 8, 10, 12, 14, // + 2, 4, 6, 0, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // + 8, 0, 2, 4, 6, 10, 12, 14, /**/ 0, 8, 2, 4, 6, 10, 12, 14, // + 2, 8, 0, 4, 6, 10, 12, 14, /**/ 0, 2, 8, 4, 6, 10, 12, 14, // + 4, 8, 0, 2, 6, 10, 12, 14, /**/ 0, 4, 8, 2, 6, 10, 12, 14, // + 2, 4, 8, 0, 6, 10, 12, 14, /**/ 0, 2, 4, 8, 6, 10, 12, 14, // + 6, 8, 0, 2, 4, 10, 12, 14, /**/ 0, 6, 8, 2, 4, 10, 12, 14, // + 2, 6, 8, 0, 4, 10, 12, 14, /**/ 0, 2, 6, 8, 4, 10, 12, 14, // + 4, 6, 8, 0, 2, 10, 12, 14, /**/ 0, 4, 6, 8, 2, 10, 12, 14, // + 2, 4, 6, 8, 0, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // + 10, 0, 2, 4, 6, 8, 12, 14, /**/ 0, 10, 2, 4, 6, 8, 12, 14, // + 2, 10, 0, 4, 6, 8, 12, 14, /**/ 0, 2, 10, 4, 6, 8, 12, 14, // + 4, 10, 0, 2, 6, 8, 12, 14, /**/ 0, 4, 10, 2, 6, 8, 12, 14, // + 2, 4, 10, 0, 6, 8, 12, 14, /**/ 0, 2, 4, 10, 6, 8, 12, 14, // + 6, 10, 0, 2, 4, 8, 12, 14, /**/ 0, 6, 10, 2, 4, 8, 12, 14, // + 2, 6, 10, 0, 4, 8, 12, 14, /**/ 0, 2, 6, 10, 4, 8, 12, 14, // + 4, 6, 10, 0, 2, 8, 12, 14, /**/ 0, 4, 6, 10, 2, 8, 12, 14, // + 2, 4, 6, 10, 0, 8, 12, 14, /**/ 0, 2, 4, 6, 10, 8, 12, 14, // + 8, 10, 0, 2, 4, 6, 12, 14, /**/ 0, 8, 10, 2, 4, 6, 12, 14, // + 2, 8, 10, 0, 4, 6, 12, 14, /**/ 0, 2, 8, 10, 4, 6, 12, 14, // + 4, 8, 10, 0, 2, 6, 12, 14, /**/ 0, 4, 8, 10, 2, 6, 12, 14, // + 2, 4, 8, 10, 0, 6, 12, 14, /**/ 0, 2, 4, 8, 10, 6, 12, 14, // + 6, 8, 10, 0, 2, 4, 12, 14, /**/ 0, 6, 8, 10, 2, 4, 12, 14, // + 2, 6, 8, 10, 0, 4, 12, 14, /**/ 0, 2, 6, 8, 10, 4, 12, 
14, // + 4, 6, 8, 10, 0, 2, 12, 14, /**/ 0, 4, 6, 8, 10, 2, 12, 14, // + 2, 4, 6, 8, 10, 0, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // + 12, 0, 2, 4, 6, 8, 10, 14, /**/ 0, 12, 2, 4, 6, 8, 10, 14, // + 2, 12, 0, 4, 6, 8, 10, 14, /**/ 0, 2, 12, 4, 6, 8, 10, 14, // + 4, 12, 0, 2, 6, 8, 10, 14, /**/ 0, 4, 12, 2, 6, 8, 10, 14, // + 2, 4, 12, 0, 6, 8, 10, 14, /**/ 0, 2, 4, 12, 6, 8, 10, 14, // + 6, 12, 0, 2, 4, 8, 10, 14, /**/ 0, 6, 12, 2, 4, 8, 10, 14, // + 2, 6, 12, 0, 4, 8, 10, 14, /**/ 0, 2, 6, 12, 4, 8, 10, 14, // + 4, 6, 12, 0, 2, 8, 10, 14, /**/ 0, 4, 6, 12, 2, 8, 10, 14, // + 2, 4, 6, 12, 0, 8, 10, 14, /**/ 0, 2, 4, 6, 12, 8, 10, 14, // + 8, 12, 0, 2, 4, 6, 10, 14, /**/ 0, 8, 12, 2, 4, 6, 10, 14, // + 2, 8, 12, 0, 4, 6, 10, 14, /**/ 0, 2, 8, 12, 4, 6, 10, 14, // + 4, 8, 12, 0, 2, 6, 10, 14, /**/ 0, 4, 8, 12, 2, 6, 10, 14, // + 2, 4, 8, 12, 0, 6, 10, 14, /**/ 0, 2, 4, 8, 12, 6, 10, 14, // + 6, 8, 12, 0, 2, 4, 10, 14, /**/ 0, 6, 8, 12, 2, 4, 10, 14, // + 2, 6, 8, 12, 0, 4, 10, 14, /**/ 0, 2, 6, 8, 12, 4, 10, 14, // + 4, 6, 8, 12, 0, 2, 10, 14, /**/ 0, 4, 6, 8, 12, 2, 10, 14, // + 2, 4, 6, 8, 12, 0, 10, 14, /**/ 0, 2, 4, 6, 8, 12, 10, 14, // + 10, 12, 0, 2, 4, 6, 8, 14, /**/ 0, 10, 12, 2, 4, 6, 8, 14, // + 2, 10, 12, 0, 4, 6, 8, 14, /**/ 0, 2, 10, 12, 4, 6, 8, 14, // + 4, 10, 12, 0, 2, 6, 8, 14, /**/ 0, 4, 10, 12, 2, 6, 8, 14, // + 2, 4, 10, 12, 0, 6, 8, 14, /**/ 0, 2, 4, 10, 12, 6, 8, 14, // + 6, 10, 12, 0, 2, 4, 8, 14, /**/ 0, 6, 10, 12, 2, 4, 8, 14, // + 2, 6, 10, 12, 0, 4, 8, 14, /**/ 0, 2, 6, 10, 12, 4, 8, 14, // + 4, 6, 10, 12, 0, 2, 8, 14, /**/ 0, 4, 6, 10, 12, 2, 8, 14, // + 2, 4, 6, 10, 12, 0, 8, 14, /**/ 0, 2, 4, 6, 10, 12, 8, 14, // + 8, 10, 12, 0, 2, 4, 6, 14, /**/ 0, 8, 10, 12, 2, 4, 6, 14, // + 2, 8, 10, 12, 0, 4, 6, 14, /**/ 0, 2, 8, 10, 12, 4, 6, 14, // + 4, 8, 10, 12, 0, 2, 6, 14, /**/ 0, 4, 8, 10, 12, 2, 6, 14, // + 2, 4, 8, 10, 12, 0, 6, 14, /**/ 0, 2, 4, 8, 10, 12, 6, 14, // + 6, 8, 10, 12, 0, 2, 4, 14, /**/ 0, 6, 8, 10, 12, 2, 4, 14, // + 2, 6, 
8, 10, 12, 0, 4, 14, /**/ 0, 2, 6, 8, 10, 12, 4, 14, // + 4, 6, 8, 10, 12, 0, 2, 14, /**/ 0, 4, 6, 8, 10, 12, 2, 14, // + 2, 4, 6, 8, 10, 12, 0, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // + 14, 0, 2, 4, 6, 8, 10, 12, /**/ 0, 14, 2, 4, 6, 8, 10, 12, // + 2, 14, 0, 4, 6, 8, 10, 12, /**/ 0, 2, 14, 4, 6, 8, 10, 12, // + 4, 14, 0, 2, 6, 8, 10, 12, /**/ 0, 4, 14, 2, 6, 8, 10, 12, // + 2, 4, 14, 0, 6, 8, 10, 12, /**/ 0, 2, 4, 14, 6, 8, 10, 12, // + 6, 14, 0, 2, 4, 8, 10, 12, /**/ 0, 6, 14, 2, 4, 8, 10, 12, // + 2, 6, 14, 0, 4, 8, 10, 12, /**/ 0, 2, 6, 14, 4, 8, 10, 12, // + 4, 6, 14, 0, 2, 8, 10, 12, /**/ 0, 4, 6, 14, 2, 8, 10, 12, // + 2, 4, 6, 14, 0, 8, 10, 12, /**/ 0, 2, 4, 6, 14, 8, 10, 12, // + 8, 14, 0, 2, 4, 6, 10, 12, /**/ 0, 8, 14, 2, 4, 6, 10, 12, // + 2, 8, 14, 0, 4, 6, 10, 12, /**/ 0, 2, 8, 14, 4, 6, 10, 12, // + 4, 8, 14, 0, 2, 6, 10, 12, /**/ 0, 4, 8, 14, 2, 6, 10, 12, // + 2, 4, 8, 14, 0, 6, 10, 12, /**/ 0, 2, 4, 8, 14, 6, 10, 12, // + 6, 8, 14, 0, 2, 4, 10, 12, /**/ 0, 6, 8, 14, 2, 4, 10, 12, // + 2, 6, 8, 14, 0, 4, 10, 12, /**/ 0, 2, 6, 8, 14, 4, 10, 12, // + 4, 6, 8, 14, 0, 2, 10, 12, /**/ 0, 4, 6, 8, 14, 2, 10, 12, // + 2, 4, 6, 8, 14, 0, 10, 12, /**/ 0, 2, 4, 6, 8, 14, 10, 12, // + 10, 14, 0, 2, 4, 6, 8, 12, /**/ 0, 10, 14, 2, 4, 6, 8, 12, // + 2, 10, 14, 0, 4, 6, 8, 12, /**/ 0, 2, 10, 14, 4, 6, 8, 12, // + 4, 10, 14, 0, 2, 6, 8, 12, /**/ 0, 4, 10, 14, 2, 6, 8, 12, // + 2, 4, 10, 14, 0, 6, 8, 12, /**/ 0, 2, 4, 10, 14, 6, 8, 12, // + 6, 10, 14, 0, 2, 4, 8, 12, /**/ 0, 6, 10, 14, 2, 4, 8, 12, // + 2, 6, 10, 14, 0, 4, 8, 12, /**/ 0, 2, 6, 10, 14, 4, 8, 12, // + 4, 6, 10, 14, 0, 2, 8, 12, /**/ 0, 4, 6, 10, 14, 2, 8, 12, // + 2, 4, 6, 10, 14, 0, 8, 12, /**/ 0, 2, 4, 6, 10, 14, 8, 12, // + 8, 10, 14, 0, 2, 4, 6, 12, /**/ 0, 8, 10, 14, 2, 4, 6, 12, // + 2, 8, 10, 14, 0, 4, 6, 12, /**/ 0, 2, 8, 10, 14, 4, 6, 12, // + 4, 8, 10, 14, 0, 2, 6, 12, /**/ 0, 4, 8, 10, 14, 2, 6, 12, // + 2, 4, 8, 10, 14, 0, 6, 12, /**/ 0, 2, 4, 8, 10, 14, 6, 12, // + 6, 8, 10, 14, 0, 2, 
4, 12, /**/ 0, 6, 8, 10, 14, 2, 4, 12, // + 2, 6, 8, 10, 14, 0, 4, 12, /**/ 0, 2, 6, 8, 10, 14, 4, 12, // + 4, 6, 8, 10, 14, 0, 2, 12, /**/ 0, 4, 6, 8, 10, 14, 2, 12, // + 2, 4, 6, 8, 10, 14, 0, 12, /**/ 0, 2, 4, 6, 8, 10, 14, 12, // + 12, 14, 0, 2, 4, 6, 8, 10, /**/ 0, 12, 14, 2, 4, 6, 8, 10, // + 2, 12, 14, 0, 4, 6, 8, 10, /**/ 0, 2, 12, 14, 4, 6, 8, 10, // + 4, 12, 14, 0, 2, 6, 8, 10, /**/ 0, 4, 12, 14, 2, 6, 8, 10, // + 2, 4, 12, 14, 0, 6, 8, 10, /**/ 0, 2, 4, 12, 14, 6, 8, 10, // + 6, 12, 14, 0, 2, 4, 8, 10, /**/ 0, 6, 12, 14, 2, 4, 8, 10, // + 2, 6, 12, 14, 0, 4, 8, 10, /**/ 0, 2, 6, 12, 14, 4, 8, 10, // + 4, 6, 12, 14, 0, 2, 8, 10, /**/ 0, 4, 6, 12, 14, 2, 8, 10, // + 2, 4, 6, 12, 14, 0, 8, 10, /**/ 0, 2, 4, 6, 12, 14, 8, 10, // + 8, 12, 14, 0, 2, 4, 6, 10, /**/ 0, 8, 12, 14, 2, 4, 6, 10, // + 2, 8, 12, 14, 0, 4, 6, 10, /**/ 0, 2, 8, 12, 14, 4, 6, 10, // + 4, 8, 12, 14, 0, 2, 6, 10, /**/ 0, 4, 8, 12, 14, 2, 6, 10, // + 2, 4, 8, 12, 14, 0, 6, 10, /**/ 0, 2, 4, 8, 12, 14, 6, 10, // + 6, 8, 12, 14, 0, 2, 4, 10, /**/ 0, 6, 8, 12, 14, 2, 4, 10, // + 2, 6, 8, 12, 14, 0, 4, 10, /**/ 0, 2, 6, 8, 12, 14, 4, 10, // + 4, 6, 8, 12, 14, 0, 2, 10, /**/ 0, 4, 6, 8, 12, 14, 2, 10, // + 2, 4, 6, 8, 12, 14, 0, 10, /**/ 0, 2, 4, 6, 8, 12, 14, 10, // + 10, 12, 14, 0, 2, 4, 6, 8, /**/ 0, 10, 12, 14, 2, 4, 6, 8, // + 2, 10, 12, 14, 0, 4, 6, 8, /**/ 0, 2, 10, 12, 14, 4, 6, 8, // + 4, 10, 12, 14, 0, 2, 6, 8, /**/ 0, 4, 10, 12, 14, 2, 6, 8, // + 2, 4, 10, 12, 14, 0, 6, 8, /**/ 0, 2, 4, 10, 12, 14, 6, 8, // + 6, 10, 12, 14, 0, 2, 4, 8, /**/ 0, 6, 10, 12, 14, 2, 4, 8, // + 2, 6, 10, 12, 14, 0, 4, 8, /**/ 0, 2, 6, 10, 12, 14, 4, 8, // + 4, 6, 10, 12, 14, 0, 2, 8, /**/ 0, 4, 6, 10, 12, 14, 2, 8, // + 2, 4, 6, 10, 12, 14, 0, 8, /**/ 0, 2, 4, 6, 10, 12, 14, 8, // + 8, 10, 12, 14, 0, 2, 4, 6, /**/ 0, 8, 10, 12, 14, 2, 4, 6, // + 2, 8, 10, 12, 14, 0, 4, 6, /**/ 0, 2, 8, 10, 12, 14, 4, 6, // + 4, 8, 10, 12, 14, 0, 2, 6, /**/ 0, 4, 8, 10, 12, 14, 2, 6, // + 2, 4, 8, 10, 12, 14, 0, 6, /**/ 0, 
2, 4, 8, 10, 12, 14, 6, // + 6, 8, 10, 12, 14, 0, 2, 4, /**/ 0, 6, 8, 10, 12, 14, 2, 4, // + 2, 6, 8, 10, 12, 14, 0, 4, /**/ 0, 2, 6, 8, 10, 12, 14, 4, // + 4, 6, 8, 10, 12, 14, 0, 2, /**/ 0, 4, 6, 8, 10, 12, 14, 2, // + 2, 4, 6, 8, 10, 12, 14, 0, /**/ 0, 2, 4, 6, 8, 10, 12, 14}; + + const VFromD<decltype(d8t)> byte_idx{Load(d8, table + mask_bits * 8).raw}; + const VFromD<decltype(du)> pairs = ZipLower(byte_idx, byte_idx); + constexpr uint16_t kPairIndexIncrement = + HWY_IS_LITTLE_ENDIAN ? 0x0100 : 0x0001; + + return BitCast(d, pairs + Set(du, kPairIndexIncrement)); +} + +template <class D, HWY_IF_T_SIZE_D(D, 2)> +HWY_INLINE VFromD<D> IndicesFromNotBits128(D d, uint64_t mask_bits) { + HWY_DASSERT(mask_bits < 256); + const Rebind<uint8_t, decltype(d)> d8; + const Twice<decltype(d8)> d8t; + const RebindToUnsigned<decltype(d)> du; + + // To reduce cache footprint, store lane indices and convert to byte indices + // (2*lane + 0..1), with the doubling baked into the table. It's not clear + // that the additional cost of unpacking nibbles is worthwhile. 
+ alignas(16) static constexpr uint8_t table[2048] = { + // PrintCompressNot16x8Tables + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 14, 0, // + 0, 4, 6, 8, 10, 12, 14, 2, /**/ 4, 6, 8, 10, 12, 14, 0, 2, // + 0, 2, 6, 8, 10, 12, 14, 4, /**/ 2, 6, 8, 10, 12, 14, 0, 4, // + 0, 6, 8, 10, 12, 14, 2, 4, /**/ 6, 8, 10, 12, 14, 0, 2, 4, // + 0, 2, 4, 8, 10, 12, 14, 6, /**/ 2, 4, 8, 10, 12, 14, 0, 6, // + 0, 4, 8, 10, 12, 14, 2, 6, /**/ 4, 8, 10, 12, 14, 0, 2, 6, // + 0, 2, 8, 10, 12, 14, 4, 6, /**/ 2, 8, 10, 12, 14, 0, 4, 6, // + 0, 8, 10, 12, 14, 2, 4, 6, /**/ 8, 10, 12, 14, 0, 2, 4, 6, // + 0, 2, 4, 6, 10, 12, 14, 8, /**/ 2, 4, 6, 10, 12, 14, 0, 8, // + 0, 4, 6, 10, 12, 14, 2, 8, /**/ 4, 6, 10, 12, 14, 0, 2, 8, // + 0, 2, 6, 10, 12, 14, 4, 8, /**/ 2, 6, 10, 12, 14, 0, 4, 8, // + 0, 6, 10, 12, 14, 2, 4, 8, /**/ 6, 10, 12, 14, 0, 2, 4, 8, // + 0, 2, 4, 10, 12, 14, 6, 8, /**/ 2, 4, 10, 12, 14, 0, 6, 8, // + 0, 4, 10, 12, 14, 2, 6, 8, /**/ 4, 10, 12, 14, 0, 2, 6, 8, // + 0, 2, 10, 12, 14, 4, 6, 8, /**/ 2, 10, 12, 14, 0, 4, 6, 8, // + 0, 10, 12, 14, 2, 4, 6, 8, /**/ 10, 12, 14, 0, 2, 4, 6, 8, // + 0, 2, 4, 6, 8, 12, 14, 10, /**/ 2, 4, 6, 8, 12, 14, 0, 10, // + 0, 4, 6, 8, 12, 14, 2, 10, /**/ 4, 6, 8, 12, 14, 0, 2, 10, // + 0, 2, 6, 8, 12, 14, 4, 10, /**/ 2, 6, 8, 12, 14, 0, 4, 10, // + 0, 6, 8, 12, 14, 2, 4, 10, /**/ 6, 8, 12, 14, 0, 2, 4, 10, // + 0, 2, 4, 8, 12, 14, 6, 10, /**/ 2, 4, 8, 12, 14, 0, 6, 10, // + 0, 4, 8, 12, 14, 2, 6, 10, /**/ 4, 8, 12, 14, 0, 2, 6, 10, // + 0, 2, 8, 12, 14, 4, 6, 10, /**/ 2, 8, 12, 14, 0, 4, 6, 10, // + 0, 8, 12, 14, 2, 4, 6, 10, /**/ 8, 12, 14, 0, 2, 4, 6, 10, // + 0, 2, 4, 6, 12, 14, 8, 10, /**/ 2, 4, 6, 12, 14, 0, 8, 10, // + 0, 4, 6, 12, 14, 2, 8, 10, /**/ 4, 6, 12, 14, 0, 2, 8, 10, // + 0, 2, 6, 12, 14, 4, 8, 10, /**/ 2, 6, 12, 14, 0, 4, 8, 10, // + 0, 6, 12, 14, 2, 4, 8, 10, /**/ 6, 12, 14, 0, 2, 4, 8, 10, // + 0, 2, 4, 12, 14, 6, 8, 10, /**/ 2, 4, 12, 14, 0, 6, 8, 10, // + 0, 4, 12, 14, 2, 6, 8, 10, /**/ 4, 12, 14, 0, 2, 6, 8, 
10, // + 0, 2, 12, 14, 4, 6, 8, 10, /**/ 2, 12, 14, 0, 4, 6, 8, 10, // + 0, 12, 14, 2, 4, 6, 8, 10, /**/ 12, 14, 0, 2, 4, 6, 8, 10, // + 0, 2, 4, 6, 8, 10, 14, 12, /**/ 2, 4, 6, 8, 10, 14, 0, 12, // + 0, 4, 6, 8, 10, 14, 2, 12, /**/ 4, 6, 8, 10, 14, 0, 2, 12, // + 0, 2, 6, 8, 10, 14, 4, 12, /**/ 2, 6, 8, 10, 14, 0, 4, 12, // + 0, 6, 8, 10, 14, 2, 4, 12, /**/ 6, 8, 10, 14, 0, 2, 4, 12, // + 0, 2, 4, 8, 10, 14, 6, 12, /**/ 2, 4, 8, 10, 14, 0, 6, 12, // + 0, 4, 8, 10, 14, 2, 6, 12, /**/ 4, 8, 10, 14, 0, 2, 6, 12, // + 0, 2, 8, 10, 14, 4, 6, 12, /**/ 2, 8, 10, 14, 0, 4, 6, 12, // + 0, 8, 10, 14, 2, 4, 6, 12, /**/ 8, 10, 14, 0, 2, 4, 6, 12, // + 0, 2, 4, 6, 10, 14, 8, 12, /**/ 2, 4, 6, 10, 14, 0, 8, 12, // + 0, 4, 6, 10, 14, 2, 8, 12, /**/ 4, 6, 10, 14, 0, 2, 8, 12, // + 0, 2, 6, 10, 14, 4, 8, 12, /**/ 2, 6, 10, 14, 0, 4, 8, 12, // + 0, 6, 10, 14, 2, 4, 8, 12, /**/ 6, 10, 14, 0, 2, 4, 8, 12, // + 0, 2, 4, 10, 14, 6, 8, 12, /**/ 2, 4, 10, 14, 0, 6, 8, 12, // + 0, 4, 10, 14, 2, 6, 8, 12, /**/ 4, 10, 14, 0, 2, 6, 8, 12, // + 0, 2, 10, 14, 4, 6, 8, 12, /**/ 2, 10, 14, 0, 4, 6, 8, 12, // + 0, 10, 14, 2, 4, 6, 8, 12, /**/ 10, 14, 0, 2, 4, 6, 8, 12, // + 0, 2, 4, 6, 8, 14, 10, 12, /**/ 2, 4, 6, 8, 14, 0, 10, 12, // + 0, 4, 6, 8, 14, 2, 10, 12, /**/ 4, 6, 8, 14, 0, 2, 10, 12, // + 0, 2, 6, 8, 14, 4, 10, 12, /**/ 2, 6, 8, 14, 0, 4, 10, 12, // + 0, 6, 8, 14, 2, 4, 10, 12, /**/ 6, 8, 14, 0, 2, 4, 10, 12, // + 0, 2, 4, 8, 14, 6, 10, 12, /**/ 2, 4, 8, 14, 0, 6, 10, 12, // + 0, 4, 8, 14, 2, 6, 10, 12, /**/ 4, 8, 14, 0, 2, 6, 10, 12, // + 0, 2, 8, 14, 4, 6, 10, 12, /**/ 2, 8, 14, 0, 4, 6, 10, 12, // + 0, 8, 14, 2, 4, 6, 10, 12, /**/ 8, 14, 0, 2, 4, 6, 10, 12, // + 0, 2, 4, 6, 14, 8, 10, 12, /**/ 2, 4, 6, 14, 0, 8, 10, 12, // + 0, 4, 6, 14, 2, 8, 10, 12, /**/ 4, 6, 14, 0, 2, 8, 10, 12, // + 0, 2, 6, 14, 4, 8, 10, 12, /**/ 2, 6, 14, 0, 4, 8, 10, 12, // + 0, 6, 14, 2, 4, 8, 10, 12, /**/ 6, 14, 0, 2, 4, 8, 10, 12, // + 0, 2, 4, 14, 6, 8, 10, 12, /**/ 2, 4, 14, 0, 6, 8, 10, 12, // + 0, 4, 
14, 2, 6, 8, 10, 12, /**/ 4, 14, 0, 2, 6, 8, 10, 12, // + 0, 2, 14, 4, 6, 8, 10, 12, /**/ 2, 14, 0, 4, 6, 8, 10, 12, // + 0, 14, 2, 4, 6, 8, 10, 12, /**/ 14, 0, 2, 4, 6, 8, 10, 12, // + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 0, 14, // + 0, 4, 6, 8, 10, 12, 2, 14, /**/ 4, 6, 8, 10, 12, 0, 2, 14, // + 0, 2, 6, 8, 10, 12, 4, 14, /**/ 2, 6, 8, 10, 12, 0, 4, 14, // + 0, 6, 8, 10, 12, 2, 4, 14, /**/ 6, 8, 10, 12, 0, 2, 4, 14, // + 0, 2, 4, 8, 10, 12, 6, 14, /**/ 2, 4, 8, 10, 12, 0, 6, 14, // + 0, 4, 8, 10, 12, 2, 6, 14, /**/ 4, 8, 10, 12, 0, 2, 6, 14, // + 0, 2, 8, 10, 12, 4, 6, 14, /**/ 2, 8, 10, 12, 0, 4, 6, 14, // + 0, 8, 10, 12, 2, 4, 6, 14, /**/ 8, 10, 12, 0, 2, 4, 6, 14, // + 0, 2, 4, 6, 10, 12, 8, 14, /**/ 2, 4, 6, 10, 12, 0, 8, 14, // + 0, 4, 6, 10, 12, 2, 8, 14, /**/ 4, 6, 10, 12, 0, 2, 8, 14, // + 0, 2, 6, 10, 12, 4, 8, 14, /**/ 2, 6, 10, 12, 0, 4, 8, 14, // + 0, 6, 10, 12, 2, 4, 8, 14, /**/ 6, 10, 12, 0, 2, 4, 8, 14, // + 0, 2, 4, 10, 12, 6, 8, 14, /**/ 2, 4, 10, 12, 0, 6, 8, 14, // + 0, 4, 10, 12, 2, 6, 8, 14, /**/ 4, 10, 12, 0, 2, 6, 8, 14, // + 0, 2, 10, 12, 4, 6, 8, 14, /**/ 2, 10, 12, 0, 4, 6, 8, 14, // + 0, 10, 12, 2, 4, 6, 8, 14, /**/ 10, 12, 0, 2, 4, 6, 8, 14, // + 0, 2, 4, 6, 8, 12, 10, 14, /**/ 2, 4, 6, 8, 12, 0, 10, 14, // + 0, 4, 6, 8, 12, 2, 10, 14, /**/ 4, 6, 8, 12, 0, 2, 10, 14, // + 0, 2, 6, 8, 12, 4, 10, 14, /**/ 2, 6, 8, 12, 0, 4, 10, 14, // + 0, 6, 8, 12, 2, 4, 10, 14, /**/ 6, 8, 12, 0, 2, 4, 10, 14, // + 0, 2, 4, 8, 12, 6, 10, 14, /**/ 2, 4, 8, 12, 0, 6, 10, 14, // + 0, 4, 8, 12, 2, 6, 10, 14, /**/ 4, 8, 12, 0, 2, 6, 10, 14, // + 0, 2, 8, 12, 4, 6, 10, 14, /**/ 2, 8, 12, 0, 4, 6, 10, 14, // + 0, 8, 12, 2, 4, 6, 10, 14, /**/ 8, 12, 0, 2, 4, 6, 10, 14, // + 0, 2, 4, 6, 12, 8, 10, 14, /**/ 2, 4, 6, 12, 0, 8, 10, 14, // + 0, 4, 6, 12, 2, 8, 10, 14, /**/ 4, 6, 12, 0, 2, 8, 10, 14, // + 0, 2, 6, 12, 4, 8, 10, 14, /**/ 2, 6, 12, 0, 4, 8, 10, 14, // + 0, 6, 12, 2, 4, 8, 10, 14, /**/ 6, 12, 0, 2, 4, 8, 10, 14, // + 0, 2, 4, 12, 6, 8, 
10, 14, /**/ 2, 4, 12, 0, 6, 8, 10, 14, // + 0, 4, 12, 2, 6, 8, 10, 14, /**/ 4, 12, 0, 2, 6, 8, 10, 14, // + 0, 2, 12, 4, 6, 8, 10, 14, /**/ 2, 12, 0, 4, 6, 8, 10, 14, // + 0, 12, 2, 4, 6, 8, 10, 14, /**/ 12, 0, 2, 4, 6, 8, 10, 14, // + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 0, 12, 14, // + 0, 4, 6, 8, 10, 2, 12, 14, /**/ 4, 6, 8, 10, 0, 2, 12, 14, // + 0, 2, 6, 8, 10, 4, 12, 14, /**/ 2, 6, 8, 10, 0, 4, 12, 14, // + 0, 6, 8, 10, 2, 4, 12, 14, /**/ 6, 8, 10, 0, 2, 4, 12, 14, // + 0, 2, 4, 8, 10, 6, 12, 14, /**/ 2, 4, 8, 10, 0, 6, 12, 14, // + 0, 4, 8, 10, 2, 6, 12, 14, /**/ 4, 8, 10, 0, 2, 6, 12, 14, // + 0, 2, 8, 10, 4, 6, 12, 14, /**/ 2, 8, 10, 0, 4, 6, 12, 14, // + 0, 8, 10, 2, 4, 6, 12, 14, /**/ 8, 10, 0, 2, 4, 6, 12, 14, // + 0, 2, 4, 6, 10, 8, 12, 14, /**/ 2, 4, 6, 10, 0, 8, 12, 14, // + 0, 4, 6, 10, 2, 8, 12, 14, /**/ 4, 6, 10, 0, 2, 8, 12, 14, // + 0, 2, 6, 10, 4, 8, 12, 14, /**/ 2, 6, 10, 0, 4, 8, 12, 14, // + 0, 6, 10, 2, 4, 8, 12, 14, /**/ 6, 10, 0, 2, 4, 8, 12, 14, // + 0, 2, 4, 10, 6, 8, 12, 14, /**/ 2, 4, 10, 0, 6, 8, 12, 14, // + 0, 4, 10, 2, 6, 8, 12, 14, /**/ 4, 10, 0, 2, 6, 8, 12, 14, // + 0, 2, 10, 4, 6, 8, 12, 14, /**/ 2, 10, 0, 4, 6, 8, 12, 14, // + 0, 10, 2, 4, 6, 8, 12, 14, /**/ 10, 0, 2, 4, 6, 8, 12, 14, // + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 0, 10, 12, 14, // + 0, 4, 6, 8, 2, 10, 12, 14, /**/ 4, 6, 8, 0, 2, 10, 12, 14, // + 0, 2, 6, 8, 4, 10, 12, 14, /**/ 2, 6, 8, 0, 4, 10, 12, 14, // + 0, 6, 8, 2, 4, 10, 12, 14, /**/ 6, 8, 0, 2, 4, 10, 12, 14, // + 0, 2, 4, 8, 6, 10, 12, 14, /**/ 2, 4, 8, 0, 6, 10, 12, 14, // + 0, 4, 8, 2, 6, 10, 12, 14, /**/ 4, 8, 0, 2, 6, 10, 12, 14, // + 0, 2, 8, 4, 6, 10, 12, 14, /**/ 2, 8, 0, 4, 6, 10, 12, 14, // + 0, 8, 2, 4, 6, 10, 12, 14, /**/ 8, 0, 2, 4, 6, 10, 12, 14, // + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 0, 8, 10, 12, 14, // + 0, 4, 6, 2, 8, 10, 12, 14, /**/ 4, 6, 0, 2, 8, 10, 12, 14, // + 0, 2, 6, 4, 8, 10, 12, 14, /**/ 2, 6, 0, 4, 8, 10, 12, 14, // + 0, 6, 2, 4, 8, 10, 12, 14, /**/ 6, 
0, 2, 4, 8, 10, 12, 14, // + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 0, 6, 8, 10, 12, 14, // + 0, 4, 2, 6, 8, 10, 12, 14, /**/ 4, 0, 2, 6, 8, 10, 12, 14, // + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 0, 4, 6, 8, 10, 12, 14, // + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14}; + + const VFromD<decltype(d8t)> byte_idx{Load(d8, table + mask_bits * 8).raw}; + const VFromD<decltype(du)> pairs = ZipLower(byte_idx, byte_idx); + constexpr uint16_t kPairIndexIncrement = + HWY_IS_LITTLE_ENDIAN ? 0x0100 : 0x0001; + + return BitCast(d, pairs + Set(du, kPairIndexIncrement)); +} + +template <class D, HWY_IF_T_SIZE_D(D, 4)> +HWY_INLINE VFromD<D> IndicesFromBits128(D d, uint64_t mask_bits) { + HWY_DASSERT(mask_bits < 16); + + // There are only 4 lanes, so we can afford to load the index vector directly. + alignas(16) static constexpr uint8_t u8_indices[256] = { + // PrintCompress32x4Tables + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // + 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, // + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // + 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, // + 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, // + 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, // + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // + 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, // + 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, // + 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, // + 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, // + 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, // + 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, // + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, // + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + + const Repartition<uint8_t, decltype(d)> d8; + return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); +} + +template <class D, HWY_IF_T_SIZE_D(D, 4)> 
+HWY_INLINE VFromD<D> IndicesFromNotBits128(D d, uint64_t mask_bits) { + HWY_DASSERT(mask_bits < 16); + + // There are only 4 lanes, so we can afford to load the index vector directly. + alignas(16) static constexpr uint8_t u8_indices[256] = { + // PrintCompressNot32x4Tables + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, + 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, + 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, + 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1, + 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, + 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3, + 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15}; + + const Repartition<uint8_t, decltype(d)> d8; + return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); +} + +template <class D, HWY_IF_T_SIZE_D(D, 8)> +HWY_INLINE VFromD<D> IndicesFromBits128(D d, uint64_t mask_bits) { + HWY_DASSERT(mask_bits < 4); + + // There are only 2 lanes, so we can afford to load the index vector directly. 
+ alignas(16) static constexpr uint8_t u8_indices[64] = { + // PrintCompress64x2Tables + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + + const Repartition<uint8_t, decltype(d)> d8; + return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); +} + +template <class D, HWY_IF_T_SIZE_D(D, 8)> +HWY_INLINE VFromD<D> IndicesFromNotBits128(D d, uint64_t mask_bits) { + HWY_DASSERT(mask_bits < 4); + + // There are only 2 lanes, so we can afford to load the index vector directly. + alignas(16) static constexpr uint8_t u8_indices[64] = { + // PrintCompressNot64x2Tables + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + + const Repartition<uint8_t, decltype(d)> d8; + return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); +} + +template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)> +HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v, uint64_t mask_bits) { + const DFromV<decltype(v)> d; + const RebindToUnsigned<decltype(d)> du; + + HWY_DASSERT(mask_bits < (1ull << N)); + const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits)); + return BitCast(d, TableLookupBytes(BitCast(du, v), indices)); +} + +template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)> +HWY_API Vec128<T, N> CompressNotBits(Vec128<T, N> v, uint64_t mask_bits) { + const DFromV<decltype(v)> d; + const RebindToUnsigned<decltype(d)> du; + + HWY_DASSERT(mask_bits < (1ull << N)); + const auto indices = BitCast(du, detail::IndicesFromNotBits128(d, mask_bits)); + return BitCast(d, TableLookupBytes(BitCast(du, v), indices)); +} + +} // namespace detail + +// Single lane: no-op +template <typename T> +HWY_API Vec128<T, 1> Compress(Vec128<T, 1> v, 
Mask128<T, 1> /*m*/) { + return v; +} + +// Two lanes: conditional swap +template <typename T, HWY_IF_T_SIZE(T, 8)> +HWY_API Vec128<T> Compress(Vec128<T> v, Mask128<T> mask) { + // If mask[1] = 1 and mask[0] = 0, then swap both halves, else keep. + const Full128<T> d; + const Vec128<T> m = VecFromMask(d, mask); + const Vec128<T> maskL = DupEven(m); + const Vec128<T> maskH = DupOdd(m); + const Vec128<T> swap = AndNot(maskL, maskH); + return IfVecThenElse(swap, Shuffle01(v), v); +} + +// General case, 2 or 4 bytes +template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))> +HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) { + return detail::CompressBits(v, detail::BitsFromMask(mask)); +} + +// ------------------------------ CompressNot + +// Single lane: no-op +template <typename T> +HWY_API Vec128<T, 1> CompressNot(Vec128<T, 1> v, Mask128<T, 1> /*m*/) { + return v; +} + +// Two lanes: conditional swap +template <typename T, HWY_IF_T_SIZE(T, 8)> +HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) { + // If mask[1] = 0 and mask[0] = 1, then swap both halves, else keep. + const Full128<T> d; + const Vec128<T> m = VecFromMask(d, mask); + const Vec128<T> maskL = DupEven(m); + const Vec128<T> maskH = DupOdd(m); + const Vec128<T> swap = AndNot(maskH, maskL); + return IfVecThenElse(swap, Shuffle01(v), v); +} + +// General case, 2 or 4 bytes +template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))> +HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) { + // For partial vectors, we cannot pull the Not() into the table because + // BitsFromMask clears the upper bits. 
+ if (N < 16 / sizeof(T)) { + return detail::CompressBits(v, detail::BitsFromMask(Not(mask))); + } + return detail::CompressNotBits(v, detail::BitsFromMask(mask)); +} + +// ------------------------------ CompressBlocksNot +HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v, + Mask128<uint64_t> /* m */) { + return v; +} + +template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)> +HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v, + const uint8_t* HWY_RESTRICT bits) { + // As there are at most 8 lanes in v if sizeof(TFromD<D>) > 1, simply + // convert bits[0] to a uint64_t + uint64_t mask_bits = bits[0]; + if (N < 8) { + mask_bits &= (1ull << N) - 1; + } + + return detail::CompressBits(v, mask_bits); +} + +// ------------------------------ CompressStore, CompressBitsStore + +template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)> +HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> m, D d, + TFromD<D>* HWY_RESTRICT unaligned) { + const RebindToUnsigned<decltype(d)> du; + + const uint64_t mask_bits = detail::BitsFromMask(m); + HWY_DASSERT(mask_bits < (1ull << MaxLanes(d))); + const size_t count = PopCount(mask_bits); + + const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits)); + const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices)); + StoreU(compressed, d, unaligned); + return count; +} + +template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)> +HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d, + TFromD<D>* HWY_RESTRICT unaligned) { + const RebindToUnsigned<decltype(d)> du; + + const uint64_t mask_bits = detail::BitsFromMask(m); + HWY_DASSERT(mask_bits < (1ull << MaxLanes(d))); + const size_t count = PopCount(mask_bits); + + const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits)); + const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices)); + BlendedStore(compressed, FirstN(d, count), d, unaligned); + return count; +} + +template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)> +HWY_API 
size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits, + D d, TFromD<D>* HWY_RESTRICT unaligned) { + const RebindToUnsigned<decltype(d)> du; + + // As there are at most 8 lanes in v if sizeof(TFromD<D>) > 1, simply + // convert bits[0] to a uint64_t + uint64_t mask_bits = bits[0]; + constexpr size_t kN = MaxLanes(d); + if (kN < 8) { + mask_bits &= (1ull << kN) - 1; + } + const size_t count = PopCount(mask_bits); + + const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits)); + const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices)); + StoreU(compressed, d, unaligned); + + return count; +} + +// ------------------------------ StoreInterleaved2/3/4 + +// HWY_NATIVE_LOAD_STORE_INTERLEAVED not set, hence defined in +// generic_ops-inl.h. + +// ------------------------------ Reductions + +namespace detail { + +// N=1 for any T: no-op +template <typename T> +HWY_INLINE Vec128<T, 1> SumOfLanes(Vec128<T, 1> v) { + return v; +} +template <typename T> +HWY_INLINE Vec128<T, 1> MinOfLanes(Vec128<T, 1> v) { + return v; +} +template <typename T> +HWY_INLINE Vec128<T, 1> MaxOfLanes(Vec128<T, 1> v) { + return v; +} + +// u32/i32/f32: + +// N=2 +template <typename T, HWY_IF_T_SIZE(T, 4)> +HWY_INLINE Vec128<T, 2> SumOfLanes(Vec128<T, 2> v10) { + // NOTE: AltivecVsum2sws cannot be used here as AltivecVsum2sws + // computes the signed saturated sum of the lanes. + return v10 + Shuffle2301(v10); +} +template <typename T, HWY_IF_T_SIZE(T, 4)> +HWY_INLINE Vec128<T, 2> MinOfLanes(Vec128<T, 2> v10) { + return Min(v10, Shuffle2301(v10)); +} +template <typename T, HWY_IF_T_SIZE(T, 4)> +HWY_INLINE Vec128<T, 2> MaxOfLanes(Vec128<T, 2> v10) { + return Max(v10, Shuffle2301(v10)); +} + +// N=4 (full) +template <typename T, HWY_IF_T_SIZE(T, 4)> +HWY_INLINE Vec128<T> SumOfLanes(Vec128<T> v3210) { + // NOTE: AltivecVsumsws cannot be used here as AltivecVsumsws + // computes the signed saturated sum of the lanes. 
+ const Vec128<T> v1032 = Shuffle1032(v3210); + const Vec128<T> v31_20_31_20 = v3210 + v1032; + const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20); + return v20_31_20_31 + v31_20_31_20; +} +template <typename T, HWY_IF_T_SIZE(T, 4)> +HWY_INLINE Vec128<T> MinOfLanes(Vec128<T> v3210) { + const Vec128<T> v1032 = Shuffle1032(v3210); + const Vec128<T> v31_20_31_20 = Min(v3210, v1032); + const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20); + return Min(v20_31_20_31, v31_20_31_20); +} +template <typename T, HWY_IF_T_SIZE(T, 4)> +HWY_INLINE Vec128<T> MaxOfLanes(Vec128<T> v3210) { + const Vec128<T> v1032 = Shuffle1032(v3210); + const Vec128<T> v31_20_31_20 = Max(v3210, v1032); + const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20); + return Max(v20_31_20_31, v31_20_31_20); +} + +// u64/i64/f64: + +// N=2 (full) +template <typename T, HWY_IF_T_SIZE(T, 8)> +HWY_INLINE Vec128<T> SumOfLanes(Vec128<T> v10) { + const Vec128<T> v01 = Shuffle01(v10); + return v10 + v01; +} +template <typename T, HWY_IF_T_SIZE(T, 8)> +HWY_INLINE Vec128<T> MinOfLanes(Vec128<T> v10) { + const Vec128<T> v01 = Shuffle01(v10); + return Min(v10, v01); +} +template <typename T, HWY_IF_T_SIZE(T, 8)> +HWY_INLINE Vec128<T> MaxOfLanes(Vec128<T> v10) { + const Vec128<T> v01 = Shuffle01(v10); + return Max(v10, v01); +} + +// Casts nominally int32_t result to D. 
+template <class D> +HWY_INLINE VFromD<D> AltivecVsum4shs(D d, __vector signed short a, + __vector signed int b) { + const Repartition<int32_t, D> di32; +#ifdef __OPTIMIZE__ + if (IsConstantRawAltivecVect(a) && IsConstantRawAltivecVect(b)) { + const int64_t sum0 = static_cast<int64_t>(a[0]) + + static_cast<int64_t>(a[1]) + + static_cast<int64_t>(b[0]); + const int64_t sum1 = static_cast<int64_t>(a[2]) + + static_cast<int64_t>(a[3]) + + static_cast<int64_t>(b[1]); + const int64_t sum2 = static_cast<int64_t>(a[4]) + + static_cast<int64_t>(a[5]) + + static_cast<int64_t>(b[2]); + const int64_t sum3 = static_cast<int64_t>(a[6]) + + static_cast<int64_t>(a[7]) + + static_cast<int64_t>(b[3]); + const int32_t sign0 = static_cast<int32_t>(sum0 >> 63); + const int32_t sign1 = static_cast<int32_t>(sum1 >> 63); + const int32_t sign2 = static_cast<int32_t>(sum2 >> 63); + const int32_t sign3 = static_cast<int32_t>(sum3 >> 63); + using Raw = typename detail::Raw128<int32_t>::type; + return BitCast( + d, + VFromD<decltype(di32)>{Raw{ + (sign0 == (sum0 >> 31)) ? static_cast<int32_t>(sum0) + : static_cast<int32_t>(sign0 ^ 0x7FFFFFFF), + (sign1 == (sum1 >> 31)) ? static_cast<int32_t>(sum1) + : static_cast<int32_t>(sign1 ^ 0x7FFFFFFF), + (sign2 == (sum2 >> 31)) ? static_cast<int32_t>(sum2) + : static_cast<int32_t>(sign2 ^ 0x7FFFFFFF), + (sign3 == (sum3 >> 31)) + ? static_cast<int32_t>(sum3) + : static_cast<int32_t>(sign3 ^ 0x7FFFFFFF)}}); + } else // NOLINT +#endif + { + return BitCast(d, VFromD<decltype(di32)>{vec_vsum4shs(a, b)}); + } +} + +// Casts nominally int32_t result to D. 
+template <class D> +HWY_INLINE VFromD<D> AltivecVsum4sbs(D d, __vector signed char a, + __vector signed int b) { + const Repartition<int32_t, D> di32; +#ifdef __OPTIMIZE__ + if (IsConstantRawAltivecVect(a) && IsConstantRawAltivecVect(b)) { + const int64_t sum0 = + static_cast<int64_t>(a[0]) + static_cast<int64_t>(a[1]) + + static_cast<int64_t>(a[2]) + static_cast<int64_t>(a[3]) + + static_cast<int64_t>(b[0]); + const int64_t sum1 = + static_cast<int64_t>(a[4]) + static_cast<int64_t>(a[5]) + + static_cast<int64_t>(a[6]) + static_cast<int64_t>(a[7]) + + static_cast<int64_t>(b[1]); + const int64_t sum2 = + static_cast<int64_t>(a[8]) + static_cast<int64_t>(a[9]) + + static_cast<int64_t>(a[10]) + static_cast<int64_t>(a[11]) + + static_cast<int64_t>(b[2]); + const int64_t sum3 = + static_cast<int64_t>(a[12]) + static_cast<int64_t>(a[13]) + + static_cast<int64_t>(a[14]) + static_cast<int64_t>(a[15]) + + static_cast<int64_t>(b[3]); + const int32_t sign0 = static_cast<int32_t>(sum0 >> 63); + const int32_t sign1 = static_cast<int32_t>(sum1 >> 63); + const int32_t sign2 = static_cast<int32_t>(sum2 >> 63); + const int32_t sign3 = static_cast<int32_t>(sum3 >> 63); + using Raw = typename detail::Raw128<int32_t>::type; + return BitCast( + d, + VFromD<decltype(di32)>{Raw{ + (sign0 == (sum0 >> 31)) ? static_cast<int32_t>(sum0) + : static_cast<int32_t>(sign0 ^ 0x7FFFFFFF), + (sign1 == (sum1 >> 31)) ? static_cast<int32_t>(sum1) + : static_cast<int32_t>(sign1 ^ 0x7FFFFFFF), + (sign2 == (sum2 >> 31)) ? static_cast<int32_t>(sum2) + : static_cast<int32_t>(sign2 ^ 0x7FFFFFFF), + (sign3 == (sum3 >> 31)) + ? static_cast<int32_t>(sum3) + : static_cast<int32_t>(sign3 ^ 0x7FFFFFFF)}}); + } else // NOLINT +#endif + { + return BitCast(d, VFromD<decltype(di32)>{vec_vsum4sbs(a, b)}); + } +} + +// Casts nominally int32_t result to D. 
+template <class D> +HWY_INLINE VFromD<D> AltivecVsumsws(D d, __vector signed int a, + __vector signed int b) { + const Repartition<int32_t, D> di32; +#ifdef __OPTIMIZE__ + constexpr int kDestLaneOffset = HWY_IS_LITTLE_ENDIAN ? 0 : 3; + if (IsConstantRawAltivecVect(a) && __builtin_constant_p(b[kDestLaneOffset])) { + const int64_t sum = + static_cast<int64_t>(a[0]) + static_cast<int64_t>(a[1]) + + static_cast<int64_t>(a[2]) + static_cast<int64_t>(a[3]) + + static_cast<int64_t>(b[kDestLaneOffset]); + const int32_t sign = static_cast<int32_t>(sum >> 63); +#if HWY_IS_LITTLE_ENDIAN + return BitCast( + d, VFromD<decltype(di32)>{(__vector signed int){ + (sign == (sum >> 31)) ? static_cast<int32_t>(sum) + : static_cast<int32_t>(sign ^ 0x7FFFFFFF), + 0, 0, 0}}); +#else + return BitCast(d, VFromD<decltype(di32)>{(__vector signed int){ + 0, 0, 0, + (sign == (sum >> 31)) + ? static_cast<int32_t>(sum) + : static_cast<int32_t>(sign ^ 0x7FFFFFFF)}}); +#endif + } else // NOLINT +#endif + { + __vector signed int sum; + + // Inline assembly is used for vsumsws to avoid unnecessary shuffling + // on little-endian PowerPC targets as the result of the vsumsws + // instruction will already be in the correct lanes on little-endian + // PowerPC targets. + __asm__("vsumsws %0,%1,%2" : "=v"(sum) : "v"(a), "v"(b)); + + return BitCast(d, VFromD<decltype(di32)>{sum}); + } +} + +template <size_t N> +HWY_INLINE Vec128<int32_t, N / 2> AltivecU16SumsOf2(Vec128<uint16_t, N> v) { + const RebindToSigned<DFromV<decltype(v)>> di16; + const RepartitionToWide<decltype(di16)> di32; + return AltivecVsum4shs(di32, Xor(BitCast(di16, v), Set(di16, -32768)).raw, + Set(di32, 65536).raw); +} + +HWY_API Vec32<uint16_t> SumOfLanes(Vec32<uint16_t> v) { + constexpr int kSumLaneIdx = HWY_IS_BIG_ENDIAN; + DFromV<decltype(v)> du16; + return Broadcast<kSumLaneIdx>(BitCast(du16, AltivecU16SumsOf2(v))); +} + +HWY_API Vec64<uint16_t> SumOfLanes(Vec64<uint16_t> v) { + constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 
0 : 3; + const Full64<uint16_t> du16; + const auto zero = Zero(Full128<int32_t>()); + return Broadcast<kSumLaneIdx>( + AltivecVsum2sws(du16, AltivecU16SumsOf2(v).raw, zero.raw)); +} + +HWY_API Vec128<uint16_t> SumOfLanes(Vec128<uint16_t> v) { + constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 7; + const Full128<uint16_t> du16; + const auto zero = Zero(Full128<int32_t>()); + return Broadcast<kSumLaneIdx>( + AltivecVsumsws(du16, AltivecU16SumsOf2(v).raw, zero.raw)); +} + +HWY_API Vec32<int16_t> SumOfLanes(Vec32<int16_t> v) { + constexpr int kSumLaneIdx = HWY_IS_BIG_ENDIAN; + const Full32<int16_t> di16; + const auto zero = Zero(Full128<int32_t>()); + return Broadcast<kSumLaneIdx>(AltivecVsum4shs(di16, v.raw, zero.raw)); +} + +HWY_API Vec64<int16_t> SumOfLanes(Vec64<int16_t> v) { + constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3; + const Full128<int32_t> di32; + const Full64<int16_t> di16; + const auto zero = Zero(di32); + return Broadcast<kSumLaneIdx>(AltivecVsum2sws( + di16, AltivecVsum4shs(di32, v.raw, zero.raw).raw, zero.raw)); +} + +HWY_API Vec128<int16_t> SumOfLanes(Vec128<int16_t> v) { + constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 7; + const Full128<int16_t> di16; + const Full128<int32_t> di32; + const auto zero = Zero(di32); + return Broadcast<kSumLaneIdx>(AltivecVsumsws( + di16, AltivecVsum4shs(di32, v.raw, zero.raw).raw, zero.raw)); +} + +// u8, N=2, N=4, N=8, N=16: +HWY_API Vec16<uint8_t> SumOfLanes(Vec16<uint8_t> v) { + constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3; + const Full16<uint8_t> du8; + const Full16<uint16_t> du16; + const Twice<decltype(du8)> dt_u8; + const Twice<decltype(du16)> dt_u16; + const Full128<uint32_t> du32; + return LowerHalf(Broadcast<kSumLaneIdx>(AltivecVsum4ubs( + dt_u8, BitCast(dt_u8, Combine(dt_u16, Zero(du16), BitCast(du16, v))).raw, + Zero(du32).raw))); +} + +HWY_API Vec32<uint8_t> SumOfLanes(Vec32<uint8_t> v) { + constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 
0 : 3; + const Full128<uint32_t> du32; + const Full32<uint8_t> du8; + return Broadcast<kSumLaneIdx>(AltivecVsum4ubs(du8, v.raw, Zero(du32).raw)); +} + +HWY_API Vec64<uint8_t> SumOfLanes(Vec64<uint8_t> v) { + constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 7; + const Full64<uint8_t> du8; + return Broadcast<kSumLaneIdx>(BitCast(du8, SumsOf8(v))); +} + +HWY_API Vec128<uint8_t> SumOfLanes(Vec128<uint8_t> v) { + constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 15; + + const Full128<uint32_t> du32; + const RebindToSigned<decltype(du32)> di32; + const Full128<uint8_t> du8; + const Vec128<uint32_t> zero = Zero(du32); + return Broadcast<kSumLaneIdx>( + AltivecVsumsws(du8, AltivecVsum4ubs(di32, v.raw, zero.raw).raw, + BitCast(di32, zero).raw)); +} + +HWY_API Vec16<int8_t> SumOfLanes(Vec16<int8_t> v) { + constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3; + + const Full128<uint16_t> du16; + const Repartition<int32_t, decltype(du16)> di32; + const Repartition<int8_t, decltype(du16)> di8; + const Vec128<int8_t> zzvv = BitCast( + di8, InterleaveLower(BitCast(du16, Vec128<int8_t>{v.raw}), Zero(du16))); + return Vec16<int8_t>{ + Broadcast<kSumLaneIdx>(AltivecVsum4sbs(di8, zzvv.raw, Zero(di32).raw)) + .raw}; +} + +HWY_API Vec32<int8_t> SumOfLanes(Vec32<int8_t> v) { + constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3; + const Full32<int8_t> di8; + const Vec128<int32_t> zero = Zero(Full128<int32_t>()); + return Broadcast<kSumLaneIdx>(AltivecVsum4sbs(di8, v.raw, zero.raw)); +} + +HWY_API Vec64<int8_t> SumOfLanes(Vec64<int8_t> v) { + constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 7; + const Full128<int32_t> di32; + const Vec128<int32_t> zero = Zero(di32); + const Full64<int8_t> di8; + return Broadcast<kSumLaneIdx>(AltivecVsum2sws( + di8, AltivecVsum4sbs(di32, v.raw, zero.raw).raw, zero.raw)); +} + +HWY_API Vec128<int8_t> SumOfLanes(Vec128<int8_t> v) { + constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 
0 : 15; + const Full128<int8_t> di8; + const Full128<int32_t> di32; + const Vec128<int32_t> zero = Zero(di32); + return Broadcast<kSumLaneIdx>(AltivecVsumsws( + di8, AltivecVsum4sbs(di32, v.raw, zero.raw).raw, zero.raw)); +} + +template <size_t N, HWY_IF_V_SIZE_GT(uint8_t, N, 4)> +HWY_API Vec128<uint8_t, N> MaxOfLanes(Vec128<uint8_t, N> v) { + const DFromV<decltype(v)> d; + const RepartitionToWide<decltype(d)> d16; + const RepartitionToWide<decltype(d16)> d32; + Vec128<uint8_t, N> vm = Max(v, Reverse2(d, v)); + vm = Max(vm, BitCast(d, Reverse2(d16, BitCast(d16, vm)))); + vm = Max(vm, BitCast(d, Reverse2(d32, BitCast(d32, vm)))); + if (N > 8) { + const RepartitionToWide<decltype(d32)> d64; + vm = Max(vm, BitCast(d, Reverse2(d64, BitCast(d64, vm)))); + } + return vm; +} + +template <size_t N, HWY_IF_V_SIZE_GT(uint8_t, N, 4)> +HWY_API Vec128<uint8_t, N> MinOfLanes(Vec128<uint8_t, N> v) { + const DFromV<decltype(v)> d; + const RepartitionToWide<decltype(d)> d16; + const RepartitionToWide<decltype(d16)> d32; + Vec128<uint8_t, N> vm = Min(v, Reverse2(d, v)); + vm = Min(vm, BitCast(d, Reverse2(d16, BitCast(d16, vm)))); + vm = Min(vm, BitCast(d, Reverse2(d32, BitCast(d32, vm)))); + if (N > 8) { + const RepartitionToWide<decltype(d32)> d64; + vm = Min(vm, BitCast(d, Reverse2(d64, BitCast(d64, vm)))); + } + return vm; +} + +template <size_t N, HWY_IF_V_SIZE_GT(int8_t, N, 4)> +HWY_API Vec128<int8_t, N> MaxOfLanes(Vec128<int8_t, N> v) { + const DFromV<decltype(v)> d; + const RepartitionToWide<decltype(d)> d16; + const RepartitionToWide<decltype(d16)> d32; + Vec128<int8_t, N> vm = Max(v, Reverse2(d, v)); + vm = Max(vm, BitCast(d, Reverse2(d16, BitCast(d16, vm)))); + vm = Max(vm, BitCast(d, Reverse2(d32, BitCast(d32, vm)))); + if (N > 8) { + const RepartitionToWide<decltype(d32)> d64; + vm = Max(vm, BitCast(d, Reverse2(d64, BitCast(d64, vm)))); + } + return vm; +} + +template <size_t N, HWY_IF_V_SIZE_GT(int8_t, N, 4)> +HWY_API Vec128<int8_t, N> MinOfLanes(Vec128<int8_t, N> v) { 
+ const DFromV<decltype(v)> d; + const RepartitionToWide<decltype(d)> d16; + const RepartitionToWide<decltype(d16)> d32; + Vec128<int8_t, N> vm = Min(v, Reverse2(d, v)); + vm = Min(vm, BitCast(d, Reverse2(d16, BitCast(d16, vm)))); + vm = Min(vm, BitCast(d, Reverse2(d32, BitCast(d32, vm)))); + if (N > 8) { + const RepartitionToWide<decltype(d32)> d64; + vm = Min(vm, BitCast(d, Reverse2(d64, BitCast(d64, vm)))); + } + return vm; +} + +template <size_t N, HWY_IF_V_SIZE_GT(uint16_t, N, 2)> +HWY_API Vec128<uint16_t, N> MinOfLanes(Vec128<uint16_t, N> v) { + const Simd<uint16_t, N, 0> d; + const RepartitionToWide<decltype(d)> d32; +#if HWY_IS_LITTLE_ENDIAN + const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF)); + const auto odd = ShiftRight<16>(BitCast(d32, v)); +#else + const auto even = ShiftRight<16>(BitCast(d32, v)); + const auto odd = And(BitCast(d32, v), Set(d32, 0xFFFF)); +#endif + const auto min = MinOfLanes(Min(even, odd)); + // Also broadcast into odd lanes on little-endian and into even lanes + // on big-endian + return Vec128<uint16_t, N>{vec_pack(min.raw, min.raw)}; +} +template <size_t N, HWY_IF_V_SIZE_GT(int16_t, N, 2)> +HWY_API Vec128<int16_t, N> MinOfLanes(Vec128<int16_t, N> v) { + const Simd<int16_t, N, 0> d; + const RepartitionToWide<decltype(d)> d32; + // Sign-extend +#if HWY_IS_LITTLE_ENDIAN + const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v))); + const auto odd = ShiftRight<16>(BitCast(d32, v)); +#else + const auto even = ShiftRight<16>(BitCast(d32, v)); + const auto odd = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v))); +#endif + const auto min = MinOfLanes(Min(even, odd)); + // Also broadcast into odd lanes on little-endian and into even lanes + // on big-endian + return Vec128<int16_t, N>{vec_pack(min.raw, min.raw)}; +} + +template <size_t N, HWY_IF_V_SIZE_GT(uint16_t, N, 2)> +HWY_API Vec128<uint16_t, N> MaxOfLanes(Vec128<uint16_t, N> v) { + const Simd<uint16_t, N, 0> d; + const RepartitionToWide<decltype(d)> d32; +#if 
HWY_IS_LITTLE_ENDIAN + const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF)); + const auto odd = ShiftRight<16>(BitCast(d32, v)); +#else + const auto even = ShiftRight<16>(BitCast(d32, v)); + const auto odd = And(BitCast(d32, v), Set(d32, 0xFFFF)); +#endif + const auto max = MaxOfLanes(Max(even, odd)); + // Also broadcast into odd lanes. + return Vec128<uint16_t, N>{vec_pack(max.raw, max.raw)}; +} +template <size_t N, HWY_IF_V_SIZE_GT(int16_t, N, 2)> +HWY_API Vec128<int16_t, N> MaxOfLanes(Vec128<int16_t, N> v) { + const Simd<int16_t, N, 0> d; + const RepartitionToWide<decltype(d)> d32; + // Sign-extend +#if HWY_IS_LITTLE_ENDIAN + const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v))); + const auto odd = ShiftRight<16>(BitCast(d32, v)); +#else + const auto even = ShiftRight<16>(BitCast(d32, v)); + const auto odd = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v))); +#endif + const auto max = MaxOfLanes(Max(even, odd)); + // Also broadcast into odd lanes on little-endian and into even lanes + // on big-endian + return Vec128<int16_t, N>{vec_pack(max.raw, max.raw)}; +} + +} // namespace detail + +// Supported for u/i/f 32/64. Returns the same value in each lane. +template <class D> +HWY_API VFromD<D> SumOfLanes(D /* tag */, VFromD<D> v) { + return detail::SumOfLanes(v); +} +template <class D> +HWY_API TFromD<D> ReduceSum(D /* tag */, VFromD<D> v) { + return GetLane(detail::SumOfLanes(v)); +} +template <class D> +HWY_API VFromD<D> MinOfLanes(D /* tag */, VFromD<D> v) { + return detail::MinOfLanes(v); +} +template <class D> +HWY_API VFromD<D> MaxOfLanes(D /* tag */, VFromD<D> v) { + return detail::MaxOfLanes(v); +} + +// ------------------------------ Lt128 + +namespace detail { + +// Returns vector-mask for Lt128. 
+template <class D, class V = VFromD<D>> +HWY_INLINE V Lt128Vec(D d, V a, V b) { + static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64"); +#if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__) + (void)d; + using VU64 = __vector unsigned long long; + using VU128 = __vector unsigned __int128; +#if HWY_IS_LITTLE_ENDIAN + const VU128 a_u128 = reinterpret_cast<VU128>(a.raw); + const VU128 b_u128 = reinterpret_cast<VU128>(b.raw); +#else + // NOTE: Need to swap the halves of both a and b on big-endian targets + // as the upper 64 bits of a and b are in lane 1 and the lower 64 bits + // of a and b are in lane 0 whereas the vec_cmplt operation below expects + // the upper 64 bits in lane 0 and the lower 64 bits in lane 1 on + // big-endian PPC targets. + const VU128 a_u128 = reinterpret_cast<VU128>(vec_sld(a.raw, a.raw, 8)); + const VU128 b_u128 = reinterpret_cast<VU128>(vec_sld(b.raw, b.raw, 8)); +#endif + return V{reinterpret_cast<VU64>(vec_cmplt(a_u128, b_u128))}; +#else // !HWY_PPC_HAVE_10 + // Truth table of Eq and Lt for Hi and Lo u64. + // (removed lines with (=H && cH) or (=L && cL) - cannot both be true) + // =H =L cH cL | out = cH | (=H & cL) + // 0 0 0 0 | 0 + // 0 0 0 1 | 0 + // 0 0 1 0 | 1 + // 0 0 1 1 | 1 + // 0 1 0 0 | 0 + // 0 1 0 1 | 0 + // 0 1 1 0 | 1 + // 1 0 0 0 | 0 + // 1 0 0 1 | 1 + // 1 1 0 0 | 0 + const auto eqHL = Eq(a, b); + const V ltHL = VecFromMask(d, Lt(a, b)); + const V ltLX = ShiftLeftLanes<1>(ltHL); + const V vecHx = IfThenElse(eqHL, ltLX, ltHL); + return InterleaveUpper(d, vecHx, vecHx); +#endif +} + +// Returns vector-mask for Eq128. 
// Compares each pair of u64 lanes of `a` and `b` as one 128-bit value and
// returns the result as a vector. The fallback ANDs each lane's equality with
// that of its Reverse2 partner, so both lanes of a pair get the same result.
template <class D, class V = VFromD<D>>
HWY_INLINE V Eq128Vec(D d, V a, V b) {
  static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
#if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__)
  (void)d;
  using VU64 = __vector unsigned long long;
  using VU128 = __vector unsigned __int128;
  return V{reinterpret_cast<VU64>(vec_cmpeq(reinterpret_cast<VU128>(a.raw),
                                            reinterpret_cast<VU128>(b.raw)))};
#else
  const auto eqHL = VecFromMask(d, Eq(a, b));
  const auto eqLH = Reverse2(d, eqHL);
  return And(eqHL, eqLH);
#endif
}

// Counterpart of Eq128Vec for inequality: the fallback ORs each lane's
// "not equal" result with that of its Reverse2 partner.
template <class D, class V = VFromD<D>>
HWY_INLINE V Ne128Vec(D d, V a, V b) {
  static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
#if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__)
  (void)d;
  using VU64 = __vector unsigned long long;
  using VU128 = __vector unsigned __int128;
  return V{reinterpret_cast<VU64>(vec_cmpne(reinterpret_cast<VU128>(a.raw),
                                            reinterpret_cast<VU128>(b.raw)))};
#else
  const auto neHL = VecFromMask(d, Ne(a, b));
  const auto neLH = Reverse2(d, neHL);
  return Or(neHL, neLH);
#endif
}

// The *UpperVec helpers compare lane-wise and then duplicate the upper lane's
// result into both lanes via InterleaveUpper.
template <class D, class V = VFromD<D>>
HWY_INLINE V Lt128UpperVec(D d, V a, V b) {
  const V ltHL = VecFromMask(d, Lt(a, b));
  return InterleaveUpper(d, ltHL, ltHL);
}

template <class D, class V = VFromD<D>>
HWY_INLINE V Eq128UpperVec(D d, V a, V b) {
  const V eqHL = VecFromMask(d, Eq(a, b));
  return InterleaveUpper(d, eqHL, eqHL);
}

template <class D, class V = VFromD<D>>
HWY_INLINE V Ne128UpperVec(D d, V a, V b) {
  const V neHL = VecFromMask(d, Ne(a, b));
  return InterleaveUpper(d, neHL, neHL);
}

}  // namespace detail

// Public mask-returning wrappers: each converts the vector produced by the
// corresponding detail::*Vec helper with MaskFromVec.
template <class D, class V = VFromD<D>>
HWY_API MFromD<D> Lt128(D d, V a, V b) {
  return MaskFromVec(detail::Lt128Vec(d, a, b));
}

template <class D, class V = VFromD<D>>
HWY_API MFromD<D> Eq128(D d, V a, V b) {
  return MaskFromVec(detail::Eq128Vec(d, a, b));
}

template <class D, class V = VFromD<D>>
HWY_API MFromD<D> Ne128(D d, V a, V b) {
  return MaskFromVec(detail::Ne128Vec(d, a, b));
}

template <class D, class V = VFromD<D>>
HWY_API MFromD<D> Lt128Upper(D d, V a, V b) {
  return MaskFromVec(detail::Lt128UpperVec(d, a, b));
}

template <class D, class V = VFromD<D>>
HWY_API MFromD<D> Eq128Upper(D d, V a, V b) {
  return MaskFromVec(detail::Eq128UpperVec(d, a, b));
}

template <class D, class V = VFromD<D>>
HWY_API MFromD<D> Ne128Upper(D d, V a, V b) {
  return MaskFromVec(detail::Ne128UpperVec(d, a, b));
}

// ------------------------------ Min128, Max128 (Lt128)

// Avoids the extra MaskFromVec in Lt128.
// Selects `a` where a < b, otherwise `b`.
template <class D, class V = VFromD<D>>
HWY_API V Min128(D d, const V a, const V b) {
  return IfVecThenElse(detail::Lt128Vec(d, a, b), a, b);
}

// Selects `a` where b < a (operands swapped in the comparison), otherwise `b`.
template <class D, class V = VFromD<D>>
HWY_API V Max128(D d, const V a, const V b) {
  return IfVecThenElse(detail::Lt128Vec(d, b, a), a, b);
}

// Same selection as Min128/Max128, but driven by Lt128UpperVec.
template <class D, class V = VFromD<D>>
HWY_API V Min128Upper(D d, const V a, const V b) {
  return IfVecThenElse(detail::Lt128UpperVec(d, a, b), a, b);
}

template <class D, class V = VFromD<D>>
HWY_API V Max128Upper(D d, const V a, const V b) {
  return IfVecThenElse(detail::Lt128UpperVec(d, b, a), a, b);
}

// -------------------- LeadingZeroCount, TrailingZeroCount, HighestSetBitIndex

// Toggle (define if undefined, undefine if defined) the per-target flag.
#ifdef HWY_NATIVE_LEADING_ZERO_COUNT
#undef HWY_NATIVE_LEADING_ZERO_COUNT
#else
#define HWY_NATIVE_LEADING_ZERO_COUNT
#endif

// Per-lane count of leading zero bits via vec_cntlz.
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V LeadingZeroCount(V v) {
  return V{vec_cntlz(v.raw)};
}

// Per-lane (bits per lane - 1) - LeadingZeroCount(v).
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V HighestSetBitIndex(V v) {
  const DFromV<decltype(v)> d;
  using T = TFromD<decltype(d)>;
  return BitCast(d, Set(d, T{sizeof(T) * 8 - 1}) - LeadingZeroCount(v));
}

#if HWY_PPC_HAVE_9
// Uses the count-trailing-zeros intrinsic; GCC before 7 is given vec_vctz
// instead of vec_cnttz.
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V TrailingZeroCount(V v) {
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
  return V{vec_vctz(v.raw)};
#else
  return V{vec_cnttz(v.raw)};
#endif
}
#else
// Fallback built from HighestSetBitIndex.
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V TrailingZeroCount(V v) {
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;
  using TI = TFromD<decltype(di)>;

  const auto vi = BitCast(di, v);
  // v & -v keeps only the lowest set bit (zero stays zero).
  const auto lowest_bit = And(vi, Neg(vi));
  constexpr TI kNumOfBitsInT{sizeof(TI) * 8};
  const auto bit_idx = HighestSetBitIndex(lowest_bit);
  // Lanes whose bit_idx is negative are replaced by the lane width in bits.
  // NOTE(review): presumably that is exactly the v == 0 case (cntlz(0) equal
  // to the lane width gives bit_idx == -1) - confirm.
  return BitCast(d, IfThenElse(MaskFromVec(BroadcastSignBit(bit_idx)),
                               Set(di, kNumOfBitsInT), bit_idx));
}
#endif

#undef HWY_PPC_HAVE_9
#undef HWY_PPC_HAVE_10

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();
diff --git a/third_party/highway/hwy/ops/rvv-inl.h b/third_party/highway/hwy/ops/rvv-inl.h
new file mode 100644
index 0000000000..8babc1c629
--- /dev/null
+++ b/third_party/highway/hwy/ops/rvv-inl.h
@@ -0,0 +1,4229 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// RISC-V V vectors (length not known at compile time).
// External include guard in highway.h - see comment there.
#include <riscv_vector.h>

#include "hwy/ops/shared-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

// Maps a vector type V to its tag type. The primary template is empty.
template <class V>
struct DFromV_t {};  // specialized in macros
// Tag type for V, looked up after stripping const via RemoveConst.
template <class V>
using DFromV = typename DFromV_t<RemoveConst<V>>::type;

// Lane type of V, obtained through its tag.
template <class V>
using TFromV = TFromD<DFromV<V>>;

// Computes the MLEN value (the number in the vbool##MLEN##_t type name) for a
// tag with lane type T and power-of-two LMUL exponent kPow2.
template <typename T, size_t N, int kPow2>
constexpr size_t MLenFromD(Simd<T, N, kPow2> /* tag */) {
  // Returns divisor = type bits / LMUL. Folding *8 into the ScaleByPower
  // argument enables fractional LMUL < 1. Limit to 64 because that is the
  // largest value for which vbool##_t are defined.
  // NOTE(review): presumably ScaleByPower(8, kPow2) is 8 * 2^kPow2, so that
  // e.g. uint8_t with kPow2 = 0 gives 8 * 8 / 8 = 8 - confirm in shared-inl.h.
  return HWY_MIN(64, sizeof(T) * 8 * 8 / detail::ScaleByPower(8, kPow2));
}

// ================================================== MACROS

// Generate specializations and function definitions using X macros. Although
// harder to read and debug, writing everything manually is too bulky.

namespace detail {  // for code folding

// For all mask sizes MLEN: (1/Nth of a register, one bit per lane)
// The first three arguments are arbitrary SEW, LMUL, SHIFT such that
// SEW >> SHIFT = MLEN.
// Invokes X_MACRO(SEW, SHIFT, MLEN, NAME, OP) once per MLEN in 64..1.
#define HWY_RVV_FOREACH_B(X_MACRO, NAME, OP) \
  X_MACRO(64, 0, 64, NAME, OP) \
  X_MACRO(32, 0, 32, NAME, OP) \
  X_MACRO(16, 0, 16, NAME, OP) \
  X_MACRO(8, 0, 8, NAME, OP) \
  X_MACRO(8, 1, 4, NAME, OP) \
  X_MACRO(8, 2, 2, NAME, OP) \
  X_MACRO(8, 3, 1, NAME, OP)

// For given SEW, iterate over one of LMULS: _TRUNC, _EXT, _ALL. This allows
// reusing type lists such as HWY_RVV_FOREACH_U for _ALL (the usual case) or
// _EXT (for Combine). To achieve this, we HWY_CONCAT with the LMULS suffix.
//
// Precompute SEW/LMUL => MLEN to allow token-pasting the result. For the same
// reason, also pass the double-width and half SEW and LMUL (suffixed D and H,
// respectively). "__" means there is no corresponding LMUL (e.g. LMULD for m8).
// Args: BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP

// LMULS = _TRUNC: truncatable (not the smallest LMUL)
// Each list below omits the smallest LMUL that the matching _LE2 list has for
// the same SEW (mf8 for 8-bit, mf4 for 16-bit, mf2 for 32-bit, m1 for 64-bit),
// so LMULH is never "__".
#define HWY_RVV_FOREACH_08_TRUNC(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, mf4, mf2, mf8, -2, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, mf2, m1, mf4, -1, /*MLEN=*/16, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m1, m2, mf2, 0, /*MLEN=*/8, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m2, m4, m1, 1, /*MLEN=*/4, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m4, m8, m2, 2, /*MLEN=*/2, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m8, __, m4, 3, /*MLEN=*/1, NAME, OP)

#define HWY_RVV_FOREACH_16_TRUNC(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, mf2, m1, mf4, -1, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m1, m2, mf2, 0, /*MLEN=*/16, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m2, m4, m1, 1, /*MLEN=*/8, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m4, m8, m2, 2, /*MLEN=*/4, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m8, __, m4, 3, /*MLEN=*/2, NAME, OP)

#define HWY_RVV_FOREACH_32_TRUNC(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m1, m2, mf2, 0, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m2, m4, m1, 1, /*MLEN=*/16, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m4, m8, m2, 2, /*MLEN=*/8, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m8, __, m4, 3, /*MLEN=*/4, NAME, OP)

#define HWY_RVV_FOREACH_64_TRUNC(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m2, m4, m1, 1, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m4, m8, m2, 2, /*MLEN=*/16, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m8, __, m4, 3, /*MLEN=*/8, NAME, OP)

// LMULS = _DEMOTE: can demote from SEW*LMUL to SEWH*LMULH.
// Per-SEW lists of LMULs that can be demoted; every entry has a valid LMULH.
#define HWY_RVV_FOREACH_08_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, mf4, mf2, mf8, -2, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, mf2, m1, mf4, -1, /*MLEN=*/16, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m1, m2, mf2, 0, /*MLEN=*/8, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m2, m4, m1, 1, /*MLEN=*/4, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m4, m8, m2, 2, /*MLEN=*/2, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m8, __, m4, 3, /*MLEN=*/1, NAME, OP)

#define HWY_RVV_FOREACH_16_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, mf4, mf2, mf8, -2, /*MLEN=*/64, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, mf2, m1, mf4, -1, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m1, m2, mf2, 0, /*MLEN=*/16, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m2, m4, m1, 1, /*MLEN=*/8, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m4, m8, m2, 2, /*MLEN=*/4, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m8, __, m4, 3, /*MLEN=*/2, NAME, OP)

#define HWY_RVV_FOREACH_32_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, mf2, m1, mf4, -1, /*MLEN=*/64, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m1, m2, mf2, 0, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m2, m4, m1, 1, /*MLEN=*/16, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m4, m8, m2, 2, /*MLEN=*/8, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m8, __, m4, 3, /*MLEN=*/4, NAME, OP)

#define HWY_RVV_FOREACH_64_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m1, m2, mf2, 0, /*MLEN=*/64, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m2, m4, m1, 1, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m4, m8, m2, 2, /*MLEN=*/16, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m8, __, m4, 3, /*MLEN=*/8, NAME, OP)

// LMULS = _LE2: <= 2
// Each list starts at the smallest LMUL listed for that SEW and stops at m2.
#define HWY_RVV_FOREACH_08_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, mf8, mf4, __, -3, /*MLEN=*/64, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, mf4, mf2, mf8, -2, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, mf2, m1, mf4, -1, /*MLEN=*/16, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m1, m2, mf2, 0, /*MLEN=*/8, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m2, m4, m1, 1, /*MLEN=*/4, NAME, OP)

#define HWY_RVV_FOREACH_16_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, mf4, mf2, mf8, -2, /*MLEN=*/64, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, mf2, m1, mf4, -1, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m1, m2, mf2, 0, /*MLEN=*/16, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m2, m4, m1, 1, /*MLEN=*/8, NAME, OP)

#define HWY_RVV_FOREACH_32_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, mf2, m1, mf4, -1, /*MLEN=*/64, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m1, m2, mf2, 0, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m2, m4, m1, 1, /*MLEN=*/16, NAME, OP)

#define HWY_RVV_FOREACH_64_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m1, m2, mf2, 0, /*MLEN=*/64, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m2, m4, m1, 1, /*MLEN=*/32, NAME, OP)

// LMULS = _EXT: not the largest LMUL
// Built as the _LE2 list plus the m4 entry.
#define HWY_RVV_FOREACH_08_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m4, m8, m2, 2, /*MLEN=*/2, NAME, OP)

#define HWY_RVV_FOREACH_16_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m4, m8, m2, 2, /*MLEN=*/4, NAME, OP)

#define HWY_RVV_FOREACH_32_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m4, m8, m2, 2, /*MLEN=*/8, NAME, OP)

#define HWY_RVV_FOREACH_64_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m4, m8, m2, 2, /*MLEN=*/16, NAME, OP)

// LMULS =
// _ALL (2^MinPow2() <= LMUL <= 8)
// Built as the _EXT list plus the m8 entry (which has no LMULD, hence "__").
#define HWY_RVV_FOREACH_08_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m8, __, m4, 3, /*MLEN=*/1, NAME, OP)

#define HWY_RVV_FOREACH_16_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m8, __, m4, 3, /*MLEN=*/2, NAME, OP)

#define HWY_RVV_FOREACH_32_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m8, __, m4, 3, /*MLEN=*/4, NAME, OP)

#define HWY_RVV_FOREACH_64_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m8, __, m4, 3, /*MLEN=*/8, NAME, OP)

// 'Virtual' LMUL. This upholds the Highway guarantee that vectors are at least
// 128 bit and LowerHalf is defined whenever there are at least 2 lanes, even
// though RISC-V LMUL must be at least SEW/64 (notice that this rules out
// LMUL=1/2 for SEW=64). To bridge the gap, we add overloads for kPow2 equal to
// one less than should be supported, with all other parameters (vector type
// etc.) unchanged. For D with the lowest kPow2 ('virtual LMUL'), Lanes()
// returns half of what it usually would.
//
// Notice that we can only add overloads whenever there is a D argument: those
// are unique with respect to non-virtual-LMUL overloads because their kPow2
// template argument differs. Otherwise, there is no actual vuint64mf2_t, and
// defining another overload with the same LMUL would be an error. Thus we have
// a separate _VIRT category for HWY_RVV_FOREACH*, and the common case is
// _ALL_VIRT (meaning the regular LMUL plus the VIRT overloads), used in most
// functions that take a D.
// _VIRT lists: one entry per SEW that reuses the smallest real LMUL of that
// SEW but with SHIFT one lower. The 8-bit list is intentionally empty.
#define HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, mf4, mf2, mf8, -3, /*MLEN=*/64, NAME, OP)

#define HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, mf2, m1, mf4, -2, /*MLEN=*/64, NAME, OP)

#define HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m1, m2, mf2, -1, /*MLEN=*/64, NAME, OP)

// ALL + VIRT
#define HWY_RVV_FOREACH_08_ALL_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_16_ALL_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_32_ALL_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_64_ALL_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

// LE2 + VIRT
#define HWY_RVV_FOREACH_08_LE2_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_16_LE2_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_32_LE2_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_64_LE2_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

// EXT + VIRT
#define HWY_RVV_FOREACH_08_EXT_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_16_EXT_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_32_EXT_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_64_EXT_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

// DEMOTE + VIRT
#define HWY_RVV_FOREACH_08_DEMOTE_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_16_DEMOTE_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_32_DEMOTE_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_64_DEMOTE_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

// The per-type macros below select a per-SEW list by pasting the LMULS suffix
// (e.g. _ALL, _EXT_VIRT) onto HWY_RVV_FOREACH_<SEW> and supply BASE and CHAR.

// SEW for unsigned:
#define HWY_RVV_FOREACH_U08(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_08, LMULS)(X_MACRO, uint, u, NAME, OP)
#define HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_16, LMULS)(X_MACRO, uint, u, NAME, OP)
#define HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_32, LMULS)(X_MACRO, uint, u, NAME, OP)
#define HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_64, LMULS)(X_MACRO, uint, u, NAME, OP)

// SEW for signed:
#define HWY_RVV_FOREACH_I08(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_08, LMULS)(X_MACRO, int, i, NAME, OP)
#define HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_16, LMULS)(X_MACRO, int, i, NAME, OP)
#define HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_32, LMULS)(X_MACRO, int, i, NAME, OP)
#define HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_64, LMULS)(X_MACRO, int, i, NAME, OP)

// SEW for float:
// The f16 list expands to nothing unless HWY_HAVE_FLOAT16 is set.
#if HWY_HAVE_FLOAT16
#define HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_16, LMULS)(X_MACRO, float, f, NAME, OP)
#else
#define HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP, LMULS)
#endif
#define HWY_RVV_FOREACH_F32(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_32, LMULS)(X_MACRO, float, f, NAME, OP)
#define HWY_RVV_FOREACH_F64(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_64, LMULS)(X_MACRO, float, f, NAME, OP)

// Commonly used type/SEW groups:
#define HWY_RVV_FOREACH_UI08(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U08(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I08(X_MACRO, NAME, OP, LMULS)

#define HWY_RVV_FOREACH_UI16(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP, LMULS)

#define HWY_RVV_FOREACH_UI32(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP, LMULS)

#define HWY_RVV_FOREACH_UI64(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP, LMULS)

#define HWY_RVV_FOREACH_UI3264(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_UI32(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_UI64(X_MACRO, NAME, OP, LMULS)

#define HWY_RVV_FOREACH_U163264(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP, LMULS)

#define HWY_RVV_FOREACH_I163264(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP, LMULS)

#define HWY_RVV_FOREACH_UI163264(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U163264(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I163264(X_MACRO, NAME, OP, LMULS)

#define HWY_RVV_FOREACH_F3264(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_F32(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_F64(X_MACRO, NAME, OP, LMULS)

// For all combinations of SEW:
#define HWY_RVV_FOREACH_U(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U08(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP, LMULS)

#define HWY_RVV_FOREACH_I(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I08(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP, LMULS)

#define HWY_RVV_FOREACH_F(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_F3264(X_MACRO, NAME, OP, LMULS)

// Commonly used type categories:
#define HWY_RVV_FOREACH_UI(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I(X_MACRO, NAME, OP, LMULS)

#define HWY_RVV_FOREACH(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_F(X_MACRO, NAME, OP, LMULS)

// Assemble types for use in x-macros
// T: lane type name, D: tag, V: vector type name, M: mask type name.
#define HWY_RVV_T(BASE, SEW) BASE##SEW##_t
#define HWY_RVV_D(BASE, SEW, N, SHIFT) Simd<HWY_RVV_T(BASE, SEW), N, SHIFT>
#define HWY_RVV_V(BASE, SEW, LMUL) v##BASE##SEW##LMUL##_t
#define HWY_RVV_M(MLEN) vbool##MLEN##_t

}  // namespace detail

// Until we have full intrinsic support for fractional LMUL, mixed-precision
// code can use LMUL 1..8 (adequate unless they need many registers).
// Specializes DFromV_t for one vector type: `type` is ScalableTag<Lane, SHIFT>.
#define HWY_SPECIALIZE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                       MLEN, NAME, OP) \
  template <> \
  struct DFromV_t<HWY_RVV_V(BASE, SEW, LMUL)> { \
    using Lane = HWY_RVV_T(BASE, SEW); \
    using type = ScalableTag<Lane, SHIFT>; \
  };

HWY_RVV_FOREACH(HWY_SPECIALIZE, _, _, _ALL)
#undef HWY_SPECIALIZE

// ------------------------------ Lanes

// WARNING: we want to query VLMAX/sizeof(T), but this may actually change VL!
// Defines NAME(d) returning the lane count for tag `d`: vsetvlmax when N is
// the full HWY_LANES count, otherwise vsetvl with the scaled cap; the OP
// argument is not used by this macro. For the virtual-LMUL case the count is
// recomputed from __riscv_vlenb() instead of trusting vsetvl.
#define HWY_RVV_LANES(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                      MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API size_t NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d) { \
    constexpr size_t kFull = HWY_LANES(HWY_RVV_T(BASE, SEW)); \
    constexpr size_t kCap = detail::ScaleByPower(N, SHIFT); \
    /* If no cap, avoid generating a constant by using VLMAX. */ \
    size_t actual = N == kFull ? __riscv_vsetvlmax_e##SEW##LMUL() \
                               : __riscv_vsetvl_e##SEW##LMUL(kCap); \
    /* Common case of full vectors: avoid any extra instructions. */ \
    /* actual accounts for LMUL, so do not shift again. */ \
    if (d.Pow2() >= 0) return actual; \
    /* In case of virtual LMUL (intrinsics do not provide "uint16mf8_t") */ \
    /* vsetvl may or may not be correct, so do it ourselves. */ \
    if (detail::ScaleByPower(128 / SEW, SHIFT) == 1) { \
      actual = detail::ScaleByPower(HWY_MIN(N, __riscv_vlenb() / (SEW / 8)), \
                                    SHIFT); \
    } \
    return actual; \
  }

HWY_RVV_FOREACH(HWY_RVV_LANES, Lanes, setvlmax_e, _ALL_VIRT)
#undef HWY_RVV_LANES

// bfloat16_t tags report the same lane count as the uint16_t tag with the
// same N and kPow2.
template <size_t N, int kPow2>
HWY_API size_t Lanes(Simd<bfloat16_t, N, kPow2> /* tag*/) {
  return Lanes(Simd<uint16_t, N, kPow2>());
}

// ------------------------------ Common x-macros

// Last argument to most intrinsics.
// Use when the op has no d arg of its own,
// which means there is no user-specified cap.
// Expands to Lanes() of the uncapped ScalableTag of uintSEW_t at this SHIFT.
#define HWY_RVV_AVL(SEW, SHIFT) \
  Lanes(ScalableTag<HWY_RVV_T(uint, SEW), SHIFT>())

// vector = f(vector), e.g. Not
#define HWY_RVV_RETV_ARGV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                          SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
    return __riscv_v##OP##_v_##CHAR##SEW##LMUL(v, HWY_RVV_AVL(SEW, SHIFT)); \
  }

// vector = f(vector, scalar), e.g. detail::AddS
#define HWY_RVV_RETV_ARGVS(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                           SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
  NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_T(BASE, SEW) b) { \
    return __riscv_v##OP##_##CHAR##SEW##LMUL(a, b, HWY_RVV_AVL(SEW, SHIFT)); \
  }

// vector = f(vector, vector), e.g. Add
#define HWY_RVV_RETV_ARGVV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                           SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
  NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \
    return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(a, b, \
                                                HWY_RVV_AVL(SEW, SHIFT)); \
  }

// mask = f(mask)
// Passes ~0ull as the length argument rather than calling Lanes(); the SEW
// and SHIFT parameters are not used in the expansion.
#define HWY_RVV_RETM_ARGM(SEW, SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_M(MLEN) NAME(HWY_RVV_M(MLEN) m) { \
    return __riscv_vm##OP##_m_b##MLEN(m, ~0ull); \
  }

// ================================================== INIT

// ------------------------------ Set

// Defines NAME(d, arg): broadcasts the scalar `arg` using Lanes(d) as the
// length argument of the intrinsic.
#define HWY_RVV_SET(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                    MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
  NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_T(BASE, SEW) arg) { \
    return __riscv_v##OP##_##CHAR##SEW##LMUL(arg, Lanes(d)); \
  }

HWY_RVV_FOREACH_UI(HWY_RVV_SET, Set, mv_v_x, _ALL_VIRT)
HWY_RVV_FOREACH_F(HWY_RVV_SET, Set, fmv_v_f, _ALL_VIRT)
#undef HWY_RVV_SET

// Treat bfloat16_t as uint16_t (using the previously defined Set overloads);
// required for Zero and
// VFromD.
// Set for bfloat16_t tags: forwards arg.bits to the unsigned-tag Set, so the
// returned vector has the same type as Set for uint16_t.
template <size_t N, int kPow2>
decltype(Set(Simd<uint16_t, N, kPow2>(), 0)) Set(Simd<bfloat16_t, N, kPow2> d,
                                                 bfloat16_t arg) {
  return Set(RebindToUnsigned<decltype(d)>(), arg.bits);
}

// Vector type for tag D, deduced from the return type of the matching Set
// overload (so all Set overloads must be declared before this alias).
template <class D>
using VFromD = decltype(Set(D(), TFromD<D>()));

// ------------------------------ Zero

// Returns a vector of zeros for tag `d`.
template <class D>
HWY_API VFromD<D> Zero(D d) {
  // Cast to support bfloat16_t.
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, Set(du, 0));
}

// ------------------------------ Undefined

// RVV vundefined is 'poisoned' such that even XORing a _variable_ initialized
// by it gives unpredictable results. It should only be used for maskoff, so
// keep it internal. For the Highway op, just use Zero (single instruction).
namespace detail {
// Defines detail::NAME(tag) as a direct call to the intrinsic (no length arg).
#define HWY_RVV_UNDEFINED(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                          SHIFT, MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
  NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) /* tag */) { \
    return __riscv_v##OP##_##CHAR##SEW##LMUL(); /* no AVL */ \
  }

HWY_RVV_FOREACH(HWY_RVV_UNDEFINED, Undefined, undefined, _ALL)
#undef HWY_RVV_UNDEFINED
}  // namespace detail

// Public Undefined: returns Zero(d) rather than the intrinsic.
template <class D>
HWY_API VFromD<D> Undefined(D d) {
  return Zero(d);
}

// ------------------------------ BitCast

namespace detail {

// Halves LMUL. (Use LMUL arg for the source so we can use _TRUNC.)
#define HWY_RVV_TRUNC(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                      MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMULH) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
    return __riscv_v##OP##_v_##CHAR##SEW##LMUL##_##CHAR##SEW##LMULH( \
        v); /* no AVL */ \
  }
HWY_RVV_FOREACH(HWY_RVV_TRUNC, Trunc, lmul_trunc, _TRUNC)
#undef HWY_RVV_TRUNC

// Doubles LMUL to `d2` (the arg is only necessary for _VIRT).
// Defines NAME(d2, v) returning the LMULD-sized vector; the tag parameter has
// SHIFT + 1 and is unnamed in the expansion.
#define HWY_RVV_EXT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                    MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMULD) \
  NAME(HWY_RVV_D(BASE, SEW, N, SHIFT + 1) /* d2 */, \
       HWY_RVV_V(BASE, SEW, LMUL) v) { \
    return __riscv_v##OP##_v_##CHAR##SEW##LMUL##_##CHAR##SEW##LMULD( \
        v); /* no AVL */ \
  }
HWY_RVV_FOREACH(HWY_RVV_EXT, Ext, lmul_ext, _EXT)
#undef HWY_RVV_EXT

// For virtual LMUL e.g. 'uint32mf4_t', the return type should be mf2, which is
// the same as the actual input type.
// Hence this overload returns `v` unchanged; OP is not used.
#define HWY_RVV_EXT_VIRT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                         SHIFT, MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
  NAME(HWY_RVV_D(BASE, SEW, N, SHIFT + 1) /* d2 */, \
       HWY_RVV_V(BASE, SEW, LMUL) v) { \
    return v; \
  }
HWY_RVV_FOREACH(HWY_RVV_EXT_VIRT, Ext, lmul_ext, _VIRT)
#undef HWY_RVV_EXT_VIRT

// For BitCastToByte, the D arg is only to prevent duplicate definitions caused
// by _ALL_VIRT.

// There is no reinterpret from u8 <-> u8, so just return.
// Both directions are identity functions on vuint8<LMUL>_t.
#define HWY_RVV_CAST_U8(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                        SHIFT, MLEN, NAME, OP) \
  template <typename T, size_t N> \
  HWY_API vuint8##LMUL##_t BitCastToByte(Simd<T, N, SHIFT> /* d */, \
                                         vuint8##LMUL##_t v) { \
    return v; \
  } \
  template <size_t N> \
  HWY_API vuint8##LMUL##_t BitCastFromByte( \
      HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMUL##_t v) { \
    return v; \
  }

// For i8, need a single reinterpret (HWY_RVV_CAST_IF does two).
// i8 <-> u8 at the same LMUL, one vreinterpret in each direction.
#define HWY_RVV_CAST_I8(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                        SHIFT, MLEN, NAME, OP) \
  template <typename T, size_t N> \
  HWY_API vuint8##LMUL##_t BitCastToByte(Simd<T, N, SHIFT> /* d */, \
                                         vint8##LMUL##_t v) { \
    return __riscv_vreinterpret_v_i8##LMUL##_u8##LMUL(v); \
  } \
  template <size_t N> \
  HWY_API vint8##LMUL##_t BitCastFromByte( \
      HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMUL##_t v) { \
    return __riscv_vreinterpret_v_u8##LMUL##_i8##LMUL(v); \
  }

// Separate u/i because clang only provides signed <-> unsigned reinterpret for
// the same SEW.
// Unsigned lanes wider than 8 bits: a single reinterpret to/from u8.
#define HWY_RVV_CAST_U(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                       MLEN, NAME, OP) \
  template <typename T, size_t N> \
  HWY_API vuint8##LMUL##_t BitCastToByte(Simd<T, N, SHIFT> /* d */, \
                                         HWY_RVV_V(BASE, SEW, LMUL) v) { \
    return __riscv_v##OP##_v_##CHAR##SEW##LMUL##_u8##LMUL(v); \
  } \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \
      HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMUL##_t v) { \
    return __riscv_v##OP##_v_u8##LMUL##_##CHAR##SEW##LMUL(v); \
  }

// Signed/Float: first cast to/from unsigned
// Two reinterprets per direction: via u<SEW> at the same LMUL, then to/from u8.
#define HWY_RVV_CAST_IF(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                        SHIFT, MLEN, NAME, OP) \
  template <typename T, size_t N> \
  HWY_API vuint8##LMUL##_t BitCastToByte(Simd<T, N, SHIFT> /* d */, \
                                         HWY_RVV_V(BASE, SEW, LMUL) v) { \
    return __riscv_v##OP##_v_u##SEW##LMUL##_u8##LMUL( \
        __riscv_v##OP##_v_##CHAR##SEW##LMUL##_u##SEW##LMUL(v)); \
  } \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \
      HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMUL##_t v) { \
    return __riscv_v##OP##_v_u##SEW##LMUL##_##CHAR##SEW##LMUL( \
        __riscv_v##OP##_v_u8##LMUL##_u##SEW##LMUL(v)); \
  }

// Additional versions for virtual LMUL using LMULH for byte vectors.
+// Here the byte vector has type vuint8<LMULH>_t. ToByte reinterprets at LMUL
+// and then applies detail::Trunc; FromByte first applies detail::Ext (with a
+// u8 tag of pow2 = SHIFT + 1) to obtain a vuint8<LMUL>_t, then reinterprets.
+#define HWY_RVV_CAST_VIRT_U(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
+                            SHIFT, MLEN, NAME, OP) \
+  template <typename T, size_t N> \
+  HWY_API vuint8##LMULH##_t BitCastToByte(Simd<T, N, SHIFT> /* d */, \
+                                          HWY_RVV_V(BASE, SEW, LMUL) v) { \
+    return detail::Trunc(__riscv_v##OP##_v_##CHAR##SEW##LMUL##_u8##LMUL(v)); \
+  } \
+  template <size_t N> \
+  HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \
+      HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMULH##_t v) { \
+    HWY_RVV_D(uint, 8, N, SHIFT + 1) d2; \
+    const vuint8##LMUL##_t v2 = detail::Ext(d2, v); \
+    return __riscv_v##OP##_v_u8##LMUL##_##CHAR##SEW##LMUL(v2); \
+  }
+
+// Signed/Float: first cast to/from unsigned
+// Same Trunc/Ext handling as HWY_RVV_CAST_VIRT_U, plus the extra
+// <CHAR><SEW> <-> u<SEW> reinterpret.
+#define HWY_RVV_CAST_VIRT_IF(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
+                             SHIFT, MLEN, NAME, OP) \
+  template <typename T, size_t N> \
+  HWY_API vuint8##LMULH##_t BitCastToByte(Simd<T, N, SHIFT> /* d */, \
+                                          HWY_RVV_V(BASE, SEW, LMUL) v) { \
+    return detail::Trunc(__riscv_v##OP##_v_u##SEW##LMUL##_u8##LMUL( \
+        __riscv_v##OP##_v_##CHAR##SEW##LMUL##_u##SEW##LMUL(v))); \
+  } \
+  template <size_t N> \
+  HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \
+      HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMULH##_t v) { \
+    HWY_RVV_D(uint, 8, N, SHIFT + 1) d2; \
+    const vuint8##LMUL##_t v2 = detail::Ext(d2, v); \
+    return __riscv_v##OP##_v_u##SEW##LMUL##_##CHAR##SEW##LMUL( \
+        __riscv_v##OP##_v_u8##LMUL##_u##SEW##LMUL(v2)); \
+  }
+
+// Instantiate the cast helpers. The NAME argument is a placeholder (`_`)
+// because the macros hard-code the function names; OP = reinterpret.
+HWY_RVV_FOREACH_U08(HWY_RVV_CAST_U8, _, reinterpret, _ALL)
+HWY_RVV_FOREACH_I08(HWY_RVV_CAST_I8, _, reinterpret, _ALL)
+HWY_RVV_FOREACH_U163264(HWY_RVV_CAST_U, _, reinterpret, _ALL)
+HWY_RVV_FOREACH_I163264(HWY_RVV_CAST_IF, _, reinterpret, _ALL)
+HWY_RVV_FOREACH_F(HWY_RVV_CAST_IF, _, reinterpret, _ALL)
+HWY_RVV_FOREACH_U163264(HWY_RVV_CAST_VIRT_U, _, reinterpret, _VIRT)
+HWY_RVV_FOREACH_I163264(HWY_RVV_CAST_VIRT_IF, _, reinterpret, _VIRT)
+HWY_RVV_FOREACH_F(HWY_RVV_CAST_VIRT_IF, _, reinterpret, _VIRT)
+
+#undef HWY_RVV_CAST_U8
+#undef HWY_RVV_CAST_I8
+#undef HWY_RVV_CAST_U
+#undef HWY_RVV_CAST_IF
+#undef HWY_RVV_CAST_VIRT_U
+#undef HWY_RVV_CAST_VIRT_IF
+
+// bfloat16_t tag: forwards to the uint16_t overload, so the returned vector
+// has the uint16_t vector type.
+template <size_t N, int kPow2>
+HWY_INLINE VFromD<Simd<uint16_t, N, kPow2>> BitCastFromByte(
+    Simd<bfloat16_t, N, kPow2> /* d */, VFromD<Simd<uint8_t, N, kPow2>> v) {
+  return BitCastFromByte(Simd<uint16_t, N, kPow2>(), v);
+}
+
+}  // namespace detail
+
+// Reinterprets `v` as the vector type of `d` by converting to a byte (u8)
+// vector and then from bytes to the destination type.
+template <class D, class FromV>
+HWY_API VFromD<D> BitCast(D d, FromV v) {
+  return detail::BitCastFromByte(d, detail::BitCastToByte(d, v));
+}
+
+// ------------------------------ Iota
+
+namespace detail {
+
+// Generates NAME(d): the result of the `__riscv_v<OP>_*` intrinsic (vid_v for
+// Iota0) with AVL = Lanes(d). Presumably each lane holds its own index
+// (0, 1, 2, ...) - confirm against the RVV intrinsics reference.
+#define HWY_RVV_IOTA(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
+                     MLEN, NAME, OP) \
+  template <size_t N> \
+  HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d) { \
+    return __riscv_v##OP##_##CHAR##SEW##LMUL(Lanes(d)); \
+  }
+
+// For i8 lanes, this may well wrap around. Unsigned only is less error-prone.
+HWY_RVV_FOREACH_U(HWY_RVV_IOTA, Iota0, id_v, _ALL_VIRT)
+#undef HWY_RVV_IOTA
+
+// Used by Expand.
+// Generates NAME(d, mask): wraps the `__riscv_v<OP>_*` intrinsic (viota_m for
+// MaskedIota), passing the mask and AVL = Lanes(d).
+#define HWY_RVV_MASKED_IOTA(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
+                            SHIFT, MLEN, NAME, OP) \
+  template <size_t N> \
+  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
+      NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_M(MLEN) mask) { \
+    return __riscv_v##OP##_##CHAR##SEW##LMUL(mask, Lanes(d)); \
+  }
+
+HWY_RVV_FOREACH_U(HWY_RVV_MASKED_IOTA, MaskedIota, iota_m, _ALL_VIRT)
+#undef HWY_RVV_MASKED_IOTA
+
+}  // namespace detail
+
+// ================================================== LOGICAL
+
+// ------------------------------ Not
+
+HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGV, Not, not, _ALL)
+
+// Float overload: applies the integer Not to the bits of `v` (BitCast to the
+// unsigned type and back).
+template <class V, HWY_IF_FLOAT_V(V)>
+HWY_API V Not(const V v) {
+  using DF = DFromV<V>;
+  using DU = RebindToUnsigned<DF>;
+  return BitCast(DF(), Not(BitCast(DU(), v)));
+}
+
+// ------------------------------ And
+
+// Non-vector version (ideally immediate) for use with Iota0
+namespace detail {
+HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, AndS, and_vx, _ALL)
+}  // namespace detail
+
+HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, And, and, _ALL)
+
+// Float overload: bitwise And on the unsigned reinterpretation.
+template <class V, HWY_IF_FLOAT_V(V)>
+HWY_API V And(const V a, const V b) {
+  using DF = DFromV<V>;
+  using DU = RebindToUnsigned<DF>;
+  return BitCast(DF(), And(BitCast(DU(), a), BitCast(DU(), b)));
+}
+
+// ------------------------------ Or
+
+HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Or, or, _ALL)
+
+// Float overload: bitwise Or on the unsigned reinterpretation.
+template <class V, HWY_IF_FLOAT_V(V)>
+HWY_API V Or(const V a, const V b) {
+  using DF = DFromV<V>;
+  using DU = RebindToUnsigned<DF>;
+  return BitCast(DF(), Or(BitCast(DU(), a), BitCast(DU(), b)));
+}
+
+// ------------------------------ Xor
+
+// Non-vector version (ideally immediate) for use with Iota0
+namespace detail {
+HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, XorS, xor_vx, _ALL)
+}  // namespace detail
+
+HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Xor, xor, _ALL)
+
+// Float overload: bitwise Xor on the unsigned reinterpretation.
+template <class V, HWY_IF_FLOAT_V(V)>
+HWY_API V Xor(const V a, const V b) {
+  using DF = DFromV<V>;
+  using DU = RebindToUnsigned<DF>;
+  return BitCast(DF(), Xor(BitCast(DU(), a), BitCast(DU(), b)));
+}
+
+// ------------------------------ AndNot
+// Returns ~not_a & b (the FIRST argument is the one that is inverted).
+template <class V>
+HWY_API V AndNot(const V not_a, const V b) {
+  return And(Not(not_a), b);
+}
+
+// ------------------------------ Xor3
+// Returns x1 ^ x2 ^ x3, computed as two Xor calls.
+template <class V>
+HWY_API V Xor3(V x1, V x2, V x3) {
+  return Xor(x1, Xor(x2, x3));
+}
+
+// ------------------------------ Or3
+// Returns o1 | o2 | o3, computed as two Or calls.
+template <class V>
+HWY_API V Or3(V o1, V o2, V o3) {
+  return Or(o1, Or(o2, o3));
+}
+
+// ------------------------------ OrAnd
+// Returns o | (a1 & a2).
+template <class V>
+HWY_API V OrAnd(const V o, const V a1, const V a2) {
+  return Or(o, And(a1, a2));
+}
+
+// ------------------------------ CopySign
+
+HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, CopySign, fsgnj, _ALL)
+
+// Same as CopySign(abs, sign); see the note in the body.
+template <class V>
+HWY_API V CopySignToAbs(const V abs, const V sign) {
+  // RVV can also handle abs < 0, so no extra action needed.
+  return CopySign(abs, sign);
+}
+
+// ================================================== ARITHMETIC
+
+// Per-target flags to prevent generic_ops-inl.h defining Add etc.
+// Note the #ifdef/#undef/#else/#define pattern TOGGLES the flag rather than
+// unconditionally defining it.
+#ifdef HWY_NATIVE_OPERATOR_REPLACEMENTS
+#undef HWY_NATIVE_OPERATOR_REPLACEMENTS
+#else
+#define HWY_NATIVE_OPERATOR_REPLACEMENTS
+#endif
+
+// ------------------------------ Add
+
+// Vector-scalar forms (scalar second operand), kept in detail.
+namespace detail {
+HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, AddS, add_vx, _ALL)
+HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVS, AddS, fadd_vf, _ALL)
+HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, ReverseSubS, rsub_vx, _ALL)
+HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVS, ReverseSubS, frsub_vf, _ALL)
+}  // namespace detail
+
+HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Add, add, _ALL)
+HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Add, fadd, _ALL)
+
+// ------------------------------ Sub
+HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Sub, sub, _ALL)
+HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Sub, fsub, _ALL)
+
+// ------------------------------ SaturatedAdd
+
+// Toggle the four per-target flags for 32/64-bit saturated add/sub
+// (presumably so generic_ops-inl.h skips its own versions - confirm there).
+#ifdef HWY_NATIVE_I32_SATURATED_ADDSUB
+#undef HWY_NATIVE_I32_SATURATED_ADDSUB
+#else
+#define HWY_NATIVE_I32_SATURATED_ADDSUB
+#endif
+
+#ifdef HWY_NATIVE_U32_SATURATED_ADDSUB
+#undef HWY_NATIVE_U32_SATURATED_ADDSUB
+#else
+#define HWY_NATIVE_U32_SATURATED_ADDSUB
+#endif
+
+#ifdef HWY_NATIVE_I64_SATURATED_ADDSUB
+#undef HWY_NATIVE_I64_SATURATED_ADDSUB
+#else
+#define HWY_NATIVE_I64_SATURATED_ADDSUB
+#endif
+
+#ifdef HWY_NATIVE_U64_SATURATED_ADDSUB
+#undef HWY_NATIVE_U64_SATURATED_ADDSUB
+#else
+#define HWY_NATIVE_U64_SATURATED_ADDSUB
+#endif
+
+HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVV, SaturatedAdd, saddu, _ALL)
+HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, SaturatedAdd, sadd, _ALL)
+
+// ------------------------------ SaturatedSub
+
+HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVV, SaturatedSub, ssubu, _ALL)
+HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, SaturatedSub, ssub, _ALL)
+
+// ------------------------------ AverageRound
+
+// TODO(janwas): check vxrm rounding mode
+// Only defined for u8 and u16 lanes.
+HWY_RVV_FOREACH_U08(HWY_RVV_RETV_ARGVV, AverageRound, aaddu, _ALL)
+HWY_RVV_FOREACH_U16(HWY_RVV_RETV_ARGVV, AverageRound, aaddu, _ALL)
+
+// ------------------------------ ShiftLeft[Same]
+
+// Intrinsics do not define .vi forms, so use .vx instead.
+// Generates two functions per vector type:
+//   NAME<kBits>(v)      - shift by a compile-time count;
+//   NAME##Same(v, bits) - shift by a runtime count, which is narrowed to
+//                         uint8_t before being passed as the scalar operand.
+#define HWY_RVV_SHIFT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
+                      MLEN, NAME, OP) \
+  template <int kBits> \
+  HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
+    return __riscv_v##OP##_vx_##CHAR##SEW##LMUL(v, kBits, \
+                                                HWY_RVV_AVL(SEW, SHIFT)); \
+  } \
+  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
+      NAME##Same(HWY_RVV_V(BASE, SEW, LMUL) v, int bits) { \
+    return __riscv_v##OP##_vx_##CHAR##SEW##LMUL(v, static_cast<uint8_t>(bits), \
+                                                HWY_RVV_AVL(SEW, SHIFT)); \
+  }
+
+HWY_RVV_FOREACH_UI(HWY_RVV_SHIFT, ShiftLeft, sll, _ALL)
+
+// ------------------------------ ShiftRight[Same]
+
+// Unsigned lanes use srl, signed lanes use sra.
+HWY_RVV_FOREACH_U(HWY_RVV_SHIFT, ShiftRight, srl, _ALL)
+HWY_RVV_FOREACH_I(HWY_RVV_SHIFT, ShiftRight, sra, _ALL)
+
+#undef HWY_RVV_SHIFT
+
+// ------------------------------ SumsOf8 (ShiftRight, Add)
+// Sums each group of 8 consecutive u8 lanes into one u64 lane, using three
+// rounds of "shift right by half, then add" on progressively wider views
+// (u16, u32, u64). In the variable names, hex digits 0..F identify the byte
+// lanes contributing to each 16-bit lane (most-significant first); "zz" marks
+// lanes known to be zero and "xx" lanes whose value is not used.
+template <class VU8>
+HWY_API VFromD<Repartition<uint64_t, DFromV<VU8>>> SumsOf8(const VU8 v) {
+  const DFromV<VU8> du8;
+  const RepartitionToWide<decltype(du8)> du16;
+  const RepartitionToWide<decltype(du16)> du32;
+  const RepartitionToWide<decltype(du32)> du64;
+  using VU16 = VFromD<decltype(du16)>;
+
+  // Split each u16 lane into its upper and lower byte, then add the pair.
+  const VU16 vFDB97531 = ShiftRight<8>(BitCast(du16, v));
+  const VU16 vECA86420 = detail::AndS(BitCast(du16, v), 0xFF);
+  const VU16 sFE_DC_BA_98_76_54_32_10 = Add(vFDB97531, vECA86420);
+
+  // Add the upper u16 of each u32 onto the lower u16 (sums of 4 bytes).
+  const VU16 szz_FE_zz_BA_zz_76_zz_32 =
+      BitCast(du16, ShiftRight<16>(BitCast(du32, sFE_DC_BA_98_76_54_32_10)));
+  const VU16 sxx_FC_xx_B8_xx_74_xx_30 =
+      Add(sFE_DC_BA_98_76_54_32_10, szz_FE_zz_BA_zz_76_zz_32);
+  // Add the upper u32 of each u64 onto the lower u32 (sums of 8 bytes).
+  const VU16 szz_zz_xx_FC_zz_zz_xx_74 =
+      BitCast(du16, ShiftRight<32>(BitCast(du64, sxx_FC_xx_B8_xx_74_xx_30)));
+  const VU16 sxx_xx_xx_F8_xx_xx_xx_70 =
+      Add(sxx_FC_xx_B8_xx_74_xx_30, szz_zz_xx_FC_zz_zz_xx_74);
+  // Only the low 16 bits of each u64 hold the wanted sum; clear the rest.
+  return detail::AndS(BitCast(du64, sxx_xx_xx_F8_xx_xx_xx_70), 0xFFFFull);
+}
+
+// ------------------------------ RotateRight
+// Rotates each lane right by kBits: (v >> kBits) | (v << (bits - kBits)).
+template <int kBits, class V>
+HWY_API V RotateRight(const V v) {
+  constexpr size_t kSizeInBits = sizeof(TFromV<V>) * 8;
+  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
+  if (kBits == 0) return v;
+  // HWY_MIN clamps the left-shift count to kSizeInBits - 1 so that the
+  // kBits == 0 instantiation (which returned above) does not instantiate
+  // ShiftLeft with a count equal to the lane width.
+  return Or(ShiftRight<kBits>(v),
+            ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
+}
+
+// ------------------------------ Shl
+// Per-lane variable shift: `bits` has the same vector type as `v` and is
+// passed directly to the .vv intrinsic (used for unsigned lanes).
+#define HWY_RVV_SHIFT_VV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
+                         SHIFT, MLEN, NAME, OP) \
+  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
+      NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, LMUL) bits) { \
+    return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(v, bits, \
+                                                HWY_RVV_AVL(SEW, SHIFT)); \
+  }
+
+HWY_RVV_FOREACH_U(HWY_RVV_SHIFT_VV, Shl, sll, _ALL)
+
+// Signed variant: `bits` is bit-cast to the unsigned vector type before being
+// passed to the .vv intrinsic.
+#define HWY_RVV_SHIFT_II(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
+                         SHIFT, MLEN, NAME, OP) \
+  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
+      NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, LMUL) bits) { \
+    const HWY_RVV_D(uint, SEW, HWY_LANES(HWY_RVV_T(BASE, SEW)), SHIFT) du; \
+    return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(v, BitCast(du, bits), \
+                                                HWY_RVV_AVL(SEW, SHIFT)); \
+  }
+
+HWY_RVV_FOREACH_I(HWY_RVV_SHIFT_II, Shl, sll, _ALL)
+
+// ------------------------------ Shr
+
+HWY_RVV_FOREACH_U(HWY_RVV_SHIFT_VV, Shr, srl, _ALL)
+HWY_RVV_FOREACH_I(HWY_RVV_SHIFT_II, Shr, sra, _ALL)
+
+#undef HWY_RVV_SHIFT_II
+#undef HWY_RVV_SHIFT_VV
+
+// ------------------------------ Min
+
+// Vector-scalar forms live in detail; vector-vector forms are public.
+namespace detail {
+
+HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVS, MinS, minu_vx, _ALL)
+HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVS, MinS, min_vx, _ALL)
+HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVS, MinS, fmin_vf, _ALL)
+
+}  // namespace detail
+
+HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVV, Min, minu, _ALL)
+HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, Min, min, _ALL)
+HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Min, fmin, _ALL)
+
+// ------------------------------ Max
+
+namespace detail {
+
+HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVS, MaxS, maxu_vx, _ALL)
+HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVS, MaxS, max_vx, _ALL)
+HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVS, MaxS, fmax_vf, _ALL)
+
+}  // namespace detail
+
+// Public vector-vector Max for unsigned, signed and float lanes.
+HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVV, Max, maxu, _ALL)
+HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, Max, max, _ALL)
+HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Max, fmax, _ALL)
+
+// ------------------------------ Mul
+
+// Per-target flags to prevent generic_ops-inl.h defining 8/64-bit operator*.
+// (Each #ifdef block toggles its flag.)
+#ifdef HWY_NATIVE_MUL_8
+#undef HWY_NATIVE_MUL_8
+#else
+#define HWY_NATIVE_MUL_8
+#endif
+#ifdef HWY_NATIVE_MUL_64
+#undef HWY_NATIVE_MUL_64
+#else
+#define HWY_NATIVE_MUL_64
+#endif
+
+HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Mul, mul, _ALL)
+HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Mul, fmul, _ALL)
+
+// ------------------------------ MulHigh
+
+// Only for internal use (Highway only promises MulHigh for 16-bit inputs).
+// Used by MulEven; vwmul does not work for m8.
+namespace detail {
+HWY_RVV_FOREACH_I32(HWY_RVV_RETV_ARGVV, MulHigh, mulh, _ALL)
+HWY_RVV_FOREACH_U32(HWY_RVV_RETV_ARGVV, MulHigh, mulhu, _ALL)
+HWY_RVV_FOREACH_U64(HWY_RVV_RETV_ARGVV, MulHigh, mulhu, _ALL)
+}  // namespace detail
+
+// Public MulHigh: 16-bit lanes only.
+HWY_RVV_FOREACH_U16(HWY_RVV_RETV_ARGVV, MulHigh, mulhu, _ALL)
+HWY_RVV_FOREACH_I16(HWY_RVV_RETV_ARGVV, MulHigh, mulh, _ALL)
+
+// ------------------------------ MulFixedPoint15
+HWY_RVV_FOREACH_I16(HWY_RVV_RETV_ARGVV, MulFixedPoint15, smul, _ALL)
+
+// ------------------------------ Div
+HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Div, fdiv, _ALL)
+
+// ------------------------------ ApproximateReciprocal
+// f32 only.
+HWY_RVV_FOREACH_F32(HWY_RVV_RETV_ARGV, ApproximateReciprocal, frec7, _ALL)
+
+// ------------------------------ Sqrt
+HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV, Sqrt, fsqrt, _ALL)
+
+// ------------------------------ ApproximateReciprocalSqrt
+// f32 only.
+HWY_RVV_FOREACH_F32(HWY_RVV_RETV_ARGV, ApproximateReciprocalSqrt, frsqrt7, _ALL)
+
+// ------------------------------ MulAdd
+
+// Per-target flag to prevent generic_ops-inl.h from defining int MulAdd.
+#ifdef HWY_NATIVE_INT_FMA
+#undef HWY_NATIVE_INT_FMA
+#else
+#define HWY_NATIVE_INT_FMA
+#endif
+
+// Note: op is still named vv, not vvv.
+// Generates NAME(mul, x, add). Note the argument reordering: the intrinsic is
+// called as (add, mul, x), i.e. the addend/accumulator comes first.
+#define HWY_RVV_FMA(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
+                    MLEN, NAME, OP) \
+  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
+      NAME(HWY_RVV_V(BASE, SEW, LMUL) mul, HWY_RVV_V(BASE, SEW, LMUL) x, \
+           HWY_RVV_V(BASE, SEW, LMUL) add) { \
+    return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(add, mul, x, \
+                                                HWY_RVV_AVL(SEW, SHIFT)); \
+  }
+
+HWY_RVV_FOREACH_UI(HWY_RVV_FMA, MulAdd, macc, _ALL)
+HWY_RVV_FOREACH_F(HWY_RVV_FMA, MulAdd, fmacc, _ALL)
+
+// ------------------------------ NegMulAdd
+HWY_RVV_FOREACH_UI(HWY_RVV_FMA, NegMulAdd, nmsac, _ALL)
+HWY_RVV_FOREACH_F(HWY_RVV_FMA, NegMulAdd, fnmsac, _ALL)
+
+// ------------------------------ MulSub
+// Float only.
+HWY_RVV_FOREACH_F(HWY_RVV_FMA, MulSub, fmsac, _ALL)
+
+// ------------------------------ NegMulSub
+// Float only.
+HWY_RVV_FOREACH_F(HWY_RVV_FMA, NegMulSub, fnmacc, _ALL)
+
+#undef HWY_RVV_FMA
+
+// ================================================== COMPARE
+
+// Comparisons set a mask bit to 1 if the condition is true, else 0. The XX in
+// vboolXX_t is a power of two divisor for vector bits. SEW=8 / LMUL=1 = 1/8th
+// of all bits; SEW=8 / LMUL=4 = half of all bits.
+
+// SFINAE for mapping Simd<> to MLEN (up to 64).
+// Expands to an unnamed defaulted template parameter that is only valid when
+// MLenFromD(D()) equals MLEN.
+#define HWY_RVV_IF_MLEN_D(D, MLEN) \
+  hwy::EnableIf<MLenFromD(D()) == MLEN>* = nullptr
+
+// Specialized for RVV instead of the generic test_util-inl.h implementation
+// because more efficient, and helps implement MFromD.
+
+// Generates NAME(d) for each MLEN: returns the result of the
+// `__riscv_vm<OP>_m_b<MLEN>` intrinsic (vmclr for MaskFalse) with
+// AVL = Lanes(d). HWY_RVV_IF_MLEN_D selects the overload matching D.
+#define HWY_RVV_MASK_FALSE(SEW, SHIFT, MLEN, NAME, OP) \
+  template <class D, HWY_RVV_IF_MLEN_D(D, MLEN)> \
+  HWY_API HWY_RVV_M(MLEN) NAME(D d) { \
+    return __riscv_vm##OP##_m_b##MLEN(Lanes(d)); \
+  }
+
+HWY_RVV_FOREACH_B(HWY_RVV_MASK_FALSE, MaskFalse, clr)
+#undef HWY_RVV_MASK_FALSE
+#undef HWY_RVV_IF_MLEN_D
+
+// Mask type for tag D, defined as the return type of MaskFalse(D()).
+template <class D>
+using MFromD = decltype(MaskFalse(D()));
+
+// mask = f(vector, vector)
+#define HWY_RVV_RETM_ARGVV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
+                           SHIFT, MLEN, NAME, OP) \
+  HWY_API HWY_RVV_M(MLEN) \
+      NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \
+    return __riscv_v##OP##_vv_##CHAR##SEW##LMUL##_b##MLEN( \
+        a, b, HWY_RVV_AVL(SEW, SHIFT)); \
+  }
+
+// mask = f(vector, scalar)
+// Unlike HWY_RVV_RETM_ARGVV, OP must already include the operand-form suffix
+// (e.g. mseq_vx, mfeq_vf).
+#define HWY_RVV_RETM_ARGVS(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
+                           SHIFT, MLEN, NAME, OP) \
+  HWY_API HWY_RVV_M(MLEN) \
+      NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_T(BASE, SEW) b) { \
+    return __riscv_v##OP##_##CHAR##SEW##LMUL##_b##MLEN( \
+        a, b, HWY_RVV_AVL(SEW, SHIFT)); \
+  }
+
+// ------------------------------ Eq
+HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVV, Eq, mseq, _ALL)
+HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Eq, mfeq, _ALL)
+
+namespace detail {
+HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVS, EqS, mseq_vx, _ALL)
+HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVS, EqS, mfeq_vf, _ALL)
+}  // namespace detail
+
+// ------------------------------ Ne
+HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVV, Ne, msne, _ALL)
+HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Ne, mfne, _ALL)
+
+namespace detail {
+HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVS, NeS, msne_vx, _ALL)
+HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVS, NeS, mfne_vf, _ALL)
+}  // namespace detail
+
+// ------------------------------ Lt
+HWY_RVV_FOREACH_U(HWY_RVV_RETM_ARGVV, Lt, msltu, _ALL)
+HWY_RVV_FOREACH_I(HWY_RVV_RETM_ARGVV, Lt, mslt, _ALL)
+HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Lt, mflt, _ALL)
+
+namespace detail {
+HWY_RVV_FOREACH_I(HWY_RVV_RETM_ARGVS, LtS, mslt_vx, _ALL)
+HWY_RVV_FOREACH_U(HWY_RVV_RETM_ARGVS, LtS, msltu_vx, _ALL)
+HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVS, LtS, mflt_vf, _ALL)
+}  // namespace detail
+
+// ------------------------------ Le
+HWY_RVV_FOREACH_U(HWY_RVV_RETM_ARGVV, Le, msleu, _ALL)
+HWY_RVV_FOREACH_I(HWY_RVV_RETM_ARGVV, Le, msle, _ALL)
+HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Le, mfle, _ALL)
+
+#undef HWY_RVV_RETM_ARGVV
+#undef HWY_RVV_RETM_ARGVS
+
+// ------------------------------ Gt/Ge
+
+// a >= b is implemented as b <= a.
+template <class V>
+HWY_API auto Ge(const V a, const V b) -> decltype(Le(a, b)) {
+  return Le(b, a);
+}
+
+// a > b is implemented as b < a.
+template <class V>
+HWY_API auto Gt(const V a, const V b) -> decltype(Lt(a, b)) {
+  return Lt(b, a);
+}
+
+// ------------------------------ TestBit
+// Mask is true in lanes where (a & bit) != 0.
+template <class V>
+HWY_API auto TestBit(const V a, const V bit) -> decltype(Eq(a, bit)) {
+  return detail::NeS(And(a, bit), 0);
+}
+
+// ------------------------------ Not
+// NOLINTNEXTLINE
+HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGM, Not, not )
+
+// ------------------------------ And
+
+// mask = f(mask_a, mask_b) (note arg2,arg1 order!)
+// i.e. NAME(a, b) calls the intrinsic as (b, a); this matters for the
+// non-commutative AndNot below.
+#define HWY_RVV_RETM_ARGMM(SEW, SHIFT, MLEN, NAME, OP) \
+  HWY_API HWY_RVV_M(MLEN) NAME(HWY_RVV_M(MLEN) a, HWY_RVV_M(MLEN) b) { \
+    return __riscv_vm##OP##_mm_b##MLEN(b, a, HWY_RVV_AVL(SEW, SHIFT)); \
+  }
+
+HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, And, and)
+
+// ------------------------------ AndNot
+HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, AndNot, andn)
+
+// ------------------------------ Or
+HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, Or, or)
+
+// ------------------------------ Xor
+HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, Xor, xor)
+
+// ------------------------------ ExclusiveNeither
+HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, ExclusiveNeither, xnor)
+
+#undef HWY_RVV_RETM_ARGMM
+
+// ------------------------------ IfThenElse
+
+// Generates NAME(m, yes, no). The merge intrinsic is called as (no, yes, m):
+// the "else" operand comes first, the mask last.
+#define HWY_RVV_IF_THEN_ELSE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
+                             SHIFT, MLEN, NAME, OP) \
+  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
+      NAME(HWY_RVV_M(MLEN) m, HWY_RVV_V(BASE, SEW, LMUL) yes, \
+           HWY_RVV_V(BASE, SEW, LMUL) no) { \
+    return __riscv_v##OP##_vvm_##CHAR##SEW##LMUL(no, yes, m, \
+                                                 HWY_RVV_AVL(SEW, SHIFT)); \
+  }
+
+HWY_RVV_FOREACH(HWY_RVV_IF_THEN_ELSE, IfThenElse, merge, _ALL)
+
+#undef HWY_RVV_IF_THEN_ELSE
+
+// ------------------------------ IfThenElseZero
+// mask ? yes : 0, via IfThenElse with a zero vector as the "no" operand.
+template <class M, class V>
+HWY_API V IfThenElseZero(const M mask, const V yes) {
+  return IfThenElse(mask, yes, Zero(DFromV<V>()));
+}
+
+// ------------------------------ IfThenZeroElse
+
+// Generates NAME(m, no): mask ? 0 : no, using the vector-scalar merge with the
+// scalar 0 as the value selected where the mask is set.
+#define HWY_RVV_IF_THEN_ZERO_ELSE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, \
+                                  LMULH, SHIFT, MLEN, NAME, OP) \
+  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
+      NAME(HWY_RVV_M(MLEN) m, HWY_RVV_V(BASE, SEW, LMUL) no) { \
+    return __riscv_v##OP##_##CHAR##SEW##LMUL(no, 0, m, \
+                                             HWY_RVV_AVL(SEW, SHIFT)); \
+  }
+
+HWY_RVV_FOREACH_UI(HWY_RVV_IF_THEN_ZERO_ELSE, IfThenZeroElse, merge_vxm, _ALL)
+HWY_RVV_FOREACH_F(HWY_RVV_IF_THEN_ZERO_ELSE, IfThenZeroElse, fmerge_vfm, _ALL)
+
+#undef HWY_RVV_IF_THEN_ZERO_ELSE
+
+// ------------------------------ MaskFromVec
+// Mask is true in lanes where v != 0.
+template <class V>
+HWY_API MFromD<DFromV<V>> MaskFromVec(const V v) {
+  return detail::NeS(v, 0);
+}
+
+// ------------------------------ RebindMask
+// Returns `mask` as MFromD<D>; relies on implicit conversion, see below.
+template <class D, typename MFrom>
+HWY_API MFromD<D> RebindMask(const D /*d*/, const MFrom mask) {
+  // No need to check lane size/LMUL are the same: if not, casting MFrom to
+  // MFromD<D> would fail.
+  return mask;
+}
+
+// ------------------------------ VecFromMask
+
+// Returns mask ? ~0 : 0. No longer use sub.vx(Zero(), 1, mask) because per the
+// default mask-agnostic policy, the result of inactive lanes may also be ~0.
+// Generates NAME(d, m): merges the scalar -1 into a zero vector of the signed
+// type where `m` is set, then bit-casts the result to the type of `d`.
+#define HWY_RVV_VEC_FROM_MASK(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
+                              SHIFT, MLEN, NAME, OP) \
+  template <size_t N> \
+  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
+      NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_M(MLEN) m) { \
+    const RebindToSigned<decltype(d)> di; \
+    using TI = TFromD<decltype(di)>; \
+    return BitCast( \
+        d, __riscv_v##OP##_i##SEW##LMUL(Zero(di), TI{-1}, m, Lanes(d))); \
+  }
+
+HWY_RVV_FOREACH_UI(HWY_RVV_VEC_FROM_MASK, VecFromMask, merge_vxm, _ALL_VIRT)
+
+#undef HWY_RVV_VEC_FROM_MASK
+
+// Float overload: builds the unsigned all-ones/zero vector and bit-casts it.
+template <class D, HWY_IF_FLOAT_D(D)>
+HWY_API VFromD<D> VecFromMask(const D d, MFromD<D> mask) {
+  return BitCast(d, VecFromMask(RebindToUnsigned<D>(), mask));
+}
+
+// ------------------------------ IfVecThenElse (MaskFromVec)
+// Selects `yes` in lanes where `mask` is non-zero (see MaskFromVec), else `no`.
+template <class V>
+HWY_API V IfVecThenElse(const V mask, const V yes, const V no) {
+  return IfThenElse(MaskFromVec(mask), yes, no);
+}
+
+// ------------------------------ ZeroIfNegative
+// v < 0 ? 0 : v.
+template <class V>
+HWY_API V ZeroIfNegative(const V v) {
+  return IfThenZeroElse(detail::LtS(v, 0), v);
+}
+
+// ------------------------------ BroadcastSignBit
+// Shifts right by (lane bits - 1); for signed lanes ShiftRight is generated
+// from the arithmetic shift (sra).
+template <class V>
+HWY_API V BroadcastSignBit(const V v) {
+  return ShiftRight<sizeof(TFromV<V>) * 8 - 1>(v);
+}
+
+// ------------------------------ IfNegativeThenElse (BroadcastSignBit)
+// Selects `yes` in lanes whose sign bit is set in `v`, else `no`. `v` is
+// reinterpreted as signed integers so that float inputs are also handled.
+template <class V>
+HWY_API V IfNegativeThenElse(V v, V yes, V no) {
+  static_assert(IsSigned<TFromV<V>>(), "Only works for signed/float");
+  const DFromV<V> d;
+  const RebindToSigned<decltype(d)> di;
+
+  MFromD<decltype(d)> m =
+      MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v))));
+  return IfThenElse(m, yes, no);
+}
+
+// ------------------------------ FindFirstTrue
+
+// Generates, per MLEN:
+//   FindFirstTrue(d, m)      - the raw vfirst result as intptr_t (AllFalse
+//                              below treats a negative value as "none set");
+//   FindKnownFirstTrue(d, m) - the same value cast to size_t; as the name
+//                              suggests, presumably only valid when at least
+//                              one mask bit is set.
+#define HWY_RVV_FIND_FIRST_TRUE(SEW, SHIFT, MLEN, NAME, OP) \
+  template <class D> \
+  HWY_API intptr_t FindFirstTrue(D d, HWY_RVV_M(MLEN) m) { \
+    static_assert(MLenFromD(d) == MLEN, "Type mismatch"); \
+    return __riscv_vfirst_m_b##MLEN(m, Lanes(d)); \
+  } \
+  template <class D> \
+  HWY_API size_t FindKnownFirstTrue(D d, HWY_RVV_M(MLEN) m) { \
+    static_assert(MLenFromD(d) == MLEN, "Type mismatch"); \
+    return static_cast<size_t>(__riscv_vfirst_m_b##MLEN(m, Lanes(d))); \
+  }
+
+HWY_RVV_FOREACH_B(HWY_RVV_FIND_FIRST_TRUE, , _)
+#undef HWY_RVV_FIND_FIRST_TRUE
+
+// ------------------------------ AllFalse
+// True if FindFirstTrue reports no set bit (negative result).
+template <class D>
+HWY_API bool AllFalse(D d, MFromD<D> m) {
+  return FindFirstTrue(d, m) < 0;
+}
+
+// ------------------------------ AllTrue
+
+// All bits are true iff the inverted mask is all-false.
+#define HWY_RVV_ALL_TRUE(SEW, SHIFT, MLEN, NAME, OP) \
+  template <class D> \
+  HWY_API bool AllTrue(D d, HWY_RVV_M(MLEN) m) { \
+    static_assert(MLenFromD(d) == MLEN, "Type mismatch"); \
+    return AllFalse(d, __riscv_vmnot_m_b##MLEN(m, Lanes(d))); \
+  }
+
+HWY_RVV_FOREACH_B(HWY_RVV_ALL_TRUE, _, _)
+#undef HWY_RVV_ALL_TRUE
+
+// ------------------------------ CountTrue
+
+// Returns the vcpop result for the first Lanes(d) mask bits.
+#define HWY_RVV_COUNT_TRUE(SEW, SHIFT, MLEN, NAME, OP) \
+  template <class D> \
+  HWY_API size_t CountTrue(D d, HWY_RVV_M(MLEN) m) { \
+    static_assert(MLenFromD(d) == MLEN, "Type mismatch"); \
+    return __riscv_vcpop_m_b##MLEN(m, Lanes(d)); \
+  }
+
+HWY_RVV_FOREACH_B(HWY_RVV_COUNT_TRUE, _, _)
+#undef HWY_RVV_COUNT_TRUE
+
+// ================================================== MEMORY
+
+// ------------------------------ Load
+
+// Generates NAME(d, p): unit-stride load of Lanes(d) elements from `p`.
+#define HWY_RVV_LOAD(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
+                     MLEN, NAME, OP) \
+  template <size_t N> \
+  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
+      NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
+           const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
+    return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL(p, Lanes(d)); \
+  }
+HWY_RVV_FOREACH(HWY_RVV_LOAD, Load, le, _ALL_VIRT)
+#undef HWY_RVV_LOAD
+
+// There is no native BF16, treat as uint16_t.
+// bfloat16_t Load: reinterprets the pointer as uint16_t and returns a u16
+// vector.
+template <size_t N, int kPow2>
+HWY_API VFromD<Simd<uint16_t, N, kPow2>> Load(
+    Simd<bfloat16_t, N, kPow2> d, const bfloat16_t* HWY_RESTRICT p) {
+  return Load(RebindToUnsigned<decltype(d)>(),
+              reinterpret_cast<const uint16_t * HWY_RESTRICT>(p));
+}
+
+// bfloat16_t Store: takes a u16 vector and stores through a uint16_t pointer.
+template <size_t N, int kPow2>
+HWY_API void Store(VFromD<Simd<uint16_t, N, kPow2>> v,
+                   Simd<bfloat16_t, N, kPow2> d, bfloat16_t* HWY_RESTRICT p) {
+  Store(v, RebindToUnsigned<decltype(d)>(),
+        reinterpret_cast<uint16_t * HWY_RESTRICT>(p));
+}
+
+// ------------------------------ LoadU
+// Unaligned load; identical to Load on this target (see note in body).
+template <class D>
+HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
+  // RVV only requires element alignment, not vector alignment.
+  return Load(d, p);
+}
+
+// ------------------------------ MaskedLoad
+
+// Generates two functions, both using the `_mu` form of the load intrinsic:
+//   NAME(m, d, p)        - passes Zero(d) as the pass-through operand;
+//   NAME##Or(v, m, d, p) - passes `v` as the pass-through operand.
+// Presumably lanes whose mask bit is clear take the pass-through value
+// (mask-undisturbed) - confirm against the RVV intrinsics reference.
+#define HWY_RVV_MASKED_LOAD(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
+                            SHIFT, MLEN, NAME, OP) \
+  template <size_t N> \
+  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
+      NAME(HWY_RVV_M(MLEN) m, HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
+           const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
+    return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL##_mu(m, Zero(d), p, \
+                                                         Lanes(d)); \
+  } \
+  template <size_t N> \
+  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
+      NAME##Or(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) m, \
+               HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
+               const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
+    return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL##_mu(m, v, p, Lanes(d)); \
+  }
+
+HWY_RVV_FOREACH(HWY_RVV_MASKED_LOAD, MaskedLoad, le, _ALL_VIRT)
+#undef HWY_RVV_MASKED_LOAD
+
+// ------------------------------ Store
+
+// Generates NAME(v, d, p): unit-stride store of Lanes(d) elements to `p`.
+#define HWY_RVV_STORE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
+                      MLEN, NAME, OP) \
+  template <size_t N> \
+  HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, \
+                    HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
+                    HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
+    return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL(p, v, Lanes(d)); \
+  }
+HWY_RVV_FOREACH(HWY_RVV_STORE, Store, se, _ALL_VIRT)
+#undef HWY_RVV_STORE
+
+// ------------------------------ BlendedStore
+
+// Generates NAME(v, m, d, p): masked (`_m`) form of the store intrinsic.
+#define HWY_RVV_BLENDED_STORE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
+                              SHIFT, MLEN, NAME, OP) \
+  template <size_t N> \
+  HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) m, \
+                    HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
+                    HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
+    return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL##_m(m, p, v, Lanes(d)); \
+  }
+HWY_RVV_FOREACH(HWY_RVV_BLENDED_STORE, BlendedStore, se, _ALL_VIRT)
+#undef HWY_RVV_BLENDED_STORE
+
+namespace detail {
+
+// Generates NAME(count, v, d, p): like Store, but the caller-supplied `count`
+// is passed as the AVL instead of Lanes(d); the tag only selects the overload.
+#define HWY_RVV_STOREN(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
+                       MLEN, NAME, OP) \
+  template <size_t N> \
+  HWY_API void NAME(size_t count, HWY_RVV_V(BASE, SEW, LMUL) v, \
+                    HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, \
+                    HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
+    return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL(p, v, count); \
+  }
+HWY_RVV_FOREACH(HWY_RVV_STOREN, StoreN, se, _ALL_VIRT)
+#undef HWY_RVV_STOREN
+
+}  // namespace detail
+
+// ------------------------------ StoreU
+// Unaligned store; identical to Store on this target (see note in body).
+template <class V, class D>
+HWY_API void StoreU(const V v, D d, TFromD<D>* HWY_RESTRICT p) {
+  // RVV only requires element alignment, not vector alignment.
+  Store(v, d, p);
+}
+
+// ------------------------------ Stream
+// Forwards to Store; this implementation applies no special handling.
+template <class V, class D, typename T>
+HWY_API void Stream(const V v, D d, T* HWY_RESTRICT aligned) {
+  Store(v, d, aligned);
+}
+
+// ------------------------------ ScatterOffset
+
+// Generates NAME(v, d, base, offset): indexed store. `offset` is a signed
+// vector with the same SEW/LMUL as `v`; it is bit-cast to unsigned before
+// being passed to the intrinsic.
+#define HWY_RVV_SCATTER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
+                        SHIFT, MLEN, NAME, OP) \
+  template <size_t N> \
+  HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, \
+                    HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
+                    HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \
+                    HWY_RVV_V(int, SEW, LMUL) offset) { \
+    const RebindToUnsigned<decltype(d)> du; \
+    return __riscv_v##OP##ei##SEW##_v_##CHAR##SEW##LMUL( \
+        base, BitCast(du, offset), v, Lanes(d)); \
+  }
+HWY_RVV_FOREACH(HWY_RVV_SCATTER, ScatterOffset, sux, _ALL_VIRT)
+#undef HWY_RVV_SCATTER
+
+// ------------------------------ ScatterIndex
+
+// 4-byte lanes: index * 4 (ShiftLeft<2>) is passed to ScatterOffset.
+template <class D, HWY_IF_T_SIZE_D(D, 4)>
+HWY_API void ScatterIndex(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT base,
+                          const VFromD<RebindToSigned<D>> index) {
+  return ScatterOffset(v, d, base, ShiftLeft<2>(index));
+}
+
+// 8-byte lanes: index * 8 (ShiftLeft<3>) is passed to ScatterOffset.
+template <class D, HWY_IF_T_SIZE_D(D, 8)>
+HWY_API void ScatterIndex(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT base,
+                          const VFromD<RebindToSigned<D>> index) {
+  return ScatterOffset(v, d, base, ShiftLeft<3>(index));
+}
+
+// ------------------------------ GatherOffset
+
+// Generates NAME(d, base, offset): indexed load; `offset` is handled exactly
+// as in HWY_RVV_SCATTER (signed vector bit-cast to unsigned).
+#define HWY_RVV_GATHER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
+                       MLEN, NAME, OP) \
+  template <size_t N> \
+  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
+      NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
+           const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \
+           HWY_RVV_V(int, SEW, LMUL) offset) { \
+    const RebindToUnsigned<decltype(d)> du; \
+    return __riscv_v##OP##ei##SEW##_v_##CHAR##SEW##LMUL( \
+        base, BitCast(du, offset), Lanes(d)); \
+  }
+HWY_RVV_FOREACH(HWY_RVV_GATHER, GatherOffset, lux, _ALL_VIRT)
+#undef HWY_RVV_GATHER
+
+// ------------------------------ GatherIndex
+
+// 4-byte lanes: index * 4 (ShiftLeft<2>) is passed to GatherOffset.
+template <class D, HWY_IF_T_SIZE_D(D, 4)>
+HWY_API VFromD<D> GatherIndex(D d, const TFromD<D>* HWY_RESTRICT base,
+                              const VFromD<RebindToSigned<D>> index) {
+  return GatherOffset(d, base, ShiftLeft<2>(index));
+}
+
+// 8-byte lanes: index * 8 (ShiftLeft<3>) is passed to GatherOffset.
+template <class D, HWY_IF_T_SIZE_D(D, 8)>
+HWY_API VFromD<D> GatherIndex(D d, const TFromD<D>* HWY_RESTRICT base,
+                              const VFromD<RebindToSigned<D>> index) {
+  return GatherOffset(d, base, ShiftLeft<3>(index));
+}
+
+// ================================================== CONVERT
+
+// ------------------------------ PromoteTo
+
+// SEW is for the input so we can use F16 (no-op if not supported).
+// Generates NAME(d, v) for a 2x promotion: input is (SEW, LMUL), result is
+// (SEWD, LMULD), and the destination tag has pow2 = SHIFT + 1.
+#define HWY_RVV_PROMOTE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
+                        SHIFT, MLEN, NAME, OP) \
+  template <size_t N> \
+  HWY_API HWY_RVV_V(BASE, SEWD, LMULD) NAME( \
+      HWY_RVV_D(BASE, SEWD, N, SHIFT + 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \
+    return __riscv_v##OP##CHAR##SEWD##LMULD(v, Lanes(d)); \
+  }
+
+HWY_RVV_FOREACH_U08(HWY_RVV_PROMOTE, PromoteTo, zext_vf2_, _EXT_VIRT)
+HWY_RVV_FOREACH_U16(HWY_RVV_PROMOTE, PromoteTo, zext_vf2_, _EXT_VIRT)
+HWY_RVV_FOREACH_U32(HWY_RVV_PROMOTE, PromoteTo, zext_vf2_, _EXT_VIRT)
+HWY_RVV_FOREACH_I08(HWY_RVV_PROMOTE, PromoteTo, sext_vf2_, _EXT_VIRT)
+HWY_RVV_FOREACH_I16(HWY_RVV_PROMOTE, PromoteTo, sext_vf2_, _EXT_VIRT)
+HWY_RVV_FOREACH_I32(HWY_RVV_PROMOTE, PromoteTo, sext_vf2_, _EXT_VIRT)
+HWY_RVV_FOREACH_F16(HWY_RVV_PROMOTE, PromoteTo, fwcvt_f_f_v_, _EXT_VIRT)
+HWY_RVV_FOREACH_F32(HWY_RVV_PROMOTE, PromoteTo, fwcvt_f_f_v_, _EXT_VIRT)
+#undef HWY_RVV_PROMOTE
+
+// The above X-macro cannot handle 4x promotion nor type switching.
+// TODO(janwas): use BASE2 arg to allow the latter.
+// Generates one PromoteTo overload. The output is (BASE, BITS, LMUL) and the
+// input is (BASE_IN, BITS_IN, LMUL_IN). The destination tag has
+// pow2 = SHIFT + ADD; in the tables below ADD is 1, 2 or 3 for the X2, X4 and
+// X8 wrappers respectively.
+#define HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, LMUL, LMUL_IN, \
+                        SHIFT, ADD) \
+  template <size_t N> \
+  HWY_API HWY_RVV_V(BASE, BITS, LMUL) \
+      PromoteTo(HWY_RVV_D(BASE, BITS, N, SHIFT + ADD) d, \
+                HWY_RVV_V(BASE_IN, BITS_IN, LMUL_IN) v) { \
+    return __riscv_v##OP##CHAR##BITS##LMUL(v, Lanes(d)); \
+  }
+
+// Each row below is (output LMUL, input LMUL, SHIFT, ADD).
+#define HWY_RVV_PROMOTE_X2(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN) \
+  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m1, mf2, -2, 1) \
+  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m1, mf2, -1, 1) \
+  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m2, m1, 0, 1) \
+  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m4, m2, 1, 1) \
+  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m8, m4, 2, 1)
+
+#define HWY_RVV_PROMOTE_X4(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN) \
+  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m1, mf4, -2, 2) \
+  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m2, mf2, -1, 2) \
+  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m4, m1, 0, 2) \
+  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m8, m2, 1, 2)
+
+// X4 plus one extra row for an mf8 input.
+#define HWY_RVV_PROMOTE_X4_FROM_U8(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN) \
+  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, mf2, mf8, -3, 2) \
+  HWY_RVV_PROMOTE_X4(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN)
+
+#define HWY_RVV_PROMOTE_X8(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN) \
+  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m1, mf8, -3, 3) \
+  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m2, mf4, -2, 3) \
+  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m4, mf2, -1, 3) \
+  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m8, m1, 0, 3)
+
+// 8-bit -> 64-bit.
+HWY_RVV_PROMOTE_X8(zext_vf8_, uint, u, 64, uint, 8)
+HWY_RVV_PROMOTE_X8(sext_vf8_, int, i, 64, int, 8)
+
+// 8-bit -> 32-bit and 16-bit -> 64-bit.
+HWY_RVV_PROMOTE_X4_FROM_U8(zext_vf4_, uint, u, 32, uint, 8)
+HWY_RVV_PROMOTE_X4_FROM_U8(sext_vf4_, int, i, 32, int, 8)
+HWY_RVV_PROMOTE_X4(zext_vf4_, uint, u, 64, uint, 16)
+HWY_RVV_PROMOTE_X4(sext_vf4_, int, i, 64, int, 16)
+
+// i32 to f64
+HWY_RVV_PROMOTE_X2(fwcvt_f_x_v_, float, f, 64, int, 32)
+
+#undef HWY_RVV_PROMOTE_X8
+#undef HWY_RVV_PROMOTE_X4_FROM_U8
+#undef HWY_RVV_PROMOTE_X4
+#undef HWY_RVV_PROMOTE_X2
+#undef HWY_RVV_PROMOTE
+
+// I16->I64 or U16->U64 PromoteTo with virtual LMUL
+// Both overloads forward to the PromoteTo overload for ScalableTag<T>.
+template <size_t N>
+HWY_API auto PromoteTo(Simd<int64_t, N, -1> d,
+                       VFromD<Rebind<int16_t, decltype(d)>> v)
+    -> VFromD<decltype(d)> {
+  return PromoteTo(ScalableTag<int64_t>(), v);
+}
+
+template <size_t N>
+HWY_API auto PromoteTo(Simd<uint64_t, N, -1> d,
+                       VFromD<Rebind<uint16_t, decltype(d)>> v)
+    -> VFromD<decltype(d)> {
+  return PromoteTo(ScalableTag<uint64_t>(), v);
+}
+
+// Unsigned to signed: cast for unsigned promote.
+// Each overload below promotes to the unsigned type of the same width and then
+// bit-casts the result to the signed destination type.
+template <size_t N, int kPow2>
+HWY_API auto PromoteTo(Simd<int16_t, N, kPow2> d,
+                       VFromD<Rebind<uint8_t, decltype(d)>> v)
+    -> VFromD<decltype(d)> {
+  return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
+}
+
+template <size_t N, int kPow2>
+HWY_API auto PromoteTo(Simd<int32_t, N, kPow2> d,
+                       VFromD<Rebind<uint8_t, decltype(d)>> v)
+    -> VFromD<decltype(d)> {
+  return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
+}
+
+template <size_t N, int kPow2>
+HWY_API auto PromoteTo(Simd<int32_t, N, kPow2> d,
+                       VFromD<Rebind<uint16_t, decltype(d)>> v)
+    -> VFromD<decltype(d)> {
+  return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
+}
+
+template <size_t N, int kPow2>
+HWY_API auto PromoteTo(Simd<int64_t, N, kPow2> d,
+                       VFromD<Rebind<uint32_t, decltype(d)>> v)
+    -> VFromD<decltype(d)> {
+  return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
+}
+
+template <size_t N, int kPow2>
+HWY_API auto PromoteTo(Simd<int64_t, N, kPow2> d,
+                       VFromD<Rebind<uint16_t, decltype(d)>> v)
+    -> VFromD<decltype(d)> {
+  return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
+}
+
+template <size_t N, int kPow2>
+HWY_API auto PromoteTo(Simd<int64_t, N, kPow2> d,
+                       VFromD<Rebind<uint8_t, decltype(d)>> v)
+    -> VFromD<decltype(d)> {
+  return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
+}
+
+// bfloat16 -> float32: widen the 16 raw bits to 32 bits, shift them into the
+// upper half of each lane, then reinterpret the result as float.
+template <size_t N, int kPow2>
+HWY_API auto PromoteTo(Simd<float32_t, N, kPow2> d,
+                       VFromD<Rebind<bfloat16_t, decltype(d)>> v)
+    -> VFromD<decltype(d)> {
+  const RebindToSigned<decltype(d)> di32;
+  const Rebind<uint16_t, decltype(d)> du16;
+  return BitCast(d, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
+}
+
+// ------------------------------ DemoteTo U
+
+// SEW is for the source so we can use _DEMOTE_VIRT.
+// Generates NAME(d, v): input is (SEW, LMUL), result is (SEWH, LMULH), and the
+// destination tag has pow2 = SHIFT - 1. The literal 0 is the intrinsic's second
+// operand (presumably the narrowing shift amount, i.e. no shift - confirm
+// against the RVV intrinsics reference).
+#define HWY_RVV_DEMOTE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
+                       MLEN, NAME, OP) \
+  template <size_t N> \
+  HWY_API HWY_RVV_V(BASE, SEWH, LMULH) NAME( \
+      HWY_RVV_D(BASE, SEWH, N, SHIFT - 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \
+    return __riscv_v##OP##CHAR##SEWH##LMULH(v, 0, Lanes(d)); \
+  }
+
+// Unsigned -> unsigned
+HWY_RVV_FOREACH_U16(HWY_RVV_DEMOTE, DemoteTo, nclipu_wx_, _DEMOTE_VIRT)
+HWY_RVV_FOREACH_U32(HWY_RVV_DEMOTE, DemoteTo, nclipu_wx_, _DEMOTE_VIRT)
+HWY_RVV_FOREACH_U64(HWY_RVV_DEMOTE, DemoteTo, nclipu_wx_, _DEMOTE_VIRT)
+
+// SEW is for the source so we can use _DEMOTE_VIRT.
+#define HWY_RVV_DEMOTE_I_TO_U(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
+                              SHIFT, MLEN, NAME, OP) \
+  template <size_t N> \
+  HWY_API HWY_RVV_V(uint, SEWH, LMULH) NAME( \
+      HWY_RVV_D(uint, SEWH, N, SHIFT - 1) dn, HWY_RVV_V(int, SEW, LMUL) v) { \
+    const HWY_RVV_D(uint, SEW, N, SHIFT) du; \
+    /* First clamp negative numbers to zero to match x86 packus.
*/ \ + return DemoteTo(dn, BitCast(du, detail::MaxS(v, 0))); \ + } +HWY_RVV_FOREACH_I64(HWY_RVV_DEMOTE_I_TO_U, DemoteTo, _, _DEMOTE_VIRT) +HWY_RVV_FOREACH_I32(HWY_RVV_DEMOTE_I_TO_U, DemoteTo, _, _DEMOTE_VIRT) +HWY_RVV_FOREACH_I16(HWY_RVV_DEMOTE_I_TO_U, DemoteTo, _, _DEMOTE_VIRT) +#undef HWY_RVV_DEMOTE_I_TO_U + +template <size_t N> +HWY_API vuint8mf8_t DemoteTo(Simd<uint8_t, N, -3> d, const vint32mf2_t v) { + return __riscv_vnclipu_wx_u8mf8(DemoteTo(Simd<uint16_t, N, -2>(), v), 0, + Lanes(d)); +} +template <size_t N> +HWY_API vuint8mf4_t DemoteTo(Simd<uint8_t, N, -2> d, const vint32m1_t v) { + return __riscv_vnclipu_wx_u8mf4(DemoteTo(Simd<uint16_t, N, -1>(), v), 0, + Lanes(d)); +} +template <size_t N> +HWY_API vuint8mf2_t DemoteTo(Simd<uint8_t, N, -1> d, const vint32m2_t v) { + return __riscv_vnclipu_wx_u8mf2(DemoteTo(Simd<uint16_t, N, 0>(), v), 0, + Lanes(d)); +} +template <size_t N> +HWY_API vuint8m1_t DemoteTo(Simd<uint8_t, N, 0> d, const vint32m4_t v) { + return __riscv_vnclipu_wx_u8m1(DemoteTo(Simd<uint16_t, N, 1>(), v), 0, + Lanes(d)); +} +template <size_t N> +HWY_API vuint8m2_t DemoteTo(Simd<uint8_t, N, 1> d, const vint32m8_t v) { + return __riscv_vnclipu_wx_u8m2(DemoteTo(Simd<uint16_t, N, 2>(), v), 0, + Lanes(d)); +} + +template <size_t N> +HWY_API vuint8mf8_t DemoteTo(Simd<uint8_t, N, -3> d, const vuint32mf2_t v) { + return __riscv_vnclipu_wx_u8mf8(DemoteTo(Simd<uint16_t, N, -2>(), v), 0, + Lanes(d)); +} +template <size_t N> +HWY_API vuint8mf4_t DemoteTo(Simd<uint8_t, N, -2> d, const vuint32m1_t v) { + return __riscv_vnclipu_wx_u8mf4(DemoteTo(Simd<uint16_t, N, -1>(), v), 0, + Lanes(d)); +} +template <size_t N> +HWY_API vuint8mf2_t DemoteTo(Simd<uint8_t, N, -1> d, const vuint32m2_t v) { + return __riscv_vnclipu_wx_u8mf2(DemoteTo(Simd<uint16_t, N, 0>(), v), 0, + Lanes(d)); +} +template <size_t N> +HWY_API vuint8m1_t DemoteTo(Simd<uint8_t, N, 0> d, const vuint32m4_t v) { + return __riscv_vnclipu_wx_u8m1(DemoteTo(Simd<uint16_t, N, 1>(), v), 0, + Lanes(d)); +} 
+// u32 (LMUL=8) -> u8: narrow to u16 through the u16 DemoteTo overload, then
+// clip the u16 lanes to u8 with a shift operand of 0.
+template <size_t N>
+HWY_API vuint8m2_t DemoteTo(Simd<uint8_t, N, 1> d, const vuint32m8_t v) {
+  const Simd<uint16_t, N, 2> d16;
+  const auto as_u16 = DemoteTo(d16, v);
+  return __riscv_vnclipu_wx_u8m2(as_u16, 0, Lanes(d));
+}
+
+// 64-bit -> u8: go through a u32 tag that is one pow2 smaller than the input,
+// then reuse the 32-bit -> u8 overloads.
+template <size_t N, int kPow2>
+HWY_API VFromD<Simd<uint8_t, N, kPow2>> DemoteTo(
+    Simd<uint8_t, N, kPow2> d, VFromD<Simd<int64_t, N, kPow2 + 3>> v) {
+  const Simd<uint32_t, N, kPow2 + 2> d32;
+  return DemoteTo(d, DemoteTo(d32, v));
+}
+
+template <size_t N, int kPow2>
+HWY_API VFromD<Simd<uint8_t, N, kPow2>> DemoteTo(
+    Simd<uint8_t, N, kPow2> d, VFromD<Simd<uint64_t, N, kPow2 + 3>> v) {
+  const Simd<uint32_t, N, kPow2 + 2> d32;
+  return DemoteTo(d, DemoteTo(d32, v));
+}
+
+// 64-bit -> u16: likewise via an intermediate u32 tag.
+template <size_t N, int kPow2>
+HWY_API VFromD<Simd<uint16_t, N, kPow2>> DemoteTo(
+    Simd<uint16_t, N, kPow2> d, VFromD<Simd<int64_t, N, kPow2 + 2>> v) {
+  const Simd<uint32_t, N, kPow2 + 1> d32;
+  return DemoteTo(d, DemoteTo(d32, v));
+}
+
+template <size_t N, int kPow2>
+HWY_API VFromD<Simd<uint16_t, N, kPow2>> DemoteTo(
+    Simd<uint16_t, N, kPow2> d, VFromD<Simd<uint64_t, N, kPow2 + 2>> v) {
+  const Simd<uint32_t, N, kPow2 + 1> d32;
+  return DemoteTo(d, DemoteTo(d32, v));
+}
+
+// U8FromU32 has no tag argument; the AVL comes from the ScalableTag that
+// matches the returned u8 vector type. Two vnclipu steps: u32 -> u16 -> u8.
+HWY_API vuint8mf8_t U8FromU32(const vuint32mf2_t v) {
+  const size_t vl = Lanes(ScalableTag<uint8_t, -3>());
+  const vuint16mf4_t as_u16 = __riscv_vnclipu_wx_u16mf4(v, 0, vl);
+  return __riscv_vnclipu_wx_u8mf8(as_u16, 0, vl);
+}
+HWY_API vuint8mf4_t U8FromU32(const vuint32m1_t v) {
+  const size_t vl = Lanes(ScalableTag<uint8_t, -2>());
+  const vuint16mf2_t as_u16 = __riscv_vnclipu_wx_u16mf2(v, 0, vl);
+  return __riscv_vnclipu_wx_u8mf4(as_u16, 0, vl);
+}
+HWY_API vuint8mf2_t U8FromU32(const vuint32m2_t v) {
+  const size_t vl = Lanes(ScalableTag<uint8_t, -1>());
+  const vuint16m1_t as_u16 = __riscv_vnclipu_wx_u16m1(v, 0, vl);
+  return __riscv_vnclipu_wx_u8mf2(as_u16, 0, vl);
+}
+HWY_API vuint8m1_t U8FromU32(const vuint32m4_t v) {
+  const size_t vl = Lanes(ScalableTag<uint8_t, 0>());
+  const vuint16m2_t as_u16 = __riscv_vnclipu_wx_u16m2(v, 0, vl);
+  return __riscv_vnclipu_wx_u8m1(as_u16, 0, vl);
+}
+HWY_API vuint8m2_t U8FromU32(const vuint32m8_t v) {
+  const size_t vl = Lanes(ScalableTag<uint8_t, 1>());
+  const vuint16m4_t as_u16 = __riscv_vnclipu_wx_u16m4(v, 0, vl);
+  return __riscv_vnclipu_wx_u8m2(as_u16, 0, vl);
+}
+
+// ------------------------------ Truncations
+
+// u64 -> u8: keep only the low 8 bits of every lane, then narrow in three
+// vnclipu steps (64 -> 32 -> 16 -> 8), each with a shift operand of 0.
+template <size_t N>
+HWY_API vuint8mf8_t TruncateTo(Simd<uint8_t, N, -3> d,
+                               const VFromD<Simd<uint64_t, N, 0>> v) {
+  const size_t vl = Lanes(d);
+  const vuint64m1_t masked = __riscv_vand(v, 0xFF, vl);
+  const vuint32mf2_t as_u32 = __riscv_vnclipu_wx_u32mf2(masked, 0, vl);
+  const vuint16mf4_t as_u16 = __riscv_vnclipu_wx_u16mf4(as_u32, 0, vl);
+  return __riscv_vnclipu_wx_u8mf8(as_u16, 0, vl);
+}
+
+template <size_t N>
+HWY_API vuint8mf4_t TruncateTo(Simd<uint8_t, N, -2> d,
+                               const VFromD<Simd<uint64_t, N, 1>> v) {
+  const size_t vl = Lanes(d);
+  const vuint64m2_t masked = __riscv_vand(v, 0xFF, vl);
+  const vuint32m1_t as_u32 = __riscv_vnclipu_wx_u32m1(masked, 0, vl);
+  const vuint16mf2_t as_u16 = __riscv_vnclipu_wx_u16mf2(as_u32, 0, vl);
+  return __riscv_vnclipu_wx_u8mf4(as_u16, 0, vl);
+}
+
+template <size_t N>
+HWY_API vuint8mf2_t TruncateTo(Simd<uint8_t, N, -1> d,
+                               const VFromD<Simd<uint64_t, N, 2>> v) {
+  const size_t vl = Lanes(d);
+  const vuint64m4_t masked = __riscv_vand(v, 0xFF, vl);
+  const vuint32m2_t as_u32 = __riscv_vnclipu_wx_u32m2(masked, 0, vl);
+  const vuint16m1_t as_u16 = __riscv_vnclipu_wx_u16m1(as_u32, 0, vl);
+  return __riscv_vnclipu_wx_u8mf2(as_u16, 0, vl);
+}
+
+template <size_t N>
+HWY_API vuint8m1_t TruncateTo(Simd<uint8_t, N, 0> d,
+                              const VFromD<Simd<uint64_t, N, 3>> v) {
+  const size_t vl = Lanes(d);
+  const vuint64m8_t masked = __riscv_vand(v, 0xFF, vl);
+  const vuint32m4_t as_u32 = __riscv_vnclipu_wx_u32m4(masked, 0, vl);
+  const vuint16m2_t as_u16 = __riscv_vnclipu_wx_u16m2(as_u32, 0, vl);
+  return __riscv_vnclipu_wx_u8m1(as_u16, 0, vl);
+}
+
+// u64 -> u16: mask to the low 16 bits, then two vnclipu steps.
+template <size_t N>
+HWY_API vuint16mf4_t TruncateTo(Simd<uint16_t, N, -2> d,
+                                const VFromD<Simd<uint64_t, N, 0>> v) {
+  const size_t vl = Lanes(d);
+  const vuint64m1_t masked = __riscv_vand(v, 0xFFFF, vl);
+  const vuint32mf2_t as_u32 = __riscv_vnclipu_wx_u32mf2(masked, 0, vl);
+  return __riscv_vnclipu_wx_u16mf4(as_u32, 0, vl);
+}
+
+template <size_t N>
+HWY_API vuint16mf2_t TruncateTo(Simd<uint16_t, N, -1> d,
+                                const VFromD<Simd<uint64_t, N, 1>> v) {
+  const size_t vl = Lanes(d);
+  const vuint64m2_t masked = __riscv_vand(v, 0xFFFF, vl);
+  const vuint32m1_t as_u32 = __riscv_vnclipu_wx_u32m1(masked, 0, vl);
+  return __riscv_vnclipu_wx_u16mf2(as_u32, 0, vl);
+}
+
+template <size_t N>
+HWY_API vuint16m1_t TruncateTo(Simd<uint16_t, N, 0> d,
+                               const VFromD<Simd<uint64_t, N, 2>> v) {
+  const size_t vl = Lanes(d);
+  const vuint64m4_t masked = __riscv_vand(v, 0xFFFF, vl);
+  const vuint32m2_t as_u32 = __riscv_vnclipu_wx_u32m2(masked, 0, vl);
+  return __riscv_vnclipu_wx_u16m1(as_u32, 0, vl);
+}
+
+template <size_t N>
+HWY_API vuint16m2_t TruncateTo(Simd<uint16_t, N, 1> d,
+                               const VFromD<Simd<uint64_t, N, 3>> v) {
+  const size_t vl = Lanes(d);
+  const vuint64m8_t masked = __riscv_vand(v, 0xFFFF, vl);
+  const vuint32m4_t as_u32 = __riscv_vnclipu_wx_u32m4(masked, 0, vl);
+  return __riscv_vnclipu_wx_u16m2(as_u32, 0, vl);
+}
+
+// u64 -> u32: mask to the low 32 bits, then a single vnclipu step.
+template <size_t N>
+HWY_API vuint32mf2_t TruncateTo(Simd<uint32_t, N, -1> d,
+                                const VFromD<Simd<uint64_t, N, 0>> v) {
+  const size_t vl = Lanes(d);
+  const vuint64m1_t masked = __riscv_vand(v, 0xFFFFFFFFu, vl);
+  return __riscv_vnclipu_wx_u32mf2(masked, 0, vl);
+}
+
+template <size_t N>
+HWY_API vuint32m1_t TruncateTo(Simd<uint32_t, N, 0> d,
+                               const VFromD<Simd<uint64_t, N, 1>> v) {
+  const size_t vl = Lanes(d);
+  const vuint64m2_t masked = __riscv_vand(v, 0xFFFFFFFFu, vl);
+  return __riscv_vnclipu_wx_u32m1(masked, 0, vl);
+}
+
+template <size_t N>
+HWY_API vuint32m2_t TruncateTo(Simd<uint32_t, N, 1> d,
+                               const VFromD<Simd<uint64_t, N, 2>> v) {
+  const size_t vl = Lanes(d);
+  const vuint64m4_t masked = __riscv_vand(v, 0xFFFFFFFFu, vl);
+  return __riscv_vnclipu_wx_u32m2(masked, 0, vl);
+}
+
+template <size_t N>
+HWY_API vuint32m4_t TruncateTo(Simd<uint32_t, N, 2> d,
+                               const VFromD<Simd<uint64_t, N, 3>> v) {
+  const size_t vl = Lanes(d);
+  const vuint64m8_t masked = __riscv_vand(v, 0xFFFFFFFFu, vl);
+  return __riscv_vnclipu_wx_u32m4(masked, 0, vl);
+}
+
+template <size_t N> +HWY_API vuint8mf8_t TruncateTo(Simd<uint8_t, N, -3> d, + const VFromD<Simd<uint32_t, N, -1>> v) { + const size_t avl = Lanes(d); + const vuint32mf2_t v1 = __riscv_vand(v, 0xFF, avl); + const vuint16mf4_t v2 = __riscv_vnclipu_wx_u16mf4(v1, 0, avl); + return __riscv_vnclipu_wx_u8mf8(v2, 0, avl); +} + +template <size_t N> +HWY_API vuint8mf4_t TruncateTo(Simd<uint8_t, N, -2> d, + const VFromD<Simd<uint32_t, N, 0>> v) { + const size_t avl = Lanes(d); + const vuint32m1_t v1 = __riscv_vand(v, 0xFF, avl); + const vuint16mf2_t v2 = __riscv_vnclipu_wx_u16mf2(v1, 0, avl); + return __riscv_vnclipu_wx_u8mf4(v2, 0, avl); +} + +template <size_t N> +HWY_API vuint8mf2_t TruncateTo(Simd<uint8_t, N, -1> d, + const VFromD<Simd<uint32_t, N, 1>> v) { + const size_t avl = Lanes(d); + const vuint32m2_t v1 = __riscv_vand(v, 0xFF, avl); + const vuint16m1_t v2 = __riscv_vnclipu_wx_u16m1(v1, 0, avl); + return __riscv_vnclipu_wx_u8mf2(v2, 0, avl); +} + +template <size_t N> +HWY_API vuint8m1_t TruncateTo(Simd<uint8_t, N, 0> d, + const VFromD<Simd<uint32_t, N, 2>> v) { + const size_t avl = Lanes(d); + const vuint32m4_t v1 = __riscv_vand(v, 0xFF, avl); + const vuint16m2_t v2 = __riscv_vnclipu_wx_u16m2(v1, 0, avl); + return __riscv_vnclipu_wx_u8m1(v2, 0, avl); +} + +template <size_t N> +HWY_API vuint8m2_t TruncateTo(Simd<uint8_t, N, 1> d, + const VFromD<Simd<uint32_t, N, 3>> v) { + const size_t avl = Lanes(d); + const vuint32m8_t v1 = __riscv_vand(v, 0xFF, avl); + const vuint16m4_t v2 = __riscv_vnclipu_wx_u16m4(v1, 0, avl); + return __riscv_vnclipu_wx_u8m2(v2, 0, avl); +} + +template <size_t N> +HWY_API vuint16mf4_t TruncateTo(Simd<uint16_t, N, -2> d, + const VFromD<Simd<uint32_t, N, -1>> v) { + const size_t avl = Lanes(d); + const vuint32mf2_t v1 = __riscv_vand(v, 0xFFFF, avl); + return __riscv_vnclipu_wx_u16mf4(v1, 0, avl); +} + +template <size_t N> +HWY_API vuint16mf2_t TruncateTo(Simd<uint16_t, N, -1> d, + const VFromD<Simd<uint32_t, N, 0>> v) { + const size_t avl = 
Lanes(d); + const vuint32m1_t v1 = __riscv_vand(v, 0xFFFF, avl); + return __riscv_vnclipu_wx_u16mf2(v1, 0, avl); +} + +template <size_t N> +HWY_API vuint16m1_t TruncateTo(Simd<uint16_t, N, 0> d, + const VFromD<Simd<uint32_t, N, 1>> v) { + const size_t avl = Lanes(d); + const vuint32m2_t v1 = __riscv_vand(v, 0xFFFF, avl); + return __riscv_vnclipu_wx_u16m1(v1, 0, avl); +} + +template <size_t N> +HWY_API vuint16m2_t TruncateTo(Simd<uint16_t, N, 1> d, + const VFromD<Simd<uint32_t, N, 2>> v) { + const size_t avl = Lanes(d); + const vuint32m4_t v1 = __riscv_vand(v, 0xFFFF, avl); + return __riscv_vnclipu_wx_u16m2(v1, 0, avl); +} + +template <size_t N> +HWY_API vuint16m4_t TruncateTo(Simd<uint16_t, N, 2> d, + const VFromD<Simd<uint32_t, N, 3>> v) { + const size_t avl = Lanes(d); + const vuint32m8_t v1 = __riscv_vand(v, 0xFFFF, avl); + return __riscv_vnclipu_wx_u16m4(v1, 0, avl); +} + +template <size_t N> +HWY_API vuint8mf8_t TruncateTo(Simd<uint8_t, N, -3> d, + const VFromD<Simd<uint16_t, N, -2>> v) { + const size_t avl = Lanes(d); + const vuint16mf4_t v1 = __riscv_vand(v, 0xFF, avl); + return __riscv_vnclipu_wx_u8mf8(v1, 0, avl); +} + +template <size_t N> +HWY_API vuint8mf4_t TruncateTo(Simd<uint8_t, N, -2> d, + const VFromD<Simd<uint16_t, N, -1>> v) { + const size_t avl = Lanes(d); + const vuint16mf2_t v1 = __riscv_vand(v, 0xFF, avl); + return __riscv_vnclipu_wx_u8mf4(v1, 0, avl); +} + +template <size_t N> +HWY_API vuint8mf2_t TruncateTo(Simd<uint8_t, N, -1> d, + const VFromD<Simd<uint16_t, N, 0>> v) { + const size_t avl = Lanes(d); + const vuint16m1_t v1 = __riscv_vand(v, 0xFF, avl); + return __riscv_vnclipu_wx_u8mf2(v1, 0, avl); +} + +template <size_t N> +HWY_API vuint8m1_t TruncateTo(Simd<uint8_t, N, 0> d, + const VFromD<Simd<uint16_t, N, 1>> v) { + const size_t avl = Lanes(d); + const vuint16m2_t v1 = __riscv_vand(v, 0xFF, avl); + return __riscv_vnclipu_wx_u8m1(v1, 0, avl); +} + +template <size_t N> +HWY_API vuint8m2_t TruncateTo(Simd<uint8_t, N, 1> d, + const 
VFromD<Simd<uint16_t, N, 2>> v) { + const size_t avl = Lanes(d); + const vuint16m4_t v1 = __riscv_vand(v, 0xFF, avl); + return __riscv_vnclipu_wx_u8m2(v1, 0, avl); +} + +template <size_t N> +HWY_API vuint8m4_t TruncateTo(Simd<uint8_t, N, 2> d, + const VFromD<Simd<uint16_t, N, 3>> v) { + const size_t avl = Lanes(d); + const vuint16m8_t v1 = __riscv_vand(v, 0xFF, avl); + return __riscv_vnclipu_wx_u8m4(v1, 0, avl); +} + +// ------------------------------ DemoteTo I + +HWY_RVV_FOREACH_I16(HWY_RVV_DEMOTE, DemoteTo, nclip_wx_, _DEMOTE_VIRT) +HWY_RVV_FOREACH_I32(HWY_RVV_DEMOTE, DemoteTo, nclip_wx_, _DEMOTE_VIRT) +HWY_RVV_FOREACH_I64(HWY_RVV_DEMOTE, DemoteTo, nclip_wx_, _DEMOTE_VIRT) + +template <size_t N> +HWY_API vint8mf8_t DemoteTo(Simd<int8_t, N, -3> d, const vint32mf2_t v) { + return DemoteTo(d, DemoteTo(Simd<int16_t, N, -2>(), v)); +} +template <size_t N> +HWY_API vint8mf4_t DemoteTo(Simd<int8_t, N, -2> d, const vint32m1_t v) { + return DemoteTo(d, DemoteTo(Simd<int16_t, N, -1>(), v)); +} +template <size_t N> +HWY_API vint8mf2_t DemoteTo(Simd<int8_t, N, -1> d, const vint32m2_t v) { + return DemoteTo(d, DemoteTo(Simd<int16_t, N, 0>(), v)); +} +template <size_t N> +HWY_API vint8m1_t DemoteTo(Simd<int8_t, N, 0> d, const vint32m4_t v) { + return DemoteTo(d, DemoteTo(Simd<int16_t, N, 1>(), v)); +} +template <size_t N> +HWY_API vint8m2_t DemoteTo(Simd<int8_t, N, 1> d, const vint32m8_t v) { + return DemoteTo(d, DemoteTo(Simd<int16_t, N, 2>(), v)); +} + +template <size_t N, int kPow2> +HWY_API VFromD<Simd<int8_t, N, kPow2>> DemoteTo( + Simd<int8_t, N, kPow2> d, VFromD<Simd<int64_t, N, kPow2 + 3>> v) { + return DemoteTo(d, DemoteTo(Simd<int32_t, N, kPow2 + 2>(), v)); +} + +template <size_t N, int kPow2> +HWY_API VFromD<Simd<int16_t, N, kPow2>> DemoteTo( + Simd<int16_t, N, kPow2> d, VFromD<Simd<int64_t, N, kPow2 + 2>> v) { + return DemoteTo(d, DemoteTo(Simd<int32_t, N, kPow2 + 1>(), v)); +} + +#undef HWY_RVV_DEMOTE + +// ------------------------------ DemoteTo F + +// SEW is 
for the source so we can use _DEMOTE_VIRT. +#define HWY_RVV_DEMOTE_F(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ + SHIFT, MLEN, NAME, OP) \ + template <size_t N> \ + HWY_API HWY_RVV_V(BASE, SEWH, LMULH) NAME( \ + HWY_RVV_D(BASE, SEWH, N, SHIFT - 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \ + return __riscv_v##OP##SEWH##LMULH(v, Lanes(d)); \ + } + +#if HWY_HAVE_FLOAT16 +HWY_RVV_FOREACH_F32(HWY_RVV_DEMOTE_F, DemoteTo, fncvt_rod_f_f_w_f, _DEMOTE_VIRT) +#endif +HWY_RVV_FOREACH_F64(HWY_RVV_DEMOTE_F, DemoteTo, fncvt_rod_f_f_w_f, _DEMOTE_VIRT) +#undef HWY_RVV_DEMOTE_F + +// TODO(janwas): add BASE2 arg to allow generating this via DEMOTE_F. +template <size_t N> +HWY_API vint32mf2_t DemoteTo(Simd<int32_t, N, -2> d, const vfloat64m1_t v) { + return __riscv_vfncvt_rtz_x_f_w_i32mf2(v, Lanes(d)); +} +template <size_t N> +HWY_API vint32mf2_t DemoteTo(Simd<int32_t, N, -1> d, const vfloat64m1_t v) { + return __riscv_vfncvt_rtz_x_f_w_i32mf2(v, Lanes(d)); +} +template <size_t N> +HWY_API vint32m1_t DemoteTo(Simd<int32_t, N, 0> d, const vfloat64m2_t v) { + return __riscv_vfncvt_rtz_x_f_w_i32m1(v, Lanes(d)); +} +template <size_t N> +HWY_API vint32m2_t DemoteTo(Simd<int32_t, N, 1> d, const vfloat64m4_t v) { + return __riscv_vfncvt_rtz_x_f_w_i32m2(v, Lanes(d)); +} +template <size_t N> +HWY_API vint32m4_t DemoteTo(Simd<int32_t, N, 2> d, const vfloat64m8_t v) { + return __riscv_vfncvt_rtz_x_f_w_i32m4(v, Lanes(d)); +} + +// SEW is for the source so we can use _DEMOTE_VIRT. 
+#define HWY_RVV_DEMOTE_TO_SHR_16(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, \ + LMULH, SHIFT, MLEN, NAME, OP) \ + template <size_t N> \ + HWY_API HWY_RVV_V(BASE, SEWH, LMULH) NAME( \ + HWY_RVV_D(BASE, SEWH, N, SHIFT - 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \ + return __riscv_v##OP##CHAR##SEWH##LMULH(v, 16, Lanes(d)); \ + } +namespace detail { +HWY_RVV_FOREACH_U32(HWY_RVV_DEMOTE_TO_SHR_16, DemoteToShr16, nclipu_wx_, + _DEMOTE_VIRT) +} +#undef HWY_RVV_DEMOTE_TO_SHR_16 + +template <size_t N, int kPow2> +HWY_API VFromD<Simd<uint16_t, N, kPow2>> DemoteTo( + Simd<bfloat16_t, N, kPow2> d, VFromD<Simd<float, N, kPow2 + 1>> v) { + const RebindToUnsigned<decltype(d)> du16; + const Rebind<uint32_t, decltype(d)> du32; + return detail::DemoteToShr16(du16, BitCast(du32, v)); +} + +// ------------------------------ ConvertTo F + +#define HWY_RVV_CONVERT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ + SHIFT, MLEN, NAME, OP) \ + template <size_t N> \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) ConvertTo( \ + HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_V(int, SEW, LMUL) v) { \ + return __riscv_vfcvt_f_x_v_f##SEW##LMUL(v, Lanes(d)); \ + } \ + template <size_t N> \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) ConvertTo( \ + HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_V(uint, SEW, LMUL) v) { \ + return __riscv_vfcvt_f_xu_v_f##SEW##LMUL(v, Lanes(d)); \ + } \ + /* Truncates (rounds toward zero). */ \ + template <size_t N> \ + HWY_API HWY_RVV_V(int, SEW, LMUL) ConvertTo(HWY_RVV_D(int, SEW, N, SHIFT) d, \ + HWY_RVV_V(BASE, SEW, LMUL) v) { \ + return __riscv_vfcvt_rtz_x_f_v_i##SEW##LMUL(v, Lanes(d)); \ + } \ +// API only requires f32 but we provide f64 for internal use. +HWY_RVV_FOREACH_F(HWY_RVV_CONVERT, _, _, _ALL_VIRT) +#undef HWY_RVV_CONVERT + +// Uses default rounding mode. Must be separate because there is no D arg. 
+// Generates NearestInt(v) for each float type and LMUL. There is no tag
+// argument, so the AVL comes from HWY_RVV_AVL(SEW, SHIFT).
+#define HWY_RVV_NEAREST(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
+                        SHIFT, MLEN, NAME, OP) \
+  HWY_API HWY_RVV_V(int, SEW, LMUL) NearestInt(HWY_RVV_V(BASE, SEW, LMUL) v) { \
+    return __riscv_vfcvt_x_f_v_i##SEW##LMUL(v, HWY_RVV_AVL(SEW, SHIFT)); \
+  }
+HWY_RVV_FOREACH_F(HWY_RVV_NEAREST, _, _, _ALL)
+#undef HWY_RVV_NEAREST
+
+// ================================================== COMBINE
+
+namespace detail {
+
+// For x86-compatible behaviour mandated by Highway API: TableLookupBytes
+// offsets are implicitly relative to the start of their 128-bit block.
+// Returns the number of lanes of type T per 16-byte block, capped by
+// MaxLanes(d) and (on the runtime path) by Lanes(d).
+template <typename T, size_t N, int kPow2>
+size_t LanesPerBlock(Simd<T, N, kPow2> d) {
+  // kMinVecBytes is the minimum size of VFromD<decltype(d)> in bytes
+  constexpr size_t kMinVecBytes =
+      ScaleByPower(16, HWY_MAX(HWY_MIN(kPow2, 3), -3));
+  // kMinVecLanes is the minimum number of lanes in VFromD<decltype(d)>
+  constexpr size_t kMinVecLanes = (kMinVecBytes + sizeof(T) - 1) / sizeof(T);
+  // kMaxLpb is the maximum number of lanes per block
+  constexpr size_t kMaxLpb = HWY_MIN(16 / sizeof(T), MaxLanes(d));
+
+  // If kMaxLpb <= kMinVecLanes is true, then kMaxLpb <= Lanes(d) is true
+  if (kMaxLpb <= kMinVecLanes) return kMaxLpb;
+
+  // Fractional LMUL: Lanes(d) may be smaller than kMaxLpb, so honor that.
+  const size_t lanes_per_vec = Lanes(d);
+  return HWY_MIN(lanes_per_vec, kMaxLpb);
+}
+
+// Clears the low bits of each index in iota0 with the mask
+// ~(LanesPerBlock(d) - 1), i.e. rounds each index down to a multiple of
+// LanesPerBlock(d) when that value is a power of two.
+template <class D, class V>
+HWY_INLINE V OffsetsOf128BitBlocks(const D d, const V iota0) {
+  using T = MakeUnsigned<TFromD<D>>;
+  return AndS(iota0, static_cast<T>(~(LanesPerBlock(d) - 1)));
+}
+
+// Returns a mask that is true for lanes whose index modulo LanesPerBlock is
+// less than kLanes (the comparison is done on the signed reinterpretation).
+template <size_t kLanes, class D>
+HWY_INLINE MFromD<D> FirstNPerBlock(D /* tag */) {
+  const RebindToUnsigned<D> du;
+  const RebindToSigned<D> di;
+  using TU = TFromD<decltype(du)>;
+  const auto idx_mod = AndS(Iota0(du), static_cast<TU>(LanesPerBlock(du) - 1));
+  return LtS(BitCast(di, idx_mod), static_cast<TFromD<decltype(di)>>(kLanes));
+}
+
+// Generates NAME(dst, src, lanes) wrapping the _vx_ slide-up intrinsic with
+// AVL = HWY_RVV_AVL(SEW, SHIFT).
+#define HWY_RVV_SLIDE_UP(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
+                         SHIFT, MLEN, NAME, OP) \
+  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
+      NAME(HWY_RVV_V(BASE, SEW, LMUL) dst, HWY_RVV_V(BASE, SEW, LMUL) src, \
+           size_t lanes) { \
+    return __riscv_v##OP##_vx_##CHAR##SEW##LMUL(dst, src, lanes, \
+                                                HWY_RVV_AVL(SEW, SHIFT)); \
+  }
+
+// Generates NAME(src, lanes) wrapping the _vx_ slide-down intrinsic.
+#define HWY_RVV_SLIDE_DOWN(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
+                           SHIFT, MLEN, NAME, OP) \
+  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
+      NAME(HWY_RVV_V(BASE, SEW, LMUL) src, size_t lanes) { \
+    return __riscv_v##OP##_vx_##CHAR##SEW##LMUL(src, lanes, \
+                                                HWY_RVV_AVL(SEW, SHIFT)); \
+  }
+
+HWY_RVV_FOREACH(HWY_RVV_SLIDE_UP, SlideUp, slideup, _ALL)
+HWY_RVV_FOREACH(HWY_RVV_SLIDE_DOWN, SlideDown, slidedown, _ALL)
+
+#undef HWY_RVV_SLIDE_UP
+#undef HWY_RVV_SLIDE_DOWN
+
+}  // namespace detail
+
+// ------------------------------ ConcatUpperLower
+// Takes the first Lanes(d) / 2 lanes from lo and the remaining lanes from hi.
+template <class D, class V>
+HWY_API V ConcatUpperLower(D d, const V hi, const V lo) {
+  return IfThenElse(FirstN(d, Lanes(d) / 2), lo, hi);
+}
+
+// ------------------------------ ConcatLowerLower
+// Slides hi up by Lanes(d) / 2 lanes, using lo as the SlideUp destination.
+template <class D, class V>
+HWY_API V ConcatLowerLower(D d, const V hi, const V lo) {
+  return detail::SlideUp(lo, hi, Lanes(d) / 2);
+}
+
+// ------------------------------ ConcatUpperUpper
+template <class D, class V>
+HWY_API V ConcatUpperUpper(D d, const V hi, const V lo) {
+  // Move upper half into lower
+  const auto lo_down = detail::SlideDown(lo, Lanes(d) / 2);
+  return ConcatUpperLower(d, hi, lo_down);
+}
+
+// ------------------------------ ConcatLowerUpper
+template <class D, class V>
+HWY_API V ConcatLowerUpper(D d, const V hi, const V lo) {
+  // Move half of both inputs to the other half
+  const auto hi_up = detail::SlideUp(hi, hi, Lanes(d) / 2);
+  const auto lo_down = detail::SlideDown(lo, Lanes(d) / 2);
+  return ConcatUpperLower(d, hi_up, lo_down);
+}
+
+// ------------------------------ Combine
+// Extends both halves to the type of d2 via detail::Ext, then slides hi up by
+// Lanes(d2) / 2 with the extended lo as destination.
+template <class D2, class V>
+HWY_API VFromD<D2> Combine(D2 d2, const V hi, const V lo) {
+  return detail::SlideUp(detail::Ext(d2, lo), detail::Ext(d2, hi),
+                         Lanes(d2) / 2);
+}
+
+// ------------------------------ ZeroExtendVector
+// Combine with an all-zero upper half (Xor(lo, lo)).
+template <class D2, class V>
+HWY_API VFromD<D2> ZeroExtendVector(D2 d2, const V lo) {
+  return Combine(d2, Xor(lo, lo), lo);
+}
+
+// ------------------------------ Lower/UpperHalf
+
+namespace detail {
+
+// RVV may only support LMUL >= SEW/64; returns whether that holds for D. Note
+// that SEW = sizeof(T)*8 and LMUL = 1 << d.Pow2(). Add 3 to Pow2 to avoid
+// negative shift counts.
+template <class D>
+constexpr bool IsSupportedLMUL(D d) {
+  return (size_t{1} << (d.Pow2() + 3)) >= sizeof(TFromD<D>);
+}
+
+}  // namespace detail
+
+// If IsSupportedLMUL, just 'truncate' i.e. halve LMUL.
+template <class DH, hwy::EnableIf<detail::IsSupportedLMUL(DH())>* = nullptr>
+HWY_API VFromD<DH> LowerHalf(const DH /* tag */, const VFromD<Twice<DH>> v) {
+  return detail::Trunc(v);
+}
+
+// Otherwise, there is no corresponding intrinsic type (e.g. vuint64mf2_t), and
+// the hardware may set "vill" if we attempt such an LMUL. However, the V
+// extension on application processors requires Zvl128b, i.e. VLEN >= 128, so it
+// still makes sense to have half of an SEW=64 vector. We instead just return
+// the vector, and rely on the kPow2 in DH to halve the return value of Lanes().
+template <class DH, class V,
+          hwy::EnableIf<!detail::IsSupportedLMUL(DH())>* = nullptr>
+HWY_API V LowerHalf(const DH /* tag */, const V v) {
+  return v;
+}
+
+// Same, but without D arg
+template <class V>
+HWY_API VFromD<Half<DFromV<V>>> LowerHalf(const V v) {
+  return LowerHalf(Half<DFromV<V>>(), v);
+}
+
+// Slides v down by Lanes(d2) lanes and returns the lower half of the result.
+template <class DH>
+HWY_API VFromD<DH> UpperHalf(const DH d2, const VFromD<Twice<DH>> v) {
+  return LowerHalf(d2, detail::SlideDown(v, Lanes(d2)));
+}
+
+// ================================================== SWIZZLE
+
+namespace detail {
+// Special instruction for 1 lane is presumably faster?
+// Generates NAME(v) wrapping a slide-by-one intrinsic. 0 is passed as the
+// intrinsic's scalar operand.
+#define HWY_RVV_SLIDE1(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
+                       MLEN, NAME, OP) \
+  HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
+    return __riscv_v##OP##_##CHAR##SEW##LMUL(v, 0, HWY_RVV_AVL(SEW, SHIFT)); \
+  }
+
+HWY_RVV_FOREACH_UI(HWY_RVV_SLIDE1, Slide1Up, slide1up_vx, _ALL)
+HWY_RVV_FOREACH_F(HWY_RVV_SLIDE1, Slide1Up, fslide1up_vf, _ALL)
+HWY_RVV_FOREACH_UI(HWY_RVV_SLIDE1, Slide1Down, slide1down_vx, _ALL)
+HWY_RVV_FOREACH_F(HWY_RVV_SLIDE1, Slide1Down, fslide1down_vf, _ALL)
+#undef HWY_RVV_SLIDE1
+}  // namespace detail
+
+// ------------------------------ GetLane
+
+// Generates GetLane(v) returning a scalar HWY_RVV_T(BASE, SEW); integer types
+// use the mv_x intrinsic family and float types use fmv_f.
+#define HWY_RVV_GET_LANE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
+                         SHIFT, MLEN, NAME, OP) \
+  HWY_API HWY_RVV_T(BASE, SEW) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
+    return __riscv_v##OP##_s_##CHAR##SEW##LMUL##_##CHAR##SEW(v); /* no AVL */ \
+  }
+
+HWY_RVV_FOREACH_UI(HWY_RVV_GET_LANE, GetLane, mv_x, _ALL)
+HWY_RVV_FOREACH_F(HWY_RVV_GET_LANE, GetLane, fmv_f, _ALL)
+#undef HWY_RVV_GET_LANE
+
+// ------------------------------ ExtractLane
+// Slides v down by i lanes and returns GetLane of the result.
+template <class V>
+HWY_API TFromV<V> ExtractLane(const V v, size_t i) {
+  return GetLane(detail::SlideDown(v, i));
+}
+
+// ------------------------------ InsertLane
+
+// For lanes wider than 8 bits: builds a mask of the lanes whose Iota0 index
+// equals i and replaces those lanes of v with t.
+template <class V, HWY_IF_NOT_T_SIZE_V(V, 1)>
+HWY_API V InsertLane(const V v, size_t i, TFromV<V> t) {
+  const DFromV<V> d;
+  const RebindToUnsigned<decltype(d)> du;  // Iota0 is unsigned only
+  using TU = TFromD<decltype(du)>;
+  const auto is_i = detail::EqS(detail::Iota0(du), static_cast<TU>(i));
+  return IfThenElse(RebindMask(d, is_i), Set(d, t), v);
+}
+
+namespace detail {
+HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGM, SetOnlyFirst, sof)
+}  // namespace detail
+
+// For 8-bit lanes, Iota0 might overflow.
+// Instead slides a vector of ones up by i over zeros, compares it with one,
+// and keeps only the first true lane of that mask via detail::SetOnlyFirst.
+template <class V, HWY_IF_T_SIZE_V(V, 1)>
+HWY_API V InsertLane(const V v, size_t i, TFromV<V> t) {
+  const DFromV<V> d;
+  const auto zero = Zero(d);
+  const auto one = Set(d, 1);
+  const auto ge_i = Eq(detail::SlideUp(zero, one, i), one);
+  const auto is_i = detail::SetOnlyFirst(ge_i);
+  return IfThenElse(RebindMask(d, is_i), Set(d, t), v);
+}
+
+// ------------------------------ OddEven
+
+namespace detail {
+
+// Faster version using a wide constant instead of Iota0 + AndS.
+// Sets every double-width lane to 1, reinterprets at the original width and
+// tests each lane for != 0.
+template <class D, HWY_IF_NOT_T_SIZE_D(D, 8)>
+HWY_INLINE MFromD<D> IsEven(D d) {
+  const RebindToUnsigned<decltype(d)> du;
+  const RepartitionToWide<decltype(du)> duw;
+  return RebindMask(d, detail::NeS(BitCast(du, Set(duw, 1)), 0u));
+}
+
+// 8-byte lanes use the overload below instead: tests (Iota0 & 1) == 0.
+template <class D, HWY_IF_T_SIZE_D(D, 8)>
+HWY_INLINE MFromD<D> IsEven(D d) {
+  const RebindToUnsigned<decltype(d)> du;  // Iota0 is unsigned only
+  return detail::EqS(detail::AndS(detail::Iota0(du), 1), 0);
+}
+
+// Also provide the negated form because there is no native CompressNot.
+// Same wide-constant construction as IsEven, but tests each lane for == 0.
+template <class D, HWY_IF_NOT_T_SIZE_D(D, 8)>
+HWY_INLINE MFromD<D> IsOdd(D d) {
+  const RebindToUnsigned<decltype(d)> du;
+  const RepartitionToWide<decltype(du)> duw;
+  return RebindMask(d, detail::EqS(BitCast(du, Set(duw, 1)), 0u));
+}
+
+// 8-byte lanes: tests (Iota0 & 1) != 0.
+template <class D, HWY_IF_T_SIZE_D(D, 8)>
+HWY_INLINE MFromD<D> IsOdd(D d) {
+  const RebindToUnsigned<decltype(d)> du;  // Iota0 is unsigned only
+  return detail::NeS(detail::AndS(detail::Iota0(du), 1), 0);
+}
+
+}  // namespace detail
+
+// Selects b in lanes where detail::IsEven is true and a in all other lanes.
+template <class V>
+HWY_API V OddEven(const V a, const V b) {
+  return IfThenElse(detail::IsEven(DFromV<V>()), b, a);
+}
+
+// ------------------------------ DupEven (OddEven)
+// Even lanes keep v; odd lanes take v slid up by one lane.
+template <class V>
+HWY_API V DupEven(const V v) {
+  const V up = detail::Slide1Up(v);
+  return OddEven(up, v);
+}
+
+// ------------------------------ DupOdd (OddEven)
+// Odd lanes keep v; even lanes take v slid down by one lane.
+template <class V>
+HWY_API V DupOdd(const V v) {
+  const V down = detail::Slide1Down(v);
+  return OddEven(v, down);
+}
+
+// ------------------------------ OddEvenBlocks
+// Computes a block index per lane as Iota0 >> CeilLog2(16 / sizeof(T)) and
+// selects b where that block index is even, a otherwise.
+template <class V>
+HWY_API V OddEvenBlocks(const V a, const V b) {
+  const RebindToUnsigned<DFromV<V>> du;  // Iota0 is unsigned only
+  constexpr size_t kShift = CeilLog2(16 / sizeof(TFromV<V>));
+  const auto idx_block = ShiftRight<kShift>(detail::Iota0(du));
+  const auto is_even = detail::EqS(detail::AndS(idx_block, 1), 0);
+  return IfThenElse(is_even, b, a);
+}
+
+// ------------------------------ SwapAdjacentBlocks
+// Slides v down and up by LanesPerBlock(d) lanes, then takes the slid-down
+// copy in even blocks and the slid-up copy in odd blocks.
+template <class V>
+HWY_API V SwapAdjacentBlocks(const V v) {
+  const DFromV<V> d;
+  const size_t lpb = detail::LanesPerBlock(d);
+  const V down = detail::SlideDown(v, lpb);
+  const V up = detail::SlideUp(v, v, lpb);
+  return OddEvenBlocks(up, down);
+}
+
+// ------------------------------ TableLookupLanes
+
+// Reinterprets vec as unsigned indices. In debug builds, asserts that every
+// index is unchanged by masking with (2 * Lanes(d) - 1).
+template <class D, class VI>
+HWY_API VFromD<RebindToUnsigned<D>> IndicesFromVec(D d, VI vec) {
+  static_assert(sizeof(TFromD<D>) == sizeof(TFromV<VI>), "Index != lane");
+  const RebindToUnsigned<decltype(d)> du;  // instead of <D>: avoids unused d.
+  const auto indices = BitCast(du, vec);
+#if HWY_IS_DEBUG_BUILD
+  using TU = TFromD<decltype(du)>;
+  const size_t twice_num_of_lanes = Lanes(d) * 2;
+  HWY_DASSERT(AllTrue(
+      du, Eq(indices,
+             detail::AndS(indices, static_cast<TU>(twice_num_of_lanes - 1)))));
+#endif
+  return indices;
+}
+
+// Loads indices from idx with LoadU and passes them through IndicesFromVec.
+template <class D, typename TI>
+HWY_API VFromD<RebindToUnsigned<D>> SetTableIndices(D d, const TI* idx) {
+  static_assert(sizeof(TFromD<D>) == sizeof(TI), "Index size must match lane");
+  return IndicesFromVec(d, LoadU(Rebind<TI, D>(), idx));
+}
+
+// TODO(janwas): avoid using this for 8-bit; wrap in detail namespace.
+// For large 8-bit vectors, index overflow will lead to incorrect results.
+// Reverse already uses TableLookupLanes16 to prevent this.
+#define HWY_RVV_TABLE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
+                      MLEN, NAME, OP) \
+  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
+      NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(uint, SEW, LMUL) idx) { \
+    return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(v, idx, \
+                                                HWY_RVV_AVL(SEW, SHIFT)); \
+  }
+
+HWY_RVV_FOREACH(HWY_RVV_TABLE, TableLookupLanes, rgather, _ALL)
+#undef HWY_RVV_TABLE
+
+namespace detail {
+
+// Used by I8/U8 Reverse
+// The index vector uses the double-width lane type and LMUL (SEWD, LMULD).
+#define HWY_RVV_TABLE16(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
+                        SHIFT, MLEN, NAME, OP) \
+  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
+      NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(uint, SEWD, LMULD) idx) { \
+    return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(v, idx, \
+                                                HWY_RVV_AVL(SEW, SHIFT)); \
+  }
+
+HWY_RVV_FOREACH_UI08(HWY_RVV_TABLE16, TableLookupLanes16, rgatherei16, _EXT)
+#undef HWY_RVV_TABLE16
+
+// Used by Expand.
+#define HWY_RVV_MASKED_TABLE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ + SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_M(MLEN) mask, HWY_RVV_V(BASE, SEW, LMUL) maskedoff, \ + HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(uint, SEW, LMUL) idx) { \ + return __riscv_v##OP##_vv_##CHAR##SEW##LMUL##_mu(mask, maskedoff, v, idx, \ + HWY_RVV_AVL(SEW, SHIFT)); \ + } + +HWY_RVV_FOREACH(HWY_RVV_MASKED_TABLE, MaskedTableLookupLanes, rgather, _ALL) +#undef HWY_RVV_MASKED_TABLE + +#define HWY_RVV_MASKED_TABLE16(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, \ + LMULH, SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_M(MLEN) mask, HWY_RVV_V(BASE, SEW, LMUL) maskedoff, \ + HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(uint, SEWD, LMULD) idx) { \ + return __riscv_v##OP##_vv_##CHAR##SEW##LMUL##_mu(mask, maskedoff, v, idx, \ + HWY_RVV_AVL(SEW, SHIFT)); \ + } + +HWY_RVV_FOREACH_UI08(HWY_RVV_MASKED_TABLE16, MaskedTableLookupLanes16, + rgatherei16, _EXT) +#undef HWY_RVV_MASKED_TABLE16 + +} // namespace detail + +// ------------------------------ Reverse (TableLookupLanes) +template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_POW2_LE_D(D, 2)> +HWY_API VFromD<D> Reverse(D d, VFromD<D> v) { + const Rebind<uint16_t, decltype(d)> du16; + const size_t N = Lanes(d); + const auto idx = + detail::ReverseSubS(detail::Iota0(du16), static_cast<uint16_t>(N - 1)); + return detail::TableLookupLanes16(v, idx); +} + +template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_POW2_GT_D(D, 2)> +HWY_API VFromD<D> Reverse(D d, VFromD<D> v) { + const Half<decltype(d)> dh; + const Rebind<uint16_t, decltype(dh)> du16; + const size_t half_n = Lanes(dh); + const auto idx = detail::ReverseSubS(detail::Iota0(du16), + static_cast<uint16_t>(half_n - 1)); + const auto reversed_lo = detail::TableLookupLanes16(LowerHalf(dh, v), idx); + const auto reversed_hi = detail::TableLookupLanes16(UpperHalf(dh, v), idx); + return Combine(d, reversed_lo, reversed_hi); +} + +template <class 
D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 2) | (1 << 4) | (1 << 8))> +HWY_API VFromD<D> Reverse(D /* tag */, VFromD<D> v) { + const RebindToUnsigned<D> du; + using TU = TFromD<decltype(du)>; + const size_t N = Lanes(du); + const auto idx = + detail::ReverseSubS(detail::Iota0(du), static_cast<TU>(N - 1)); + return TableLookupLanes(v, idx); +} + +// ------------------------------ Reverse2 (RotateRight, OddEven) + +// Per-target flags to prevent generic_ops-inl.h defining 8-bit Reverse2/4/8. +#ifdef HWY_NATIVE_REVERSE2_8 +#undef HWY_NATIVE_REVERSE2_8 +#else +#define HWY_NATIVE_REVERSE2_8 +#endif + +// Shifting and adding requires fewer instructions than blending, but casting to +// u32 only works for LMUL in [1/2, 8]. + +template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_POW2_GT_D(D, -3)> +HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) { + const Repartition<uint16_t, D> du16; + return BitCast(d, RotateRight<8>(BitCast(du16, v))); +} +// For LMUL < 1/4, we can extend and then truncate. +template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_POW2_LE_D(D, -3)> +HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) { + const Twice<decltype(d)> d2; + const Repartition<uint16_t, decltype(d2)> du16; + const auto vx = detail::Ext(d2, v); + const auto rx = BitCast(d2, RotateRight<8>(BitCast(du16, vx))); + return detail::Trunc(rx); +} + +template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_POW2_GT_D(D, -2)> +HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) { + const Repartition<uint32_t, D> du32; + return BitCast(d, RotateRight<16>(BitCast(du32, v))); +} +// For LMUL < 1/2, we can extend and then truncate. 
+template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_POW2_LE_D(D, -2)> +HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) { + const Twice<decltype(d)> d2; + const Twice<decltype(d2)> d4; + const Repartition<uint32_t, decltype(d4)> du32; + const auto vx = detail::Ext(d4, detail::Ext(d2, v)); + const auto rx = BitCast(d4, RotateRight<16>(BitCast(du32, vx))); + return detail::Trunc(detail::Trunc(rx)); +} + +// Shifting and adding requires fewer instructions than blending, but casting to +// u64 does not work for LMUL < 1. +template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_POW2_GT_D(D, -1)> +HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) { + const Repartition<uint64_t, decltype(d)> du64; + return BitCast(d, RotateRight<32>(BitCast(du64, v))); +} + +// For fractions, we can extend and then truncate. +template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_POW2_LE_D(D, -1)> +HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) { + const Twice<decltype(d)> d2; + const Twice<decltype(d2)> d4; + const Repartition<uint64_t, decltype(d4)> du64; + const auto vx = detail::Ext(d4, detail::Ext(d2, v)); + const auto rx = BitCast(d4, RotateRight<32>(BitCast(du64, vx))); + return detail::Trunc(detail::Trunc(rx)); +} + +template <class D, class V = VFromD<D>, HWY_IF_T_SIZE_D(D, 8)> +HWY_API V Reverse2(D /* tag */, const V v) { + const V up = detail::Slide1Up(v); + const V down = detail::Slide1Down(v); + return OddEven(up, down); +} + +// ------------------------------ Reverse4 (TableLookupLanes) + +template <class D, HWY_IF_T_SIZE_D(D, 1)> +HWY_API VFromD<D> Reverse4(D d, const VFromD<D> v) { + const Repartition<uint16_t, D> du16; + return BitCast(d, Reverse2(du16, BitCast(du16, Reverse2(d, v)))); +} + +template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)> +HWY_API VFromD<D> Reverse4(D d, const VFromD<D> v) { + const RebindToUnsigned<D> du; + const auto idx = detail::XorS(detail::Iota0(du), 3); + return BitCast(d, TableLookupLanes(BitCast(du, v), idx)); +} + +// ------------------------------ 
Reverse8 (TableLookupLanes) + +template <class D, HWY_IF_T_SIZE_D(D, 1)> +HWY_API VFromD<D> Reverse8(D d, const VFromD<D> v) { + const Repartition<uint32_t, D> du32; + return BitCast(d, Reverse2(du32, BitCast(du32, Reverse4(d, v)))); +} + +template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)> +HWY_API VFromD<D> Reverse8(D d, const VFromD<D> v) { + const RebindToUnsigned<D> du; + const auto idx = detail::XorS(detail::Iota0(du), 7); + return BitCast(d, TableLookupLanes(BitCast(du, v), idx)); +} + +// ------------------------------ ReverseBlocks (Reverse, Shuffle01) +template <class D, class V = VFromD<D>> +HWY_API V ReverseBlocks(D d, V v) { + const Repartition<uint64_t, D> du64; + const size_t N = Lanes(du64); + const auto rev = + detail::ReverseSubS(detail::Iota0(du64), static_cast<uint64_t>(N - 1)); + // Swap lo/hi u64 within each block + const auto idx = detail::XorS(rev, 1); + return BitCast(d, TableLookupLanes(BitCast(du64, v), idx)); +} + +// ------------------------------ Compress + +// RVV supports all lane types natively. +#ifdef HWY_NATIVE_COMPRESS8 +#undef HWY_NATIVE_COMPRESS8 +#else +#define HWY_NATIVE_COMPRESS8 +#endif + +template <typename T> +struct CompressIsPartition { + enum { value = 0 }; +}; + +#define HWY_RVV_COMPRESS(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ + SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) mask) { \ + return __riscv_v##OP##_vm_##CHAR##SEW##LMUL(v, mask, \ + HWY_RVV_AVL(SEW, SHIFT)); \ + } + +HWY_RVV_FOREACH(HWY_RVV_COMPRESS, Compress, compress, _ALL) +#undef HWY_RVV_COMPRESS + +// ------------------------------ Expand + +#ifdef HWY_NATIVE_EXPAND +#undef HWY_NATIVE_EXPAND +#else +#define HWY_NATIVE_EXPAND +#endif + +// >= 2-byte lanes: idx lanes will not overflow. 
+template <class V, class M, HWY_IF_NOT_T_SIZE_V(V, 1)> +HWY_API V Expand(V v, const M mask) { + const DFromV<V> d; + const RebindToUnsigned<decltype(d)> du; + const auto idx = detail::MaskedIota(du, RebindMask(du, mask)); + const V zero = Zero(d); + return detail::MaskedTableLookupLanes(mask, zero, v, idx); +} + +// 1-byte lanes, LMUL < 8: promote idx to u16. +template <class V, class M, HWY_IF_T_SIZE_V(V, 1), class D = DFromV<V>, + HWY_IF_POW2_LE_D(D, 2)> +HWY_API V Expand(V v, const M mask) { + const D d; + const Rebind<uint16_t, decltype(d)> du16; + const auto idx = detail::MaskedIota(du16, RebindMask(du16, mask)); + const V zero = Zero(d); + return detail::MaskedTableLookupLanes16(mask, zero, v, idx); +} + +// 1-byte lanes, max LMUL: unroll 2x. +template <class V, class M, HWY_IF_T_SIZE_V(V, 1), class D = DFromV<V>, + HWY_IF_POW2_GT_D(DFromV<V>, 2)> +HWY_API V Expand(V v, const M mask) { + const D d; + const Half<D> dh; + const auto v0 = LowerHalf(dh, v); + // TODO(janwas): skip vec<->mask if we can cast masks. + const V vmask = VecFromMask(d, mask); + const auto m0 = MaskFromVec(LowerHalf(dh, vmask)); + + // Cannot just use UpperHalf, must shift by the number of inputs consumed. 
+ const size_t count = CountTrue(dh, m0); + const auto v1 = detail::Trunc(detail::SlideDown(v, count)); + const auto m1 = MaskFromVec(UpperHalf(dh, vmask)); + return Combine(d, Expand(v1, m1), Expand(v0, m0)); +} + +// ------------------------------ LoadExpand +template <class D> +HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d, + const TFromD<D>* HWY_RESTRICT unaligned) { + return Expand(LoadU(d, unaligned), mask); +} + +// ------------------------------ CompressNot +template <class V, class M> +HWY_API V CompressNot(V v, const M mask) { + return Compress(v, Not(mask)); +} + +// ------------------------------ CompressBlocksNot +template <class V, class M> +HWY_API V CompressBlocksNot(V v, const M mask) { + return CompressNot(v, mask); +} + +// ------------------------------ CompressStore +template <class V, class M, class D> +HWY_API size_t CompressStore(const V v, const M mask, const D d, + TFromD<D>* HWY_RESTRICT unaligned) { + StoreU(Compress(v, mask), d, unaligned); + return CountTrue(d, mask); +} + +// ------------------------------ CompressBlendedStore +template <class V, class M, class D> +HWY_API size_t CompressBlendedStore(const V v, const M mask, const D d, + TFromD<D>* HWY_RESTRICT unaligned) { + const size_t count = CountTrue(d, mask); + detail::StoreN(count, Compress(v, mask), d, unaligned); + return count; +} + +// ================================================== COMPARE (2) + +// ------------------------------ FindLastTrue + +template <class D> +HWY_API intptr_t FindLastTrue(D d, MFromD<D> m) { + const RebindToSigned<decltype(d)> di; + const intptr_t fft_rev_idx = + FindFirstTrue(d, MaskFromVec(Reverse(di, VecFromMask(di, m)))); + return (fft_rev_idx >= 0) + ? 
(static_cast<intptr_t>(Lanes(d) - 1) - fft_rev_idx) + : intptr_t{-1}; +} + +template <class D> +HWY_API size_t FindKnownLastTrue(D d, MFromD<D> m) { + const RebindToSigned<decltype(d)> di; + const size_t fft_rev_idx = + FindKnownFirstTrue(d, MaskFromVec(Reverse(di, VecFromMask(di, m)))); + return Lanes(d) - 1 - fft_rev_idx; +} + +// ------------------------------ ConcatOdd (Compress) + +namespace detail { + +#define HWY_RVV_NARROW(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ + MLEN, NAME, OP) \ + template <size_t kShift> \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEWD, LMULD) v) { \ + return __riscv_v##OP##_wx_##CHAR##SEW##LMUL(v, kShift, \ + HWY_RVV_AVL(SEWD, SHIFT + 1)); \ + } + +HWY_RVV_FOREACH_U08(HWY_RVV_NARROW, Narrow, nsrl, _EXT) +HWY_RVV_FOREACH_U16(HWY_RVV_NARROW, Narrow, nsrl, _EXT) +HWY_RVV_FOREACH_U32(HWY_RVV_NARROW, Narrow, nsrl, _EXT) +#undef HWY_RVV_NARROW + +} // namespace detail + +// Casting to wider and narrowing is the fastest for < 64-bit lanes. +template <class D, HWY_IF_NOT_T_SIZE_D(D, 8), HWY_IF_POW2_LE_D(D, 2)> +HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) { + constexpr size_t kBits = sizeof(TFromD<D>) * 8; + const Twice<decltype(d)> dt; + const RepartitionToWide<RebindToUnsigned<decltype(dt)>> dtuw; + const VFromD<decltype(dtuw)> hl = BitCast(dtuw, Combine(dt, hi, lo)); + return BitCast(d, detail::Narrow<kBits>(hl)); +} + +// 64-bit: Combine+Compress. +template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_POW2_LE_D(D, 2)> +HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) { + const Twice<decltype(d)> dt; + const VFromD<decltype(dt)> hl = Combine(dt, hi, lo); + return LowerHalf(d, Compress(hl, detail::IsOdd(dt))); +} + +// Any type, max LMUL: Compress both, then Combine. 
+template <class D, HWY_IF_POW2_GT_D(D, 2)> +HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) { + const Half<decltype(d)> dh; + const MFromD<D> is_odd = detail::IsOdd(d); + const VFromD<decltype(d)> hi_odd = Compress(hi, is_odd); + const VFromD<decltype(d)> lo_odd = Compress(lo, is_odd); + return Combine(d, LowerHalf(dh, hi_odd), LowerHalf(dh, lo_odd)); +} + +// ------------------------------ ConcatEven (Compress) + +// Casting to wider and narrowing is the fastest for < 64-bit lanes. +template <class D, HWY_IF_NOT_T_SIZE_D(D, 8), HWY_IF_POW2_LE_D(D, 2)> +HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) { + const Twice<decltype(d)> dt; + const RepartitionToWide<RebindToUnsigned<decltype(dt)>> dtuw; + const VFromD<decltype(dtuw)> hl = BitCast(dtuw, Combine(dt, hi, lo)); + return BitCast(d, detail::Narrow<0>(hl)); +} + +// 64-bit: Combine+Compress. +template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_POW2_LE_D(D, 2)> +HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) { + const Twice<decltype(d)> dt; + const VFromD<decltype(dt)> hl = Combine(dt, hi, lo); + return LowerHalf(d, Compress(hl, detail::IsEven(dt))); +} + +// Any type, max LMUL: Compress both, then Combine. 
+template <class D, HWY_IF_POW2_GT_D(D, 2)> +HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) { + const Half<decltype(d)> dh; + const MFromD<D> is_even = detail::IsEven(d); + const VFromD<decltype(d)> hi_even = Compress(hi, is_even); + const VFromD<decltype(d)> lo_even = Compress(lo, is_even); + return Combine(d, LowerHalf(dh, hi_even), LowerHalf(dh, lo_even)); +} + +// ================================================== BLOCKWISE + +// ------------------------------ CombineShiftRightBytes +template <size_t kBytes, class D, class V = VFromD<D>> +HWY_API V CombineShiftRightBytes(const D d, const V hi, V lo) { + const Repartition<uint8_t, decltype(d)> d8; + const auto hi8 = BitCast(d8, hi); + const auto lo8 = BitCast(d8, lo); + const auto hi_up = detail::SlideUp(hi8, hi8, 16 - kBytes); + const auto lo_down = detail::SlideDown(lo8, kBytes); + const auto is_lo = detail::FirstNPerBlock<16 - kBytes>(d8); + return BitCast(d, IfThenElse(is_lo, lo_down, hi_up)); +} + +// ------------------------------ CombineShiftRightLanes +template <size_t kLanes, class D, class V = VFromD<D>> +HWY_API V CombineShiftRightLanes(const D d, const V hi, V lo) { + constexpr size_t kLanesUp = 16 / sizeof(TFromV<V>) - kLanes; + const auto hi_up = detail::SlideUp(hi, hi, kLanesUp); + const auto lo_down = detail::SlideDown(lo, kLanes); + const auto is_lo = detail::FirstNPerBlock<kLanesUp>(d); + return IfThenElse(is_lo, lo_down, hi_up); +} + +// ------------------------------ Shuffle2301 (ShiftLeft) +template <class V> +HWY_API V Shuffle2301(const V v) { + const DFromV<V> d; + static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types"); + const Repartition<uint64_t, decltype(d)> du64; + const auto v64 = BitCast(du64, v); + return BitCast(d, Or(ShiftRight<32>(v64), ShiftLeft<32>(v64))); +} + +// ------------------------------ Shuffle2103 +template <class V> +HWY_API V Shuffle2103(const V v) { + const DFromV<V> d; + static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined 
for 32-bit types"); + return CombineShiftRightLanes<3>(d, v, v); +} + +// ------------------------------ Shuffle0321 +template <class V> +HWY_API V Shuffle0321(const V v) { + const DFromV<V> d; + static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types"); + return CombineShiftRightLanes<1>(d, v, v); +} + +// ------------------------------ Shuffle1032 +template <class V> +HWY_API V Shuffle1032(const V v) { + const DFromV<V> d; + static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types"); + return CombineShiftRightLanes<2>(d, v, v); +} + +// ------------------------------ Shuffle01 +template <class V> +HWY_API V Shuffle01(const V v) { + const DFromV<V> d; + static_assert(sizeof(TFromD<decltype(d)>) == 8, "Defined for 64-bit types"); + return CombineShiftRightLanes<1>(d, v, v); +} + +// ------------------------------ Shuffle0123 +template <class V> +HWY_API V Shuffle0123(const V v) { + return Shuffle2301(Shuffle1032(v)); +} + +// ------------------------------ TableLookupBytes + +// Extends or truncates a vector to match the given d. 
+namespace detail { + +template <class D> +HWY_INLINE VFromD<D> ChangeLMUL(D /* d */, VFromD<D> v) { + return v; +} + +// LMUL of VFromD<D> < LMUL of V: need to truncate v +template <class D, class V, + hwy::EnableIf<IsSame<TFromD<D>, TFromV<V>>()>* = nullptr, + HWY_IF_POW2_LE_D(DFromV<VFromD<D>>, DFromV<V>().Pow2() - 1)> +HWY_INLINE VFromD<D> ChangeLMUL(D d, V v) { + const DFromV<decltype(v)> d_from; + const Half<decltype(d_from)> dh_from; + static_assert( + DFromV<VFromD<decltype(dh_from)>>().Pow2() < DFromV<V>().Pow2(), + "The LMUL of VFromD<decltype(dh_from)> must be less than the LMUL of V"); + static_assert( + DFromV<VFromD<D>>().Pow2() <= DFromV<VFromD<decltype(dh_from)>>().Pow2(), + "The LMUL of VFromD<D> must be less than or equal to the LMUL of " + "VFromD<decltype(dh_from)>"); + return ChangeLMUL(d, Trunc(v)); +} + +// LMUL of VFromD<D> > LMUL of V: need to extend v +template <class D, class V, + hwy::EnableIf<IsSame<TFromD<D>, TFromV<V>>()>* = nullptr, + HWY_IF_POW2_GT_D(DFromV<VFromD<D>>, DFromV<V>().Pow2())> +HWY_INLINE VFromD<D> ChangeLMUL(D d, V v) { + const DFromV<decltype(v)> d_from; + const Twice<decltype(d_from)> dt_from; + static_assert(DFromV<VFromD<decltype(dt_from)>>().Pow2() > DFromV<V>().Pow2(), + "The LMUL of VFromD<decltype(dt_from)> must be greater than " + "the LMUL of V"); + static_assert( + DFromV<VFromD<D>>().Pow2() >= DFromV<VFromD<decltype(dt_from)>>().Pow2(), + "The LMUL of VFromD<D> must be greater than or equal to the LMUL of " + "VFromD<decltype(dt_from)>"); + return ChangeLMUL(d, Ext(dt_from, v)); +} + +} // namespace detail + +template <class VT, class VI> +HWY_API VI TableLookupBytes(const VT vt, const VI vi) { + const DFromV<VT> dt; // T=table, I=index. + const DFromV<VI> di; + const Repartition<uint8_t, decltype(dt)> dt8; + const Repartition<uint8_t, decltype(di)> di8; + // Required for producing half-vectors with table lookups from a full vector. 
+ // If we instead run at the LMUL of the index vector, lookups into the table + // would be truncated. Thus we run at the larger of the two LMULs and truncate + // the result vector to the original index LMUL. + constexpr int kPow2T = dt8.Pow2(); + constexpr int kPow2I = di8.Pow2(); + const Simd<uint8_t, MaxLanes(di8), HWY_MAX(kPow2T, kPow2I)> dm8; // m=max + const auto vmt = detail::ChangeLMUL(dm8, BitCast(dt8, vt)); + const auto vmi = detail::ChangeLMUL(dm8, BitCast(di8, vi)); + auto offsets = detail::OffsetsOf128BitBlocks(dm8, detail::Iota0(dm8)); + // If the table is shorter, wrap around offsets so they do not reference + // undefined lanes in the newly extended vmt. + if (kPow2T < kPow2I) { + offsets = detail::AndS(offsets, static_cast<uint8_t>(Lanes(dt8) - 1)); + } + const auto out = TableLookupLanes(vmt, Add(vmi, offsets)); + return BitCast(di, detail::ChangeLMUL(di8, out)); +} + +template <class VT, class VI> +HWY_API VI TableLookupBytesOr0(const VT vt, const VI idx) { + const DFromV<VI> di; + const Repartition<int8_t, decltype(di)> di8; + const auto idx8 = BitCast(di8, idx); + const auto lookup = TableLookupBytes(vt, idx8); + return BitCast(di, IfThenZeroElse(detail::LtS(idx8, 0), lookup)); +} + +// ------------------------------ TwoTablesLookupLanes + +// TODO(janwas): special-case 8-bit lanes to safely handle VL >= 256 +template <class D, HWY_IF_POW2_LE_D(D, 2)> +HWY_API VFromD<D> TwoTablesLookupLanes(D d, VFromD<D> a, VFromD<D> b, + VFromD<RebindToUnsigned<D>> idx) { + const Twice<decltype(d)> dt; + const RebindToUnsigned<decltype(dt)> dt_u; + const auto combined_tbl = Combine(dt, b, a); + const auto combined_idx = Combine(dt_u, idx, idx); + return LowerHalf(d, TableLookupLanes(combined_tbl, combined_idx)); +} + +template <class D, HWY_IF_POW2_GT_D(D, 2)> +HWY_API VFromD<D> TwoTablesLookupLanes(D d, VFromD<D> a, VFromD<D> b, + VFromD<RebindToUnsigned<D>> idx) { + const RebindToUnsigned<decltype(d)> du; + using TU = TFromD<decltype(du)>; + + const 
size_t num_of_lanes = Lanes(d); + const auto idx_mod = detail::AndS(idx, static_cast<TU>(num_of_lanes - 1)); + const auto sel_a_mask = Ne(idx, idx_mod); // FALSE if a + + const auto a_lookup_result = TableLookupLanes(a, idx_mod); + return detail::MaskedTableLookupLanes(sel_a_mask, a_lookup_result, b, + idx_mod); +} + +template <class V> +HWY_API V TwoTablesLookupLanes(V a, V b, + VFromD<RebindToUnsigned<DFromV<V>>> idx) { + const DFromV<decltype(a)> d; + return TwoTablesLookupLanes(d, a, b, idx); +} + +// ------------------------------ Broadcast +template <int kLane, class V> +HWY_API V Broadcast(const V v) { + const DFromV<V> d; + const RebindToUnsigned<decltype(d)> du; + HWY_DASSERT(0 <= kLane && kLane < detail::LanesPerBlock(d)); + auto idx = detail::OffsetsOf128BitBlocks(d, detail::Iota0(du)); + if (kLane != 0) { + idx = detail::AddS(idx, kLane); + } + return TableLookupLanes(v, idx); +} + +// ------------------------------ ShiftLeftLanes + +template <size_t kLanes, class D, class V = VFromD<D>> +HWY_API V ShiftLeftLanes(const D d, const V v) { + const RebindToSigned<decltype(d)> di; + const RebindToUnsigned<decltype(d)> du; + using TI = TFromD<decltype(di)>; + const auto shifted = detail::SlideUp(v, v, kLanes); + // Match x86 semantics by zeroing lower lanes in 128-bit blocks + const auto idx_mod = + detail::AndS(BitCast(di, detail::Iota0(du)), + static_cast<TI>(detail::LanesPerBlock(di) - 1)); + const auto clear = detail::LtS(idx_mod, static_cast<TI>(kLanes)); + return IfThenZeroElse(clear, shifted); +} + +template <size_t kLanes, class V> +HWY_API V ShiftLeftLanes(const V v) { + return ShiftLeftLanes<kLanes>(DFromV<V>(), v); +} + +// ------------------------------ ShiftLeftBytes + +template <int kBytes, class D> +HWY_API VFromD<D> ShiftLeftBytes(D d, const VFromD<D> v) { + const Repartition<uint8_t, decltype(d)> d8; + return BitCast(d, ShiftLeftLanes<kBytes>(BitCast(d8, v))); +} + +template <int kBytes, class V> +HWY_API V ShiftLeftBytes(const V v) { + 
return ShiftLeftBytes<kBytes>(DFromV<V>(), v); +} + +// ------------------------------ ShiftRightLanes +template <size_t kLanes, typename T, size_t N, int kPow2, + class V = VFromD<Simd<T, N, kPow2>>> +HWY_API V ShiftRightLanes(const Simd<T, N, kPow2> d, V v) { + const RebindToSigned<decltype(d)> di; + const RebindToUnsigned<decltype(d)> du; + using TI = TFromD<decltype(di)>; + // For partial vectors, clear upper lanes so we shift in zeros. + if (N <= 16 / sizeof(T)) { + v = IfThenElseZero(FirstN(d, N), v); + } + + const auto shifted = detail::SlideDown(v, kLanes); + // Match x86 semantics by zeroing upper lanes in 128-bit blocks + const size_t lpb = detail::LanesPerBlock(di); + const auto idx_mod = + detail::AndS(BitCast(di, detail::Iota0(du)), static_cast<TI>(lpb - 1)); + const auto keep = detail::LtS(idx_mod, static_cast<TI>(lpb - kLanes)); + return IfThenElseZero(keep, shifted); +} + +// ------------------------------ ShiftRightBytes +template <int kBytes, class D, class V = VFromD<D>> +HWY_API V ShiftRightBytes(const D d, const V v) { + const Repartition<uint8_t, decltype(d)> d8; + return BitCast(d, ShiftRightLanes<kBytes>(d8, BitCast(d8, v))); +} + +// ------------------------------ InterleaveLower + +template <class D, class V> +HWY_API V InterleaveLower(D d, const V a, const V b) { + static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch"); + const RebindToUnsigned<decltype(d)> du; + using TU = TFromD<decltype(du)>; + const auto i = detail::Iota0(du); + const auto idx_mod = ShiftRight<1>( + detail::AndS(i, static_cast<TU>(detail::LanesPerBlock(du) - 1))); + const auto idx = Add(idx_mod, detail::OffsetsOf128BitBlocks(d, i)); + const auto is_even = detail::EqS(detail::AndS(i, 1), 0u); + return IfThenElse(is_even, TableLookupLanes(a, idx), + TableLookupLanes(b, idx)); +} + +template <class V> +HWY_API V InterleaveLower(const V a, const V b) { + return InterleaveLower(DFromV<V>(), a, b); +} + +// ------------------------------ InterleaveUpper + +template 
<class D, class V> +HWY_API V InterleaveUpper(const D d, const V a, const V b) { + static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch"); + const RebindToUnsigned<decltype(d)> du; + using TU = TFromD<decltype(du)>; + const size_t lpb = detail::LanesPerBlock(du); + const auto i = detail::Iota0(du); + const auto idx_mod = ShiftRight<1>(detail::AndS(i, static_cast<TU>(lpb - 1))); + const auto idx_lower = Add(idx_mod, detail::OffsetsOf128BitBlocks(d, i)); + const auto idx = detail::AddS(idx_lower, static_cast<TU>(lpb / 2)); + const auto is_even = detail::EqS(detail::AndS(i, 1), 0u); + return IfThenElse(is_even, TableLookupLanes(a, idx), + TableLookupLanes(b, idx)); +} + +// ------------------------------ ZipLower + +template <class V, class DW = RepartitionToWide<DFromV<V>>> +HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) { + const RepartitionToNarrow<DW> dn; + static_assert(IsSame<TFromD<decltype(dn)>, TFromV<V>>(), "D/V mismatch"); + return BitCast(dw, InterleaveLower(dn, a, b)); +} + +template <class V, class DW = RepartitionToWide<DFromV<V>>> +HWY_API VFromD<DW> ZipLower(V a, V b) { + return BitCast(DW(), InterleaveLower(a, b)); +} + +// ------------------------------ ZipUpper +template <class DW, class V> +HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) { + const RepartitionToNarrow<DW> dn; + static_assert(IsSame<TFromD<decltype(dn)>, TFromV<V>>(), "D/V mismatch"); + return BitCast(dw, InterleaveUpper(dn, a, b)); +} + +// ================================================== REDUCE + +// vector = f(vector, zero_m1) +#define HWY_RVV_REDUCE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ + MLEN, NAME, OP) \ + template <class D> \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(D d, HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, m1) v0) { \ + return Set(d, \ + GetLane(__riscv_v##OP##_vs_##CHAR##SEW##LMUL##_##CHAR##SEW##m1( \ + v, v0, Lanes(d)))); \ + } + +// ------------------------------ SumOfLanes + +namespace detail { 
+HWY_RVV_FOREACH_UI(HWY_RVV_REDUCE, RedSum, redsum, _ALL) +HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedSum, fredusum, _ALL) +} // namespace detail + +template <class D> +HWY_API VFromD<D> SumOfLanes(D d, const VFromD<D> v) { + const auto v0 = Zero(ScalableTag<TFromD<D>>()); // always m1 + return detail::RedSum(d, v, v0); +} + +template <class D> +HWY_API TFromD<D> ReduceSum(D d, const VFromD<D> v) { + return GetLane(SumOfLanes(d, v)); +} + +// ------------------------------ MinOfLanes +namespace detail { +HWY_RVV_FOREACH_U(HWY_RVV_REDUCE, RedMin, redminu, _ALL) +HWY_RVV_FOREACH_I(HWY_RVV_REDUCE, RedMin, redmin, _ALL) +HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedMin, fredmin, _ALL) +} // namespace detail + +template <class D> +HWY_API VFromD<D> MinOfLanes(D d, const VFromD<D> v) { + using T = TFromD<D>; + const ScalableTag<T> d1; // always m1 + const auto neutral = Set(d1, HighestValue<T>()); + return detail::RedMin(d, v, neutral); +} + +// ------------------------------ MaxOfLanes +namespace detail { +HWY_RVV_FOREACH_U(HWY_RVV_REDUCE, RedMax, redmaxu, _ALL) +HWY_RVV_FOREACH_I(HWY_RVV_REDUCE, RedMax, redmax, _ALL) +HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedMax, fredmax, _ALL) +} // namespace detail + +template <class D> +HWY_API VFromD<D> MaxOfLanes(D d, const VFromD<D> v) { + using T = TFromD<D>; + const ScalableTag<T> d1; // always m1 + const auto neutral = Set(d1, LowestValue<T>()); + return detail::RedMax(d, v, neutral); +} + +#undef HWY_RVV_REDUCE + +// ================================================== Ops with dependencies + +// ------------------------------ LoadInterleaved2 + +// Per-target flag to prevent generic_ops-inl.h from defining LoadInterleaved2. +#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED +#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED +#else +#define HWY_NATIVE_LOAD_STORE_INTERLEAVED +#endif + +// Our current implementation uses the old-style vector args for segments. 
+// Clang will soon implement the tuple form, but GCC only from version 14, +// before which we emulate them in generic_ops-inl.h. +#if HWY_HAVE_TUPLE + +#define HWY_RVV_LOAD2(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ + MLEN, NAME, OP) \ + template <size_t N> \ + HWY_API void NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, \ + const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned, \ + HWY_RVV_V(BASE, SEW, LMUL) & v0, \ + HWY_RVV_V(BASE, SEW, LMUL) & v1) { \ + __riscv_v##OP##e##SEW##_v_##CHAR##SEW##LMUL(&v0, &v1, unaligned, \ + Lanes(d)); \ + } +// Segments are limited to 8 registers, so we can only go up to LMUL=2. +HWY_RVV_FOREACH(HWY_RVV_LOAD2, LoadInterleaved2, lseg2, _LE2_VIRT) +#undef HWY_RVV_LOAD2 + +// ------------------------------ LoadInterleaved3 + +#define HWY_RVV_LOAD3(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ + MLEN, NAME, OP) \ + template <size_t N> \ + HWY_API void NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, \ + const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned, \ + HWY_RVV_V(BASE, SEW, LMUL) & v0, \ + HWY_RVV_V(BASE, SEW, LMUL) & v1, \ + HWY_RVV_V(BASE, SEW, LMUL) & v2) { \ + __riscv_v##OP##e##SEW##_v_##CHAR##SEW##LMUL(&v0, &v1, &v2, unaligned, \ + Lanes(d)); \ + } +// Segments are limited to 8 registers, so we can only go up to LMUL=2. +HWY_RVV_FOREACH(HWY_RVV_LOAD3, LoadInterleaved3, lseg3, _LE2_VIRT) +#undef HWY_RVV_LOAD3 + +// ------------------------------ LoadInterleaved4 + +#define HWY_RVV_LOAD4(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ + MLEN, NAME, OP) \ + template <size_t N> \ + HWY_API void NAME( \ + HWY_RVV_D(BASE, SEW, N, SHIFT) d, \ + const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT aligned, \ + HWY_RVV_V(BASE, SEW, LMUL) & v0, HWY_RVV_V(BASE, SEW, LMUL) & v1, \ + HWY_RVV_V(BASE, SEW, LMUL) & v2, HWY_RVV_V(BASE, SEW, LMUL) & v3) { \ + __riscv_v##OP##e##SEW##_v_##CHAR##SEW##LMUL(&v0, &v1, &v2, &v3, aligned, \ + Lanes(d)); \ + } +// Segments are limited to 8 registers, so we can only go up to LMUL=2. 
+HWY_RVV_FOREACH(HWY_RVV_LOAD4, LoadInterleaved4, lseg4, _LE2_VIRT) +#undef HWY_RVV_LOAD4 + +// ------------------------------ StoreInterleaved2 + +#define HWY_RVV_STORE2(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ + MLEN, NAME, OP) \ + template <size_t N> \ + HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v0, \ + HWY_RVV_V(BASE, SEW, LMUL) v1, \ + HWY_RVV_D(BASE, SEW, N, SHIFT) d, \ + HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned) { \ + __riscv_v##OP##e##SEW##_v_##CHAR##SEW##LMUL(unaligned, v0, v1, Lanes(d)); \ + } +// Segments are limited to 8 registers, so we can only go up to LMUL=2. +HWY_RVV_FOREACH(HWY_RVV_STORE2, StoreInterleaved2, sseg2, _LE2_VIRT) +#undef HWY_RVV_STORE2 + +// ------------------------------ StoreInterleaved3 + +#define HWY_RVV_STORE3(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ + MLEN, NAME, OP) \ + template <size_t N> \ + HWY_API void NAME( \ + HWY_RVV_V(BASE, SEW, LMUL) v0, HWY_RVV_V(BASE, SEW, LMUL) v1, \ + HWY_RVV_V(BASE, SEW, LMUL) v2, HWY_RVV_D(BASE, SEW, N, SHIFT) d, \ + HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned) { \ + __riscv_v##OP##e##SEW##_v_##CHAR##SEW##LMUL(unaligned, v0, v1, v2, \ + Lanes(d)); \ + } +// Segments are limited to 8 registers, so we can only go up to LMUL=2. +HWY_RVV_FOREACH(HWY_RVV_STORE3, StoreInterleaved3, sseg3, _LE2_VIRT) +#undef HWY_RVV_STORE3 + +// ------------------------------ StoreInterleaved4 + +#define HWY_RVV_STORE4(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ + MLEN, NAME, OP) \ + template <size_t N> \ + HWY_API void NAME( \ + HWY_RVV_V(BASE, SEW, LMUL) v0, HWY_RVV_V(BASE, SEW, LMUL) v1, \ + HWY_RVV_V(BASE, SEW, LMUL) v2, HWY_RVV_V(BASE, SEW, LMUL) v3, \ + HWY_RVV_D(BASE, SEW, N, SHIFT) d, \ + HWY_RVV_T(BASE, SEW) * HWY_RESTRICT aligned) { \ + __riscv_v##OP##e##SEW##_v_##CHAR##SEW##LMUL(aligned, v0, v1, v2, v3, \ + Lanes(d)); \ + } +// Segments are limited to 8 registers, so we can only go up to LMUL=2. 
+HWY_RVV_FOREACH(HWY_RVV_STORE4, StoreInterleaved4, sseg4, _LE2_VIRT) +#undef HWY_RVV_STORE4 + +#else // !HWY_HAVE_TUPLE + +template <class D, typename T = TFromD<D>> +HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned, + VFromD<D>& v0, VFromD<D>& v1) { + const VFromD<D> A = LoadU(d, unaligned); // v1[1] v0[1] v1[0] v0[0] + const VFromD<D> B = LoadU(d, unaligned + Lanes(d)); + v0 = ConcatEven(d, B, A); + v1 = ConcatOdd(d, B, A); +} + +namespace detail { +#define HWY_RVV_LOAD_STRIDED(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ + SHIFT, MLEN, NAME, OP) \ + template <size_t N> \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, \ + const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p, size_t stride) { \ + return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL(p, stride, Lanes(d)); \ + } +HWY_RVV_FOREACH(HWY_RVV_LOAD_STRIDED, LoadStrided, lse, _ALL_VIRT) +#undef HWY_RVV_LOAD_STRIDED +} // namespace detail + +template <class D, typename T = TFromD<D>> +HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned, + VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) { + // Offsets are bytes, and this is not documented. + v0 = detail::LoadStrided(d, unaligned + 0, 3 * sizeof(T)); + v1 = detail::LoadStrided(d, unaligned + 1, 3 * sizeof(T)); + v2 = detail::LoadStrided(d, unaligned + 2, 3 * sizeof(T)); +} + +template <class D, typename T = TFromD<D>> +HWY_API void LoadInterleaved4(D d, const TFromD<D>* HWY_RESTRICT unaligned, + VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2, + VFromD<D>& v3) { + // Offsets are bytes, and this is not documented. + v0 = detail::LoadStrided(d, unaligned + 0, 4 * sizeof(T)); + v1 = detail::LoadStrided(d, unaligned + 1, 4 * sizeof(T)); + v2 = detail::LoadStrided(d, unaligned + 2, 4 * sizeof(T)); + v3 = detail::LoadStrided(d, unaligned + 3, 4 * sizeof(T)); +} + +// Not 64-bit / max LMUL: interleave via promote, slide, OddEven. 
+template <class D, typename T = TFromD<D>, HWY_IF_NOT_T_SIZE_D(D, 8), + HWY_IF_POW2_LE_D(D, 2)> +HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d, + T* HWY_RESTRICT unaligned) { + const RebindToUnsigned<D> du; + const Twice<RepartitionToWide<decltype(du)>> duw; + const Twice<decltype(d)> dt; + // Interleave with zero by promoting to wider (unsigned) type. + const VFromD<decltype(dt)> w0 = BitCast(dt, PromoteTo(duw, BitCast(du, v0))); + const VFromD<decltype(dt)> w1 = BitCast(dt, PromoteTo(duw, BitCast(du, v1))); + // OR second vector into the zero-valued lanes (faster than OddEven). + StoreU(Or(w0, detail::Slide1Up(w1)), dt, unaligned); +} + +// Can promote, max LMUL: two half-length +template <class D, typename T = TFromD<D>, HWY_IF_NOT_T_SIZE_D(D, 8), + HWY_IF_POW2_GT_D(D, 2)> +HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d, + T* HWY_RESTRICT unaligned) { + const Half<decltype(d)> dh; + StoreInterleaved2(LowerHalf(dh, v0), LowerHalf(dh, v1), d, unaligned); + StoreInterleaved2(UpperHalf(dh, v0), UpperHalf(dh, v1), d, + unaligned + Lanes(d)); +} + +namespace detail { +#define HWY_RVV_STORE_STRIDED(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ + SHIFT, MLEN, NAME, OP) \ + template <size_t N> \ + HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, \ + HWY_RVV_D(BASE, SEW, N, SHIFT) d, \ + HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p, size_t stride) { \ + return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL(p, stride, v, Lanes(d)); \ + } +HWY_RVV_FOREACH(HWY_RVV_STORE_STRIDED, StoreStrided, sse, _ALL_VIRT) +#undef HWY_RVV_STORE_STRIDED +} // namespace detail + +// 64-bit: strided +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE_D(D, 8)> +HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d, + T* HWY_RESTRICT unaligned) { + // Offsets are bytes, and this is not documented. 
+ detail::StoreStrided(v0, d, unaligned + 0, 2 * sizeof(T)); + detail::StoreStrided(v1, d, unaligned + 1, 2 * sizeof(T)); +} + +template <class D, typename T = TFromD<D>> +HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d, + T* HWY_RESTRICT unaligned) { + // Offsets are bytes, and this is not documented. + detail::StoreStrided(v0, d, unaligned + 0, 3 * sizeof(T)); + detail::StoreStrided(v1, d, unaligned + 1, 3 * sizeof(T)); + detail::StoreStrided(v2, d, unaligned + 2, 3 * sizeof(T)); +} + +template <class D, typename T = TFromD<D>> +HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, + VFromD<D> v3, D d, T* HWY_RESTRICT unaligned) { + // Offsets are bytes, and this is not documented. + detail::StoreStrided(v0, d, unaligned + 0, 4 * sizeof(T)); + detail::StoreStrided(v1, d, unaligned + 1, 4 * sizeof(T)); + detail::StoreStrided(v2, d, unaligned + 2, 4 * sizeof(T)); + detail::StoreStrided(v3, d, unaligned + 3, 4 * sizeof(T)); +} + +#endif // HWY_HAVE_TUPLE + +// ------------------------------ ResizeBitCast + +template <class D, class FromV> +HWY_API VFromD<D> ResizeBitCast(D /*d*/, FromV v) { + const DFromV<decltype(v)> d_from; + const Repartition<uint8_t, decltype(d_from)> du8_from; + const DFromV<VFromD<D>> d_to; + const Repartition<uint8_t, decltype(d_to)> du8_to; + return BitCast(d_to, detail::ChangeLMUL(du8_to, BitCast(du8_from, v))); +} + +// ------------------------------ PopulationCount (ShiftRight) + +// Handles LMUL < 2 or capped vectors, which generic_ops-inl cannot. 
+template <typename V, class D = DFromV<V>, HWY_IF_U8_D(D), + hwy::EnableIf<D().Pow2() < 1 || D().MaxLanes() < 16>* = nullptr> +HWY_API V PopulationCount(V v) { + // See https://arxiv.org/pdf/1611.07612.pdf, Figure 3 + v = Sub(v, detail::AndS(ShiftRight<1>(v), 0x55)); + v = Add(detail::AndS(ShiftRight<2>(v), 0x33), detail::AndS(v, 0x33)); + return detail::AndS(Add(v, ShiftRight<4>(v)), 0x0F); +} + +// ------------------------------ LoadDup128 + +template <class D> +HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* const HWY_RESTRICT p) { + const RebindToUnsigned<decltype(d)> du; + + // Make sure that no more than 16 bytes are loaded from p + constexpr int kLoadPow2 = d.Pow2(); + constexpr size_t kMaxLanesToLoad = + HWY_MIN(HWY_MAX_LANES_D(D), 16 / sizeof(TFromD<D>)); + constexpr size_t kLoadN = D::template NewN<kLoadPow2, kMaxLanesToLoad>(); + const Simd<TFromD<D>, kLoadN, kLoadPow2> d_load; + static_assert(d_load.MaxBytes() <= 16, + "d_load.MaxBytes() <= 16 must be true"); + static_assert((d.MaxBytes() < 16) || (d_load.MaxBytes() == 16), + "d_load.MaxBytes() == 16 must be true if d.MaxBytes() >= 16 is " + "true"); + static_assert((d.MaxBytes() >= 16) || (d_load.MaxBytes() == d.MaxBytes()), + "d_load.MaxBytes() == d.MaxBytes() must be true if " + "d.MaxBytes() < 16 is true"); + + const VFromD<D> loaded = Load(d_load, p); + if (d.MaxBytes() <= 16) return loaded; + + // idx must be unsigned for TableLookupLanes. + using TU = TFromD<decltype(du)>; + const TU mask = static_cast<TU>(detail::LanesPerBlock(d) - 1); + // Broadcast the first block. + const VFromD<RebindToUnsigned<D>> idx = detail::AndS(detail::Iota0(du), mask); + return TableLookupLanes(loaded, idx); +} + +// ------------------------------ LoadMaskBits + +// Support all combinations of T and SHIFT(LMUL) without explicit overloads for +// each. First overload for MLEN=1..64. +namespace detail { + +// Maps D to MLEN (wrapped in SizeTag), such that #mask_bits = VLEN/MLEN. 
MLEN +// increases with lane size and decreases for increasing LMUL. Cap at 64, the +// largest supported by HWY_RVV_FOREACH_B (and intrinsics), for virtual LMUL +// e.g. vuint16mf8_t: (8*2 << 3) == 128. +template <class D> +using MaskTag = hwy::SizeTag<HWY_MIN( + 64, detail::ScaleByPower(8 * sizeof(TFromD<D>), -D().Pow2()))>; + +#define HWY_RVV_LOAD_MASK_BITS(SEW, SHIFT, MLEN, NAME, OP) \ + HWY_INLINE HWY_RVV_M(MLEN) \ + NAME(hwy::SizeTag<MLEN> /* tag */, const uint8_t* bits, size_t N) { \ + return __riscv_v##OP##_v_b##MLEN(bits, N); \ + } +HWY_RVV_FOREACH_B(HWY_RVV_LOAD_MASK_BITS, LoadMaskBits, lm) +#undef HWY_RVV_LOAD_MASK_BITS +} // namespace detail + +template <class D, class MT = detail::MaskTag<D>> +HWY_API auto LoadMaskBits(D d, const uint8_t* bits) + -> decltype(detail::LoadMaskBits(MT(), bits, Lanes(d))) { + return detail::LoadMaskBits(MT(), bits, Lanes(d)); +} + +// ------------------------------ StoreMaskBits +#define HWY_RVV_STORE_MASK_BITS(SEW, SHIFT, MLEN, NAME, OP) \ + template <class D> \ + HWY_API size_t NAME(D d, HWY_RVV_M(MLEN) m, uint8_t* bits) { \ + const size_t N = Lanes(d); \ + __riscv_v##OP##_v_b##MLEN(bits, m, N); \ + /* Non-full byte, need to clear the undefined upper bits. */ \ + /* Use MaxLanes and sizeof(T) to move some checks to compile-time. 
*/ \ + constexpr bool kLessThan8 = \ + detail::ScaleByPower(16 / sizeof(TFromD<D>), d.Pow2()) < 8; \ + if (MaxLanes(d) < 8 || (kLessThan8 && N < 8)) { \ + const int mask = (1 << N) - 1; \ + bits[0] = static_cast<uint8_t>(bits[0] & mask); \ + } \ + return (N + 7) / 8; \ + } +HWY_RVV_FOREACH_B(HWY_RVV_STORE_MASK_BITS, StoreMaskBits, sm) +#undef HWY_RVV_STORE_MASK_BITS + +// ------------------------------ CompressBits, CompressBitsStore (LoadMaskBits) + +template <class V> +HWY_INLINE V CompressBits(V v, const uint8_t* HWY_RESTRICT bits) { + return Compress(v, LoadMaskBits(DFromV<V>(), bits)); +} + +template <class D> +HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits, + D d, TFromD<D>* HWY_RESTRICT unaligned) { + return CompressStore(v, LoadMaskBits(d, bits), d, unaligned); +} + +// ------------------------------ FirstN (Iota0, Lt, RebindMask, SlideUp) + +// Disallow for 8-bit because Iota is likely to overflow. +template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)> +HWY_API MFromD<D> FirstN(const D d, const size_t n) { + const RebindToUnsigned<D> du; + using TU = TFromD<decltype(du)>; + return RebindMask(d, detail::LtS(detail::Iota0(du), static_cast<TU>(n))); +} + +template <class D, HWY_IF_T_SIZE_D(D, 1)> +HWY_API MFromD<D> FirstN(const D d, const size_t n) { + const auto zero = Zero(d); + const auto one = Set(d, 1); + return Eq(detail::SlideUp(one, zero, n), one); +} + +// ------------------------------ Neg (Sub) + +template <class V, HWY_IF_SIGNED_V(V)> +HWY_API V Neg(const V v) { + return detail::ReverseSubS(v, 0); +} + +// vector = f(vector), but argument is repeated +#define HWY_RVV_RETV_ARGV2(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ + SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \ + return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(v, v, \ + HWY_RVV_AVL(SEW, SHIFT)); \ + } + +HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV2, Neg, fsgnjn, _ALL) + +// ------------------------------ Abs (Max, 
Neg) + +template <class V, HWY_IF_SIGNED_V(V)> +HWY_API V Abs(const V v) { + return Max(v, Neg(v)); +} + +HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV2, Abs, fsgnjx, _ALL) + +#undef HWY_RVV_RETV_ARGV2 + +// ------------------------------ AbsDiff (Abs, Sub) +template <class V, HWY_IF_FLOAT_V(V)> +HWY_API V AbsDiff(const V a, const V b) { + return Abs(Sub(a, b)); +} + +// ------------------------------ Round (NearestInt, ConvertTo, CopySign) + +// IEEE-754 roundToIntegralTiesToEven returns floating-point, but we do not have +// a dedicated instruction for that. Rounding to integer and converting back to +// float is correct except when the input magnitude is large, in which case the +// input was already an integer (because mantissa >> exponent is zero). + +namespace detail { +enum RoundingModes { kNear, kTrunc, kDown, kUp }; + +template <class V> +HWY_INLINE auto UseInt(const V v) -> decltype(MaskFromVec(v)) { + return detail::LtS(Abs(v), MantissaEnd<TFromV<V>>()); +} + +} // namespace detail + +template <class V> +HWY_API V Round(const V v) { + const DFromV<V> df; + + const auto integer = NearestInt(v); // round using current mode + const auto int_f = ConvertTo(df, integer); + + return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v); +} + +// ------------------------------ Trunc (ConvertTo) +template <class V> +HWY_API V Trunc(const V v) { + const DFromV<V> df; + const RebindToSigned<decltype(df)> di; + + const auto integer = ConvertTo(di, v); // round toward 0 + const auto int_f = ConvertTo(df, integer); + + return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v); +} + +// ------------------------------ Ceil +template <class V> +HWY_API V Ceil(const V v) { + asm volatile("fsrm %0" ::"r"(detail::kUp)); + const auto ret = Round(v); + asm volatile("fsrm %0" ::"r"(detail::kNear)); + return ret; +} + +// ------------------------------ Floor +template <class V> +HWY_API V Floor(const V v) { + asm volatile("fsrm %0" ::"r"(detail::kDown)); + const auto ret = Round(v); + 
asm volatile("fsrm %0" ::"r"(detail::kNear)); + return ret; +} + +// ------------------------------ Floating-point classification (Ne) + +// vfclass does not help because it would require 3 instructions (to AND and +// then compare the bits), whereas these are just 1-3 integer instructions. + +template <class V> +HWY_API MFromD<DFromV<V>> IsNaN(const V v) { + return Ne(v, v); +} + +template <class V, class D = DFromV<V>> +HWY_API MFromD<D> IsInf(const V v) { + const D d; + const RebindToSigned<decltype(d)> di; + using T = TFromD<D>; + const VFromD<decltype(di)> vi = BitCast(di, v); + // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0. + return RebindMask(d, detail::EqS(Add(vi, vi), hwy::MaxExponentTimes2<T>())); +} + +// Returns whether normal/subnormal/zero. +template <class V, class D = DFromV<V>> +HWY_API MFromD<D> IsFinite(const V v) { + const D d; + const RebindToUnsigned<decltype(d)> du; + const RebindToSigned<decltype(d)> di; // cheaper than unsigned comparison + using T = TFromD<D>; + const VFromD<decltype(du)> vu = BitCast(du, v); + // 'Shift left' to clear the sign bit, then right so we can compare with the + // max exponent (cannot compare with MaxExponentTimes2 directly because it is + // negative and non-negative floats would be greater). 
+ const VFromD<decltype(di)> exp = + BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu))); + return RebindMask(d, detail::LtS(exp, hwy::MaxExponentField<T>())); +} + +// ------------------------------ Iota (ConvertTo) + +template <class D, HWY_IF_UNSIGNED_D(D)> +HWY_API VFromD<D> Iota(const D d, TFromD<D> first) { + return detail::AddS(detail::Iota0(d), first); +} + +template <class D, HWY_IF_SIGNED_D(D)> +HWY_API VFromD<D> Iota(const D d, TFromD<D> first) { + const RebindToUnsigned<D> du; + return detail::AddS(BitCast(d, detail::Iota0(du)), first); +} + +template <class D, HWY_IF_FLOAT_D(D)> +HWY_API VFromD<D> Iota(const D d, TFromD<D> first) { + const RebindToUnsigned<D> du; + const RebindToSigned<D> di; + return detail::AddS(ConvertTo(d, BitCast(di, detail::Iota0(du))), first); +} + +// ------------------------------ MulEven/Odd (Mul, OddEven) + +template <class V, HWY_IF_T_SIZE_V(V, 4), class D = DFromV<V>, + class DW = RepartitionToWide<D>> +HWY_API VFromD<DW> MulEven(const V a, const V b) { + const auto lo = Mul(a, b); + const auto hi = detail::MulHigh(a, b); + return BitCast(DW(), OddEven(detail::Slide1Up(hi), lo)); +} + +// There is no 64x64 vwmul. 
+template <class V, HWY_IF_T_SIZE_V(V, 8)> +HWY_INLINE V MulEven(const V a, const V b) { + const auto lo = Mul(a, b); + const auto hi = detail::MulHigh(a, b); + return OddEven(detail::Slide1Up(hi), lo); +} + +template <class V, HWY_IF_T_SIZE_V(V, 8)> +HWY_INLINE V MulOdd(const V a, const V b) { + const auto lo = Mul(a, b); + const auto hi = detail::MulHigh(a, b); + return OddEven(hi, detail::Slide1Down(lo)); +} + +// ------------------------------ ReorderDemote2To (OddEven, Combine) + +template <size_t N, int kPow2> +HWY_API VFromD<Simd<uint16_t, N, kPow2>> ReorderDemote2To( + Simd<bfloat16_t, N, kPow2> dbf16, + VFromD<RepartitionToWide<decltype(dbf16)>> a, + VFromD<RepartitionToWide<decltype(dbf16)>> b) { + const RebindToUnsigned<decltype(dbf16)> du16; + const RebindToUnsigned<DFromV<decltype(a)>> du32; + const VFromD<decltype(du32)> b_in_even = ShiftRight<16>(BitCast(du32, b)); + return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even))); +} + +// If LMUL is not the max, Combine first to avoid another DemoteTo. +template <class DN, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<DN>), + HWY_IF_POW2_LE_D(DN, 2), class V, HWY_IF_SIGNED_V(V), + HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2), + class V2 = VFromD<Repartition<TFromV<V>, DN>>, + hwy::EnableIf<DFromV<V>().Pow2() == DFromV<V2>().Pow2()>* = nullptr> +HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) { + const Rebind<TFromV<V>, DN> dt; + const VFromD<decltype(dt)> ab = Combine(dt, b, a); + return DemoteTo(dn, ab); +} + +template <class DN, HWY_IF_UNSIGNED_D(DN), HWY_IF_POW2_LE_D(DN, 2), class V, + HWY_IF_UNSIGNED_V(V), HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2), + class V2 = VFromD<Repartition<TFromV<V>, DN>>, + hwy::EnableIf<DFromV<V>().Pow2() == DFromV<V2>().Pow2()>* = nullptr> +HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) { + const Rebind<TFromV<V>, DN> dt; + const VFromD<decltype(dt)> ab = Combine(dt, b, a); + return DemoteTo(dn, ab); +} + +// Max LMUL: must DemoteTo first, then Combine. 
+template <class DN, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<DN>), + HWY_IF_POW2_GT_D(DN, 2), class V, HWY_IF_SIGNED_V(V), + HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2), + class V2 = VFromD<Repartition<TFromV<V>, DN>>, + hwy::EnableIf<DFromV<V>().Pow2() == DFromV<V2>().Pow2()>* = nullptr> +HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) { + const Half<decltype(dn)> dnh; + const VFromD<decltype(dnh)> demoted_a = DemoteTo(dnh, a); + const VFromD<decltype(dnh)> demoted_b = DemoteTo(dnh, b); + return Combine(dn, demoted_b, demoted_a); +} + +template <class DN, HWY_IF_UNSIGNED_D(DN), HWY_IF_POW2_GT_D(DN, 2), class V, + HWY_IF_UNSIGNED_V(V), HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2), + class V2 = VFromD<Repartition<TFromV<V>, DN>>, + hwy::EnableIf<DFromV<V>().Pow2() == DFromV<V2>().Pow2()>* = nullptr> +HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) { + const Half<decltype(dn)> dnh; + const VFromD<decltype(dnh)> demoted_a = DemoteTo(dnh, a); + const VFromD<decltype(dnh)> demoted_b = DemoteTo(dnh, b); + return Combine(dn, demoted_b, demoted_a); +} + +// If LMUL is not the max, Combine first to avoid another DemoteTo. +template <class DN, HWY_IF_BF16_D(DN), HWY_IF_POW2_LE_D(DN, 2), class V, + HWY_IF_F32_D(DFromV<V>), + class V2 = VFromD<Repartition<TFromV<V>, DN>>, + hwy::EnableIf<DFromV<V>().Pow2() == DFromV<V2>().Pow2()>* = nullptr> +HWY_API VFromD<DN> OrderedDemote2To(DN dn, V a, V b) { + const Rebind<TFromV<V>, DN> dt; + const VFromD<decltype(dt)> ab = Combine(dt, b, a); + return DemoteTo(dn, ab); +} + +// Max LMUL: must DemoteTo first, then Combine. 
+template <class DN, HWY_IF_BF16_D(DN), HWY_IF_POW2_GT_D(DN, 2), class V, + HWY_IF_F32_D(DFromV<V>), + class V2 = VFromD<Repartition<TFromV<V>, DN>>, + hwy::EnableIf<DFromV<V>().Pow2() == DFromV<V2>().Pow2()>* = nullptr> +HWY_API VFromD<DN> OrderedDemote2To(DN dn, V a, V b) { + const Half<decltype(dn)> dnh; + const RebindToUnsigned<decltype(dn)> dn_u; + const RebindToUnsigned<decltype(dnh)> dnh_u; + const auto demoted_a = BitCast(dnh_u, DemoteTo(dnh, a)); + const auto demoted_b = BitCast(dnh_u, DemoteTo(dnh, b)); + return BitCast(dn, Combine(dn_u, demoted_b, demoted_a)); +} + +template <class DN, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<DN>), class V, + HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), + HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2), + class V2 = VFromD<Repartition<TFromV<V>, DN>>, + hwy::EnableIf<DFromV<V>().Pow2() == DFromV<V2>().Pow2()>* = nullptr> +HWY_API VFromD<DN> OrderedDemote2To(DN dn, V a, V b) { + return ReorderDemote2To(dn, a, b); +} + +// ------------------------------ WidenMulPairwiseAdd + +template <class D32, HWY_IF_F32_D(D32), + class V16 = VFromD<Repartition<bfloat16_t, D32>>> +HWY_API VFromD<D32> WidenMulPairwiseAdd(D32 df32, V16 a, V16 b) { + const RebindToUnsigned<decltype(df32)> du32; + using VU32 = VFromD<decltype(du32)>; + const VU32 odd = Set(du32, 0xFFFF0000u); // bfloat16 is the upper half of f32 + // Using shift/and instead of Zip leads to the odd/even order that + // RearrangeToOddPlusEven prefers. + const VU32 ae = ShiftLeft<16>(BitCast(du32, a)); + const VU32 ao = And(BitCast(du32, a), odd); + const VU32 be = ShiftLeft<16>(BitCast(du32, b)); + const VU32 bo = And(BitCast(du32, b), odd); + return MulAdd(BitCast(df32, ae), BitCast(df32, be), + Mul(BitCast(df32, ao), BitCast(df32, bo))); +} + +template <class D, HWY_IF_I32_D(D), class VI16> +HWY_API VFromD<D> WidenMulPairwiseAdd(D d32, VI16 a, VI16 b) { + using VI32 = VFromD<decltype(d32)>; + // Manual sign extension requires two shifts for even lanes. 
+ const VI32 ae = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, a))); + const VI32 be = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, b))); + const VI32 ao = ShiftRight<16>(BitCast(d32, a)); + const VI32 bo = ShiftRight<16>(BitCast(d32, b)); + return Add(Mul(ae, be), Mul(ao, bo)); +} + +// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower) + +namespace detail { + +// Non-overloaded wrapper function so we can define DF32 in template args. +template < + size_t N, int kPow2, class DF32 = Simd<float, N, kPow2>, + class VF32 = VFromD<DF32>, + class DU16 = RepartitionToNarrow<RebindToUnsigned<Simd<float, N, kPow2>>>> +HWY_API VF32 ReorderWidenMulAccumulateBF16(Simd<float, N, kPow2> df32, + VFromD<DU16> a, VFromD<DU16> b, + const VF32 sum0, VF32& sum1) { + const RebindToUnsigned<DF32> du32; + using VU32 = VFromD<decltype(du32)>; + const VU32 odd = Set(du32, 0xFFFF0000u); // bfloat16 is the upper half of f32 + // Using shift/and instead of Zip leads to the odd/even order that + // RearrangeToOddPlusEven prefers. + const VU32 ae = ShiftLeft<16>(BitCast(du32, a)); + const VU32 ao = And(BitCast(du32, a), odd); + const VU32 be = ShiftLeft<16>(BitCast(du32, b)); + const VU32 bo = And(BitCast(du32, b), odd); + sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1); + return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0); +} + +#define HWY_RVV_WIDEN_MACC(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ + SHIFT, MLEN, NAME, OP) \ + template <size_t N> \ + HWY_API HWY_RVV_V(BASE, SEWD, LMULD) NAME( \ + HWY_RVV_D(BASE, SEWD, N, SHIFT + 1) d, HWY_RVV_V(BASE, SEWD, LMULD) sum, \ + HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \ + return __riscv_v##OP##CHAR##SEWD##LMULD(sum, a, b, Lanes(d)); \ + } + +HWY_RVV_FOREACH_I16(HWY_RVV_WIDEN_MACC, WidenMulAcc, wmacc_vv_, _EXT_VIRT) +#undef HWY_RVV_WIDEN_MACC + +// If LMUL is not the max, we can WidenMul first (3 instructions). 
+template <class D32, HWY_IF_POW2_LE_D(D32, 2), class V32 = VFromD<D32>, + class D16 = RepartitionToNarrow<D32>> +HWY_API VFromD<D32> ReorderWidenMulAccumulateI16(D32 d32, VFromD<D16> a, + VFromD<D16> b, const V32 sum0, + V32& sum1) { + const Twice<decltype(d32)> d32t; + using V32T = VFromD<decltype(d32t)>; + V32T sum = Combine(d32t, sum1, sum0); + sum = detail::WidenMulAcc(d32t, sum, a, b); + sum1 = UpperHalf(d32, sum); + return LowerHalf(d32, sum); +} + +// Max LMUL: must LowerHalf first (4 instructions). +template <class D32, HWY_IF_POW2_GT_D(D32, 2), class V32 = VFromD<D32>, + class D16 = RepartitionToNarrow<D32>> +HWY_API VFromD<D32> ReorderWidenMulAccumulateI16(D32 d32, VFromD<D16> a, + VFromD<D16> b, const V32 sum0, + V32& sum1) { + const Half<D16> d16h; + using V16H = VFromD<decltype(d16h)>; + const V16H a0 = LowerHalf(d16h, a); + const V16H a1 = UpperHalf(d16h, a); + const V16H b0 = LowerHalf(d16h, b); + const V16H b1 = UpperHalf(d16h, b); + sum1 = detail::WidenMulAcc(d32, sum1, a1, b1); + return detail::WidenMulAcc(d32, sum0, a0, b0); +} + +} // namespace detail + +template <size_t N, int kPow2, class VN, class VW> +HWY_API VW ReorderWidenMulAccumulate(Simd<float, N, kPow2> d32, VN a, VN b, + const VW sum0, VW& sum1) { + return detail::ReorderWidenMulAccumulateBF16(d32, a, b, sum0, sum1); +} + +template <size_t N, int kPow2, class VN, class VW> +HWY_API VW ReorderWidenMulAccumulate(Simd<int32_t, N, kPow2> d32, VN a, VN b, + const VW sum0, VW& sum1) { + return detail::ReorderWidenMulAccumulateI16(d32, a, b, sum0, sum1); +} + +// ------------------------------ RearrangeToOddPlusEven + +template <class VW, HWY_IF_SIGNED_V(VW)> // vint32_t* +HWY_API VW RearrangeToOddPlusEven(const VW sum0, const VW sum1) { + // vwmacc doubles LMUL, so we require a pairwise sum here. This op is + // expected to be less frequent than ReorderWidenMulAccumulate, hence it's + // preferable to do the extra work here rather than do manual odd/even + // extraction there. 
+ const DFromV<VW> di32; + const RebindToUnsigned<decltype(di32)> du32; + const Twice<decltype(di32)> di32x2; + const RepartitionToWide<decltype(di32x2)> di64x2; + const RebindToUnsigned<decltype(di64x2)> du64x2; + const auto combined = BitCast(di64x2, Combine(di32x2, sum1, sum0)); + // Isolate odd/even int32 in int64 lanes. + const auto even = ShiftRight<32>(ShiftLeft<32>(combined)); // sign extend + const auto odd = ShiftRight<32>(combined); + return BitCast(di32, TruncateTo(du32, BitCast(du64x2, Add(even, odd)))); +} + +// For max LMUL, we cannot Combine again and instead manually unroll. +HWY_API vint32m8_t RearrangeToOddPlusEven(vint32m8_t sum0, vint32m8_t sum1) { + const DFromV<vint32m8_t> d; + const Half<decltype(d)> dh; + const vint32m4_t lo = + RearrangeToOddPlusEven(LowerHalf(sum0), UpperHalf(dh, sum0)); + const vint32m4_t hi = + RearrangeToOddPlusEven(LowerHalf(sum1), UpperHalf(dh, sum1)); + return Combine(d, hi, lo); +} + +template <class VW, HWY_IF_FLOAT_V(VW)> // vfloat* +HWY_API VW RearrangeToOddPlusEven(const VW sum0, const VW sum1) { + return Add(sum0, sum1); // invariant already holds +} + +// ------------------------------ Lt128 +template <class D> +HWY_INLINE MFromD<D> Lt128(D d, const VFromD<D> a, const VFromD<D> b) { + static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64"); + // Truth table of Eq and Compare for Hi and Lo u64. + // (removed lines with (=H && cH) or (=L && cL) - cannot both be true) + // =H =L cH cL | out = cH | (=H & cL) + // 0 0 0 0 | 0 + // 0 0 0 1 | 0 + // 0 0 1 0 | 1 + // 0 0 1 1 | 1 + // 0 1 0 0 | 0 + // 0 1 0 1 | 0 + // 0 1 1 0 | 1 + // 1 0 0 0 | 0 + // 1 0 0 1 | 1 + // 1 1 0 0 | 0 + const VFromD<D> eqHL = VecFromMask(d, Eq(a, b)); + const VFromD<D> ltHL = VecFromMask(d, Lt(a, b)); + // Shift leftward so L can influence H. + const VFromD<D> ltLx = detail::Slide1Up(ltHL); + const VFromD<D> vecHx = OrAnd(ltHL, eqHL, ltLx); + // Replicate H to its neighbor. 
+ return MaskFromVec(OddEven(vecHx, detail::Slide1Down(vecHx))); +} + +// ------------------------------ Lt128Upper +template <class D> +HWY_INLINE MFromD<D> Lt128Upper(D d, const VFromD<D> a, const VFromD<D> b) { + static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64"); + const VFromD<D> ltHL = VecFromMask(d, Lt(a, b)); + const VFromD<D> down = detail::Slide1Down(ltHL); + // b(267743505): Clang compiler bug, workaround is DoNotOptimize + asm volatile("" : : "r,m"(GetLane(down)) : "memory"); + // Replicate H to its neighbor. + return MaskFromVec(OddEven(ltHL, down)); +} + +// ------------------------------ Eq128 +template <class D> +HWY_INLINE MFromD<D> Eq128(D d, const VFromD<D> a, const VFromD<D> b) { + static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64"); + const VFromD<D> eqHL = VecFromMask(d, Eq(a, b)); + const VFromD<D> eqLH = Reverse2(d, eqHL); + const VFromD<D> eq = And(eqHL, eqLH); + // b(267743505): Clang compiler bug, workaround is DoNotOptimize + asm volatile("" : : "r,m"(GetLane(eq)) : "memory"); + return MaskFromVec(eq); +} + +// ------------------------------ Eq128Upper +template <class D> +HWY_INLINE MFromD<D> Eq128Upper(D d, const VFromD<D> a, const VFromD<D> b) { + static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64"); + const VFromD<D> eqHL = VecFromMask(d, Eq(a, b)); + // Replicate H to its neighbor. 
+ return MaskFromVec(OddEven(eqHL, detail::Slide1Down(eqHL))); +} + +// ------------------------------ Ne128 +template <class D> +HWY_INLINE MFromD<D> Ne128(D d, const VFromD<D> a, const VFromD<D> b) { + static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64"); + const VFromD<D> neHL = VecFromMask(d, Ne(a, b)); + const VFromD<D> neLH = Reverse2(d, neHL); + // b(267743505): Clang compiler bug, workaround is DoNotOptimize + asm volatile("" : : "r,m"(GetLane(neLH)) : "memory"); + return MaskFromVec(Or(neHL, neLH)); +} + +// ------------------------------ Ne128Upper +template <class D> +HWY_INLINE MFromD<D> Ne128Upper(D d, const VFromD<D> a, const VFromD<D> b) { + static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64"); + const VFromD<D> neHL = VecFromMask(d, Ne(a, b)); + const VFromD<D> down = detail::Slide1Down(neHL); + // b(267743505): Clang compiler bug, workaround is DoNotOptimize + asm volatile("" : : "r,m"(GetLane(down)) : "memory"); + // Replicate H to its neighbor. + return MaskFromVec(OddEven(neHL, down)); +} + +// ------------------------------ Min128, Max128 (Lt128) + +template <class D> +HWY_INLINE VFromD<D> Min128(D /* tag */, const VFromD<D> a, const VFromD<D> b) { + const VFromD<D> aXH = detail::Slide1Down(a); + const VFromD<D> bXH = detail::Slide1Down(b); + const VFromD<D> minHL = Min(a, b); + const MFromD<D> ltXH = Lt(aXH, bXH); + const MFromD<D> eqXH = Eq(aXH, bXH); + // If the upper lane is the decider, take lo from the same reg. + const VFromD<D> lo = IfThenElse(ltXH, a, b); + // The upper lane is just minHL; if they are equal, we also need to use the + // actual min of the lower lanes. 
+ return OddEven(minHL, IfThenElse(eqXH, minHL, lo)); +} + +template <class D> +HWY_INLINE VFromD<D> Max128(D /* tag */, const VFromD<D> a, const VFromD<D> b) { + const VFromD<D> aXH = detail::Slide1Down(a); + const VFromD<D> bXH = detail::Slide1Down(b); + const VFromD<D> maxHL = Max(a, b); + const MFromD<D> ltXH = Lt(aXH, bXH); + const MFromD<D> eqXH = Eq(aXH, bXH); + // If the upper lane is the decider, take lo from the same reg. + const VFromD<D> lo = IfThenElse(ltXH, b, a); + // The upper lane is just maxHL; if they are equal, we also need to use the + // actual max of the lower lanes. + return OddEven(maxHL, IfThenElse(eqXH, maxHL, lo)); +} + +template <class D> +HWY_INLINE VFromD<D> Min128Upper(D d, VFromD<D> a, VFromD<D> b) { + return IfThenElse(Lt128Upper(d, a, b), a, b); +} + +template <class D> +HWY_INLINE VFromD<D> Max128Upper(D d, VFromD<D> a, VFromD<D> b) { + return IfThenElse(Lt128Upper(d, b, a), a, b); +} + +// ================================================== END MACROS +namespace detail { // for code folding +#undef HWY_RVV_AVL +#undef HWY_RVV_D +#undef HWY_RVV_FOREACH +#undef HWY_RVV_FOREACH_08_ALL +#undef HWY_RVV_FOREACH_08_ALL_VIRT +#undef HWY_RVV_FOREACH_08_DEMOTE +#undef HWY_RVV_FOREACH_08_DEMOTE_VIRT +#undef HWY_RVV_FOREACH_08_EXT +#undef HWY_RVV_FOREACH_08_EXT_VIRT +#undef HWY_RVV_FOREACH_08_TRUNC +#undef HWY_RVV_FOREACH_08_VIRT +#undef HWY_RVV_FOREACH_16_ALL +#undef HWY_RVV_FOREACH_16_ALL_VIRT +#undef HWY_RVV_FOREACH_16_DEMOTE +#undef HWY_RVV_FOREACH_16_DEMOTE_VIRT +#undef HWY_RVV_FOREACH_16_EXT +#undef HWY_RVV_FOREACH_16_EXT_VIRT +#undef HWY_RVV_FOREACH_16_TRUNC +#undef HWY_RVV_FOREACH_16_VIRT +#undef HWY_RVV_FOREACH_32_ALL +#undef HWY_RVV_FOREACH_32_ALL_VIRT +#undef HWY_RVV_FOREACH_32_DEMOTE +#undef HWY_RVV_FOREACH_32_DEMOTE_VIRT +#undef HWY_RVV_FOREACH_32_EXT +#undef HWY_RVV_FOREACH_32_EXT_VIRT +#undef HWY_RVV_FOREACH_32_TRUNC +#undef HWY_RVV_FOREACH_32_VIRT +#undef HWY_RVV_FOREACH_64_ALL +#undef HWY_RVV_FOREACH_64_ALL_VIRT +#undef
HWY_RVV_FOREACH_64_DEMOTE +#undef HWY_RVV_FOREACH_64_DEMOTE_VIRT +#undef HWY_RVV_FOREACH_64_EXT +#undef HWY_RVV_FOREACH_64_EXT_VIRT +#undef HWY_RVV_FOREACH_64_TRUNC +#undef HWY_RVV_FOREACH_64_VIRT +#undef HWY_RVV_FOREACH_B +#undef HWY_RVV_FOREACH_F +#undef HWY_RVV_FOREACH_F16 +#undef HWY_RVV_FOREACH_F32 +#undef HWY_RVV_FOREACH_F3264 +#undef HWY_RVV_FOREACH_F64 +#undef HWY_RVV_FOREACH_I +#undef HWY_RVV_FOREACH_I08 +#undef HWY_RVV_FOREACH_I16 +#undef HWY_RVV_FOREACH_I163264 +#undef HWY_RVV_FOREACH_I32 +#undef HWY_RVV_FOREACH_I64 +#undef HWY_RVV_FOREACH_U +#undef HWY_RVV_FOREACH_U08 +#undef HWY_RVV_FOREACH_U16 +#undef HWY_RVV_FOREACH_U163264 +#undef HWY_RVV_FOREACH_U32 +#undef HWY_RVV_FOREACH_U64 +#undef HWY_RVV_FOREACH_UI +#undef HWY_RVV_FOREACH_UI08 +#undef HWY_RVV_FOREACH_UI16 +#undef HWY_RVV_FOREACH_UI163264 +#undef HWY_RVV_FOREACH_UI32 +#undef HWY_RVV_FOREACH_UI3264 +#undef HWY_RVV_FOREACH_UI64 +#undef HWY_RVV_M +#undef HWY_RVV_RETM_ARGM +#undef HWY_RVV_RETV_ARGV +#undef HWY_RVV_RETV_ARGVS +#undef HWY_RVV_RETV_ARGVV +#undef HWY_RVV_T +#undef HWY_RVV_V +} // namespace detail +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); diff --git a/third_party/highway/hwy/ops/scalar-inl.h b/third_party/highway/hwy/ops/scalar-inl.h new file mode 100644 index 0000000000..cef88df3ce --- /dev/null +++ b/third_party/highway/hwy/ops/scalar-inl.h @@ -0,0 +1,1845 @@ +// Copyright 2019 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +// Single-element vectors and operations. +// External include guard in highway.h - see comment there. + +#ifndef HWY_NO_LIBCXX +#include <math.h> // sqrtf +#endif + +#include "hwy/ops/shared-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +// Single instruction, single data. +template <typename T> +using Sisd = Simd<T, 1, 0>; + +// (Wrapper class required for overloading comparison operators.) +template <typename T> +struct Vec1 { + using PrivateT = T; // only for DFromV + static constexpr size_t kPrivateN = 1; // only for DFromV + + HWY_INLINE Vec1() = default; + Vec1(const Vec1&) = default; + Vec1& operator=(const Vec1&) = default; + HWY_INLINE explicit Vec1(const T t) : raw(t) {} + + HWY_INLINE Vec1& operator*=(const Vec1 other) { + return *this = (*this * other); + } + HWY_INLINE Vec1& operator/=(const Vec1 other) { + return *this = (*this / other); + } + HWY_INLINE Vec1& operator+=(const Vec1 other) { + return *this = (*this + other); + } + HWY_INLINE Vec1& operator-=(const Vec1 other) { + return *this = (*this - other); + } + HWY_INLINE Vec1& operator&=(const Vec1 other) { + return *this = (*this & other); + } + HWY_INLINE Vec1& operator|=(const Vec1 other) { + return *this = (*this | other); + } + HWY_INLINE Vec1& operator^=(const Vec1 other) { + return *this = (*this ^ other); + } + + T raw; +}; + +// 0 or FF..FF, same size as Vec1. +template <typename T> +class Mask1 { + using Raw = hwy::MakeUnsigned<T>; + + public: + static HWY_INLINE Mask1<T> FromBool(bool b) { + Mask1<T> mask; + mask.bits = b ? 
static_cast<Raw>(~Raw{0}) : 0; + return mask; + } + + Raw bits; +}; + +template <class V> +using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>; + +template <class V> +using TFromV = typename V::PrivateT; + +// ------------------------------ BitCast + +template <class DTo, typename TTo = TFromD<DTo>, typename TFrom> +HWY_API Vec1<TTo> BitCast(DTo /* tag */, Vec1<TFrom> v) { + static_assert(sizeof(TTo) <= sizeof(TFrom), "Promoting is undefined"); + TTo to; + CopyBytes<sizeof(TTo)>(&v.raw, &to); // not same size - ok to shrink + return Vec1<TTo>(to); +} + +// ------------------------------ Zero + +template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>> +HWY_API Vec1<T> Zero(D /* tag */) { + return Vec1<T>(T(0)); +} + +template <class D> +using VFromD = decltype(Zero(D())); + +// ------------------------------ Tuple (VFromD) +#include "hwy/ops/tuple-inl.h" + +// ------------------------------ Set +template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>, typename T2> +HWY_API Vec1<T> Set(D /* tag */, const T2 t) { + return Vec1<T>(static_cast<T>(t)); +} + +// ------------------------------ Undefined +template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>> +HWY_API Vec1<T> Undefined(D d) { + return Zero(d); +} + +// ------------------------------ Iota +template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>, typename T2> +HWY_API Vec1<T> Iota(const D /* tag */, const T2 first) { + return Vec1<T>(static_cast<T>(first)); +} + +// ------------------------------ ResizeBitCast + +template <class D, typename FromV> +HWY_API VFromD<D> ResizeBitCast(D /* tag */, FromV v) { + using TFrom = TFromV<FromV>; + using TTo = TFromD<D>; + constexpr size_t kCopyLen = HWY_MIN(sizeof(TFrom), sizeof(TTo)); + TTo to = TTo{0}; + CopyBytes<kCopyLen>(&v.raw, &to); + return VFromD<D>(to); +} + +namespace detail { + +// ResizeBitCast on the HWY_SCALAR target has zero-extending semantics if +// sizeof(TFromD<DTo>) is greater than sizeof(TFromV<FromV>) 
+template <class FromSizeTag, class ToSizeTag, class DTo, class DFrom> +HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast( + FromSizeTag /* from_size_tag */, ToSizeTag /* to_size_tag */, DTo d_to, DFrom /*d_from*/, + VFromD<DFrom> v) { + return ResizeBitCast(d_to, v); +} + +} // namespace scalar + + +// ================================================== LOGICAL + +// ------------------------------ Not + +template <typename T> +HWY_API Vec1<T> Not(const Vec1<T> v) { + using TU = MakeUnsigned<T>; + const Sisd<TU> du; + return BitCast(Sisd<T>(), Vec1<TU>(static_cast<TU>(~BitCast(du, v).raw))); +} + +// ------------------------------ And + +template <typename T> +HWY_API Vec1<T> And(const Vec1<T> a, const Vec1<T> b) { + using TU = MakeUnsigned<T>; + const Sisd<TU> du; + return BitCast(Sisd<T>(), Vec1<TU>(BitCast(du, a).raw & BitCast(du, b).raw)); +} +template <typename T> +HWY_API Vec1<T> operator&(const Vec1<T> a, const Vec1<T> b) { + return And(a, b); +} + +// ------------------------------ AndNot + +template <typename T> +HWY_API Vec1<T> AndNot(const Vec1<T> a, const Vec1<T> b) { + using TU = MakeUnsigned<T>; + const Sisd<TU> du; + return BitCast(Sisd<T>(), Vec1<TU>(static_cast<TU>(~BitCast(du, a).raw & + BitCast(du, b).raw))); +} + +// ------------------------------ Or + +template <typename T> +HWY_API Vec1<T> Or(const Vec1<T> a, const Vec1<T> b) { + using TU = MakeUnsigned<T>; + const Sisd<TU> du; + return BitCast(Sisd<T>(), Vec1<TU>(BitCast(du, a).raw | BitCast(du, b).raw)); +} +template <typename T> +HWY_API Vec1<T> operator|(const Vec1<T> a, const Vec1<T> b) { + return Or(a, b); +} + +// ------------------------------ Xor + +template <typename T> +HWY_API Vec1<T> Xor(const Vec1<T> a, const Vec1<T> b) { + using TU = MakeUnsigned<T>; + const Sisd<TU> du; + return BitCast(Sisd<T>(), Vec1<TU>(BitCast(du, a).raw ^ BitCast(du, b).raw)); +} +template <typename T> +HWY_API Vec1<T> operator^(const Vec1<T> a, const Vec1<T> b) { + return Xor(a, b); +} + +// 
------------------------------ Xor3 + +template <typename T> +HWY_API Vec1<T> Xor3(Vec1<T> x1, Vec1<T> x2, Vec1<T> x3) { + return Xor(x1, Xor(x2, x3)); +} + +// ------------------------------ Or3 + +template <typename T> +HWY_API Vec1<T> Or3(Vec1<T> o1, Vec1<T> o2, Vec1<T> o3) { + return Or(o1, Or(o2, o3)); +} + +// ------------------------------ OrAnd + +template <typename T> +HWY_API Vec1<T> OrAnd(const Vec1<T> o, const Vec1<T> a1, const Vec1<T> a2) { + return Or(o, And(a1, a2)); +} + +// ------------------------------ Mask + +template <class DTo, typename TTo = TFromD<DTo>, typename TFrom> +HWY_API Mask1<TTo> RebindMask(DTo /*tag*/, Mask1<TFrom> m) { + static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size"); + return Mask1<TTo>{m.bits}; +} + +// v must be 0 or FF..FF. +template <typename T> +HWY_API Mask1<T> MaskFromVec(const Vec1<T> v) { + Mask1<T> mask; + CopySameSize(&v, &mask); + return mask; +} + +template <class D> +using MFromD = decltype(MaskFromVec(VFromD<D>())); + +template <typename T> +Vec1<T> VecFromMask(const Mask1<T> mask) { + Vec1<T> v; + CopySameSize(&mask, &v); + return v; +} + +template <class D, typename T = TFromD<D>> +Vec1<T> VecFromMask(D /* tag */, const Mask1<T> mask) { + Vec1<T> v; + CopySameSize(&mask, &v); + return v; +} + +template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>> +HWY_API Mask1<T> FirstN(D /*tag*/, size_t n) { + return Mask1<T>::FromBool(n != 0); +} + +// ------------------------------ IfVecThenElse + +template <typename T> +HWY_API Vec1<T> IfVecThenElse(Vec1<T> mask, Vec1<T> yes, Vec1<T> no) { + return IfThenElse(MaskFromVec(mask), yes, no); +} + +// ------------------------------ CopySign + +template <typename T> +HWY_API Vec1<T> CopySign(const Vec1<T> magn, const Vec1<T> sign) { + static_assert(IsFloat<T>(), "Only makes sense for floating-point"); + const auto msb = SignBit(Sisd<T>()); + return Or(AndNot(msb, magn), And(msb, sign)); +} + +template <typename T> +HWY_API Vec1<T> CopySignToAbs(const 
Vec1<T> abs, const Vec1<T> sign) { + static_assert(IsFloat<T>(), "Only makes sense for floating-point"); + return Or(abs, And(SignBit(Sisd<T>()), sign)); +} + +// ------------------------------ BroadcastSignBit + +template <typename T> +HWY_API Vec1<T> BroadcastSignBit(const Vec1<T> v) { + // This is used inside ShiftRight, so we cannot implement in terms of it. + return v.raw < 0 ? Vec1<T>(T(-1)) : Vec1<T>(0); +} + +// ------------------------------ PopulationCount + +#ifdef HWY_NATIVE_POPCNT +#undef HWY_NATIVE_POPCNT +#else +#define HWY_NATIVE_POPCNT +#endif + +template <typename T> +HWY_API Vec1<T> PopulationCount(Vec1<T> v) { + return Vec1<T>(static_cast<T>(PopCount(v.raw))); +} + +// ------------------------------ IfThenElse + +// Returns mask ? yes : no. +template <typename T> +HWY_API Vec1<T> IfThenElse(const Mask1<T> mask, const Vec1<T> yes, + const Vec1<T> no) { + return mask.bits ? yes : no; +} + +template <typename T> +HWY_API Vec1<T> IfThenElseZero(const Mask1<T> mask, const Vec1<T> yes) { + return mask.bits ? yes : Vec1<T>(0); +} + +template <typename T> +HWY_API Vec1<T> IfThenZeroElse(const Mask1<T> mask, const Vec1<T> no) { + return mask.bits ? Vec1<T>(0) : no; +} + +template <typename T> +HWY_API Vec1<T> IfNegativeThenElse(Vec1<T> v, Vec1<T> yes, Vec1<T> no) { + return v.raw < 0 ? yes : no; +} + +template <typename T> +HWY_API Vec1<T> ZeroIfNegative(const Vec1<T> v) { + return v.raw < 0 ? 
Vec1<T>(0) : v; +} + +// ------------------------------ Mask logical + +template <typename T> +HWY_API Mask1<T> Not(const Mask1<T> m) { + return MaskFromVec(Not(VecFromMask(Sisd<T>(), m))); +} + +template <typename T> +HWY_API Mask1<T> And(const Mask1<T> a, Mask1<T> b) { + const Sisd<T> d; + return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b))); +} + +template <typename T> +HWY_API Mask1<T> AndNot(const Mask1<T> a, Mask1<T> b) { + const Sisd<T> d; + return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b))); +} + +template <typename T> +HWY_API Mask1<T> Or(const Mask1<T> a, Mask1<T> b) { + const Sisd<T> d; + return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b))); +} + +template <typename T> +HWY_API Mask1<T> Xor(const Mask1<T> a, Mask1<T> b) { + const Sisd<T> d; + return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b))); +} + +template <typename T> +HWY_API Mask1<T> ExclusiveNeither(const Mask1<T> a, Mask1<T> b) { + const Sisd<T> d; + return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b)))); +} + +// ================================================== SHIFTS + +// ------------------------------ ShiftLeft/ShiftRight (BroadcastSignBit) + +template <int kBits, typename T> +HWY_API Vec1<T> ShiftLeft(const Vec1<T> v) { + static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift"); + return Vec1<T>( + static_cast<T>(static_cast<hwy::MakeUnsigned<T>>(v.raw) << kBits)); +} + +template <int kBits, typename T> +HWY_API Vec1<T> ShiftRight(const Vec1<T> v) { + static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift"); +#if __cplusplus >= 202002L + // Signed right shift is now guaranteed to be arithmetic (rounding toward + // negative infinity, i.e. shifting in the sign bit). + return Vec1<T>(static_cast<T>(v.raw >> kBits)); +#else + if (IsSigned<T>()) { + // Emulate arithmetic shift using only logical (unsigned) shifts, because + // signed shifts are still implementation-defined. 
+ using TU = hwy::MakeUnsigned<T>; + const Sisd<TU> du; + const TU shifted = static_cast<TU>(BitCast(du, v).raw >> kBits); + const TU sign = BitCast(du, BroadcastSignBit(v)).raw; + const size_t sign_shift = + static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - kBits); + const TU upper = static_cast<TU>(sign << sign_shift); + return BitCast(Sisd<T>(), Vec1<TU>(shifted | upper)); + } else { // T is unsigned + return Vec1<T>(static_cast<T>(v.raw >> kBits)); + } +#endif +} + +// ------------------------------ RotateRight (ShiftRight) +template <int kBits, typename T> +HWY_API Vec1<T> RotateRight(const Vec1<T> v) { + constexpr size_t kSizeInBits = sizeof(T) * 8; + static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift"); + if (kBits == 0) return v; + return Or(ShiftRight<kBits>(v), + ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v)); +} + +// ------------------------------ ShiftLeftSame (BroadcastSignBit) + +template <typename T> +HWY_API Vec1<T> ShiftLeftSame(const Vec1<T> v, int bits) { + return Vec1<T>( + static_cast<T>(static_cast<hwy::MakeUnsigned<T>>(v.raw) << bits)); +} + +template <typename T> +HWY_API Vec1<T> ShiftRightSame(const Vec1<T> v, int bits) { +#if __cplusplus >= 202002L + // Signed right shift is now guaranteed to be arithmetic (rounding toward + // negative infinity, i.e. shifting in the sign bit). + return Vec1<T>(static_cast<T>(v.raw >> bits)); +#else + if (IsSigned<T>()) { + // Emulate arithmetic shift using only logical (unsigned) shifts, because + // signed shifts are still implementation-defined. 
+ using TU = hwy::MakeUnsigned<T>; + const Sisd<TU> du; + const TU shifted = static_cast<TU>(BitCast(du, v).raw >> bits); + const TU sign = BitCast(du, BroadcastSignBit(v)).raw; + const size_t sign_shift = + static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - bits); + const TU upper = static_cast<TU>(sign << sign_shift); + return BitCast(Sisd<T>(), Vec1<TU>(shifted | upper)); + } else { // T is unsigned + return Vec1<T>(static_cast<T>(v.raw >> bits)); + } +#endif +} + +// ------------------------------ Shl + +// Single-lane => same as ShiftLeftSame except for the argument type. +template <typename T> +HWY_API Vec1<T> operator<<(const Vec1<T> v, const Vec1<T> bits) { + return ShiftLeftSame(v, static_cast<int>(bits.raw)); +} + +template <typename T> +HWY_API Vec1<T> operator>>(const Vec1<T> v, const Vec1<T> bits) { + return ShiftRightSame(v, static_cast<int>(bits.raw)); +} + +// ================================================== ARITHMETIC + +template <typename T> +HWY_API Vec1<T> operator+(Vec1<T> a, Vec1<T> b) { + const uint64_t a64 = static_cast<uint64_t>(a.raw); + const uint64_t b64 = static_cast<uint64_t>(b.raw); + return Vec1<T>(static_cast<T>((a64 + b64) & static_cast<uint64_t>(~T(0)))); +} +HWY_API Vec1<float> operator+(const Vec1<float> a, const Vec1<float> b) { + return Vec1<float>(a.raw + b.raw); +} +HWY_API Vec1<double> operator+(const Vec1<double> a, const Vec1<double> b) { + return Vec1<double>(a.raw + b.raw); +} + +template <typename T> +HWY_API Vec1<T> operator-(Vec1<T> a, Vec1<T> b) { + const uint64_t a64 = static_cast<uint64_t>(a.raw); + const uint64_t b64 = static_cast<uint64_t>(b.raw); + return Vec1<T>(static_cast<T>((a64 - b64) & static_cast<uint64_t>(~T(0)))); +} +HWY_API Vec1<float> operator-(const Vec1<float> a, const Vec1<float> b) { + return Vec1<float>(a.raw - b.raw); +} +HWY_API Vec1<double> operator-(const Vec1<double> a, const Vec1<double> b) { + return Vec1<double>(a.raw - b.raw); +} + +// ------------------------------ SumsOf8 + 
+HWY_API Vec1<uint64_t> SumsOf8(const Vec1<uint8_t> v) { + return Vec1<uint64_t>(v.raw); +} + +// ------------------------------ SaturatedAdd + +// Returns a + b clamped to the destination range. + +// Unsigned +HWY_API Vec1<uint8_t> SaturatedAdd(const Vec1<uint8_t> a, + const Vec1<uint8_t> b) { + return Vec1<uint8_t>( + static_cast<uint8_t>(HWY_MIN(HWY_MAX(0, a.raw + b.raw), 255))); +} +HWY_API Vec1<uint16_t> SaturatedAdd(const Vec1<uint16_t> a, + const Vec1<uint16_t> b) { + return Vec1<uint16_t>(static_cast<uint16_t>( + HWY_MIN(HWY_MAX(0, static_cast<int32_t>(a.raw) + b.raw), 65535))); +} + +// Signed +HWY_API Vec1<int8_t> SaturatedAdd(const Vec1<int8_t> a, const Vec1<int8_t> b) { + return Vec1<int8_t>( + static_cast<int8_t>(HWY_MIN(HWY_MAX(-128, a.raw + b.raw), 127))); +} +HWY_API Vec1<int16_t> SaturatedAdd(const Vec1<int16_t> a, + const Vec1<int16_t> b) { + return Vec1<int16_t>(static_cast<int16_t>( + HWY_MIN(HWY_MAX(-32768, static_cast<int32_t>(a.raw) + b.raw), 32767))); +} + +// ------------------------------ Saturating subtraction + +// Returns a - b clamped to the destination range. 
+ +// Unsigned +HWY_API Vec1<uint8_t> SaturatedSub(const Vec1<uint8_t> a, + const Vec1<uint8_t> b) { + return Vec1<uint8_t>( + static_cast<uint8_t>(HWY_MIN(HWY_MAX(0, a.raw - b.raw), 255))); +} +HWY_API Vec1<uint16_t> SaturatedSub(const Vec1<uint16_t> a, + const Vec1<uint16_t> b) { + return Vec1<uint16_t>(static_cast<uint16_t>( + HWY_MIN(HWY_MAX(0, static_cast<int32_t>(a.raw) - b.raw), 65535))); +} + +// Signed +HWY_API Vec1<int8_t> SaturatedSub(const Vec1<int8_t> a, const Vec1<int8_t> b) { + return Vec1<int8_t>( + static_cast<int8_t>(HWY_MIN(HWY_MAX(-128, a.raw - b.raw), 127))); +} +HWY_API Vec1<int16_t> SaturatedSub(const Vec1<int16_t> a, + const Vec1<int16_t> b) { + return Vec1<int16_t>(static_cast<int16_t>( + HWY_MIN(HWY_MAX(-32768, static_cast<int32_t>(a.raw) - b.raw), 32767))); +} + +// ------------------------------ Average + +// Returns (a + b + 1) / 2 + +HWY_API Vec1<uint8_t> AverageRound(const Vec1<uint8_t> a, + const Vec1<uint8_t> b) { + return Vec1<uint8_t>(static_cast<uint8_t>((a.raw + b.raw + 1) / 2)); +} +HWY_API Vec1<uint16_t> AverageRound(const Vec1<uint16_t> a, + const Vec1<uint16_t> b) { + return Vec1<uint16_t>(static_cast<uint16_t>((a.raw + b.raw + 1) / 2)); +} + +// ------------------------------ Absolute value + +template <typename T> +HWY_API Vec1<T> Abs(const Vec1<T> a) { + const T i = a.raw; + if (i >= 0 || i == hwy::LimitsMin<T>()) return a; + return Vec1<T>(static_cast<T>(-i & T{-1})); +} +HWY_API Vec1<float> Abs(Vec1<float> a) { + int32_t i; + CopyBytes<sizeof(i)>(&a.raw, &i); + i &= 0x7FFFFFFF; + CopyBytes<sizeof(i)>(&i, &a.raw); + return a; +} +HWY_API Vec1<double> Abs(Vec1<double> a) { + int64_t i; + CopyBytes<sizeof(i)>(&a.raw, &i); + i &= 0x7FFFFFFFFFFFFFFFL; + CopyBytes<sizeof(i)>(&i, &a.raw); + return a; +} + +// ------------------------------ Min/Max + +// <cmath> may be unavailable, so implement our own. 
+namespace detail { + +static inline float Abs(float f) { + uint32_t i; + CopyBytes<4>(&f, &i); + i &= 0x7FFFFFFFu; + CopyBytes<4>(&i, &f); + return f; +} +static inline double Abs(double f) { + uint64_t i; + CopyBytes<8>(&f, &i); + i &= 0x7FFFFFFFFFFFFFFFull; + CopyBytes<8>(&i, &f); + return f; +} + +static inline bool SignBit(float f) { + uint32_t i; + CopyBytes<4>(&f, &i); + return (i >> 31) != 0; +} +static inline bool SignBit(double f) { + uint64_t i; + CopyBytes<8>(&f, &i); + return (i >> 63) != 0; +} + +} // namespace detail + +template <typename T, HWY_IF_NOT_FLOAT(T)> +HWY_API Vec1<T> Min(const Vec1<T> a, const Vec1<T> b) { + return Vec1<T>(HWY_MIN(a.raw, b.raw)); +} + +template <typename T, HWY_IF_FLOAT(T)> +HWY_API Vec1<T> Min(const Vec1<T> a, const Vec1<T> b) { + if (isnan(a.raw)) return b; + if (isnan(b.raw)) return a; + return Vec1<T>(HWY_MIN(a.raw, b.raw)); +} + +template <typename T, HWY_IF_NOT_FLOAT(T)> +HWY_API Vec1<T> Max(const Vec1<T> a, const Vec1<T> b) { + return Vec1<T>(HWY_MAX(a.raw, b.raw)); +} + +template <typename T, HWY_IF_FLOAT(T)> +HWY_API Vec1<T> Max(const Vec1<T> a, const Vec1<T> b) { + if (isnan(a.raw)) return b; + if (isnan(b.raw)) return a; + return Vec1<T>(HWY_MAX(a.raw, b.raw)); +} + +// ------------------------------ Floating-point negate + +template <typename T, HWY_IF_FLOAT(T)> +HWY_API Vec1<T> Neg(const Vec1<T> v) { + return Xor(v, SignBit(Sisd<T>())); +} + +template <typename T, HWY_IF_NOT_FLOAT(T)> +HWY_API Vec1<T> Neg(const Vec1<T> v) { + return Zero(Sisd<T>()) - v; +} + +// ------------------------------ mul/div + +// Per-target flags to prevent generic_ops-inl.h defining 8/64-bit operator*. 
+#ifdef HWY_NATIVE_MUL_8 +#undef HWY_NATIVE_MUL_8 +#else +#define HWY_NATIVE_MUL_8 +#endif +#ifdef HWY_NATIVE_MUL_64 +#undef HWY_NATIVE_MUL_64 +#else +#define HWY_NATIVE_MUL_64 +#endif + +template <typename T, HWY_IF_FLOAT(T)> +HWY_API Vec1<T> operator*(const Vec1<T> a, const Vec1<T> b) { + return Vec1<T>(static_cast<T>(double{a.raw} * b.raw)); +} + +template <typename T, HWY_IF_NOT_FLOAT(T)> +HWY_API Vec1<T> operator*(const Vec1<T> a, const Vec1<T> b) { + return Vec1<T>(static_cast<T>(static_cast<uint64_t>(a.raw) * + static_cast<uint64_t>(b.raw))); +} + +template <typename T> +HWY_API Vec1<T> operator/(const Vec1<T> a, const Vec1<T> b) { + return Vec1<T>(a.raw / b.raw); +} + +// Returns the upper 16 bits of a * b in each lane. +HWY_API Vec1<int16_t> MulHigh(const Vec1<int16_t> a, const Vec1<int16_t> b) { + return Vec1<int16_t>(static_cast<int16_t>((a.raw * b.raw) >> 16)); +} +HWY_API Vec1<uint16_t> MulHigh(const Vec1<uint16_t> a, const Vec1<uint16_t> b) { + // Cast to uint32_t first to prevent overflow. Otherwise the result of + // uint16_t * uint16_t is in "int" which may overflow. In practice the result + // is the same but this way it is also defined. + return Vec1<uint16_t>(static_cast<uint16_t>( + (static_cast<uint32_t>(a.raw) * static_cast<uint32_t>(b.raw)) >> 16)); +} + +HWY_API Vec1<int16_t> MulFixedPoint15(Vec1<int16_t> a, Vec1<int16_t> b) { + return Vec1<int16_t>(static_cast<int16_t>((2 * a.raw * b.raw + 32768) >> 16)); +} + +// Multiplies even lanes (0, 2 ..) and returns the double-wide result. 
+HWY_API Vec1<int64_t> MulEven(const Vec1<int32_t> a, const Vec1<int32_t> b) { + const int64_t a64 = a.raw; + return Vec1<int64_t>(a64 * b.raw); +} +HWY_API Vec1<uint64_t> MulEven(const Vec1<uint32_t> a, const Vec1<uint32_t> b) { + const uint64_t a64 = a.raw; + return Vec1<uint64_t>(a64 * b.raw); +} + +// Approximate reciprocal +HWY_API Vec1<float> ApproximateReciprocal(const Vec1<float> v) { + // Zero inputs are allowed, but callers are responsible for replacing the + // return value with something else (typically using IfThenElse). This check + // avoids a ubsan error. The return value is arbitrary. + if (v.raw == 0.0f) return Vec1<float>(0.0f); + return Vec1<float>(1.0f / v.raw); +} + +// Absolute value of difference. +HWY_API Vec1<float> AbsDiff(const Vec1<float> a, const Vec1<float> b) { + return Abs(a - b); +} + +// ------------------------------ Floating-point multiply-add variants + +template <typename T> +HWY_API Vec1<T> MulAdd(const Vec1<T> mul, const Vec1<T> x, const Vec1<T> add) { + return mul * x + add; +} + +template <typename T> +HWY_API Vec1<T> NegMulAdd(const Vec1<T> mul, const Vec1<T> x, + const Vec1<T> add) { + return add - mul * x; +} + +template <typename T> +HWY_API Vec1<T> MulSub(const Vec1<T> mul, const Vec1<T> x, const Vec1<T> sub) { + return mul * x - sub; +} + +template <typename T> +HWY_API Vec1<T> NegMulSub(const Vec1<T> mul, const Vec1<T> x, + const Vec1<T> sub) { + return Neg(mul) * x - sub; +} + +// ------------------------------ Floating-point square root + +// Approximate reciprocal square root +HWY_API Vec1<float> ApproximateReciprocalSqrt(const Vec1<float> v) { + float f = v.raw; + const float half = f * 0.5f; + uint32_t bits; + CopySameSize(&f, &bits); + // Initial guess based on log2(f) + bits = 0x5F3759DF - (bits >> 1); + CopySameSize(&bits, &f); + // One Newton-Raphson iteration + return Vec1<float>(f * (1.5f - (half * f * f))); +} + +// Square root +HWY_API Vec1<float> Sqrt(Vec1<float> v) { +#if defined(HWY_NO_LIBCXX) +#if 
HWY_COMPILER_GCC_ACTUAL + return Vec1<float>(__builtin_sqrt(v.raw)); +#else + uint32_t bits; + CopyBytes<sizeof(bits)>(&v, &bits); + // Coarse approximation, letting the exponent LSB leak into the mantissa + bits = (1 << 29) + (bits >> 1) - (1 << 22); + CopyBytes<sizeof(bits)>(&bits, &v); + return v; +#endif // !HWY_COMPILER_GCC_ACTUAL +#else + return Vec1<float>(sqrtf(v.raw)); +#endif // !HWY_NO_LIBCXX +} +HWY_API Vec1<double> Sqrt(Vec1<double> v) { +#if defined(HWY_NO_LIBCXX) +#if HWY_COMPILER_GCC_ACTUAL + return Vec1<double>(__builtin_sqrt(v.raw)); +#else + uint64_t bits; + CopyBytes<sizeof(bits)>(&v, &bits); + // Coarse approximation, letting the exponent LSB leak into the mantissa + bits = (1ULL << 61) + (bits >> 1) - (1ULL << 51); + CopyBytes<sizeof(bits)>(&bits, &v); + return v; +#endif // !HWY_COMPILER_GCC_ACTUAL +#else + return Vec1<double>(sqrt(v.raw)); +#endif // HWY_NO_LIBCXX +} + +// ------------------------------ Floating-point rounding + +template <typename T> +HWY_API Vec1<T> Round(const Vec1<T> v) { + using TI = MakeSigned<T>; + if (!(Abs(v).raw < MantissaEnd<T>())) { // Huge or NaN + return v; + } + const T bias = v.raw < T(0.0) ? T(-0.5) : T(0.5); + const TI rounded = static_cast<TI>(v.raw + bias); + if (rounded == 0) return CopySignToAbs(Vec1<T>(0), v); + // Round to even + if ((rounded & 1) && detail::Abs(static_cast<T>(rounded) - v.raw) == T(0.5)) { + return Vec1<T>(static_cast<T>(rounded - (v.raw < T(0) ? -1 : 1))); + } + return Vec1<T>(static_cast<T>(rounded)); +} + +// Round-to-nearest even. +HWY_API Vec1<int32_t> NearestInt(const Vec1<float> v) { + using T = float; + using TI = int32_t; + + const T abs = Abs(v).raw; + const bool is_sign = detail::SignBit(v.raw); + + if (!(abs < MantissaEnd<T>())) { // Huge or NaN + // Check if too large to cast or NaN + if (!(abs <= static_cast<T>(LimitsMax<TI>()))) { + return Vec1<TI>(is_sign ? 
LimitsMin<TI>() : LimitsMax<TI>()); + } + return Vec1<int32_t>(static_cast<TI>(v.raw)); + } + const T bias = v.raw < T(0.0) ? T(-0.5) : T(0.5); + const TI rounded = static_cast<TI>(v.raw + bias); + if (rounded == 0) return Vec1<int32_t>(0); + // Round to even + if ((rounded & 1) && detail::Abs(static_cast<T>(rounded) - v.raw) == T(0.5)) { + return Vec1<TI>(rounded - (is_sign ? -1 : 1)); + } + return Vec1<TI>(rounded); +} + +template <typename T> +HWY_API Vec1<T> Trunc(const Vec1<T> v) { + using TI = MakeSigned<T>; + if (!(Abs(v).raw <= MantissaEnd<T>())) { // Huge or NaN + return v; + } + const TI truncated = static_cast<TI>(v.raw); + if (truncated == 0) return CopySignToAbs(Vec1<T>(0), v); + return Vec1<T>(static_cast<T>(truncated)); +} + +template <typename Float, typename Bits, int kMantissaBits, int kExponentBits, + class V> +V Ceiling(const V v) { + const Bits kExponentMask = (1ull << kExponentBits) - 1; + const Bits kMantissaMask = (1ull << kMantissaBits) - 1; + const Bits kBias = kExponentMask / 2; + + Float f = v.raw; + const bool positive = f > Float(0.0); + + Bits bits; + CopySameSize(&v, &bits); + + const int exponent = + static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias); + // Already an integer. + if (exponent >= kMantissaBits) return v; + // |v| <= 1 => 0 or 1. + if (exponent < 0) return positive ? 
V(1) : V(-0.0); + + const Bits mantissa_mask = kMantissaMask >> exponent; + // Already an integer + if ((bits & mantissa_mask) == 0) return v; + + // Clear fractional bits and round up + if (positive) bits += (kMantissaMask + 1) >> exponent; + bits &= ~mantissa_mask; + + CopySameSize(&bits, &f); + return V(f); +} + +template <typename Float, typename Bits, int kMantissaBits, int kExponentBits, + class V> +V Floor(const V v) { + const Bits kExponentMask = (1ull << kExponentBits) - 1; + const Bits kMantissaMask = (1ull << kMantissaBits) - 1; + const Bits kBias = kExponentMask / 2; + + Float f = v.raw; + const bool negative = f < Float(0.0); + + Bits bits; + CopySameSize(&v, &bits); + + const int exponent = + static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias); + // Already an integer. + if (exponent >= kMantissaBits) return v; + // |v| <= 1 => -1 or 0. + if (exponent < 0) return V(negative ? Float(-1.0) : Float(0.0)); + + const Bits mantissa_mask = kMantissaMask >> exponent; + // Already an integer + if ((bits & mantissa_mask) == 0) return v; + + // Clear fractional bits and round down + if (negative) bits += (kMantissaMask + 1) >> exponent; + bits &= ~mantissa_mask; + + CopySameSize(&bits, &f); + return V(f); +} + +// Toward +infinity, aka ceiling +HWY_API Vec1<float> Ceil(const Vec1<float> v) { + return Ceiling<float, uint32_t, 23, 8>(v); +} +HWY_API Vec1<double> Ceil(const Vec1<double> v) { + return Ceiling<double, uint64_t, 52, 11>(v); +} + +// Toward -infinity, aka floor +HWY_API Vec1<float> Floor(const Vec1<float> v) { + return Floor<float, uint32_t, 23, 8>(v); +} +HWY_API Vec1<double> Floor(const Vec1<double> v) { + return Floor<double, uint64_t, 52, 11>(v); +} + +// ================================================== COMPARE + +template <typename T> +HWY_API Mask1<T> operator==(const Vec1<T> a, const Vec1<T> b) { + return Mask1<T>::FromBool(a.raw == b.raw); +} + +template <typename T> +HWY_API Mask1<T> operator!=(const Vec1<T> a, const Vec1<T> 
b) { + return Mask1<T>::FromBool(a.raw != b.raw); +} + +template <typename T> +HWY_API Mask1<T> TestBit(const Vec1<T> v, const Vec1<T> bit) { + static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported"); + return (v & bit) == bit; +} + +template <typename T> +HWY_API Mask1<T> operator<(const Vec1<T> a, const Vec1<T> b) { + return Mask1<T>::FromBool(a.raw < b.raw); +} +template <typename T> +HWY_API Mask1<T> operator>(const Vec1<T> a, const Vec1<T> b) { + return Mask1<T>::FromBool(a.raw > b.raw); +} + +template <typename T> +HWY_API Mask1<T> operator<=(const Vec1<T> a, const Vec1<T> b) { + return Mask1<T>::FromBool(a.raw <= b.raw); +} +template <typename T> +HWY_API Mask1<T> operator>=(const Vec1<T> a, const Vec1<T> b) { + return Mask1<T>::FromBool(a.raw >= b.raw); +} + +// ------------------------------ Floating-point classification (==) + +template <typename T> +HWY_API Mask1<T> IsNaN(const Vec1<T> v) { + // std::isnan returns false for 0x7F..FF in clang AVX3 builds, so DIY. + MakeUnsigned<T> bits; + CopySameSize(&v, &bits); + bits += bits; + bits >>= 1; // clear sign bit + // NaN if all exponent bits are set and the mantissa is not zero. + return Mask1<T>::FromBool(bits > ExponentMask<T>()); +} + +HWY_API Mask1<float> IsInf(const Vec1<float> v) { + const Sisd<float> d; + const RebindToUnsigned<decltype(d)> du; + const Vec1<uint32_t> vu = BitCast(du, v); + // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0. + return RebindMask(d, (vu + vu) == Set(du, 0xFF000000u)); +} +HWY_API Mask1<double> IsInf(const Vec1<double> v) { + const Sisd<double> d; + const RebindToUnsigned<decltype(d)> du; + const Vec1<uint64_t> vu = BitCast(du, v); + // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0. 
+ return RebindMask(d, (vu + vu) == Set(du, 0xFFE0000000000000ull)); +} + +HWY_API Mask1<float> IsFinite(const Vec1<float> v) { + const Vec1<uint32_t> vu = BitCast(Sisd<uint32_t>(), v); + // Shift left to clear the sign bit, check whether exponent != max value. + return Mask1<float>::FromBool((vu.raw << 1) < 0xFF000000u); +} +HWY_API Mask1<double> IsFinite(const Vec1<double> v) { + const Vec1<uint64_t> vu = BitCast(Sisd<uint64_t>(), v); + // Shift left to clear the sign bit, check whether exponent != max value. + return Mask1<double>::FromBool((vu.raw << 1) < 0xFFE0000000000000ull); +} + +// ================================================== MEMORY + +// ------------------------------ Load + +template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>> +HWY_API Vec1<T> Load(D /* tag */, const T* HWY_RESTRICT aligned) { + T t; + CopySameSize(aligned, &t); + return Vec1<T>(t); +} + +template <class D, typename T = TFromD<D>> +HWY_API Vec1<T> MaskedLoad(Mask1<T> m, D d, const T* HWY_RESTRICT aligned) { + return IfThenElseZero(m, Load(d, aligned)); +} + +template <class D, typename T = TFromD<D>> +HWY_API Vec1<T> MaskedLoadOr(Vec1<T> v, Mask1<T> m, D d, + const T* HWY_RESTRICT aligned) { + return IfThenElse(m, Load(d, aligned), v); +} + +template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>> +HWY_API Vec1<T> LoadU(D d, const T* HWY_RESTRICT p) { + return Load(d, p); +} + +// In some use cases, "load single lane" is sufficient; otherwise avoid this. 
+template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>> +HWY_API Vec1<T> LoadDup128(D d, const T* HWY_RESTRICT aligned) { + return Load(d, aligned); +} + +// ------------------------------ Store + +template <class D, typename T = TFromD<D>> +HWY_API void Store(const Vec1<T> v, D /* tag */, T* HWY_RESTRICT aligned) { + CopySameSize(&v.raw, aligned); +} + +template <class D, typename T = TFromD<D>> +HWY_API void StoreU(const Vec1<T> v, D d, T* HWY_RESTRICT p) { + return Store(v, d, p); +} + +template <class D, typename T = TFromD<D>> +HWY_API void BlendedStore(const Vec1<T> v, Mask1<T> m, D d, T* HWY_RESTRICT p) { + if (!m.bits) return; + StoreU(v, d, p); +} + +// ------------------------------ LoadInterleaved2/3/4 + +// Per-target flag to prevent generic_ops-inl.h from defining StoreInterleaved2. +#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED +#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED +#else +#define HWY_NATIVE_LOAD_STORE_INTERLEAVED +#endif + +template <class D, typename T = TFromD<D>> +HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned, Vec1<T>& v0, + Vec1<T>& v1) { + v0 = LoadU(d, unaligned + 0); + v1 = LoadU(d, unaligned + 1); +} + +template <class D, typename T = TFromD<D>> +HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned, Vec1<T>& v0, + Vec1<T>& v1, Vec1<T>& v2) { + v0 = LoadU(d, unaligned + 0); + v1 = LoadU(d, unaligned + 1); + v2 = LoadU(d, unaligned + 2); +} + +template <class D, typename T = TFromD<D>> +HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned, Vec1<T>& v0, + Vec1<T>& v1, Vec1<T>& v2, Vec1<T>& v3) { + v0 = LoadU(d, unaligned + 0); + v1 = LoadU(d, unaligned + 1); + v2 = LoadU(d, unaligned + 2); + v3 = LoadU(d, unaligned + 3); +} + +// ------------------------------ StoreInterleaved2/3/4 + +template <class D, typename T = TFromD<D>> +HWY_API void StoreInterleaved2(const Vec1<T> v0, const Vec1<T> v1, D d, + T* HWY_RESTRICT unaligned) { + StoreU(v0, d, unaligned + 0); + StoreU(v1, d, unaligned + 
1); +} + +template <class D, typename T = TFromD<D>> +HWY_API void StoreInterleaved3(const Vec1<T> v0, const Vec1<T> v1, + const Vec1<T> v2, D d, + T* HWY_RESTRICT unaligned) { + StoreU(v0, d, unaligned + 0); + StoreU(v1, d, unaligned + 1); + StoreU(v2, d, unaligned + 2); +} + +template <class D, typename T = TFromD<D>> +HWY_API void StoreInterleaved4(const Vec1<T> v0, const Vec1<T> v1, + const Vec1<T> v2, const Vec1<T> v3, D d, + T* HWY_RESTRICT unaligned) { + StoreU(v0, d, unaligned + 0); + StoreU(v1, d, unaligned + 1); + StoreU(v2, d, unaligned + 2); + StoreU(v3, d, unaligned + 3); +} + +// ------------------------------ Stream + +template <class D, typename T = TFromD<D>> +HWY_API void Stream(const Vec1<T> v, D d, T* HWY_RESTRICT aligned) { + return Store(v, d, aligned); +} + +// ------------------------------ Scatter + +template <class D, typename T = TFromD<D>, typename TI> +HWY_API void ScatterOffset(Vec1<T> v, D d, T* base, Vec1<TI> offset) { + static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match"); + uint8_t* const base8 = reinterpret_cast<uint8_t*>(base) + offset.raw; + return Store(v, d, reinterpret_cast<T*>(base8)); +} + +template <class D, typename T = TFromD<D>, typename TI> +HWY_API void ScatterIndex(Vec1<T> v, D d, T* HWY_RESTRICT base, + Vec1<TI> index) { + static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match"); + return Store(v, d, base + index.raw); +} + +// ------------------------------ Gather + +template <class D, typename T = TFromD<D>, typename TI> +HWY_API Vec1<T> GatherOffset(D d, const T* base, Vec1<TI> offset) { + static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match"); + const intptr_t addr = + reinterpret_cast<intptr_t>(base) + static_cast<intptr_t>(offset.raw); + return Load(d, reinterpret_cast<const T*>(addr)); +} + +template <class D, typename T = TFromD<D>, typename TI> +HWY_API Vec1<T> GatherIndex(D d, const T* HWY_RESTRICT base, Vec1<TI> index) { + static_assert(sizeof(T) == sizeof(TI), 
"Index/lane size must match"); + return Load(d, base + index.raw); +} + +// ================================================== CONVERT + +// ConvertTo and DemoteTo with floating-point input and integer output truncate +// (rounding toward zero). + +template <class DTo, typename TTo = TFromD<DTo>, typename TFrom> +HWY_API Vec1<TTo> PromoteTo(DTo /* tag */, Vec1<TFrom> from) { + static_assert(sizeof(TTo) > sizeof(TFrom), "Not promoting"); + // For bits Y > X, floatX->floatY and intX->intY are always representable. + return Vec1<TTo>(static_cast<TTo>(from.raw)); +} + +// MSVC 19.10 cannot deduce the argument type if HWY_IF_FLOAT(TFrom) is here, +// so we overload for TFrom=double and TTo={float,int32_t}. +template <class D, HWY_IF_F32_D(D)> +HWY_API Vec1<float> DemoteTo(D /* tag */, Vec1<double> from) { + // Prevent ubsan errors when converting float to narrower integer/float + if (IsInf(from).bits || + Abs(from).raw > static_cast<double>(HighestValue<float>())) { + return Vec1<float>(detail::SignBit(from.raw) ? LowestValue<float>() + : HighestValue<float>()); + } + return Vec1<float>(static_cast<float>(from.raw)); +} +template <class D, HWY_IF_I32_D(D)> +HWY_API Vec1<int32_t> DemoteTo(D /* tag */, Vec1<double> from) { + // Prevent ubsan errors when converting int32_t to narrower integer/int32_t + if (IsInf(from).bits || + Abs(from).raw > static_cast<double>(HighestValue<int32_t>())) { + return Vec1<int32_t>(detail::SignBit(from.raw) ? 
LowestValue<int32_t>() + : HighestValue<int32_t>()); + } + return Vec1<int32_t>(static_cast<int32_t>(from.raw)); +} + +template <class DTo, typename TTo = TFromD<DTo>, typename TFrom, + HWY_IF_SIGNED(TFrom), HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<DTo>)> +HWY_API Vec1<TTo> DemoteTo(DTo /* tag */, Vec1<TFrom> from) { + static_assert(!IsFloat<TFrom>(), "TFrom=double are handled above"); + static_assert(sizeof(TTo) < sizeof(TFrom), "Not demoting"); + + // Int to int: choose closest value in TTo to `from` (avoids UB) + from.raw = HWY_MIN(HWY_MAX(LimitsMin<TTo>(), from.raw), LimitsMax<TTo>()); + return Vec1<TTo>(static_cast<TTo>(from.raw)); +} + +template <class DTo, typename TTo = TFromD<DTo>, typename TFrom, + HWY_IF_UNSIGNED(TFrom), HWY_IF_UNSIGNED_D(DTo)> +HWY_API Vec1<TTo> DemoteTo(DTo /* tag */, Vec1<TFrom> from) { + static_assert(!IsFloat<TFrom>(), "TFrom=double are handled above"); + static_assert(sizeof(TTo) < sizeof(TFrom), "Not demoting"); + + // Int to int: choose closest value in TTo to `from` (avoids UB) + from.raw = HWY_MIN(from.raw, LimitsMax<TTo>()); + return Vec1<TTo>(static_cast<TTo>(from.raw)); +} + +template <class D, HWY_IF_F32_D(D)> +HWY_API Vec1<float> PromoteTo(D /* tag */, const Vec1<float16_t> v) { + uint16_t bits16; + CopySameSize(&v.raw, &bits16); + const uint32_t sign = static_cast<uint32_t>(bits16 >> 15); + const uint32_t biased_exp = (bits16 >> 10) & 0x1F; + const uint32_t mantissa = bits16 & 0x3FF; + + // Subnormal or zero + if (biased_exp == 0) { + const float subnormal = + (1.0f / 16384) * (static_cast<float>(mantissa) * (1.0f / 1024)); + return Vec1<float>(sign ? -subnormal : subnormal); + } + + // Normalized: convert the representation directly (faster than ldexp/tables). 
+ const uint32_t biased_exp32 = biased_exp + (127 - 15); + const uint32_t mantissa32 = mantissa << (23 - 10); + const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32; + float out; + CopySameSize(&bits32, &out); + return Vec1<float>(out); +} + +template <class D, HWY_IF_F32_D(D)> +HWY_API Vec1<float> PromoteTo(D d, const Vec1<bfloat16_t> v) { + return Set(d, F32FromBF16(v.raw)); +} + +template <class D, HWY_IF_F16_D(D)> +HWY_API Vec1<float16_t> DemoteTo(D /* tag */, const Vec1<float> v) { + uint32_t bits32; + CopySameSize(&v.raw, &bits32); + const uint32_t sign = bits32 >> 31; + const uint32_t biased_exp32 = (bits32 >> 23) & 0xFF; + const uint32_t mantissa32 = bits32 & 0x7FFFFF; + + const int32_t exp = HWY_MIN(static_cast<int32_t>(biased_exp32) - 127, 15); + + // Tiny or zero => zero. + Vec1<float16_t> out; + if (exp < -24) { + const uint16_t zero = 0; + CopySameSize(&zero, &out.raw); + return out; + } + + uint32_t biased_exp16, mantissa16; + + // exp = [-24, -15] => subnormal + if (exp < -14) { + biased_exp16 = 0; + const uint32_t sub_exp = static_cast<uint32_t>(-14 - exp); + HWY_DASSERT(1 <= sub_exp && sub_exp < 11); + mantissa16 = static_cast<uint32_t>((1u << (10 - sub_exp)) + + (mantissa32 >> (13 + sub_exp))); + } else { + // exp = [-14, 15] + biased_exp16 = static_cast<uint32_t>(exp + 15); + HWY_DASSERT(1 <= biased_exp16 && biased_exp16 < 31); + mantissa16 = mantissa32 >> 13; + } + + HWY_DASSERT(mantissa16 < 1024); + const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16; + HWY_DASSERT(bits16 < 0x10000); + const uint16_t narrowed = static_cast<uint16_t>(bits16); // big-endian safe + CopySameSize(&narrowed, &out.raw); + return out; +} + +template <class D, HWY_IF_BF16_D(D)> +HWY_API Vec1<bfloat16_t> DemoteTo(D d, const Vec1<float> v) { + return Set(d, BF16FromF32(v.raw)); +} + +template <class DTo, typename TTo = TFromD<DTo>, typename TFrom, + HWY_IF_FLOAT(TFrom)> +HWY_API Vec1<TTo> ConvertTo(DTo /* tag */, Vec1<TFrom> from) 
{ + static_assert(sizeof(TTo) == sizeof(TFrom), "Should have same size"); + // float## -> int##: return closest representable value. We cannot exactly + // represent LimitsMax<TTo> in TFrom, so use double. + const double f = static_cast<double>(from.raw); + if (IsInf(from).bits || + Abs(Vec1<double>(f)).raw > static_cast<double>(LimitsMax<TTo>())) { + return Vec1<TTo>(detail::SignBit(from.raw) ? LimitsMin<TTo>() + : LimitsMax<TTo>()); + } + return Vec1<TTo>(static_cast<TTo>(from.raw)); +} + +template <class DTo, typename TTo = TFromD<DTo>, typename TFrom, + HWY_IF_NOT_FLOAT(TFrom)> +HWY_API Vec1<TTo> ConvertTo(DTo /* tag */, Vec1<TFrom> from) { + static_assert(sizeof(TTo) == sizeof(TFrom), "Should have same size"); + // int## -> float##: no check needed + return Vec1<TTo>(static_cast<TTo>(from.raw)); +} + +HWY_API Vec1<uint8_t> U8FromU32(const Vec1<uint32_t> v) { + return DemoteTo(Sisd<uint8_t>(), v); +} + +// ------------------------------ TruncateTo + +template <class D, HWY_IF_U8_D(D)> +HWY_API Vec1<uint8_t> TruncateTo(D /* tag */, Vec1<uint64_t> v) { + return Vec1<uint8_t>{static_cast<uint8_t>(v.raw & 0xFF)}; +} + +template <class D, HWY_IF_U16_D(D)> +HWY_API Vec1<uint16_t> TruncateTo(D /* tag */, Vec1<uint64_t> v) { + return Vec1<uint16_t>{static_cast<uint16_t>(v.raw & 0xFFFF)}; +} + +template <class D, HWY_IF_U32_D(D)> +HWY_API Vec1<uint32_t> TruncateTo(D /* tag */, Vec1<uint64_t> v) { + return Vec1<uint32_t>{static_cast<uint32_t>(v.raw & 0xFFFFFFFFu)}; +} + +template <class D, HWY_IF_U8_D(D)> +HWY_API Vec1<uint8_t> TruncateTo(D /* tag */, Vec1<uint32_t> v) { + return Vec1<uint8_t>{static_cast<uint8_t>(v.raw & 0xFF)}; +} + +template <class D, HWY_IF_U16_D(D)> +HWY_API Vec1<uint16_t> TruncateTo(D /* tag */, Vec1<uint32_t> v) { + return Vec1<uint16_t>{static_cast<uint16_t>(v.raw & 0xFFFF)}; +} + +template <class D, HWY_IF_U8_D(D)> +HWY_API Vec1<uint8_t> TruncateTo(D /* tag */, Vec1<uint16_t> v) { + return Vec1<uint8_t>{static_cast<uint8_t>(v.raw & 0xFF)}; +} + 
+// ================================================== COMBINE +// UpperHalf, ZeroExtendVector, Combine, Concat* are unsupported. + +template <typename T> +HWY_API Vec1<T> LowerHalf(Vec1<T> v) { + return v; +} + +template <class D, typename T = TFromD<D>> +HWY_API Vec1<T> LowerHalf(D /* tag */, Vec1<T> v) { + return v; +} + +// ================================================== SWIZZLE + +template <typename T> +HWY_API T GetLane(const Vec1<T> v) { + return v.raw; +} + +template <typename T> +HWY_API T ExtractLane(const Vec1<T> v, size_t i) { + HWY_DASSERT(i == 0); + (void)i; + return v.raw; +} + +template <typename T> +HWY_API Vec1<T> InsertLane(Vec1<T> v, size_t i, T t) { + HWY_DASSERT(i == 0); + (void)i; + v.raw = t; + return v; +} + +template <typename T> +HWY_API Vec1<T> DupEven(Vec1<T> v) { + return v; +} +// DupOdd is unsupported. + +template <typename T> +HWY_API Vec1<T> OddEven(Vec1<T> /* odd */, Vec1<T> even) { + return even; +} + +template <typename T> +HWY_API Vec1<T> OddEvenBlocks(Vec1<T> /* odd */, Vec1<T> even) { + return even; +} + +// ------------------------------ SwapAdjacentBlocks + +template <typename T> +HWY_API Vec1<T> SwapAdjacentBlocks(Vec1<T> v) { + return v; +} + +// ------------------------------ TableLookupLanes + +// Returned by SetTableIndices for use by TableLookupLanes. 
+template <typename T> +struct Indices1 { + MakeSigned<T> raw; +}; + +template <class D, typename T = TFromD<D>, typename TI> +HWY_API Indices1<T> IndicesFromVec(D, Vec1<TI> vec) { + static_assert(sizeof(T) == sizeof(TI), "Index size must match lane size"); + HWY_DASSERT(vec.raw <= 1); + return Indices1<T>{static_cast<MakeSigned<T>>(vec.raw)}; +} + +template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>, typename TI> +HWY_API Indices1<T> SetTableIndices(D d, const TI* idx) { + return IndicesFromVec(d, LoadU(Sisd<TI>(), idx)); +} + +template <typename T> +HWY_API Vec1<T> TableLookupLanes(const Vec1<T> v, const Indices1<T> /* idx */) { + return v; +} + +template <typename T> +HWY_API Vec1<T> TwoTablesLookupLanes(const Vec1<T> a, const Vec1<T> b, + const Indices1<T> idx) { + return (idx.raw == 0) ? a : b; +} + +// ------------------------------ ReverseBlocks + +// Single block: no change +template <class D, typename T = TFromD<D>> +HWY_API Vec1<T> ReverseBlocks(D /* tag */, const Vec1<T> v) { + return v; +} + +// ------------------------------ Reverse + +template <class D, typename T = TFromD<D>> +HWY_API Vec1<T> Reverse(D /* tag */, const Vec1<T> v) { + return v; +} + +// Per-target flag to prevent generic_ops-inl.h defining 8-bit Reverse2/4/8. 
+#ifdef HWY_NATIVE_REVERSE2_8 +#undef HWY_NATIVE_REVERSE2_8 +#else +#define HWY_NATIVE_REVERSE2_8 +#endif + +// Must not be called: +template <class D, typename T = TFromD<D>> +HWY_API Vec1<T> Reverse2(D /* tag */, const Vec1<T> v) { + return v; +} + +template <class D, typename T = TFromD<D>> +HWY_API Vec1<T> Reverse4(D /* tag */, const Vec1<T> v) { + return v; +} + +template <class D, typename T = TFromD<D>> +HWY_API Vec1<T> Reverse8(D /* tag */, const Vec1<T> v) { + return v; +} + +// ------------------------------ ReverseLaneBytes + +#ifdef HWY_NATIVE_REVERSE_LANE_BYTES +#undef HWY_NATIVE_REVERSE_LANE_BYTES +#else +#define HWY_NATIVE_REVERSE_LANE_BYTES +#endif + +HWY_API Vec1<uint16_t> ReverseLaneBytes(Vec1<uint16_t> v) { + const uint32_t val{v.raw}; + return Vec1<uint16_t>( + static_cast<uint16_t>(((val << 8) & 0xFF00u) | ((val >> 8) & 0x00FFu))); +} + +HWY_API Vec1<uint32_t> ReverseLaneBytes(Vec1<uint32_t> v) { + const uint32_t val = v.raw; + return Vec1<uint32_t>(static_cast<uint32_t>( + ((val << 24) & 0xFF000000u) | ((val << 8) & 0x00FF0000u) | + ((val >> 8) & 0x0000FF00u) | ((val >> 24) & 0x000000FFu))); +} + +HWY_API Vec1<uint64_t> ReverseLaneBytes(Vec1<uint64_t> v) { + const uint64_t val = v.raw; + return Vec1<uint64_t>(static_cast<uint64_t>( + ((val << 56) & 0xFF00000000000000u) | + ((val << 40) & 0x00FF000000000000u) | + ((val << 24) & 0x0000FF0000000000u) | ((val << 8) & 0x000000FF00000000u) | + ((val >> 8) & 0x00000000FF000000u) | ((val >> 24) & 0x0000000000FF0000u) | + ((val >> 40) & 0x000000000000FF00u) | + ((val >> 56) & 0x00000000000000FFu))); +} + +template <class V, HWY_IF_SIGNED_V(V), + HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 2) | (1 << 4) | (1 << 8))> +HWY_API V ReverseLaneBytes(V v) { + const DFromV<decltype(v)> d; + const RebindToUnsigned<decltype(d)> du; + return BitCast(d, ReverseLaneBytes(BitCast(du, v))); +} + +// ------------------------------ ReverseBits +#ifdef HWY_NATIVE_REVERSE_BITS_UI8 +#undef HWY_NATIVE_REVERSE_BITS_UI8 +#else +#define 
HWY_NATIVE_REVERSE_BITS_UI8 +#endif + +#ifdef HWY_NATIVE_REVERSE_BITS_UI16_32_64 +#undef HWY_NATIVE_REVERSE_BITS_UI16_32_64 +#else +#define HWY_NATIVE_REVERSE_BITS_UI16_32_64 +#endif + +namespace detail { + +template <class T> +HWY_INLINE T ReverseBitsOfEachByte(T val) { + using TU = MakeUnsigned<T>; + constexpr TU kMaxUnsignedVal{LimitsMax<TU>()}; + constexpr TU kShrMask1 = + static_cast<TU>(0x5555555555555555u & kMaxUnsignedVal); + constexpr TU kShrMask2 = + static_cast<TU>(0x3333333333333333u & kMaxUnsignedVal); + constexpr TU kShrMask3 = + static_cast<TU>(0x0F0F0F0F0F0F0F0Fu & kMaxUnsignedVal); + + constexpr TU kShlMask1 = static_cast<TU>(~kShrMask1); + constexpr TU kShlMask2 = static_cast<TU>(~kShrMask2); + constexpr TU kShlMask3 = static_cast<TU>(~kShrMask3); + + TU result = static_cast<TU>(val); + result = static_cast<TU>(((result << 1) & kShlMask1) | + ((result >> 1) & kShrMask1)); + result = static_cast<TU>(((result << 2) & kShlMask2) | + ((result >> 2) & kShrMask2)); + result = static_cast<TU>(((result << 4) & kShlMask3) | + ((result >> 4) & kShrMask3)); + return static_cast<T>(result); +} + +} // namespace detail + +template <class V, HWY_IF_UNSIGNED_V(V), HWY_IF_T_SIZE_V(V, 1)> +HWY_API V ReverseBits(V v) { + return V(detail::ReverseBitsOfEachByte(v.raw)); +} + +template <class V, HWY_IF_UNSIGNED_V(V), + HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 2) | (1 << 4) | (1 << 8))> +HWY_API V ReverseBits(V v) { + return ReverseLaneBytes(V(detail::ReverseBitsOfEachByte(v.raw))); +} + +template <class V, HWY_IF_SIGNED_V(V)> +HWY_API V ReverseBits(V v) { + const DFromV<decltype(v)> d; + const RebindToUnsigned<decltype(d)> du; + return BitCast(d, ReverseBits(BitCast(du, v))); +} + +// ================================================== BLOCKWISE +// Shift*Bytes, CombineShiftRightBytes, Interleave*, Shuffle* are unsupported. 
+ +// ------------------------------ Broadcast/splat any lane + +template <int kLane, typename T> +HWY_API Vec1<T> Broadcast(const Vec1<T> v) { + static_assert(kLane == 0, "Scalar only has one lane"); + return v; +} + +// ------------------------------ TableLookupBytes, TableLookupBytesOr0 + +template <typename T, typename TI> +HWY_API Vec1<TI> TableLookupBytes(const Vec1<T> in, const Vec1<TI> indices) { + uint8_t in_bytes[sizeof(T)]; + uint8_t idx_bytes[sizeof(T)]; + uint8_t out_bytes[sizeof(T)]; + CopyBytes<sizeof(T)>(&in, &in_bytes); // copy to bytes + CopyBytes<sizeof(T)>(&indices, &idx_bytes); + for (size_t i = 0; i < sizeof(T); ++i) { + out_bytes[i] = in_bytes[idx_bytes[i]]; + } + TI out; + CopyBytes<sizeof(TI)>(&out_bytes, &out); + return Vec1<TI>{out}; +} + +template <typename T, typename TI> +HWY_API Vec1<TI> TableLookupBytesOr0(const Vec1<T> in, const Vec1<TI> indices) { + uint8_t in_bytes[sizeof(T)]; + uint8_t idx_bytes[sizeof(T)]; + uint8_t out_bytes[sizeof(T)]; + CopyBytes<sizeof(T)>(&in, &in_bytes); // copy to bytes + CopyBytes<sizeof(T)>(&indices, &idx_bytes); + for (size_t i = 0; i < sizeof(T); ++i) { + out_bytes[i] = idx_bytes[i] & 0x80 ? 
0 : in_bytes[idx_bytes[i]]; + } + TI out; + CopyBytes<sizeof(TI)>(&out_bytes, &out); + return Vec1<TI>{out}; +} + +// ------------------------------ ZipLower + +HWY_API Vec1<uint16_t> ZipLower(Vec1<uint8_t> a, Vec1<uint8_t> b) { + return Vec1<uint16_t>(static_cast<uint16_t>((uint32_t{b.raw} << 8) + a.raw)); +} +HWY_API Vec1<uint32_t> ZipLower(Vec1<uint16_t> a, Vec1<uint16_t> b) { + return Vec1<uint32_t>((uint32_t{b.raw} << 16) + a.raw); +} +HWY_API Vec1<uint64_t> ZipLower(Vec1<uint32_t> a, Vec1<uint32_t> b) { + return Vec1<uint64_t>((uint64_t{b.raw} << 32) + a.raw); +} +HWY_API Vec1<int16_t> ZipLower(Vec1<int8_t> a, Vec1<int8_t> b) { + return Vec1<int16_t>(static_cast<int16_t>((int32_t{b.raw} << 8) + a.raw)); +} +HWY_API Vec1<int32_t> ZipLower(Vec1<int16_t> a, Vec1<int16_t> b) { + return Vec1<int32_t>((int32_t{b.raw} << 16) + a.raw); +} +HWY_API Vec1<int64_t> ZipLower(Vec1<int32_t> a, Vec1<int32_t> b) { + return Vec1<int64_t>((int64_t{b.raw} << 32) + a.raw); +} + +template <class DW, typename TW = TFromD<DW>, typename TN = MakeNarrow<TW>> +HWY_API Vec1<TW> ZipLower(DW /* tag */, Vec1<TN> a, Vec1<TN> b) { + return Vec1<TW>(static_cast<TW>((TW{b.raw} << (sizeof(TN) * 8)) + a.raw)); +} + +// ================================================== MASK + +template <class D, typename T = TFromD<D>> +HWY_API bool AllFalse(D /* tag */, const Mask1<T> mask) { + return mask.bits == 0; +} + +template <class D, typename T = TFromD<D>> +HWY_API bool AllTrue(D /* tag */, const Mask1<T> mask) { + return mask.bits != 0; +} + +// `p` points to at least 8 readable bytes, not all of which need be valid. +template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>> +HWY_API Mask1<T> LoadMaskBits(D /* tag */, const uint8_t* HWY_RESTRICT bits) { + return Mask1<T>::FromBool((bits[0] & 1) != 0); +} + +// `p` points to at least 8 writable bytes. 
+template <class D, typename T = TFromD<D>> +HWY_API size_t StoreMaskBits(D d, const Mask1<T> mask, uint8_t* bits) { + *bits = AllTrue(d, mask); + return 1; +} + +template <class D, typename T = TFromD<D>> +HWY_API size_t CountTrue(D /* tag */, const Mask1<T> mask) { + return mask.bits == 0 ? 0 : 1; +} + +template <class D, typename T = TFromD<D>> +HWY_API intptr_t FindFirstTrue(D /* tag */, const Mask1<T> mask) { + return mask.bits == 0 ? -1 : 0; +} + +template <class D, typename T = TFromD<D>> +HWY_API size_t FindKnownFirstTrue(D /* tag */, const Mask1<T> /* m */) { + return 0; // There is only one lane and we know it is true. +} + +template <class D, typename T = TFromD<D>> +HWY_API intptr_t FindLastTrue(D /* tag */, const Mask1<T> mask) { + return mask.bits == 0 ? -1 : 0; +} + +template <class D, typename T = TFromD<D>> +HWY_API size_t FindKnownLastTrue(D /* tag */, const Mask1<T> /* m */) { + return 0; // There is only one lane and we know it is true. +} + +// ------------------------------ Compress, CompressBits + +template <typename T> +struct CompressIsPartition { + enum { value = 1 }; +}; + +template <typename T> +HWY_API Vec1<T> Compress(Vec1<T> v, const Mask1<T> /* mask */) { + // A single lane is already partitioned by definition. + return v; +} + +template <typename T> +HWY_API Vec1<T> CompressNot(Vec1<T> v, const Mask1<T> /* mask */) { + // A single lane is already partitioned by definition. 
+ return v; +} + +// ------------------------------ CompressStore +template <class D, typename T = TFromD<D>> +HWY_API size_t CompressStore(Vec1<T> v, const Mask1<T> mask, D d, + T* HWY_RESTRICT unaligned) { + StoreU(Compress(v, mask), d, unaligned); + return CountTrue(d, mask); +} + +// ------------------------------ CompressBlendedStore +template <class D, typename T = TFromD<D>> +HWY_API size_t CompressBlendedStore(Vec1<T> v, const Mask1<T> mask, D d, + T* HWY_RESTRICT unaligned) { + if (!mask.bits) return 0; + StoreU(v, d, unaligned); + return 1; +} + +// ------------------------------ CompressBits +template <typename T> +HWY_API Vec1<T> CompressBits(Vec1<T> v, const uint8_t* HWY_RESTRICT /*bits*/) { + return v; +} + +// ------------------------------ CompressBitsStore +template <class D, typename T = TFromD<D>> +HWY_API size_t CompressBitsStore(Vec1<T> v, const uint8_t* HWY_RESTRICT bits, + D d, T* HWY_RESTRICT unaligned) { + const Mask1<T> mask = LoadMaskBits(d, bits); + StoreU(Compress(v, mask), d, unaligned); + return CountTrue(d, mask); +} + +// ------------------------------ Expand + +// generic_ops-inl.h requires Vec64/128, so implement [Load]Expand here. 
+#ifdef HWY_NATIVE_EXPAND +#undef HWY_NATIVE_EXPAND +#else +#define HWY_NATIVE_EXPAND +#endif + +template <typename T> +HWY_API Vec1<T> Expand(Vec1<T> v, const Mask1<T> mask) { + return IfThenElseZero(mask, v); +} + +// ------------------------------ LoadExpand +template <class D> +HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d, + const TFromD<D>* HWY_RESTRICT unaligned) { + return MaskedLoad(mask, d, unaligned); +} + +// ------------------------------ WidenMulPairwiseAdd + +template <class D32, HWY_IF_F32_D(D32)> +HWY_API Vec1<float> WidenMulPairwiseAdd(D32 /* tag */, Vec1<bfloat16_t> a, + Vec1<bfloat16_t> b) { + return Vec1<float>(F32FromBF16(a.raw)) * + Vec1<float>(F32FromBF16(b.raw)); +} + +template <class D32, HWY_IF_I32_D(D32)> +HWY_API Vec1<int32_t> WidenMulPairwiseAdd(D32 /* tag */, Vec1<int16_t> a, + Vec1<int16_t> b) { + return Vec1<int32_t>(a.raw * b.raw); +} + +// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower) + +template <class D32, HWY_IF_F32_D(D32)> +HWY_API Vec1<float> ReorderWidenMulAccumulate(D32 /* tag */, Vec1<bfloat16_t> a, + Vec1<bfloat16_t> b, + const Vec1<float> sum0, + Vec1<float>& /* sum1 */) { + return MulAdd(Vec1<float>(F32FromBF16(a.raw)), + Vec1<float>(F32FromBF16(b.raw)), sum0); +} + +template <class D32, HWY_IF_I32_D(D32)> +HWY_API Vec1<int32_t> ReorderWidenMulAccumulate(D32 /* tag */, Vec1<int16_t> a, + Vec1<int16_t> b, + const Vec1<int32_t> sum0, + Vec1<int32_t>& /* sum1 */) { + return Vec1<int32_t>(a.raw * b.raw + sum0.raw); +} + +// ------------------------------ RearrangeToOddPlusEven +template <typename TW> +HWY_API Vec1<TW> RearrangeToOddPlusEven(Vec1<TW> sum0, Vec1<TW> /* sum1 */) { + return sum0; // invariant already holds +} + +// ================================================== REDUCTIONS + +// Sum of all lanes, i.e. the only one. 
+template <class D, typename T = TFromD<D>> +HWY_API Vec1<T> SumOfLanes(D /* tag */, const Vec1<T> v) { + return v; +} +template <class D, typename T = TFromD<D>> +HWY_API T ReduceSum(D /* tag */, const Vec1<T> v) { + return GetLane(v); +} +template <class D, typename T = TFromD<D>> +HWY_API Vec1<T> MinOfLanes(D /* tag */, const Vec1<T> v) { + return v; +} +template <class D, typename T = TFromD<D>> +HWY_API Vec1<T> MaxOfLanes(D /* tag */, const Vec1<T> v) { + return v; +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); diff --git a/third_party/highway/hwy/ops/set_macros-inl.h b/third_party/highway/hwy/ops/set_macros-inl.h new file mode 100644 index 0000000000..ee1ebabdf2 --- /dev/null +++ b/third_party/highway/hwy/ops/set_macros-inl.h @@ -0,0 +1,566 @@ +// Copyright 2020 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Sets macros based on HWY_TARGET. + +// This include guard is toggled by foreach_target, so avoid the usual _H_ +// suffix to prevent copybara from renaming it. 
+#if defined(HWY_SET_MACROS_PER_TARGET) == defined(HWY_TARGET_TOGGLE) +#ifdef HWY_SET_MACROS_PER_TARGET +#undef HWY_SET_MACROS_PER_TARGET +#else +#define HWY_SET_MACROS_PER_TARGET +#endif + +#endif // HWY_SET_MACROS_PER_TARGET + +#include "hwy/detect_compiler_arch.h" // IWYU: export +#include "hwy/detect_targets.h" // IWYU: export + +#undef HWY_NAMESPACE +#undef HWY_ALIGN +#undef HWY_MAX_BYTES +#undef HWY_LANES + +#undef HWY_HAVE_SCALABLE +#undef HWY_HAVE_TUPLE +#undef HWY_HAVE_INTEGER64 +#undef HWY_HAVE_FLOAT16 +#undef HWY_HAVE_FLOAT64 +#undef HWY_MEM_OPS_MIGHT_FAULT +#undef HWY_NATIVE_FMA +#undef HWY_CAP_GE256 +#undef HWY_CAP_GE512 + +// Supported on all targets except RVV (requires GCC 14 or upcoming Clang) +#if HWY_TARGET == HWY_RVV && \ + (HWY_COMPILER_GCC_ACTUAL < 1400 || HWY_COMPILER_CLANG) +#define HWY_HAVE_TUPLE 0 +#else +#define HWY_HAVE_TUPLE 1 +#endif + +// For internal use (clamping/validating N for Simd<>) +#undef HWY_MAX_N +#if HWY_TARGET == HWY_SCALAR +#define HWY_MAX_N 1 +#else +#define HWY_MAX_N 65536 +#endif + +// For internal use (clamping kPow2 for Simd<>) +#undef HWY_MAX_POW2 +// For HWY_TARGET == HWY_RVV, LMUL <= 8. Even on other targets, we want to +// support say Rebind<uint64_t, Simd<uint8_t, 1, 0>> d; whose kPow2 is also 3. +// However, those other targets do not actually support multiple vectors, and +// thus Lanes(d) must not exceed Lanes(ScalableTag<T>()). +#define HWY_MAX_POW2 3 + +// User-visible. Loose lower bound that guarantees HWY_MAX_BYTES >> +// (-HWY_MIN_POW2) <= 1. Useful for terminating compile-time recursions. +#undef HWY_MIN_POW2 +#if HWY_TARGET == HWY_RVV +#define HWY_MIN_POW2 -16 +#else +// Tighter bound for other targets, whose vectors are smaller, to potentially +// save compile time. 
+#define HWY_MIN_POW2 -8 +#endif // HWY_TARGET == HWY_RVV + +#undef HWY_TARGET_STR + +#if defined(HWY_DISABLE_PCLMUL_AES) +#define HWY_TARGET_STR_PCLMUL_AES "" +#else +#define HWY_TARGET_STR_PCLMUL_AES ",pclmul,aes" +#endif + +#if defined(HWY_DISABLE_BMI2_FMA) +#define HWY_TARGET_STR_BMI2_FMA "" +#else +#define HWY_TARGET_STR_BMI2_FMA ",bmi,bmi2,fma" +#endif + +#if defined(HWY_DISABLE_F16C) +#define HWY_TARGET_STR_F16C "" +#else +#define HWY_TARGET_STR_F16C ",f16c" +#endif + +#define HWY_TARGET_STR_SSE2 "sse2" + +#define HWY_TARGET_STR_SSSE3 "sse2,ssse3" + +#define HWY_TARGET_STR_SSE4 \ + HWY_TARGET_STR_SSSE3 ",sse4.1,sse4.2" HWY_TARGET_STR_PCLMUL_AES +// Include previous targets, which are the half-vectors of the next target. +#define HWY_TARGET_STR_AVX2 \ + HWY_TARGET_STR_SSE4 ",avx,avx2" HWY_TARGET_STR_BMI2_FMA HWY_TARGET_STR_F16C +#define HWY_TARGET_STR_AVX3 \ + HWY_TARGET_STR_AVX2 ",avx512f,avx512cd,avx512vl,avx512dq,avx512bw" +#define HWY_TARGET_STR_AVX3_DL \ + HWY_TARGET_STR_AVX3 \ + ",vpclmulqdq,avx512vbmi,avx512vbmi2,vaes,avx512vnni,avx512bitalg," \ + "avx512vpopcntdq,gfni" + +#if defined(HWY_DISABLE_PPC8_CRYPTO) +#define HWY_TARGET_STR_PPC8_CRYPTO "" +#else +#define HWY_TARGET_STR_PPC8_CRYPTO ",crypto" +#endif + +#define HWY_TARGET_STR_PPC8 \ + "altivec,vsx,power8-vector" HWY_TARGET_STR_PPC8_CRYPTO +#define HWY_TARGET_STR_PPC9 HWY_TARGET_STR_PPC8 ",power9-vector" + +#if HWY_COMPILER_CLANG +#define HWY_TARGET_STR_PPC10 HWY_TARGET_STR_PPC9 ",power10-vector" +#else +#define HWY_TARGET_STR_PPC10 HWY_TARGET_STR_PPC9 ",cpu=power10" +#endif + +// Before include guard so we redefine HWY_TARGET_STR on each include, +// governed by the current HWY_TARGET. 
+ +//----------------------------------------------------------------------------- +// SSE2 +#if HWY_TARGET == HWY_SSE2 + +#define HWY_NAMESPACE N_SSE2 +#define HWY_ALIGN alignas(16) +#define HWY_MAX_BYTES 16 +#define HWY_LANES(T) (16 / sizeof(T)) + +#define HWY_HAVE_SCALABLE 0 +#define HWY_HAVE_INTEGER64 1 +#define HWY_HAVE_FLOAT16 1 +#define HWY_HAVE_FLOAT64 1 +#define HWY_MEM_OPS_MIGHT_FAULT 1 +#define HWY_NATIVE_FMA 0 +#define HWY_CAP_GE256 0 +#define HWY_CAP_GE512 0 + +#define HWY_TARGET_STR HWY_TARGET_STR_SSE2 +//----------------------------------------------------------------------------- +// SSSE3 +#elif HWY_TARGET == HWY_SSSE3 + +#define HWY_NAMESPACE N_SSSE3 +#define HWY_ALIGN alignas(16) +#define HWY_MAX_BYTES 16 +#define HWY_LANES(T) (16 / sizeof(T)) + +#define HWY_HAVE_SCALABLE 0 +#define HWY_HAVE_INTEGER64 1 +#define HWY_HAVE_FLOAT16 1 +#define HWY_HAVE_FLOAT64 1 +#define HWY_MEM_OPS_MIGHT_FAULT 1 +#define HWY_NATIVE_FMA 0 +#define HWY_CAP_GE256 0 +#define HWY_CAP_GE512 0 + +#define HWY_TARGET_STR HWY_TARGET_STR_SSSE3 + +//----------------------------------------------------------------------------- +// SSE4 +#elif HWY_TARGET == HWY_SSE4 + +#define HWY_NAMESPACE N_SSE4 +#define HWY_ALIGN alignas(16) +#define HWY_MAX_BYTES 16 +#define HWY_LANES(T) (16 / sizeof(T)) + +#define HWY_HAVE_SCALABLE 0 +#define HWY_HAVE_INTEGER64 1 +#define HWY_HAVE_FLOAT16 1 +#define HWY_HAVE_FLOAT64 1 +#define HWY_MEM_OPS_MIGHT_FAULT 1 +#define HWY_NATIVE_FMA 0 +#define HWY_CAP_GE256 0 +#define HWY_CAP_GE512 0 + +#define HWY_TARGET_STR HWY_TARGET_STR_SSE4 + +//----------------------------------------------------------------------------- +// AVX2 +#elif HWY_TARGET == HWY_AVX2 + +#define HWY_NAMESPACE N_AVX2 +#define HWY_ALIGN alignas(32) +#define HWY_MAX_BYTES 32 +#define HWY_LANES(T) (32 / sizeof(T)) + +#define HWY_HAVE_SCALABLE 0 +#define HWY_HAVE_INTEGER64 1 +#define HWY_HAVE_FLOAT16 1 +#define HWY_HAVE_FLOAT64 1 +#define HWY_MEM_OPS_MIGHT_FAULT 1 + +#ifdef 
HWY_DISABLE_BMI2_FMA +#define HWY_NATIVE_FMA 0 +#else +#define HWY_NATIVE_FMA 1 +#endif + +#define HWY_CAP_GE256 1 +#define HWY_CAP_GE512 0 + +#define HWY_TARGET_STR HWY_TARGET_STR_AVX2 + +//----------------------------------------------------------------------------- +// AVX3[_DL] +#elif HWY_TARGET == HWY_AVX3 || HWY_TARGET == HWY_AVX3_DL || \ + HWY_TARGET == HWY_AVX3_ZEN4 + +#define HWY_ALIGN alignas(64) +#define HWY_MAX_BYTES 64 +#define HWY_LANES(T) (64 / sizeof(T)) + +#define HWY_HAVE_SCALABLE 0 +#define HWY_HAVE_INTEGER64 1 +#define HWY_HAVE_FLOAT16 1 +#define HWY_HAVE_FLOAT64 1 +#define HWY_MEM_OPS_MIGHT_FAULT 0 +#define HWY_NATIVE_FMA 1 +#define HWY_CAP_GE256 1 +#define HWY_CAP_GE512 1 + +#if HWY_TARGET == HWY_AVX3 + +#define HWY_NAMESPACE N_AVX3 +#define HWY_TARGET_STR HWY_TARGET_STR_AVX3 + +#elif HWY_TARGET == HWY_AVX3_DL + +#define HWY_NAMESPACE N_AVX3_DL +#define HWY_TARGET_STR HWY_TARGET_STR_AVX3_DL + +#elif HWY_TARGET == HWY_AVX3_ZEN4 + +#define HWY_NAMESPACE N_AVX3_ZEN4 +// Currently the same as HWY_AVX3_DL: both support Icelake. 
+#define HWY_TARGET_STR HWY_TARGET_STR_AVX3_DL + +#else +#error "Logic error" +#endif // HWY_TARGET == HWY_AVX3_ZEN4 + +//----------------------------------------------------------------------------- +// PPC8, PPC9, PPC10 +#elif HWY_TARGET == HWY_PPC8 || HWY_TARGET == HWY_PPC9 || \ + HWY_TARGET == HWY_PPC10 + +#define HWY_ALIGN alignas(16) +#define HWY_MAX_BYTES 16 +#define HWY_LANES(T) (16 / sizeof(T)) + +#define HWY_HAVE_SCALABLE 0 +#define HWY_HAVE_INTEGER64 1 +#define HWY_HAVE_FLOAT16 1 +#define HWY_HAVE_FLOAT64 1 +#define HWY_MEM_OPS_MIGHT_FAULT 1 +#define HWY_NATIVE_FMA 1 +#define HWY_CAP_GE256 0 +#define HWY_CAP_GE512 0 + +#if HWY_TARGET == HWY_PPC8 + +#define HWY_NAMESPACE N_PPC8 +#define HWY_TARGET_STR HWY_TARGET_STR_PPC8 + +#elif HWY_TARGET == HWY_PPC9 + +#define HWY_NAMESPACE N_PPC9 +#define HWY_TARGET_STR HWY_TARGET_STR_PPC9 + +#elif HWY_TARGET == HWY_PPC10 + +#define HWY_NAMESPACE N_PPC10 +#define HWY_TARGET_STR HWY_TARGET_STR_PPC10 + +#else +#error "Logic error" +#endif // HWY_TARGET == HWY_PPC10 + +//----------------------------------------------------------------------------- +// NEON +#elif HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES + +#define HWY_ALIGN alignas(16) +#define HWY_MAX_BYTES 16 +#define HWY_LANES(T) (16 / sizeof(T)) + +#define HWY_HAVE_SCALABLE 0 +#define HWY_HAVE_INTEGER64 1 +#define HWY_HAVE_FLOAT16 1 + +#if HWY_ARCH_ARM_A64 +#define HWY_HAVE_FLOAT64 1 +#else +#define HWY_HAVE_FLOAT64 0 +#endif + +#define HWY_MEM_OPS_MIGHT_FAULT 1 + +#if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64 +#define HWY_NATIVE_FMA 1 +#else +#define HWY_NATIVE_FMA 0 +#endif + +#define HWY_CAP_GE256 0 +#define HWY_CAP_GE512 0 + +#if HWY_TARGET == HWY_NEON_WITHOUT_AES +#define HWY_NAMESPACE N_NEON_WITHOUT_AES +#else +#define HWY_NAMESPACE N_NEON +#endif + +// Can use pragmas instead of -march compiler flag +#if HWY_HAVE_RUNTIME_DISPATCH +#if HWY_ARCH_ARM_V7 + +// The __attribute__((target(+neon-vfpv4)) was introduced in gcc >= 8. 
+#if HWY_COMPILER_GCC_ACTUAL >= 800 +#define HWY_TARGET_STR "+neon-vfpv4" +#else // GCC < 8 +// Do not define HWY_TARGET_STR (no pragma). +#endif // HWY_COMPILER_GCC_ACTUAL + +#else // !HWY_ARCH_ARM_V7 + +#if HWY_TARGET == HWY_NEON_WITHOUT_AES +// Do not define HWY_TARGET_STR (no pragma). +#else +#if HWY_COMPILER_GCC_ACTUAL +#define HWY_TARGET_STR "arch=armv8-a+crypto" +#else // clang +#define HWY_TARGET_STR "+crypto" +#endif // HWY_COMPILER_* +#endif // HWY_TARGET == HWY_NEON_WITHOUT_AES + +#endif // HWY_ARCH_ARM_V7 +#else // !HWY_HAVE_RUNTIME_DISPATCH +// HWY_TARGET_STR remains undefined +#endif + +//----------------------------------------------------------------------------- +// SVE[2] +#elif HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE || \ + HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128 + +// SVE only requires lane alignment, not natural alignment of the entire vector. +#define HWY_ALIGN alignas(8) + +// Value ensures MaxLanes() is the tightest possible upper bound to reduce +// overallocation. 
+#define HWY_LANES(T) ((HWY_MAX_BYTES) / sizeof(T)) + +#define HWY_HAVE_INTEGER64 1 +#define HWY_HAVE_FLOAT16 1 +#define HWY_HAVE_FLOAT64 1 +#define HWY_MEM_OPS_MIGHT_FAULT 0 +#define HWY_NATIVE_FMA 1 +#define HWY_CAP_GE256 0 +#define HWY_CAP_GE512 0 + +#if HWY_TARGET == HWY_SVE2 +#define HWY_NAMESPACE N_SVE2 +#define HWY_MAX_BYTES 256 +#define HWY_HAVE_SCALABLE 1 +#elif HWY_TARGET == HWY_SVE_256 +#define HWY_NAMESPACE N_SVE_256 +#define HWY_MAX_BYTES 32 +#define HWY_HAVE_SCALABLE 0 +#elif HWY_TARGET == HWY_SVE2_128 +#define HWY_NAMESPACE N_SVE2_128 +#define HWY_MAX_BYTES 16 +#define HWY_HAVE_SCALABLE 0 +#else +#define HWY_NAMESPACE N_SVE +#define HWY_MAX_BYTES 256 +#define HWY_HAVE_SCALABLE 1 +#endif + +// Can use pragmas instead of -march compiler flag +#if HWY_HAVE_RUNTIME_DISPATCH +#if HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128 +#define HWY_TARGET_STR "+sve2-aes" +#else +#define HWY_TARGET_STR "+sve" +#endif +#else +// HWY_TARGET_STR remains undefined +#endif + +//----------------------------------------------------------------------------- +// WASM +#elif HWY_TARGET == HWY_WASM + +#define HWY_ALIGN alignas(16) +#define HWY_MAX_BYTES 16 +#define HWY_LANES(T) (16 / sizeof(T)) + +#define HWY_HAVE_SCALABLE 0 +#define HWY_HAVE_INTEGER64 1 +#define HWY_HAVE_FLOAT16 1 +#define HWY_HAVE_FLOAT64 0 +#define HWY_MEM_OPS_MIGHT_FAULT 1 +#define HWY_NATIVE_FMA 0 +#define HWY_CAP_GE256 0 +#define HWY_CAP_GE512 0 + +#define HWY_NAMESPACE N_WASM + +#define HWY_TARGET_STR "simd128" + +//----------------------------------------------------------------------------- +// WASM_EMU256 +#elif HWY_TARGET == HWY_WASM_EMU256 + +#define HWY_ALIGN alignas(32) +#define HWY_MAX_BYTES 32 +#define HWY_LANES(T) (32 / sizeof(T)) + +#define HWY_HAVE_SCALABLE 0 +#define HWY_HAVE_INTEGER64 1 +#define HWY_HAVE_FLOAT16 1 +#define HWY_HAVE_FLOAT64 0 +#define HWY_MEM_OPS_MIGHT_FAULT 1 +#define HWY_NATIVE_FMA 0 +#define HWY_CAP_GE256 1 +#define HWY_CAP_GE512 0 + +#define HWY_NAMESPACE 
N_WASM_EMU256 + +#define HWY_TARGET_STR "simd128" + +//----------------------------------------------------------------------------- +// RVV +#elif HWY_TARGET == HWY_RVV + +// RVV only requires lane alignment, not natural alignment of the entire vector, +// and the compiler already aligns builtin types, so nothing to do here. +#define HWY_ALIGN + +// The spec requires VLEN <= 2^16 bits, so the limit is 2^16 bytes (LMUL=8). +#define HWY_MAX_BYTES 65536 + +// = HWY_MAX_BYTES divided by max LMUL=8 because MaxLanes includes the actual +// LMUL. This is the tightest possible upper bound. +#define HWY_LANES(T) (8192 / sizeof(T)) + +#define HWY_HAVE_SCALABLE 1 +#define HWY_HAVE_INTEGER64 1 +#define HWY_HAVE_FLOAT64 1 +#define HWY_MEM_OPS_MIGHT_FAULT 0 +#define HWY_NATIVE_FMA 1 +#define HWY_CAP_GE256 0 +#define HWY_CAP_GE512 0 + +#if defined(__riscv_zvfh) +#define HWY_HAVE_FLOAT16 1 +#else +#define HWY_HAVE_FLOAT16 0 +#endif + +#define HWY_NAMESPACE N_RVV + +// HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op. +// (rv64gcv is not a valid target) + +//----------------------------------------------------------------------------- +// EMU128 +#elif HWY_TARGET == HWY_EMU128 + +#define HWY_ALIGN alignas(16) +#define HWY_MAX_BYTES 16 +#define HWY_LANES(T) (16 / sizeof(T)) + +#define HWY_HAVE_SCALABLE 0 +#define HWY_HAVE_INTEGER64 1 +#define HWY_HAVE_FLOAT16 1 +#define HWY_HAVE_FLOAT64 1 +#define HWY_MEM_OPS_MIGHT_FAULT 1 +#define HWY_NATIVE_FMA 0 +#define HWY_CAP_GE256 0 +#define HWY_CAP_GE512 0 + +#define HWY_NAMESPACE N_EMU128 + +// HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op. 
+ +//----------------------------------------------------------------------------- +// SCALAR +#elif HWY_TARGET == HWY_SCALAR + +#define HWY_ALIGN +#define HWY_MAX_BYTES 8 +#define HWY_LANES(T) 1 + +#define HWY_HAVE_SCALABLE 0 +#define HWY_HAVE_INTEGER64 1 +#define HWY_HAVE_FLOAT16 1 +#define HWY_HAVE_FLOAT64 1 +#define HWY_MEM_OPS_MIGHT_FAULT 0 +#define HWY_NATIVE_FMA 0 +#define HWY_CAP_GE256 0 +#define HWY_CAP_GE512 0 + +#define HWY_NAMESPACE N_SCALAR + +// HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op. + +#else +#pragma message("HWY_TARGET does not match any known target") +#endif // HWY_TARGET + +// Override this to 1 in asan/msan builds, which will still fault. +#if HWY_IS_ASAN || HWY_IS_MSAN +#undef HWY_MEM_OPS_MIGHT_FAULT +#define HWY_MEM_OPS_MIGHT_FAULT 1 +#endif + +// Clang <9 requires this be invoked at file scope, before any namespace. +#undef HWY_BEFORE_NAMESPACE +#if defined(HWY_TARGET_STR) +#define HWY_BEFORE_NAMESPACE() \ + HWY_PUSH_ATTRIBUTES(HWY_TARGET_STR) \ + static_assert(true, "For requiring trailing semicolon") +#else +// avoids compiler warning if no HWY_TARGET_STR +#define HWY_BEFORE_NAMESPACE() \ + static_assert(true, "For requiring trailing semicolon") +#endif + +// Clang <9 requires any namespaces be closed before this macro. 
+#undef HWY_AFTER_NAMESPACE +#if defined(HWY_TARGET_STR) +#define HWY_AFTER_NAMESPACE() \ + HWY_POP_ATTRIBUTES \ + static_assert(true, "For requiring trailing semicolon") +#else +// avoids compiler warning if no HWY_TARGET_STR +#define HWY_AFTER_NAMESPACE() \ + static_assert(true, "For requiring trailing semicolon") +#endif + +#undef HWY_ATTR +#if defined(HWY_TARGET_STR) && HWY_HAS_ATTRIBUTE(target) +#define HWY_ATTR __attribute__((target(HWY_TARGET_STR))) +#else +#define HWY_ATTR +#endif diff --git a/third_party/highway/hwy/ops/shared-inl.h b/third_party/highway/hwy/ops/shared-inl.h new file mode 100644 index 0000000000..003408ff01 --- /dev/null +++ b/third_party/highway/hwy/ops/shared-inl.h @@ -0,0 +1,488 @@ +// Copyright 2020 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Per-target definitions shared by ops/*.h and user code. + +// IWYU pragma: begin_exports +// Export does not seem to be recursive, so re-export these (also in base.h) +#include <stddef.h> + +#include "hwy/base.h" +// "IWYU pragma: keep" does not work for this include, so hide it from the IDE. +#if !HWY_IDE +#include <stdint.h> +#endif + +#include "hwy/detect_compiler_arch.h" + +// Separate header because foreach_target.h re-enables its include guard. 
+#include "hwy/ops/set_macros-inl.h" + +// IWYU pragma: end_exports + +#if HWY_IS_MSAN +#include <sanitizer/msan_interface.h> +#endif + +// We are covered by the highway.h include guard, but generic_ops-inl.h +// includes this again #if HWY_IDE. +#if defined(HIGHWAY_HWY_OPS_SHARED_TOGGLE) == \ + defined(HWY_TARGET_TOGGLE) +#ifdef HIGHWAY_HWY_OPS_SHARED_TOGGLE +#undef HIGHWAY_HWY_OPS_SHARED_TOGGLE +#else +#define HIGHWAY_HWY_OPS_SHARED_TOGGLE +#endif + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +// NOTE: GCC generates incorrect code for vector arguments to non-inlined +// functions in two situations: +// - on Windows and GCC 10.3, passing by value crashes due to unaligned loads: +// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412. +// - on aarch64 and GCC 9.3.0 or 11.2.1, passing by value causes many (but not +// all) tests to fail. +// +// We therefore pass by const& only on GCC and (Windows or aarch64). This alias +// must be used for all vector/mask parameters of functions marked HWY_NOINLINE, +// and possibly also other functions that are not inlined. +#if HWY_COMPILER_GCC_ACTUAL && (HWY_OS_WIN || HWY_ARCH_ARM_A64) +template <class V> +using VecArg = const V&; +#else +template <class V> +using VecArg = V; +#endif + +namespace detail { + +// Returns N * 2^pow2. N is the number of lanes in a full vector and pow2 the +// desired fraction or multiple of it, see Simd<>. `pow2` is most often in +// [-3, 3] but can also be lower for user-specified fractions. +constexpr size_t ScaleByPower(size_t N, int pow2) { + return pow2 >= 0 ? 
(N << pow2) : (N >> (-pow2)); +} + +template <typename T> +HWY_INLINE void MaybeUnpoison(T* HWY_RESTRICT unaligned, size_t count) { + // Workaround for MSAN not marking compressstore as initialized (b/233326619) +#if HWY_IS_MSAN + __msan_unpoison(unaligned, count * sizeof(T)); +#else + (void)unaligned; + (void)count; +#endif +} + +} // namespace detail + +// Highway operations are implemented as overloaded functions selected using a +// zero-sized tag type D := Simd<T, N, kPow2>. T denotes the lane type. +// +// N defines how many lanes are in a 'full' vector, typically equal to +// HWY_LANES(T) (which is the actual count on targets with vectors of known +// size, and an upper bound in case of scalable vectors), otherwise a +// user-specified limit at most that large. +// +// 2^kPow2 is a _subsequently_ applied scaling factor that indicates the +// desired fraction of a 'full' vector: 0 means full, -1 means half; 1,2,3 +// means two/four/eight full vectors ganged together. The largest supported +// kPow2 is `HWY_MAX_POW2` and the aliases below take care of clamping +// user-specified values to that. Note that `Simd<T, 1, 0>` and `Simd<T, 2, -1>` +// have the same `MaxLanes` and `Lanes`. +// +// We can theoretically keep halving Lanes(), but recursive instantiations of +// kPow2 - 1 will eventually fail e.g. because -64 is not a valid shift count. +// Users must terminate such compile-time recursions at or above HWY_MIN_POW2. +// +// WARNING: do not use N directly because it may be a special representation of +// a fractional MaxLanes. This arises when we Rebind Simd<uint8_t, 1, 0> to +// Simd<uint32_t, ??, 2>. RVV requires that the last argument (kPow2) be two, +// but we want MaxLanes to be the same in both cases. Hence ?? is a +// fixed-point encoding of 1/4. 
+// +// Instead of referring to Simd<> directly, users create D via aliases: +// - ScalableTag<T> for a full vector; +// - ScalableTag<T, kPow2>() for a fraction/group, where `kPow2` is +// interpreted as `HWY_MIN(kPow2, HWY_MAX_POW2)`; +// - CappedTag<T, kLimit> for a vector with up to kLimit lanes; or +// - FixedTag<T, kNumLanes> for a vector with exactly kNumLanes lanes. +// +// Instead of N, use Lanes(D()) for the actual number of lanes at runtime and +// D().MaxLanes() for a constexpr upper bound. Both are powers of two. +template <typename Lane, size_t N, int kPow2> +struct Simd { + constexpr Simd() = default; + using T = Lane; + + private: + static_assert(sizeof(Lane) <= 8, "Lanes are up to 64-bit"); + // 20 bits are sufficient for any HWY_MAX_BYTES. This is the 'normal' value of + // N when kFrac == 0, otherwise it is one (see FracN). + static constexpr size_t kWhole = N & 0xFFFFF; + // Fractional part is in the bits above kWhole. + static constexpr int kFrac = static_cast<int>(N >> 20); + // Can be 8x larger because kPow2 may be as low as -3 (Rebind of a larger + // type to u8 results in fractions). + static_assert(kWhole <= 8 * HWY_MAX_N && kFrac <= 3, "Out of range"); + static_assert(kFrac == 0 || kWhole == 1, "If frac, whole must be 1"); + static_assert((kWhole & (kWhole - 1)) == 0 && kWhole != 0, "Not 2^x"); + // Important to check this here because kPow2 <= -64 causes confusing + // compile errors (invalid shift count). + static_assert(kPow2 >= HWY_MIN_POW2, "Forgot kPow2 recursion terminator?"); + // However, do NOT verify kPow2 <= HWY_MAX_POW2 - users should be able to + // Rebind<uint64_t, ScalableTag<uint8_t, 3>> in order to discover that its + // kPow2 is out of bounds. + + public: + // Upper bound on the number of lanes (tight if !HWY_HAVE_SCALABLE). In the + // common case, N == kWhole, but if kFrac is nonzero, we deduct it from kPow2. + // E.g. Rebind<uint32_t, Simd<uint8_t, 1, 0>> is Simd<uint32_t, 0x200001, 2>. 
+ // The resulting number of lanes is still 1 because this N represents 1/4 + // (the ratio of the sizes). Note that RVV requires kPow2 to be the ratio of + // the sizes so that the correct LMUL overloads are chosen, even if N is + // small enough that it would fit in an LMUL=1 vector. + // + // Cannot be an enum because GCC warns when using enums and non-enums in the + // same expression. Cannot be a static constexpr function (MSVC limitation). + // Rounded up to one so this is a valid array length. + // + // Do not use this directly - only 'public' so it is visible from the accessor + // macro required by MSVC. + static constexpr size_t kPrivateLanes = + HWY_MAX(size_t{1}, detail::ScaleByPower(kWhole, kPow2 - kFrac)); + + constexpr size_t MaxLanes() const { return kPrivateLanes; } + constexpr size_t MaxBytes() const { return kPrivateLanes * sizeof(Lane); } + // For SFINAE on RVV. + constexpr int Pow2() const { return kPow2; } + + // ------------------------------ Changing lane type or count + // Do not use any of these directly. Anything used from member typedefs cannot + // be made private, but functions only used within other functions can. + + // Returns number of NewT lanes that fit within MaxBytes(). + template <typename NewT> + static constexpr size_t RepartitionLanes() { + // Round up to correctly handle larger NewT. + return (kPrivateLanes * sizeof(T) + sizeof(NewT) - 1) / sizeof(NewT); + } + + // Returns the new kPow2 required for lanes of type NewT. + template <typename NewT> + static constexpr int RebindPow2() { + return kPow2 + + ((sizeof(NewT) >= sizeof(T)) + ? static_cast<int>(CeilLog2(sizeof(NewT) / sizeof(T))) + : -static_cast<int>(CeilLog2(sizeof(T) / sizeof(NewT)))); + } + + private: + // Returns 0 or whole NewN such that kNewMaxLanes = NewN * 2^kNewPow2. 
+ template <int kNewPow2, size_t kNewMaxLanes> + static constexpr size_t WholeN() { + return detail::ScaleByPower(kNewMaxLanes, -kNewPow2); + } + + // Returns fractional NewN such that kNewMaxLanes = NewN * 2^kNewPow2. + template <int kNewPow2, size_t kNewMaxLanes> + static constexpr size_t FracN() { + // Only reached if kNewPow2 > CeilLog2(kNewMaxLanes) >= 0 (else WholeN + // would not have been zero), but clamp to zero to avoid warnings. kFrac is + // the difference, stored in the upper bits of N, and we also set kWhole = + // 1 so that the new kPrivateLanes = kNewMaxLanes. + static_assert(HWY_MAX_N <= (size_t{1} << 20), "Change bit shift"); + return static_cast<size_t>( + 1 + (HWY_MAX(0, kNewPow2 - static_cast<int>(CeilLog2(kNewMaxLanes))) + << 20)); + } + + public: + // Returns (whole or fractional) NewN, see above. + template <int kNewPow2, size_t kNewMaxLanes> + static constexpr size_t NewN() { + // We require a fraction if inverting kNewPow2 results in 0. + return WholeN<kNewPow2, kNewMaxLanes>() == 0 + ? FracN<kNewPow2, kNewMaxLanes>() + : WholeN<kNewPow2, kNewMaxLanes>(); + } + + // PromoteTo/DemoteTo() with another lane type, but same number of lanes. + template <typename NewT> + using Rebind = + Simd<NewT, NewN<RebindPow2<NewT>(), kPrivateLanes>(), RebindPow2<NewT>()>; + + // Change lane type while keeping the same vector size, e.g. for MulEven. + template <typename NewT> + using Repartition = + Simd<NewT, NewN<kPow2, RepartitionLanes<NewT>()>(), kPow2>; + + // Half the lanes while keeping the same lane type, e.g. for LowerHalf. + using Half = Simd<T, N, kPow2 - 1>; + + // Twice the lanes while keeping the same lane type, e.g. for Combine. + using Twice = Simd<T, N, kPow2 + 1>; +}; + +namespace detail { + +template <typename T, size_t N, int kPow2> +constexpr bool IsFull(Simd<T, N, kPow2> /* d */) { + return N == HWY_LANES(T) && kPow2 == 0; +} + +// Struct wrappers enable validation of arguments via static_assert. 
+template <typename T, size_t N, int kPow2> +struct ClampNAndPow2 { + using type = Simd<T, HWY_MIN(N, HWY_MAX_N), HWY_MIN(kPow2, HWY_MAX_POW2)>; +}; + +template <typename T, int kPow2> +struct ScalableTagChecker { + using type = typename ClampNAndPow2<T, HWY_LANES(T), kPow2>::type; +}; + +template <typename T, size_t kLimit, int kPow2> +struct CappedTagChecker { + static_assert(kLimit != 0, "Does not make sense to have zero lanes"); + // Safely handle non-power-of-two inputs by rounding down, which is allowed by + // CappedTag. Otherwise, Simd<T, 3, 0> would static_assert. + static constexpr size_t kLimitPow2 = size_t{1} << hwy::FloorLog2(kLimit); + static constexpr size_t N = HWY_MIN(kLimitPow2, HWY_LANES(T)); + using type = typename ClampNAndPow2<T, N, kPow2>::type; +}; + +template <typename T, size_t kNumLanes> +struct FixedTagChecker { + static_assert(kNumLanes != 0, "Does not make sense to have zero lanes"); + static_assert(kNumLanes <= HWY_LANES(T), "Too many lanes"); + using type = Simd<T, kNumLanes, 0>; +}; + +} // namespace detail + +// ------------------------------ Aliases for Simd<> + +// Tag describing a full vector (kPow2 == 0: the most common usage, e.g. 1D +// loops where the application does not care about the vector size) or a +// fraction/multiple of one. Fractions (kPow2 < 0) are useful for arguments or +// return values of type promotion and demotion. User-specified kPow2 is +// interpreted as `HWY_MIN(kPow2, HWY_MAX_POW2)`. +template <typename T, int kPow2 = 0> +using ScalableTag = typename detail::ScalableTagChecker<T, kPow2>::type; + +// Tag describing a vector with *up to* kLimit active lanes, even on targets +// with scalable vectors and HWY_SCALAR. The runtime lane count `Lanes(tag)` may +// be less than kLimit, and is 1 on HWY_SCALAR. This alias is typically used for +// 1D loops with a relatively low application-defined upper bound, e.g. for 8x8 +// DCTs. 
However, it is better if data structures are designed to be +// vector-length-agnostic (e.g. a hybrid SoA where there are chunks of `M >= +// MaxLanes(d)` DC components followed by M AC1, .., and M AC63; this would +// enable vector-length-agnostic loops using ScalableTag). User-specified kPow2 +// is interpreted as `HWY_MIN(kPow2, HWY_MAX_POW2)`. +template <typename T, size_t kLimit, int kPow2 = 0> +using CappedTag = typename detail::CappedTagChecker<T, kLimit, kPow2>::type; + +#if !HWY_HAVE_SCALABLE +// If the vector size is known, and the app knows it does not want more than +// kLimit lanes, then capping can be beneficial. For example, AVX-512 has lower +// IPC and potentially higher costs for unaligned load/store vs. 256-bit AVX2. +template <typename T, size_t kLimit, int kPow2 = 0> +using CappedTagIfFixed = CappedTag<T, kLimit, kPow2>; +#else // HWY_HAVE_SCALABLE +// .. whereas on RVV/SVE, the cost of clamping Lanes() may exceed the benefit. +template <typename T, size_t kLimit, int kPow2 = 0> +using CappedTagIfFixed = ScalableTag<T, kPow2>; +#endif + +// Alias for a tag describing a vector with *exactly* kNumLanes active lanes, +// even on targets with scalable vectors. Requires `kNumLanes` to be a power of +// two not exceeding `HWY_LANES(T)`. +// +// NOTE: if the application does not need to support HWY_SCALAR (+), use this +// instead of CappedTag to emphasize that there will be exactly kNumLanes lanes. +// This is useful for data structures that rely on exactly 128-bit SIMD, but +// these are discouraged because they cannot benefit from wider vectors. +// Instead, applications would ideally define a larger problem size and loop +// over it with the (unknown size) vectors from ScalableTag. +// +// + e.g. if the baseline is known to support SIMD, or the application requires +// ops such as TableLookupBytes not supported by HWY_SCALAR. 
+template <typename T, size_t kNumLanes> +using FixedTag = typename detail::FixedTagChecker<T, kNumLanes>::type; + +// Convenience form for fixed sizes. +template <typename T> +using Full16 = Simd<T, 2 / sizeof(T), 0>; + +template <typename T> +using Full32 = Simd<T, 4 / sizeof(T), 0>; + +template <typename T> +using Full64 = Simd<T, 8 / sizeof(T), 0>; + +template <typename T> +using Full128 = Simd<T, 16 / sizeof(T), 0>; + +// ------------------------------ Accessors for Simd<> + +// Lane type. +template <class D> +using TFromD = typename D::T; + +// Upper bound on the number of lanes, typically used for SFINAE conditions and +// to allocate storage for targets with known vector sizes. Note: this may be a +// loose bound, instead use Lanes() as the actual size for AllocateAligned. +// MSVC workaround: use static constant directly instead of a function. +#define HWY_MAX_LANES_D(D) D::kPrivateLanes + +// Non-macro form of HWY_MAX_LANES_D in case that is preferable. WARNING: the +// macro form may be required for MSVC, which has limitations on deducing +// arguments. +template <class D> +HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(D) { + return HWY_MAX_LANES_D(D); +} + +#if !HWY_HAVE_SCALABLE + +// If non-scalable, this is constexpr; otherwise the target's header defines a +// non-constexpr version of this function. This is the actual vector length, +// used when advancing loop counters. +template <class D> +HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t Lanes(D) { + return HWY_MAX_LANES_D(D); +} + +#endif // !HWY_HAVE_SCALABLE + +// Tag for the same number of lanes as D, but with the LaneType T. 
+template <class T, class D> +using Rebind = typename D::template Rebind<T>; + +template <class D> +using RebindToSigned = Rebind<MakeSigned<TFromD<D>>, D>; +template <class D> +using RebindToUnsigned = Rebind<MakeUnsigned<TFromD<D>>, D>; +template <class D> +using RebindToFloat = Rebind<MakeFloat<TFromD<D>>, D>; + +// Tag for the same total size as D, but with the LaneType T. +template <class T, class D> +using Repartition = typename D::template Repartition<T>; + +template <class D> +using RepartitionToWide = Repartition<MakeWide<TFromD<D>>, D>; +template <class D> +using RepartitionToNarrow = Repartition<MakeNarrow<TFromD<D>>, D>; + +// Tag for the same lane type as D, but half the lanes. +template <class D> +using Half = typename D::Half; + +// Tag for the same lane type as D, but twice the lanes. +template <class D> +using Twice = typename D::Twice; + +// ------------------------------ Choosing overloads (SFINAE) + +// Same as base.h macros but with a Simd<T, N, kPow2> argument instead of T. 
+#define HWY_IF_UNSIGNED_D(D) HWY_IF_UNSIGNED(TFromD<D>) +#define HWY_IF_SIGNED_D(D) HWY_IF_SIGNED(TFromD<D>) +#define HWY_IF_FLOAT_D(D) HWY_IF_FLOAT(TFromD<D>) +#define HWY_IF_NOT_FLOAT_D(D) HWY_IF_NOT_FLOAT(TFromD<D>) +#define HWY_IF_SPECIAL_FLOAT_D(D) HWY_IF_SPECIAL_FLOAT(TFromD<D>) +#define HWY_IF_NOT_SPECIAL_FLOAT_D(D) HWY_IF_NOT_SPECIAL_FLOAT(TFromD<D>) + +#define HWY_IF_T_SIZE_D(D, bytes) HWY_IF_T_SIZE(TFromD<D>, bytes) +#define HWY_IF_NOT_T_SIZE_D(D, bytes) HWY_IF_NOT_T_SIZE(TFromD<D>, bytes) +#define HWY_IF_T_SIZE_ONE_OF_D(D, bit_array) \ + HWY_IF_T_SIZE_ONE_OF(TFromD<D>, bit_array) + +#define HWY_IF_LANES_D(D, lanes) HWY_IF_LANES(HWY_MAX_LANES_D(D), lanes) +#define HWY_IF_LANES_LE_D(D, lanes) HWY_IF_LANES_LE(HWY_MAX_LANES_D(D), lanes) +#define HWY_IF_LANES_GT_D(D, lanes) HWY_IF_LANES_GT(HWY_MAX_LANES_D(D), lanes) +#define HWY_IF_LANES_PER_BLOCK_D(D, lanes) \ + HWY_IF_LANES_PER_BLOCK( \ + TFromD<D>, HWY_MIN(HWY_MAX_LANES_D(D), 16 / sizeof(TFromD<D>)), lanes) + +#define HWY_IF_POW2_LE_D(D, pow2) hwy::EnableIf<D().Pow2() <= pow2>* = nullptr +#define HWY_IF_POW2_GT_D(D, pow2) hwy::EnableIf<(D().Pow2() > pow2)>* = nullptr + +#define HWY_IF_U8_D(D) hwy::EnableIf<IsSame<TFromD<D>, uint8_t>()>* = nullptr +#define HWY_IF_U16_D(D) hwy::EnableIf<IsSame<TFromD<D>, uint16_t>()>* = nullptr +#define HWY_IF_U32_D(D) hwy::EnableIf<IsSame<TFromD<D>, uint32_t>()>* = nullptr +#define HWY_IF_U64_D(D) hwy::EnableIf<IsSame<TFromD<D>, uint64_t>()>* = nullptr + +#define HWY_IF_I8_D(D) hwy::EnableIf<IsSame<TFromD<D>, int8_t>()>* = nullptr +#define HWY_IF_I16_D(D) hwy::EnableIf<IsSame<TFromD<D>, int16_t>()>* = nullptr +#define HWY_IF_I32_D(D) hwy::EnableIf<IsSame<TFromD<D>, int32_t>()>* = nullptr +#define HWY_IF_I64_D(D) hwy::EnableIf<IsSame<TFromD<D>, int64_t>()>* = nullptr + +// Use instead of HWY_IF_T_SIZE_D to avoid ambiguity with float/double +// overloads. 
+#define HWY_IF_UI32_D(D) \ + hwy::EnableIf<IsSame<TFromD<D>, uint32_t>() || \ + IsSame<TFromD<D>, int32_t>()>* = nullptr +#define HWY_IF_UI64_D(D) \ + hwy::EnableIf<IsSame<TFromD<D>, uint64_t>() || \ + IsSame<TFromD<D>, int64_t>()>* = nullptr + +#define HWY_IF_BF16_D(D) \ + hwy::EnableIf<IsSame<TFromD<D>, bfloat16_t>()>* = nullptr +#define HWY_IF_F16_D(D) hwy::EnableIf<IsSame<TFromD<D>, float16_t>()>* = nullptr +#define HWY_IF_F32_D(D) hwy::EnableIf<IsSame<TFromD<D>, float>()>* = nullptr +#define HWY_IF_F64_D(D) hwy::EnableIf<IsSame<TFromD<D>, double>()>* = nullptr + +#define HWY_IF_V_SIZE_D(D, bytes) \ + HWY_IF_V_SIZE(TFromD<D>, HWY_MAX_LANES_D(D), bytes) +#define HWY_IF_V_SIZE_LE_D(D, bytes) \ + HWY_IF_V_SIZE_LE(TFromD<D>, HWY_MAX_LANES_D(D), bytes) +#define HWY_IF_V_SIZE_GT_D(D, bytes) \ + HWY_IF_V_SIZE_GT(TFromD<D>, HWY_MAX_LANES_D(D), bytes) + +// Same, but with a vector argument. ops/*-inl.h define their own TFromV. +#define HWY_IF_UNSIGNED_V(V) HWY_IF_UNSIGNED(TFromV<V>) +#define HWY_IF_SIGNED_V(V) HWY_IF_SIGNED(TFromV<V>) +#define HWY_IF_FLOAT_V(V) HWY_IF_FLOAT(TFromV<V>) +#define HWY_IF_NOT_FLOAT_V(V) HWY_IF_NOT_FLOAT(TFromV<V>) +#define HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V) \ + HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromV<V>) + +#define HWY_IF_T_SIZE_V(V, bytes) HWY_IF_T_SIZE(TFromV<V>, bytes) +#define HWY_IF_NOT_T_SIZE_V(V, bytes) HWY_IF_NOT_T_SIZE(TFromV<V>, bytes) +#define HWY_IF_T_SIZE_ONE_OF_V(V, bit_array) \ + HWY_IF_T_SIZE_ONE_OF(TFromV<V>, bit_array) + +#define HWY_MAX_LANES_V(V) HWY_MAX_LANES_D(DFromV<V>) +#define HWY_IF_V_SIZE_V(V, bytes) \ + HWY_IF_V_SIZE(TFromV<V>, HWY_MAX_LANES_V(V), bytes) +#define HWY_IF_V_SIZE_LE_V(V, bytes) \ + HWY_IF_V_SIZE_LE(TFromV<V>, HWY_MAX_LANES_V(V), bytes) +#define HWY_IF_V_SIZE_GT_V(V, bytes) \ + HWY_IF_V_SIZE_GT(TFromV<V>, HWY_MAX_LANES_V(V), bytes) + +// Old names (deprecated) +#define HWY_IF_LANE_SIZE_D(D, bytes) HWY_IF_T_SIZE_D(D, bytes) +#define HWY_IF_NOT_LANE_SIZE_D(D, bytes) HWY_IF_NOT_T_SIZE_D(D, bytes) + +// 
NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#endif // HIGHWAY_HWY_OPS_SHARED_TOGGLE diff --git a/third_party/highway/hwy/ops/tuple-inl.h b/third_party/highway/hwy/ops/tuple-inl.h new file mode 100644 index 0000000000..61eb60d842 --- /dev/null +++ b/third_party/highway/hwy/ops/tuple-inl.h @@ -0,0 +1,86 @@ +// Copyright 2023 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Tuple support. Included by those ops/* that lack native tuple types, after +// they define VFromD and before they use the tuples e.g. for LoadInterleaved2. +// Assumes we are already in the HWY_NAMESPACE and under an include guard. + +// If viewing this header standalone, define VFromD to avoid IDE warnings. +// HWY_NAMESPACE (tested below) is normally defined by set_macros-inl.h before +// this header is included. +#if !defined(HWY_NAMESPACE) +#include "hwy/base.h" +template <class D> +using VFromD = int; +#endif + +// On SVE, Vec2..4 are aliases to built-in types. +template <class D> +struct Vec2 { + VFromD<D> v0; + VFromD<D> v1; +}; + +template <class D> +struct Vec3 { + VFromD<D> v0; + VFromD<D> v1; + VFromD<D> v2; +}; + +template <class D> +struct Vec4 { + VFromD<D> v0; + VFromD<D> v1; + VFromD<D> v2; + VFromD<D> v3; +}; + +// D arg is unused but allows deducing D. 
+template <class D> +HWY_API Vec2<D> Create2(D /* tag */, VFromD<D> v0, VFromD<D> v1) { + return Vec2<D>{v0, v1}; +} + +template <class D> +HWY_API Vec3<D> Create3(D /* tag */, VFromD<D> v0, VFromD<D> v1, VFromD<D> v2) { + return Vec3<D>{v0, v1, v2}; +} + +template <class D> +HWY_API Vec4<D> Create4(D /* tag */, VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, + VFromD<D> v3) { + return Vec4<D>{v0, v1, v2, v3}; +} + +template <size_t kIndex, class D> +HWY_API VFromD<D> Get2(Vec2<D> tuple) { + static_assert(kIndex < 2, "Tuple index out of bounds"); + return kIndex == 0 ? tuple.v0 : tuple.v1; +} + +template <size_t kIndex, class D> +HWY_API VFromD<D> Get3(Vec3<D> tuple) { + static_assert(kIndex < 3, "Tuple index out of bounds"); + return kIndex == 0 ? tuple.v0 : kIndex == 1 ? tuple.v1 : tuple.v2; +} + +template <size_t kIndex, class D> +HWY_API VFromD<D> Get4(Vec4<D> tuple) { + static_assert(kIndex < 4, "Tuple index out of bounds"); + return kIndex == 0 ? tuple.v0 + : kIndex == 1 ? tuple.v1 + : kIndex == 2 ? tuple.v2 + : tuple.v3; +} diff --git a/third_party/highway/hwy/ops/wasm_128-inl.h b/third_party/highway/hwy/ops/wasm_128-inl.h new file mode 100644 index 0000000000..ff0388fe44 --- /dev/null +++ b/third_party/highway/hwy/ops/wasm_128-inl.h @@ -0,0 +1,5060 @@ +// Copyright 2019 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// 128-bit WASM vectors and operations. 
+// External include guard in highway.h - see comment there. + +#include <wasm_simd128.h> + +#include "hwy/base.h" +#include "hwy/ops/shared-inl.h" + +#ifdef HWY_WASM_OLD_NAMES +#define wasm_i8x16_shuffle wasm_v8x16_shuffle +#define wasm_i16x8_shuffle wasm_v16x8_shuffle +#define wasm_i32x4_shuffle wasm_v32x4_shuffle +#define wasm_i64x2_shuffle wasm_v64x2_shuffle +#define wasm_u16x8_extend_low_u8x16 wasm_i16x8_widen_low_u8x16 +#define wasm_u32x4_extend_low_u16x8 wasm_i32x4_widen_low_u16x8 +#define wasm_i32x4_extend_low_i16x8 wasm_i32x4_widen_low_i16x8 +#define wasm_i16x8_extend_low_i8x16 wasm_i16x8_widen_low_i8x16 +#define wasm_u32x4_extend_high_u16x8 wasm_i32x4_widen_high_u16x8 +#define wasm_i32x4_extend_high_i16x8 wasm_i32x4_widen_high_i16x8 +#define wasm_i32x4_trunc_sat_f32x4 wasm_i32x4_trunc_saturate_f32x4 +#define wasm_u8x16_add_sat wasm_u8x16_add_saturate +#define wasm_u8x16_sub_sat wasm_u8x16_sub_saturate +#define wasm_u16x8_add_sat wasm_u16x8_add_saturate +#define wasm_u16x8_sub_sat wasm_u16x8_sub_saturate +#define wasm_i8x16_add_sat wasm_i8x16_add_saturate +#define wasm_i8x16_sub_sat wasm_i8x16_sub_saturate +#define wasm_i16x8_add_sat wasm_i16x8_add_saturate +#define wasm_i16x8_sub_sat wasm_i16x8_sub_saturate +#endif + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +#if HWY_TARGET == HWY_WASM_EMU256 +template <typename T> +using Full256 = Simd<T, 32 / sizeof(T), 0>; +#endif + +namespace detail { + +template <typename T> +struct Raw128 { + using type = __v128_u; +}; +template <> +struct Raw128<float> { + using type = __f32x4; +}; + +} // namespace detail + +template <typename T, size_t N = 16 / sizeof(T)> +class Vec128 { + using Raw = typename detail::Raw128<T>::type; + + public: + using PrivateT = T; // only for DFromV + static constexpr size_t kPrivateN = N; // only for DFromV + + // Compound assignment. Only usable if there is a corresponding non-member + // binary operator overload. For example, only f32 and f64 support division. 
+ HWY_INLINE Vec128& operator*=(const Vec128 other) { + return *this = (*this * other); + } + HWY_INLINE Vec128& operator/=(const Vec128 other) { + return *this = (*this / other); + } + HWY_INLINE Vec128& operator+=(const Vec128 other) { + return *this = (*this + other); + } + HWY_INLINE Vec128& operator-=(const Vec128 other) { + return *this = (*this - other); + } + HWY_INLINE Vec128& operator&=(const Vec128 other) { + return *this = (*this & other); + } + HWY_INLINE Vec128& operator|=(const Vec128 other) { + return *this = (*this | other); + } + HWY_INLINE Vec128& operator^=(const Vec128 other) { + return *this = (*this ^ other); + } + + Raw raw; +}; + +template <typename T> +using Vec64 = Vec128<T, 8 / sizeof(T)>; + +template <typename T> +using Vec32 = Vec128<T, 4 / sizeof(T)>; + +template <typename T> +using Vec16 = Vec128<T, 2 / sizeof(T)>; + +// FF..FF or 0. +template <typename T, size_t N = 16 / sizeof(T)> +struct Mask128 { + using PrivateT = T; // only for DFromM + static constexpr size_t kPrivateN = N; // only for DFromM + + typename detail::Raw128<T>::type raw; +}; + +template <class V> +using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>; + +template <class M> +using DFromM = Simd<typename M::PrivateT, M::kPrivateN, 0>; + +template <class V> +using TFromV = typename V::PrivateT; + +// ------------------------------ Zero + +// Use HWY_MAX_LANES_D here because VFromD is defined in terms of Zero. 
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_FLOAT_D(D)> +HWY_API Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> Zero(D /* tag */) { + return Vec128<TFromD<D>, HWY_MAX_LANES_D(D)>{wasm_i32x4_splat(0)}; +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_FLOAT_D(D)> +HWY_API Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> Zero(D /* tag */) { + return Vec128<TFromD<D>, HWY_MAX_LANES_D(D)>{wasm_f32x4_splat(0.0f)}; +} + +template <class D> +using VFromD = decltype(Zero(D())); + +// ------------------------------ Tuple (VFromD) +#include "hwy/ops/tuple-inl.h" + +// ------------------------------ BitCast + +namespace detail { + +HWY_INLINE __v128_u BitCastToInteger(__v128_u v) { return v; } +HWY_INLINE __v128_u BitCastToInteger(__f32x4 v) { + return static_cast<__v128_u>(v); +} +HWY_INLINE __v128_u BitCastToInteger(__f64x2 v) { + return static_cast<__v128_u>(v); +} + +template <typename T, size_t N> +HWY_INLINE Vec128<uint8_t, N * sizeof(T)> BitCastToByte(Vec128<T, N> v) { + return Vec128<uint8_t, N * sizeof(T)>{BitCastToInteger(v.raw)}; +} + +// Cannot rely on function overloading because return types differ. 
+template <typename T> +struct BitCastFromInteger128 { + HWY_INLINE __v128_u operator()(__v128_u v) { return v; } +}; +template <> +struct BitCastFromInteger128<float> { + HWY_INLINE __f32x4 operator()(__v128_u v) { return static_cast<__f32x4>(v); } +}; + +template <class D> +HWY_INLINE VFromD<D> BitCastFromByte(D d, Vec128<uint8_t, d.MaxBytes()> v) { + return VFromD<D>{BitCastFromInteger128<TFromD<D>>()(v.raw)}; +} + +} // namespace detail + +template <class D, typename FromT> +HWY_API VFromD<D> BitCast(D d, + Vec128<FromT, Repartition<FromT, D>().MaxLanes()> v) { + return detail::BitCastFromByte(d, detail::BitCastToByte(v)); +} + +// ------------------------------ ResizeBitCast + +template <class D, typename FromV, HWY_IF_V_SIZE_LE_V(FromV, 16), + HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_API VFromD<D> ResizeBitCast(D d, FromV v) { + const Repartition<uint8_t, decltype(d)> du8_to; + return BitCast(d, VFromD<decltype(du8_to)>{detail::BitCastToInteger(v.raw)}); +} + +// ------------------------------ Set + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)> +HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) { + return VFromD<D>{wasm_i8x16_splat(static_cast<int8_t>(t))}; +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)> +HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) { + return VFromD<D>{wasm_i16x8_splat(static_cast<int16_t>(t))}; +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 4)> +HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) { + return VFromD<D>{wasm_i32x4_splat(static_cast<int32_t>(t))}; +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 8)> +HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) { + return VFromD<D>{wasm_i64x2_splat(static_cast<int64_t>(t))}; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)> +HWY_API VFromD<D> Set(D /* tag */, const float t) { + return VFromD<D>{wasm_f32x4_splat(t)}; +} + +HWY_DIAGNOSTICS(push) +HWY_DIAGNOSTICS_OFF(disable : 4700, ignored 
"-Wuninitialized") + +// For all vector sizes. +template <class D> +HWY_API VFromD<D> Undefined(D d) { + return Zero(d); +} + +HWY_DIAGNOSTICS(pop) + +// For all vector sizes. +template <class D, typename T = TFromD<D>, typename T2> +HWY_API VFromD<D> Iota(D d, const T2 first) { + HWY_ALIGN T lanes[MaxLanes(d)]; + for (size_t i = 0; i < MaxLanes(d); ++i) { + lanes[i] = + AddWithWraparound(hwy::IsFloatTag<T>(), static_cast<T>(first), i); + } + return Load(d, lanes); +} + +// ================================================== ARITHMETIC + +// ------------------------------ Addition + +// Unsigned +template <size_t N> +HWY_API Vec128<uint8_t, N> operator+(const Vec128<uint8_t, N> a, + const Vec128<uint8_t, N> b) { + return Vec128<uint8_t, N>{wasm_i8x16_add(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<uint16_t, N> operator+(const Vec128<uint16_t, N> a, + const Vec128<uint16_t, N> b) { + return Vec128<uint16_t, N>{wasm_i16x8_add(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<uint32_t, N> operator+(const Vec128<uint32_t, N> a, + const Vec128<uint32_t, N> b) { + return Vec128<uint32_t, N>{wasm_i32x4_add(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<uint64_t, N> operator+(const Vec128<uint64_t, N> a, + const Vec128<uint64_t, N> b) { + return Vec128<uint64_t, N>{wasm_i64x2_add(a.raw, b.raw)}; +} + +// Signed +template <size_t N> +HWY_API Vec128<int8_t, N> operator+(const Vec128<int8_t, N> a, + const Vec128<int8_t, N> b) { + return Vec128<int8_t, N>{wasm_i8x16_add(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<int16_t, N> operator+(const Vec128<int16_t, N> a, + const Vec128<int16_t, N> b) { + return Vec128<int16_t, N>{wasm_i16x8_add(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<int32_t, N> operator+(const Vec128<int32_t, N> a, + const Vec128<int32_t, N> b) { + return Vec128<int32_t, N>{wasm_i32x4_add(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<int64_t, N> operator+(const Vec128<int64_t, N> a, + const 
Vec128<int64_t, N> b) { + return Vec128<int64_t, N>{wasm_i64x2_add(a.raw, b.raw)}; +} + +// Float +template <size_t N> +HWY_API Vec128<float, N> operator+(const Vec128<float, N> a, + const Vec128<float, N> b) { + return Vec128<float, N>{wasm_f32x4_add(a.raw, b.raw)}; +} + +// ------------------------------ Subtraction + +// Unsigned +template <size_t N> +HWY_API Vec128<uint8_t, N> operator-(const Vec128<uint8_t, N> a, + const Vec128<uint8_t, N> b) { + return Vec128<uint8_t, N>{wasm_i8x16_sub(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<uint16_t, N> operator-(Vec128<uint16_t, N> a, + Vec128<uint16_t, N> b) { + return Vec128<uint16_t, N>{wasm_i16x8_sub(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<uint32_t, N> operator-(const Vec128<uint32_t, N> a, + const Vec128<uint32_t, N> b) { + return Vec128<uint32_t, N>{wasm_i32x4_sub(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<uint64_t, N> operator-(const Vec128<uint64_t, N> a, + const Vec128<uint64_t, N> b) { + return Vec128<uint64_t, N>{wasm_i64x2_sub(a.raw, b.raw)}; +} + +// Signed +template <size_t N> +HWY_API Vec128<int8_t, N> operator-(const Vec128<int8_t, N> a, + const Vec128<int8_t, N> b) { + return Vec128<int8_t, N>{wasm_i8x16_sub(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<int16_t, N> operator-(const Vec128<int16_t, N> a, + const Vec128<int16_t, N> b) { + return Vec128<int16_t, N>{wasm_i16x8_sub(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<int32_t, N> operator-(const Vec128<int32_t, N> a, + const Vec128<int32_t, N> b) { + return Vec128<int32_t, N>{wasm_i32x4_sub(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<int64_t, N> operator-(const Vec128<int64_t, N> a, + const Vec128<int64_t, N> b) { + return Vec128<int64_t, N>{wasm_i64x2_sub(a.raw, b.raw)}; +} + +// Float +template <size_t N> +HWY_API Vec128<float, N> operator-(const Vec128<float, N> a, + const Vec128<float, N> b) { + return Vec128<float, N>{wasm_f32x4_sub(a.raw, b.raw)}; +} + +// 
------------------------------ SaturatedAdd + +// Returns a + b clamped to the destination range. + +// Unsigned +template <size_t N> +HWY_API Vec128<uint8_t, N> SaturatedAdd(const Vec128<uint8_t, N> a, + const Vec128<uint8_t, N> b) { + return Vec128<uint8_t, N>{wasm_u8x16_add_sat(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<uint16_t, N> SaturatedAdd(const Vec128<uint16_t, N> a, + const Vec128<uint16_t, N> b) { + return Vec128<uint16_t, N>{wasm_u16x8_add_sat(a.raw, b.raw)}; +} + +// Signed +template <size_t N> +HWY_API Vec128<int8_t, N> SaturatedAdd(const Vec128<int8_t, N> a, + const Vec128<int8_t, N> b) { + return Vec128<int8_t, N>{wasm_i8x16_add_sat(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<int16_t, N> SaturatedAdd(const Vec128<int16_t, N> a, + const Vec128<int16_t, N> b) { + return Vec128<int16_t, N>{wasm_i16x8_add_sat(a.raw, b.raw)}; +} + +// ------------------------------ SaturatedSub + +// Returns a - b clamped to the destination range. + +// Unsigned +template <size_t N> +HWY_API Vec128<uint8_t, N> SaturatedSub(const Vec128<uint8_t, N> a, + const Vec128<uint8_t, N> b) { + return Vec128<uint8_t, N>{wasm_u8x16_sub_sat(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<uint16_t, N> SaturatedSub(const Vec128<uint16_t, N> a, + const Vec128<uint16_t, N> b) { + return Vec128<uint16_t, N>{wasm_u16x8_sub_sat(a.raw, b.raw)}; +} + +// Signed +template <size_t N> +HWY_API Vec128<int8_t, N> SaturatedSub(const Vec128<int8_t, N> a, + const Vec128<int8_t, N> b) { + return Vec128<int8_t, N>{wasm_i8x16_sub_sat(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<int16_t, N> SaturatedSub(const Vec128<int16_t, N> a, + const Vec128<int16_t, N> b) { + return Vec128<int16_t, N>{wasm_i16x8_sub_sat(a.raw, b.raw)}; +} + +// ------------------------------ Average + +// Returns (a + b + 1) / 2 + +// Unsigned +template <size_t N> +HWY_API Vec128<uint8_t, N> AverageRound(const Vec128<uint8_t, N> a, + const Vec128<uint8_t, N> b) { + return Vec128<uint8_t, 
N>{wasm_u8x16_avgr(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<uint16_t, N> AverageRound(const Vec128<uint16_t, N> a, + const Vec128<uint16_t, N> b) { + return Vec128<uint16_t, N>{wasm_u16x8_avgr(a.raw, b.raw)}; +} + +// ------------------------------ Absolute value + +// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1. +template <size_t N> +HWY_API Vec128<int8_t, N> Abs(const Vec128<int8_t, N> v) { + return Vec128<int8_t, N>{wasm_i8x16_abs(v.raw)}; +} +template <size_t N> +HWY_API Vec128<int16_t, N> Abs(const Vec128<int16_t, N> v) { + return Vec128<int16_t, N>{wasm_i16x8_abs(v.raw)}; +} +template <size_t N> +HWY_API Vec128<int32_t, N> Abs(const Vec128<int32_t, N> v) { + return Vec128<int32_t, N>{wasm_i32x4_abs(v.raw)}; +} +template <size_t N> +HWY_API Vec128<int64_t, N> Abs(const Vec128<int64_t, N> v) { + return Vec128<int64_t, N>{wasm_i64x2_abs(v.raw)}; +} + +template <size_t N> +HWY_API Vec128<float, N> Abs(const Vec128<float, N> v) { + return Vec128<float, N>{wasm_f32x4_abs(v.raw)}; +} + +// ------------------------------ Shift lanes by constant #bits + +// Unsigned +template <int kBits, size_t N> +HWY_API Vec128<uint16_t, N> ShiftLeft(const Vec128<uint16_t, N> v) { + return Vec128<uint16_t, N>{wasm_i16x8_shl(v.raw, kBits)}; +} +template <int kBits, size_t N> +HWY_API Vec128<uint16_t, N> ShiftRight(const Vec128<uint16_t, N> v) { + return Vec128<uint16_t, N>{wasm_u16x8_shr(v.raw, kBits)}; +} +template <int kBits, size_t N> +HWY_API Vec128<uint32_t, N> ShiftLeft(const Vec128<uint32_t, N> v) { + return Vec128<uint32_t, N>{wasm_i32x4_shl(v.raw, kBits)}; +} +template <int kBits, size_t N> +HWY_API Vec128<uint64_t, N> ShiftLeft(const Vec128<uint64_t, N> v) { + return Vec128<uint64_t, N>{wasm_i64x2_shl(v.raw, kBits)}; +} +template <int kBits, size_t N> +HWY_API Vec128<uint32_t, N> ShiftRight(const Vec128<uint32_t, N> v) { + return Vec128<uint32_t, N>{wasm_u32x4_shr(v.raw, kBits)}; +} +template <int kBits, size_t N> +HWY_API 
Vec128<uint64_t, N> ShiftRight(const Vec128<uint64_t, N> v) { + return Vec128<uint64_t, N>{wasm_u64x2_shr(v.raw, kBits)}; +} + +// Signed +template <int kBits, size_t N> +HWY_API Vec128<int16_t, N> ShiftLeft(const Vec128<int16_t, N> v) { + return Vec128<int16_t, N>{wasm_i16x8_shl(v.raw, kBits)}; +} +template <int kBits, size_t N> +HWY_API Vec128<int16_t, N> ShiftRight(const Vec128<int16_t, N> v) { + return Vec128<int16_t, N>{wasm_i16x8_shr(v.raw, kBits)}; +} +template <int kBits, size_t N> +HWY_API Vec128<int32_t, N> ShiftLeft(const Vec128<int32_t, N> v) { + return Vec128<int32_t, N>{wasm_i32x4_shl(v.raw, kBits)}; +} +template <int kBits, size_t N> +HWY_API Vec128<int64_t, N> ShiftLeft(const Vec128<int64_t, N> v) { + return Vec128<int64_t, N>{wasm_i64x2_shl(v.raw, kBits)}; +} +template <int kBits, size_t N> +HWY_API Vec128<int32_t, N> ShiftRight(const Vec128<int32_t, N> v) { + return Vec128<int32_t, N>{wasm_i32x4_shr(v.raw, kBits)}; +} +template <int kBits, size_t N> +HWY_API Vec128<int64_t, N> ShiftRight(const Vec128<int64_t, N> v) { + return Vec128<int64_t, N>{wasm_i64x2_shr(v.raw, kBits)}; +} + +// 8-bit +template <int kBits, typename T, size_t N, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec128<T, N> ShiftLeft(const Vec128<T, N> v) { + const DFromV<decltype(v)> d8; + // Use raw instead of BitCast to support N=1. + const Vec128<T, N> shifted{ShiftLeft<kBits>(Vec128<MakeWide<T>>{v.raw}).raw}; + return kBits == 1 + ? (v + v) + : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF))); +} + +template <int kBits, size_t N> +HWY_API Vec128<uint8_t, N> ShiftRight(const Vec128<uint8_t, N> v) { + const DFromV<decltype(v)> d8; + // Use raw instead of BitCast to support N=1. 
+ const Vec128<uint8_t, N> shifted{ + ShiftRight<kBits>(Vec128<uint16_t>{v.raw}).raw}; + return shifted & Set(d8, 0xFF >> kBits); +} + +template <int kBits, size_t N> +HWY_API Vec128<int8_t, N> ShiftRight(const Vec128<int8_t, N> v) { + const DFromV<decltype(v)> di; + const RebindToUnsigned<decltype(di)> du; + const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v))); + const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits)); + return (shifted ^ shifted_sign) - shifted_sign; +} + +// ------------------------------ RotateRight (ShiftRight, Or) +template <int kBits, typename T, size_t N> +HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) { + constexpr size_t kSizeInBits = sizeof(T) * 8; + static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count"); + if (kBits == 0) return v; + return Or(ShiftRight<kBits>(v), + ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v)); +} + +// ------------------------------ Shift lanes by same variable #bits + +// After https://reviews.llvm.org/D108415 shift argument became unsigned. 
+HWY_DIAGNOSTICS(push) +HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") + +// Unsigned +template <size_t N> +HWY_API Vec128<uint16_t, N> ShiftLeftSame(const Vec128<uint16_t, N> v, + const int bits) { + return Vec128<uint16_t, N>{wasm_i16x8_shl(v.raw, bits)}; +} +template <size_t N> +HWY_API Vec128<uint16_t, N> ShiftRightSame(const Vec128<uint16_t, N> v, + const int bits) { + return Vec128<uint16_t, N>{wasm_u16x8_shr(v.raw, bits)}; +} +template <size_t N> +HWY_API Vec128<uint32_t, N> ShiftLeftSame(const Vec128<uint32_t, N> v, + const int bits) { + return Vec128<uint32_t, N>{wasm_i32x4_shl(v.raw, bits)}; +} +template <size_t N> +HWY_API Vec128<uint32_t, N> ShiftRightSame(const Vec128<uint32_t, N> v, + const int bits) { + return Vec128<uint32_t, N>{wasm_u32x4_shr(v.raw, bits)}; +} +template <size_t N> +HWY_API Vec128<uint64_t, N> ShiftLeftSame(const Vec128<uint64_t, N> v, + const int bits) { + return Vec128<uint64_t, N>{wasm_i64x2_shl(v.raw, bits)}; +} +template <size_t N> +HWY_API Vec128<uint64_t, N> ShiftRightSame(const Vec128<uint64_t, N> v, + const int bits) { + return Vec128<uint64_t, N>{wasm_u64x2_shr(v.raw, bits)}; +} + +// Signed +template <size_t N> +HWY_API Vec128<int16_t, N> ShiftLeftSame(const Vec128<int16_t, N> v, + const int bits) { + return Vec128<int16_t, N>{wasm_i16x8_shl(v.raw, bits)}; +} +template <size_t N> +HWY_API Vec128<int16_t, N> ShiftRightSame(const Vec128<int16_t, N> v, + const int bits) { + return Vec128<int16_t, N>{wasm_i16x8_shr(v.raw, bits)}; +} +template <size_t N> +HWY_API Vec128<int32_t, N> ShiftLeftSame(const Vec128<int32_t, N> v, + const int bits) { + return Vec128<int32_t, N>{wasm_i32x4_shl(v.raw, bits)}; +} +template <size_t N> +HWY_API Vec128<int32_t, N> ShiftRightSame(const Vec128<int32_t, N> v, + const int bits) { + return Vec128<int32_t, N>{wasm_i32x4_shr(v.raw, bits)}; +} +template <size_t N> +HWY_API Vec128<int64_t, N> ShiftLeftSame(const Vec128<int64_t, N> v, + const int bits) { + return 
Vec128<int64_t, N>{wasm_i64x2_shl(v.raw, bits)}; +} +template <size_t N> +HWY_API Vec128<int64_t, N> ShiftRightSame(const Vec128<int64_t, N> v, + const int bits) { + return Vec128<int64_t, N>{wasm_i64x2_shr(v.raw, bits)}; +} + +// 8-bit +template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec128<T, N> ShiftLeftSame(const Vec128<T, N> v, const int bits) { + const DFromV<decltype(v)> d8; + // Use raw instead of BitCast to support N=1. + const Vec128<T, N> shifted{ + ShiftLeftSame(Vec128<MakeWide<T>>{v.raw}, bits).raw}; + return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF)); +} + +template <size_t N> +HWY_API Vec128<uint8_t, N> ShiftRightSame(Vec128<uint8_t, N> v, + const int bits) { + const DFromV<decltype(v)> d8; + // Use raw instead of BitCast to support N=1. + const Vec128<uint8_t, N> shifted{ + ShiftRightSame(Vec128<uint16_t>{v.raw}, bits).raw}; + return shifted & Set(d8, 0xFF >> bits); +} + +template <size_t N> +HWY_API Vec128<int8_t, N> ShiftRightSame(Vec128<int8_t, N> v, const int bits) { + const DFromV<decltype(v)> di; + const RebindToUnsigned<decltype(di)> du; + const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits)); + const auto shifted_sign = BitCast(di, Set(du, 0x80 >> bits)); + return (shifted ^ shifted_sign) - shifted_sign; +} + +// ignore Wsign-conversion +HWY_DIAGNOSTICS(pop) + +// ------------------------------ Minimum + +// Unsigned +template <size_t N> +HWY_API Vec128<uint8_t, N> Min(Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) { + return Vec128<uint8_t, N>{wasm_u8x16_min(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<uint16_t, N> Min(Vec128<uint16_t, N> a, Vec128<uint16_t, N> b) { + return Vec128<uint16_t, N>{wasm_u16x8_min(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<uint32_t, N> Min(Vec128<uint32_t, N> a, Vec128<uint32_t, N> b) { + return Vec128<uint32_t, N>{wasm_u32x4_min(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<uint64_t, N> Min(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) 
{ + // Avoid wasm_u64x2_extract_lane - not all implementations have it yet. + const uint64_t a0 = static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 0)); + const uint64_t b0 = static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 0)); + const uint64_t a1 = static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 1)); + const uint64_t b1 = static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 1)); + alignas(16) uint64_t min[2] = {HWY_MIN(a0, b0), HWY_MIN(a1, b1)}; + return Vec128<uint64_t, N>{wasm_v128_load(min)}; +} + +// Signed +template <size_t N> +HWY_API Vec128<int8_t, N> Min(Vec128<int8_t, N> a, Vec128<int8_t, N> b) { + return Vec128<int8_t, N>{wasm_i8x16_min(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<int16_t, N> Min(Vec128<int16_t, N> a, Vec128<int16_t, N> b) { + return Vec128<int16_t, N>{wasm_i16x8_min(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<int32_t, N> Min(Vec128<int32_t, N> a, Vec128<int32_t, N> b) { + return Vec128<int32_t, N>{wasm_i32x4_min(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<int64_t, N> Min(Vec128<int64_t, N> a, Vec128<int64_t, N> b) { + alignas(16) int64_t min[4]; + min[0] = HWY_MIN(wasm_i64x2_extract_lane(a.raw, 0), + wasm_i64x2_extract_lane(b.raw, 0)); + min[1] = HWY_MIN(wasm_i64x2_extract_lane(a.raw, 1), + wasm_i64x2_extract_lane(b.raw, 1)); + return Vec128<int64_t, N>{wasm_v128_load(min)}; +} + +// Float +template <size_t N> +HWY_API Vec128<float, N> Min(Vec128<float, N> a, Vec128<float, N> b) { + // Equivalent to a < b ? a : b (taking into account our swapped arg order, + // so that Min(NaN, x) is x to match x86). 
+ return Vec128<float, N>{wasm_f32x4_pmin(b.raw, a.raw)}; +} + +// ------------------------------ Maximum + +// Unsigned +template <size_t N> +HWY_API Vec128<uint8_t, N> Max(Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) { + return Vec128<uint8_t, N>{wasm_u8x16_max(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<uint16_t, N> Max(Vec128<uint16_t, N> a, Vec128<uint16_t, N> b) { + return Vec128<uint16_t, N>{wasm_u16x8_max(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<uint32_t, N> Max(Vec128<uint32_t, N> a, Vec128<uint32_t, N> b) { + return Vec128<uint32_t, N>{wasm_u32x4_max(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<uint64_t, N> Max(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) { + // Avoid wasm_u64x2_extract_lane - not all implementations have it yet. + const uint64_t a0 = static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 0)); + const uint64_t b0 = static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 0)); + const uint64_t a1 = static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 1)); + const uint64_t b1 = static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 1)); + alignas(16) uint64_t max[2] = {HWY_MAX(a0, b0), HWY_MAX(a1, b1)}; + return Vec128<uint64_t, N>{wasm_v128_load(max)}; +} + +// Signed +template <size_t N> +HWY_API Vec128<int8_t, N> Max(Vec128<int8_t, N> a, Vec128<int8_t, N> b) { + return Vec128<int8_t, N>{wasm_i8x16_max(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<int16_t, N> Max(Vec128<int16_t, N> a, Vec128<int16_t, N> b) { + return Vec128<int16_t, N>{wasm_i16x8_max(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<int32_t, N> Max(Vec128<int32_t, N> a, Vec128<int32_t, N> b) { + return Vec128<int32_t, N>{wasm_i32x4_max(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<int64_t, N> Max(Vec128<int64_t, N> a, Vec128<int64_t, N> b) { + alignas(16) int64_t max[2]; + max[0] = HWY_MAX(wasm_i64x2_extract_lane(a.raw, 0), + wasm_i64x2_extract_lane(b.raw, 0)); + max[1] = HWY_MAX(wasm_i64x2_extract_lane(a.raw, 1), + 
wasm_i64x2_extract_lane(b.raw, 1)); + return Vec128<int64_t, N>{wasm_v128_load(max)}; +} + +// Float +template <size_t N> +HWY_API Vec128<float, N> Max(Vec128<float, N> a, Vec128<float, N> b) { + // Equivalent to b < a ? a : b (taking into account our swapped arg order, + // so that Max(NaN, x) is x to match x86). + return Vec128<float, N>{wasm_f32x4_pmax(b.raw, a.raw)}; +} + +// ------------------------------ Integer multiplication + +// Unsigned +template <size_t N> +HWY_API Vec128<uint16_t, N> operator*(const Vec128<uint16_t, N> a, + const Vec128<uint16_t, N> b) { + return Vec128<uint16_t, N>{wasm_i16x8_mul(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<uint32_t, N> operator*(const Vec128<uint32_t, N> a, + const Vec128<uint32_t, N> b) { + return Vec128<uint32_t, N>{wasm_i32x4_mul(a.raw, b.raw)}; +} + +// Signed +template <size_t N> +HWY_API Vec128<int16_t, N> operator*(const Vec128<int16_t, N> a, + const Vec128<int16_t, N> b) { + return Vec128<int16_t, N>{wasm_i16x8_mul(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<int32_t, N> operator*(const Vec128<int32_t, N> a, + const Vec128<int32_t, N> b) { + return Vec128<int32_t, N>{wasm_i32x4_mul(a.raw, b.raw)}; +} + +// Returns the upper 16 bits of a * b in each lane. +template <size_t N> +HWY_API Vec128<uint16_t, N> MulHigh(const Vec128<uint16_t, N> a, + const Vec128<uint16_t, N> b) { + const auto l = wasm_u32x4_extmul_low_u16x8(a.raw, b.raw); + const auto h = wasm_u32x4_extmul_high_u16x8(a.raw, b.raw); + // TODO(eustas): shift-right + narrow? + return Vec128<uint16_t, N>{ + wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)}; +} +template <size_t N> +HWY_API Vec128<int16_t, N> MulHigh(const Vec128<int16_t, N> a, + const Vec128<int16_t, N> b) { + const auto l = wasm_i32x4_extmul_low_i16x8(a.raw, b.raw); + const auto h = wasm_i32x4_extmul_high_i16x8(a.raw, b.raw); + // TODO(eustas): shift-right + narrow? 
+ return Vec128<int16_t, N>{ + wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)}; +} + +template <size_t N> +HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a, + Vec128<int16_t, N> b) { + return Vec128<int16_t, N>{wasm_i16x8_q15mulr_sat(a.raw, b.raw)}; +} + +// Multiplies even lanes (0, 2 ..) and returns the double-width result. +template <size_t N> +HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a, + const Vec128<int32_t, N> b) { + const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0); + const auto ae = wasm_v128_and(a.raw, kEvenMask); + const auto be = wasm_v128_and(b.raw, kEvenMask); + return Vec128<int64_t, (N + 1) / 2>{wasm_i64x2_mul(ae, be)}; +} +template <size_t N> +HWY_API Vec128<uint64_t, (N + 1) / 2> MulEven(const Vec128<uint32_t, N> a, + const Vec128<uint32_t, N> b) { + const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0); + const auto ae = wasm_v128_and(a.raw, kEvenMask); + const auto be = wasm_v128_and(b.raw, kEvenMask); + return Vec128<uint64_t, (N + 1) / 2>{wasm_i64x2_mul(ae, be)}; +} + +// ------------------------------ Negate + +template <typename T, size_t N, HWY_IF_FLOAT(T)> +HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) { + return Xor(v, SignBit(DFromV<decltype(v)>())); +} + +template <size_t N> +HWY_API Vec128<int8_t, N> Neg(const Vec128<int8_t, N> v) { + return Vec128<int8_t, N>{wasm_i8x16_neg(v.raw)}; +} +template <size_t N> +HWY_API Vec128<int16_t, N> Neg(const Vec128<int16_t, N> v) { + return Vec128<int16_t, N>{wasm_i16x8_neg(v.raw)}; +} +template <size_t N> +HWY_API Vec128<int32_t, N> Neg(const Vec128<int32_t, N> v) { + return Vec128<int32_t, N>{wasm_i32x4_neg(v.raw)}; +} +template <size_t N> +HWY_API Vec128<int64_t, N> Neg(const Vec128<int64_t, N> v) { + return Vec128<int64_t, N>{wasm_i64x2_neg(v.raw)}; +} + +// ------------------------------ Floating-point mul / div + +template <size_t N> +HWY_API Vec128<float, N> operator*(Vec128<float, N> a, Vec128<float, N> b) { + return Vec128<float, 
N>{wasm_f32x4_mul(a.raw, b.raw)}; +} + +template <size_t N> +HWY_API Vec128<float, N> operator/(const Vec128<float, N> a, + const Vec128<float, N> b) { + return Vec128<float, N>{wasm_f32x4_div(a.raw, b.raw)}; +} + +// Approximate reciprocal +template <size_t N> +HWY_API Vec128<float, N> ApproximateReciprocal(const Vec128<float, N> v) { + const Vec128<float, N> one = Vec128<float, N>{wasm_f32x4_splat(1.0f)}; + return one / v; +} + +// Absolute value of difference. +template <size_t N> +HWY_API Vec128<float, N> AbsDiff(const Vec128<float, N> a, + const Vec128<float, N> b) { + return Abs(a - b); +} + +// ------------------------------ Floating-point multiply-add variants + +// Returns mul * x + add +template <size_t N> +HWY_API Vec128<float, N> MulAdd(const Vec128<float, N> mul, + const Vec128<float, N> x, + const Vec128<float, N> add) { + return mul * x + add; +} + +// Returns add - mul * x +template <size_t N> +HWY_API Vec128<float, N> NegMulAdd(const Vec128<float, N> mul, + const Vec128<float, N> x, + const Vec128<float, N> add) { + return add - mul * x; +} + +// Returns mul * x - sub +template <size_t N> +HWY_API Vec128<float, N> MulSub(const Vec128<float, N> mul, + const Vec128<float, N> x, + const Vec128<float, N> sub) { + return mul * x - sub; +} + +// Returns -mul * x - sub +template <size_t N> +HWY_API Vec128<float, N> NegMulSub(const Vec128<float, N> mul, + const Vec128<float, N> x, + const Vec128<float, N> sub) { + return Neg(mul) * x - sub; +} + +// ------------------------------ Floating-point square root + +// Full precision square root +template <size_t N> +HWY_API Vec128<float, N> Sqrt(const Vec128<float, N> v) { + return Vec128<float, N>{wasm_f32x4_sqrt(v.raw)}; +} + +// Approximate reciprocal square root +template <size_t N> +HWY_API Vec128<float, N> ApproximateReciprocalSqrt(const Vec128<float, N> v) { + // TODO(eustas): find cheaper a way to calculate this. 
+ const Vec128<float, N> one = Vec128<float, N>{wasm_f32x4_splat(1.0f)}; + return one / Sqrt(v); +} + +// ------------------------------ Floating-point rounding + +// Toward nearest integer, ties to even +template <size_t N> +HWY_API Vec128<float, N> Round(const Vec128<float, N> v) { + return Vec128<float, N>{wasm_f32x4_nearest(v.raw)}; +} + +// Toward zero, aka truncate +template <size_t N> +HWY_API Vec128<float, N> Trunc(const Vec128<float, N> v) { + return Vec128<float, N>{wasm_f32x4_trunc(v.raw)}; +} + +// Toward +infinity, aka ceiling +template <size_t N> +HWY_API Vec128<float, N> Ceil(const Vec128<float, N> v) { + return Vec128<float, N>{wasm_f32x4_ceil(v.raw)}; +} + +// Toward -infinity, aka floor +template <size_t N> +HWY_API Vec128<float, N> Floor(const Vec128<float, N> v) { + return Vec128<float, N>{wasm_f32x4_floor(v.raw)}; +} + +// ------------------------------ Floating-point classification +template <typename T, size_t N> +HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) { + return v != v; +} + +template <typename T, size_t N, HWY_IF_FLOAT(T)> +HWY_API Mask128<T, N> IsInf(const Vec128<T, N> v) { + const DFromV<decltype(v)> d; + const RebindToSigned<decltype(d)> di; + const VFromD<decltype(di)> vi = BitCast(di, v); + // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0. + return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2<T>()))); +} + +// Returns whether normal/subnormal/zero. +template <typename T, size_t N, HWY_IF_FLOAT(T)> +HWY_API Mask128<T, N> IsFinite(const Vec128<T, N> v) { + const DFromV<decltype(v)> d; + const RebindToUnsigned<decltype(d)> du; + const RebindToSigned<decltype(d)> di; // cheaper than unsigned comparison + const VFromD<decltype(du)> vu = BitCast(du, v); + // 'Shift left' to clear the sign bit, then right so we can compare with the + // max exponent (cannot compare with MaxExponentTimes2 directly because it is + // negative and non-negative floats would be greater). 
+ const VFromD<decltype(di)> exp = + BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu))); + return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>()))); +} + +// ================================================== COMPARE + +// Comparisons fill a lane with 1-bits if the condition is true, else 0. + +// Mask and Vec are the same (true = FF..FF). +template <typename T, size_t N> +HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) { + return Mask128<T, N>{v.raw}; +} + +template <class D> +using MFromD = decltype(MaskFromVec(VFromD<D>())); + +template <typename TFrom, size_t NFrom, class DTo> +HWY_API MFromD<DTo> RebindMask(DTo /* tag */, Mask128<TFrom, NFrom> m) { + static_assert(sizeof(TFrom) == sizeof(TFromD<DTo>), "Must have same size"); + return MFromD<DTo>{m.raw}; +} + +template <typename T, size_t N> +HWY_API Mask128<T, N> TestBit(Vec128<T, N> v, Vec128<T, N> bit) { + static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported"); + return (v & bit) == bit; +} + +// ------------------------------ Equality + +// Unsigned +template <size_t N> +HWY_API Mask128<uint8_t, N> operator==(const Vec128<uint8_t, N> a, + const Vec128<uint8_t, N> b) { + return Mask128<uint8_t, N>{wasm_i8x16_eq(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Mask128<uint16_t, N> operator==(const Vec128<uint16_t, N> a, + const Vec128<uint16_t, N> b) { + return Mask128<uint16_t, N>{wasm_i16x8_eq(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Mask128<uint32_t, N> operator==(const Vec128<uint32_t, N> a, + const Vec128<uint32_t, N> b) { + return Mask128<uint32_t, N>{wasm_i32x4_eq(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Mask128<uint64_t, N> operator==(const Vec128<uint64_t, N> a, + const Vec128<uint64_t, N> b) { + return Mask128<uint64_t, N>{wasm_i64x2_eq(a.raw, b.raw)}; +} + +// Signed +template <size_t N> +HWY_API Mask128<int8_t, N> operator==(const Vec128<int8_t, N> a, + const Vec128<int8_t, N> b) { + return Mask128<int8_t, N>{wasm_i8x16_eq(a.raw, 
b.raw)}; +} +template <size_t N> +HWY_API Mask128<int16_t, N> operator==(Vec128<int16_t, N> a, + Vec128<int16_t, N> b) { + return Mask128<int16_t, N>{wasm_i16x8_eq(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Mask128<int32_t, N> operator==(const Vec128<int32_t, N> a, + const Vec128<int32_t, N> b) { + return Mask128<int32_t, N>{wasm_i32x4_eq(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Mask128<int64_t, N> operator==(const Vec128<int64_t, N> a, + const Vec128<int64_t, N> b) { + return Mask128<int64_t, N>{wasm_i64x2_eq(a.raw, b.raw)}; +} + +// Float +template <size_t N> +HWY_API Mask128<float, N> operator==(const Vec128<float, N> a, + const Vec128<float, N> b) { + return Mask128<float, N>{wasm_f32x4_eq(a.raw, b.raw)}; +} + +// ------------------------------ Inequality + +// Unsigned +template <size_t N> +HWY_API Mask128<uint8_t, N> operator!=(const Vec128<uint8_t, N> a, + const Vec128<uint8_t, N> b) { + return Mask128<uint8_t, N>{wasm_i8x16_ne(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Mask128<uint16_t, N> operator!=(const Vec128<uint16_t, N> a, + const Vec128<uint16_t, N> b) { + return Mask128<uint16_t, N>{wasm_i16x8_ne(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Mask128<uint32_t, N> operator!=(const Vec128<uint32_t, N> a, + const Vec128<uint32_t, N> b) { + return Mask128<uint32_t, N>{wasm_i32x4_ne(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Mask128<uint64_t, N> operator!=(const Vec128<uint64_t, N> a, + const Vec128<uint64_t, N> b) { + return Mask128<uint64_t, N>{wasm_i64x2_ne(a.raw, b.raw)}; +} + +// Signed +template <size_t N> +HWY_API Mask128<int8_t, N> operator!=(const Vec128<int8_t, N> a, + const Vec128<int8_t, N> b) { + return Mask128<int8_t, N>{wasm_i8x16_ne(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Mask128<int16_t, N> operator!=(const Vec128<int16_t, N> a, + const Vec128<int16_t, N> b) { + return Mask128<int16_t, N>{wasm_i16x8_ne(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Mask128<int32_t, N> operator!=(const 
Vec128<int32_t, N> a, + const Vec128<int32_t, N> b) { + return Mask128<int32_t, N>{wasm_i32x4_ne(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Mask128<int64_t, N> operator!=(const Vec128<int64_t, N> a, + const Vec128<int64_t, N> b) { + return Mask128<int64_t, N>{wasm_i64x2_ne(a.raw, b.raw)}; +} + +// Float +template <size_t N> +HWY_API Mask128<float, N> operator!=(const Vec128<float, N> a, + const Vec128<float, N> b) { + return Mask128<float, N>{wasm_f32x4_ne(a.raw, b.raw)}; +} + +// ------------------------------ Strict inequality + +template <size_t N> +HWY_API Mask128<int8_t, N> operator>(const Vec128<int8_t, N> a, + const Vec128<int8_t, N> b) { + return Mask128<int8_t, N>{wasm_i8x16_gt(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Mask128<int16_t, N> operator>(const Vec128<int16_t, N> a, + const Vec128<int16_t, N> b) { + return Mask128<int16_t, N>{wasm_i16x8_gt(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Mask128<int32_t, N> operator>(const Vec128<int32_t, N> a, + const Vec128<int32_t, N> b) { + return Mask128<int32_t, N>{wasm_i32x4_gt(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Mask128<int64_t, N> operator>(const Vec128<int64_t, N> a, + const Vec128<int64_t, N> b) { + return Mask128<int64_t, N>{wasm_i64x2_gt(a.raw, b.raw)}; +} + +template <size_t N> +HWY_API Mask128<uint8_t, N> operator>(const Vec128<uint8_t, N> a, + const Vec128<uint8_t, N> b) { + return Mask128<uint8_t, N>{wasm_u8x16_gt(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Mask128<uint16_t, N> operator>(const Vec128<uint16_t, N> a, + const Vec128<uint16_t, N> b) { + return Mask128<uint16_t, N>{wasm_u16x8_gt(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Mask128<uint32_t, N> operator>(const Vec128<uint32_t, N> a, + const Vec128<uint32_t, N> b) { + return Mask128<uint32_t, N>{wasm_u32x4_gt(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Mask128<uint64_t, N> operator>(const Vec128<uint64_t, N> a, + const Vec128<uint64_t, N> b) { + const DFromV<decltype(a)> d; + const 
Repartition<uint32_t, decltype(d)> d32; + const auto a32 = BitCast(d32, a); + const auto b32 = BitCast(d32, b); + // If the upper halves are not equal, this is the answer. + const auto m_gt = a32 > b32; + + // Otherwise, the lower half decides. + const auto m_eq = a32 == b32; + const auto lo_in_hi = wasm_i32x4_shuffle(m_gt.raw, m_gt.raw, 0, 0, 2, 2); + const auto lo_gt = And(m_eq, MaskFromVec(VFromD<decltype(d32)>{lo_in_hi})); + + const auto gt = Or(lo_gt, m_gt); + // Copy result in upper 32 bits to lower 32 bits. + return Mask128<uint64_t, N>{wasm_i32x4_shuffle(gt.raw, gt.raw, 1, 1, 3, 3)}; +} + +template <size_t N> +HWY_API Mask128<float, N> operator>(const Vec128<float, N> a, + const Vec128<float, N> b) { + return Mask128<float, N>{wasm_f32x4_gt(a.raw, b.raw)}; +} + +template <typename T, size_t N> +HWY_API Mask128<T, N> operator<(const Vec128<T, N> a, const Vec128<T, N> b) { + return operator>(b, a); +} + +// ------------------------------ Weak inequality + +// Float >= +template <size_t N> +HWY_API Mask128<float, N> operator>=(const Vec128<float, N> a, + const Vec128<float, N> b) { + return Mask128<float, N>{wasm_f32x4_ge(a.raw, b.raw)}; +} + +template <size_t N> +HWY_API Mask128<int8_t, N> operator>=(const Vec128<int8_t, N> a, + const Vec128<int8_t, N> b) { + return Mask128<int8_t, N>{wasm_i8x16_ge(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Mask128<int16_t, N> operator>=(const Vec128<int16_t, N> a, + const Vec128<int16_t, N> b) { + return Mask128<int16_t, N>{wasm_i16x8_ge(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Mask128<int32_t, N> operator>=(const Vec128<int32_t, N> a, + const Vec128<int32_t, N> b) { + return Mask128<int32_t, N>{wasm_i32x4_ge(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Mask128<int64_t, N> operator>=(const Vec128<int64_t, N> a, + const Vec128<int64_t, N> b) { + return Mask128<int64_t, N>{wasm_i64x2_ge(a.raw, b.raw)}; +} + +template <size_t N> +HWY_API Mask128<uint8_t, N> operator>=(const Vec128<uint8_t, N> a, + const 
Vec128<uint8_t, N> b) { + return Mask128<uint8_t, N>{wasm_u8x16_ge(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Mask128<uint16_t, N> operator>=(const Vec128<uint16_t, N> a, + const Vec128<uint16_t, N> b) { + return Mask128<uint16_t, N>{wasm_u16x8_ge(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Mask128<uint32_t, N> operator>=(const Vec128<uint32_t, N> a, + const Vec128<uint32_t, N> b) { + return Mask128<uint32_t, N>{wasm_u32x4_ge(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Mask128<uint64_t, N> operator>=(const Vec128<uint64_t, N> a, + const Vec128<uint64_t, N> b) { + return Not(b > a); +} + +template <typename T, size_t N> +HWY_API Mask128<T, N> operator<=(const Vec128<T, N> a, const Vec128<T, N> b) { + return operator>=(b, a); +} + +// ------------------------------ FirstN (Iota, Lt) + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_API MFromD<D> FirstN(D d, size_t num) { + const RebindToSigned<decltype(d)> di; // Signed comparisons may be cheaper. + using TI = TFromD<decltype(di)>; + return RebindMask(d, Iota(di, 0) < Set(di, static_cast<TI>(num))); +} + +// ================================================== LOGICAL + +// ------------------------------ Not + +template <typename T, size_t N> +HWY_API Vec128<T, N> Not(Vec128<T, N> v) { + return Vec128<T, N>{wasm_v128_not(v.raw)}; +} + +// ------------------------------ And + +template <typename T, size_t N> +HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) { + return Vec128<T, N>{wasm_v128_and(a.raw, b.raw)}; +} + +// ------------------------------ AndNot + +// Returns ~not_mask & mask. 
template <typename T, size_t N>
HWY_API Vec128<T, N> AndNot(Vec128<T, N> not_mask, Vec128<T, N> mask) {
  // wasm_v128_andnot(a, b) computes a & ~b, hence the swapped operand order.
  return Vec128<T, N>{wasm_v128_andnot(mask.raw, not_mask.raw)};
}

// ------------------------------ Or

template <typename T, size_t N>
HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{wasm_v128_or(a.raw, b.raw)};
}

// ------------------------------ Xor

template <typename T, size_t N>
HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{wasm_v128_xor(a.raw, b.raw)};
}

// ------------------------------ Xor3

// Returns x1 ^ x2 ^ x3.
template <typename T, size_t N>
HWY_API Vec128<T, N> Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) {
  return Xor(x1, Xor(x2, x3));
}

// ------------------------------ Or3

// Returns o1 | o2 | o3.
template <typename T, size_t N>
HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
  return Or(o1, Or(o2, o3));
}

// ------------------------------ OrAnd

// Returns o | (a1 & a2).
template <typename T, size_t N>
HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
  return Or(o, And(a1, a2));
}

// ------------------------------ IfVecThenElse

// Same as IfThenElse, but the selector is a vector reinterpreted as a mask.
template <typename T, size_t N>
HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
                                   Vec128<T, N> no) {
  return IfThenElse(MaskFromVec(mask), yes, no);
}

// ------------------------------ Operator overloads (internal-only if float)

template <typename T, size_t N>
HWY_API Vec128<T, N> operator&(const Vec128<T, N> a, const Vec128<T, N> b) {
  return And(a, b);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> operator|(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Or(a, b);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> operator^(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Xor(a, b);
}

// ------------------------------ CopySign

// Returns magn with its sign bit replaced by that of sign.
template <typename T, size_t N>
HWY_API Vec128<T, N> CopySign(const Vec128<T, N> magn,
                              const Vec128<T, N> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  const auto msb = SignBit(DFromV<decltype(magn)>());
  return Or(AndNot(msb, magn), And(msb, sign));
}

// Like CopySign, but ORs the sign of `sign` into `abs` without first clearing
// the sign bit of `abs`.
template <typename T, size_t N>
HWY_API Vec128<T, N> CopySignToAbs(const Vec128<T, N> abs,
                                   const Vec128<T, N> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  return Or(abs, And(SignBit(DFromV<decltype(abs)>()), sign));
}

// ------------------------------ BroadcastSignBit (compare)

// Lanes wider than one byte: shift right by (lane bits - 1).
template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
HWY_API Vec128<T, N> BroadcastSignBit(const Vec128<T, N> v) {
  return ShiftRight<sizeof(T) * 8 - 1>(v);
}
// int8_t lanes: implemented as a comparison with zero instead of a shift.
template <size_t N>
HWY_API Vec128<int8_t, N> BroadcastSignBit(const Vec128<int8_t, N> v) {
  const DFromV<decltype(v)> d;
  return VecFromMask(d, v < Zero(d));
}

// ------------------------------ Mask

// Masks and vectors share the same raw representation, so this is free.
template <class D>
HWY_API VFromD<D> VecFromMask(D /* tag */, MFromD<D> v) {
  return VFromD<D>{v.raw};
}

// mask ? yes : no
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
                                Vec128<T, N> no) {
  return Vec128<T, N>{wasm_v128_bitselect(yes.raw, no.raw, mask.raw)};
}

// mask ? yes : 0
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
  return yes & VecFromMask(DFromV<decltype(yes)>(), mask);
}

// mask ? 0 : no
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
  return AndNot(VecFromMask(DFromV<decltype(no)>(), mask), no);
}

// Selects yes in lanes whose sign bit is set in v, otherwise no.
template <typename T, size_t N>
HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
                                        Vec128<T, N> no) {
  static_assert(IsSigned<T>(), "Only works for signed/float");
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;

  // Spread the sign bit across the whole lane so it can serve as a mask.
  v = BitCast(d, BroadcastSignBit(BitCast(di, v)));
  return IfThenElse(MaskFromVec(v), yes, no);
}

// Keeps lanes that compare greater than zero; all other lanes become zero.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  const auto zero = Zero(d);
  return IfThenElse(Mask128<T, N>{(v > zero).raw}, v, zero);
}

// ------------------------------ Mask logical

// Each mask operation converts to vectors, applies the vector op and converts
// back.

template <typename T, size_t N>
HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
  const DFromM<decltype(m)> d;
  return MaskFromVec(Not(VecFromMask(d, m)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
  const DFromM<decltype(a)> d;
  return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
}

// Returns ~a & b.
template <typename T, size_t N>
HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
  const DFromM<decltype(a)> d;
  return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
  const DFromM<decltype(a)> d;
  return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
  const DFromM<decltype(a)> d;
  return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
}

// Returns ~a & ~b, i.e. true only where neither input is true.
template <typename T, size_t N>
HWY_API Mask128<T, N> ExclusiveNeither(const Mask128<T, N> a, Mask128<T, N> b) {
  const DFromM<decltype(a)> d;
  return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
}

// ------------------------------ Shl (BroadcastSignBit, IfThenElse)

// The x86 multiply-by-Pow2() trick will not work because WASM saturates
// float->int correctly to 2^31-1 (not 2^31). Because WASM's shifts take a
// scalar count operand, per-lane shift instructions would require extract_lane
// for each lane, and hoping that shuffle is correctly mapped to a native
// instruction. Using non-vector shifts would incur a store-load forwarding
// stall when loading the result vector. We instead test bits of the shift
// count to "predicate" a shift of the entire vector by a constant.

// 8-bit lanes: the count has 3 valid bits, applied as shifts by 4, 2 and 1.
template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, const Vec128<T, N> bits) {
  const DFromV<decltype(v)> d;
  Mask128<T, N> mask;
  // Need a signed type for BroadcastSignBit.
  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
  // Move the highest valid bit of the shift count into the sign bit.
  test = ShiftLeft<5>(test);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<4>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<2>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  return IfThenElse(mask, ShiftLeft<1>(v), v);
}

// 16-bit lanes: 4 valid count bits, applied as shifts by 8, 4, 2 and 1.
template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, const Vec128<T, N> bits) {
  const DFromV<decltype(v)> d;
  Mask128<T, N> mask;
  // Need a signed type for BroadcastSignBit.
  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
  // Move the highest valid bit of the shift count into the sign bit.
  test = ShiftLeft<12>(test);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<8>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<4>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<2>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  return IfThenElse(mask, ShiftLeft<1>(v), v);
}

// 32-bit lanes: 5 valid count bits, applied as shifts by 16, 8, 4, 2 and 1.
template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, const Vec128<T, N> bits) {
  const DFromV<decltype(v)> d;
  Mask128<T, N> mask;
  // Need a signed type for BroadcastSignBit.
  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
  // Move the highest valid bit of the shift count into the sign bit.
  test = ShiftLeft<27>(test);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<16>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<8>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<4>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<2>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  return IfThenElse(mask, ShiftLeft<1>(v), v);
}

// 64-bit lanes: stores both operands to memory and shifts each lane with a
// scalar shift.
template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, const Vec128<T, N> bits) {
  const DFromV<decltype(v)> d;
  alignas(16) T lanes[2];
  alignas(16) T bits_lanes[2];
  Store(v, d, lanes);
  Store(bits, d, bits_lanes);
  lanes[0] <<= bits_lanes[0];
  lanes[1] <<= bits_lanes[1];
  return Load(d, lanes);
}

// ------------------------------ Shr (BroadcastSignBit, IfThenElse)

// Same predicated scheme as Shl, with ShiftRight applied to v.

// 8-bit lanes
template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, const Vec128<T, N> bits) {
  const DFromV<decltype(v)> d;
  Mask128<T, N> mask;
  // Need a signed type for BroadcastSignBit.
  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
  // Move the highest valid bit of the shift count into the sign bit.
  test = ShiftLeft<5>(test);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<4>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<2>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  return IfThenElse(mask, ShiftRight<1>(v), v);
}

// 16-bit lanes
template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, const Vec128<T, N> bits) {
  const DFromV<decltype(v)> d;
  Mask128<T, N> mask;
  // Need a signed type for BroadcastSignBit.
  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
  // Move the highest valid bit of the shift count into the sign bit.
  test = ShiftLeft<12>(test);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<8>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<4>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<2>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  return IfThenElse(mask, ShiftRight<1>(v), v);
}

// 32-bit lanes
template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, const Vec128<T, N> bits) {
  const DFromV<decltype(v)> d;
  Mask128<T, N> mask;
  // Need a signed type for BroadcastSignBit.
  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
  // Move the highest valid bit of the shift count into the sign bit.
  test = ShiftLeft<27>(test);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<16>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<8>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<4>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<2>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  return IfThenElse(mask, ShiftRight<1>(v), v);
}

// ================================================== MEMORY

// ------------------------------ Load

// Full 16-byte vector.
template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
HWY_API Vec128<T> Load(D /* tag */, const T* HWY_RESTRICT aligned) {
  return Vec128<T>{wasm_v128_load(aligned)};
}

// Partial
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT p) {
  VFromD<D> v;
  // Copies only d.MaxBytes() bytes; the rest of v is not written here.
  CopyBytes<d.MaxBytes()>(p, &v);
  return v;
}

// LoadU == Load.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
  return Load(d, p);
}

// 128-bit SIMD => nothing to duplicate, same as an unaligned load.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* HWY_RESTRICT p) {
  return Load(d, p);
}

// Loads a whole vector, then zeroes the lanes where m is false. Note that the
// underlying Load is unconditional.
template <class D, typename T = TFromD<D>>
HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d, const T* HWY_RESTRICT aligned) {
  return IfThenElseZero(m, Load(d, aligned));
}

// As MaskedLoad, but lanes where m is false take their value from v.
template <class D, typename T = TFromD<D>>
HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d,
                               const T* HWY_RESTRICT aligned) {
  return IfThenElse(m, Load(d, aligned), v);
}

// ------------------------------ Store

// Full 16-byte vector.
template <class D, HWY_IF_V_SIZE_D(D, 16)>
HWY_API void Store(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT aligned) {
  wasm_v128_store(aligned, v.raw);
}

// Partial
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
  // Writes only d.MaxBytes() bytes to p.
  CopyBytes<d.MaxBytes()>(&v, p);
}

// Single float lane: extract lane 0 and store it as a scalar.
template <class D, HWY_IF_LANES_D(D, 1), HWY_IF_F32_D(D)>
HWY_API void Store(Vec128<float, 1> v, D /* tag */, float* HWY_RESTRICT p) {
  *p = wasm_f32x4_extract_lane(v.raw, 0);
}

// StoreU == Store.
template <class D>
HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
  Store(v, d, p);
}

// Read-modify-write: loads the current contents of p, replaces the lanes where
// m is true with v, and stores the whole vector back.
template <class D>
HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d,
                          TFromD<D>* HWY_RESTRICT p) {
  StoreU(IfThenElse(m, v, LoadU(d, p)), d, p);
}

// ------------------------------ Non-temporal stores

// Same as aligned stores on non-x86.
+ +template <class D> +HWY_API void Stream(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT aligned) { + wasm_v128_store(aligned, v.raw); +} + +// ------------------------------ Scatter (Store) + +template <class D, typename T = TFromD<D>, class VI> +HWY_API void ScatterOffset(VFromD<D> v, D d, T* HWY_RESTRICT base, VI offset) { + using TI = TFromV<VI>; + static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match"); + + HWY_ALIGN T lanes[MaxLanes(d)]; + Store(v, d, lanes); + + HWY_ALIGN TI offset_lanes[MaxLanes(d)]; + Store(offset, Rebind<TI, decltype(d)>(), offset_lanes); + + uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base); + for (size_t i = 0; i < MaxLanes(d); ++i) { + CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]); + } +} + +template <class D, typename T = TFromD<D>, class VI> +HWY_API void ScatterIndex(VFromD<D> v, D d, T* HWY_RESTRICT base, VI index) { + using TI = TFromV<VI>; + static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match"); + + HWY_ALIGN T lanes[MaxLanes(d)]; + Store(v, d, lanes); + + HWY_ALIGN TI index_lanes[MaxLanes(d)]; + Store(index, Rebind<TI, decltype(d)>(), index_lanes); + + for (size_t i = 0; i < MaxLanes(d); ++i) { + base[index_lanes[i]] = lanes[i]; + } +} + +// ------------------------------ Gather (Load/Store) + +template <class D, typename T = TFromD<D>, class VI> +HWY_API VFromD<D> GatherOffset(D d, const T* HWY_RESTRICT base, VI offset) { + using TI = TFromV<VI>; + static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match"); + + HWY_ALIGN TI offset_lanes[MaxLanes(d)]; + Store(offset, Rebind<TI, decltype(d)>(), offset_lanes); + + HWY_ALIGN T lanes[MaxLanes(d)]; + const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base); + for (size_t i = 0; i < MaxLanes(d); ++i) { + CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]); + } + return Load(d, lanes); +} + +template <class D, typename T = TFromD<D>, class VI> +HWY_API VFromD<D> GatherIndex(D d, const T* HWY_RESTRICT 
base, VI index) { + using TI = TFromV<VI>; + static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match"); + + HWY_ALIGN TI index_lanes[MaxLanes(d)]; + Store(index, Rebind<TI, decltype(d)>(), index_lanes); + + HWY_ALIGN T lanes[MaxLanes(d)]; + for (size_t i = 0; i < MaxLanes(d); ++i) { + lanes[i] = base[index_lanes[i]]; + } + return Load(d, lanes); +} + +// ================================================== SWIZZLE + +// ------------------------------ ExtractLane + +namespace detail { + +template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 1)> +HWY_INLINE T ExtractLane(const Vec128<T, N> v) { + return static_cast<T>(wasm_i8x16_extract_lane(v.raw, kLane)); +} +template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 2)> +HWY_INLINE T ExtractLane(const Vec128<T, N> v) { + return static_cast<T>(wasm_i16x8_extract_lane(v.raw, kLane)); +} +template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 4)> +HWY_INLINE T ExtractLane(const Vec128<T, N> v) { + return static_cast<T>(wasm_i32x4_extract_lane(v.raw, kLane)); +} +template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 8)> +HWY_INLINE T ExtractLane(const Vec128<T, N> v) { + return static_cast<T>(wasm_i64x2_extract_lane(v.raw, kLane)); +} + +template <size_t kLane, size_t N> +HWY_INLINE float ExtractLane(const Vec128<float, N> v) { + return wasm_f32x4_extract_lane(v.raw, kLane); +} + +} // namespace detail + +// One overload per vector length just in case *_extract_lane raise compile +// errors if their argument is out of bounds (even if that would never be +// reached at runtime). 
+template <typename T> +HWY_API T ExtractLane(const Vec128<T, 1> v, size_t i) { + HWY_DASSERT(i == 0); + (void)i; + return GetLane(v); +} + +template <typename T> +HWY_API T ExtractLane(const Vec128<T, 2> v, size_t i) { +#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang + if (__builtin_constant_p(i)) { + switch (i) { + case 0: + return detail::ExtractLane<0>(v); + case 1: + return detail::ExtractLane<1>(v); + } + } +#endif + alignas(16) T lanes[2]; + Store(v, DFromV<decltype(v)>(), lanes); + return lanes[i]; +} + +template <typename T> +HWY_API T ExtractLane(const Vec128<T, 4> v, size_t i) { +#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang + if (__builtin_constant_p(i)) { + switch (i) { + case 0: + return detail::ExtractLane<0>(v); + case 1: + return detail::ExtractLane<1>(v); + case 2: + return detail::ExtractLane<2>(v); + case 3: + return detail::ExtractLane<3>(v); + } + } +#endif + alignas(16) T lanes[4]; + Store(v, DFromV<decltype(v)>(), lanes); + return lanes[i]; +} + +template <typename T> +HWY_API T ExtractLane(const Vec128<T, 8> v, size_t i) { +#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang + if (__builtin_constant_p(i)) { + switch (i) { + case 0: + return detail::ExtractLane<0>(v); + case 1: + return detail::ExtractLane<1>(v); + case 2: + return detail::ExtractLane<2>(v); + case 3: + return detail::ExtractLane<3>(v); + case 4: + return detail::ExtractLane<4>(v); + case 5: + return detail::ExtractLane<5>(v); + case 6: + return detail::ExtractLane<6>(v); + case 7: + return detail::ExtractLane<7>(v); + } + } +#endif + alignas(16) T lanes[8]; + Store(v, DFromV<decltype(v)>(), lanes); + return lanes[i]; +} + +template <typename T> +HWY_API T ExtractLane(const Vec128<T, 16> v, size_t i) { +#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang + if (__builtin_constant_p(i)) { + switch (i) { + case 0: + return detail::ExtractLane<0>(v); + case 1: + return detail::ExtractLane<1>(v); + case 2: + return 
detail::ExtractLane<2>(v); + case 3: + return detail::ExtractLane<3>(v); + case 4: + return detail::ExtractLane<4>(v); + case 5: + return detail::ExtractLane<5>(v); + case 6: + return detail::ExtractLane<6>(v); + case 7: + return detail::ExtractLane<7>(v); + case 8: + return detail::ExtractLane<8>(v); + case 9: + return detail::ExtractLane<9>(v); + case 10: + return detail::ExtractLane<10>(v); + case 11: + return detail::ExtractLane<11>(v); + case 12: + return detail::ExtractLane<12>(v); + case 13: + return detail::ExtractLane<13>(v); + case 14: + return detail::ExtractLane<14>(v); + case 15: + return detail::ExtractLane<15>(v); + } + } +#endif + alignas(16) T lanes[16]; + Store(v, DFromV<decltype(v)>(), lanes); + return lanes[i]; +} + +// ------------------------------ GetLane +template <typename T, size_t N> +HWY_API T GetLane(const Vec128<T, N> v) { + return detail::ExtractLane<0>(v); +} + +// ------------------------------ InsertLane + +namespace detail { + +template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 1)> +HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) { + static_assert(kLane < N, "Lane index out of bounds"); + return Vec128<T, N>{ + wasm_i8x16_replace_lane(v.raw, kLane, static_cast<int8_t>(t))}; +} + +template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 2)> +HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) { + static_assert(kLane < N, "Lane index out of bounds"); + return Vec128<T, N>{ + wasm_i16x8_replace_lane(v.raw, kLane, static_cast<int16_t>(t))}; +} + +template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 4)> +HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) { + static_assert(kLane < N, "Lane index out of bounds"); + return Vec128<T, N>{ + wasm_i32x4_replace_lane(v.raw, kLane, static_cast<int32_t>(t))}; +} + +template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 8)> +HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) { + static_assert(kLane < N, "Lane index 
out of bounds"); + return Vec128<T, N>{ + wasm_i64x2_replace_lane(v.raw, kLane, static_cast<int64_t>(t))}; +} + +template <size_t kLane, size_t N> +HWY_INLINE Vec128<float, N> InsertLane(const Vec128<float, N> v, float t) { + static_assert(kLane < N, "Lane index out of bounds"); + return Vec128<float, N>{wasm_f32x4_replace_lane(v.raw, kLane, t)}; +} + +template <size_t kLane, size_t N> +HWY_INLINE Vec128<double, N> InsertLane(const Vec128<double, N> v, double t) { + static_assert(kLane < 2, "Lane index out of bounds"); + return Vec128<double, N>{wasm_f64x2_replace_lane(v.raw, kLane, t)}; +} + +} // namespace detail + +// Requires one overload per vector length because InsertLane<3> may be a +// compile error if it calls wasm_f64x2_replace_lane. + +template <typename T> +HWY_API Vec128<T, 1> InsertLane(const Vec128<T, 1> v, size_t i, T t) { + HWY_DASSERT(i == 0); + (void)i; + return Set(DFromV<decltype(v)>(), t); +} + +template <typename T> +HWY_API Vec128<T, 2> InsertLane(const Vec128<T, 2> v, size_t i, T t) { +#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang + if (__builtin_constant_p(i)) { + switch (i) { + case 0: + return detail::InsertLane<0>(v, t); + case 1: + return detail::InsertLane<1>(v, t); + } + } +#endif + const DFromV<decltype(v)> d; + alignas(16) T lanes[2]; + Store(v, d, lanes); + lanes[i] = t; + return Load(d, lanes); +} + +template <typename T> +HWY_API Vec128<T, 4> InsertLane(const Vec128<T, 4> v, size_t i, T t) { +#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang + if (__builtin_constant_p(i)) { + switch (i) { + case 0: + return detail::InsertLane<0>(v, t); + case 1: + return detail::InsertLane<1>(v, t); + case 2: + return detail::InsertLane<2>(v, t); + case 3: + return detail::InsertLane<3>(v, t); + } + } +#endif + const DFromV<decltype(v)> d; + alignas(16) T lanes[4]; + Store(v, d, lanes); + lanes[i] = t; + return Load(d, lanes); +} + +template <typename T> +HWY_API Vec128<T, 8> InsertLane(const Vec128<T, 8> v, size_t 
i, T t) { +#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang + if (__builtin_constant_p(i)) { + switch (i) { + case 0: + return detail::InsertLane<0>(v, t); + case 1: + return detail::InsertLane<1>(v, t); + case 2: + return detail::InsertLane<2>(v, t); + case 3: + return detail::InsertLane<3>(v, t); + case 4: + return detail::InsertLane<4>(v, t); + case 5: + return detail::InsertLane<5>(v, t); + case 6: + return detail::InsertLane<6>(v, t); + case 7: + return detail::InsertLane<7>(v, t); + } + } +#endif + const DFromV<decltype(v)> d; + alignas(16) T lanes[8]; + Store(v, d, lanes); + lanes[i] = t; + return Load(d, lanes); +} + +template <typename T> +HWY_API Vec128<T, 16> InsertLane(const Vec128<T, 16> v, size_t i, T t) { +#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang + if (__builtin_constant_p(i)) { + switch (i) { + case 0: + return detail::InsertLane<0>(v, t); + case 1: + return detail::InsertLane<1>(v, t); + case 2: + return detail::InsertLane<2>(v, t); + case 3: + return detail::InsertLane<3>(v, t); + case 4: + return detail::InsertLane<4>(v, t); + case 5: + return detail::InsertLane<5>(v, t); + case 6: + return detail::InsertLane<6>(v, t); + case 7: + return detail::InsertLane<7>(v, t); + case 8: + return detail::InsertLane<8>(v, t); + case 9: + return detail::InsertLane<9>(v, t); + case 10: + return detail::InsertLane<10>(v, t); + case 11: + return detail::InsertLane<11>(v, t); + case 12: + return detail::InsertLane<12>(v, t); + case 13: + return detail::InsertLane<13>(v, t); + case 14: + return detail::InsertLane<14>(v, t); + case 15: + return detail::InsertLane<15>(v, t); + } + } +#endif + const DFromV<decltype(v)> d; + alignas(16) T lanes[16]; + Store(v, d, lanes); + lanes[i] = t; + return Load(d, lanes); +} + +// ------------------------------ LowerHalf + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_API VFromD<D> LowerHalf(D /* tag */, VFromD<Twice<D>> v) { + return VFromD<D>{v.raw}; +} +template <typename T, size_t N> 
+HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) { + return Vec128<T, N / 2>{v.raw}; +} + +// ------------------------------ ShiftLeftBytes + +// 0x01..0F, kBytes = 1 => 0x02..0F00 +template <int kBytes, class D> +HWY_API VFromD<D> ShiftLeftBytes(D /* tag */, VFromD<D> v) { + static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); + const __i8x16 zero = wasm_i8x16_splat(0); + switch (kBytes) { + case 0: + return v; + + case 1: + return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 0, 1, 2, 3, 4, 5, 6, + 7, 8, 9, 10, 11, 12, 13, 14)}; + + case 2: + return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 0, 1, 2, 3, 4, 5, + 6, 7, 8, 9, 10, 11, 12, 13)}; + + case 3: + return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 0, 1, 2, 3, + 4, 5, 6, 7, 8, 9, 10, 11, 12)}; + + case 4: + return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 0, 1, 2, + 3, 4, 5, 6, 7, 8, 9, 10, 11)}; + + case 5: + return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 0, 1, + 2, 3, 4, 5, 6, 7, 8, 9, 10)}; + + case 6: + return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9)}; + + case 7: + return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, + 16, 0, 1, 2, 3, 4, 5, 6, 7, 8)}; + + case 8: + return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, + 16, 16, 0, 1, 2, 3, 4, 5, 6, 7)}; + + case 9: + return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 0, 1, 2, 3, 4, 5, 6)}; + + case 10: + return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 0, 1, 2, 3, 4, 5)}; + + case 11: + return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 0, 1, 2, 3, 4)}; + + case 12: + return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 0, 1, 2, 3)}; + + case 13: + return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, + 16, 
16, 16, 16, 16, 16, 16, 0, 1, 2)}; + + case 14: + return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 0, + 1)}; + + case 15: + return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, + 0)}; + } + return VFromD<D>{zero}; +} + +template <int kBytes, typename T, size_t N> +HWY_API Vec128<T, N> ShiftLeftBytes(Vec128<T, N> v) { + return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v); +} + +// ------------------------------ ShiftLeftLanes + +template <int kLanes, class D> +HWY_API VFromD<D> ShiftLeftLanes(D d, const VFromD<D> v) { + const Repartition<uint8_t, decltype(d)> d8; + constexpr size_t kBytes = kLanes * sizeof(TFromD<D>); + return BitCast(d, ShiftLeftBytes<kBytes>(BitCast(d8, v))); +} + +template <int kLanes, typename T, size_t N> +HWY_API Vec128<T, N> ShiftLeftLanes(const Vec128<T, N> v) { + return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v); +} + +// ------------------------------ ShiftRightBytes +namespace detail { + +// Helper function allows zeroing invalid lanes in caller. 
+template <int kBytes, typename T, size_t N> +HWY_API __i8x16 ShrBytes(const Vec128<T, N> v) { + static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); + const __i8x16 zero = wasm_i8x16_splat(0); + + switch (kBytes) { + case 0: + return v.raw; + + case 1: + return wasm_i8x16_shuffle(v.raw, zero, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16); + + case 2: + return wasm_i8x16_shuffle(v.raw, zero, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 16); + + case 3: + return wasm_i8x16_shuffle(v.raw, zero, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 16, 16); + + case 4: + return wasm_i8x16_shuffle(v.raw, zero, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 16, 16, 16); + + case 5: + return wasm_i8x16_shuffle(v.raw, zero, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 16, 16, 16, 16); + + case 6: + return wasm_i8x16_shuffle(v.raw, zero, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 16, 16, 16, 16, 16); + + case 7: + return wasm_i8x16_shuffle(v.raw, zero, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 16, 16, 16, 16, 16, 16); + + case 8: + return wasm_i8x16_shuffle(v.raw, zero, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 16, 16, 16, 16, 16, 16, 16); + + case 9: + return wasm_i8x16_shuffle(v.raw, zero, 9, 10, 11, 12, 13, 14, 15, 16, 16, + 16, 16, 16, 16, 16, 16, 16); + + case 10: + return wasm_i8x16_shuffle(v.raw, zero, 10, 11, 12, 13, 14, 15, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16); + + case 11: + return wasm_i8x16_shuffle(v.raw, zero, 11, 12, 13, 14, 15, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16); + + case 12: + return wasm_i8x16_shuffle(v.raw, zero, 12, 13, 14, 15, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16); + + case 13: + return wasm_i8x16_shuffle(v.raw, zero, 13, 14, 15, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16); + + case 14: + return wasm_i8x16_shuffle(v.raw, zero, 14, 15, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16); + + case 15: + return wasm_i8x16_shuffle(v.raw, zero, 15, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 
16, 16, 16); + case 16: + return zero; + } +} + +} // namespace detail + +// 0x01..0F, kBytes = 1 => 0x0001..0E +template <int kBytes, class D> +HWY_API VFromD<D> ShiftRightBytes(D d, VFromD<D> v) { + // For partial vectors, clear upper lanes so we shift in zeros. + if (d.MaxBytes() != 16) { + const Full128<TFromD<D>> dfull; + const VFromD<decltype(dfull)> vfull{v.raw}; + v = VFromD<D>{IfThenElseZero(FirstN(dfull, MaxLanes(d)), vfull).raw}; + } + return VFromD<D>{detail::ShrBytes<kBytes>(v)}; +} + +// ------------------------------ ShiftRightLanes +template <int kLanes, class D> +HWY_API VFromD<D> ShiftRightLanes(D d, const VFromD<D> v) { + const Repartition<uint8_t, decltype(d)> d8; + constexpr size_t kBytes = kLanes * sizeof(TFromD<D>); + return BitCast(d, ShiftRightBytes<kBytes>(d8, BitCast(d8, v))); +} + +// ------------------------------ UpperHalf (ShiftRightBytes) + +template <class D, typename T = TFromD<D>> +HWY_API Vec64<T> UpperHalf(D /* tag */, const Vec128<T> v) { + return Vec64<T>{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)}; +} + +// Partial +template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> +HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) { + return LowerHalf(d, ShiftRightBytes<d.MaxBytes()>(Twice<D>(), v)); +} + +// ------------------------------ CombineShiftRightBytes + +template <int kBytes, class D, typename T = TFromD<D>> +HWY_API Vec128<T> CombineShiftRightBytes(D /* tag */, Vec128<T> hi, + Vec128<T> lo) { + static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); + switch (kBytes) { + case 0: + return lo; + + case 1: + return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, 16)}; + + case 2: + return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, 17)}; + + case 3: + return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 17, 18)}; + + case 4: + return Vec128<T>{wasm_i8x16_shuffle(lo.raw, 
hi.raw, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 17, 18, 19)}; + + case 5: + return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19, 20)}; + + case 6: + return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19, 20, + 21)}; + + case 7: + return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22)}; + + case 8: + return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 19, 20, 21, 22, + 23)}; + + case 9: + return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24)}; + + case 10: + return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25)}; + + case 11: + return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26)}; + + case 12: + return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, + 26, 27)}; + + case 13: + return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 22, 23, 24, 25, 26, + 27, 28)}; + + case 14: + return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 14, 15, 16, 17, 18, + 19, 20, 21, 22, 23, 24, 25, 26, 27, + 28, 29)}; + + case 15: + return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30)}; + } + return hi; +} + +template <int kBytes, class D, HWY_IF_V_SIZE_LE_D(D, 8)> +HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) { + constexpr size_t kSize = d.MaxBytes(); + static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid"); + const Repartition<uint8_t, decltype(d)> d8; + using V8 = Vec128<uint8_t>; + const DFromV<V8> dfull8; + const Repartition<TFromD<D>, decltype(dfull8)> dfull; + const V8 
hi8{BitCast(d8, hi).raw}; + // Move into most-significant bytes + const V8 lo8 = ShiftLeftBytes<16 - kSize>(V8{BitCast(d8, lo).raw}); + const V8 r = CombineShiftRightBytes<16 - kSize + kBytes>(dfull8, hi8, lo8); + return VFromD<D>{BitCast(dfull, r).raw}; +} + +// ------------------------------ Broadcast/splat any lane + +template <int kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 2)> +HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); + return Vec128<T, N>{wasm_i16x8_shuffle(v.raw, v.raw, kLane, kLane, kLane, + kLane, kLane, kLane, kLane, kLane)}; +} + +template <int kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); + return Vec128<T, N>{ + wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)}; +} + +template <int kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 8)> +HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); + return Vec128<T, N>{wasm_i64x2_shuffle(v.raw, v.raw, kLane, kLane)}; +} + +// ------------------------------ TableLookupBytes + +// Returns vector of bytes[from[i]]. "from" is also interpreted as bytes, i.e. +// lane indices in [0, 16). +template <typename T, size_t N, typename TI, size_t NI> +HWY_API Vec128<TI, NI> TableLookupBytes(const Vec128<T, N> bytes, + const Vec128<TI, NI> from) { + return Vec128<TI, NI>{wasm_i8x16_swizzle(bytes.raw, from.raw)}; +} + +template <typename T, size_t N, typename TI, size_t NI> +HWY_API Vec128<TI, NI> TableLookupBytesOr0(const Vec128<T, N> bytes, + const Vec128<TI, NI> from) { + const DFromV<decltype(from)> d; + // Mask size must match vector type, so cast everything to this type. 
+ Repartition<int8_t, decltype(d)> di8; + Repartition<int8_t, DFromV<decltype(bytes)>> d_bytes8; + const auto msb = BitCast(di8, from) < Zero(di8); + const auto lookup = + TableLookupBytes(BitCast(d_bytes8, bytes), BitCast(di8, from)); + return BitCast(d, IfThenZeroElse(msb, lookup)); +} + +// ------------------------------ Hard-coded shuffles + +// Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant). +// Shuffle0321 rotates one lane to the right (the previous least-significant +// lane is now most-significant). These could also be implemented via +// CombineShiftRightBytes but the shuffle_abcd notation is more convenient. + +// Swap 32-bit halves in 64-bit halves. +template <typename T, size_t N> +HWY_API Vec128<T, N> Shuffle2301(const Vec128<T, N> v) { + static_assert(sizeof(T) == 4, "Only for 32-bit lanes"); + static_assert(N == 2 || N == 4, "Does not make sense for N=1"); + return Vec128<T, N>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)}; +} + +// These are used by generic_ops-inl to implement LoadInterleaved3. 
+namespace detail { + +template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec128<T, N> ShuffleTwo2301(const Vec128<T, N> a, + const Vec128<T, N> b) { + static_assert(N == 2 || N == 4, "Does not make sense for N=1"); + return Vec128<T, N>{wasm_i8x16_shuffle(a.raw, b.raw, 1, 0, 3 + 16, 2 + 16, + 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, + 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F)}; +} +template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)> +HWY_API Vec128<T, N> ShuffleTwo2301(const Vec128<T, N> a, + const Vec128<T, N> b) { + static_assert(N == 2 || N == 4, "Does not make sense for N=1"); + return Vec128<T, N>{wasm_i16x8_shuffle(a.raw, b.raw, 1, 0, 3 + 8, 2 + 8, + 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF)}; +} +template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec128<T, N> ShuffleTwo2301(const Vec128<T, N> a, + const Vec128<T, N> b) { + static_assert(N == 2 || N == 4, "Does not make sense for N=1"); + return Vec128<T, N>{wasm_i32x4_shuffle(a.raw, b.raw, 1, 0, 3 + 4, 2 + 4)}; +} + +template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec128<T, N> ShuffleTwo1230(const Vec128<T, N> a, + const Vec128<T, N> b) { + static_assert(N == 2 || N == 4, "Does not make sense for N=1"); + return Vec128<T, N>{wasm_i8x16_shuffle(a.raw, b.raw, 0, 3, 2 + 16, 1 + 16, + 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, + 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F)}; +} +template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)> +HWY_API Vec128<T, N> ShuffleTwo1230(const Vec128<T, N> a, + const Vec128<T, N> b) { + static_assert(N == 2 || N == 4, "Does not make sense for N=1"); + return Vec128<T, N>{wasm_i16x8_shuffle(a.raw, b.raw, 0, 3, 2 + 8, 1 + 8, + 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF)}; +} +template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec128<T, N> ShuffleTwo1230(const Vec128<T, N> a, + const Vec128<T, N> b) { + static_assert(N == 2 || N == 4, "Does not make sense for N=1"); + return Vec128<T, N>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 3, 2 + 4, 1 + 4)}; +} + +template <typename T, size_t N, 
HWY_IF_T_SIZE(T, 1)> +HWY_API Vec128<T, N> ShuffleTwo3012(const Vec128<T, N> a, + const Vec128<T, N> b) { + static_assert(N == 2 || N == 4, "Does not make sense for N=1"); + return Vec128<T, N>{wasm_i8x16_shuffle(a.raw, b.raw, 2, 1, 0 + 16, 3 + 16, + 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, + 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F)}; +} +template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)> +HWY_API Vec128<T, N> ShuffleTwo3012(const Vec128<T, N> a, + const Vec128<T, N> b) { + static_assert(N == 2 || N == 4, "Does not make sense for N=1"); + return Vec128<T, N>{wasm_i16x8_shuffle(a.raw, b.raw, 2, 1, 0 + 8, 3 + 8, + 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF)}; +} +template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec128<T, N> ShuffleTwo3012(const Vec128<T, N> a, + const Vec128<T, N> b) { + static_assert(N == 2 || N == 4, "Does not make sense for N=1"); + return Vec128<T, N>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 1, 0 + 4, 3 + 4)}; +} + +} // namespace detail + +// Swap 64-bit halves +template <typename T> +HWY_API Vec128<T> Shuffle01(const Vec128<T> v) { + static_assert(sizeof(T) == 8, "Only for 64-bit lanes"); + return Vec128<T>{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)}; +} +template <typename T> +HWY_API Vec128<T> Shuffle1032(const Vec128<T> v) { + static_assert(sizeof(T) == 4, "Only for 32-bit lanes"); + return Vec128<T>{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)}; +} + +// Rotate right 32 bits +template <typename T> +HWY_API Vec128<T> Shuffle0321(const Vec128<T> v) { + static_assert(sizeof(T) == 4, "Only for 32-bit lanes"); + return Vec128<T>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)}; +} + +// Rotate left 32 bits +template <typename T> +HWY_API Vec128<T> Shuffle2103(const Vec128<T> v) { + static_assert(sizeof(T) == 4, "Only for 32-bit lanes"); + return Vec128<T>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)}; +} + +// Reverse +template <typename T> +HWY_API Vec128<T> Shuffle0123(const Vec128<T> v) { + static_assert(sizeof(T) == 4, "Only for 32-bit lanes"); + return 
Vec128<T>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)}; +} + +// ------------------------------ TableLookupLanes + +// Returned by SetTableIndices for use by TableLookupLanes. +template <typename T, size_t N = 16 / sizeof(T)> +struct Indices128 { + __v128_u raw; +}; + +namespace detail { + +template <class D, HWY_IF_T_SIZE_D(D, 1)> +HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes( + D d) { + const Repartition<uint8_t, decltype(d)> d8; + return Iota(d8, 0); +} + +template <class D, HWY_IF_T_SIZE_D(D, 2)> +HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes( + D d) { + const Repartition<uint8_t, decltype(d)> d8; + alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = { + 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}; + return Load(d8, kBroadcastLaneBytes); +} + +template <class D, HWY_IF_T_SIZE_D(D, 4)> +HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes( + D d) { + const Repartition<uint8_t, decltype(d)> d8; + alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = { + 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12}; + return Load(d8, kBroadcastLaneBytes); +} + +template <class D, HWY_IF_T_SIZE_D(D, 8)> +HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes( + D d) { + const Repartition<uint8_t, decltype(d)> d8; + alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = { + 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8}; + return Load(d8, kBroadcastLaneBytes); +} + +template <class D, HWY_IF_T_SIZE_D(D, 1)> +HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) { + const Repartition<uint8_t, decltype(d)> d8; + return Zero(d8); +} + +template <class D, HWY_IF_T_SIZE_D(D, 2)> +HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) { + const Repartition<uint8_t, decltype(d)> d8; + alignas(16) static constexpr uint8_t kByteOffsets[16] = { + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1}; + return 
Load(d8, kByteOffsets); +} + +template <class D, HWY_IF_T_SIZE_D(D, 4)> +HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) { + const Repartition<uint8_t, decltype(d)> d8; + alignas(16) static constexpr uint8_t kByteOffsets[16] = { + 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3}; + return Load(d8, kByteOffsets); +} + +template <class D, HWY_IF_T_SIZE_D(D, 8)> +HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) { + const Repartition<uint8_t, decltype(d)> d8; + alignas(16) static constexpr uint8_t kByteOffsets[16] = { + 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7}; + return Load(d8, kByteOffsets); +} + +} // namespace detail + +template <class D, typename TI, HWY_IF_V_SIZE_LE_D(D, 16), + HWY_IF_T_SIZE_D(D, 1)> +HWY_API Indices128<TFromD<D>, MaxLanes(D())> IndicesFromVec( + D d, Vec128<TI, MaxLanes(D())> vec) { + using T = TFromD<D>; + static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); +#if HWY_IS_DEBUG_BUILD + const RebindToUnsigned<decltype(d)> du; + using TU = TFromD<decltype(du)>; + HWY_DASSERT(AllTrue( + du, Lt(BitCast(du, vec), Set(du, static_cast<TU>(MaxLanes(d) * 2))))); +#endif + + (void)d; + return Indices128<TFromD<D>, MaxLanes(D())>{vec.raw}; +} + +template <class D, typename TI, HWY_IF_V_SIZE_LE_D(D, 16), + HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 2) | (1 << 4) | (1 << 8))> +HWY_API Indices128<TFromD<D>, MaxLanes(D())> IndicesFromVec( + D d, Vec128<TI, MaxLanes(D())> vec) { + using T = TFromD<D>; + static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); +#if HWY_IS_DEBUG_BUILD + const RebindToUnsigned<decltype(d)> du; + using TU = TFromD<decltype(du)>; + HWY_DASSERT(AllTrue( + du, Lt(BitCast(du, vec), Set(du, static_cast<TU>(MaxLanes(d) * 2))))); +#endif + + const Repartition<uint8_t, decltype(d)> d8; + using V8 = VFromD<decltype(d8)>; + + // Broadcast each lane index to all bytes of T and shift to bytes + const V8 lane_indices = TableLookupBytes( + BitCast(d8, vec), 
detail::IndicesFromVecBroadcastLaneBytes(d)); + constexpr int kIndexShiftAmt = static_cast<int>(FloorLog2(sizeof(T))); + const V8 byte_indices = ShiftLeft<kIndexShiftAmt>(lane_indices); + const V8 sum = Add(byte_indices, detail::IndicesFromVecByteOffsets(d)); + return Indices128<TFromD<D>, MaxLanes(D())>{BitCast(d, sum).raw}; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), typename TI> +HWY_API Indices128<TFromD<D>, HWY_MAX_LANES_D(D)> SetTableIndices( + D d, const TI* idx) { + const Rebind<TI, decltype(d)> di; + return IndicesFromVec(d, LoadU(di, idx)); +} + +template <typename T, size_t N> +HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) { + using TI = MakeSigned<T>; + const DFromV<decltype(v)> d; + const Rebind<TI, decltype(d)> di; + return BitCast(d, TableLookupBytes(BitCast(di, v), Vec128<TI, N>{idx.raw})); +} + +template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)> +HWY_API Vec128<T, N> TwoTablesLookupLanes(Vec128<T, N> a, Vec128<T, N> b, + Indices128<T, N> idx) { + const DFromV<decltype(a)> d; + const Twice<decltype(d)> dt; +// TableLookupLanes currently requires table and index vectors to be the same +// size, though a half-length index vector would be sufficient here. +#if HWY_IS_MSAN + const Vec128<T, N> idx_vec{idx.raw}; + const Indices128<T, N * 2> idx2{Combine(dt, idx_vec, idx_vec).raw}; +#else + // We only keep LowerHalf of the result, which is valid in idx. + const Indices128<T, N * 2> idx2{idx.raw}; +#endif + return LowerHalf(d, TableLookupLanes(Combine(dt, b, a), idx2)); +} + +template <typename T> +HWY_API Vec128<T> TwoTablesLookupLanes(Vec128<T> a, Vec128<T> b, + Indices128<T> idx) { + const DFromV<decltype(a)> d; + const Repartition<uint8_t, decltype(d)> du8; + + const VFromD<decltype(du8)> byte_idx{idx.raw}; + const auto byte_idx_mod = byte_idx & Set(du8, uint8_t{0x0F}); + // If ANDing did not change the index, it is for the lower half. 
+ const auto is_lo = (byte_idx == byte_idx_mod); + + return BitCast(d, IfThenElse(is_lo, TableLookupBytes(a, byte_idx_mod), + TableLookupBytes(b, byte_idx_mod))); +} + +// ------------------------------ Reverse (Shuffle0123, Shuffle2301, Shuffle01) + +// Single lane: no change +template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 1)> +HWY_API Vec128<T, 1> Reverse(D /* tag */, Vec128<T, 1> v) { + return v; +} + +// 32-bit x2: shuffle +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec64<T> Reverse(D /* tag */, const Vec64<T> v) { + return Vec64<T>{Shuffle2301(Vec128<T>{v.raw}).raw}; +} + +// 64-bit x2: shuffle +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)> +HWY_API Vec128<T> Reverse(D /* tag */, const Vec128<T> v) { + return Shuffle01(v); +} + +// 32-bit x2: shuffle +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec128<T> Reverse(D /* tag */, const Vec128<T> v) { + return Shuffle0123(v); +} + +// 16-bit +template <class D, HWY_IF_T_SIZE_D(D, 2)> +HWY_API VFromD<D> Reverse(D d, const VFromD<D> v) { + const RepartitionToWide<RebindToUnsigned<decltype(d)>> du32; + return BitCast(d, RotateRight<16>(Reverse(du32, BitCast(du32, v)))); +} + +template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_API VFromD<D> Reverse(D d, const VFromD<D> v) { + static constexpr int kN = 16 + Lanes(d); + return VFromD<D>{wasm_i8x16_shuffle( + v.raw, v.raw, + // kN is adjusted to ensure we have valid indices for all lengths. 
+ kN - 1, kN - 2, kN - 3, kN - 4, kN - 5, kN - 6, kN - 7, kN - 8, kN - 9, + kN - 10, kN - 11, kN - 12, kN - 13, kN - 14, kN - 15, kN - 16)}; +} + +// ------------------------------ Reverse2 + +template <class D, HWY_IF_T_SIZE_D(D, 2)> +HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) { + const RepartitionToWide<RebindToUnsigned<decltype(d)>> dw; + return BitCast(d, RotateRight<16>(BitCast(dw, v))); +} + +template <class D, HWY_IF_T_SIZE_D(D, 4)> +HWY_API VFromD<D> Reverse2(D /* tag */, const VFromD<D> v) { + return Shuffle2301(v); +} + +template <class D, HWY_IF_T_SIZE_D(D, 8)> +HWY_API VFromD<D> Reverse2(D /* tag */, const VFromD<D> v) { + return Shuffle01(v); +} + +// ------------------------------ Reverse4 + +template <class D, HWY_IF_T_SIZE_D(D, 2)> +HWY_API VFromD<D> Reverse4(D /* tag */, const VFromD<D> v) { + return VFromD<D>{wasm_i16x8_shuffle(v.raw, v.raw, 3, 2, 1, 0, 7, 6, 5, 4)}; +} + +template <class D, HWY_IF_T_SIZE_D(D, 4)> +HWY_API VFromD<D> Reverse4(D /* tag */, const VFromD<D> v) { + return Shuffle0123(v); +} + +template <class D, HWY_IF_T_SIZE_D(D, 8)> +HWY_API VFromD<D> Reverse4(D /* tag */, const VFromD<D>) { + HWY_ASSERT(0); // don't have 8 u64 lanes +} + +// ------------------------------ Reverse8 + +template <class D, HWY_IF_T_SIZE_D(D, 2)> +HWY_API VFromD<D> Reverse8(D d, const VFromD<D> v) { + return Reverse(d, v); +} + +template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))> +HWY_API VFromD<D> Reverse8(D /* tag */, const VFromD<D>) { + HWY_ASSERT(0); // don't have 8 lanes for > 16-bit lanes +} + +// ------------------------------ InterleaveLower + +template <size_t N> +HWY_API Vec128<uint8_t, N> InterleaveLower(Vec128<uint8_t, N> a, + Vec128<uint8_t, N> b) { + return Vec128<uint8_t, N>{wasm_i8x16_shuffle( + a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)}; +} +template <size_t N> +HWY_API Vec128<uint16_t, N> InterleaveLower(Vec128<uint16_t, N> a, + Vec128<uint16_t, N> b) { + return Vec128<uint16_t, N>{ + 
wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)}; +} +template <size_t N> +HWY_API Vec128<uint32_t, N> InterleaveLower(Vec128<uint32_t, N> a, + Vec128<uint32_t, N> b) { + return Vec128<uint32_t, N>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)}; +} +template <size_t N> +HWY_API Vec128<uint64_t, N> InterleaveLower(Vec128<uint64_t, N> a, + Vec128<uint64_t, N> b) { + return Vec128<uint64_t, N>{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)}; +} + +template <size_t N> +HWY_API Vec128<int8_t, N> InterleaveLower(Vec128<int8_t, N> a, + Vec128<int8_t, N> b) { + return Vec128<int8_t, N>{wasm_i8x16_shuffle( + a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)}; +} +template <size_t N> +HWY_API Vec128<int16_t, N> InterleaveLower(Vec128<int16_t, N> a, + Vec128<int16_t, N> b) { + return Vec128<int16_t, N>{ + wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)}; +} +template <size_t N> +HWY_API Vec128<int32_t, N> InterleaveLower(Vec128<int32_t, N> a, + Vec128<int32_t, N> b) { + return Vec128<int32_t, N>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)}; +} +template <size_t N> +HWY_API Vec128<int64_t, N> InterleaveLower(Vec128<int64_t, N> a, + Vec128<int64_t, N> b) { + return Vec128<int64_t, N>{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)}; +} + +template <size_t N> +HWY_API Vec128<float, N> InterleaveLower(Vec128<float, N> a, + Vec128<float, N> b) { + return Vec128<float, N>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)}; +} + +template <size_t N> +HWY_API Vec128<double, N> InterleaveLower(Vec128<double, N> a, + Vec128<double, N> b) { + return Vec128<double, N>{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)}; +} + +// Additional overload for the optional tag (all vector lengths). +template <class D> +HWY_API VFromD<D> InterleaveLower(D /* tag */, VFromD<D> a, VFromD<D> b) { + return InterleaveLower(a, b); +} + +// ------------------------------ InterleaveUpper (UpperHalf) + +// All functions inside detail lack the required D parameter. 
+namespace detail { + +template <size_t N> +HWY_API Vec128<uint8_t, N> InterleaveUpper(Vec128<uint8_t, N> a, + Vec128<uint8_t, N> b) { + return Vec128<uint8_t, N>{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, 10, + 26, 11, 27, 12, 28, 13, 29, 14, + 30, 15, 31)}; +} +template <size_t N> +HWY_API Vec128<uint16_t, N> InterleaveUpper(Vec128<uint16_t, N> a, + Vec128<uint16_t, N> b) { + return Vec128<uint16_t, N>{ + wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)}; +} +template <size_t N> +HWY_API Vec128<uint32_t, N> InterleaveUpper(Vec128<uint32_t, N> a, + Vec128<uint32_t, N> b) { + return Vec128<uint32_t, N>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)}; +} +template <size_t N> +HWY_API Vec128<uint64_t, N> InterleaveUpper(Vec128<uint64_t, N> a, + Vec128<uint64_t, N> b) { + return Vec128<uint64_t, N>{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)}; +} + +template <size_t N> +HWY_API Vec128<int8_t, N> InterleaveUpper(Vec128<int8_t, N> a, + Vec128<int8_t, N> b) { + return Vec128<int8_t, N>{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, 10, + 26, 11, 27, 12, 28, 13, 29, 14, + 30, 15, 31)}; +} +template <size_t N> +HWY_API Vec128<int16_t, N> InterleaveUpper(Vec128<int16_t, N> a, + Vec128<int16_t, N> b) { + return Vec128<int16_t, N>{ + wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)}; +} +template <size_t N> +HWY_API Vec128<int32_t, N> InterleaveUpper(Vec128<int32_t, N> a, + Vec128<int32_t, N> b) { + return Vec128<int32_t, N>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)}; +} +template <size_t N> +HWY_API Vec128<int64_t, N> InterleaveUpper(Vec128<int64_t, N> a, + Vec128<int64_t, N> b) { + return Vec128<int64_t, N>{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)}; +} + +template <size_t N> +HWY_API Vec128<float, N> InterleaveUpper(Vec128<float, N> a, + Vec128<float, N> b) { + return Vec128<float, N>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)}; +} + +template <size_t N> +HWY_API Vec128<double, N> InterleaveUpper(Vec128<double, N> a, + Vec128<double, N> b) { + return 
Vec128<double, N>{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)}; +} + +} // namespace detail + +// Full +template <class D, typename T = TFromD<D>> +HWY_API Vec128<T> InterleaveUpper(D /* tag */, Vec128<T> a, Vec128<T> b) { + return detail::InterleaveUpper(a, b); +} + +// Partial +template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> +HWY_API VFromD<D> InterleaveUpper(D d, VFromD<D> a, VFromD<D> b) { + const Half<decltype(d)> d2; + return InterleaveLower(d, VFromD<D>{UpperHalf(d2, a).raw}, + VFromD<D>{UpperHalf(d2, b).raw}); +} + +// ------------------------------ ZipLower/ZipUpper (InterleaveLower) + +// Same as Interleave*, except that the return lanes are double-width integers; +// this is necessary because the single-lane scalar cannot return two values. +template <class V, class DW = RepartitionToWide<DFromV<V>>> +HWY_API VFromD<DW> ZipLower(V a, V b) { + return BitCast(DW(), InterleaveLower(a, b)); +} +template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>> +HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) { + return BitCast(dw, InterleaveLower(D(), a, b)); +} + +template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>> +HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) { + return BitCast(dw, InterleaveUpper(D(), a, b)); +} + +// ================================================== COMBINE + +// ------------------------------ Combine (InterleaveLower) + +// N = N/2 + N/2 (upper half undefined) +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), class VH = VFromD<Half<D>>> +HWY_API VFromD<D> Combine(D d, VH hi_half, VH lo_half) { + const Half<decltype(d)> dh; + const RebindToUnsigned<decltype(dh)> duh; + // Treat half-width input as one lane, and expand to two lanes. 
+ using VU = Vec128<UnsignedFromSize<dh.MaxBytes()>, 2>; + const VU lo{BitCast(duh, lo_half).raw}; + const VU hi{BitCast(duh, hi_half).raw}; + return BitCast(d, InterleaveLower(lo, hi)); +} + +// ------------------------------ ZeroExtendVector (IfThenElseZero) +template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> lo) { + const Half<D> dh; + return IfThenElseZero(FirstN(d, MaxLanes(dh)), VFromD<D>{lo.raw}); +} + +// ------------------------------ ConcatLowerLower +template <class D, typename T = TFromD<D>> +HWY_API Vec128<T> ConcatLowerLower(D /* tag */, Vec128<T> hi, Vec128<T> lo) { + return Vec128<T>{wasm_i64x2_shuffle(lo.raw, hi.raw, 0, 2)}; +} + +// ------------------------------ ConcatUpperUpper +template <class D, typename T = TFromD<D>> +HWY_API Vec128<T> ConcatUpperUpper(D /* tag */, Vec128<T> hi, Vec128<T> lo) { + return Vec128<T>{wasm_i64x2_shuffle(lo.raw, hi.raw, 1, 3)}; +} + +// ------------------------------ ConcatLowerUpper +template <class D, typename T = TFromD<D>> +HWY_API Vec128<T> ConcatLowerUpper(D d, Vec128<T> hi, Vec128<T> lo) { + return CombineShiftRightBytes<8>(d, hi, lo); +} + +// ------------------------------ ConcatUpperLower +template <class D, typename T = TFromD<D>> +HWY_API Vec128<T> ConcatUpperLower(D d, Vec128<T> hi, Vec128<T> lo) { + return IfThenElse(FirstN(d, Lanes(d) / 2), lo, hi); +} + +// ------------------------------ Concat partial (Combine, LowerHalf) + +template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> +HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) { + const Half<decltype(d)> d2; + return Combine(d, LowerHalf(d2, hi), LowerHalf(d2, lo)); +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> +HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) { + const Half<decltype(d)> d2; + return Combine(d, UpperHalf(d2, hi), UpperHalf(d2, lo)); +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> +HWY_API VFromD<D> ConcatLowerUpper(D d, const VFromD<D> hi, + 
const VFromD<D> lo) { + const Half<decltype(d)> d2; + return Combine(d, LowerHalf(d2, hi), UpperHalf(d2, lo)); +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> +HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) { + const Half<decltype(d)> d2; + return Combine(d, UpperHalf(d2, hi), LowerHalf(d2, lo)); +} + +// ------------------------------ ConcatOdd + +// 8-bit full +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec128<T> ConcatOdd(D /* tag */, Vec128<T> hi, Vec128<T> lo) { + return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 3, 5, 7, 9, 11, 13, 15, + 17, 19, 21, 23, 25, 27, 29, 31)}; +} + +// 8-bit x8 +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec64<T> ConcatOdd(D /* tag */, Vec64<T> hi, Vec64<T> lo) { + // Don't care about upper half. + return Vec128<T, 8>{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 3, 5, 7, 17, 19, 21, + 23, 1, 3, 5, 7, 17, 19, 21, 23)}; +} + +// 8-bit x4 +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec32<T> ConcatOdd(D /* tag */, Vec32<T> hi, Vec32<T> lo) { + // Don't care about upper 3/4. + return Vec128<T, 4>{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 3, 17, 19, 1, 3, 17, + 19, 1, 3, 17, 19, 1, 3, 17, 19)}; +} + +// 16-bit full +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)> +HWY_API Vec128<T> ConcatOdd(D /* tag */, Vec128<T> hi, Vec128<T> lo) { + return Vec128<T>{ + wasm_i16x8_shuffle(lo.raw, hi.raw, 1, 3, 5, 7, 9, 11, 13, 15)}; +} + +// 16-bit x4 +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)> +HWY_API Vec64<T> ConcatOdd(D /* tag */, Vec64<T> hi, Vec64<T> lo) { + // Don't care about upper half. 
+ return Vec128<T, 4>{ + wasm_i16x8_shuffle(lo.raw, hi.raw, 1, 3, 9, 11, 1, 3, 9, 11)}; +} + +// 32-bit full +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec128<T> ConcatOdd(D /* tag */, Vec128<T> hi, Vec128<T> lo) { + return Vec128<T>{wasm_i32x4_shuffle(lo.raw, hi.raw, 1, 3, 5, 7)}; +} + +// Any T x2 +template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 2)> +HWY_API Vec128<T, 2> ConcatOdd(D d, Vec128<T, 2> hi, Vec128<T, 2> lo) { + return InterleaveUpper(d, lo, hi); +} + +// ------------------------------ ConcatEven (InterleaveLower) + +// 8-bit full +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec128<T> ConcatEven(D /* tag */, Vec128<T> hi, Vec128<T> lo) { + return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30)}; +} + +// 8-bit x8 +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec64<T> ConcatEven(D /* tag */, Vec64<T> hi, Vec64<T> lo) { + // Don't care about upper half. + return Vec64<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 0, 2, 4, 6, 16, 18, 20, 22, + 0, 2, 4, 6, 16, 18, 20, 22)}; +} + +// 8-bit x4 +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec32<T> ConcatEven(D /* tag */, Vec32<T> hi, Vec32<T> lo) { + // Don't care about upper 3/4. + return Vec32<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 0, 2, 16, 18, 0, 2, 16, 18, + 0, 2, 16, 18, 0, 2, 16, 18)}; +} + +// 16-bit full +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)> +HWY_API Vec128<T> ConcatEven(D /* tag */, Vec128<T> hi, Vec128<T> lo) { + return Vec128<T>{ + wasm_i16x8_shuffle(lo.raw, hi.raw, 0, 2, 4, 6, 8, 10, 12, 14)}; +} + +// 16-bit x4 +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)> +HWY_API Vec64<T> ConcatEven(D /* tag */, Vec64<T> hi, Vec64<T> lo) { + // Don't care about upper half. 
+ return Vec64<T>{wasm_i16x8_shuffle(lo.raw, hi.raw, 0, 2, 8, 10, 0, 2, 8, 10)}; +} + +// 32-bit full +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec128<T> ConcatEven(D /* tag */, Vec128<T> hi, Vec128<T> lo) { + return Vec128<T>{wasm_i32x4_shuffle(lo.raw, hi.raw, 0, 2, 4, 6)}; +} + +// Any T x2 +template <typename D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 2)> +HWY_API Vec128<T, 2> ConcatEven(D d, Vec128<T, 2> hi, Vec128<T, 2> lo) { + return InterleaveLower(d, lo, hi); +} + +// ------------------------------ DupEven (InterleaveLower) + +template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) { + return Vec128<T, N>{wasm_i32x4_shuffle(v.raw, v.raw, 0, 0, 2, 2)}; +} + +template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)> +HWY_API Vec128<T, N> DupEven(const Vec128<T, N> v) { + return InterleaveLower(DFromV<decltype(v)>(), v, v); +} + +// ------------------------------ DupOdd (InterleaveUpper) + +template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) { + return Vec128<T, N>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 1, 3, 3)}; +} + +template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)> +HWY_API Vec128<T, N> DupOdd(const Vec128<T, N> v) { + return InterleaveUpper(DFromV<decltype(v)>(), v, v); +} + +// ------------------------------ OddEven + +namespace detail { + +template <typename T, size_t N> +HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<1> /* tag */, const Vec128<T, N> a, + const Vec128<T, N> b) { + const DFromV<decltype(a)> d; + const Repartition<uint8_t, decltype(d)> d8; + alignas(16) static constexpr uint8_t mask[16] = { + 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0}; + return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a); +} +template <typename T, size_t N> +HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<2> /* tag */, const Vec128<T, N> a, + const Vec128<T, N> b) { + return Vec128<T, N>{ + wasm_i16x8_shuffle(a.raw, 
b.raw, 8, 1, 10, 3, 12, 5, 14, 7)}; +} +template <typename T, size_t N> +HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<4> /* tag */, const Vec128<T, N> a, + const Vec128<T, N> b) { + return Vec128<T, N>{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)}; +} +template <typename T, size_t N> +HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<8> /* tag */, const Vec128<T, N> a, + const Vec128<T, N> b) { + return Vec128<T, N>{wasm_i64x2_shuffle(a.raw, b.raw, 2, 1)}; +} + +} // namespace detail + +template <typename T, size_t N> +HWY_API Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) { + return detail::OddEven(hwy::SizeTag<sizeof(T)>(), a, b); +} +template <size_t N> +HWY_API Vec128<float, N> OddEven(const Vec128<float, N> a, + const Vec128<float, N> b) { + return Vec128<float, N>{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)}; +} + +// ------------------------------ OddEvenBlocks +template <typename T, size_t N> +HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) { + return even; +} + +// ------------------------------ SwapAdjacentBlocks + +template <typename T, size_t N> +HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) { + return v; +} + +// ------------------------------ ReverseBlocks + +// Single block: no change +template <class D> +HWY_API VFromD<D> ReverseBlocks(D /* tag */, VFromD<D> v) { + return v; +} + +// ================================================== CONVERT + +// ------------------------------ Promotions (part w/ narrow lanes -> full) + +// Unsigned: zero-extend. 
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U16_D(D)> +HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint8_t, D>> v) { + return VFromD<D>{wasm_u16x8_extend_low_u8x16(v.raw)}; +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U32_D(D)> +HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) { + return VFromD<D>{wasm_u32x4_extend_low_u16x8(v.raw)}; +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)> +HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) { + return VFromD<D>{wasm_u64x2_extend_low_u32x4(v.raw)}; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U32_D(D)> +HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint8_t, D>> v) { + return VFromD<D>{ + wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))}; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I16_D(D)> +HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint8_t, D>> v) { + return VFromD<D>{wasm_u16x8_extend_low_u8x16(v.raw)}; +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)> +HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) { + return VFromD<D>{wasm_u32x4_extend_low_u16x8(v.raw)}; +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)> +HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) { + return VFromD<D>{wasm_u64x2_extend_low_u32x4(v.raw)}; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)> +HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint8_t, D>> v) { + return VFromD<D>{ + wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))}; +} + +// U8/U16 to U64/I64: First, zero-extend to U32, and then zero-extend to +// TFromD<D> +template <class D, class V, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D), + HWY_IF_LANES_D(D, HWY_MAX_LANES_V(V)), HWY_IF_UNSIGNED_V(V), + HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))> +HWY_API VFromD<D> PromoteTo(D d, V v) { + const Rebind<uint32_t, 
decltype(d)> du32; + return PromoteTo(d, PromoteTo(du32, v)); +} + +// Signed: replicate sign bit. +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I16_D(D)> +HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int8_t, D>> v) { + return VFromD<D>{wasm_i16x8_extend_low_i8x16(v.raw)}; +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)> +HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) { + return VFromD<D>{wasm_i32x4_extend_low_i16x8(v.raw)}; +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)> +HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) { + return VFromD<D>{wasm_i64x2_extend_low_i32x4(v.raw)}; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)> +HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int8_t, D>> v) { + return VFromD<D>{ + wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(v.raw))}; +} + +// I8/I16 to I64: First, promote to I32, and then promote to I64 +template <class D, class V, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D), + HWY_IF_LANES_D(D, HWY_MAX_LANES_V(V)), HWY_IF_SIGNED_V(V), + HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))> +HWY_API VFromD<D> PromoteTo(D d, V v) { + const Rebind<int32_t, decltype(d)> di32; + return PromoteTo(d, PromoteTo(di32, v)); +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)> +HWY_API VFromD<D> PromoteTo(D df32, VFromD<Rebind<float16_t, D>> v) { + const RebindToSigned<decltype(df32)> di32; + const RebindToUnsigned<decltype(df32)> du32; + using VU32 = VFromD<decltype(du32)>; + // Expand to u32 so we can shift. 
+ const VU32 bits16 = PromoteTo(du32, VFromD<Rebind<uint16_t, D>>{v.raw}); + const VU32 sign = ShiftRight<15>(bits16); + const VU32 biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F); + const VU32 mantissa = bits16 & Set(du32, 0x3FF); + const VU32 subnormal = + BitCast(du32, ConvertTo(df32, BitCast(di32, mantissa)) * + Set(df32, 1.0f / 16384 / 1024)); + + const VU32 biased_exp32 = biased_exp + Set(du32, 127 - 15); + const VU32 mantissa32 = ShiftLeft<23 - 10>(mantissa); + const VU32 normal = ShiftLeft<23>(biased_exp32) | mantissa32; + const VU32 bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal); + return BitCast(df32, ShiftLeft<31>(sign) | bits32); +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)> +HWY_API VFromD<D> PromoteTo(D df32, VFromD<Rebind<bfloat16_t, D>> v) { + const Rebind<uint16_t, decltype(df32)> du16; + const RebindToSigned<decltype(df32)> di32; + return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v)))); +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)> +HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) { + return VFromD<D>{wasm_f64x2_convert_low_i32x4(v.raw)}; +} + +// ------------------------------ Demotions (full -> part w/ narrow lanes) + +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)> +HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) { + return VFromD<D>{wasm_u16x8_narrow_i32x4(v.raw, v.raw)}; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I16_D(D)> +HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) { + return VFromD<D>{wasm_i16x8_narrow_i32x4(v.raw, v.raw)}; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U8_D(D)> +HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) { + const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw); + return VFromD<D>{wasm_u8x16_narrow_i16x8(intermediate, intermediate)}; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), 
HWY_IF_U8_D(D)> +HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) { + return VFromD<D>{wasm_u8x16_narrow_i16x8(v.raw, v.raw)}; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_I8_D(D)> +HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) { + const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw); + return VFromD<D>{wasm_i8x16_narrow_i16x8(intermediate, intermediate)}; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I8_D(D)> +HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) { + return VFromD<D>{wasm_i8x16_narrow_i16x8(v.raw, v.raw)}; +} + +template <class D, HWY_IF_UNSIGNED_D(D), + HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))> +HWY_API VFromD<D> DemoteTo(D dn, VFromD<Rebind<uint32_t, D>> v) { + const DFromV<decltype(v)> du32; + const RebindToSigned<decltype(du32)> di32; + return DemoteTo(dn, BitCast(di32, Min(v, Set(du32, 0x7FFFFFFF)))); +} + +template <class D, HWY_IF_U8_D(D)> +HWY_API VFromD<D> DemoteTo(D du8, VFromD<Rebind<uint16_t, D>> v) { + const DFromV<decltype(v)> du16; + const RebindToSigned<decltype(du16)> di16; + return DemoteTo(du8, BitCast(di16, Min(v, Set(du16, 0x7FFF)))); +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F16_D(D)> +HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<float, D>> v) { + const RebindToUnsigned<decltype(df16)> du16; + const Rebind<uint32_t, decltype(du16)> du; + const RebindToSigned<decltype(du)> di; + const auto bits32 = BitCast(du, v); + const auto sign = ShiftRight<31>(bits32); + const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF); + const auto mantissa32 = bits32 & Set(du, 0x7FFFFF); + + const auto k15 = Set(di, 15); + const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15); + const auto is_tiny = exp < Set(di, -24); + + const auto is_subnormal = exp < Set(di, -14); + const auto biased_exp16 = + BitCast(du, IfThenZeroElse(is_subnormal, exp + k15)); + const auto sub_exp = BitCast(du, Set(di, -14) - 
exp); // [1, 11) + const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) + + (mantissa32 >> (Set(du, 13) + sub_exp)); + const auto mantissa16 = IfThenElse(RebindMask(du, is_subnormal), sub_m, + ShiftRight<13>(mantissa32)); // <1024 + + const auto sign16 = ShiftLeft<15>(sign); + const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16; + const auto bits16 = IfThenZeroElse(is_tiny, BitCast(di, normal16)); + return VFromD<D>{DemoteTo(du16, bits16).raw}; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_BF16_D(D)> +HWY_API VFromD<D> DemoteTo(D dbf16, VFromD<Rebind<float, D>> v) { + const Rebind<int32_t, decltype(dbf16)> di32; + const Rebind<uint32_t, decltype(dbf16)> du32; // for logical shift right + const Rebind<uint16_t, decltype(dbf16)> du16; + const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v))); + return BitCast(dbf16, DemoteTo(du16, bits_in_32)); +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)> +HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<double, D>> v) { + return VFromD<D>{wasm_i32x4_trunc_sat_f64x2_zero(v.raw)}; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_BF16_D(D), + class V32 = VFromD<Repartition<float, D>>> +HWY_API VFromD<D> ReorderDemote2To(D dbf16, V32 a, V32 b) { + const RebindToUnsigned<decltype(dbf16)> du16; + const Repartition<uint32_t, decltype(dbf16)> du32; + const VFromD<decltype(du32)> b_in_even = ShiftRight<16>(BitCast(du32, b)); + return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even))); +} + +// Specializations for partial vectors because i16x8_narrow_i32x4 sets lanes +// above 2*N. 
+template <class D, HWY_IF_I16_D(D)> +HWY_API Vec32<int16_t> ReorderDemote2To(D dn, Vec32<int32_t> a, + Vec32<int32_t> b) { + const DFromV<decltype(a)> d; + const Twice<decltype(d)> dt; + return DemoteTo(dn, Combine(dt, b, a)); +} +template <class D, HWY_IF_I16_D(D)> +HWY_API Vec64<int16_t> ReorderDemote2To(D dn, Vec64<int32_t> a, + Vec64<int32_t> b) { + const Twice<decltype(dn)> dn_full; + const Repartition<uint32_t, decltype(dn_full)> du32_full; + + const Vec128<int16_t> v_full{wasm_i16x8_narrow_i32x4(a.raw, b.raw)}; + const auto vu32_full = BitCast(du32_full, v_full); + return LowerHalf( + BitCast(dn_full, ConcatEven(du32_full, vu32_full, vu32_full))); +} +template <class D, HWY_IF_I16_D(D)> +HWY_API Vec128<int16_t> ReorderDemote2To(D /* tag */, Vec128<int32_t> a, + Vec128<int32_t> b) { + return Vec128<int16_t>{wasm_i16x8_narrow_i32x4(a.raw, b.raw)}; +} + +template <class D, HWY_IF_U16_D(D)> +HWY_API Vec32<uint16_t> ReorderDemote2To(D dn, Vec32<int32_t> a, + Vec32<int32_t> b) { + const DFromV<decltype(a)> d; + const Twice<decltype(d)> dt; + return DemoteTo(dn, Combine(dt, b, a)); +} +template <class D, HWY_IF_U16_D(D)> +HWY_API Vec64<uint16_t> ReorderDemote2To(D dn, Vec64<int32_t> a, + Vec64<int32_t> b) { + const Twice<decltype(dn)> dn_full; + const Repartition<uint32_t, decltype(dn_full)> du32_full; + + const Vec128<int16_t> v_full{wasm_u16x8_narrow_i32x4(a.raw, b.raw)}; + const auto vu32_full = BitCast(du32_full, v_full); + return LowerHalf( + BitCast(dn_full, ConcatEven(du32_full, vu32_full, vu32_full))); +} +template <class D, HWY_IF_U16_D(D)> +HWY_API Vec128<uint16_t> ReorderDemote2To(D /* tag */, Vec128<int32_t> a, + Vec128<int32_t> b) { + return Vec128<uint16_t>{wasm_u16x8_narrow_i32x4(a.raw, b.raw)}; +} + +template <class D, HWY_IF_U16_D(D)> +HWY_API VFromD<D> ReorderDemote2To(D dn, Vec128<uint32_t> a, + Vec128<uint32_t> b) { + const DFromV<decltype(a)> du32; + const RebindToSigned<decltype(du32)> di32; + const auto max_i32 = Set(du32, 0x7FFFFFFFu); + + 
const auto clamped_a = BitCast(di32, Min(a, max_i32)); + const auto clamped_b = BitCast(di32, Min(b, max_i32)); + return ReorderDemote2To(dn, clamped_a, clamped_b); +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)> +HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<uint32_t, D>> a, + VFromD<Repartition<uint32_t, D>> b) { + const DFromV<decltype(a)> d; + const Twice<decltype(d)> dt; + return DemoteTo(dn, Combine(dt, b, a)); +} + +// Specializations for partial vectors because i8x16_narrow_i16x8 sets lanes +// above 2*N. +template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_I8_D(D)> +HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<int16_t, D>> a, + VFromD<Repartition<int16_t, D>> b) { + const DFromV<decltype(a)> d; + const Twice<decltype(d)> dt; + return DemoteTo(dn, Combine(dt, b, a)); +} +template <class D, HWY_IF_I8_D(D)> +HWY_API Vec64<int8_t> ReorderDemote2To(D dn, Vec64<int16_t> a, + Vec64<int16_t> b) { + const Twice<decltype(dn)> dn_full; + const Repartition<uint32_t, decltype(dn_full)> du32_full; + + const Vec128<int8_t> v_full{wasm_i8x16_narrow_i16x8(a.raw, b.raw)}; + const auto vu32_full = BitCast(du32_full, v_full); + return LowerHalf( + BitCast(dn_full, ConcatEven(du32_full, vu32_full, vu32_full))); +} +template <class D, HWY_IF_I8_D(D)> +HWY_API Vec128<int8_t> ReorderDemote2To(D /* tag */, Vec128<int16_t> a, + Vec128<int16_t> b) { + return Vec128<int8_t>{wasm_i8x16_narrow_i16x8(a.raw, b.raw)}; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U8_D(D)> +HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<int16_t, D>> a, + VFromD<Repartition<int16_t, D>> b) { + const DFromV<decltype(a)> d; + const Twice<decltype(d)> dt; + return DemoteTo(dn, Combine(dt, b, a)); +} +template <class D, HWY_IF_U8_D(D)> +HWY_API Vec64<uint8_t> ReorderDemote2To(D dn, Vec64<int16_t> a, + Vec64<int16_t> b) { + const Twice<decltype(dn)> dn_full; + const Repartition<uint32_t, decltype(dn_full)> du32_full; + + const 
Vec128<uint8_t> v_full{wasm_u8x16_narrow_i16x8(a.raw, b.raw)}; + const auto vu32_full = BitCast(du32_full, v_full); + return LowerHalf( + BitCast(dn_full, ConcatEven(du32_full, vu32_full, vu32_full))); +} +template <class D, HWY_IF_U8_D(D)> +HWY_API Vec128<uint8_t> ReorderDemote2To(D /* tag */, Vec128<int16_t> a, + Vec128<int16_t> b) { + return Vec128<uint8_t>{wasm_u8x16_narrow_i16x8(a.raw, b.raw)}; +} + +template <class D, HWY_IF_U8_D(D)> +HWY_API VFromD<D> ReorderDemote2To(D dn, Vec128<uint16_t> a, + Vec128<uint16_t> b) { + const DFromV<decltype(a)> du16; + const RebindToSigned<decltype(du16)> di16; + const auto max_i16 = Set(du16, 0x7FFFu); + + const auto clamped_a = BitCast(di16, Min(a, max_i16)); + const auto clamped_b = BitCast(di16, Min(b, max_i16)); + return ReorderDemote2To(dn, clamped_a, clamped_b); +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U8_D(D)> +HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<uint16_t, D>> a, + VFromD<Repartition<uint16_t, D>> b) { + const DFromV<decltype(a)> d; + const Twice<decltype(d)> dt; + return DemoteTo(dn, Combine(dt, b, a)); +} + +// For already range-limited input [0, 255]. +template <size_t N> +HWY_API Vec128<uint8_t, N> U8FromU32(const Vec128<uint32_t, N> v) { + const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw); + return Vec128<uint8_t, N>{ + wasm_u8x16_narrow_i16x8(intermediate, intermediate)}; +} + +// ------------------------------ Truncations + +template <typename From, class DTo, HWY_IF_LANES_D(DTo, 1)> +HWY_API VFromD<DTo> TruncateTo(DTo /* tag */, Vec128<From, 1> v) { + // BitCast requires the same size; DTo might be u8x1 and v u16x1. 
+ const Repartition<TFromD<DTo>, DFromV<decltype(v)>> dto; + return VFromD<DTo>{BitCast(dto, v).raw}; +} + +template <class D, HWY_IF_U8_D(D)> +HWY_API Vec16<uint8_t> TruncateTo(D /* tag */, Vec128<uint64_t> v) { + const Full128<uint8_t> d; + const auto v1 = BitCast(d, v); + const auto v2 = ConcatEven(d, v1, v1); + const auto v4 = ConcatEven(d, v2, v2); + return LowerHalf(LowerHalf(LowerHalf(ConcatEven(d, v4, v4)))); +} + +template <class D, HWY_IF_U16_D(D)> +HWY_API Vec32<uint16_t> TruncateTo(D /* tag */, Vec128<uint64_t> v) { + const Full128<uint16_t> d; + const auto v1 = BitCast(d, v); + const auto v2 = ConcatEven(d, v1, v1); + return LowerHalf(LowerHalf(ConcatEven(d, v2, v2))); +} + +template <class D, HWY_IF_U32_D(D)> +HWY_API Vec64<uint32_t> TruncateTo(D /* tag */, Vec128<uint64_t> v) { + const Full128<uint32_t> d; + const auto v1 = BitCast(d, v); + return LowerHalf(ConcatEven(d, v1, v1)); +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U8_D(D)> +HWY_API VFromD<D> TruncateTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) { + const Repartition<uint8_t, DFromV<decltype(v)>> d; + const auto v1 = Vec128<uint8_t>{v.raw}; + const auto v2 = ConcatEven(d, v1, v1); + const auto v3 = ConcatEven(d, v2, v2); + return VFromD<D>{v3.raw}; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U16_D(D)> +HWY_API VFromD<D> TruncateTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) { + const Repartition<uint16_t, DFromV<decltype(v)>> d; + const auto v1 = Vec128<uint16_t>{v.raw}; + const auto v2 = ConcatEven(d, v1, v1); + return VFromD<D>{v2.raw}; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U8_D(D)> +HWY_API VFromD<D> TruncateTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) { + const Repartition<uint8_t, DFromV<decltype(v)>> d; + const auto v1 = Vec128<uint8_t>{v.raw}; + const auto v2 = ConcatEven(d, v1, v1); + return VFromD<D>{v2.raw}; +} + +// ------------------------------ Demotions to/from i64 + +namespace detail { +template <class D, 
HWY_IF_UNSIGNED_D(D)> +HWY_INLINE VFromD<Rebind<uint64_t, D>> DemoteFromU64MaskOutResult( + D /*dn*/, VFromD<Rebind<uint64_t, D>> v) { + return v; +} + +template <class D, HWY_IF_SIGNED_D(D)> +HWY_INLINE VFromD<Rebind<uint64_t, D>> DemoteFromU64MaskOutResult( + D /*dn*/, VFromD<Rebind<uint64_t, D>> v) { + const DFromV<decltype(v)> du64; + return And(v, + Set(du64, static_cast<uint64_t>(hwy::HighestValue<TFromD<D>>()))); +} + +template <class D> +HWY_INLINE VFromD<Rebind<uint64_t, D>> DemoteFromU64Saturate( + D dn, VFromD<Rebind<uint64_t, D>> v) { + const Rebind<uint64_t, D> du64; + const RebindToSigned<decltype(du64)> di64; + constexpr int kShiftAmt = static_cast<int>(sizeof(TFromD<D>) * 8) - + static_cast<int>(hwy::IsSigned<TFromD<D>>()); + + const auto too_big = BitCast( + du64, VecFromMask( + di64, Gt(BitCast(di64, ShiftRight<kShiftAmt>(v)), Zero(di64)))); + return DemoteFromU64MaskOutResult(dn, Or(v, too_big)); +} + +template <class D, class V> +HWY_INLINE VFromD<D> ReorderDemote2From64To32Combine(D dn, V a, V b) { + return ConcatEven(dn, BitCast(dn, b), BitCast(dn, a)); +} + +} // namespace detail + +template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)), + HWY_IF_SIGNED_D(D)> +HWY_API VFromD<D> DemoteTo(D dn, VFromD<Rebind<int64_t, D>> v) { + const DFromV<decltype(v)> di64; + const RebindToUnsigned<decltype(di64)> du64; + const RebindToUnsigned<decltype(dn)> dn_u; + + // Negative values are saturated by first saturating their bitwise inverse + // and then inverting the saturation result + const auto invert_mask = BitCast(du64, BroadcastSignBit(v)); + const auto saturated_vals = Xor( + invert_mask, + detail::DemoteFromU64Saturate(dn, Xor(invert_mask, BitCast(du64, v)))); + return BitCast(dn, TruncateTo(dn_u, saturated_vals)); +} + +template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)), + HWY_IF_UNSIGNED_D(D)> +HWY_API VFromD<D> DemoteTo(D dn, VFromD<Rebind<int64_t, D>> v) { + const DFromV<decltype(v)> di64; + const 
RebindToUnsigned<decltype(di64)> du64; + + const auto non_neg_vals = BitCast(du64, AndNot(BroadcastSignBit(v), v)); + return TruncateTo(dn, detail::DemoteFromU64Saturate(dn, non_neg_vals)); +} + +template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)), + HWY_IF_UNSIGNED_D(D)> +HWY_API VFromD<D> DemoteTo(D dn, VFromD<Rebind<uint64_t, D>> v) { + return TruncateTo(dn, detail::DemoteFromU64Saturate(dn, v)); +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_T_SIZE_D(D, 4), + HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>)> +HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<int64_t, D>> a, + VFromD<Repartition<int64_t, D>> b) { + const DFromV<decltype(a)> d; + const Twice<decltype(d)> dt; + return DemoteTo(dn, Combine(dt, b, a)); +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)> +HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<uint64_t, D>> a, + VFromD<Repartition<uint64_t, D>> b) { + const DFromV<decltype(a)> d; + const Twice<decltype(d)> dt; + return DemoteTo(dn, Combine(dt, b, a)); +} + +template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)> +HWY_API Vec128<int32_t> ReorderDemote2To(D dn, Vec128<int64_t> a, + Vec128<int64_t> b) { + const DFromV<decltype(a)> di64; + const RebindToUnsigned<decltype(di64)> du64; + const Half<decltype(dn)> dnh; + + // Negative values are saturated by first saturating their bitwise inverse + // and then inverting the saturation result + const auto invert_mask_a = BitCast(du64, BroadcastSignBit(a)); + const auto invert_mask_b = BitCast(du64, BroadcastSignBit(b)); + const auto saturated_a = Xor( + invert_mask_a, + detail::DemoteFromU64Saturate(dnh, Xor(invert_mask_a, BitCast(du64, a)))); + const auto saturated_b = Xor( + invert_mask_b, + detail::DemoteFromU64Saturate(dnh, Xor(invert_mask_b, BitCast(du64, b)))); + + return ConcatEven(dn, BitCast(dn, saturated_b), BitCast(dn, saturated_a)); +} + +template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)> +HWY_API 
Vec128<uint32_t> ReorderDemote2To(D dn, Vec128<int64_t> a, + Vec128<int64_t> b) { + const DFromV<decltype(a)> di64; + const RebindToUnsigned<decltype(di64)> du64; + const Half<decltype(dn)> dnh; + + const auto saturated_a = detail::DemoteFromU64Saturate( + dnh, BitCast(du64, AndNot(BroadcastSignBit(a), a))); + const auto saturated_b = detail::DemoteFromU64Saturate( + dnh, BitCast(du64, AndNot(BroadcastSignBit(b), b))); + + return ConcatEven(dn, BitCast(dn, saturated_b), BitCast(dn, saturated_a)); +} + +template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)> +HWY_API Vec128<uint32_t> ReorderDemote2To(D dn, Vec128<uint64_t> a, + Vec128<uint64_t> b) { + const Half<decltype(dn)> dnh; + + const auto saturated_a = detail::DemoteFromU64Saturate(dnh, a); + const auto saturated_b = detail::DemoteFromU64Saturate(dnh, b); + + return ConcatEven(dn, BitCast(dn, saturated_b), BitCast(dn, saturated_a)); +} + +template <class D, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>), class V, + HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), + HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2), + HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2)> +HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) { + return ReorderDemote2To(d, a, b); +} + +template <class D, HWY_IF_BF16_D(D), class V32 = VFromD<Repartition<float, D>>> +HWY_API VFromD<D> OrderedDemote2To(D dbf16, V32 a, V32 b) { + const RebindToUnsigned<decltype(dbf16)> du16; + return BitCast(dbf16, ConcatOdd(du16, BitCast(du16, b), BitCast(du16, a))); +} + +// ------------------------------ Convert i32 <=> f32 (Round) + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)> +HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) { + return VFromD<D>{wasm_f32x4_convert_i32x4(v.raw)}; +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)> +HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) { + return VFromD<D>{wasm_f32x4_convert_u32x4(v.raw)}; +} +// Truncates (rounds toward zero). 
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)> +HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<float, D>> v) { + return VFromD<D>{wasm_i32x4_trunc_sat_f32x4(v.raw)}; +} + +template <size_t N> +HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) { + return ConvertTo(RebindToSigned<DFromV<decltype(v)>>(), Round(v)); +} + +// ================================================== MISC + +// ------------------------------ SumsOf8 (ShiftRight, Add) +template <size_t N> +HWY_API Vec128<uint64_t, N / 8> SumsOf8(const Vec128<uint8_t, N> v) { + const DFromV<decltype(v)> du8; + const RepartitionToWide<decltype(du8)> du16; + const RepartitionToWide<decltype(du16)> du32; + const RepartitionToWide<decltype(du32)> du64; + using VU16 = VFromD<decltype(du16)>; + + const VU16 vFDB97531 = ShiftRight<8>(BitCast(du16, v)); + const VU16 vECA86420 = And(BitCast(du16, v), Set(du16, 0xFF)); + const VU16 sFE_DC_BA_98_76_54_32_10 = Add(vFDB97531, vECA86420); + + const VU16 szz_FE_zz_BA_zz_76_zz_32 = + BitCast(du16, ShiftRight<16>(BitCast(du32, sFE_DC_BA_98_76_54_32_10))); + const VU16 sxx_FC_xx_B8_xx_74_xx_30 = + Add(sFE_DC_BA_98_76_54_32_10, szz_FE_zz_BA_zz_76_zz_32); + const VU16 szz_zz_xx_FC_zz_zz_xx_74 = + BitCast(du16, ShiftRight<32>(BitCast(du64, sxx_FC_xx_B8_xx_74_xx_30))); + const VU16 sxx_xx_xx_F8_xx_xx_xx_70 = + Add(sxx_FC_xx_B8_xx_74_xx_30, szz_zz_xx_FC_zz_zz_xx_74); + return And(BitCast(du64, sxx_xx_xx_F8_xx_xx_xx_70), Set(du64, 0xFFFF)); +} + +// ------------------------------ LoadMaskBits (TestBit) + +namespace detail { + +template <class D, HWY_IF_T_SIZE_D(D, 1)> +HWY_INLINE MFromD<D> LoadMaskBits(D d, uint64_t bits) { + const RebindToUnsigned<decltype(d)> du; + // Easier than Set(), which would require an >8-bit type, which would not + // compile for T=uint8_t, N=1. + const VFromD<D> vbits{wasm_i32x4_splat(static_cast<int32_t>(bits))}; + + // Replicate bytes 8x such that each byte contains the bit that governs it. 
+ alignas(16) static constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1}; + const auto rep8 = TableLookupBytes(vbits, Load(du, kRep8)); + + alignas(16) static constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128, + 1, 2, 4, 8, 16, 32, 64, 128}; + return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit))); +} + +template <class D, HWY_IF_T_SIZE_D(D, 2)> +HWY_INLINE MFromD<D> LoadMaskBits(D d, uint64_t bits) { + const RebindToUnsigned<decltype(d)> du; + alignas(16) static constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128}; + return RebindMask( + d, TestBit(Set(du, static_cast<uint16_t>(bits)), Load(du, kBit))); +} + +template <class D, HWY_IF_T_SIZE_D(D, 4)> +HWY_INLINE MFromD<D> LoadMaskBits(D d, uint64_t bits) { + const RebindToUnsigned<decltype(d)> du; + alignas(16) static constexpr uint32_t kBit[8] = {1, 2, 4, 8}; + return RebindMask( + d, TestBit(Set(du, static_cast<uint32_t>(bits)), Load(du, kBit))); +} + +template <class D, HWY_IF_T_SIZE_D(D, 8)> +HWY_INLINE MFromD<D> LoadMaskBits(D d, uint64_t bits) { + const RebindToUnsigned<decltype(d)> du; + alignas(16) static constexpr uint64_t kBit[8] = {1, 2}; + return RebindMask(d, TestBit(Set(du, bits), Load(du, kBit))); +} + +} // namespace detail + +// `p` points to at least 8 readable bytes, not all of which need be valid. 
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) { + uint64_t mask_bits = 0; + CopyBytes<(MaxLanes(d) + 7) / 8>(bits, &mask_bits); + return detail::LoadMaskBits(d, mask_bits); +} + +// ------------------------------ Mask + +namespace detail { + +// Full +template <typename T> +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, + const Mask128<T> mask) { + alignas(16) uint64_t lanes[2]; + wasm_v128_store(lanes, mask.raw); + + constexpr uint64_t kMagic = 0x103070F1F3F80ULL; + const uint64_t lo = ((lanes[0] * kMagic) >> 56); + const uint64_t hi = ((lanes[1] * kMagic) >> 48) & 0xFF00; + return (hi + lo); +} + +// 64-bit +template <typename T> +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, + const Mask128<T, 8> mask) { + constexpr uint64_t kMagic = 0x103070F1F3F80ULL; + return (static_cast<uint64_t>(wasm_i64x2_extract_lane(mask.raw, 0)) * + kMagic) >> + 56; +} + +// 32-bit or less: need masking +template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 4)> +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, + const Mask128<T, N> mask) { + uint64_t bytes = static_cast<uint64_t>(wasm_i64x2_extract_lane(mask.raw, 0)); + // Clear potentially undefined bytes. + bytes &= (1ULL << (N * 8)) - 1; + constexpr uint64_t kMagic = 0x103070F1F3F80ULL; + return (bytes * kMagic) >> 56; +} + +template <typename T, size_t N> +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, + const Mask128<T, N> mask) { + // Remove useless lower half of each u16 while preserving the sign bit. 
+ const __i16x8 zero = wasm_i16x8_splat(0); + const Mask128<uint8_t, N> mask8{wasm_i8x16_narrow_i16x8(mask.raw, zero)}; + return BitsFromMask(hwy::SizeTag<1>(), mask8); +} + +template <typename T, size_t N> +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, + const Mask128<T, N> mask) { + const __i32x4 mask_i = static_cast<__i32x4>(mask.raw); + const __i32x4 slice = wasm_i32x4_make(1, 2, 4, 8); + const __i32x4 sliced_mask = wasm_v128_and(mask_i, slice); + alignas(16) uint32_t lanes[4]; + wasm_v128_store(lanes, sliced_mask); + return lanes[0] | lanes[1] | lanes[2] | lanes[3]; +} + +template <typename T, size_t N> +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, + const Mask128<T, N> mask) { + const __i64x2 mask_i = static_cast<__i64x2>(mask.raw); + const __i64x2 slice = wasm_i64x2_make(1, 2); + const __i64x2 sliced_mask = wasm_v128_and(mask_i, slice); + alignas(16) uint64_t lanes[2]; + wasm_v128_store(lanes, sliced_mask); + return lanes[0] | lanes[1]; +} + +// Returns the lowest N bits for the BitsFromMask result. +template <typename T, size_t N> +constexpr uint64_t OnlyActive(uint64_t bits) { + return ((N * sizeof(T)) == 16) ? bits : bits & ((1ull << N) - 1); +} + +// Returns 0xFF for bytes with index >= N, otherwise 0. +template <size_t N> +constexpr __i8x16 BytesAbove() { + return /**/ + (N == 0) ? wasm_i32x4_make(-1, -1, -1, -1) + : (N == 4) ? wasm_i32x4_make(0, -1, -1, -1) + : (N == 8) ? wasm_i32x4_make(0, 0, -1, -1) + : (N == 12) ? wasm_i32x4_make(0, 0, 0, -1) + : (N == 16) ? wasm_i32x4_make(0, 0, 0, 0) + : (N == 2) ? wasm_i16x8_make(0, -1, -1, -1, -1, -1, -1, -1) + : (N == 6) ? wasm_i16x8_make(0, 0, 0, -1, -1, -1, -1, -1) + : (N == 10) ? wasm_i16x8_make(0, 0, 0, 0, 0, -1, -1, -1) + : (N == 14) ? wasm_i16x8_make(0, 0, 0, 0, 0, 0, 0, -1) + : (N == 1) ? wasm_i8x16_make(0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1) + : (N == 3) ? wasm_i8x16_make(0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1) + : (N == 5) ? 
wasm_i8x16_make(0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1) + : (N == 7) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, + -1, -1, -1) + : (N == 9) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, + -1, -1, -1) + : (N == 11) + ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1) + : (N == 13) + ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1) + : wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1); +} + +template <typename T, size_t N> +HWY_INLINE uint64_t BitsFromMask(const Mask128<T, N> mask) { + return OnlyActive<T, N>(BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask)); +} + +template <typename T> +HWY_INLINE size_t CountTrue(hwy::SizeTag<1> tag, const Mask128<T> m) { + return PopCount(BitsFromMask(tag, m)); +} + +template <typename T> +HWY_INLINE size_t CountTrue(hwy::SizeTag<2> tag, const Mask128<T> m) { + return PopCount(BitsFromMask(tag, m)); +} + +template <typename T> +HWY_INLINE size_t CountTrue(hwy::SizeTag<4> /*tag*/, const Mask128<T> m) { + const __i32x4 var_shift = wasm_i32x4_make(1, 2, 4, 8); + const __i32x4 shifted_bits = wasm_v128_and(m.raw, var_shift); + alignas(16) uint64_t lanes[2]; + wasm_v128_store(lanes, shifted_bits); + return PopCount(lanes[0] | lanes[1]); +} + +template <typename T> +HWY_INLINE size_t CountTrue(hwy::SizeTag<8> /*tag*/, const Mask128<T> m) { + alignas(16) int64_t lanes[2]; + wasm_v128_store(lanes, m.raw); + return static_cast<size_t>(-(lanes[0] + lanes[1])); +} + +} // namespace detail + +// `p` points to at least 8 writable bytes. 
+template <class D> +HWY_API size_t StoreMaskBits(D d, const MFromD<D> mask, uint8_t* bits) { + const uint64_t mask_bits = detail::BitsFromMask(mask); + const size_t kNumBytes = (d.MaxLanes() + 7) / 8; + CopyBytes<kNumBytes>(&mask_bits, bits); + return kNumBytes; +} + +template <class D, HWY_IF_V_SIZE_D(D, 16)> +HWY_API size_t CountTrue(D /* tag */, const MFromD<D> m) { + return detail::CountTrue(hwy::SizeTag<sizeof(TFromD<D>)>(), m); +} + +// Partial +template <class D, typename T = TFromD<D>, HWY_IF_V_SIZE_LE_D(D, 8)> +HWY_API size_t CountTrue(D d, MFromD<D> m) { + // Ensure all undefined bytes are 0. + const MFromD<D> mask{detail::BytesAbove<d.MaxBytes()>()}; + const Full128<T> dfull; + return CountTrue(dfull, Mask128<T>{AndNot(mask, m).raw}); +} + +// Full vector +template <class D, HWY_IF_V_SIZE_D(D, 16)> +HWY_API bool AllFalse(D d, const MFromD<D> m) { + const auto v8 = BitCast(Full128<int8_t>(), VecFromMask(d, m)); + return !wasm_v128_any_true(v8.raw); +} + +// Full vector +namespace detail { +template <typename T> +HWY_INLINE bool AllTrue(hwy::SizeTag<1> /*tag*/, const Mask128<T> m) { + return wasm_i8x16_all_true(m.raw); +} +template <typename T> +HWY_INLINE bool AllTrue(hwy::SizeTag<2> /*tag*/, const Mask128<T> m) { + return wasm_i16x8_all_true(m.raw); +} +template <typename T> +HWY_INLINE bool AllTrue(hwy::SizeTag<4> /*tag*/, const Mask128<T> m) { + return wasm_i32x4_all_true(m.raw); +} +template <typename T> +HWY_INLINE bool AllTrue(hwy::SizeTag<8> /*tag*/, const Mask128<T> m) { + return wasm_i64x2_all_true(m.raw); +} + +} // namespace detail + +template <class D, typename T = TFromD<D>> +HWY_API bool AllTrue(D /* tag */, const Mask128<T> m) { + return detail::AllTrue(hwy::SizeTag<sizeof(T)>(), m); +} + +// Partial vectors + +template <class D, typename T = TFromD<D>, HWY_IF_V_SIZE_LE_D(D, 8)> +HWY_API bool AllFalse(D d, const MFromD<D> m) { + // Ensure all undefined bytes are 0. 
+ const MFromD<D> mask{detail::BytesAbove<d.MaxBytes()>()}; + return AllFalse(Full128<T>(), Mask128<T>{AndNot(mask, m).raw}); +} + +template <class D, typename T = TFromD<D>, HWY_IF_V_SIZE_LE_D(D, 8)> +HWY_API bool AllTrue(D d, const MFromD<D> m) { + // Ensure all undefined bytes are FF. + const MFromD<D> mask{detail::BytesAbove<d.MaxBytes()>()}; + return AllTrue(Full128<T>(), Mask128<T>{Or(mask, m).raw}); +} + +template <class D> +HWY_API size_t FindKnownFirstTrue(D /* tag */, const MFromD<D> mask) { + const uint32_t bits = static_cast<uint32_t>(detail::BitsFromMask(mask)); + return Num0BitsBelowLS1Bit_Nonzero32(bits); +} + +template <class D> +HWY_API intptr_t FindFirstTrue(D /* tag */, const MFromD<D> mask) { + const uint32_t bits = static_cast<uint32_t>(detail::BitsFromMask(mask)); + return bits ? static_cast<intptr_t>(Num0BitsBelowLS1Bit_Nonzero32(bits)) : -1; +} + +template <class D> +HWY_API size_t FindKnownLastTrue(D /* tag */, const MFromD<D> mask) { + const uint32_t bits = static_cast<uint32_t>(detail::BitsFromMask(mask)); + return 31 - Num0BitsAboveMS1Bit_Nonzero32(bits); +} + +template <class D> +HWY_API intptr_t FindLastTrue(D /* tag */, const MFromD<D> mask) { + const uint32_t bits = static_cast<uint32_t>(detail::BitsFromMask(mask)); + return bits + ? (31 - static_cast<intptr_t>(Num0BitsAboveMS1Bit_Nonzero32(bits))) + : -1; +} + +// ------------------------------ Compress + +namespace detail { + +template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)> +HWY_INLINE Vec128<T, N> IdxFromBits(const uint64_t mask_bits) { + HWY_DASSERT(mask_bits < 256); + const Simd<T, N, 0> d; + const Rebind<uint8_t, decltype(d)> d8; + const Simd<uint16_t, N, 0> du; + + // We need byte indices for TableLookupBytes (one vector's worth for each of + // 256 combinations of 8 mask bits). Loading them directly requires 4 KiB. We + // can instead store lane indices and convert to byte indices (2*lane + 0..1), + // with the doubling baked into the table. 
Unpacking nibbles is likely more + // costly than the higher cache footprint from storing bytes. + alignas(16) static constexpr uint8_t table[256 * 8] = { + // PrintCompress16x8Tables + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // + 2, 0, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // + 4, 0, 2, 6, 8, 10, 12, 14, /**/ 0, 4, 2, 6, 8, 10, 12, 14, // + 2, 4, 0, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // + 6, 0, 2, 4, 8, 10, 12, 14, /**/ 0, 6, 2, 4, 8, 10, 12, 14, // + 2, 6, 0, 4, 8, 10, 12, 14, /**/ 0, 2, 6, 4, 8, 10, 12, 14, // + 4, 6, 0, 2, 8, 10, 12, 14, /**/ 0, 4, 6, 2, 8, 10, 12, 14, // + 2, 4, 6, 0, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // + 8, 0, 2, 4, 6, 10, 12, 14, /**/ 0, 8, 2, 4, 6, 10, 12, 14, // + 2, 8, 0, 4, 6, 10, 12, 14, /**/ 0, 2, 8, 4, 6, 10, 12, 14, // + 4, 8, 0, 2, 6, 10, 12, 14, /**/ 0, 4, 8, 2, 6, 10, 12, 14, // + 2, 4, 8, 0, 6, 10, 12, 14, /**/ 0, 2, 4, 8, 6, 10, 12, 14, // + 6, 8, 0, 2, 4, 10, 12, 14, /**/ 0, 6, 8, 2, 4, 10, 12, 14, // + 2, 6, 8, 0, 4, 10, 12, 14, /**/ 0, 2, 6, 8, 4, 10, 12, 14, // + 4, 6, 8, 0, 2, 10, 12, 14, /**/ 0, 4, 6, 8, 2, 10, 12, 14, // + 2, 4, 6, 8, 0, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // + 10, 0, 2, 4, 6, 8, 12, 14, /**/ 0, 10, 2, 4, 6, 8, 12, 14, // + 2, 10, 0, 4, 6, 8, 12, 14, /**/ 0, 2, 10, 4, 6, 8, 12, 14, // + 4, 10, 0, 2, 6, 8, 12, 14, /**/ 0, 4, 10, 2, 6, 8, 12, 14, // + 2, 4, 10, 0, 6, 8, 12, 14, /**/ 0, 2, 4, 10, 6, 8, 12, 14, // + 6, 10, 0, 2, 4, 8, 12, 14, /**/ 0, 6, 10, 2, 4, 8, 12, 14, // + 2, 6, 10, 0, 4, 8, 12, 14, /**/ 0, 2, 6, 10, 4, 8, 12, 14, // + 4, 6, 10, 0, 2, 8, 12, 14, /**/ 0, 4, 6, 10, 2, 8, 12, 14, // + 2, 4, 6, 10, 0, 8, 12, 14, /**/ 0, 2, 4, 6, 10, 8, 12, 14, // + 8, 10, 0, 2, 4, 6, 12, 14, /**/ 0, 8, 10, 2, 4, 6, 12, 14, // + 2, 8, 10, 0, 4, 6, 12, 14, /**/ 0, 2, 8, 10, 4, 6, 12, 14, // + 4, 8, 10, 0, 2, 6, 12, 14, /**/ 0, 4, 8, 10, 2, 6, 12, 14, // + 2, 4, 8, 10, 0, 6, 12, 14, /**/ 0, 2, 4, 8, 10, 6, 12, 14, // + 6, 8, 10, 0, 2, 4, 
12, 14, /**/ 0, 6, 8, 10, 2, 4, 12, 14, // + 2, 6, 8, 10, 0, 4, 12, 14, /**/ 0, 2, 6, 8, 10, 4, 12, 14, // + 4, 6, 8, 10, 0, 2, 12, 14, /**/ 0, 4, 6, 8, 10, 2, 12, 14, // + 2, 4, 6, 8, 10, 0, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // + 12, 0, 2, 4, 6, 8, 10, 14, /**/ 0, 12, 2, 4, 6, 8, 10, 14, // + 2, 12, 0, 4, 6, 8, 10, 14, /**/ 0, 2, 12, 4, 6, 8, 10, 14, // + 4, 12, 0, 2, 6, 8, 10, 14, /**/ 0, 4, 12, 2, 6, 8, 10, 14, // + 2, 4, 12, 0, 6, 8, 10, 14, /**/ 0, 2, 4, 12, 6, 8, 10, 14, // + 6, 12, 0, 2, 4, 8, 10, 14, /**/ 0, 6, 12, 2, 4, 8, 10, 14, // + 2, 6, 12, 0, 4, 8, 10, 14, /**/ 0, 2, 6, 12, 4, 8, 10, 14, // + 4, 6, 12, 0, 2, 8, 10, 14, /**/ 0, 4, 6, 12, 2, 8, 10, 14, // + 2, 4, 6, 12, 0, 8, 10, 14, /**/ 0, 2, 4, 6, 12, 8, 10, 14, // + 8, 12, 0, 2, 4, 6, 10, 14, /**/ 0, 8, 12, 2, 4, 6, 10, 14, // + 2, 8, 12, 0, 4, 6, 10, 14, /**/ 0, 2, 8, 12, 4, 6, 10, 14, // + 4, 8, 12, 0, 2, 6, 10, 14, /**/ 0, 4, 8, 12, 2, 6, 10, 14, // + 2, 4, 8, 12, 0, 6, 10, 14, /**/ 0, 2, 4, 8, 12, 6, 10, 14, // + 6, 8, 12, 0, 2, 4, 10, 14, /**/ 0, 6, 8, 12, 2, 4, 10, 14, // + 2, 6, 8, 12, 0, 4, 10, 14, /**/ 0, 2, 6, 8, 12, 4, 10, 14, // + 4, 6, 8, 12, 0, 2, 10, 14, /**/ 0, 4, 6, 8, 12, 2, 10, 14, // + 2, 4, 6, 8, 12, 0, 10, 14, /**/ 0, 2, 4, 6, 8, 12, 10, 14, // + 10, 12, 0, 2, 4, 6, 8, 14, /**/ 0, 10, 12, 2, 4, 6, 8, 14, // + 2, 10, 12, 0, 4, 6, 8, 14, /**/ 0, 2, 10, 12, 4, 6, 8, 14, // + 4, 10, 12, 0, 2, 6, 8, 14, /**/ 0, 4, 10, 12, 2, 6, 8, 14, // + 2, 4, 10, 12, 0, 6, 8, 14, /**/ 0, 2, 4, 10, 12, 6, 8, 14, // + 6, 10, 12, 0, 2, 4, 8, 14, /**/ 0, 6, 10, 12, 2, 4, 8, 14, // + 2, 6, 10, 12, 0, 4, 8, 14, /**/ 0, 2, 6, 10, 12, 4, 8, 14, // + 4, 6, 10, 12, 0, 2, 8, 14, /**/ 0, 4, 6, 10, 12, 2, 8, 14, // + 2, 4, 6, 10, 12, 0, 8, 14, /**/ 0, 2, 4, 6, 10, 12, 8, 14, // + 8, 10, 12, 0, 2, 4, 6, 14, /**/ 0, 8, 10, 12, 2, 4, 6, 14, // + 2, 8, 10, 12, 0, 4, 6, 14, /**/ 0, 2, 8, 10, 12, 4, 6, 14, // + 4, 8, 10, 12, 0, 2, 6, 14, /**/ 0, 4, 8, 10, 12, 2, 6, 14, // + 2, 4, 8, 10, 12, 0, 6, 14, /**/ 0, 
2, 4, 8, 10, 12, 6, 14, // + 6, 8, 10, 12, 0, 2, 4, 14, /**/ 0, 6, 8, 10, 12, 2, 4, 14, // + 2, 6, 8, 10, 12, 0, 4, 14, /**/ 0, 2, 6, 8, 10, 12, 4, 14, // + 4, 6, 8, 10, 12, 0, 2, 14, /**/ 0, 4, 6, 8, 10, 12, 2, 14, // + 2, 4, 6, 8, 10, 12, 0, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // + 14, 0, 2, 4, 6, 8, 10, 12, /**/ 0, 14, 2, 4, 6, 8, 10, 12, // + 2, 14, 0, 4, 6, 8, 10, 12, /**/ 0, 2, 14, 4, 6, 8, 10, 12, // + 4, 14, 0, 2, 6, 8, 10, 12, /**/ 0, 4, 14, 2, 6, 8, 10, 12, // + 2, 4, 14, 0, 6, 8, 10, 12, /**/ 0, 2, 4, 14, 6, 8, 10, 12, // + 6, 14, 0, 2, 4, 8, 10, 12, /**/ 0, 6, 14, 2, 4, 8, 10, 12, // + 2, 6, 14, 0, 4, 8, 10, 12, /**/ 0, 2, 6, 14, 4, 8, 10, 12, // + 4, 6, 14, 0, 2, 8, 10, 12, /**/ 0, 4, 6, 14, 2, 8, 10, 12, // + 2, 4, 6, 14, 0, 8, 10, 12, /**/ 0, 2, 4, 6, 14, 8, 10, 12, // + 8, 14, 0, 2, 4, 6, 10, 12, /**/ 0, 8, 14, 2, 4, 6, 10, 12, // + 2, 8, 14, 0, 4, 6, 10, 12, /**/ 0, 2, 8, 14, 4, 6, 10, 12, // + 4, 8, 14, 0, 2, 6, 10, 12, /**/ 0, 4, 8, 14, 2, 6, 10, 12, // + 2, 4, 8, 14, 0, 6, 10, 12, /**/ 0, 2, 4, 8, 14, 6, 10, 12, // + 6, 8, 14, 0, 2, 4, 10, 12, /**/ 0, 6, 8, 14, 2, 4, 10, 12, // + 2, 6, 8, 14, 0, 4, 10, 12, /**/ 0, 2, 6, 8, 14, 4, 10, 12, // + 4, 6, 8, 14, 0, 2, 10, 12, /**/ 0, 4, 6, 8, 14, 2, 10, 12, // + 2, 4, 6, 8, 14, 0, 10, 12, /**/ 0, 2, 4, 6, 8, 14, 10, 12, // + 10, 14, 0, 2, 4, 6, 8, 12, /**/ 0, 10, 14, 2, 4, 6, 8, 12, // + 2, 10, 14, 0, 4, 6, 8, 12, /**/ 0, 2, 10, 14, 4, 6, 8, 12, // + 4, 10, 14, 0, 2, 6, 8, 12, /**/ 0, 4, 10, 14, 2, 6, 8, 12, // + 2, 4, 10, 14, 0, 6, 8, 12, /**/ 0, 2, 4, 10, 14, 6, 8, 12, // + 6, 10, 14, 0, 2, 4, 8, 12, /**/ 0, 6, 10, 14, 2, 4, 8, 12, // + 2, 6, 10, 14, 0, 4, 8, 12, /**/ 0, 2, 6, 10, 14, 4, 8, 12, // + 4, 6, 10, 14, 0, 2, 8, 12, /**/ 0, 4, 6, 10, 14, 2, 8, 12, // + 2, 4, 6, 10, 14, 0, 8, 12, /**/ 0, 2, 4, 6, 10, 14, 8, 12, // + 8, 10, 14, 0, 2, 4, 6, 12, /**/ 0, 8, 10, 14, 2, 4, 6, 12, // + 2, 8, 10, 14, 0, 4, 6, 12, /**/ 0, 2, 8, 10, 14, 4, 6, 12, // + 4, 8, 10, 14, 0, 2, 6, 12, /**/ 0, 4, 8, 10, 14, 
2, 6, 12, // + 2, 4, 8, 10, 14, 0, 6, 12, /**/ 0, 2, 4, 8, 10, 14, 6, 12, // + 6, 8, 10, 14, 0, 2, 4, 12, /**/ 0, 6, 8, 10, 14, 2, 4, 12, // + 2, 6, 8, 10, 14, 0, 4, 12, /**/ 0, 2, 6, 8, 10, 14, 4, 12, // + 4, 6, 8, 10, 14, 0, 2, 12, /**/ 0, 4, 6, 8, 10, 14, 2, 12, // + 2, 4, 6, 8, 10, 14, 0, 12, /**/ 0, 2, 4, 6, 8, 10, 14, 12, // + 12, 14, 0, 2, 4, 6, 8, 10, /**/ 0, 12, 14, 2, 4, 6, 8, 10, // + 2, 12, 14, 0, 4, 6, 8, 10, /**/ 0, 2, 12, 14, 4, 6, 8, 10, // + 4, 12, 14, 0, 2, 6, 8, 10, /**/ 0, 4, 12, 14, 2, 6, 8, 10, // + 2, 4, 12, 14, 0, 6, 8, 10, /**/ 0, 2, 4, 12, 14, 6, 8, 10, // + 6, 12, 14, 0, 2, 4, 8, 10, /**/ 0, 6, 12, 14, 2, 4, 8, 10, // + 2, 6, 12, 14, 0, 4, 8, 10, /**/ 0, 2, 6, 12, 14, 4, 8, 10, // + 4, 6, 12, 14, 0, 2, 8, 10, /**/ 0, 4, 6, 12, 14, 2, 8, 10, // + 2, 4, 6, 12, 14, 0, 8, 10, /**/ 0, 2, 4, 6, 12, 14, 8, 10, // + 8, 12, 14, 0, 2, 4, 6, 10, /**/ 0, 8, 12, 14, 2, 4, 6, 10, // + 2, 8, 12, 14, 0, 4, 6, 10, /**/ 0, 2, 8, 12, 14, 4, 6, 10, // + 4, 8, 12, 14, 0, 2, 6, 10, /**/ 0, 4, 8, 12, 14, 2, 6, 10, // + 2, 4, 8, 12, 14, 0, 6, 10, /**/ 0, 2, 4, 8, 12, 14, 6, 10, // + 6, 8, 12, 14, 0, 2, 4, 10, /**/ 0, 6, 8, 12, 14, 2, 4, 10, // + 2, 6, 8, 12, 14, 0, 4, 10, /**/ 0, 2, 6, 8, 12, 14, 4, 10, // + 4, 6, 8, 12, 14, 0, 2, 10, /**/ 0, 4, 6, 8, 12, 14, 2, 10, // + 2, 4, 6, 8, 12, 14, 0, 10, /**/ 0, 2, 4, 6, 8, 12, 14, 10, // + 10, 12, 14, 0, 2, 4, 6, 8, /**/ 0, 10, 12, 14, 2, 4, 6, 8, // + 2, 10, 12, 14, 0, 4, 6, 8, /**/ 0, 2, 10, 12, 14, 4, 6, 8, // + 4, 10, 12, 14, 0, 2, 6, 8, /**/ 0, 4, 10, 12, 14, 2, 6, 8, // + 2, 4, 10, 12, 14, 0, 6, 8, /**/ 0, 2, 4, 10, 12, 14, 6, 8, // + 6, 10, 12, 14, 0, 2, 4, 8, /**/ 0, 6, 10, 12, 14, 2, 4, 8, // + 2, 6, 10, 12, 14, 0, 4, 8, /**/ 0, 2, 6, 10, 12, 14, 4, 8, // + 4, 6, 10, 12, 14, 0, 2, 8, /**/ 0, 4, 6, 10, 12, 14, 2, 8, // + 2, 4, 6, 10, 12, 14, 0, 8, /**/ 0, 2, 4, 6, 10, 12, 14, 8, // + 8, 10, 12, 14, 0, 2, 4, 6, /**/ 0, 8, 10, 12, 14, 2, 4, 6, // + 2, 8, 10, 12, 14, 0, 4, 6, /**/ 0, 2, 8, 10, 12, 14, 4, 6, // + 
4, 8, 10, 12, 14, 0, 2, 6, /**/ 0, 4, 8, 10, 12, 14, 2, 6, // + 2, 4, 8, 10, 12, 14, 0, 6, /**/ 0, 2, 4, 8, 10, 12, 14, 6, // + 6, 8, 10, 12, 14, 0, 2, 4, /**/ 0, 6, 8, 10, 12, 14, 2, 4, // + 2, 6, 8, 10, 12, 14, 0, 4, /**/ 0, 2, 6, 8, 10, 12, 14, 4, // + 4, 6, 8, 10, 12, 14, 0, 2, /**/ 0, 4, 6, 8, 10, 12, 14, 2, // + 2, 4, 6, 8, 10, 12, 14, 0, /**/ 0, 2, 4, 6, 8, 10, 12, 14}; + + const Vec128<uint8_t, 2 * N> byte_idx{Load(d8, table + mask_bits * 8).raw}; + const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx); + return BitCast(d, pairs + Set(du, 0x0100)); +} + +template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)> +HWY_INLINE Vec128<T, N> IdxFromNotBits(const uint64_t mask_bits) { + HWY_DASSERT(mask_bits < 256); + const Simd<T, N, 0> d; + const Rebind<uint8_t, decltype(d)> d8; + const Simd<uint16_t, N, 0> du; + + // We need byte indices for TableLookupBytes (one vector's worth for each of + // 256 combinations of 8 mask bits). Loading them directly requires 4 KiB. We + // can instead store lane indices and convert to byte indices (2*lane + 0..1), + // with the doubling baked into the table. Unpacking nibbles is likely more + // costly than the higher cache footprint from storing bytes. 
+ alignas(16) static constexpr uint8_t table[256 * 8] = { + // PrintCompressNot16x8Tables + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 14, 0, // + 0, 4, 6, 8, 10, 12, 14, 2, /**/ 4, 6, 8, 10, 12, 14, 0, 2, // + 0, 2, 6, 8, 10, 12, 14, 4, /**/ 2, 6, 8, 10, 12, 14, 0, 4, // + 0, 6, 8, 10, 12, 14, 2, 4, /**/ 6, 8, 10, 12, 14, 0, 2, 4, // + 0, 2, 4, 8, 10, 12, 14, 6, /**/ 2, 4, 8, 10, 12, 14, 0, 6, // + 0, 4, 8, 10, 12, 14, 2, 6, /**/ 4, 8, 10, 12, 14, 0, 2, 6, // + 0, 2, 8, 10, 12, 14, 4, 6, /**/ 2, 8, 10, 12, 14, 0, 4, 6, // + 0, 8, 10, 12, 14, 2, 4, 6, /**/ 8, 10, 12, 14, 0, 2, 4, 6, // + 0, 2, 4, 6, 10, 12, 14, 8, /**/ 2, 4, 6, 10, 12, 14, 0, 8, // + 0, 4, 6, 10, 12, 14, 2, 8, /**/ 4, 6, 10, 12, 14, 0, 2, 8, // + 0, 2, 6, 10, 12, 14, 4, 8, /**/ 2, 6, 10, 12, 14, 0, 4, 8, // + 0, 6, 10, 12, 14, 2, 4, 8, /**/ 6, 10, 12, 14, 0, 2, 4, 8, // + 0, 2, 4, 10, 12, 14, 6, 8, /**/ 2, 4, 10, 12, 14, 0, 6, 8, // + 0, 4, 10, 12, 14, 2, 6, 8, /**/ 4, 10, 12, 14, 0, 2, 6, 8, // + 0, 2, 10, 12, 14, 4, 6, 8, /**/ 2, 10, 12, 14, 0, 4, 6, 8, // + 0, 10, 12, 14, 2, 4, 6, 8, /**/ 10, 12, 14, 0, 2, 4, 6, 8, // + 0, 2, 4, 6, 8, 12, 14, 10, /**/ 2, 4, 6, 8, 12, 14, 0, 10, // + 0, 4, 6, 8, 12, 14, 2, 10, /**/ 4, 6, 8, 12, 14, 0, 2, 10, // + 0, 2, 6, 8, 12, 14, 4, 10, /**/ 2, 6, 8, 12, 14, 0, 4, 10, // + 0, 6, 8, 12, 14, 2, 4, 10, /**/ 6, 8, 12, 14, 0, 2, 4, 10, // + 0, 2, 4, 8, 12, 14, 6, 10, /**/ 2, 4, 8, 12, 14, 0, 6, 10, // + 0, 4, 8, 12, 14, 2, 6, 10, /**/ 4, 8, 12, 14, 0, 2, 6, 10, // + 0, 2, 8, 12, 14, 4, 6, 10, /**/ 2, 8, 12, 14, 0, 4, 6, 10, // + 0, 8, 12, 14, 2, 4, 6, 10, /**/ 8, 12, 14, 0, 2, 4, 6, 10, // + 0, 2, 4, 6, 12, 14, 8, 10, /**/ 2, 4, 6, 12, 14, 0, 8, 10, // + 0, 4, 6, 12, 14, 2, 8, 10, /**/ 4, 6, 12, 14, 0, 2, 8, 10, // + 0, 2, 6, 12, 14, 4, 8, 10, /**/ 2, 6, 12, 14, 0, 4, 8, 10, // + 0, 6, 12, 14, 2, 4, 8, 10, /**/ 6, 12, 14, 0, 2, 4, 8, 10, // + 0, 2, 4, 12, 14, 6, 8, 10, /**/ 2, 4, 12, 14, 0, 6, 8, 10, // + 0, 4, 12, 14, 2, 6, 8, 10, /**/ 4, 12, 14, 0, 2, 6, 
8, 10, // + 0, 2, 12, 14, 4, 6, 8, 10, /**/ 2, 12, 14, 0, 4, 6, 8, 10, // + 0, 12, 14, 2, 4, 6, 8, 10, /**/ 12, 14, 0, 2, 4, 6, 8, 10, // + 0, 2, 4, 6, 8, 10, 14, 12, /**/ 2, 4, 6, 8, 10, 14, 0, 12, // + 0, 4, 6, 8, 10, 14, 2, 12, /**/ 4, 6, 8, 10, 14, 0, 2, 12, // + 0, 2, 6, 8, 10, 14, 4, 12, /**/ 2, 6, 8, 10, 14, 0, 4, 12, // + 0, 6, 8, 10, 14, 2, 4, 12, /**/ 6, 8, 10, 14, 0, 2, 4, 12, // + 0, 2, 4, 8, 10, 14, 6, 12, /**/ 2, 4, 8, 10, 14, 0, 6, 12, // + 0, 4, 8, 10, 14, 2, 6, 12, /**/ 4, 8, 10, 14, 0, 2, 6, 12, // + 0, 2, 8, 10, 14, 4, 6, 12, /**/ 2, 8, 10, 14, 0, 4, 6, 12, // + 0, 8, 10, 14, 2, 4, 6, 12, /**/ 8, 10, 14, 0, 2, 4, 6, 12, // + 0, 2, 4, 6, 10, 14, 8, 12, /**/ 2, 4, 6, 10, 14, 0, 8, 12, // + 0, 4, 6, 10, 14, 2, 8, 12, /**/ 4, 6, 10, 14, 0, 2, 8, 12, // + 0, 2, 6, 10, 14, 4, 8, 12, /**/ 2, 6, 10, 14, 0, 4, 8, 12, // + 0, 6, 10, 14, 2, 4, 8, 12, /**/ 6, 10, 14, 0, 2, 4, 8, 12, // + 0, 2, 4, 10, 14, 6, 8, 12, /**/ 2, 4, 10, 14, 0, 6, 8, 12, // + 0, 4, 10, 14, 2, 6, 8, 12, /**/ 4, 10, 14, 0, 2, 6, 8, 12, // + 0, 2, 10, 14, 4, 6, 8, 12, /**/ 2, 10, 14, 0, 4, 6, 8, 12, // + 0, 10, 14, 2, 4, 6, 8, 12, /**/ 10, 14, 0, 2, 4, 6, 8, 12, // + 0, 2, 4, 6, 8, 14, 10, 12, /**/ 2, 4, 6, 8, 14, 0, 10, 12, // + 0, 4, 6, 8, 14, 2, 10, 12, /**/ 4, 6, 8, 14, 0, 2, 10, 12, // + 0, 2, 6, 8, 14, 4, 10, 12, /**/ 2, 6, 8, 14, 0, 4, 10, 12, // + 0, 6, 8, 14, 2, 4, 10, 12, /**/ 6, 8, 14, 0, 2, 4, 10, 12, // + 0, 2, 4, 8, 14, 6, 10, 12, /**/ 2, 4, 8, 14, 0, 6, 10, 12, // + 0, 4, 8, 14, 2, 6, 10, 12, /**/ 4, 8, 14, 0, 2, 6, 10, 12, // + 0, 2, 8, 14, 4, 6, 10, 12, /**/ 2, 8, 14, 0, 4, 6, 10, 12, // + 0, 8, 14, 2, 4, 6, 10, 12, /**/ 8, 14, 0, 2, 4, 6, 10, 12, // + 0, 2, 4, 6, 14, 8, 10, 12, /**/ 2, 4, 6, 14, 0, 8, 10, 12, // + 0, 4, 6, 14, 2, 8, 10, 12, /**/ 4, 6, 14, 0, 2, 8, 10, 12, // + 0, 2, 6, 14, 4, 8, 10, 12, /**/ 2, 6, 14, 0, 4, 8, 10, 12, // + 0, 6, 14, 2, 4, 8, 10, 12, /**/ 6, 14, 0, 2, 4, 8, 10, 12, // + 0, 2, 4, 14, 6, 8, 10, 12, /**/ 2, 4, 14, 0, 6, 8, 10, 12, // + 0, 
4, 14, 2, 6, 8, 10, 12, /**/ 4, 14, 0, 2, 6, 8, 10, 12, // + 0, 2, 14, 4, 6, 8, 10, 12, /**/ 2, 14, 0, 4, 6, 8, 10, 12, // + 0, 14, 2, 4, 6, 8, 10, 12, /**/ 14, 0, 2, 4, 6, 8, 10, 12, // + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 0, 14, // + 0, 4, 6, 8, 10, 12, 2, 14, /**/ 4, 6, 8, 10, 12, 0, 2, 14, // + 0, 2, 6, 8, 10, 12, 4, 14, /**/ 2, 6, 8, 10, 12, 0, 4, 14, // + 0, 6, 8, 10, 12, 2, 4, 14, /**/ 6, 8, 10, 12, 0, 2, 4, 14, // + 0, 2, 4, 8, 10, 12, 6, 14, /**/ 2, 4, 8, 10, 12, 0, 6, 14, // + 0, 4, 8, 10, 12, 2, 6, 14, /**/ 4, 8, 10, 12, 0, 2, 6, 14, // + 0, 2, 8, 10, 12, 4, 6, 14, /**/ 2, 8, 10, 12, 0, 4, 6, 14, // + 0, 8, 10, 12, 2, 4, 6, 14, /**/ 8, 10, 12, 0, 2, 4, 6, 14, // + 0, 2, 4, 6, 10, 12, 8, 14, /**/ 2, 4, 6, 10, 12, 0, 8, 14, // + 0, 4, 6, 10, 12, 2, 8, 14, /**/ 4, 6, 10, 12, 0, 2, 8, 14, // + 0, 2, 6, 10, 12, 4, 8, 14, /**/ 2, 6, 10, 12, 0, 4, 8, 14, // + 0, 6, 10, 12, 2, 4, 8, 14, /**/ 6, 10, 12, 0, 2, 4, 8, 14, // + 0, 2, 4, 10, 12, 6, 8, 14, /**/ 2, 4, 10, 12, 0, 6, 8, 14, // + 0, 4, 10, 12, 2, 6, 8, 14, /**/ 4, 10, 12, 0, 2, 6, 8, 14, // + 0, 2, 10, 12, 4, 6, 8, 14, /**/ 2, 10, 12, 0, 4, 6, 8, 14, // + 0, 10, 12, 2, 4, 6, 8, 14, /**/ 10, 12, 0, 2, 4, 6, 8, 14, // + 0, 2, 4, 6, 8, 12, 10, 14, /**/ 2, 4, 6, 8, 12, 0, 10, 14, // + 0, 4, 6, 8, 12, 2, 10, 14, /**/ 4, 6, 8, 12, 0, 2, 10, 14, // + 0, 2, 6, 8, 12, 4, 10, 14, /**/ 2, 6, 8, 12, 0, 4, 10, 14, // + 0, 6, 8, 12, 2, 4, 10, 14, /**/ 6, 8, 12, 0, 2, 4, 10, 14, // + 0, 2, 4, 8, 12, 6, 10, 14, /**/ 2, 4, 8, 12, 0, 6, 10, 14, // + 0, 4, 8, 12, 2, 6, 10, 14, /**/ 4, 8, 12, 0, 2, 6, 10, 14, // + 0, 2, 8, 12, 4, 6, 10, 14, /**/ 2, 8, 12, 0, 4, 6, 10, 14, // + 0, 8, 12, 2, 4, 6, 10, 14, /**/ 8, 12, 0, 2, 4, 6, 10, 14, // + 0, 2, 4, 6, 12, 8, 10, 14, /**/ 2, 4, 6, 12, 0, 8, 10, 14, // + 0, 4, 6, 12, 2, 8, 10, 14, /**/ 4, 6, 12, 0, 2, 8, 10, 14, // + 0, 2, 6, 12, 4, 8, 10, 14, /**/ 2, 6, 12, 0, 4, 8, 10, 14, // + 0, 6, 12, 2, 4, 8, 10, 14, /**/ 6, 12, 0, 2, 4, 8, 10, 14, // + 0, 2, 4, 12, 6, 8, 
10, 14, /**/ 2, 4, 12, 0, 6, 8, 10, 14, // + 0, 4, 12, 2, 6, 8, 10, 14, /**/ 4, 12, 0, 2, 6, 8, 10, 14, // + 0, 2, 12, 4, 6, 8, 10, 14, /**/ 2, 12, 0, 4, 6, 8, 10, 14, // + 0, 12, 2, 4, 6, 8, 10, 14, /**/ 12, 0, 2, 4, 6, 8, 10, 14, // + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 0, 12, 14, // + 0, 4, 6, 8, 10, 2, 12, 14, /**/ 4, 6, 8, 10, 0, 2, 12, 14, // + 0, 2, 6, 8, 10, 4, 12, 14, /**/ 2, 6, 8, 10, 0, 4, 12, 14, // + 0, 6, 8, 10, 2, 4, 12, 14, /**/ 6, 8, 10, 0, 2, 4, 12, 14, // + 0, 2, 4, 8, 10, 6, 12, 14, /**/ 2, 4, 8, 10, 0, 6, 12, 14, // + 0, 4, 8, 10, 2, 6, 12, 14, /**/ 4, 8, 10, 0, 2, 6, 12, 14, // + 0, 2, 8, 10, 4, 6, 12, 14, /**/ 2, 8, 10, 0, 4, 6, 12, 14, // + 0, 8, 10, 2, 4, 6, 12, 14, /**/ 8, 10, 0, 2, 4, 6, 12, 14, // + 0, 2, 4, 6, 10, 8, 12, 14, /**/ 2, 4, 6, 10, 0, 8, 12, 14, // + 0, 4, 6, 10, 2, 8, 12, 14, /**/ 4, 6, 10, 0, 2, 8, 12, 14, // + 0, 2, 6, 10, 4, 8, 12, 14, /**/ 2, 6, 10, 0, 4, 8, 12, 14, // + 0, 6, 10, 2, 4, 8, 12, 14, /**/ 6, 10, 0, 2, 4, 8, 12, 14, // + 0, 2, 4, 10, 6, 8, 12, 14, /**/ 2, 4, 10, 0, 6, 8, 12, 14, // + 0, 4, 10, 2, 6, 8, 12, 14, /**/ 4, 10, 0, 2, 6, 8, 12, 14, // + 0, 2, 10, 4, 6, 8, 12, 14, /**/ 2, 10, 0, 4, 6, 8, 12, 14, // + 0, 10, 2, 4, 6, 8, 12, 14, /**/ 10, 0, 2, 4, 6, 8, 12, 14, // + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 0, 10, 12, 14, // + 0, 4, 6, 8, 2, 10, 12, 14, /**/ 4, 6, 8, 0, 2, 10, 12, 14, // + 0, 2, 6, 8, 4, 10, 12, 14, /**/ 2, 6, 8, 0, 4, 10, 12, 14, // + 0, 6, 8, 2, 4, 10, 12, 14, /**/ 6, 8, 0, 2, 4, 10, 12, 14, // + 0, 2, 4, 8, 6, 10, 12, 14, /**/ 2, 4, 8, 0, 6, 10, 12, 14, // + 0, 4, 8, 2, 6, 10, 12, 14, /**/ 4, 8, 0, 2, 6, 10, 12, 14, // + 0, 2, 8, 4, 6, 10, 12, 14, /**/ 2, 8, 0, 4, 6, 10, 12, 14, // + 0, 8, 2, 4, 6, 10, 12, 14, /**/ 8, 0, 2, 4, 6, 10, 12, 14, // + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 0, 8, 10, 12, 14, // + 0, 4, 6, 2, 8, 10, 12, 14, /**/ 4, 6, 0, 2, 8, 10, 12, 14, // + 0, 2, 6, 4, 8, 10, 12, 14, /**/ 2, 6, 0, 4, 8, 10, 12, 14, // + 0, 6, 2, 4, 8, 10, 12, 14, /**/ 6, 
0, 2, 4, 8, 10, 12, 14, // + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 0, 6, 8, 10, 12, 14, // + 0, 4, 2, 6, 8, 10, 12, 14, /**/ 4, 0, 2, 6, 8, 10, 12, 14, // + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 0, 4, 6, 8, 10, 12, 14, // + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14}; + + const Vec128<uint8_t, 2 * N> byte_idx{Load(d8, table + mask_bits * 8).raw}; + const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx); + return BitCast(d, pairs + Set(du, 0x0100)); +} + +template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)> +HWY_INLINE Vec128<T, N> IdxFromBits(const uint64_t mask_bits) { + HWY_DASSERT(mask_bits < 16); + + // There are only 4 lanes, so we can afford to load the index vector directly. + alignas(16) static constexpr uint8_t u8_indices[16 * 16] = { + // PrintCompress32x4Tables + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // + 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, // + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // + 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, // + 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, // + 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, // + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // + 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, // + 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, // + 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, // + 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, // + 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, // + 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, // + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, // + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + const Simd<T, N, 0> d; + const Repartition<uint8_t, decltype(d)> d8; + return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); +} + +template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)> +HWY_INLINE Vec128<T, N> IdxFromNotBits(const uint64_t 
mask_bits) { + HWY_DASSERT(mask_bits < 16); + + // There are only 4 lanes, so we can afford to load the index vector directly. + alignas(16) static constexpr uint8_t u8_indices[16 * 16] = { + // PrintCompressNot32x4Tables + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, + 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, + 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, + 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1, + 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, + 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3, + 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15}; + const Simd<T, N, 0> d; + const Repartition<uint8_t, decltype(d)> d8; + return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); +} + +template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)> +HWY_INLINE Vec128<T, N> IdxFromBits(const uint64_t mask_bits) { + HWY_DASSERT(mask_bits < 4); + + // There are only 2 lanes, so we can afford to load the index vector directly. 
+ alignas(16) static constexpr uint8_t u8_indices[4 * 16] = { + // PrintCompress64x2Tables + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + + const Simd<T, N, 0> d; + const Repartition<uint8_t, decltype(d)> d8; + return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); +} + +template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)> +HWY_INLINE Vec128<T, N> IdxFromNotBits(const uint64_t mask_bits) { + HWY_DASSERT(mask_bits < 4); + + // There are only 2 lanes, so we can afford to load the index vector directly. + alignas(16) static constexpr uint8_t u8_indices[4 * 16] = { + // PrintCompressNot64x2Tables + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + + const Simd<T, N, 0> d; + const Repartition<uint8_t, decltype(d)> d8; + return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); +} + +// Helper functions called by both Compress and CompressStore - avoids a +// redundant BitsFromMask in the latter. 
+ +template <typename T, size_t N> +HWY_INLINE Vec128<T, N> Compress(Vec128<T, N> v, const uint64_t mask_bits) { + const auto idx = detail::IdxFromBits<T, N>(mask_bits); + const DFromV<decltype(v)> d; + const RebindToSigned<decltype(d)> di; + return BitCast(d, TableLookupBytes(BitCast(di, v), BitCast(di, idx))); +} + +template <typename T, size_t N> +HWY_INLINE Vec128<T, N> CompressNot(Vec128<T, N> v, const uint64_t mask_bits) { + const auto idx = detail::IdxFromNotBits<T, N>(mask_bits); + const DFromV<decltype(v)> d; + const RebindToSigned<decltype(d)> di; + return BitCast(d, TableLookupBytes(BitCast(di, v), BitCast(di, idx))); +} + +} // namespace detail + +template <typename T> +struct CompressIsPartition { +#if HWY_TARGET == HWY_WASM_EMU256 + enum { value = 0 }; +#else + enum { value = (sizeof(T) != 1) }; +#endif +}; + +// Single lane: no-op +template <typename T> +HWY_API Vec128<T, 1> Compress(Vec128<T, 1> v, Mask128<T, 1> /*m*/) { + return v; +} + +// Two lanes: conditional swap +template <typename T, HWY_IF_T_SIZE(T, 8)> +HWY_API Vec128<T> Compress(Vec128<T> v, Mask128<T> mask) { + // If mask[1] = 1 and mask[0] = 0, then swap both halves, else keep. 
+ const Full128<T> d; + const Vec128<T> m = VecFromMask(d, mask); + const Vec128<T> maskL = DupEven(m); + const Vec128<T> maskH = DupOdd(m); + const Vec128<T> swap = AndNot(maskL, maskH); + return IfVecThenElse(swap, Shuffle01(v), v); +} + +// General case, 2 or 4 byte lanes +template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 2))> +HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) { + return detail::Compress(v, detail::BitsFromMask(mask)); +} + +// Single lane: no-op +template <typename T> +HWY_API Vec128<T, 1> CompressNot(Vec128<T, 1> v, Mask128<T, 1> /*m*/) { + return v; +} + +// Two lanes: conditional swap +template <typename T, HWY_IF_T_SIZE(T, 8)> +HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) { + // If mask[1] = 0 and mask[0] = 1, then swap both halves, else keep. + const Full128<T> d; + const Vec128<T> m = VecFromMask(d, mask); + const Vec128<T> maskL = DupEven(m); + const Vec128<T> maskH = DupOdd(m); + const Vec128<T> swap = AndNot(maskH, maskL); + return IfVecThenElse(swap, Shuffle01(v), v); +} + +// General case, 2 or 4 byte lanes +template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))> +HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) { + // For partial vectors, we cannot pull the Not() into the table because + // BitsFromMask clears the upper bits. 
+ if (N < 16 / sizeof(T)) { + return detail::Compress(v, detail::BitsFromMask(Not(mask))); + } + return detail::CompressNot(v, detail::BitsFromMask(mask)); +} + +// ------------------------------ CompressBlocksNot +HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v, + Mask128<uint64_t> /* m */) { + return v; +} + +// ------------------------------ CompressBits +template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)> +HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v, + const uint8_t* HWY_RESTRICT bits) { + uint64_t mask_bits = 0; + constexpr size_t kNumBytes = (N + 7) / 8; + CopyBytes<kNumBytes>(bits, &mask_bits); + if (N < 8) { + mask_bits &= (1ull << N) - 1; + } + + return detail::Compress(v, mask_bits); +} + +// ------------------------------ CompressStore +template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)> +HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> mask, D d, + TFromD<D>* HWY_RESTRICT unaligned) { + const uint64_t mask_bits = detail::BitsFromMask(mask); + const auto c = detail::Compress(v, mask_bits); + StoreU(c, d, unaligned); + return PopCount(mask_bits); +} + +// ------------------------------ CompressBlendedStore +template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)> +HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d, + TFromD<D>* HWY_RESTRICT unaligned) { + const RebindToUnsigned<decltype(d)> du; // so we can support fp16/bf16 + const uint64_t mask_bits = detail::BitsFromMask(m); + const size_t count = PopCount(mask_bits); + const VFromD<decltype(du)> compressed = + detail::Compress(BitCast(du, v), mask_bits); + const MFromD<D> store_mask = RebindMask(d, FirstN(du, count)); + BlendedStore(BitCast(d, compressed), store_mask, d, unaligned); + return count; +} + +// ------------------------------ CompressBitsStore + +template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)> +HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits, + D d, TFromD<D>* HWY_RESTRICT unaligned) { + uint64_t mask_bits = 0; + constexpr size_t kN = 
MaxLanes(d); + CopyBytes<(kN + 7) / 8>(bits, &mask_bits); + if (kN < 8) { + mask_bits &= (1ull << kN) - 1; + } + + const auto c = detail::Compress(v, mask_bits); + StoreU(c, d, unaligned); + return PopCount(mask_bits); +} + +// ------------------------------ StoreInterleaved2/3/4 + +// HWY_NATIVE_LOAD_STORE_INTERLEAVED not set, hence defined in +// generic_ops-inl.h. + +// ------------------------------ MulEven/Odd (Load) + +HWY_INLINE Vec128<uint64_t> MulEven(const Vec128<uint64_t> a, + const Vec128<uint64_t> b) { + alignas(16) uint64_t mul[2]; + mul[0] = + Mul128(static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 0)), + static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 0)), &mul[1]); + return Load(Full128<uint64_t>(), mul); +} + +HWY_INLINE Vec128<uint64_t> MulOdd(const Vec128<uint64_t> a, + const Vec128<uint64_t> b) { + alignas(16) uint64_t mul[2]; + mul[0] = + Mul128(static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 1)), + static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 1)), &mul[1]); + return Load(Full128<uint64_t>(), mul); +} + +// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower) + +// Generic for all vector lengths. +template <class D32, HWY_IF_F32_D(D32), + class V16 = VFromD<Repartition<bfloat16_t, D32>>> +HWY_API VFromD<D32> WidenMulPairwiseAdd(D32 df32, V16 a, V16 b) { + const Rebind<uint32_t, decltype(df32)> du32; + using VU32 = VFromD<decltype(du32)>; + const VU32 odd = Set(du32, 0xFFFF0000u); // bfloat16 is the upper half of f32 + // Using shift/and instead of Zip leads to the odd/even order that + // RearrangeToOddPlusEven prefers. 
+ const VU32 ae = ShiftLeft<16>(BitCast(du32, a)); + const VU32 ao = And(BitCast(du32, a), odd); + const VU32 be = ShiftLeft<16>(BitCast(du32, b)); + const VU32 bo = And(BitCast(du32, b), odd); + return Mul(BitCast(df32, ae), BitCast(df32, be)) + Mul(BitCast(df32, ao), BitCast(df32, bo)); +} + +template <class D32, HWY_IF_F32_D(D32), + class V16 = VFromD<Repartition<bfloat16_t, D32>>> +HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 df32, V16 a, V16 b, + const VFromD<D32> sum0, + VFromD<D32>& sum1) { + const Rebind<uint32_t, decltype(df32)> du32; + using VU32 = VFromD<decltype(du32)>; + const VU32 odd = Set(du32, 0xFFFF0000u); // bfloat16 is the upper half of f32 + // Using shift/and instead of Zip leads to the odd/even order that + // RearrangeToOddPlusEven prefers. + const VU32 ae = ShiftLeft<16>(BitCast(du32, a)); + const VU32 ao = And(BitCast(du32, a), odd); + const VU32 be = ShiftLeft<16>(BitCast(du32, b)); + const VU32 bo = And(BitCast(du32, b), odd); + sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1); + return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0); +} + +// Even if N=1, the input is always at least 2 lanes, hence i32x4_dot_i16x8 is +// safe. +template <class D32, HWY_IF_I32_D(D32), HWY_IF_V_SIZE_LE_D(D32, 16), + class V16 = VFromD<RepartitionToNarrow<D32>>> +HWY_API VFromD<D32> WidenMulPairwiseAdd(D32 /* tag */, V16 a, V16 b) { + return VFromD<D32>{wasm_i32x4_dot_i16x8(a.raw, b.raw)}; +} + +// Even if N=1, the input is always at least 2 lanes, hence i32x4_dot_i16x8 is +// safe. 
+template <class D32, HWY_IF_I32_D(D32), HWY_IF_V_SIZE_LE_D(D32, 16), + class V16 = VFromD<RepartitionToNarrow<D32>>> +HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 d, V16 a, V16 b, + const VFromD<D32> sum0, + VFromD<D32>& /*sum1*/) { + return sum0 + WidenMulPairwiseAdd(d, a, b); +} + +// ------------------------------ RearrangeToOddPlusEven +template <size_t N> +HWY_API Vec128<int32_t, N> RearrangeToOddPlusEven( + const Vec128<int32_t, N> sum0, const Vec128<int32_t, N> /*sum1*/) { + return sum0; // invariant already holds +} + +template <size_t N> +HWY_API Vec128<float, N> RearrangeToOddPlusEven(const Vec128<float, N> sum0, + const Vec128<float, N> sum1) { + return Add(sum0, sum1); +} + +// ------------------------------ Reductions + +namespace detail { + +// N=1 for any T: no-op +template <typename T> +HWY_INLINE Vec128<T, 1> SumOfLanes(hwy::SizeTag<sizeof(T)> /* tag */, + const Vec128<T, 1> v) { + return v; +} +template <typename T> +HWY_INLINE Vec128<T, 1> MinOfLanes(hwy::SizeTag<sizeof(T)> /* tag */, + const Vec128<T, 1> v) { + return v; +} +template <typename T> +HWY_INLINE Vec128<T, 1> MaxOfLanes(hwy::SizeTag<sizeof(T)> /* tag */, + const Vec128<T, 1> v) { + return v; +} + +// u32/i32/f32: + +// N=2 +template <typename T> +HWY_INLINE Vec128<T, 2> SumOfLanes(hwy::SizeTag<4> /* tag */, + const Vec128<T, 2> v10) { + return v10 + Vec128<T, 2>{Shuffle2301(Vec128<T>{v10.raw}).raw}; +} +template <typename T> +HWY_INLINE Vec128<T, 2> MinOfLanes(hwy::SizeTag<4> /* tag */, + const Vec128<T, 2> v10) { + return Min(v10, Vec128<T, 2>{Shuffle2301(Vec128<T>{v10.raw}).raw}); +} +template <typename T> +HWY_INLINE Vec128<T, 2> MaxOfLanes(hwy::SizeTag<4> /* tag */, + const Vec128<T, 2> v10) { + return Max(v10, Vec128<T, 2>{Shuffle2301(Vec128<T>{v10.raw}).raw}); +} + +// N=4 (full) +template <typename T> +HWY_INLINE Vec128<T> SumOfLanes(hwy::SizeTag<4> /* tag */, + const Vec128<T> v3210) { + const Vec128<T> v1032 = Shuffle1032(v3210); + const Vec128<T> v31_20_31_20 = v3210 
+ v1032; + const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20); + return v20_31_20_31 + v31_20_31_20; +} +template <typename T> +HWY_INLINE Vec128<T> MinOfLanes(hwy::SizeTag<4> /* tag */, + const Vec128<T> v3210) { + const Vec128<T> v1032 = Shuffle1032(v3210); + const Vec128<T> v31_20_31_20 = Min(v3210, v1032); + const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20); + return Min(v20_31_20_31, v31_20_31_20); +} +template <typename T> +HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<4> /* tag */, + const Vec128<T> v3210) { + const Vec128<T> v1032 = Shuffle1032(v3210); + const Vec128<T> v31_20_31_20 = Max(v3210, v1032); + const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20); + return Max(v20_31_20_31, v31_20_31_20); +} + +// u64/i64/f64: + +// N=2 (full) +template <typename T> +HWY_INLINE Vec128<T> SumOfLanes(hwy::SizeTag<8> /* tag */, + const Vec128<T> v10) { + const Vec128<T> v01 = Shuffle01(v10); + return v10 + v01; +} +template <typename T> +HWY_INLINE Vec128<T> MinOfLanes(hwy::SizeTag<8> /* tag */, + const Vec128<T> v10) { + const Vec128<T> v01 = Shuffle01(v10); + return Min(v10, v01); +} +template <typename T> +HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<8> /* tag */, + const Vec128<T> v10) { + const Vec128<T> v01 = Shuffle01(v10); + return Max(v10, v01); +} + +template <size_t N, HWY_IF_V_SIZE_GT(uint16_t, N, 2)> +HWY_API Vec128<uint16_t, N> SumOfLanes(hwy::SizeTag<2> /* tag */, + Vec128<uint16_t, N> v) { + const DFromV<decltype(v)> d; + const RepartitionToWide<decltype(d)> d32; + const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF)); + const auto odd = ShiftRight<16>(BitCast(d32, v)); + const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd); + // Also broadcast into odd lanes. 
+ return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum)); +} +template <size_t N, HWY_IF_V_SIZE_GT(int16_t, N, 2)> +HWY_API Vec128<int16_t, N> SumOfLanes(hwy::SizeTag<2> /* tag */, + Vec128<int16_t, N> v) { + const Simd<int16_t, N, 0> d; + const RepartitionToWide<decltype(d)> d32; + // Sign-extend + const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v))); + const auto odd = ShiftRight<16>(BitCast(d32, v)); + const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd); + // Also broadcast into odd lanes. + return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum)); +} +template <size_t N, HWY_IF_V_SIZE_GT(uint16_t, N, 2)> +HWY_API Vec128<uint16_t, N> MinOfLanes(hwy::SizeTag<2> /* tag */, + Vec128<uint16_t, N> v) { + const DFromV<decltype(v)> d; + const RepartitionToWide<decltype(d)> d32; + const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF)); + const auto odd = ShiftRight<16>(BitCast(d32, v)); + const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd)); + // Also broadcast into odd lanes. + return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min)); +} +template <size_t N, HWY_IF_V_SIZE_GT(int16_t, N, 2)> +HWY_API Vec128<int16_t, N> MinOfLanes(hwy::SizeTag<2> /* tag */, + Vec128<int16_t, N> v) { + const Simd<int16_t, N, 0> d; + const RepartitionToWide<decltype(d)> d32; + // Sign-extend + const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v))); + const auto odd = ShiftRight<16>(BitCast(d32, v)); + const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd)); + // Also broadcast into odd lanes. 
+ return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min)); +} + +template <size_t N, HWY_IF_V_SIZE_GT(uint16_t, N, 2)> +HWY_API Vec128<uint16_t, N> MaxOfLanes(hwy::SizeTag<2> /* tag */, + Vec128<uint16_t, N> v) { + const DFromV<decltype(v)> d; + const RepartitionToWide<decltype(d)> d32; + const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF)); + const auto odd = ShiftRight<16>(BitCast(d32, v)); + const auto min = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd)); + // Also broadcast into odd lanes. + return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min)); +} +template <size_t N, HWY_IF_V_SIZE_GT(int16_t, N, 2)> +HWY_API Vec128<int16_t, N> MaxOfLanes(hwy::SizeTag<2> /* tag */, + Vec128<int16_t, N> v) { + const Simd<int16_t, N, 0> d; + const RepartitionToWide<decltype(d)> d32; + // Sign-extend + const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v))); + const auto odd = ShiftRight<16>(BitCast(d32, v)); + const auto min = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd)); + // Also broadcast into odd lanes. + return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min)); +} + +} // namespace detail + +// Supported for u/i/f 32/64. Returns the same value in each lane. +template <class D> +HWY_API VFromD<D> SumOfLanes(D /* tag */, const VFromD<D> v) { + return detail::SumOfLanes(hwy::SizeTag<sizeof(TFromD<D>)>(), v); +} +template <class D> +HWY_API TFromD<D> ReduceSum(D d, const VFromD<D> v) { + return GetLane(SumOfLanes(d, v)); +} +template <class D> +HWY_API VFromD<D> MinOfLanes(D /* tag */, const VFromD<D> v) { + return detail::MinOfLanes(hwy::SizeTag<sizeof(TFromD<D>)>(), v); +} +template <class D> +HWY_API VFromD<D> MaxOfLanes(D /* tag */, const VFromD<D> v) { + return detail::MaxOfLanes(hwy::SizeTag<sizeof(TFromD<D>)>(), v); +} + +// ------------------------------ Lt128 + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)> +HWY_INLINE MFromD<D> Lt128(D d, VFromD<D> a, VFromD<D> b) { + // Truth table of Eq and Lt for Hi and Lo u64. 
+ // (removed lines with (=H && cH) or (=L && cL) - cannot both be true) + // =H =L cH cL | out = cH | (=H & cL) + // 0 0 0 0 | 0 + // 0 0 0 1 | 0 + // 0 0 1 0 | 1 + // 0 0 1 1 | 1 + // 0 1 0 0 | 0 + // 0 1 0 1 | 0 + // 0 1 1 0 | 1 + // 1 0 0 0 | 0 + // 1 0 0 1 | 1 + // 1 1 0 0 | 0 + const MFromD<D> eqHL = Eq(a, b); + const VFromD<D> ltHL = VecFromMask(d, Lt(a, b)); + // We need to bring cL to the upper lane/bit corresponding to cH. Comparing + // the result of InterleaveUpper/Lower requires 9 ops, whereas shifting the + // comparison result leftwards requires only 4. IfThenElse compiles to the + // same code as OrAnd(). + const VFromD<D> ltLx = DupEven(ltHL); + const VFromD<D> outHx = IfThenElse(eqHL, ltLx, ltHL); + return MaskFromVec(DupOdd(outHx)); +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_INLINE MFromD<D> Lt128Upper(D d, VFromD<D> a, VFromD<D> b) { + const VFromD<D> ltHL = VecFromMask(d, Lt(a, b)); + return MaskFromVec(InterleaveUpper(d, ltHL, ltHL)); +} + +// ------------------------------ Eq128 + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)> +HWY_INLINE MFromD<D> Eq128(D d, VFromD<D> a, VFromD<D> b) { + const VFromD<D> eqHL = VecFromMask(d, Eq(a, b)); + return MaskFromVec(And(Reverse2(d, eqHL), eqHL)); +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_INLINE MFromD<D> Eq128Upper(D d, VFromD<D> a, VFromD<D> b) { + const VFromD<D> eqHL = VecFromMask(d, Eq(a, b)); + return MaskFromVec(InterleaveUpper(d, eqHL, eqHL)); +} + +// ------------------------------ Ne128 + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)> +HWY_INLINE MFromD<D> Ne128(D d, VFromD<D> a, VFromD<D> b) { + const VFromD<D> neHL = VecFromMask(d, Ne(a, b)); + return MaskFromVec(Or(Reverse2(d, neHL), neHL)); +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_INLINE MFromD<D> Ne128Upper(D d, VFromD<D> a, VFromD<D> b) { + const VFromD<D> neHL = VecFromMask(d, Ne(a, b)); + return MaskFromVec(InterleaveUpper(d, neHL, neHL)); +} + +// 
------------------------------ Min128, Max128 (Lt128) + +// Without a native OddEven, it seems infeasible to go faster than Lt128. +template <class D> +HWY_INLINE VFromD<D> Min128(D d, const VFromD<D> a, const VFromD<D> b) { + return IfThenElse(Lt128(d, a, b), a, b); +} + +template <class D> +HWY_INLINE VFromD<D> Max128(D d, const VFromD<D> a, const VFromD<D> b) { + return IfThenElse(Lt128(d, b, a), a, b); +} + +template <class D> +HWY_INLINE VFromD<D> Min128Upper(D d, const VFromD<D> a, const VFromD<D> b) { + return IfThenElse(Lt128Upper(d, a, b), a, b); +} + +template <class D> +HWY_INLINE VFromD<D> Max128Upper(D d, const VFromD<D> a, const VFromD<D> b) { + return IfThenElse(Lt128Upper(d, b, a), a, b); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); diff --git a/third_party/highway/hwy/ops/wasm_256-inl.h b/third_party/highway/hwy/ops/wasm_256-inl.h new file mode 100644 index 0000000000..1654ae0afb --- /dev/null +++ b/third_party/highway/hwy/ops/wasm_256-inl.h @@ -0,0 +1,2030 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// 256-bit WASM vectors and operations. Experimental. +// External include guard in highway.h - see comment there. + +// For half-width vectors. Already includes base.h and shared-inl.h. 
+#include "hwy/ops/wasm_128-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +template <typename T> +class Vec256 { + public: + using PrivateT = T; // only for DFromV + static constexpr size_t kPrivateN = 32 / sizeof(T); // only for DFromV + + // Compound assignment. Only usable if there is a corresponding non-member + // binary operator overload. For example, only f32 and f64 support division. + HWY_INLINE Vec256& operator*=(const Vec256 other) { + return *this = (*this * other); + } + HWY_INLINE Vec256& operator/=(const Vec256 other) { + return *this = (*this / other); + } + HWY_INLINE Vec256& operator+=(const Vec256 other) { + return *this = (*this + other); + } + HWY_INLINE Vec256& operator-=(const Vec256 other) { + return *this = (*this - other); + } + HWY_INLINE Vec256& operator&=(const Vec256 other) { + return *this = (*this & other); + } + HWY_INLINE Vec256& operator|=(const Vec256 other) { + return *this = (*this | other); + } + HWY_INLINE Vec256& operator^=(const Vec256 other) { + return *this = (*this ^ other); + } + + Vec128<T> v0; + Vec128<T> v1; +}; + +template <typename T> +struct Mask256 { + Mask128<T> m0; + Mask128<T> m1; +}; + +// ------------------------------ Zero + +// Avoid VFromD here because it is defined in terms of Zero. 
+template <class D, HWY_IF_V_SIZE_D(D, 32)> +HWY_API Vec256<TFromD<D>> Zero(D d) { + const Half<decltype(d)> dh; + Vec256<TFromD<D>> ret; + ret.v0 = ret.v1 = Zero(dh); + return ret; +} + +// ------------------------------ BitCast +template <class D, typename TFrom> +HWY_API VFromD<D> BitCast(D d, Vec256<TFrom> v) { + const Half<decltype(d)> dh; + VFromD<D> ret; + ret.v0 = BitCast(dh, v.v0); + ret.v1 = BitCast(dh, v.v1); + return ret; +} + +// ------------------------------ ResizeBitCast + +// 32-byte vector to 32-byte vector: Same as BitCast +template <class D, typename FromV, HWY_IF_V_SIZE_V(FromV, 32), + HWY_IF_V_SIZE_D(D, 32)> +HWY_API VFromD<D> ResizeBitCast(D d, FromV v) { + return BitCast(d, v); +} + +// <= 16-byte vector to 32-byte vector +template <class D, typename FromV, HWY_IF_V_SIZE_LE_V(FromV, 16), + HWY_IF_V_SIZE_D(D, 32)> +HWY_API VFromD<D> ResizeBitCast(D d, FromV v) { + const Half<decltype(d)> dh; + VFromD<D> ret; + ret.v0 = ResizeBitCast(dh, v); + ret.v1 = Zero(dh); + return ret; +} + +// 32-byte vector to <= 16-byte vector +template <class D, typename FromV, HWY_IF_V_SIZE_V(FromV, 32), + HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_API VFromD<D> ResizeBitCast(D d, FromV v) { + return ResizeBitCast(d, v.v0); +} + +// ------------------------------ Set +template <class D, HWY_IF_V_SIZE_D(D, 32), typename T2> +HWY_API VFromD<D> Set(D d, const T2 t) { + const Half<decltype(d)> dh; + VFromD<D> ret; + ret.v0 = ret.v1 = Set(dh, static_cast<TFromD<D>>(t)); + return ret; +} + +// Undefined, Iota defined in wasm_128. 
+ +// ================================================== ARITHMETIC + +template <typename T> +HWY_API Vec256<T> operator+(Vec256<T> a, const Vec256<T> b) { + a.v0 += b.v0; + a.v1 += b.v1; + return a; +} + +template <typename T> +HWY_API Vec256<T> operator-(Vec256<T> a, const Vec256<T> b) { + a.v0 -= b.v0; + a.v1 -= b.v1; + return a; +} + +// ------------------------------ SumsOf8 +HWY_API Vec256<uint64_t> SumsOf8(const Vec256<uint8_t> v) { + Vec256<uint64_t> ret; + ret.v0 = SumsOf8(v.v0); + ret.v1 = SumsOf8(v.v1); + return ret; +} + +template <typename T> +HWY_API Vec256<T> SaturatedAdd(Vec256<T> a, const Vec256<T> b) { + a.v0 = SaturatedAdd(a.v0, b.v0); + a.v1 = SaturatedAdd(a.v1, b.v1); + return a; +} + +template <typename T> +HWY_API Vec256<T> SaturatedSub(Vec256<T> a, const Vec256<T> b) { + a.v0 = SaturatedSub(a.v0, b.v0); + a.v1 = SaturatedSub(a.v1, b.v1); + return a; +} + +template <typename T> +HWY_API Vec256<T> AverageRound(Vec256<T> a, const Vec256<T> b) { + a.v0 = AverageRound(a.v0, b.v0); + a.v1 = AverageRound(a.v1, b.v1); + return a; +} + +template <typename T> +HWY_API Vec256<T> Abs(Vec256<T> v) { + v.v0 = Abs(v.v0); + v.v1 = Abs(v.v1); + return v; +} + +// ------------------------------ Shift lanes by constant #bits + +template <int kBits, typename T> +HWY_API Vec256<T> ShiftLeft(Vec256<T> v) { + v.v0 = ShiftLeft<kBits>(v.v0); + v.v1 = ShiftLeft<kBits>(v.v1); + return v; +} + +template <int kBits, typename T> +HWY_API Vec256<T> ShiftRight(Vec256<T> v) { + v.v0 = ShiftRight<kBits>(v.v0); + v.v1 = ShiftRight<kBits>(v.v1); + return v; +} + +// ------------------------------ RotateRight (ShiftRight, Or) +template <int kBits, typename T> +HWY_API Vec256<T> RotateRight(const Vec256<T> v) { + constexpr size_t kSizeInBits = sizeof(T) * 8; + static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count"); + if (kBits == 0) return v; + return Or(ShiftRight<kBits>(v), ShiftLeft<kSizeInBits - kBits>(v)); +} + +// ------------------------------ Shift lanes 
by same variable #bits + +template <typename T> +HWY_API Vec256<T> ShiftLeftSame(Vec256<T> v, const int bits) { + v.v0 = ShiftLeftSame(v.v0, bits); + v.v1 = ShiftLeftSame(v.v1, bits); + return v; +} + +template <typename T> +HWY_API Vec256<T> ShiftRightSame(Vec256<T> v, const int bits) { + v.v0 = ShiftRightSame(v.v0, bits); + v.v1 = ShiftRightSame(v.v1, bits); + return v; +} + +// ------------------------------ Min, Max +template <typename T> +HWY_API Vec256<T> Min(Vec256<T> a, const Vec256<T> b) { + a.v0 = Min(a.v0, b.v0); + a.v1 = Min(a.v1, b.v1); + return a; +} + +template <typename T> +HWY_API Vec256<T> Max(Vec256<T> a, const Vec256<T> b) { + a.v0 = Max(a.v0, b.v0); + a.v1 = Max(a.v1, b.v1); + return a; +} +// ------------------------------ Integer multiplication + +template <typename T> +HWY_API Vec256<T> operator*(Vec256<T> a, const Vec256<T> b) { + a.v0 *= b.v0; + a.v1 *= b.v1; + return a; +} + +template <typename T> +HWY_API Vec256<T> MulHigh(Vec256<T> a, const Vec256<T> b) { + a.v0 = MulHigh(a.v0, b.v0); + a.v1 = MulHigh(a.v1, b.v1); + return a; +} + +template <typename T> +HWY_API Vec256<T> MulFixedPoint15(Vec256<T> a, const Vec256<T> b) { + a.v0 = MulFixedPoint15(a.v0, b.v0); + a.v1 = MulFixedPoint15(a.v1, b.v1); + return a; +} + +// Cannot use MakeWide because that returns uint128_t for uint64_t, but we want +// uint64_t. 
+HWY_API Vec256<uint64_t> MulEven(Vec256<uint32_t> a, const Vec256<uint32_t> b) { + Vec256<uint64_t> ret; + ret.v0 = MulEven(a.v0, b.v0); + ret.v1 = MulEven(a.v1, b.v1); + return ret; +} +HWY_API Vec256<int64_t> MulEven(Vec256<int32_t> a, const Vec256<int32_t> b) { + Vec256<int64_t> ret; + ret.v0 = MulEven(a.v0, b.v0); + ret.v1 = MulEven(a.v1, b.v1); + return ret; +} + +HWY_API Vec256<uint64_t> MulEven(Vec256<uint64_t> a, const Vec256<uint64_t> b) { + Vec256<uint64_t> ret; + ret.v0 = MulEven(a.v0, b.v0); + ret.v1 = MulEven(a.v1, b.v1); + return ret; +} +HWY_API Vec256<uint64_t> MulOdd(Vec256<uint64_t> a, const Vec256<uint64_t> b) { + Vec256<uint64_t> ret; + ret.v0 = MulOdd(a.v0, b.v0); + ret.v1 = MulOdd(a.v1, b.v1); + return ret; +} + +// ------------------------------ Negate +template <typename T> +HWY_API Vec256<T> Neg(Vec256<T> v) { + v.v0 = Neg(v.v0); + v.v1 = Neg(v.v1); + return v; +} + +// ------------------------------ Floating-point division +template <typename T> +HWY_API Vec256<T> operator/(Vec256<T> a, const Vec256<T> b) { + a.v0 /= b.v0; + a.v1 /= b.v1; + return a; +} + +// Approximate reciprocal +HWY_API Vec256<float> ApproximateReciprocal(const Vec256<float> v) { + const Vec256<float> one = Set(Full256<float>(), 1.0f); + return one / v; +} + +// Absolute value of difference. 
+HWY_API Vec256<float> AbsDiff(const Vec256<float> a, const Vec256<float> b) { + return Abs(a - b); +} + +// ------------------------------ Floating-point multiply-add variants + +HWY_API Vec256<float> MulAdd(Vec256<float> mul, Vec256<float> x, + Vec256<float> add) { + mul.v0 = MulAdd(mul.v0, x.v0, add.v0); + mul.v1 = MulAdd(mul.v1, x.v1, add.v1); + return mul; +} + +HWY_API Vec256<float> NegMulAdd(Vec256<float> mul, Vec256<float> x, + Vec256<float> add) { + mul.v0 = NegMulAdd(mul.v0, x.v0, add.v0); + mul.v1 = NegMulAdd(mul.v1, x.v1, add.v1); + return mul; +} + +HWY_API Vec256<float> MulSub(Vec256<float> mul, Vec256<float> x, + Vec256<float> sub) { + mul.v0 = MulSub(mul.v0, x.v0, sub.v0); + mul.v1 = MulSub(mul.v1, x.v1, sub.v1); + return mul; +} + +HWY_API Vec256<float> NegMulSub(Vec256<float> mul, Vec256<float> x, + Vec256<float> sub) { + mul.v0 = NegMulSub(mul.v0, x.v0, sub.v0); + mul.v1 = NegMulSub(mul.v1, x.v1, sub.v1); + return mul; +} + +// ------------------------------ Floating-point square root + +template <typename T> +HWY_API Vec256<T> Sqrt(Vec256<T> v) { + v.v0 = Sqrt(v.v0); + v.v1 = Sqrt(v.v1); + return v; +} + +// Approximate reciprocal square root +HWY_API Vec256<float> ApproximateReciprocalSqrt(const Vec256<float> v) { + // TODO(eustas): find cheaper a way to calculate this. 
+ const Vec256<float> one = Set(Full256<float>(), 1.0f); + return one / Sqrt(v); +} + +// ------------------------------ Floating-point rounding + +// Toward nearest integer, ties to even +HWY_API Vec256<float> Round(Vec256<float> v) { + v.v0 = Round(v.v0); + v.v1 = Round(v.v1); + return v; +} + +// Toward zero, aka truncate +HWY_API Vec256<float> Trunc(Vec256<float> v) { + v.v0 = Trunc(v.v0); + v.v1 = Trunc(v.v1); + return v; +} + +// Toward +infinity, aka ceiling +HWY_API Vec256<float> Ceil(Vec256<float> v) { + v.v0 = Ceil(v.v0); + v.v1 = Ceil(v.v1); + return v; +} + +// Toward -infinity, aka floor +HWY_API Vec256<float> Floor(Vec256<float> v) { + v.v0 = Floor(v.v0); + v.v1 = Floor(v.v1); + return v; +} + +// ------------------------------ Floating-point classification + +template <typename T> +HWY_API Mask256<T> IsNaN(const Vec256<T> v) { + return v != v; +} + +template <typename T, HWY_IF_FLOAT(T)> +HWY_API Mask256<T> IsInf(const Vec256<T> v) { + const DFromV<decltype(v)> d; + const RebindToSigned<decltype(d)> di; + const VFromD<decltype(di)> vi = BitCast(di, v); + // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0. + return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2<T>()))); +} + +// Returns whether normal/subnormal/zero. +template <typename T, HWY_IF_FLOAT(T)> +HWY_API Mask256<T> IsFinite(const Vec256<T> v) { + const DFromV<decltype(v)> d; + const RebindToUnsigned<decltype(d)> du; + const RebindToSigned<decltype(d)> di; // cheaper than unsigned comparison + const VFromD<decltype(du)> vu = BitCast(du, v); + // 'Shift left' to clear the sign bit, then right so we can compare with the + // max exponent (cannot compare with MaxExponentTimes2 directly because it is + // negative and non-negative floats would be greater). 
+ const VFromD<decltype(di)> exp = + BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu))); + return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>()))); +} + +// ================================================== COMPARE + +// Comparisons fill a lane with 1-bits if the condition is true, else 0. + +template <class DTo, typename TFrom, typename TTo = TFromD<DTo>> +HWY_API MFromD<DTo> RebindMask(DTo /*tag*/, Mask256<TFrom> m) { + static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size"); + return MFromD<DTo>{Mask128<TTo>{m.m0.raw}, Mask128<TTo>{m.m1.raw}}; +} + +template <typename T> +HWY_API Mask256<T> TestBit(Vec256<T> v, Vec256<T> bit) { + static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported"); + return (v & bit) == bit; +} + +template <typename T> +HWY_API Mask256<T> operator==(Vec256<T> a, const Vec256<T> b) { + Mask256<T> m; + m.m0 = operator==(a.v0, b.v0); + m.m1 = operator==(a.v1, b.v1); + return m; +} + +template <typename T> +HWY_API Mask256<T> operator!=(Vec256<T> a, const Vec256<T> b) { + Mask256<T> m; + m.m0 = operator!=(a.v0, b.v0); + m.m1 = operator!=(a.v1, b.v1); + return m; +} + +template <typename T> +HWY_API Mask256<T> operator<(Vec256<T> a, const Vec256<T> b) { + Mask256<T> m; + m.m0 = operator<(a.v0, b.v0); + m.m1 = operator<(a.v1, b.v1); + return m; +} + +template <typename T> +HWY_API Mask256<T> operator>(Vec256<T> a, const Vec256<T> b) { + Mask256<T> m; + m.m0 = operator>(a.v0, b.v0); + m.m1 = operator>(a.v1, b.v1); + return m; +} + +template <typename T> +HWY_API Mask256<T> operator<=(Vec256<T> a, const Vec256<T> b) { + Mask256<T> m; + m.m0 = operator<=(a.v0, b.v0); + m.m1 = operator<=(a.v1, b.v1); + return m; +} + +template <typename T> +HWY_API Mask256<T> operator>=(Vec256<T> a, const Vec256<T> b) { + Mask256<T> m; + m.m0 = operator>=(a.v0, b.v0); + m.m1 = operator>=(a.v1, b.v1); + return m; +} + +// ------------------------------ FirstN (Iota, Lt) + +template <class D, HWY_IF_V_SIZE_D(D, 32)> 
+HWY_API MFromD<D> FirstN(const D d, size_t num) { + const RebindToSigned<decltype(d)> di; // Signed comparisons may be cheaper. + using TI = TFromD<decltype(di)>; + return RebindMask(d, Iota(di, 0) < Set(di, static_cast<TI>(num))); +} + +// ================================================== LOGICAL + +template <typename T> +HWY_API Vec256<T> Not(Vec256<T> v) { + v.v0 = Not(v.v0); + v.v1 = Not(v.v1); + return v; +} + +template <typename T> +HWY_API Vec256<T> And(Vec256<T> a, Vec256<T> b) { + a.v0 = And(a.v0, b.v0); + a.v1 = And(a.v1, b.v1); + return a; +} + +template <typename T> +HWY_API Vec256<T> AndNot(Vec256<T> not_mask, Vec256<T> mask) { + not_mask.v0 = AndNot(not_mask.v0, mask.v0); + not_mask.v1 = AndNot(not_mask.v1, mask.v1); + return not_mask; +} + +template <typename T> +HWY_API Vec256<T> Or(Vec256<T> a, Vec256<T> b) { + a.v0 = Or(a.v0, b.v0); + a.v1 = Or(a.v1, b.v1); + return a; +} + +template <typename T> +HWY_API Vec256<T> Xor(Vec256<T> a, Vec256<T> b) { + a.v0 = Xor(a.v0, b.v0); + a.v1 = Xor(a.v1, b.v1); + return a; +} + +template <typename T> +HWY_API Vec256<T> Xor3(Vec256<T> x1, Vec256<T> x2, Vec256<T> x3) { + return Xor(x1, Xor(x2, x3)); +} + +template <typename T> +HWY_API Vec256<T> Or3(Vec256<T> o1, Vec256<T> o2, Vec256<T> o3) { + return Or(o1, Or(o2, o3)); +} + +template <typename T> +HWY_API Vec256<T> OrAnd(Vec256<T> o, Vec256<T> a1, Vec256<T> a2) { + return Or(o, And(a1, a2)); +} + +template <typename T> +HWY_API Vec256<T> IfVecThenElse(Vec256<T> mask, Vec256<T> yes, Vec256<T> no) { + return IfThenElse(MaskFromVec(mask), yes, no); +} + +// ------------------------------ Operator overloads (internal-only if float) + +template <typename T> +HWY_API Vec256<T> operator&(const Vec256<T> a, const Vec256<T> b) { + return And(a, b); +} + +template <typename T> +HWY_API Vec256<T> operator|(const Vec256<T> a, const Vec256<T> b) { + return Or(a, b); +} + +template <typename T> +HWY_API Vec256<T> operator^(const Vec256<T> a, const Vec256<T> b) { + return 
Xor(a, b); +} + +// ------------------------------ CopySign + +template <typename T> +HWY_API Vec256<T> CopySign(const Vec256<T> magn, const Vec256<T> sign) { + static_assert(IsFloat<T>(), "Only makes sense for floating-point"); + const auto msb = SignBit(DFromV<decltype(magn)>()); + return Or(AndNot(msb, magn), And(msb, sign)); +} + +template <typename T> +HWY_API Vec256<T> CopySignToAbs(const Vec256<T> abs, const Vec256<T> sign) { + static_assert(IsFloat<T>(), "Only makes sense for floating-point"); + return Or(abs, And(SignBit(DFromV<decltype(sign)>()), sign)); +} + +// ------------------------------ Mask + +// Mask and Vec are the same (true = FF..FF). +template <typename T> +HWY_API Mask256<T> MaskFromVec(const Vec256<T> v) { + Mask256<T> m; + m.m0 = MaskFromVec(v.v0); + m.m1 = MaskFromVec(v.v1); + return m; +} + +template <class D, typename T = TFromD<D>> +HWY_API Vec256<T> VecFromMask(D d, Mask256<T> m) { + const Half<decltype(d)> dh; + Vec256<T> v; + v.v0 = VecFromMask(dh, m.m0); + v.v1 = VecFromMask(dh, m.m1); + return v; +} + +// mask ? yes : no +template <typename T> +HWY_API Vec256<T> IfThenElse(Mask256<T> mask, Vec256<T> yes, Vec256<T> no) { + yes.v0 = IfThenElse(mask.m0, yes.v0, no.v0); + yes.v1 = IfThenElse(mask.m1, yes.v1, no.v1); + return yes; +} + +// mask ? yes : 0 +template <typename T> +HWY_API Vec256<T> IfThenElseZero(Mask256<T> mask, Vec256<T> yes) { + return yes & VecFromMask(DFromV<decltype(yes)>(), mask); +} + +// mask ? 
0 : no +template <typename T> +HWY_API Vec256<T> IfThenZeroElse(Mask256<T> mask, Vec256<T> no) { + return AndNot(VecFromMask(DFromV<decltype(no)>(), mask), no); +} + +template <typename T> +HWY_API Vec256<T> IfNegativeThenElse(Vec256<T> v, Vec256<T> yes, Vec256<T> no) { + v.v0 = IfNegativeThenElse(v.v0, yes.v0, no.v0); + v.v1 = IfNegativeThenElse(v.v1, yes.v1, no.v1); + return v; +} + +template <typename T, HWY_IF_FLOAT(T)> +HWY_API Vec256<T> ZeroIfNegative(Vec256<T> v) { + return IfThenZeroElse(v < Zero(DFromV<decltype(v)>()), v); +} + +// ------------------------------ Mask logical + +template <typename T> +HWY_API Mask256<T> Not(const Mask256<T> m) { + return MaskFromVec(Not(VecFromMask(Full256<T>(), m))); +} + +template <typename T> +HWY_API Mask256<T> And(const Mask256<T> a, Mask256<T> b) { + const Full256<T> d; + return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b))); +} + +template <typename T> +HWY_API Mask256<T> AndNot(const Mask256<T> a, Mask256<T> b) { + const Full256<T> d; + return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b))); +} + +template <typename T> +HWY_API Mask256<T> Or(const Mask256<T> a, Mask256<T> b) { + const Full256<T> d; + return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b))); +} + +template <typename T> +HWY_API Mask256<T> Xor(const Mask256<T> a, Mask256<T> b) { + const Full256<T> d; + return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b))); +} + +template <typename T> +HWY_API Mask256<T> ExclusiveNeither(const Mask256<T> a, Mask256<T> b) { + const Full256<T> d; + return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b)))); +} + +// ------------------------------ Shl (BroadcastSignBit, IfThenElse) +template <typename T> +HWY_API Vec256<T> operator<<(Vec256<T> v, const Vec256<T> bits) { + v.v0 = operator<<(v.v0, bits.v0); + v.v1 = operator<<(v.v1, bits.v1); + return v; +} + +// ------------------------------ Shr (BroadcastSignBit, IfThenElse) +template <typename T> +HWY_API Vec256<T> 
operator>>(Vec256<T> v, const Vec256<T> bits) { + v.v0 = operator>>(v.v0, bits.v0); + v.v1 = operator>>(v.v1, bits.v1); + return v; +} + +// ------------------------------ BroadcastSignBit (compare, VecFromMask) + +template <typename T, HWY_IF_NOT_T_SIZE(T, 1)> +HWY_API Vec256<T> BroadcastSignBit(const Vec256<T> v) { + return ShiftRight<sizeof(T) * 8 - 1>(v); +} +HWY_API Vec256<int8_t> BroadcastSignBit(const Vec256<int8_t> v) { + const DFromV<decltype(v)> d; + return VecFromMask(d, v < Zero(d)); +} + +// ================================================== MEMORY + +// ------------------------------ Load + +template <class D, HWY_IF_V_SIZE_D(D, 32)> +HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT aligned) { + const Half<decltype(d)> dh; + VFromD<D> ret; + ret.v0 = Load(dh, aligned); + ret.v1 = Load(dh, aligned + Lanes(dh)); + return ret; +} + +template <class D, typename T = TFromD<D>> +HWY_API Vec256<T> MaskedLoad(Mask256<T> m, D d, const T* HWY_RESTRICT aligned) { + return IfThenElseZero(m, Load(d, aligned)); +} + +template <class D, typename T = TFromD<D>> +HWY_API Vec256<T> MaskedLoadOr(Vec256<T> v, Mask256<T> m, D d, + const T* HWY_RESTRICT aligned) { + return IfThenElse(m, Load(d, aligned), v); +} + +// LoadU == Load. +template <class D, HWY_IF_V_SIZE_D(D, 32)> +HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) { + return Load(d, p); +} + +template <class D, HWY_IF_V_SIZE_D(D, 32)> +HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* HWY_RESTRICT p) { + const Half<decltype(d)> dh; + VFromD<D> ret; + ret.v0 = ret.v1 = Load(dh, p); + return ret; +} + +// ------------------------------ Store + +template <class D, typename T = TFromD<D>> +HWY_API void Store(Vec256<T> v, D d, T* HWY_RESTRICT aligned) { + const Half<decltype(d)> dh; + Store(v.v0, dh, aligned); + Store(v.v1, dh, aligned + Lanes(dh)); +} + +// StoreU == Store. 
+template <class D, typename T = TFromD<D>> +HWY_API void StoreU(Vec256<T> v, D d, T* HWY_RESTRICT p) { + Store(v, d, p); +} + +template <class D, typename T = TFromD<D>> +HWY_API void BlendedStore(Vec256<T> v, Mask256<T> m, D d, T* HWY_RESTRICT p) { + StoreU(IfThenElse(m, v, LoadU(d, p)), d, p); +} + +// ------------------------------ Stream +template <class D, typename T = TFromD<D>> +HWY_API void Stream(Vec256<T> v, D d, T* HWY_RESTRICT aligned) { + // Same as aligned stores. + Store(v, d, aligned); +} + +// ------------------------------ Scatter, Gather defined in wasm_128 + +// ================================================== SWIZZLE + +// ------------------------------ ExtractLane +template <typename T> +HWY_API T ExtractLane(const Vec256<T> v, size_t i) { + alignas(32) T lanes[32 / sizeof(T)]; + Store(v, DFromV<decltype(v)>(), lanes); + return lanes[i]; +} + +// ------------------------------ InsertLane +template <typename T> +HWY_API Vec256<T> InsertLane(const Vec256<T> v, size_t i, T t) { + DFromV<decltype(v)> d; + alignas(32) T lanes[32 / sizeof(T)]; + Store(v, d, lanes); + lanes[i] = t; + return Load(d, lanes); +} + +// ------------------------------ LowerHalf + +template <class D, typename T = TFromD<D>> +HWY_API Vec128<T> LowerHalf(D /* tag */, Vec256<T> v) { + return v.v0; +} + +template <typename T> +HWY_API Vec128<T> LowerHalf(Vec256<T> v) { + return v.v0; +} + +// ------------------------------ GetLane (LowerHalf) +template <typename T> +HWY_API T GetLane(const Vec256<T> v) { + return GetLane(LowerHalf(v)); +} + +// ------------------------------ ShiftLeftBytes + +template <int kBytes, class D, typename T = TFromD<D>> +HWY_API Vec256<T> ShiftLeftBytes(D d, Vec256<T> v) { + const Half<decltype(d)> dh; + v.v0 = ShiftLeftBytes<kBytes>(dh, v.v0); + v.v1 = ShiftLeftBytes<kBytes>(dh, v.v1); + return v; +} + +template <int kBytes, typename T> +HWY_API Vec256<T> ShiftLeftBytes(Vec256<T> v) { + return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v); +} + 
+// ------------------------------ ShiftLeftLanes + +template <int kLanes, class D, typename T = TFromD<D>> +HWY_API Vec256<T> ShiftLeftLanes(D d, const Vec256<T> v) { + const Repartition<uint8_t, decltype(d)> d8; + return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v))); +} + +template <int kLanes, typename T> +HWY_API Vec256<T> ShiftLeftLanes(const Vec256<T> v) { + return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v); +} + +// ------------------------------ ShiftRightBytes +template <int kBytes, class D, typename T = TFromD<D>> +HWY_API Vec256<T> ShiftRightBytes(D d, Vec256<T> v) { + const Half<decltype(d)> dh; + v.v0 = ShiftRightBytes<kBytes>(dh, v.v0); + v.v1 = ShiftRightBytes<kBytes>(dh, v.v1); + return v; +} + +// ------------------------------ ShiftRightLanes +template <int kLanes, class D, typename T = TFromD<D>> +HWY_API Vec256<T> ShiftRightLanes(D d, const Vec256<T> v) { + const Repartition<uint8_t, decltype(d)> d8; + return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(d8, BitCast(d8, v))); +} + +// ------------------------------ UpperHalf (ShiftRightBytes) +template <class D, typename T = TFromD<D>> +HWY_API Vec128<T> UpperHalf(D /* tag */, const Vec256<T> v) { + return v.v1; +} + +// ------------------------------ CombineShiftRightBytes + +template <int kBytes, class D, typename T = TFromD<D>> +HWY_API Vec256<T> CombineShiftRightBytes(D d, Vec256<T> hi, Vec256<T> lo) { + const Half<decltype(d)> dh; + hi.v0 = CombineShiftRightBytes<kBytes>(dh, hi.v0, lo.v0); + hi.v1 = CombineShiftRightBytes<kBytes>(dh, hi.v1, lo.v1); + return hi; +} + +// ------------------------------ Broadcast/splat any lane + +template <int kLane, typename T> +HWY_API Vec256<T> Broadcast(const Vec256<T> v) { + Vec256<T> ret; + ret.v0 = Broadcast<kLane>(v.v0); + ret.v1 = Broadcast<kLane>(v.v1); + return ret; +} + +// ------------------------------ TableLookupBytes + +// Both full +template <typename T, typename TI> +HWY_API Vec256<TI> TableLookupBytes(const Vec256<T> 
bytes, Vec256<TI> from) { + from.v0 = TableLookupBytes(bytes.v0, from.v0); + from.v1 = TableLookupBytes(bytes.v1, from.v1); + return from; +} + +// Partial index vector +template <typename T, typename TI, size_t NI> +HWY_API Vec128<TI, NI> TableLookupBytes(Vec256<T> bytes, + const Vec128<TI, NI> from) { + // First expand to full 128, then 256. + const auto from_256 = ZeroExtendVector(Full256<TI>(), Vec128<TI>{from.raw}); + const auto tbl_full = TableLookupBytes(bytes, from_256); + // Shrink to 128, then partial. + return Vec128<TI, NI>{LowerHalf(Full128<TI>(), tbl_full).raw}; +} + +// Partial table vector +template <typename T, size_t N, typename TI> +HWY_API Vec256<TI> TableLookupBytes(Vec128<T, N> bytes, const Vec256<TI> from) { + // First expand to full 128, then 256. + const auto bytes_256 = ZeroExtendVector(Full256<T>(), Vec128<T>{bytes.raw}); + return TableLookupBytes(bytes_256, from); +} + +// Partial both are handled by wasm_128. + +template <class V, class VI> +HWY_API VI TableLookupBytesOr0(V bytes, VI from) { + // wasm out-of-bounds policy already zeros, so TableLookupBytes is fine. 
+ return TableLookupBytes(bytes, from); +} + +// ------------------------------ Hard-coded shuffles + +template <typename T> +HWY_API Vec256<T> Shuffle01(Vec256<T> v) { + v.v0 = Shuffle01(v.v0); + v.v1 = Shuffle01(v.v1); + return v; +} + +template <typename T> +HWY_API Vec256<T> Shuffle2301(Vec256<T> v) { + v.v0 = Shuffle2301(v.v0); + v.v1 = Shuffle2301(v.v1); + return v; +} + +template <typename T> +HWY_API Vec256<T> Shuffle1032(Vec256<T> v) { + v.v0 = Shuffle1032(v.v0); + v.v1 = Shuffle1032(v.v1); + return v; +} + +template <typename T> +HWY_API Vec256<T> Shuffle0321(Vec256<T> v) { + v.v0 = Shuffle0321(v.v0); + v.v1 = Shuffle0321(v.v1); + return v; +} + +template <typename T> +HWY_API Vec256<T> Shuffle2103(Vec256<T> v) { + v.v0 = Shuffle2103(v.v0); + v.v1 = Shuffle2103(v.v1); + return v; +} + +template <typename T> +HWY_API Vec256<T> Shuffle0123(Vec256<T> v) { + v.v0 = Shuffle0123(v.v0); + v.v1 = Shuffle0123(v.v1); + return v; +} + +// Used by generic_ops-inl.h +namespace detail { + +template <typename T, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec256<T> ShuffleTwo2301(Vec256<T> a, const Vec256<T> b) { + a.v0 = ShuffleTwo2301(a.v0, b.v0); + a.v1 = ShuffleTwo2301(a.v1, b.v1); + return a; +} +template <typename T, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec256<T> ShuffleTwo1230(Vec256<T> a, const Vec256<T> b) { + a.v0 = ShuffleTwo1230(a.v0, b.v0); + a.v1 = ShuffleTwo1230(a.v1, b.v1); + return a; +} +template <typename T, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec256<T> ShuffleTwo3012(Vec256<T> a, const Vec256<T> b) { + a.v0 = ShuffleTwo3012(a.v0, b.v0); + a.v1 = ShuffleTwo3012(a.v1, b.v1); + return a; +} + +} // namespace detail + +// ------------------------------ TableLookupLanes + +// Returned by SetTableIndices for use by TableLookupLanes. 
+template <typename T> +struct Indices256 { + __v128_u i0; + __v128_u i1; +}; + +template <class D, typename T = TFromD<D>, typename TI> +HWY_API Indices256<T> IndicesFromVec(D /* tag */, Vec256<TI> vec) { + static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); + Indices256<T> ret; + ret.i0 = vec.v0.raw; + ret.i1 = vec.v1.raw; + return ret; +} + +template <class D, HWY_IF_V_SIZE_D(D, 32), typename TI> +HWY_API Indices256<TFromD<D>> SetTableIndices(D d, const TI* idx) { + const Rebind<TI, decltype(d)> di; + return IndicesFromVec(d, LoadU(di, idx)); +} + +template <typename T> +HWY_API Vec256<T> TableLookupLanes(const Vec256<T> v, Indices256<T> idx) { + const DFromV<decltype(v)> d; + const Half<decltype(d)> dh; + const auto idx_i0 = IndicesFromVec(dh, Vec128<T>{idx.i0}); + const auto idx_i1 = IndicesFromVec(dh, Vec128<T>{idx.i1}); + + Vec256<T> result; + result.v0 = TwoTablesLookupLanes(v.v0, v.v1, idx_i0); + result.v1 = TwoTablesLookupLanes(v.v0, v.v1, idx_i1); + return result; +} + +template <typename T> +HWY_API Vec256<T> TableLookupLanesOr0(Vec256<T> v, Indices256<T> idx) { + // The out of bounds behavior will already zero lanes. 
+ return TableLookupLanesOr0(v, idx); +} + +template <typename T> +HWY_API Vec256<T> TwoTablesLookupLanes(const Vec256<T> a, const Vec256<T> b, + Indices256<T> idx) { + const DFromV<decltype(a)> d; + const Half<decltype(d)> dh; + const RebindToUnsigned<decltype(d)> du; + using TU = MakeUnsigned<T>; + constexpr size_t kLanesPerVect = 32 / sizeof(TU); + + Vec256<TU> vi; + vi.v0 = Vec128<TU>{idx.i0}; + vi.v1 = Vec128<TU>{idx.i1}; + const auto vmod = vi & Set(du, TU{kLanesPerVect - 1}); + const auto is_lo = RebindMask(d, vi == vmod); + + const auto idx_i0 = IndicesFromVec(dh, vmod.v0); + const auto idx_i1 = IndicesFromVec(dh, vmod.v1); + + Vec256<T> result_lo; + Vec256<T> result_hi; + result_lo.v0 = TwoTablesLookupLanes(a.v0, a.v1, idx_i0); + result_lo.v1 = TwoTablesLookupLanes(a.v0, a.v1, idx_i1); + result_hi.v0 = TwoTablesLookupLanes(b.v0, b.v1, idx_i0); + result_hi.v1 = TwoTablesLookupLanes(b.v0, b.v1, idx_i1); + return IfThenElse(is_lo, result_lo, result_hi); +} + +// ------------------------------ Reverse +template <class D, typename T = TFromD<D>> +HWY_API Vec256<T> Reverse(D d, const Vec256<T> v) { + const Half<decltype(d)> dh; + Vec256<T> ret; + ret.v1 = Reverse(dh, v.v0); // note reversed v1 member order + ret.v0 = Reverse(dh, v.v1); + return ret; +} + +// ------------------------------ Reverse2 +template <class D, typename T = TFromD<D>> +HWY_API Vec256<T> Reverse2(D d, Vec256<T> v) { + const Half<decltype(d)> dh; + v.v0 = Reverse2(dh, v.v0); + v.v1 = Reverse2(dh, v.v1); + return v; +} + +// ------------------------------ Reverse4 + +// Each block has only 2 lanes, so swap blocks and their lanes. 
+template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)> +HWY_API Vec256<T> Reverse4(D d, const Vec256<T> v) { + const Half<decltype(d)> dh; + Vec256<T> ret; + ret.v0 = Reverse2(dh, v.v1); // swapped + ret.v1 = Reverse2(dh, v.v0); + return ret; +} + +template <class D, typename T = TFromD<D>, HWY_IF_NOT_T_SIZE(T, 8)> +HWY_API Vec256<T> Reverse4(D d, Vec256<T> v) { + const Half<decltype(d)> dh; + v.v0 = Reverse4(dh, v.v0); + v.v1 = Reverse4(dh, v.v1); + return v; +} + +// ------------------------------ Reverse8 + +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)> +HWY_API Vec256<T> Reverse8(D /* tag */, Vec256<T> /* v */) { + HWY_ASSERT(0); // don't have 8 u64 lanes +} + +// Each block has only 4 lanes, so swap blocks and their lanes. +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec256<T> Reverse8(D d, const Vec256<T> v) { + const Half<decltype(d)> dh; + Vec256<T> ret; + ret.v0 = Reverse4(dh, v.v1); // swapped + ret.v1 = Reverse4(dh, v.v0); + return ret; +} + +template <class D, typename T = TFromD<D>, + HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))> +HWY_API Vec256<T> Reverse8(D d, Vec256<T> v) { + const Half<decltype(d)> dh; + v.v0 = Reverse8(dh, v.v0); + v.v1 = Reverse8(dh, v.v1); + return v; +} + +// ------------------------------ InterleaveLower + +template <typename T> +HWY_API Vec256<T> InterleaveLower(Vec256<T> a, Vec256<T> b) { + a.v0 = InterleaveLower(a.v0, b.v0); + a.v1 = InterleaveLower(a.v1, b.v1); + return a; +} + +// wasm_128 already defines a template with D, V, V args. 
+ +// ------------------------------ InterleaveUpper (UpperHalf) + +template <class D, typename T = TFromD<D>> +HWY_API Vec256<T> InterleaveUpper(D d, Vec256<T> a, Vec256<T> b) { + const Half<decltype(d)> dh; + a.v0 = InterleaveUpper(dh, a.v0, b.v0); + a.v1 = InterleaveUpper(dh, a.v1, b.v1); + return a; +} + +// ------------------------------ ZipLower/ZipUpper defined in wasm_128 + +// ================================================== COMBINE + +// ------------------------------ Combine (InterleaveLower) +template <class D, typename T = TFromD<D>> +HWY_API Vec256<T> Combine(D /* d */, Vec128<T> hi, Vec128<T> lo) { + Vec256<T> ret; + ret.v1 = hi; + ret.v0 = lo; + return ret; +} + +// ------------------------------ ZeroExtendVector (Combine) +template <class D, typename T = TFromD<D>> +HWY_API Vec256<T> ZeroExtendVector(D d, Vec128<T> lo) { + const Half<decltype(d)> dh; + return Combine(d, Zero(dh), lo); +} + +// ------------------------------ ZeroExtendResizeBitCast + +namespace detail { + +template <size_t kFromVectSize, class DTo, class DFrom, + HWY_IF_LANES_LE(kFromVectSize, 8)> +HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast( + hwy::SizeTag<kFromVectSize> /* from_size_tag */, + hwy::SizeTag<32> /* to_size_tag */, DTo d_to, DFrom d_from, + VFromD<DFrom> v) { + const Half<decltype(d_to)> dh_to; + return ZeroExtendVector(d_to, ZeroExtendResizeBitCast(dh_to, d_from, v)); +} + +} // namespace detail + +// ------------------------------ ConcatLowerLower +template <class D, typename T = TFromD<D>> +HWY_API Vec256<T> ConcatLowerLower(D /* tag */, Vec256<T> hi, Vec256<T> lo) { + Vec256<T> ret; + ret.v1 = hi.v0; + ret.v0 = lo.v0; + return ret; +} + +// ------------------------------ ConcatUpperUpper +template <class D, typename T = TFromD<D>> +HWY_API Vec256<T> ConcatUpperUpper(D /* tag */, Vec256<T> hi, Vec256<T> lo) { + Vec256<T> ret; + ret.v1 = hi.v1; + ret.v0 = lo.v1; + return ret; +} + +// ------------------------------ ConcatLowerUpper +template <class D, typename 
T = TFromD<D>> +HWY_API Vec256<T> ConcatLowerUpper(D /* tag */, Vec256<T> hi, Vec256<T> lo) { + Vec256<T> ret; + ret.v1 = hi.v0; + ret.v0 = lo.v1; + return ret; +} + +// ------------------------------ ConcatUpperLower +template <class D, typename T = TFromD<D>> +HWY_API Vec256<T> ConcatUpperLower(D /* tag */, Vec256<T> hi, Vec256<T> lo) { + Vec256<T> ret; + ret.v1 = hi.v1; + ret.v0 = lo.v0; + return ret; +} + +// ------------------------------ ConcatOdd +template <class D, typename T = TFromD<D>> +HWY_API Vec256<T> ConcatOdd(D d, Vec256<T> hi, Vec256<T> lo) { + const Half<decltype(d)> dh; + Vec256<T> ret; + ret.v0 = ConcatOdd(dh, lo.v1, lo.v0); + ret.v1 = ConcatOdd(dh, hi.v1, hi.v0); + return ret; +} + +// ------------------------------ ConcatEven +template <class D, typename T = TFromD<D>> +HWY_API Vec256<T> ConcatEven(D d, Vec256<T> hi, Vec256<T> lo) { + const Half<decltype(d)> dh; + Vec256<T> ret; + ret.v0 = ConcatEven(dh, lo.v1, lo.v0); + ret.v1 = ConcatEven(dh, hi.v1, hi.v0); + return ret; +} + +// ------------------------------ DupEven +template <typename T> +HWY_API Vec256<T> DupEven(Vec256<T> v) { + v.v0 = DupEven(v.v0); + v.v1 = DupEven(v.v1); + return v; +} + +// ------------------------------ DupOdd +template <typename T> +HWY_API Vec256<T> DupOdd(Vec256<T> v) { + v.v0 = DupOdd(v.v0); + v.v1 = DupOdd(v.v1); + return v; +} + +// ------------------------------ OddEven +template <typename T> +HWY_API Vec256<T> OddEven(Vec256<T> a, const Vec256<T> b) { + a.v0 = OddEven(a.v0, b.v0); + a.v1 = OddEven(a.v1, b.v1); + return a; +} + +// ------------------------------ OddEvenBlocks +template <typename T> +HWY_API Vec256<T> OddEvenBlocks(Vec256<T> odd, Vec256<T> even) { + odd.v0 = even.v0; + return odd; +} + +// ------------------------------ SwapAdjacentBlocks +template <typename T> +HWY_API Vec256<T> SwapAdjacentBlocks(Vec256<T> v) { + Vec256<T> ret; + ret.v0 = v.v1; // swapped order + ret.v1 = v.v0; + return ret; +} + +// ------------------------------ 
ReverseBlocks +template <class D, typename T = TFromD<D>> +HWY_API Vec256<T> ReverseBlocks(D /* tag */, const Vec256<T> v) { + return SwapAdjacentBlocks(v); // 2 blocks, so Swap = Reverse +} + +// ================================================== CONVERT + +// ------------------------------ Promotions (part w/ narrow lanes -> full) + +namespace detail { + +// Unsigned: zero-extend. +template <class D, HWY_IF_U16_D(D)> +HWY_API Vec128<uint16_t> PromoteUpperTo(D /* tag */, Vec128<uint8_t> v) { + return Vec128<uint16_t>{wasm_u16x8_extend_high_u8x16(v.raw)}; +} +template <class D, HWY_IF_U32_D(D)> +HWY_API Vec128<uint32_t> PromoteUpperTo(D /* tag */, Vec128<uint8_t> v) { + return Vec128<uint32_t>{ + wasm_u32x4_extend_high_u16x8(wasm_u16x8_extend_high_u8x16(v.raw))}; +} +template <class D, HWY_IF_I16_D(D)> +HWY_API Vec128<int16_t> PromoteUpperTo(D /* tag */, Vec128<uint8_t> v) { + return Vec128<int16_t>{wasm_u16x8_extend_high_u8x16(v.raw)}; +} +template <class D, HWY_IF_I32_D(D)> +HWY_API Vec128<int32_t> PromoteUpperTo(D /* tag */, Vec128<uint8_t> v) { + return Vec128<int32_t>{ + wasm_u32x4_extend_high_u16x8(wasm_u16x8_extend_high_u8x16(v.raw))}; +} +template <class D, HWY_IF_U32_D(D)> +HWY_API Vec128<uint32_t> PromoteUpperTo(D /* tag */, Vec128<uint16_t> v) { + return Vec128<uint32_t>{wasm_u32x4_extend_high_u16x8(v.raw)}; +} +template <class D, HWY_IF_U64_D(D)> +HWY_API Vec128<uint64_t> PromoteUpperTo(D /* tag */, Vec128<uint32_t> v) { + return Vec128<uint64_t>{wasm_u64x2_extend_high_u32x4(v.raw)}; +} +template <class D, HWY_IF_I32_D(D)> +HWY_API Vec128<int32_t> PromoteUpperTo(D /* tag */, Vec128<uint16_t> v) { + return Vec128<int32_t>{wasm_u32x4_extend_high_u16x8(v.raw)}; +} +template <class D, HWY_IF_I64_D(D)> +HWY_API Vec128<int64_t> PromoteUpperTo(D /* tag */, Vec128<uint32_t> v) { + return Vec128<int64_t>{wasm_u64x2_extend_high_u32x4(v.raw)}; +} + +// Signed: replicate sign bit. 
+template <class D, HWY_IF_I16_D(D)> +HWY_API Vec128<int16_t> PromoteUpperTo(D /* tag */, Vec128<int8_t> v) { + return Vec128<int16_t>{wasm_i16x8_extend_high_i8x16(v.raw)}; +} +template <class D, HWY_IF_I32_D(D)> +HWY_API Vec128<int32_t> PromoteUpperTo(D /* tag */, Vec128<int8_t> v) { + return Vec128<int32_t>{ + wasm_i32x4_extend_high_i16x8(wasm_i16x8_extend_high_i8x16(v.raw))}; +} +template <class D, HWY_IF_I32_D(D)> +HWY_API Vec128<int32_t> PromoteUpperTo(D /* tag */, Vec128<int16_t> v) { + return Vec128<int32_t>{wasm_i32x4_extend_high_i16x8(v.raw)}; +} +template <class D, HWY_IF_I64_D(D)> +HWY_API Vec128<int64_t> PromoteUpperTo(D /* tag */, Vec128<int32_t> v) { + return Vec128<int64_t>{wasm_i64x2_extend_high_i32x4(v.raw)}; +} + +template <class D, HWY_IF_F64_D(D)> +HWY_API Vec128<double> PromoteUpperTo(D dd, Vec128<int32_t> v) { + // There is no wasm_f64x2_convert_high_i32x4. + const Full64<int32_t> di32h; + return PromoteTo(dd, UpperHalf(di32h, v)); +} + +template <class D, HWY_IF_F32_D(D)> +HWY_API Vec128<float> PromoteUpperTo(D df32, Vec128<float16_t> v) { + const RebindToSigned<decltype(df32)> di32; + const RebindToUnsigned<decltype(df32)> du32; + // Expand to u32 so we can shift. 
+ const auto bits16 = PromoteUpperTo(du32, Vec128<uint16_t>{v.raw}); + const auto sign = ShiftRight<15>(bits16); + const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F); + const auto mantissa = bits16 & Set(du32, 0x3FF); + const auto subnormal = + BitCast(du32, ConvertTo(df32, BitCast(di32, mantissa)) * + Set(df32, 1.0f / 16384 / 1024)); + + const auto biased_exp32 = biased_exp + Set(du32, 127 - 15); + const auto mantissa32 = ShiftLeft<23 - 10>(mantissa); + const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32; + const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal); + return BitCast(df32, ShiftLeft<31>(sign) | bits32); +} + +template <class D, HWY_IF_F32_D(D)> +HWY_API Vec128<float> PromoteUpperTo(D df32, Vec128<bfloat16_t> v) { + const Full128<uint16_t> du16; + const RebindToSigned<decltype(df32)> di32; + return BitCast(df32, ShiftLeft<16>(PromoteUpperTo(di32, BitCast(du16, v)))); +} + +} // namespace detail + +template <class D, HWY_IF_V_SIZE_D(D, 32), typename TN, + HWY_IF_T_SIZE_D(D, sizeof(TN) * 2)> +HWY_API VFromD<D> PromoteTo(D d, Vec128<TN> v) { + const Half<decltype(d)> dh; + VFromD<D> ret; + ret.v0 = PromoteTo(dh, LowerHalf(v)); + ret.v1 = detail::PromoteUpperTo(dh, v); + return ret; +} + +// 4x promotion: 8-bit to 32-bit or 16-bit to 64-bit +template <class DW, HWY_IF_V_SIZE_D(DW, 32), + HWY_IF_T_SIZE_ONE_OF_D(DW, (1 << 4) | (1 << 8)), + HWY_IF_NOT_FLOAT_D(DW), typename TN, + HWY_IF_T_SIZE_D(DW, sizeof(TN) * 4), + HWY_IF_NOT_FLOAT_NOR_SPECIAL(TN)> +HWY_API Vec256<TFromD<DW>> PromoteTo(DW d, Vec64<TN> v) { + const Half<decltype(d)> dh; + // 16-bit lanes for UI8->UI32, 32-bit lanes for UI16->UI64 + const Rebind<MakeWide<TN>, decltype(d)> d2; + const auto v_2x = PromoteTo(d2, v); + Vec256<TFromD<DW>> ret; + ret.v0 = PromoteTo(dh, LowerHalf(v_2x)); + ret.v1 = detail::PromoteUpperTo(dh, v_2x); + return ret; +} + +// 8x promotion: 8-bit to 64-bit +template <class DW, HWY_IF_V_SIZE_D(DW, 32), HWY_IF_T_SIZE_D(DW, 8), + 
HWY_IF_NOT_FLOAT_D(DW), typename TN, HWY_IF_T_SIZE(TN, 1)> +HWY_API Vec256<TFromD<DW>> PromoteTo(DW d, Vec32<TN> v) { + const Half<decltype(d)> dh; + const Repartition<MakeWide<MakeWide<TN>>, decltype(dh)> d4; // 32-bit lanes + const auto v32 = PromoteTo(d4, v); + Vec256<TFromD<DW>> ret; + ret.v0 = PromoteTo(dh, LowerHalf(v32)); + ret.v1 = detail::PromoteUpperTo(dh, v32); + return ret; +} + +// ------------------------------ DemoteTo + +template <class D, HWY_IF_U16_D(D)> +HWY_API Vec128<uint16_t> DemoteTo(D /* tag */, Vec256<int32_t> v) { + return Vec128<uint16_t>{wasm_u16x8_narrow_i32x4(v.v0.raw, v.v1.raw)}; +} + +template <class D, HWY_IF_I16_D(D)> +HWY_API Vec128<int16_t> DemoteTo(D /* tag */, Vec256<int32_t> v) { + return Vec128<int16_t>{wasm_i16x8_narrow_i32x4(v.v0.raw, v.v1.raw)}; +} + +template <class D, HWY_IF_U8_D(D)> +HWY_API Vec64<uint8_t> DemoteTo(D /* tag */, Vec256<int32_t> v) { + const auto intermediate = wasm_i16x8_narrow_i32x4(v.v0.raw, v.v1.raw); + return Vec64<uint8_t>{wasm_u8x16_narrow_i16x8(intermediate, intermediate)}; +} + +template <class D, HWY_IF_U8_D(D)> +HWY_API Vec128<uint8_t> DemoteTo(D /* tag */, Vec256<int16_t> v) { + return Vec128<uint8_t>{wasm_u8x16_narrow_i16x8(v.v0.raw, v.v1.raw)}; +} + +template <class D, HWY_IF_I8_D(D)> +HWY_API Vec64<int8_t> DemoteTo(D /* tag */, Vec256<int32_t> v) { + const auto intermediate = wasm_i16x8_narrow_i32x4(v.v0.raw, v.v1.raw); + return Vec64<int8_t>{wasm_i8x16_narrow_i16x8(intermediate, intermediate)}; +} + +template <class D, HWY_IF_I8_D(D)> +HWY_API Vec128<int8_t> DemoteTo(D /* tag */, Vec256<int16_t> v) { + return Vec128<int8_t>{wasm_i8x16_narrow_i16x8(v.v0.raw, v.v1.raw)}; +} + +template <class D, HWY_IF_I32_D(D)> +HWY_API Vec128<int32_t> DemoteTo(D di, Vec256<double> v) { + const Vec64<int32_t> lo{wasm_i32x4_trunc_sat_f64x2_zero(v.v0.raw)}; + const Vec64<int32_t> hi{wasm_i32x4_trunc_sat_f64x2_zero(v.v1.raw)}; + return Combine(di, hi, lo); +} + +template <class D, HWY_IF_F16_D(D)> +HWY_API 
Vec128<float16_t> DemoteTo(D d16, Vec256<float> v) { + const Half<decltype(d16)> d16h; + const Vec64<float16_t> lo = DemoteTo(d16h, v.v0); + const Vec64<float16_t> hi = DemoteTo(d16h, v.v1); + return Combine(d16, hi, lo); +} + +template <class D, HWY_IF_BF16_D(D)> +HWY_API Vec128<bfloat16_t> DemoteTo(D dbf16, Vec256<float> v) { + const Half<decltype(dbf16)> dbf16h; + const Vec64<bfloat16_t> lo = DemoteTo(dbf16h, v.v0); + const Vec64<bfloat16_t> hi = DemoteTo(dbf16h, v.v1); + return Combine(dbf16, hi, lo); +} + +// For already range-limited input [0, 255]. +HWY_API Vec64<uint8_t> U8FromU32(Vec256<uint32_t> v) { + const Full64<uint8_t> du8; + const Full256<int32_t> di32; // no unsigned DemoteTo + return DemoteTo(du8, BitCast(di32, v)); +} + +// ------------------------------ Truncations + +template <class D, HWY_IF_U8_D(D)> +HWY_API Vec32<uint8_t> TruncateTo(D /* tag */, Vec256<uint64_t> v) { + return Vec32<uint8_t>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 8, 16, 24, 0, + 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, + 24)}; +} + +template <class D, HWY_IF_U16_D(D)> +HWY_API Vec64<uint16_t> TruncateTo(D /* tag */, Vec256<uint64_t> v) { + return Vec64<uint16_t>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 1, 8, 9, 16, + 17, 24, 25, 0, 1, 8, 9, 16, 17, 24, + 25)}; +} + +template <class D, HWY_IF_U32_D(D)> +HWY_API Vec128<uint32_t> TruncateTo(D /* tag */, Vec256<uint64_t> v) { + return Vec128<uint32_t>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 1, 2, 3, 8, + 9, 10, 11, 16, 17, 18, 19, 24, 25, + 26, 27)}; +} + +template <class D, HWY_IF_U8_D(D)> +HWY_API Vec64<uint8_t> TruncateTo(D /* tag */, Vec256<uint32_t> v) { + return Vec64<uint8_t>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 4, 8, 12, 16, + 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, + 28)}; +} + +template <class D, HWY_IF_U16_D(D)> +HWY_API Vec128<uint16_t> TruncateTo(D /* tag */, Vec256<uint32_t> v) { + return Vec128<uint16_t>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 1, 4, 5, 8, + 9, 12, 13, 16, 17, 20, 21, 24, 25, + 28, 29)}; +} + 
// Truncates u16 lanes to u8: a single shuffle keeps the even-indexed bytes
// (0, 2, ..., 30) of the two concatenated 128-bit halves.
template <class D, HWY_IF_U8_D(D)>
HWY_API Vec128<uint8_t> TruncateTo(D /* tag */, Vec256<uint16_t> v) {
  return Vec128<uint8_t>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 2, 4, 6, 8,
                                            10, 12, 14, 16, 18, 20, 22, 24, 26,
                                            28, 30)};
}

// ------------------------------ ReorderDemote2To

// f32 -> bf16: keeps the odd-indexed u16 lanes of `b` and `a` (via ConcatOdd)
// and reinterprets them as bf16.
// NOTE(review): presumably the odd u16 lanes are the upper halves of each f32
// on this little-endian target - confirm.
template <class DBF16, HWY_IF_BF16_D(DBF16)>
HWY_API Vec256<bfloat16_t> ReorderDemote2To(DBF16 dbf16, Vec256<float> a,
                                            Vec256<float> b) {
  const RebindToUnsigned<decltype(dbf16)> du16;
  return BitCast(dbf16, ConcatOdd(du16, BitCast(du16, b), BitCast(du16, a)));
}

// Signed input, integer output of half the lane size: `a` is demoted into the
// lower 128-bit half of the result and `b` into the upper half.
template <class DN, typename V, HWY_IF_V_SIZE_D(DN, 32),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<DN>), HWY_IF_SIGNED_V(V),
          HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
  const Half<decltype(dn)> dnh;
  VFromD<DN> demoted;
  demoted.v0 = DemoteTo(dnh, a);
  demoted.v1 = DemoteTo(dnh, b);
  return demoted;
}

// Unsigned input and output; same half-by-half layout as the signed overload.
template <class DN, typename V, HWY_IF_V_SIZE_D(DN, 32), HWY_IF_UNSIGNED_D(DN),
          HWY_IF_UNSIGNED_V(V),
          HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
  const Half<decltype(dn)> dnh;
  VFromD<DN> demoted;
  demoted.v0 = DemoteTo(dnh, a);
  demoted.v1 = DemoteTo(dnh, b);
  return demoted;
}

// ------------------------------ Convert i32 <=> f32 (Round)

// Converts each 128-bit half independently with the 128-bit ConvertTo.
template <class DTo, typename TFrom, typename TTo = TFromD<DTo>>
HWY_API Vec256<TTo> ConvertTo(DTo d, const Vec256<TFrom> v) {
  const Half<decltype(d)> dh;
  Vec256<TTo> ret;
  ret.v0 = ConvertTo(dh, v.v0);
  ret.v1 = ConvertTo(dh, v.v1);
  return ret;
}

// Applies Round() first, then converts the result to int32.
HWY_API Vec256<int32_t> NearestInt(const Vec256<float> v) {
  return ConvertTo(Full256<int32_t>(), Round(v));
}

// ================================================== MISC

// ------------------------------ LoadMaskBits (TestBit)

// `bits` points to at least 8 readable bytes, not all of which need be valid.
// 4- and 8-byte lanes: the bits of both halves live in bits[0].
template <class D, HWY_IF_V_SIZE_D(D, 32),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))>
HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
  const Half<decltype(d)> dh;
  MFromD<D> ret;
  ret.m0 = LoadMaskBits(dh, bits);
  // If size=4, one 128-bit vector has 4 mask bits; otherwise 2 for size=8.
  // Both halves fit in one byte's worth of mask bits.
  constexpr size_t kBitsPerHalf = 16 / sizeof(TFromD<D>);
  // Shift the upper half's bits down to bit 0; the remaining seven bytes of
  // the temporary are zero-initialized so the half-width load reads 8 bytes.
  const uint8_t bits_upper[8] = {static_cast<uint8_t>(bits[0] >> kBitsPerHalf)};
  ret.m1 = LoadMaskBits(dh, bits_upper);
  return ret;
}

// 1- and 2-byte lanes: each half occupies a whole number of bytes, so the
// upper half is loaded from `bits + kBytesPerHalf`.
template <class D, HWY_IF_V_SIZE_D(D, 32),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
  const Half<decltype(d)> dh;
  MFromD<D> ret;
  ret.m0 = LoadMaskBits(dh, bits);
  constexpr size_t kLanesPerHalf = 16 / sizeof(TFromD<D>);
  constexpr size_t kBytesPerHalf = kLanesPerHalf / 8;
  static_assert(kBytesPerHalf != 0, "Lane size <= 16 bits => at least 8 lanes");
  ret.m1 = LoadMaskBits(dh, bits + kBytesPerHalf);
  return ret;
}

// ------------------------------ Mask

// `bits` points to at least 8 writable bytes.
// 4- and 8-byte lanes: both halves are merged into bits[0]; returns 1.
template <class D, typename T = TFromD<D>,
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 8))>
HWY_API size_t StoreMaskBits(D d, const Mask256<T> mask, uint8_t* bits) {
  const Half<decltype(d)> dh;
  StoreMaskBits(dh, mask.m0, bits);
  // Save the lower half's bits before the second store overwrites bits[0].
  const uint8_t lo = bits[0];
  StoreMaskBits(dh, mask.m1, bits);
  // If size=4, one 128-bit vector has 4 mask bits; otherwise 2 for size=8.
  // Both halves fit in one byte's worth of mask bits.
  constexpr size_t kBitsPerHalf = 16 / sizeof(T);
  bits[0] = static_cast<uint8_t>(lo | (bits[0] << kBitsPerHalf));
  return (kBitsPerHalf * 2 + 7) / 8;
}

// 1- and 2-byte lanes: the upper half's bits are stored kBytesPerHalf bytes
// after the lower half's. Returns the total number of bytes written.
template <class D, typename T = TFromD<D>,
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
HWY_API size_t StoreMaskBits(D d, const Mask256<T> mask, uint8_t* bits) {
  const Half<decltype(d)> dh;
  constexpr size_t kLanesPerHalf = 16 / sizeof(T);
  constexpr size_t kBytesPerHalf = kLanesPerHalf / 8;
  static_assert(kBytesPerHalf != 0, "Lane size <= 16 bits => at least 8 lanes");
  StoreMaskBits(dh, mask.m0, bits);
  StoreMaskBits(dh, mask.m1, bits + kBytesPerHalf);
  return kBytesPerHalf * 2;
}

// Sum of the true-lane counts of both halves.
template <class D, typename T = TFromD<D>>
HWY_API size_t CountTrue(D d, const Mask256<T> m) {
  const Half<decltype(d)> dh;
  return CountTrue(dh, m.m0) + CountTrue(dh, m.m1);
}

// True iff both halves are all-false.
template <class D, typename T = TFromD<D>>
HWY_API bool AllFalse(D d, const Mask256<T> m) {
  const Half<decltype(d)> dh;
  return AllFalse(dh, m.m0) && AllFalse(dh, m.m1);
}

// True iff both halves are all-true.
template <class D, typename T = TFromD<D>>
HWY_API bool AllTrue(D d, const Mask256<T> m) {
  const Half<decltype(d)> dh;
  return AllTrue(dh, m.m0) && AllTrue(dh, m.m1);
}

// Index of the first true lane. The lower half is probed with the "unknown"
// variant because the true lane may be in the upper half only.
template <class D, typename T = TFromD<D>>
HWY_API size_t FindKnownFirstTrue(D d, const Mask256<T> mask) {
  const Half<decltype(d)> dh;
  const intptr_t lo = FindFirstTrue(dh, mask.m0);  // not known
  constexpr size_t kLanesPerHalf = 16 / sizeof(T);
  return lo >= 0 ? static_cast<size_t>(lo)
                 : kLanesPerHalf + FindKnownFirstTrue(dh, mask.m1);
}

// Index of the first true lane; if neither half has one, returns the negative
// result of the upper-half search unchanged.
template <class D, typename T = TFromD<D>>
HWY_API intptr_t FindFirstTrue(D d, const Mask256<T> mask) {
  const Half<decltype(d)> dh;
  const intptr_t lo = FindFirstTrue(dh, mask.m0);
  constexpr int kLanesPerHalf = 16 / sizeof(T);
  if (lo >= 0) return lo;

  const intptr_t hi = FindFirstTrue(dh, mask.m1);
  // Only offset a valid index; a negative "not found" value passes through.
  return hi + (hi >= 0 ? kLanesPerHalf : 0);
}

// Index of the last true lane; searches the upper half first.
template <class D, typename T = TFromD<D>>
HWY_API size_t FindKnownLastTrue(D d, const Mask256<T> mask) {
  const Half<decltype(d)> dh;
  const intptr_t hi = FindLastTrue(dh, mask.m1);  // not known
  constexpr size_t kLanesPerHalf = 16 / sizeof(T);
  return hi >= 0 ? kLanesPerHalf + static_cast<size_t>(hi)
                 : FindKnownLastTrue(dh, mask.m0);
}

// Index of the last true lane, or the lower half's result (which may be
// negative) if the upper half has none.
template <class D, typename T = TFromD<D>>
HWY_API intptr_t FindLastTrue(D d, const Mask256<T> mask) {
  const Half<decltype(d)> dh;
  constexpr int kLanesPerHalf = 16 / sizeof(T);
  const intptr_t hi = FindLastTrue(dh, mask.m1);
  return hi >= 0 ? kLanesPerHalf + hi : FindLastTrue(dh, mask.m0);
}

// ------------------------------ CompressStore

// Compress-stores the lower half, then the upper half directly after the lanes
// written by the first call. Returns the total count.
template <class D, typename T = TFromD<D>>
HWY_API size_t CompressStore(Vec256<T> v, const Mask256<T> mask, D d,
                             T* HWY_RESTRICT unaligned) {
  const Half<decltype(d)> dh;
  const size_t count = CompressStore(v.v0, mask.m0, dh, unaligned);
  const size_t count2 = CompressStore(v.v1, mask.m1, dh, unaligned + count);
  return count + count2;
}

// ------------------------------ CompressBlendedStore

// Same two-step scheme as CompressStore, using the 128-bit
// CompressBlendedStore for each half.
template <class D, typename T = TFromD<D>>
HWY_API size_t CompressBlendedStore(Vec256<T> v, const Mask256<T> m, D d,
                                    T* HWY_RESTRICT unaligned) {
  const Half<decltype(d)> dh;
  const size_t count = CompressBlendedStore(v.v0, m.m0, dh, unaligned);
  const size_t count2 = CompressBlendedStore(v.v1, m.m1, dh, unaligned + count);
  return count + count2;
}

// ------------------------------ CompressBitsStore

// Builds the mask from packed bits, then forwards to CompressStore.
template <class D, typename T = TFromD<D>>
HWY_API size_t CompressBitsStore(Vec256<T> v, const uint8_t* HWY_RESTRICT bits,
                                 D d, T* HWY_RESTRICT unaligned) {
  const Mask256<T> m = LoadMaskBits(d, bits);
  return CompressStore(v, m, d, unaligned);
}

// ------------------------------ Compress

// Implemented by compress-storing into a zero-initialized, 32-byte-aligned
// stack buffer and loading the whole buffer back.
template <typename T>
HWY_API Vec256<T> Compress(const Vec256<T> v, const Mask256<T> mask) {
  const DFromV<decltype(v)> d;
  alignas(32) T lanes[32 / sizeof(T)] = {};
  (void)CompressStore(v, mask, d, lanes);
  return Load(d, lanes);
}

// ------------------------------ CompressNot

// Compress with the inverted mask.
template <typename T>
HWY_API Vec256<T> CompressNot(Vec256<T> v, const Mask256<T> mask) {
  return Compress(v, Not(mask));
}

// ------------------------------ CompressBlocksNot
HWY_API Vec256<uint64_t> CompressBlocksNot(Vec256<uint64_t> v,
                                           Mask256<uint64_t> mask) {
  const Full128<uint64_t> dh;
  // Because the non-selected (mask=1) blocks are undefined, we can return the
  // input unless mask = 01, in which case we must bring down the upper block.
  // AndNot(m1, m0) is all-true exactly when m0 is set and m1 is clear.
  return AllTrue(dh, AndNot(mask.m1, mask.m0)) ? SwapAdjacentBlocks(v) : v;
}

// ------------------------------ CompressBits

// Builds the mask from packed bits, then forwards to Compress.
template <typename T>
HWY_API Vec256<T> CompressBits(Vec256<T> v, const uint8_t* HWY_RESTRICT bits) {
  const Mask256<T> m = LoadMaskBits(DFromV<decltype(v)>(), bits);
  return Compress(v, m);
}

// ------------------------------ Expand

// The lower half expands from v.v0. The upper half expands from the input
// lanes that follow the CountTrue(mask.m0) lanes consumed by the lower half,
// fetched via an unaligned load from a stack copy of `v`.
template <typename T>
HWY_API Vec256<T> Expand(const Vec256<T> v, const Mask256<T> mask) {
  Vec256<T> ret;
  const Full256<T> d;
  const Half<decltype(d)> dh;
  alignas(32) T lanes[32 / sizeof(T)] = {};
  Store(v, d, lanes);
  ret.v0 = Expand(v.v0, mask.m0);
  ret.v1 = Expand(LoadU(dh, lanes + CountTrue(dh, mask.m0)), mask.m1);
  return ret;
}

// ------------------------------ LoadExpand

// Unaligned full-vector load followed by Expand.
template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d,
                             const TFromD<D>* HWY_RESTRICT unaligned) {
  return Expand(LoadU(d, unaligned), mask);
}

// ------------------------------ LoadInterleaved3/4

// Implemented in generic_ops, we just overload LoadTransposedBlocks3/4.
+ +namespace detail { + +// Input: +// 1 0 (<- first block of unaligned) +// 3 2 +// 5 4 +// Output: +// 3 0 +// 4 1 +// 5 2 +template <class D, typename T = TFromD<D>> +HWY_API void LoadTransposedBlocks3(D d, const T* HWY_RESTRICT unaligned, + Vec256<T>& A, Vec256<T>& B, Vec256<T>& C) { + const Vec256<T> v10 = LoadU(d, unaligned + 0 * MaxLanes(d)); + const Vec256<T> v32 = LoadU(d, unaligned + 1 * MaxLanes(d)); + const Vec256<T> v54 = LoadU(d, unaligned + 2 * MaxLanes(d)); + + A = ConcatUpperLower(d, v32, v10); + B = ConcatLowerUpper(d, v54, v10); + C = ConcatUpperLower(d, v54, v32); +} + +// Input (128-bit blocks): +// 1 0 (first block of unaligned) +// 3 2 +// 5 4 +// 7 6 +// Output: +// 4 0 (LSB of A) +// 5 1 +// 6 2 +// 7 3 +template <class D, typename T = TFromD<D>> +HWY_API void LoadTransposedBlocks4(D d, const T* HWY_RESTRICT unaligned, + Vec256<T>& vA, Vec256<T>& vB, Vec256<T>& vC, + Vec256<T>& vD) { + const Vec256<T> v10 = LoadU(d, unaligned + 0 * MaxLanes(d)); + const Vec256<T> v32 = LoadU(d, unaligned + 1 * MaxLanes(d)); + const Vec256<T> v54 = LoadU(d, unaligned + 2 * MaxLanes(d)); + const Vec256<T> v76 = LoadU(d, unaligned + 3 * MaxLanes(d)); + + vA = ConcatLowerLower(d, v54, v10); + vB = ConcatUpperUpper(d, v54, v10); + vC = ConcatLowerLower(d, v76, v32); + vD = ConcatUpperUpper(d, v76, v32); +} + +} // namespace detail + +// ------------------------------ StoreInterleaved2/3/4 (ConcatUpperLower) + +// Implemented in generic_ops, we just overload StoreTransposedBlocks2/3/4. 
+ +namespace detail { + +// Input (128-bit blocks): +// 2 0 (LSB of i) +// 3 1 +// Output: +// 1 0 +// 3 2 +template <class D, typename T = TFromD<D>> +HWY_API void StoreTransposedBlocks2(Vec256<T> i, Vec256<T> j, D d, + T* HWY_RESTRICT unaligned) { + const Vec256<T> out0 = ConcatLowerLower(d, j, i); + const Vec256<T> out1 = ConcatUpperUpper(d, j, i); + StoreU(out0, d, unaligned + 0 * MaxLanes(d)); + StoreU(out1, d, unaligned + 1 * MaxLanes(d)); +} + +// Input (128-bit blocks): +// 3 0 (LSB of i) +// 4 1 +// 5 2 +// Output: +// 1 0 +// 3 2 +// 5 4 +template <class D, typename T = TFromD<D>> +HWY_API void StoreTransposedBlocks3(Vec256<T> i, Vec256<T> j, Vec256<T> k, D d, + T* HWY_RESTRICT unaligned) { + const Vec256<T> out0 = ConcatLowerLower(d, j, i); + const Vec256<T> out1 = ConcatUpperLower(d, i, k); + const Vec256<T> out2 = ConcatUpperUpper(d, k, j); + StoreU(out0, d, unaligned + 0 * MaxLanes(d)); + StoreU(out1, d, unaligned + 1 * MaxLanes(d)); + StoreU(out2, d, unaligned + 2 * MaxLanes(d)); +} + +// Input (128-bit blocks): +// 4 0 (LSB of i) +// 5 1 +// 6 2 +// 7 3 +// Output: +// 1 0 +// 3 2 +// 5 4 +// 7 6 +template <class D, typename T = TFromD<D>> +HWY_API void StoreTransposedBlocks4(Vec256<T> i, Vec256<T> j, Vec256<T> k, + Vec256<T> l, D d, + T* HWY_RESTRICT unaligned) { + // Write lower halves, then upper. 
+ const Vec256<T> out0 = ConcatLowerLower(d, j, i); + const Vec256<T> out1 = ConcatLowerLower(d, l, k); + StoreU(out0, d, unaligned + 0 * MaxLanes(d)); + StoreU(out1, d, unaligned + 1 * MaxLanes(d)); + const Vec256<T> out2 = ConcatUpperUpper(d, j, i); + const Vec256<T> out3 = ConcatUpperUpper(d, l, k); + StoreU(out2, d, unaligned + 2 * MaxLanes(d)); + StoreU(out3, d, unaligned + 3 * MaxLanes(d)); +} + +} // namespace detail + +// ------------------------------ WidenMulPairwiseAdd +template <class D32, typename T16, typename T32 = TFromD<D32>> +HWY_API Vec256<T32> WidenMulPairwiseAdd(D32 d32, Vec256<T16> a, + Vec256<T16> b) { + const Half<decltype(d32)> d32h; + a.v0 = WidenMulPairwiseAdd(d32h, a.v0, b.v0); + a.v1 = WidenMulPairwiseAdd(d32h, a.v1, b.v1); + return a; +} + +// ------------------------------ ReorderWidenMulAccumulate +template <class D32, typename T16, typename T32 = TFromD<D32>> +HWY_API Vec256<T32> ReorderWidenMulAccumulate(D32 d32, Vec256<T16> a, + Vec256<T16> b, Vec256<T32> sum0, + Vec256<T32>& sum1) { + const Half<decltype(d32)> d32h; + sum0.v0 = ReorderWidenMulAccumulate(d32h, a.v0, b.v0, sum0.v0, sum1.v0); + sum0.v1 = ReorderWidenMulAccumulate(d32h, a.v1, b.v1, sum0.v1, sum1.v1); + return sum0; +} + +// ------------------------------ RearrangeToOddPlusEven +template <typename TW> +HWY_API Vec256<TW> RearrangeToOddPlusEven(Vec256<TW> sum0, Vec256<TW> sum1) { + sum0.v0 = RearrangeToOddPlusEven(sum0.v0, sum1.v0); + sum0.v1 = RearrangeToOddPlusEven(sum0.v1, sum1.v1); + return sum0; +} + +// ------------------------------ Reductions + +template <class D, typename T = TFromD<D>> +HWY_API Vec256<T> SumOfLanes(D d, const Vec256<T> v) { + const Half<decltype(d)> dh; + const Vec128<T> lo = SumOfLanes(dh, Add(v.v0, v.v1)); + return Combine(d, lo, lo); +} + +template <class D, typename T = TFromD<D>> +HWY_API T ReduceSum(D d, const Vec256<T> v) { + const Half<decltype(d)> dh; + return ReduceSum(dh, Add(v.v0, v.v1)); +} + +template <class D, typename T = 
TFromD<D>> +HWY_API Vec256<T> MinOfLanes(D d, const Vec256<T> v) { + const Half<decltype(d)> dh; + const Vec128<T> lo = MinOfLanes(dh, Min(v.v0, v.v1)); + return Combine(d, lo, lo); +} + +template <class D, typename T = TFromD<D>> +HWY_API Vec256<T> MaxOfLanes(D d, const Vec256<T> v) { + const Half<decltype(d)> dh; + const Vec128<T> lo = MaxOfLanes(dh, Max(v.v0, v.v1)); + return Combine(d, lo, lo); +} + +// ------------------------------ Lt128 + +template <class D, typename T = TFromD<D>> +HWY_INLINE Mask256<T> Lt128(D d, Vec256<T> a, Vec256<T> b) { + const Half<decltype(d)> dh; + Mask256<T> ret; + ret.m0 = Lt128(dh, a.v0, b.v0); + ret.m1 = Lt128(dh, a.v1, b.v1); + return ret; +} + +template <class D, typename T = TFromD<D>> +HWY_INLINE Mask256<T> Lt128Upper(D d, Vec256<T> a, Vec256<T> b) { + const Half<decltype(d)> dh; + Mask256<T> ret; + ret.m0 = Lt128Upper(dh, a.v0, b.v0); + ret.m1 = Lt128Upper(dh, a.v1, b.v1); + return ret; +} + +template <class D, typename T = TFromD<D>> +HWY_INLINE Mask256<T> Eq128(D d, Vec256<T> a, Vec256<T> b) { + const Half<decltype(d)> dh; + Mask256<T> ret; + ret.m0 = Eq128(dh, a.v0, b.v0); + ret.m1 = Eq128(dh, a.v1, b.v1); + return ret; +} + +template <class D, typename T = TFromD<D>> +HWY_INLINE Mask256<T> Eq128Upper(D d, Vec256<T> a, Vec256<T> b) { + const Half<decltype(d)> dh; + Mask256<T> ret; + ret.m0 = Eq128Upper(dh, a.v0, b.v0); + ret.m1 = Eq128Upper(dh, a.v1, b.v1); + return ret; +} + +template <class D, typename T = TFromD<D>> +HWY_INLINE Mask256<T> Ne128(D d, Vec256<T> a, Vec256<T> b) { + const Half<decltype(d)> dh; + Mask256<T> ret; + ret.m0 = Ne128(dh, a.v0, b.v0); + ret.m1 = Ne128(dh, a.v1, b.v1); + return ret; +} + +template <class D, typename T = TFromD<D>> +HWY_INLINE Mask256<T> Ne128Upper(D d, Vec256<T> a, Vec256<T> b) { + const Half<decltype(d)> dh; + Mask256<T> ret; + ret.m0 = Ne128Upper(dh, a.v0, b.v0); + ret.m1 = Ne128Upper(dh, a.v1, b.v1); + return ret; +} + +template <class D, typename T = TFromD<D>> +HWY_INLINE 
Vec256<T> Min128(D d, Vec256<T> a, Vec256<T> b) { + const Half<decltype(d)> dh; + Vec256<T> ret; + ret.v0 = Min128(dh, a.v0, b.v0); + ret.v1 = Min128(dh, a.v1, b.v1); + return ret; +} + +template <class D, typename T = TFromD<D>> +HWY_INLINE Vec256<T> Max128(D d, Vec256<T> a, Vec256<T> b) { + const Half<decltype(d)> dh; + Vec256<T> ret; + ret.v0 = Max128(dh, a.v0, b.v0); + ret.v1 = Max128(dh, a.v1, b.v1); + return ret; +} + +template <class D, typename T = TFromD<D>> +HWY_INLINE Vec256<T> Min128Upper(D d, Vec256<T> a, Vec256<T> b) { + const Half<decltype(d)> dh; + Vec256<T> ret; + ret.v0 = Min128Upper(dh, a.v0, b.v0); + ret.v1 = Min128Upper(dh, a.v1, b.v1); + return ret; +} + +template <class D, typename T = TFromD<D>> +HWY_INLINE Vec256<T> Max128Upper(D d, Vec256<T> a, Vec256<T> b) { + const Half<decltype(d)> dh; + Vec256<T> ret; + ret.v0 = Max128Upper(dh, a.v0, b.v0); + ret.v1 = Max128Upper(dh, a.v1, b.v1); + return ret; +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); diff --git a/third_party/highway/hwy/ops/x86_128-inl.h b/third_party/highway/hwy/ops/x86_128-inl.h new file mode 100644 index 0000000000..e98c22ff35 --- /dev/null +++ b/third_party/highway/hwy/ops/x86_128-inl.h @@ -0,0 +1,9038 @@ +// Copyright 2019 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 

// 128-bit vectors and SSE4 instructions, plus some AVX2 and AVX512-VL
// operations when compiling for those targets.
// External include guard in highway.h - see comment there.

// Must come before HWY_DIAGNOSTICS and HWY_COMPILER_GCC_ACTUAL
#include "hwy/base.h"

// Avoid uninitialized warnings in GCC's emmintrin.h - see
// https://github.com/google/highway/issues/710 and pull/902
HWY_DIAGNOSTICS(push)
#if HWY_COMPILER_GCC_ACTUAL
HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
HWY_DIAGNOSTICS_OFF(disable : 4701 4703 6001 26494,
                    ignored "-Wmaybe-uninitialized")
#endif

#include <emmintrin.h>
#include <stdio.h>
#if HWY_TARGET == HWY_SSSE3
#include <tmmintrin.h>  // SSSE3
#elif HWY_TARGET <= HWY_SSE4
#include <smmintrin.h>  // SSE4
#ifndef HWY_DISABLE_PCLMUL_AES
#include <wmmintrin.h>  // CLMUL
#endif
#endif
#include <string.h>  // memcpy

#include "hwy/ops/shared-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
namespace detail {

// Maps a lane type to the raw SSE register type: __m128 for float, __m128d
// for double, and __m128i for every other lane type.
template <typename T>
struct Raw128 {
  using type = __m128i;
};
template <>
struct Raw128<float> {
  using type = __m128;
};
template <>
struct Raw128<double> {
  using type = __m128d;
};

}  // namespace detail

// Wrapper around one raw 128-bit register holding N lanes of type T.
// N defaults to the number of lanes that fill all 16 bytes.
template <typename T, size_t N = 16 / sizeof(T)>
class Vec128 {
  using Raw = typename detail::Raw128<T>::type;

 public:
  using PrivateT = T;                     // only for DFromV
  static constexpr size_t kPrivateN = N;  // only for DFromV

  // Compound assignment. Only usable if there is a corresponding non-member
  // binary operator overload. For example, only f32 and f64 support division.
  HWY_INLINE Vec128& operator*=(const Vec128 other) {
    return *this = (*this * other);
  }
  HWY_INLINE Vec128& operator/=(const Vec128 other) {
    return *this = (*this / other);
  }
  HWY_INLINE Vec128& operator+=(const Vec128 other) {
    return *this = (*this + other);
  }
  HWY_INLINE Vec128& operator-=(const Vec128 other) {
    return *this = (*this - other);
  }
  HWY_INLINE Vec128& operator&=(const Vec128 other) {
    return *this = (*this & other);
  }
  HWY_INLINE Vec128& operator|=(const Vec128 other) {
    return *this = (*this | other);
  }
  HWY_INLINE Vec128& operator^=(const Vec128 other) {
    return *this = (*this ^ other);
  }

  Raw raw;
};

// Partial vectors: aliases for Vec128 with 8, 4 and 2 bytes' worth of lanes.
template <typename T>
using Vec64 = Vec128<T, 8 / sizeof(T)>;

template <typename T>
using Vec32 = Vec128<T, 4 / sizeof(T)>;

template <typename T>
using Vec16 = Vec128<T, 2 / sizeof(T)>;

#if HWY_TARGET <= HWY_AVX3

namespace detail {

// Template arg: sizeof(lane type)
// 1-byte lanes use __mmask16 (up to 16 lanes); larger lanes use __mmask8.
template <size_t size>
struct RawMask128 {};
template <>
struct RawMask128<1> {
  using type = __mmask16;
};
template <>
struct RawMask128<2> {
  using type = __mmask8;
};
template <>
struct RawMask128<4> {
  using type = __mmask8;
};
template <>
struct RawMask128<8> {
  using type = __mmask8;
};

}  // namespace detail

// AVX3: the mask is stored as a __mmask8/__mmask16 value.
template <typename T, size_t N = 16 / sizeof(T)>
struct Mask128 {
  using Raw = typename detail::RawMask128<sizeof(T)>::type;

  // Wraps `mask_bits`, narrowed to the raw mask type by static_cast.
  static Mask128<T, N> FromBits(uint64_t mask_bits) {
    return Mask128<T, N>{static_cast<Raw>(mask_bits)};
  }

  Raw raw;
};

#else  // AVX2 or below

// FF..FF or 0.
template <typename T, size_t N = 16 / sizeof(T)>
struct Mask128 {
  typename detail::Raw128<T>::type raw;
};

#endif  // AVX2 or below

namespace detail {

// Returns the lowest N of the _mm_movemask* bits.
// Full 16-byte vectors are returned unmasked.
template <typename T, size_t N>
constexpr uint64_t OnlyActive(uint64_t mask_bits) {
  return ((N * sizeof(T)) == 16) ? mask_bits : mask_bits & ((1ull << N) - 1);
}

}  // namespace detail

#if HWY_TARGET <= HWY_AVX3
namespace detail {

// Used by Expand() emulation, which is required for both AVX3 and AVX2.
template <typename T, size_t N>
HWY_INLINE uint64_t BitsFromMask(const Mask128<T, N> mask) {
  return OnlyActive<T, N>(mask.raw);
}

}  // namespace detail
#endif  // HWY_TARGET <= HWY_AVX3

// Recovers the descriptor tag / lane type from the vector's PrivateT and
// kPrivateN members.
template <class V>
using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;

template <class V>
using TFromV = typename V::PrivateT;

// ------------------------------ Zero

// Use HWY_MAX_LANES_D here because VFromD is defined in terms of Zero.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_FLOAT_D(D)>
HWY_API Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
  return Vec128<TFromD<D>, HWY_MAX_LANES_D(D)>{_mm_setzero_si128()};
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
HWY_API Vec128<float, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
  return Vec128<float, HWY_MAX_LANES_D(D)>{_mm_setzero_ps()};
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
HWY_API Vec128<double, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
  return Vec128<double, HWY_MAX_LANES_D(D)>{_mm_setzero_pd()};
}

// Using the existing Zero function instead of a dedicated function for
// deduction avoids having to forward-declare Vec256 here.
// Vector type for descriptor D, deduced from the return type of Zero(D()).
template <class D>
using VFromD = decltype(Zero(D()));

// ------------------------------ Tuple (VFromD)
#include "hwy/ops/tuple-inl.h"

// ------------------------------ BitCast

namespace detail {

// Reinterprets any raw 128-bit register as __m128i.
HWY_INLINE __m128i BitCastToInteger(__m128i v) { return v; }
HWY_INLINE __m128i BitCastToInteger(__m128 v) { return _mm_castps_si128(v); }
HWY_INLINE __m128i BitCastToInteger(__m128d v) { return _mm_castpd_si128(v); }

// Reinterprets a vector as uint8_t lanes with the same total byte count.
template <typename T, size_t N>
HWY_INLINE Vec128<uint8_t, N * sizeof(T)> BitCastToByte(Vec128<T, N> v) {
  return Vec128<uint8_t, N * sizeof(T)>{BitCastToInteger(v.raw)};
}

// Cannot rely on function overloading because return types differ.
template <typename T>
struct BitCastFromInteger128 {
  HWY_INLINE __m128i operator()(__m128i v) { return v; }
};
template <>
struct BitCastFromInteger128<float> {
  HWY_INLINE __m128 operator()(__m128i v) { return _mm_castsi128_ps(v); }
};
template <>
struct BitCastFromInteger128<double> {
  HWY_INLINE __m128d operator()(__m128i v) { return _mm_castsi128_pd(v); }
};

// Reinterprets a byte vector as the vector type of descriptor D.
template <class D>
HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
                                     Vec128<uint8_t, D().MaxBytes()> v) {
  return VFromD<D>{BitCastFromInteger128<TFromD<D>>()(v.raw)};
}

}  // namespace detail

// Reinterprets the bits of `v` as lanes of D's type, going through a byte
// vector; no value conversion is performed.
template <class D, typename FromT>
HWY_API VFromD<D> BitCast(D d,
                          Vec128<FromT, Repartition<FromT, D>().MaxLanes()> v) {
  return detail::BitCastFromByte(d, detail::BitCastToByte(v));
}

// ------------------------------ Set

// Broadcasts `t` to all lanes; one overload per lane size / type.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
  return VFromD<D>{_mm_set1_epi8(static_cast<char>(t))};  // NOLINT
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2),
          HWY_IF_NOT_SPECIAL_FLOAT_D(D)>
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
  return VFromD<D>{_mm_set1_epi16(static_cast<short>(t))};  // NOLINT
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI32_D(D)>
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
  return VFromD<D>{_mm_set1_epi32(static_cast<int>(t))};
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D)>
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
  return VFromD<D>{_mm_set1_epi64x(static_cast<long long>(t))};  // NOLINT
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
HWY_API VFromD<D> Set(D /* tag */, float t) {
  return VFromD<D>{_mm_set1_ps(t)};
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
HWY_API VFromD<D> Set(D /* tag */, double t) {
  return VFromD<D>{_mm_set1_pd(t)};
}

// Generic for all vector lengths.
// Copies the 16-bit pattern of `t` into a uint16_t, broadcasts that, and
// casts the result back to the special float type.
template <class D, HWY_IF_SPECIAL_FLOAT_D(D)>
HWY_API VFromD<D> Set(D df, TFromD<D> t) {
  const RebindToUnsigned<decltype(df)> du;
  static_assert(sizeof(TFromD<D>) == 2, "Expecting [b]f16");
  uint16_t bits;
  CopyBytes<2>(&t, &bits);
  return BitCast(df, Set(du, bits));
}

// ------------------------------ Undefined

HWY_DIAGNOSTICS(push)
HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")

// Returns a vector with uninitialized elements.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_FLOAT_D(D)>
HWY_API VFromD<D> Undefined(D /* tag */) {
  // Available on Clang 6.0, GCC 6.2, ICC 16.03, MSVC 19.14. All but ICC
  // generate an XOR instruction.
  return VFromD<D>{_mm_undefined_si128()};
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
HWY_API VFromD<D> Undefined(D /* tag */) {
  return VFromD<D>{_mm_undefined_ps()};
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
HWY_API VFromD<D> Undefined(D /* tag */) {
  return VFromD<D>{_mm_undefined_pd()};
}

HWY_DIAGNOSTICS(pop)

// ------------------------------ GetLane

// Returns the lowest lane. For 1- and 2-byte lanes the 32-bit extraction is
// masked down to the lane width before the cast.
template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
HWY_API T GetLane(const Vec128<T, N> v) {
  return static_cast<T>(_mm_cvtsi128_si32(v.raw) & 0xFF);
}
template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
HWY_API T GetLane(const Vec128<T, N> v) {
  return static_cast<T>(_mm_cvtsi128_si32(v.raw) & 0xFFFF);
}
template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
HWY_API T GetLane(const Vec128<T, N> v) {
  return static_cast<T>(_mm_cvtsi128_si32(v.raw));
}
template <size_t N>
HWY_API float GetLane(const Vec128<float, N> v) {
  return _mm_cvtss_f32(v.raw);
}
template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
HWY_API T GetLane(const Vec128<T, N> v) {
#if HWY_ARCH_X86_32
  // 32-bit x86 branch: go through memory instead of _mm_cvtsi128_si64.
  const DFromV<decltype(v)> d;
  alignas(16) T lanes[2];
  Store(v, d, lanes);
  return lanes[0];
#else
  return static_cast<T>(_mm_cvtsi128_si64(v.raw));
#endif
}
template <size_t N>
HWY_API double GetLane(const Vec128<double, N> v) {
  return _mm_cvtsd_f64(v.raw);
}

// ------------------------------ ResizeBitCast

// Reinterprets the raw register of `v` as the vector type of D; both source
// and destination are at most 16 bytes.
template <class D, class FromV, HWY_IF_V_SIZE_LE_V(FromV, 16),
          HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
  const Repartition<uint8_t, decltype(d)> du8;
  return BitCast(d, VFromD<decltype(du8)>{detail::BitCastToInteger(v.raw)});
}

// ================================================== LOGICAL

// ------------------------------ And

template <typename T, size_t N>
HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{_mm_and_si128(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<float, N> And(const Vec128<float, N> a,
                             const Vec128<float, N> b) {
  return Vec128<float, N>{_mm_and_ps(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> And(const Vec128<double, N> a,
                              const Vec128<double, N> b) {
  return Vec128<double, N>{_mm_and_pd(a.raw, b.raw)};
}

// ------------------------------ AndNot

// Returns ~not_mask & mask.
template <typename T, size_t N>
HWY_API Vec128<T, N> AndNot(Vec128<T, N> not_mask, Vec128<T, N> mask) {
  return Vec128<T, N>{_mm_andnot_si128(not_mask.raw, mask.raw)};
}
template <size_t N>
HWY_API Vec128<float, N> AndNot(const Vec128<float, N> not_mask,
                                const Vec128<float, N> mask) {
  return Vec128<float, N>{_mm_andnot_ps(not_mask.raw, mask.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> AndNot(const Vec128<double, N> not_mask,
                                 const Vec128<double, N> mask) {
  return Vec128<double, N>{_mm_andnot_pd(not_mask.raw, mask.raw)};
}

// ------------------------------ Or

template <typename T, size_t N>
HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{_mm_or_si128(a.raw, b.raw)};
}

template <size_t N>
HWY_API Vec128<float, N> Or(const Vec128<float, N> a,
                            const Vec128<float, N> b) {
  return Vec128<float, N>{_mm_or_ps(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> Or(const Vec128<double, N> a,
                             const Vec128<double, N> b) {
  return Vec128<double, N>{_mm_or_pd(a.raw, b.raw)};
}

// ------------------------------ Xor

template <typename T, size_t N>
HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{_mm_xor_si128(a.raw, b.raw)};
}

template <size_t N>
HWY_API Vec128<float, N> Xor(const Vec128<float, N> a,
                             const Vec128<float, N> b) {
  return Vec128<float, N>{_mm_xor_ps(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> Xor(const Vec128<double, N> a,
                              const Vec128<double, N> b) {
  return Vec128<double, N>{_mm_xor_pd(a.raw, b.raw)};
}

// ------------------------------ Not

// Bitwise complement: one ternary-logic instruction on AVX3, otherwise XOR
// with an all-ones vector.
template <typename T, size_t N>
HWY_API Vec128<T, N> Not(const Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
#if HWY_TARGET <= HWY_AVX3
  const __m128i vu = BitCast(du, v).raw;
  return BitCast(d, VU{_mm_ternarylogic_epi32(vu, vu, vu, 0x55)});
#else
  return Xor(v, BitCast(d, VU{_mm_set1_epi32(-1)}));
#endif
}

// ------------------------------ Xor3

// x1 ^ x2 ^ x3 (see the non-AVX3 branch for the reference expression).
template <typename T, size_t N>
HWY_API Vec128<T, N> Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) {
#if HWY_TARGET <= HWY_AVX3
  const DFromV<decltype(x1)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
  const __m128i ret = _mm_ternarylogic_epi64(
      BitCast(du, x1).raw, BitCast(du, x2).raw, BitCast(du, x3).raw, 0x96);
  return BitCast(d, VU{ret});
#else
  return Xor(x1, Xor(x2, x3));
#endif
}

// ------------------------------ Or3

// o1 | o2 | o3 (see the non-AVX3 branch for the reference expression).
template <typename T, size_t N>
HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
#if HWY_TARGET <= HWY_AVX3
  const DFromV<decltype(o1)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
  const __m128i ret = _mm_ternarylogic_epi64(
      BitCast(du, o1).raw, BitCast(du, o2).raw, BitCast(du, o3).raw, 0xFE);
  return BitCast(d, VU{ret});
#else
  return Or(o1, Or(o2, o3));
#endif
}

// ------------------------------ OrAnd

// o | (a1 & a2) (see the non-AVX3 branch for the reference expression).
template <typename T, size_t N>
HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
#if HWY_TARGET <= HWY_AVX3
  const DFromV<decltype(o)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
  const __m128i ret = _mm_ternarylogic_epi64(
      BitCast(du, o).raw, BitCast(du, a1).raw, BitCast(du, a2).raw, 0xF8);
  return BitCast(d, VU{ret});
#else
  return Or(o, And(a1, a2));
#endif
}

// ------------------------------ IfVecThenElse

// Selects between `yes` and `no` using the vector `mask`. AVX3 uses a single
// ternary-logic instruction; other targets convert `mask` with MaskFromVec
// and call IfThenElse.
template <typename T, size_t N>
HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
                                   Vec128<T, N> no) {
#if HWY_TARGET <= HWY_AVX3
  const DFromV<decltype(no)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
  return BitCast(
      d, VU{_mm_ternarylogic_epi64(BitCast(du, mask).raw, BitCast(du, yes).raw,
                                   BitCast(du, no).raw, 0xCA)});
#else
  return IfThenElse(MaskFromVec(mask), yes, no);
#endif
}

// ------------------------------ BitwiseIfThenElse
#if HWY_TARGET <= HWY_AVX3

// Toggles HWY_NATIVE_BITWISE_IF_THEN_ELSE.
// NOTE(review): presumably this signals generic_ops that a native
// implementation exists for this target - confirm against generic_ops-inl.h.
#ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE
#undef HWY_NATIVE_BITWISE_IF_THEN_ELSE
#else
#define HWY_NATIVE_BITWISE_IF_THEN_ELSE
#endif

template <class V>
HWY_API V BitwiseIfThenElse(V mask, V yes, V no) {
  return IfVecThenElse(mask, yes, no);
}

#endif

// ------------------------------ Operator overloads (internal-only if float)

template <typename T, size_t N>
HWY_API Vec128<T, N> operator&(const Vec128<T, N> a, const Vec128<T, N> b) {
  return And(a, b);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> operator|(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Or(a, b);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> operator^(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Xor(a, b);
}

// ------------------------------ PopulationCount

// 8/16 require BITALG, 32/64 require VPOPCNTDQ.
+#if HWY_TARGET <= HWY_AVX3_DL + +#ifdef HWY_NATIVE_POPCNT +#undef HWY_NATIVE_POPCNT +#else +#define HWY_NATIVE_POPCNT +#endif + +namespace detail { + +template <typename T, size_t N> +HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<1> /* tag */, + Vec128<T, N> v) { + return Vec128<T, N>{_mm_popcnt_epi8(v.raw)}; +} +template <typename T, size_t N> +HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<2> /* tag */, + Vec128<T, N> v) { + return Vec128<T, N>{_mm_popcnt_epi16(v.raw)}; +} +template <typename T, size_t N> +HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<4> /* tag */, + Vec128<T, N> v) { + return Vec128<T, N>{_mm_popcnt_epi32(v.raw)}; +} +template <typename T, size_t N> +HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<8> /* tag */, + Vec128<T, N> v) { + return Vec128<T, N>{_mm_popcnt_epi64(v.raw)}; +} + +} // namespace detail + +template <typename T, size_t N> +HWY_API Vec128<T, N> PopulationCount(Vec128<T, N> v) { + return detail::PopulationCount(hwy::SizeTag<sizeof(T)>(), v); +} + +#endif // HWY_TARGET <= HWY_AVX3_DL + +// ================================================== SIGN + +// ------------------------------ Neg + +// Tag dispatch instead of SFINAE for MSVC 2017 compatibility +namespace detail { + +template <typename T, size_t N> +HWY_INLINE Vec128<T, N> Neg(hwy::FloatTag /*tag*/, const Vec128<T, N> v) { + return Xor(v, SignBit(DFromV<decltype(v)>())); +} + +template <typename T, size_t N> +HWY_INLINE Vec128<T, N> Neg(hwy::NonFloatTag /*tag*/, const Vec128<T, N> v) { + return Zero(DFromV<decltype(v)>()) - v; +} + +} // namespace detail + +template <typename T, size_t N> +HWY_INLINE Vec128<T, N> Neg(const Vec128<T, N> v) { + return detail::Neg(hwy::IsFloatTag<T>(), v); +} + +// ------------------------------ Floating-point Abs + +// Returns absolute value +template <size_t N> +HWY_API Vec128<float, N> Abs(const Vec128<float, N> v) { + const Vec128<int32_t, N> mask{_mm_set1_epi32(0x7FFFFFFF)}; + return v & 
BitCast(DFromV<decltype(v)>(), mask); +} +template <size_t N> +HWY_API Vec128<double, N> Abs(const Vec128<double, N> v) { + const Vec128<int64_t, N> mask{_mm_set1_epi64x(0x7FFFFFFFFFFFFFFFLL)}; + return v & BitCast(DFromV<decltype(v)>(), mask); +} + +// ------------------------------ CopySign + +template <typename T, size_t N> +HWY_API Vec128<T, N> CopySign(const Vec128<T, N> magn, + const Vec128<T, N> sign) { + static_assert(IsFloat<T>(), "Only makes sense for floating-point"); + + const DFromV<decltype(magn)> d; + const auto msb = SignBit(d); + +#if HWY_TARGET <= HWY_AVX3 + const RebindToUnsigned<decltype(d)> du; + // Truth table for msb, magn, sign | bitwise msb ? sign : mag + // 0 0 0 | 0 + // 0 0 1 | 0 + // 0 1 0 | 1 + // 0 1 1 | 1 + // 1 0 0 | 0 + // 1 0 1 | 1 + // 1 1 0 | 0 + // 1 1 1 | 1 + // The lane size does not matter because we are not using predication. + const __m128i out = _mm_ternarylogic_epi32( + BitCast(du, msb).raw, BitCast(du, magn).raw, BitCast(du, sign).raw, 0xAC); + return BitCast(d, VFromD<decltype(du)>{out}); +#else + return Or(AndNot(msb, magn), And(msb, sign)); +#endif +} + +template <typename T, size_t N> +HWY_API Vec128<T, N> CopySignToAbs(const Vec128<T, N> abs, + const Vec128<T, N> sign) { +#if HWY_TARGET <= HWY_AVX3 + // AVX3 can also handle abs < 0, so no extra action needed. + return CopySign(abs, sign); +#else + return Or(abs, And(SignBit(DFromV<decltype(abs)>()), sign)); +#endif +} + +// ================================================== MASK + +#if HWY_TARGET <= HWY_AVX3 + +// ------------------------------ IfThenElse + +// Returns mask ? b : a. + +namespace detail { + +// Templates for signed/unsigned integer of a particular size. 
+template <typename T, size_t N> +HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<1> /* tag */, + Mask128<T, N> mask, Vec128<T, N> yes, + Vec128<T, N> no) { + return Vec128<T, N>{_mm_mask_mov_epi8(no.raw, mask.raw, yes.raw)}; +} +template <typename T, size_t N> +HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<2> /* tag */, + Mask128<T, N> mask, Vec128<T, N> yes, + Vec128<T, N> no) { + return Vec128<T, N>{_mm_mask_mov_epi16(no.raw, mask.raw, yes.raw)}; +} +template <typename T, size_t N> +HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<4> /* tag */, + Mask128<T, N> mask, Vec128<T, N> yes, + Vec128<T, N> no) { + return Vec128<T, N>{_mm_mask_mov_epi32(no.raw, mask.raw, yes.raw)}; +} +template <typename T, size_t N> +HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<8> /* tag */, + Mask128<T, N> mask, Vec128<T, N> yes, + Vec128<T, N> no) { + return Vec128<T, N>{_mm_mask_mov_epi64(no.raw, mask.raw, yes.raw)}; +} + +} // namespace detail + +template <typename T, size_t N> +HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes, + Vec128<T, N> no) { + return detail::IfThenElse(hwy::SizeTag<sizeof(T)>(), mask, yes, no); +} + +template <size_t N> +HWY_API Vec128<float, N> IfThenElse(Mask128<float, N> mask, + Vec128<float, N> yes, Vec128<float, N> no) { + return Vec128<float, N>{_mm_mask_mov_ps(no.raw, mask.raw, yes.raw)}; +} + +template <size_t N> +HWY_API Vec128<double, N> IfThenElse(Mask128<double, N> mask, + Vec128<double, N> yes, + Vec128<double, N> no) { + return Vec128<double, N>{_mm_mask_mov_pd(no.raw, mask.raw, yes.raw)}; +} + +namespace detail { + +template <typename T, size_t N> +HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<1> /* tag */, + Mask128<T, N> mask, Vec128<T, N> yes) { + return Vec128<T, N>{_mm_maskz_mov_epi8(mask.raw, yes.raw)}; +} +template <typename T, size_t N> +HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<2> /* tag */, + Mask128<T, N> mask, Vec128<T, N> yes) { + return Vec128<T, N>{_mm_maskz_mov_epi16(mask.raw, yes.raw)}; 
+} +template <typename T, size_t N> +HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<4> /* tag */, + Mask128<T, N> mask, Vec128<T, N> yes) { + return Vec128<T, N>{_mm_maskz_mov_epi32(mask.raw, yes.raw)}; +} +template <typename T, size_t N> +HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<8> /* tag */, + Mask128<T, N> mask, Vec128<T, N> yes) { + return Vec128<T, N>{_mm_maskz_mov_epi64(mask.raw, yes.raw)}; +} + +} // namespace detail + +template <typename T, size_t N> +HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) { + return detail::IfThenElseZero(hwy::SizeTag<sizeof(T)>(), mask, yes); +} + +template <size_t N> +HWY_API Vec128<float, N> IfThenElseZero(Mask128<float, N> mask, + Vec128<float, N> yes) { + return Vec128<float, N>{_mm_maskz_mov_ps(mask.raw, yes.raw)}; +} + +template <size_t N> +HWY_API Vec128<double, N> IfThenElseZero(Mask128<double, N> mask, + Vec128<double, N> yes) { + return Vec128<double, N>{_mm_maskz_mov_pd(mask.raw, yes.raw)}; +} + +namespace detail { + +template <typename T, size_t N> +HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<1> /* tag */, + Mask128<T, N> mask, Vec128<T, N> no) { + // xor_epi8/16 are missing, but we have sub, which is just as fast for u8/16. 
+ return Vec128<T, N>{_mm_mask_sub_epi8(no.raw, mask.raw, no.raw, no.raw)}; +} +template <typename T, size_t N> +HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<2> /* tag */, + Mask128<T, N> mask, Vec128<T, N> no) { + return Vec128<T, N>{_mm_mask_sub_epi16(no.raw, mask.raw, no.raw, no.raw)}; +} +template <typename T, size_t N> +HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<4> /* tag */, + Mask128<T, N> mask, Vec128<T, N> no) { + return Vec128<T, N>{_mm_mask_xor_epi32(no.raw, mask.raw, no.raw, no.raw)}; +} +template <typename T, size_t N> +HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<8> /* tag */, + Mask128<T, N> mask, Vec128<T, N> no) { + return Vec128<T, N>{_mm_mask_xor_epi64(no.raw, mask.raw, no.raw, no.raw)}; +} + +} // namespace detail + +template <typename T, size_t N> +HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) { + return detail::IfThenZeroElse(hwy::SizeTag<sizeof(T)>(), mask, no); +} + +template <size_t N> +HWY_API Vec128<float, N> IfThenZeroElse(Mask128<float, N> mask, + Vec128<float, N> no) { + return Vec128<float, N>{_mm_mask_xor_ps(no.raw, mask.raw, no.raw, no.raw)}; +} + +template <size_t N> +HWY_API Vec128<double, N> IfThenZeroElse(Mask128<double, N> mask, + Vec128<double, N> no) { + return Vec128<double, N>{_mm_mask_xor_pd(no.raw, mask.raw, no.raw, no.raw)}; +} + +// ------------------------------ Mask logical + +// For Clang and GCC, mask intrinsics (KORTEST) weren't added until recently. 
+#if !defined(HWY_COMPILER_HAS_MASK_INTRINSICS) +#if HWY_COMPILER_MSVC != 0 || HWY_COMPILER_GCC_ACTUAL >= 700 || \ + HWY_COMPILER_CLANG >= 800 +#define HWY_COMPILER_HAS_MASK_INTRINSICS 1 +#else +#define HWY_COMPILER_HAS_MASK_INTRINSICS 0 +#endif +#endif // HWY_COMPILER_HAS_MASK_INTRINSICS + +namespace detail { + +template <typename T, size_t N> +HWY_INLINE Mask128<T, N> And(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a, + const Mask128<T, N> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask128<T, N>{_kand_mask16(a.raw, b.raw)}; +#else + return Mask128<T, N>{static_cast<__mmask16>(a.raw & b.raw)}; +#endif +} +template <typename T, size_t N> +HWY_INLINE Mask128<T, N> And(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a, + const Mask128<T, N> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask128<T, N>{_kand_mask8(a.raw, b.raw)}; +#else + return Mask128<T, N>{static_cast<__mmask8>(a.raw & b.raw)}; +#endif +} +template <typename T, size_t N> +HWY_INLINE Mask128<T, N> And(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a, + const Mask128<T, N> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask128<T, N>{_kand_mask8(a.raw, b.raw)}; +#else + return Mask128<T, N>{static_cast<__mmask8>(a.raw & b.raw)}; +#endif +} +template <typename T, size_t N> +HWY_INLINE Mask128<T, N> And(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a, + const Mask128<T, N> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask128<T, N>{_kand_mask8(a.raw, b.raw)}; +#else + return Mask128<T, N>{static_cast<__mmask8>(a.raw & b.raw)}; +#endif +} + +template <typename T, size_t N> +HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a, + const Mask128<T, N> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask128<T, N>{_kandn_mask16(a.raw, b.raw)}; +#else + return Mask128<T, N>{static_cast<__mmask16>(~a.raw & b.raw)}; +#endif +} +template <typename T, size_t N> +HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a, + const Mask128<T, N> b) 
{ +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask128<T, N>{_kandn_mask8(a.raw, b.raw)}; +#else + return Mask128<T, N>{static_cast<__mmask8>(~a.raw & b.raw)}; +#endif +} +template <typename T, size_t N> +HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a, + const Mask128<T, N> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask128<T, N>{_kandn_mask8(a.raw, b.raw)}; +#else + return Mask128<T, N>{static_cast<__mmask8>(~a.raw & b.raw)}; +#endif +} +template <typename T, size_t N> +HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a, + const Mask128<T, N> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask128<T, N>{_kandn_mask8(a.raw, b.raw)}; +#else + return Mask128<T, N>{static_cast<__mmask8>(~a.raw & b.raw)}; +#endif +} + +template <typename T, size_t N> +HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a, + const Mask128<T, N> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask128<T, N>{_kor_mask16(a.raw, b.raw)}; +#else + return Mask128<T, N>{static_cast<__mmask16>(a.raw | b.raw)}; +#endif +} +template <typename T, size_t N> +HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a, + const Mask128<T, N> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask128<T, N>{_kor_mask8(a.raw, b.raw)}; +#else + return Mask128<T, N>{static_cast<__mmask8>(a.raw | b.raw)}; +#endif +} +template <typename T, size_t N> +HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a, + const Mask128<T, N> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask128<T, N>{_kor_mask8(a.raw, b.raw)}; +#else + return Mask128<T, N>{static_cast<__mmask8>(a.raw | b.raw)}; +#endif +} +template <typename T, size_t N> +HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a, + const Mask128<T, N> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask128<T, N>{_kor_mask8(a.raw, b.raw)}; +#else + return Mask128<T, N>{static_cast<__mmask8>(a.raw 
| b.raw)}; +#endif +} + +template <typename T, size_t N> +HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a, + const Mask128<T, N> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask128<T, N>{_kxor_mask16(a.raw, b.raw)}; +#else + return Mask128<T, N>{static_cast<__mmask16>(a.raw ^ b.raw)}; +#endif +} +template <typename T, size_t N> +HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a, + const Mask128<T, N> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask128<T, N>{_kxor_mask8(a.raw, b.raw)}; +#else + return Mask128<T, N>{static_cast<__mmask8>(a.raw ^ b.raw)}; +#endif +} +template <typename T, size_t N> +HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a, + const Mask128<T, N> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask128<T, N>{_kxor_mask8(a.raw, b.raw)}; +#else + return Mask128<T, N>{static_cast<__mmask8>(a.raw ^ b.raw)}; +#endif +} +template <typename T, size_t N> +HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a, + const Mask128<T, N> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask128<T, N>{_kxor_mask8(a.raw, b.raw)}; +#else + return Mask128<T, N>{static_cast<__mmask8>(a.raw ^ b.raw)}; +#endif +} + +template <typename T, size_t N> +HWY_INLINE Mask128<T, N> ExclusiveNeither(hwy::SizeTag<1> /*tag*/, + const Mask128<T, N> a, + const Mask128<T, N> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask128<T, N>{_kxnor_mask16(a.raw, b.raw)}; +#else + return Mask128<T, N>{static_cast<__mmask16>(~(a.raw ^ b.raw) & 0xFFFF)}; +#endif +} +template <typename T, size_t N> +HWY_INLINE Mask128<T, N> ExclusiveNeither(hwy::SizeTag<2> /*tag*/, + const Mask128<T, N> a, + const Mask128<T, N> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask128<T, N>{_kxnor_mask8(a.raw, b.raw)}; +#else + return Mask128<T, N>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xFF)}; +#endif +} +template <typename T, size_t N> +HWY_INLINE Mask128<T, N> 
ExclusiveNeither(hwy::SizeTag<4> /*tag*/, + const Mask128<T, N> a, + const Mask128<T, N> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask128<T, N>{static_cast<__mmask8>(_kxnor_mask8(a.raw, b.raw) & 0xF)}; +#else + return Mask128<T, N>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xF)}; +#endif +} +template <typename T, size_t N> +HWY_INLINE Mask128<T, N> ExclusiveNeither(hwy::SizeTag<8> /*tag*/, + const Mask128<T, N> a, + const Mask128<T, N> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask128<T, N>{static_cast<__mmask8>(_kxnor_mask8(a.raw, b.raw) & 0x3)}; +#else + return Mask128<T, N>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0x3)}; +#endif +} + +} // namespace detail + +template <typename T, size_t N> +HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) { + return detail::And(hwy::SizeTag<sizeof(T)>(), a, b); +} + +template <typename T, size_t N> +HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) { + return detail::AndNot(hwy::SizeTag<sizeof(T)>(), a, b); +} + +template <typename T, size_t N> +HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) { + return detail::Or(hwy::SizeTag<sizeof(T)>(), a, b); +} + +template <typename T, size_t N> +HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) { + return detail::Xor(hwy::SizeTag<sizeof(T)>(), a, b); +} + +template <typename T, size_t N> +HWY_API Mask128<T, N> Not(const Mask128<T, N> m) { + // Flip only the valid bits. + // TODO(janwas): use _knot intrinsics if N >= 8. + return Xor(m, Mask128<T, N>::FromBits((1ull << N) - 1)); +} + +template <typename T, size_t N> +HWY_API Mask128<T, N> ExclusiveNeither(const Mask128<T, N> a, Mask128<T, N> b) { + return detail::ExclusiveNeither(hwy::SizeTag<sizeof(T)>(), a, b); +} + +#else // AVX2 or below + +// ------------------------------ Mask + +// Mask and Vec are the same (true = FF..FF). 
+template <typename T, size_t N> +HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) { + return Mask128<T, N>{v.raw}; +} + +template <class D> +using MFromD = decltype(MaskFromVec(VFromD<D>())); + +template <typename T, size_t N> +HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) { + return Vec128<T, N>{v.raw}; +} + +template <class D> +HWY_API VFromD<D> VecFromMask(D /* tag */, MFromD<D> v) { + return VFromD<D>{v.raw}; +} + +#if HWY_TARGET >= HWY_SSSE3 + +// mask ? yes : no +template <typename T, size_t N> +HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes, + Vec128<T, N> no) { + const auto vmask = VecFromMask(DFromV<decltype(no)>(), mask); + return Or(And(vmask, yes), AndNot(vmask, no)); +} + +#else // HWY_TARGET < HWY_SSSE3 + +// mask ? yes : no +template <typename T, size_t N> +HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes, + Vec128<T, N> no) { + return Vec128<T, N>{_mm_blendv_epi8(no.raw, yes.raw, mask.raw)}; +} +template <size_t N> +HWY_API Vec128<float, N> IfThenElse(const Mask128<float, N> mask, + const Vec128<float, N> yes, + const Vec128<float, N> no) { + return Vec128<float, N>{_mm_blendv_ps(no.raw, yes.raw, mask.raw)}; +} +template <size_t N> +HWY_API Vec128<double, N> IfThenElse(const Mask128<double, N> mask, + const Vec128<double, N> yes, + const Vec128<double, N> no) { + return Vec128<double, N>{_mm_blendv_pd(no.raw, yes.raw, mask.raw)}; +} + +#endif // HWY_TARGET >= HWY_SSSE3 + +// mask ? yes : 0 +template <typename T, size_t N> +HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) { + return yes & VecFromMask(DFromV<decltype(yes)>(), mask); +} + +// mask ? 
0 : no +template <typename T, size_t N> +HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) { + return AndNot(VecFromMask(DFromV<decltype(no)>(), mask), no); +} + +// ------------------------------ Mask logical + +template <typename T, size_t N> +HWY_API Mask128<T, N> Not(const Mask128<T, N> m) { + const Simd<T, N, 0> d; + return MaskFromVec(Not(VecFromMask(d, m))); +} + +template <typename T, size_t N> +HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) { + const Simd<T, N, 0> d; + return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b))); +} + +template <typename T, size_t N> +HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) { + const Simd<T, N, 0> d; + return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b))); +} + +template <typename T, size_t N> +HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) { + const Simd<T, N, 0> d; + return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b))); +} + +template <typename T, size_t N> +HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) { + const Simd<T, N, 0> d; + return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b))); +} + +template <typename T, size_t N> +HWY_API Mask128<T, N> ExclusiveNeither(const Mask128<T, N> a, Mask128<T, N> b) { + const Simd<T, N, 0> d; + return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b)))); +} + +#endif // HWY_TARGET <= HWY_AVX3 + +// ------------------------------ ShiftLeft + +template <int kBits, size_t N> +HWY_API Vec128<uint16_t, N> ShiftLeft(const Vec128<uint16_t, N> v) { + return Vec128<uint16_t, N>{_mm_slli_epi16(v.raw, kBits)}; +} + +template <int kBits, size_t N> +HWY_API Vec128<uint32_t, N> ShiftLeft(const Vec128<uint32_t, N> v) { + return Vec128<uint32_t, N>{_mm_slli_epi32(v.raw, kBits)}; +} + +template <int kBits, size_t N> +HWY_API Vec128<uint64_t, N> ShiftLeft(const Vec128<uint64_t, N> v) { + return Vec128<uint64_t, N>{_mm_slli_epi64(v.raw, kBits)}; +} + +template 
<int kBits, size_t N> +HWY_API Vec128<int16_t, N> ShiftLeft(const Vec128<int16_t, N> v) { + return Vec128<int16_t, N>{_mm_slli_epi16(v.raw, kBits)}; +} +template <int kBits, size_t N> +HWY_API Vec128<int32_t, N> ShiftLeft(const Vec128<int32_t, N> v) { + return Vec128<int32_t, N>{_mm_slli_epi32(v.raw, kBits)}; +} +template <int kBits, size_t N> +HWY_API Vec128<int64_t, N> ShiftLeft(const Vec128<int64_t, N> v) { + return Vec128<int64_t, N>{_mm_slli_epi64(v.raw, kBits)}; +} + +template <int kBits, typename T, size_t N, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec128<T, N> ShiftLeft(const Vec128<T, N> v) { + const DFromV<decltype(v)> d8; + // Use raw instead of BitCast to support N=1. + const Vec128<T, N> shifted{ShiftLeft<kBits>(Vec128<MakeWide<T>>{v.raw}).raw}; + return kBits == 1 + ? (v + v) + : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF))); +} + +// ------------------------------ ShiftRight + +template <int kBits, size_t N> +HWY_API Vec128<uint16_t, N> ShiftRight(const Vec128<uint16_t, N> v) { + return Vec128<uint16_t, N>{_mm_srli_epi16(v.raw, kBits)}; +} +template <int kBits, size_t N> +HWY_API Vec128<uint32_t, N> ShiftRight(const Vec128<uint32_t, N> v) { + return Vec128<uint32_t, N>{_mm_srli_epi32(v.raw, kBits)}; +} +template <int kBits, size_t N> +HWY_API Vec128<uint64_t, N> ShiftRight(const Vec128<uint64_t, N> v) { + return Vec128<uint64_t, N>{_mm_srli_epi64(v.raw, kBits)}; +} + +template <int kBits, size_t N> +HWY_API Vec128<uint8_t, N> ShiftRight(const Vec128<uint8_t, N> v) { + const DFromV<decltype(v)> d8; + // Use raw instead of BitCast to support N=1. 
+ const Vec128<uint8_t, N> shifted{ + ShiftRight<kBits>(Vec128<uint16_t>{v.raw}).raw}; + return shifted & Set(d8, 0xFF >> kBits); +} + +template <int kBits, size_t N> +HWY_API Vec128<int16_t, N> ShiftRight(const Vec128<int16_t, N> v) { + return Vec128<int16_t, N>{_mm_srai_epi16(v.raw, kBits)}; +} +template <int kBits, size_t N> +HWY_API Vec128<int32_t, N> ShiftRight(const Vec128<int32_t, N> v) { + return Vec128<int32_t, N>{_mm_srai_epi32(v.raw, kBits)}; +} + +template <int kBits, size_t N> +HWY_API Vec128<int8_t, N> ShiftRight(const Vec128<int8_t, N> v) { + const DFromV<decltype(v)> di; + const RebindToUnsigned<decltype(di)> du; + const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v))); + const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits)); + return (shifted ^ shifted_sign) - shifted_sign; +} + +// i64 is implemented after BroadcastSignBit. + +// ================================================== MEMORY (1) + +// Clang static analysis claims the memory immediately after a partial vector +// store is uninitialized, and also flags the input to partial loads (at least +// for loadl_pd) as "garbage". This is a false alarm because msan does not +// raise errors. We work around this by using CopyBytes instead of intrinsics, +// but only for the analyzer to avoid potentially bad code generation. +// Unfortunately __clang_analyzer__ was not defined for clang-tidy prior to v7. 
+#ifndef HWY_SAFE_PARTIAL_LOAD_STORE +#if defined(__clang_analyzer__) || \ + (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700) +#define HWY_SAFE_PARTIAL_LOAD_STORE 1 +#else +#define HWY_SAFE_PARTIAL_LOAD_STORE 0 +#endif +#endif // HWY_SAFE_PARTIAL_LOAD_STORE + +// ------------------------------ Load + +template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>> +HWY_API Vec128<T> Load(D /* tag */, const T* HWY_RESTRICT aligned) { + return Vec128<T>{_mm_load_si128(reinterpret_cast<const __m128i*>(aligned))}; +} +template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)> +HWY_API Vec128<float> Load(D /* tag */, const float* HWY_RESTRICT aligned) { + return Vec128<float>{_mm_load_ps(aligned)}; +} +template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)> +HWY_API Vec128<double> Load(D /* tag */, const double* HWY_RESTRICT aligned) { + return Vec128<double>{_mm_load_pd(aligned)}; +} + +template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>> +HWY_API Vec128<T> LoadU(D /* tag */, const T* HWY_RESTRICT p) { + return Vec128<T>{_mm_loadu_si128(reinterpret_cast<const __m128i*>(p))}; +} +template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)> +HWY_API Vec128<float> LoadU(D /* tag */, const float* HWY_RESTRICT p) { + return Vec128<float>{_mm_loadu_ps(p)}; +} +template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)> +HWY_API Vec128<double> LoadU(D /* tag */, const double* HWY_RESTRICT p) { + return Vec128<double>{_mm_loadu_pd(p)}; +} + +template <class D, HWY_IF_V_SIZE_D(D, 8), typename T = TFromD<D>> +HWY_API Vec64<T> Load(D /* tag */, const T* HWY_RESTRICT p) { +#if HWY_SAFE_PARTIAL_LOAD_STORE + __m128i v = _mm_setzero_si128(); + CopyBytes<8>(p, &v); // not same size + return Vec64<T>{v}; +#else + return Vec64<T>{_mm_loadl_epi64(reinterpret_cast<const __m128i*>(p))}; +#endif +} + +template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)> +HWY_API Vec64<float> Load(D /* tag */, const float* HWY_RESTRICT p) { +#if HWY_SAFE_PARTIAL_LOAD_STORE + 
__m128 v = _mm_setzero_ps(); + CopyBytes<8>(p, &v); // not same size + return Vec64<float>{v}; +#else + const __m128 hi = _mm_setzero_ps(); + return Vec64<float>{_mm_loadl_pi(hi, reinterpret_cast<const __m64*>(p))}; +#endif +} + +template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F64_D(D)> +HWY_API Vec64<double> Load(D /* tag */, const double* HWY_RESTRICT p) { +#if HWY_SAFE_PARTIAL_LOAD_STORE + __m128d v = _mm_setzero_pd(); + CopyBytes<8>(p, &v); // not same size + return Vec64<double>{v}; +#else + return Vec64<double>{_mm_load_sd(p)}; +#endif +} + +template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)> +HWY_API Vec32<float> Load(D /* tag */, const float* HWY_RESTRICT p) { +#if HWY_SAFE_PARTIAL_LOAD_STORE + __m128 v = _mm_setzero_ps(); + CopyBytes<4>(p, &v); // not same size + return Vec32<float>{v}; +#else + return Vec32<float>{_mm_load_ss(p)}; +#endif +} + +// Any <= 32 bit except <float, 1> +template <class D, HWY_IF_V_SIZE_LE_D(D, 4), typename T = TFromD<D>> +HWY_API VFromD<D> Load(D d, const T* HWY_RESTRICT p) { + // Clang ArgumentPromotionPass seems to break this code. We can unpoison + // before SetTableIndices -> LoadU -> Load and the memory is poisoned again. + detail::MaybeUnpoison(p, Lanes(d)); + +#if HWY_SAFE_PARTIAL_LOAD_STORE + Vec128<T> v = Zero(Full128<T>()); + CopyBytes<d.MaxBytes()>(p, &v.raw); // not same size as VFromD + return VFromD<D>{v.raw}; +#else + int32_t bits = 0; + CopyBytes<d.MaxBytes()>(p, &bits); // not same size as VFromD + return VFromD<D>{_mm_cvtsi32_si128(bits)}; +#endif +} + +// For < 128 bit, LoadU == Load. +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T = TFromD<D>> +HWY_API VFromD<D> LoadU(D d, const T* HWY_RESTRICT p) { + return Load(d, p); +} + +// 128-bit SIMD => nothing to duplicate, same as an unaligned load. 
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), typename T = TFromD<D>> +HWY_API VFromD<D> LoadDup128(D d, const T* HWY_RESTRICT p) { + return LoadU(d, p); +} + +// ------------------------------ Store + +template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>> +HWY_API void Store(Vec128<T> v, D /* tag */, T* HWY_RESTRICT aligned) { + _mm_store_si128(reinterpret_cast<__m128i*>(aligned), v.raw); +} +template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)> +HWY_API void Store(Vec128<float> v, D /* tag */, float* HWY_RESTRICT aligned) { + _mm_store_ps(aligned, v.raw); +} +template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)> +HWY_API void Store(Vec128<double> v, D /* tag */, + double* HWY_RESTRICT aligned) { + _mm_store_pd(aligned, v.raw); +} + +template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>> +HWY_API void StoreU(Vec128<T> v, D /* tag */, T* HWY_RESTRICT p) { + _mm_storeu_si128(reinterpret_cast<__m128i*>(p), v.raw); +} +template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)> +HWY_API void StoreU(Vec128<float> v, D /* tag */, float* HWY_RESTRICT p) { + _mm_storeu_ps(p, v.raw); +} +template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)> +HWY_API void StoreU(Vec128<double> v, D /* tag */, double* HWY_RESTRICT p) { + _mm_storeu_pd(p, v.raw); +} + +template <class D, HWY_IF_V_SIZE_D(D, 8), typename T = TFromD<D>> +HWY_API void Store(Vec64<T> v, D /* tag */, T* HWY_RESTRICT p) { +#if HWY_SAFE_PARTIAL_LOAD_STORE + CopyBytes<8>(&v, p); // not same size +#else + _mm_storel_epi64(reinterpret_cast<__m128i*>(p), v.raw); +#endif +} +template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)> +HWY_API void Store(Vec64<float> v, D /* tag */, float* HWY_RESTRICT p) { +#if HWY_SAFE_PARTIAL_LOAD_STORE + CopyBytes<8>(&v, p); // not same size +#else + _mm_storel_pi(reinterpret_cast<__m64*>(p), v.raw); +#endif +} +template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F64_D(D)> +HWY_API void Store(Vec64<double> v, D /* tag */, double* HWY_RESTRICT p) 
{ +#if HWY_SAFE_PARTIAL_LOAD_STORE + CopyBytes<8>(&v, p); // not same size +#else + _mm_storel_pd(p, v.raw); +#endif +} + +// Any <= 32 bit except <float, 1> +template <class D, HWY_IF_V_SIZE_LE_D(D, 4), typename T = TFromD<D>> +HWY_API void Store(VFromD<D> v, D d, T* HWY_RESTRICT p) { + CopyBytes<d.MaxBytes()>(&v, p); // not same size +} +template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)> +HWY_API void Store(Vec32<float> v, D /* tag */, float* HWY_RESTRICT p) { +#if HWY_SAFE_PARTIAL_LOAD_STORE + CopyBytes<4>(&v, p); // not same size +#else + _mm_store_ss(p, v.raw); +#endif +} + +// For < 128 bit, StoreU == Store. +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T = TFromD<D>> +HWY_API void StoreU(VFromD<D> v, D d, T* HWY_RESTRICT p) { + Store(v, d, p); +} + +// ================================================== SWIZZLE (1) + +// ------------------------------ TableLookupBytes +template <typename T, size_t N, typename TI, size_t NI> +HWY_API Vec128<TI, NI> TableLookupBytes(const Vec128<T, N> bytes, + const Vec128<TI, NI> from) { +#if HWY_TARGET == HWY_SSE2 +#if HWY_COMPILER_GCC_ACTUAL && HWY_HAS_BUILTIN(__builtin_shuffle) + typedef uint8_t GccU8RawVectType __attribute__((__vector_size__(16))); + return Vec128<TI, NI>{reinterpret_cast<typename detail::Raw128<TI>::type>( + __builtin_shuffle(reinterpret_cast<GccU8RawVectType>(bytes.raw), + reinterpret_cast<GccU8RawVectType>(from.raw)))}; +#else + const DFromV<decltype(from)> d; + const Repartition<uint8_t, decltype(d)> du8; + const Full128<uint8_t> du8_full; + + const DFromV<decltype(bytes)> d_bytes; + const Repartition<uint8_t, decltype(d_bytes)> du8_bytes; + + alignas(16) uint8_t result_bytes[16]; + alignas(16) uint8_t u8_bytes[16]; + alignas(16) uint8_t from_bytes[16]; + + Store(Vec128<uint8_t>{BitCast(du8_bytes, bytes).raw}, du8_full, u8_bytes); + Store(Vec128<uint8_t>{BitCast(du8, from).raw}, du8_full, from_bytes); + + for (int i = 0; i < 16; i++) { + result_bytes[i] = u8_bytes[from_bytes[i] & 15]; 
+ } + + return BitCast(d, VFromD<decltype(du8)>{Load(du8_full, result_bytes).raw}); +#endif +#else // SSSE3 or newer + return Vec128<TI, NI>{_mm_shuffle_epi8(bytes.raw, from.raw)}; +#endif +} + +// ------------------------------ TableLookupBytesOr0 +// For all vector widths; x86 anyway zeroes if >= 0x80 on SSSE3/SSE4/AVX2/AVX3 +template <class V, class VI> +HWY_API VI TableLookupBytesOr0(const V bytes, const VI from) { +#if HWY_TARGET == HWY_SSE2 + const DFromV<decltype(from)> d; + const Repartition<int8_t, decltype(d)> di8; + + const auto di8_from = BitCast(di8, from); + return BitCast(d, IfThenZeroElse(di8_from < Zero(di8), + TableLookupBytes(bytes, di8_from))); +#else + return TableLookupBytes(bytes, from); +#endif +} + +// ------------------------------ Shuffles (ShiftRight, TableLookupBytes) + +// Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant). +// Shuffle0321 rotates one lane to the right (the previous least-significant +// lane is now most-significant). These could also be implemented via +// CombineShiftRightBytes but the shuffle_abcd notation is more convenient. + +// Swap 32-bit halves in 64-bit halves. +template <typename T, size_t N> +HWY_API Vec128<T, N> Shuffle2301(const Vec128<T, N> v) { + static_assert(sizeof(T) == 4, "Only for 32-bit lanes"); + static_assert(N == 2 || N == 4, "Does not make sense for N=1"); + return Vec128<T, N>{_mm_shuffle_epi32(v.raw, 0xB1)}; +} +template <size_t N> +HWY_API Vec128<float, N> Shuffle2301(const Vec128<float, N> v) { + static_assert(N == 2 || N == 4, "Does not make sense for N=1"); + return Vec128<float, N>{_mm_shuffle_ps(v.raw, v.raw, 0xB1)}; +} + +// These are used by generic_ops-inl to implement LoadInterleaved3. As with +// Intel's shuffle* intrinsics and InterleaveLower, the lower half of the output +// comes from the first argument. 
+namespace detail { + +template <typename T, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec32<T> ShuffleTwo2301(const Vec32<T> a, const Vec32<T> b) { + const DFromV<decltype(a)> d; + const Twice<decltype(d)> d2; + const auto ba = Combine(d2, b, a); +#if HWY_TARGET == HWY_SSE2 + Vec32<uint16_t> ba_shuffled{ + _mm_shufflelo_epi16(ba.raw, _MM_SHUFFLE(3, 0, 3, 0))}; + return BitCast(d, Or(ShiftLeft<8>(ba_shuffled), ShiftRight<8>(ba_shuffled))); +#else + alignas(16) const T kShuffle[8] = {1, 0, 7, 6}; + return Vec32<T>{TableLookupBytes(ba, Load(d2, kShuffle)).raw}; +#endif +} +template <typename T, HWY_IF_T_SIZE(T, 2)> +HWY_API Vec64<T> ShuffleTwo2301(const Vec64<T> a, const Vec64<T> b) { + const DFromV<decltype(a)> d; + const Twice<decltype(d)> d2; + const auto ba = Combine(d2, b, a); +#if HWY_TARGET == HWY_SSE2 + Vec64<uint32_t> ba_shuffled{ + _mm_shuffle_epi32(ba.raw, _MM_SHUFFLE(3, 0, 3, 0))}; + return Vec64<T>{ + _mm_shufflelo_epi16(ba_shuffled.raw, _MM_SHUFFLE(2, 3, 0, 1))}; +#else + alignas(16) const T kShuffle[8] = {0x0302, 0x0100, 0x0f0e, 0x0d0c}; + return Vec64<T>{TableLookupBytes(ba, Load(d2, kShuffle)).raw}; +#endif +} +template <typename T, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec128<T> ShuffleTwo2301(const Vec128<T> a, const Vec128<T> b) { + const DFromV<decltype(a)> d; + const RebindToFloat<decltype(d)> df; + constexpr int m = _MM_SHUFFLE(2, 3, 0, 1); + return BitCast(d, Vec128<float>{_mm_shuffle_ps(BitCast(df, a).raw, + BitCast(df, b).raw, m)}); +} + +template <typename T, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec32<T> ShuffleTwo1230(const Vec32<T> a, const Vec32<T> b) { + const DFromV<decltype(a)> d; +#if HWY_TARGET == HWY_SSE2 + const auto zero = Zero(d); + const Rebind<int16_t, decltype(d)> di16; + const Vec32<int16_t> a_shuffled{_mm_shufflelo_epi16( + _mm_unpacklo_epi8(a.raw, zero.raw), _MM_SHUFFLE(3, 0, 3, 0))}; + const Vec32<int16_t> b_shuffled{_mm_shufflelo_epi16( + _mm_unpacklo_epi8(b.raw, zero.raw), _MM_SHUFFLE(1, 2, 1, 2))}; + const auto ba_shuffled = Combine(di16, 
b_shuffled, a_shuffled); + return Vec32<T>{_mm_packus_epi16(ba_shuffled.raw, ba_shuffled.raw)}; +#else + const Twice<decltype(d)> d2; + const auto ba = Combine(d2, b, a); + alignas(16) const T kShuffle[8] = {0, 3, 6, 5}; + return Vec32<T>{TableLookupBytes(ba, Load(d2, kShuffle)).raw}; +#endif +} +template <typename T, HWY_IF_T_SIZE(T, 2)> +HWY_API Vec64<T> ShuffleTwo1230(const Vec64<T> a, const Vec64<T> b) { + const DFromV<decltype(a)> d; +#if HWY_TARGET == HWY_SSE2 + const Vec32<T> a_shuffled{ + _mm_shufflelo_epi16(a.raw, _MM_SHUFFLE(3, 0, 3, 0))}; + const Vec32<T> b_shuffled{ + _mm_shufflelo_epi16(b.raw, _MM_SHUFFLE(1, 2, 1, 2))}; + return Combine(d, b_shuffled, a_shuffled); +#else + const Twice<decltype(d)> d2; + const auto ba = Combine(d2, b, a); + alignas(16) const T kShuffle[8] = {0x0100, 0x0706, 0x0d0c, 0x0b0a}; + return Vec64<T>{TableLookupBytes(ba, Load(d2, kShuffle)).raw}; +#endif +} +template <typename T, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec128<T> ShuffleTwo1230(const Vec128<T> a, const Vec128<T> b) { + const DFromV<decltype(a)> d; + const RebindToFloat<decltype(d)> df; + constexpr int m = _MM_SHUFFLE(1, 2, 3, 0); + return BitCast(d, Vec128<float>{_mm_shuffle_ps(BitCast(df, a).raw, + BitCast(df, b).raw, m)}); +} + +template <typename T, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec32<T> ShuffleTwo3012(const Vec32<T> a, const Vec32<T> b) { + const DFromV<decltype(a)> d; +#if HWY_TARGET == HWY_SSE2 + const auto zero = Zero(d); + const Rebind<int16_t, decltype(d)> di16; + const Vec32<int16_t> a_shuffled{_mm_shufflelo_epi16( + _mm_unpacklo_epi8(a.raw, zero.raw), _MM_SHUFFLE(1, 2, 1, 2))}; + const Vec32<int16_t> b_shuffled{_mm_shufflelo_epi16( + _mm_unpacklo_epi8(b.raw, zero.raw), _MM_SHUFFLE(3, 0, 3, 0))}; + const auto ba_shuffled = Combine(di16, b_shuffled, a_shuffled); + return Vec32<T>{_mm_packus_epi16(ba_shuffled.raw, ba_shuffled.raw)}; +#else + const Twice<decltype(d)> d2; + const auto ba = Combine(d2, b, a); + alignas(16) const T kShuffle[8] = {2, 1, 4, 7}; + return 
Vec32<T>{TableLookupBytes(ba, Load(d2, kShuffle)).raw}; +#endif +} +template <typename T, HWY_IF_T_SIZE(T, 2)> +HWY_API Vec64<T> ShuffleTwo3012(const Vec64<T> a, const Vec64<T> b) { + const DFromV<decltype(a)> d; +#if HWY_TARGET == HWY_SSE2 + const Vec32<T> a_shuffled{ + _mm_shufflelo_epi16(a.raw, _MM_SHUFFLE(1, 2, 1, 2))}; + const Vec32<T> b_shuffled{ + _mm_shufflelo_epi16(b.raw, _MM_SHUFFLE(3, 0, 3, 0))}; + return Combine(d, b_shuffled, a_shuffled); +#else + const Twice<decltype(d)> d2; + const auto ba = Combine(d2, b, a); + alignas(16) const T kShuffle[8] = {0x0504, 0x0302, 0x0908, 0x0f0e}; + return Vec64<T>{TableLookupBytes(ba, Load(d2, kShuffle)).raw}; +#endif +} +template <typename T, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec128<T> ShuffleTwo3012(const Vec128<T> a, const Vec128<T> b) { + const DFromV<decltype(a)> d; + const RebindToFloat<decltype(d)> df; + constexpr int m = _MM_SHUFFLE(3, 0, 1, 2); + return BitCast(d, Vec128<float>{_mm_shuffle_ps(BitCast(df, a).raw, + BitCast(df, b).raw, m)}); +} + +} // namespace detail + +// Swap 64-bit halves +HWY_API Vec128<uint32_t> Shuffle1032(const Vec128<uint32_t> v) { + return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x4E)}; +} +HWY_API Vec128<int32_t> Shuffle1032(const Vec128<int32_t> v) { + return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x4E)}; +} +HWY_API Vec128<float> Shuffle1032(const Vec128<float> v) { + return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x4E)}; +} +HWY_API Vec128<uint64_t> Shuffle01(const Vec128<uint64_t> v) { + return Vec128<uint64_t>{_mm_shuffle_epi32(v.raw, 0x4E)}; +} +HWY_API Vec128<int64_t> Shuffle01(const Vec128<int64_t> v) { + return Vec128<int64_t>{_mm_shuffle_epi32(v.raw, 0x4E)}; +} +HWY_API Vec128<double> Shuffle01(const Vec128<double> v) { + return Vec128<double>{_mm_shuffle_pd(v.raw, v.raw, 1)}; +} + +// Rotate right 32 bits +HWY_API Vec128<uint32_t> Shuffle0321(const Vec128<uint32_t> v) { + return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x39)}; +} +HWY_API Vec128<int32_t> 
Shuffle0321(const Vec128<int32_t> v) { + return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x39)}; +} +HWY_API Vec128<float> Shuffle0321(const Vec128<float> v) { + return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x39)}; +} +// Rotate left 32 bits +HWY_API Vec128<uint32_t> Shuffle2103(const Vec128<uint32_t> v) { + return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x93)}; +} +HWY_API Vec128<int32_t> Shuffle2103(const Vec128<int32_t> v) { + return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x93)}; +} +HWY_API Vec128<float> Shuffle2103(const Vec128<float> v) { + return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x93)}; +} + +// Reverse +HWY_API Vec128<uint32_t> Shuffle0123(const Vec128<uint32_t> v) { + return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x1B)}; +} +HWY_API Vec128<int32_t> Shuffle0123(const Vec128<int32_t> v) { + return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x1B)}; +} +HWY_API Vec128<float> Shuffle0123(const Vec128<float> v) { + return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x1B)}; +} + +// ================================================== COMPARE + +#if HWY_TARGET <= HWY_AVX3 + +// Comparisons set a mask bit to 1 if the condition is true, else 0. 
+ +// ------------------------------ MaskFromVec + +namespace detail { + +template <typename T, size_t N> +HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<1> /*tag*/, + const Vec128<T, N> v) { + return Mask128<T, N>{_mm_movepi8_mask(v.raw)}; +} +template <typename T, size_t N> +HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<2> /*tag*/, + const Vec128<T, N> v) { + return Mask128<T, N>{_mm_movepi16_mask(v.raw)}; +} +template <typename T, size_t N> +HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<4> /*tag*/, + const Vec128<T, N> v) { + return Mask128<T, N>{_mm_movepi32_mask(v.raw)}; +} +template <typename T, size_t N> +HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<8> /*tag*/, + const Vec128<T, N> v) { + return Mask128<T, N>{_mm_movepi64_mask(v.raw)}; +} + +} // namespace detail + +template <typename T, size_t N> +HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) { + return detail::MaskFromVec(hwy::SizeTag<sizeof(T)>(), v); +} +// There do not seem to be native floating-point versions of these instructions. 
// Float: reinterpret the lanes as same-width signed integers and reuse the
// integer MaskFromVec.
template <size_t N>
HWY_API Mask128<float, N> MaskFromVec(const Vec128<float, N> v) {
  const RebindToSigned<DFromV<decltype(v)>> di;
  return Mask128<float, N>{MaskFromVec(BitCast(di, v)).raw};
}
// Double: same approach as float.
template <size_t N>
HWY_API Mask128<double, N> MaskFromVec(const Vec128<double, N> v) {
  const RebindToSigned<DFromV<decltype(v)>> di;
  return Mask128<double, N>{MaskFromVec(BitCast(di, v)).raw};
}

// Mask type for descriptor D: whatever MaskFromVec returns for VFromD<D>.
template <class D>
using MFromD = decltype(MaskFromVec(VFromD<D>()));

// ------------------------------ VecFromMask

// Expands a mask into a vector via the _mm_movm_epi* intrinsic matching the
// lane size. One overload per lane width, selected by HWY_IF_T_SIZE.
template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
  return Vec128<T, N>{_mm_movm_epi8(v.raw)};
}

template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
  return Vec128<T, N>{_mm_movm_epi16(v.raw)};
}

template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
  return Vec128<T, N>{_mm_movm_epi32(v.raw)};
}

template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
  return Vec128<T, N>{_mm_movm_epi64(v.raw)};
}

// Float/double: build the integer vector, then cast the register type.
template <size_t N>
HWY_API Vec128<float, N> VecFromMask(const Mask128<float, N> v) {
  return Vec128<float, N>{_mm_castsi128_ps(_mm_movm_epi32(v.raw))};
}

template <size_t N>
HWY_API Vec128<double, N> VecFromMask(const Mask128<double, N> v) {
  return Vec128<double, N>{_mm_castsi128_pd(_mm_movm_epi64(v.raw))};
}

// Descriptor-tagged convenience overload; the tag is only used for deduction.
template <class D>
HWY_API VFromD<D> VecFromMask(D /* tag */, MFromD<D> v) {
  return VecFromMask(v);
}

// ------------------------------ RebindMask (MaskFromVec)

// Reinterprets a mask for another lane type of the same size by copying the
// raw mask value unchanged.
template <typename TFrom, size_t NFrom, class DTo>
HWY_API MFromD<DTo> RebindMask(DTo /* tag */, Mask128<TFrom, NFrom> m) {
  static_assert(sizeof(TFrom) == sizeof(TFromD<DTo>), "Must have same size");
  return MFromD<DTo>{m.raw};
}

// ------------------------------ TestBit

namespace detail {

// One overload per lane width, each calling the matching _mm_test_epi*_mask.
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> TestBit(hwy::SizeTag<1> /*tag*/, const Vec128<T, N> v,
                                 const Vec128<T, N> bit) {
  return Mask128<T, N>{_mm_test_epi8_mask(v.raw, bit.raw)};
}
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> TestBit(hwy::SizeTag<2> /*tag*/, const Vec128<T, N> v,
                                 const Vec128<T, N> bit) {
  return Mask128<T, N>{_mm_test_epi16_mask(v.raw, bit.raw)};
}
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> TestBit(hwy::SizeTag<4> /*tag*/, const Vec128<T, N> v,
                                 const Vec128<T, N> bit) {
  return Mask128<T, N>{_mm_test_epi32_mask(v.raw, bit.raw)};
}
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> TestBit(hwy::SizeTag<8> /*tag*/, const Vec128<T, N> v,
                                 const Vec128<T, N> bit) {
  return Mask128<T, N>{_mm_test_epi64_mask(v.raw, bit.raw)};
}

}  // namespace detail

// Integer-only (enforced by the static_assert); dispatches on lane size.
template <typename T, size_t N>
HWY_API Mask128<T, N> TestBit(const Vec128<T, N> v, const Vec128<T, N> bit) {
  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
  return detail::TestBit(hwy::SizeTag<sizeof(T)>(), v, bit);
}

// ------------------------------ Equality

// Integer lanes: one overload per lane size, returning the mask produced by
// the corresponding _mm_cmpeq_epi*_mask intrinsic.
template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Mask128<T, N>{_mm_cmpeq_epi8_mask(a.raw, b.raw)};
}

template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Mask128<T, N>{_mm_cmpeq_epi16_mask(a.raw, b.raw)};
}

template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Mask128<T, N>{_mm_cmpeq_epi32_mask(a.raw, b.raw)};
}

template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Mask128<T, N>{_mm_cmpeq_epi64_mask(a.raw, b.raw)};
}

// Float/double use the _CMP_EQ_OQ predicate.
template <size_t N>
HWY_API Mask128<float, N> operator==(Vec128<float, N> a, Vec128<float, N> b) {
  return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_EQ_OQ)};
}

template <size_t N>
HWY_API Mask128<double, N> operator==(Vec128<double, N> a,
                                      Vec128<double, N> b) {
  return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_EQ_OQ)};
}

// ------------------------------ Inequality

// Integer lanes: one overload per lane size via _mm_cmpneq_epi*_mask.
template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Mask128<T, N>{_mm_cmpneq_epi8_mask(a.raw, b.raw)};
}

template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Mask128<T, N>{_mm_cmpneq_epi16_mask(a.raw, b.raw)};
}

template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Mask128<T, N>{_mm_cmpneq_epi32_mask(a.raw, b.raw)};
}

template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Mask128<T, N>{_mm_cmpneq_epi64_mask(a.raw, b.raw)};
}

// NOTE(review): _CMP_NEQ_OQ is an *ordered* predicate, so a lane holding NaN
// presumably yields false here, whereas the non-AVX3 path uses
// _mm_cmpneq_ps/pd - confirm the cross-target NaN behavior is intended.
template <size_t N>
HWY_API Mask128<float, N> operator!=(Vec128<float, N> a, Vec128<float, N> b) {
  return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
}

template <size_t N>
HWY_API Mask128<double, N> operator!=(Vec128<double, N> a,
                                      Vec128<double, N> b) {
  return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
}

// ------------------------------ Strict inequality

// Signed >
template <size_t N>
HWY_API Mask128<int8_t, N> operator>(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
  return Mask128<int8_t, N>{_mm_cmpgt_epi8_mask(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int16_t, N> operator>(Vec128<int16_t, N> a,
                                      Vec128<int16_t, N> b) {
  return Mask128<int16_t, N>{_mm_cmpgt_epi16_mask(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int32_t, N> operator>(Vec128<int32_t, N> a,
                                      Vec128<int32_t, N> b) {
  return Mask128<int32_t, N>{_mm_cmpgt_epi32_mask(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int64_t, N> operator>(Vec128<int64_t, N> a,
                                      Vec128<int64_t, N> b) {
  return Mask128<int64_t, N>{_mm_cmpgt_epi64_mask(a.raw, b.raw)};
}

// Unsigned > (epu variants of the compare-to-mask intrinsics)
template <size_t N>
HWY_API Mask128<uint8_t, N> operator>(Vec128<uint8_t, N> a,
                                      Vec128<uint8_t, N> b) {
  return Mask128<uint8_t, N>{_mm_cmpgt_epu8_mask(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint16_t, N> operator>(Vec128<uint16_t, N> a,
                                       Vec128<uint16_t, N> b) {
  return Mask128<uint16_t, N>{_mm_cmpgt_epu16_mask(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint32_t, N> operator>(Vec128<uint32_t, N> a,
                                       Vec128<uint32_t, N> b) {
  return Mask128<uint32_t, N>{_mm_cmpgt_epu32_mask(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint64_t, N> operator>(Vec128<uint64_t, N> a,
                                       Vec128<uint64_t, N> b) {
  return Mask128<uint64_t, N>{_mm_cmpgt_epu64_mask(a.raw, b.raw)};
}

// Float/double > use the _CMP_GT_OQ predicate.
template <size_t N>
HWY_API Mask128<float, N> operator>(Vec128<float, N> a, Vec128<float, N> b) {
  return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_GT_OQ)};
}
template <size_t N>
HWY_API Mask128<double, N> operator>(Vec128<double, N> a, Vec128<double, N> b) {
  return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_GT_OQ)};
}

// ------------------------------ Weak inequality

// Float/double >= use the _CMP_GE_OQ predicate.
template <size_t N>
HWY_API Mask128<float, N> operator>=(Vec128<float, N> a, Vec128<float, N> b) {
  return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_GE_OQ)};
}
template <size_t N>
HWY_API Mask128<double, N> operator>=(Vec128<double, N> a,
                                      Vec128<double, N> b) {
  return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_GE_OQ)};
}

// Signed >=
template <size_t N>
HWY_API Mask128<int8_t, N> operator>=(Vec128<int8_t, N> a,
                                      Vec128<int8_t, N> b) {
  return Mask128<int8_t, N>{_mm_cmpge_epi8_mask(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int16_t, N> operator>=(Vec128<int16_t, N> a,
                                       Vec128<int16_t, N> b) {
  return Mask128<int16_t, N>{_mm_cmpge_epi16_mask(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int32_t, N> operator>=(Vec128<int32_t, N> a,
                                       Vec128<int32_t, N> b) {
  return Mask128<int32_t, N>{_mm_cmpge_epi32_mask(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int64_t, N> operator>=(Vec128<int64_t, N> a,
                                       Vec128<int64_t, N> b) {
  return Mask128<int64_t, N>{_mm_cmpge_epi64_mask(a.raw, b.raw)};
}

// Unsigned >=
template <size_t N>
HWY_API Mask128<uint8_t, N> operator>=(Vec128<uint8_t, N> a,
                                       Vec128<uint8_t, N> b) {
  return Mask128<uint8_t, N>{_mm_cmpge_epu8_mask(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint16_t, N> operator>=(Vec128<uint16_t, N> a,
                                        Vec128<uint16_t, N> b) {
  return Mask128<uint16_t, N>{_mm_cmpge_epu16_mask(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint32_t, N> operator>=(Vec128<uint32_t, N> a,
                                        Vec128<uint32_t, N> b) {
  return Mask128<uint32_t, N>{_mm_cmpge_epu32_mask(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint64_t, N> operator>=(Vec128<uint64_t, N> a,
                                        Vec128<uint64_t, N> b) {
  return Mask128<uint64_t, N>{_mm_cmpge_epu64_mask(a.raw, b.raw)};
}

#else  // AVX2 or below

// Comparisons fill a lane with 1-bits if the condition is true, else 0.
// Reinterprets a mask for another lane type of the same size: round-trips
// through the vector representation (VecFromMask -> BitCast -> MaskFromVec).
template <class DTo, typename TFrom, size_t NFrom>
HWY_API MFromD<DTo> RebindMask(DTo dto, Mask128<TFrom, NFrom> m) {
  static_assert(sizeof(TFrom) == sizeof(TFromD<DTo>), "Must have same size");
  const Simd<TFrom, NFrom, 0> d;
  return MaskFromVec(BitCast(dto, VecFromMask(d, m)));
}

// Integer-only (enforced by the static_assert). A lane is true when every bit
// set in `bit` is also set in `v`.
template <typename T, size_t N>
HWY_API Mask128<T, N> TestBit(Vec128<T, N> v, Vec128<T, N> bit) {
  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
  return (v & bit) == bit;
}

// ------------------------------ Equality

// Unsigned
template <size_t N>
HWY_API Mask128<uint8_t, N> operator==(const Vec128<uint8_t, N> a,
                                       const Vec128<uint8_t, N> b) {
  return Mask128<uint8_t, N>{_mm_cmpeq_epi8(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint16_t, N> operator==(const Vec128<uint16_t, N> a,
                                        const Vec128<uint16_t, N> b) {
  return Mask128<uint16_t, N>{_mm_cmpeq_epi16(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint32_t, N> operator==(const Vec128<uint32_t, N> a,
                                        const Vec128<uint32_t, N> b) {
  return Mask128<uint32_t, N>{_mm_cmpeq_epi32(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint64_t, N> operator==(const Vec128<uint64_t, N> a,
                                        const Vec128<uint64_t, N> b) {
#if HWY_TARGET >= HWY_SSSE3
  // This branch avoids _mm_cmpeq_epi64: compare the 32-bit halves, then AND
  // each half's result with its neighbor (Shuffle2301) so a 64-bit lane is
  // true only if both halves matched.
  const DFromV<decltype(a)> d64;
  const RepartitionToNarrow<decltype(d64)> d32;
  const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b)));
  const auto cmp64 = cmp32 & Shuffle2301(cmp32);
  return MaskFromVec(BitCast(d64, cmp64));
#else
  return Mask128<uint64_t, N>{_mm_cmpeq_epi64(a.raw, b.raw)};
#endif
}

// Signed
template <size_t N>
HWY_API Mask128<int8_t, N> operator==(const Vec128<int8_t, N> a,
                                      const Vec128<int8_t, N> b) {
  return Mask128<int8_t, N>{_mm_cmpeq_epi8(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int16_t, N> operator==(Vec128<int16_t, N> a,
                                       Vec128<int16_t, N> b) {
  return Mask128<int16_t, N>{_mm_cmpeq_epi16(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int32_t, N> operator==(const Vec128<int32_t, N> a,
                                       const Vec128<int32_t, N> b) {
  return Mask128<int32_t, N>{_mm_cmpeq_epi32(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int64_t, N> operator==(const Vec128<int64_t, N> a,
                                       const Vec128<int64_t, N> b) {
  // Same as unsigned ==; forward to it to avoid duplicating the SSSE3 version.
  const DFromV<decltype(a)> d;
  RebindToUnsigned<decltype(d)> du;
  return RebindMask(d, BitCast(du, a) == BitCast(du, b));
}

// Float
template <size_t N>
HWY_API Mask128<float, N> operator==(const Vec128<float, N> a,
                                     const Vec128<float, N> b) {
  return Mask128<float, N>{_mm_cmpeq_ps(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<double, N> operator==(const Vec128<double, N> a,
                                      const Vec128<double, N> b) {
  return Mask128<double, N>{_mm_cmpeq_pd(a.raw, b.raw)};
}

// ------------------------------ Inequality

// This cannot have T as a template argument, otherwise it is not more
// specialized than rewritten operator== in C++20, leading to compile
// errors: https://gcc.godbolt.org/z/xsrPhPvPT.
+template <size_t N> +HWY_API Mask128<uint8_t, N> operator!=(Vec128<uint8_t, N> a, + Vec128<uint8_t, N> b) { + return Not(a == b); +} +template <size_t N> +HWY_API Mask128<uint16_t, N> operator!=(Vec128<uint16_t, N> a, + Vec128<uint16_t, N> b) { + return Not(a == b); +} +template <size_t N> +HWY_API Mask128<uint32_t, N> operator!=(Vec128<uint32_t, N> a, + Vec128<uint32_t, N> b) { + return Not(a == b); +} +template <size_t N> +HWY_API Mask128<uint64_t, N> operator!=(Vec128<uint64_t, N> a, + Vec128<uint64_t, N> b) { + return Not(a == b); +} +template <size_t N> +HWY_API Mask128<int8_t, N> operator!=(Vec128<int8_t, N> a, + Vec128<int8_t, N> b) { + return Not(a == b); +} +template <size_t N> +HWY_API Mask128<int16_t, N> operator!=(Vec128<int16_t, N> a, + Vec128<int16_t, N> b) { + return Not(a == b); +} +template <size_t N> +HWY_API Mask128<int32_t, N> operator!=(Vec128<int32_t, N> a, + Vec128<int32_t, N> b) { + return Not(a == b); +} +template <size_t N> +HWY_API Mask128<int64_t, N> operator!=(Vec128<int64_t, N> a, + Vec128<int64_t, N> b) { + return Not(a == b); +} + +template <size_t N> +HWY_API Mask128<float, N> operator!=(const Vec128<float, N> a, + const Vec128<float, N> b) { + return Mask128<float, N>{_mm_cmpneq_ps(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Mask128<double, N> operator!=(const Vec128<double, N> a, + const Vec128<double, N> b) { + return Mask128<double, N>{_mm_cmpneq_pd(a.raw, b.raw)}; +} + +// ------------------------------ Strict inequality + +namespace detail { + +template <size_t N> +HWY_INLINE Mask128<int8_t, N> Gt(hwy::SignedTag /*tag*/, Vec128<int8_t, N> a, + Vec128<int8_t, N> b) { + return Mask128<int8_t, N>{_mm_cmpgt_epi8(a.raw, b.raw)}; +} +template <size_t N> +HWY_INLINE Mask128<int16_t, N> Gt(hwy::SignedTag /*tag*/, Vec128<int16_t, N> a, + Vec128<int16_t, N> b) { + return Mask128<int16_t, N>{_mm_cmpgt_epi16(a.raw, b.raw)}; +} +template <size_t N> +HWY_INLINE Mask128<int32_t, N> Gt(hwy::SignedTag /*tag*/, Vec128<int32_t, N> a, + 
Vec128<int32_t, N> b) { + return Mask128<int32_t, N>{_mm_cmpgt_epi32(a.raw, b.raw)}; +} + +template <size_t N> +HWY_INLINE Mask128<int64_t, N> Gt(hwy::SignedTag /*tag*/, + const Vec128<int64_t, N> a, + const Vec128<int64_t, N> b) { +#if HWY_TARGET >= HWY_SSSE3 + // See https://stackoverflow.com/questions/65166174/: + const DFromV<decltype(a)> d; + const RepartitionToNarrow<decltype(d)> d32; + const Vec128<int64_t, N> m_eq32{Eq(BitCast(d32, a), BitCast(d32, b)).raw}; + const Vec128<int64_t, N> m_gt32{Gt(BitCast(d32, a), BitCast(d32, b)).raw}; + // If a.upper is greater, upper := true. Otherwise, if a.upper == b.upper: + // upper := b-a (unsigned comparison result of lower). Otherwise: upper := 0. + const __m128i upper = OrAnd(m_gt32, m_eq32, Sub(b, a)).raw; + // Duplicate upper to lower half. + return Mask128<int64_t, N>{_mm_shuffle_epi32(upper, _MM_SHUFFLE(3, 3, 1, 1))}; +#else + return Mask128<int64_t, N>{_mm_cmpgt_epi64(a.raw, b.raw)}; // SSE4.2 +#endif +} + +template <typename T, size_t N> +HWY_INLINE Mask128<T, N> Gt(hwy::UnsignedTag /*tag*/, Vec128<T, N> a, + Vec128<T, N> b) { + const DFromV<decltype(a)> du; + const RebindToSigned<decltype(du)> di; + const Vec128<T, N> msb = Set(du, (LimitsMax<T>() >> 1) + 1); + const auto sa = BitCast(di, Xor(a, msb)); + const auto sb = BitCast(di, Xor(b, msb)); + return RebindMask(du, Gt(hwy::SignedTag(), sa, sb)); +} + +template <size_t N> +HWY_INLINE Mask128<float, N> Gt(hwy::FloatTag /*tag*/, Vec128<float, N> a, + Vec128<float, N> b) { + return Mask128<float, N>{_mm_cmpgt_ps(a.raw, b.raw)}; +} +template <size_t N> +HWY_INLINE Mask128<double, N> Gt(hwy::FloatTag /*tag*/, Vec128<double, N> a, + Vec128<double, N> b) { + return Mask128<double, N>{_mm_cmpgt_pd(a.raw, b.raw)}; +} + +} // namespace detail + +template <typename T, size_t N> +HWY_INLINE Mask128<T, N> operator>(Vec128<T, N> a, Vec128<T, N> b) { + return detail::Gt(hwy::TypeTag<T>(), a, b); +} + +// ------------------------------ Weak inequality + +namespace detail 
{ +template <typename T, size_t N> +HWY_INLINE Mask128<T, N> Ge(hwy::SignedTag tag, Vec128<T, N> a, + Vec128<T, N> b) { + return Not(Gt(tag, b, a)); +} + +template <typename T, size_t N> +HWY_INLINE Mask128<T, N> Ge(hwy::UnsignedTag tag, Vec128<T, N> a, + Vec128<T, N> b) { + return Not(Gt(tag, b, a)); +} + +template <size_t N> +HWY_INLINE Mask128<float, N> Ge(hwy::FloatTag /*tag*/, Vec128<float, N> a, + Vec128<float, N> b) { + return Mask128<float, N>{_mm_cmpge_ps(a.raw, b.raw)}; +} +template <size_t N> +HWY_INLINE Mask128<double, N> Ge(hwy::FloatTag /*tag*/, Vec128<double, N> a, + Vec128<double, N> b) { + return Mask128<double, N>{_mm_cmpge_pd(a.raw, b.raw)}; +} + +} // namespace detail + +template <typename T, size_t N> +HWY_API Mask128<T, N> operator>=(Vec128<T, N> a, Vec128<T, N> b) { + return detail::Ge(hwy::TypeTag<T>(), a, b); +} + +#endif // HWY_TARGET <= HWY_AVX3 + +// ------------------------------ Reversed comparisons + +template <typename T, size_t N> +HWY_API Mask128<T, N> operator<(Vec128<T, N> a, Vec128<T, N> b) { + return b > a; +} + +template <typename T, size_t N> +HWY_API Mask128<T, N> operator<=(Vec128<T, N> a, Vec128<T, N> b) { + return b >= a; +} + +// ------------------------------ Iota (Load) + +namespace detail { + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)> +HWY_INLINE VFromD<D> Iota0(D /*d*/) { + return VFromD<D>{_mm_set_epi8( + static_cast<char>(15), static_cast<char>(14), static_cast<char>(13), + static_cast<char>(12), static_cast<char>(11), static_cast<char>(10), + static_cast<char>(9), static_cast<char>(8), static_cast<char>(7), + static_cast<char>(6), static_cast<char>(5), static_cast<char>(4), + static_cast<char>(3), static_cast<char>(2), static_cast<char>(1), + static_cast<char>(0))}; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2), + HWY_IF_NOT_SPECIAL_FLOAT_D(D)> +HWY_INLINE VFromD<D> Iota0(D /*d*/) { + return VFromD<D>{_mm_set_epi16(int16_t{7}, int16_t{6}, int16_t{5}, 
int16_t{4}, + int16_t{3}, int16_t{2}, int16_t{1}, + int16_t{0})}; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI32_D(D)> +HWY_INLINE VFromD<D> Iota0(D /*d*/) { + return VFromD<D>{ + _mm_set_epi32(int32_t{3}, int32_t{2}, int32_t{1}, int32_t{0})}; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D)> +HWY_INLINE VFromD<D> Iota0(D /*d*/) { + return VFromD<D>{_mm_set_epi64x(int64_t{1}, int64_t{0})}; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)> +HWY_INLINE VFromD<D> Iota0(D /*d*/) { + return VFromD<D>{_mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)}; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)> +HWY_INLINE VFromD<D> Iota0(D /*d*/) { + return VFromD<D>{_mm_set_pd(1.0, 0.0)}; +} + +#if HWY_COMPILER_MSVC +template <class V, HWY_IF_V_SIZE_V(V, 1)> +static HWY_INLINE V MaskOutVec128Iota(V v) { + const V mask_out_mask{_mm_set_epi32(0, 0, 0, 0xFF)}; + return v & mask_out_mask; +} +template <class V, HWY_IF_V_SIZE_V(V, 2)> +static HWY_INLINE V MaskOutVec128Iota(V v) { +#if HWY_TARGET <= HWY_SSE4 + return V{_mm_blend_epi16(v.raw, _mm_setzero_si128(), 0xFE)}; +#else + const V mask_out_mask{_mm_set_epi32(0, 0, 0, 0xFFFF)}; + return v & mask_out_mask; +#endif +} +template <class V, HWY_IF_V_SIZE_V(V, 4)> +static HWY_INLINE V MaskOutVec128Iota(V v) { + const DFromV<decltype(v)> d; + const Repartition<float, decltype(d)> df; + using VF = VFromD<decltype(df)>; + return BitCast(d, VF{_mm_move_ss(_mm_setzero_ps(), BitCast(df, v).raw)}); +} +template <class V, HWY_IF_V_SIZE_V(V, 8)> +static HWY_INLINE V MaskOutVec128Iota(V v) { + const DFromV<decltype(v)> d; + const RebindToUnsigned<decltype(d)> du; + using VU = VFromD<decltype(du)>; + return BitCast(d, VU{_mm_move_epi64(BitCast(du, v).raw)}); +} +template <class V, HWY_IF_V_SIZE_GT_V(V, 8)> +static HWY_INLINE V MaskOutVec128Iota(V v) { + return v; +} +#endif + +} // namespace detail + +template <class D, typename T2, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_API VFromD<D> Iota(D d, 
const T2 first) { + const auto result_iota = detail::Iota0(d) + Set(d, static_cast<TFromD<D>>(first)); +#if HWY_COMPILER_MSVC + return detail::MaskOutVec128Iota(result_iota); +#else + return result_iota; +#endif +} + +// ------------------------------ FirstN (Iota, Lt) + +template <class D, class M = MFromD<D>, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_API M FirstN(D d, size_t num) { +#if HWY_TARGET <= HWY_AVX3 + constexpr size_t kN = MaxLanes(d); +#if HWY_ARCH_X86_64 + const uint64_t all = (1ull << kN) - 1; + // BZHI only looks at the lower 8 bits of n! + return M::FromBits((num > 255) ? all : _bzhi_u64(all, num)); +#else + const uint32_t all = static_cast<uint32_t>((1ull << kN) - 1); + // BZHI only looks at the lower 8 bits of n! + return M::FromBits((num > 255) ? all + : _bzhi_u32(all, static_cast<uint32_t>(num))); +#endif // HWY_ARCH_X86_64 +#else // HWY_TARGET > HWY_AVX3 + const RebindToSigned<decltype(d)> di; // Signed comparisons are cheaper. + using TI = TFromD<decltype(di)>; + return RebindMask(d, detail::Iota0(di) < Set(di, static_cast<TI>(num))); +#endif // HWY_TARGET <= HWY_AVX3 +} + +// ================================================== MEMORY (2) + +// ------------------------------ MaskedLoad + +#if HWY_TARGET <= HWY_AVX3 + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)> +HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D /* tag */, + const TFromD<D>* HWY_RESTRICT p) { + return VFromD<D>{_mm_maskz_loadu_epi8(m.raw, p)}; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)> +HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D /* tag */, + const TFromD<D>* HWY_RESTRICT p) { + return VFromD<D>{_mm_maskz_loadu_epi16(m.raw, p)}; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI32_D(D)> +HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D /* tag */, + const TFromD<D>* HWY_RESTRICT p) { + return VFromD<D>{_mm_maskz_loadu_epi32(m.raw, p)}; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D)> +HWY_API VFromD<D> 
MaskedLoad(MFromD<D> m, D /* tag */, + const TFromD<D>* HWY_RESTRICT p) { + return VFromD<D>{_mm_maskz_loadu_epi64(m.raw, p)}; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D /* tag */, + const float* HWY_RESTRICT p) { + return VFromD<D>{_mm_maskz_loadu_ps(m.raw, p)}; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D /* tag */, + const double* HWY_RESTRICT p) { + return VFromD<D>{_mm_maskz_loadu_pd(m.raw, p)}; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)> +HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D /* tag */, + const TFromD<D>* HWY_RESTRICT p) { + return VFromD<D>{_mm_mask_loadu_epi8(v.raw, m.raw, p)}; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)> +HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D /* tag */, + const TFromD<D>* HWY_RESTRICT p) { + return VFromD<D>{_mm_mask_loadu_epi16(v.raw, m.raw, p)}; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI32_D(D)> +HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D /* tag */, + const TFromD<D>* HWY_RESTRICT p) { + return VFromD<D>{_mm_mask_loadu_epi32(v.raw, m.raw, p)}; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D)> +HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D /* tag */, + const TFromD<D>* HWY_RESTRICT p) { + return VFromD<D>{_mm_mask_loadu_epi64(v.raw, m.raw, p)}; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D /* tag */, + const float* HWY_RESTRICT p) { + return VFromD<D>{_mm_mask_loadu_ps(v.raw, m.raw, p)}; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D /* tag */, + const double* HWY_RESTRICT p) { + return VFromD<D>{_mm_mask_loadu_pd(v.raw, m.raw, p)}; +} + +#elif HWY_TARGET == HWY_AVX2 + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI32_D(D)> 
+HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D /* tag */, + const TFromD<D>* HWY_RESTRICT p) { + auto p_p = reinterpret_cast<const int*>(p); // NOLINT + return VFromD<D>{_mm_maskload_epi32(p_p, m.raw)}; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D)> +HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D /* tag */, + const TFromD<D>* HWY_RESTRICT p) { + auto p_p = reinterpret_cast<const long long*>(p); // NOLINT + return VFromD<D>{_mm_maskload_epi64(p_p, m.raw)}; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d, const float* HWY_RESTRICT p) { + const RebindToSigned<decltype(d)> di; + return VFromD<D>{_mm_maskload_ps(p, BitCast(di, VecFromMask(d, m)).raw)}; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d, const double* HWY_RESTRICT p) { + const RebindToSigned<decltype(d)> di; + return VFromD<D>{_mm_maskload_pd(p, BitCast(di, VecFromMask(d, m)).raw)}; +} + +// There is no maskload_epi8/16, so blend instead. +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), + HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))> +HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d, + const TFromD<D>* HWY_RESTRICT p) { + return IfThenElseZero(m, LoadU(d, p)); +} + +#else // <= SSE4 + +// Avoid maskmov* - its nontemporal 'hint' causes it to bypass caches (slow). 
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), typename T = TFromD<D>> +HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d, const T* HWY_RESTRICT p) { + return IfThenElseZero(m, LoadU(d, p)); +} + +#endif + +// ------------------------------ MaskedLoadOr + +#if HWY_TARGET > HWY_AVX3 // else: native + +// For all vector widths +template <class D, typename T = TFromD<D>> +HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d, + const T* HWY_RESTRICT p) { + return IfThenElse(m, LoadU(d, p), v); +} + +#endif // HWY_TARGET > HWY_AVX3 + +// ------------------------------ BlendedStore + +namespace detail { + +// There is no maskload_epi8/16 with which we could safely implement +// BlendedStore. Manual blending is also unsafe because loading a full vector +// that crosses the array end causes asan faults. Resort to scalar code; the +// caller should instead use memcpy, assuming m is FirstN(d, n). +template <class D> +HWY_API void ScalarMaskedStore(VFromD<D> v, MFromD<D> m, D d, + TFromD<D>* HWY_RESTRICT p) { + const RebindToSigned<decltype(d)> di; // for testing mask if T=bfloat16_t. 
+ using TI = TFromD<decltype(di)>; + alignas(16) TI buf[MaxLanes(d)]; + alignas(16) TI mask[MaxLanes(d)]; + Store(BitCast(di, v), di, buf); + Store(BitCast(di, VecFromMask(d, m)), di, mask); + for (size_t i = 0; i < MaxLanes(d); ++i) { + if (mask[i]) { + CopySameSize(buf + i, p + i); + } + } +} +} // namespace detail + +#if HWY_TARGET <= HWY_AVX3 + +// In this branch, every overload passes m.raw directly to a masked-store +// intrinsic. +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)> +HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D /* tag */, + TFromD<D>* HWY_RESTRICT p) { + _mm_mask_storeu_epi8(p, m.raw, v.raw); +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)> +HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D /* tag */, + TFromD<D>* HWY_RESTRICT p) { + _mm_mask_storeu_epi16(p, m.raw, v.raw); +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI32_D(D)> +HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D /* tag */, + TFromD<D>* HWY_RESTRICT p) { + auto pi = reinterpret_cast<int*>(p); // NOLINT + _mm_mask_storeu_epi32(pi, m.raw, v.raw); +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D)> +HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D /* tag */, + TFromD<D>* HWY_RESTRICT p) { + auto pi = reinterpret_cast<long long*>(p); // NOLINT + _mm_mask_storeu_epi64(pi, m.raw, v.raw); +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)> +HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D, float* HWY_RESTRICT p) { + _mm_mask_storeu_ps(p, m.raw, v.raw); +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)> +HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D, double* HWY_RESTRICT p) { + _mm_mask_storeu_pd(p, m.raw, v.raw); +} + +#elif HWY_TARGET == HWY_AVX2 + +// 8/16-bit lanes: the maskstore wrappers below only cover 32/64-bit lanes, so +// fall back to the per-lane scalar store. +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), + HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))> +HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d, + TFromD<D>* HWY_RESTRICT p) { + detail::ScalarMaskedStore(v, m, d, p); +} + +namespace detail { + +template <class D, class V, class
M, HWY_IF_UI32_D(D)> +HWY_INLINE void NativeBlendedStore(V v, M m, TFromD<D>* HWY_RESTRICT p) { + auto pi = reinterpret_cast<int*>(p); // NOLINT + _mm_maskstore_epi32(pi, m.raw, v.raw); +} + +template <class D, class V, class M, HWY_IF_UI64_D(D)> +HWY_INLINE void NativeBlendedStore(V v, M m, TFromD<D>* HWY_RESTRICT p) { + auto pi = reinterpret_cast<long long*>(p); // NOLINT + _mm_maskstore_epi64(pi, m.raw, v.raw); +} + +template <class D, class V, class M, HWY_IF_F32_D(D)> +HWY_INLINE void NativeBlendedStore(V v, M m, float* HWY_RESTRICT p) { + _mm_maskstore_ps(p, m.raw, v.raw); +} + +template <class D, class V, class M, HWY_IF_F64_D(D)> +HWY_INLINE void NativeBlendedStore(V v, M m, double* HWY_RESTRICT p) { + _mm_maskstore_pd(p, m.raw, v.raw); +} + +} // namespace detail + +// 32/64-bit lanes: forward to the NativeBlendedStore overload for the lane type. +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), + HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))> +HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d, + TFromD<D>* HWY_RESTRICT p) { + const RebindToSigned<decltype(d)> di; + // For partial vectors, avoid writing other lanes by zeroing their mask. + if (d.MaxBytes() < 16) { + const Full128<TFromD<D>> dfull; + const Mask128<TFromD<D>> mfull{m.raw}; + m = MFromD<D>{And(mfull, FirstN(dfull, MaxLanes(d))).raw}; + } + + // Float/double require, and unsigned ints tolerate, signed int masks. + detail::NativeBlendedStore<D>(v, RebindMask(di, m), p); +} + +#else // <= SSE4 + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d, + TFromD<D>* HWY_RESTRICT p) { + // Avoid maskmov* - its nontemporal 'hint' causes it to bypass caches (slow).
+ detail::ScalarMaskedStore(v, m, d, p); +} + +#endif // SSE4 + +// ================================================== ARITHMETIC + +// ------------------------------ Addition + +// Unsigned +template <size_t N> +HWY_API Vec128<uint8_t, N> operator+(const Vec128<uint8_t, N> a, + const Vec128<uint8_t, N> b) { + return Vec128<uint8_t, N>{_mm_add_epi8(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<uint16_t, N> operator+(const Vec128<uint16_t, N> a, + const Vec128<uint16_t, N> b) { + return Vec128<uint16_t, N>{_mm_add_epi16(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<uint32_t, N> operator+(const Vec128<uint32_t, N> a, + const Vec128<uint32_t, N> b) { + return Vec128<uint32_t, N>{_mm_add_epi32(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<uint64_t, N> operator+(const Vec128<uint64_t, N> a, + const Vec128<uint64_t, N> b) { + return Vec128<uint64_t, N>{_mm_add_epi64(a.raw, b.raw)}; +} + +// Signed +template <size_t N> +HWY_API Vec128<int8_t, N> operator+(const Vec128<int8_t, N> a, + const Vec128<int8_t, N> b) { + return Vec128<int8_t, N>{_mm_add_epi8(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<int16_t, N> operator+(const Vec128<int16_t, N> a, + const Vec128<int16_t, N> b) { + return Vec128<int16_t, N>{_mm_add_epi16(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<int32_t, N> operator+(const Vec128<int32_t, N> a, + const Vec128<int32_t, N> b) { + return Vec128<int32_t, N>{_mm_add_epi32(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<int64_t, N> operator+(const Vec128<int64_t, N> a, + const Vec128<int64_t, N> b) { + return Vec128<int64_t, N>{_mm_add_epi64(a.raw, b.raw)}; +} + +// Float +template <size_t N> +HWY_API Vec128<float, N> operator+(const Vec128<float, N> a, + const Vec128<float, N> b) { + return Vec128<float, N>{_mm_add_ps(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<double, N> operator+(const Vec128<double, N> a, + const Vec128<double, N> b) { + return Vec128<double, N>{_mm_add_pd(a.raw,
b.raw)}; +} + +// ------------------------------ Subtraction + +// Unsigned +template <size_t N> +HWY_API Vec128<uint8_t, N> operator-(const Vec128<uint8_t, N> a, + const Vec128<uint8_t, N> b) { + return Vec128<uint8_t, N>{_mm_sub_epi8(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<uint16_t, N> operator-(Vec128<uint16_t, N> a, + Vec128<uint16_t, N> b) { + return Vec128<uint16_t, N>{_mm_sub_epi16(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<uint32_t, N> operator-(const Vec128<uint32_t, N> a, + const Vec128<uint32_t, N> b) { + return Vec128<uint32_t, N>{_mm_sub_epi32(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<uint64_t, N> operator-(const Vec128<uint64_t, N> a, + const Vec128<uint64_t, N> b) { + return Vec128<uint64_t, N>{_mm_sub_epi64(a.raw, b.raw)}; +} + +// Signed +template <size_t N> +HWY_API Vec128<int8_t, N> operator-(const Vec128<int8_t, N> a, + const Vec128<int8_t, N> b) { + return Vec128<int8_t, N>{_mm_sub_epi8(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<int16_t, N> operator-(const Vec128<int16_t, N> a, + const Vec128<int16_t, N> b) { + return Vec128<int16_t, N>{_mm_sub_epi16(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<int32_t, N> operator-(const Vec128<int32_t, N> a, + const Vec128<int32_t, N> b) { + return Vec128<int32_t, N>{_mm_sub_epi32(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<int64_t, N> operator-(const Vec128<int64_t, N> a, + const Vec128<int64_t, N> b) { + return Vec128<int64_t, N>{_mm_sub_epi64(a.raw, b.raw)}; +} + +// Float +template <size_t N> +HWY_API Vec128<float, N> operator-(const Vec128<float, N> a, + const Vec128<float, N> b) { + return Vec128<float, N>{_mm_sub_ps(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<double, N> operator-(const Vec128<double, N> a, + const Vec128<double, N> b) { + return Vec128<double, N>{_mm_sub_pd(a.raw, b.raw)}; +} + +// ------------------------------ SumsOf8 +// SAD against zero: each u64 result lane is the sum of 8 consecutive u8 lanes. +template <size_t N> +HWY_API Vec128<uint64_t, N / 8> SumsOf8(const
Vec128<uint8_t, N> v) { + return Vec128<uint64_t, N / 8>{_mm_sad_epu8(v.raw, _mm_setzero_si128())}; +} + +#ifdef HWY_NATIVE_SUMS_OF_8_ABS_DIFF +#undef HWY_NATIVE_SUMS_OF_8_ABS_DIFF +#else +#define HWY_NATIVE_SUMS_OF_8_ABS_DIFF +#endif + +template <size_t N> +HWY_API Vec128<uint64_t, N / 8> SumsOf8AbsDiff(const Vec128<uint8_t, N> a, + const Vec128<uint8_t, N> b) { + return Vec128<uint64_t, N / 8>{_mm_sad_epu8(a.raw, b.raw)}; +} + +// ------------------------------ SaturatedAdd + +// Returns a + b clamped to the destination range. + +// Unsigned +template <size_t N> +HWY_API Vec128<uint8_t, N> SaturatedAdd(const Vec128<uint8_t, N> a, + const Vec128<uint8_t, N> b) { + return Vec128<uint8_t, N>{_mm_adds_epu8(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<uint16_t, N> SaturatedAdd(const Vec128<uint16_t, N> a, + const Vec128<uint16_t, N> b) { + return Vec128<uint16_t, N>{_mm_adds_epu16(a.raw, b.raw)}; +} + +// Signed +template <size_t N> +HWY_API Vec128<int8_t, N> SaturatedAdd(const Vec128<int8_t, N> a, + const Vec128<int8_t, N> b) { + return Vec128<int8_t, N>{_mm_adds_epi8(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<int16_t, N> SaturatedAdd(const Vec128<int16_t, N> a, + const Vec128<int16_t, N> b) { + return Vec128<int16_t, N>{_mm_adds_epi16(a.raw, b.raw)}; +} + +#if HWY_TARGET <= HWY_AVX3 +#ifdef HWY_NATIVE_I32_SATURATED_ADDSUB +#undef HWY_NATIVE_I32_SATURATED_ADDSUB +#else +#define HWY_NATIVE_I32_SATURATED_ADDSUB +#endif + +#ifdef HWY_NATIVE_I64_SATURATED_ADDSUB +#undef HWY_NATIVE_I64_SATURATED_ADDSUB +#else +#define HWY_NATIVE_I64_SATURATED_ADDSUB +#endif + +template <size_t N> +HWY_API Vec128<int32_t, N> SaturatedAdd(Vec128<int32_t, N> a, + Vec128<int32_t, N> b) { + const DFromV<decltype(a)> d; + const auto sum = a + b; + // 0x42: result bit is 1 only for (a, b, sum) bits (0,0,1) or (1,1,0), i.e. a + // and b agree but sum differs. In the sign bit this means signed overflow. + const auto overflow_mask = MaskFromVec( + Vec128<int32_t, N>{_mm_ternarylogic_epi32(a.raw, b.raw, sum.raw, 0x42)}); + const auto i32_max = Set(d, LimitsMax<int32_t>()); + // 0x55 is NOT of the last operand: lanes selected by MaskFromVec(a) become + // ~i32_max (LimitsMin); the other lanes keep the first operand, i32_max. + const Vec128<int32_t, N>
overflow_result{_mm_mask_ternarylogic_epi32( + i32_max.raw, MaskFromVec(a).raw, i32_max.raw, i32_max.raw, 0x55)}; + return IfThenElse(overflow_mask, overflow_result, sum); +} + +template <size_t N> +HWY_API Vec128<int64_t, N> SaturatedAdd(Vec128<int64_t, N> a, + Vec128<int64_t, N> b) { + const DFromV<decltype(a)> d; + const auto sum = a + b; + const auto overflow_mask = MaskFromVec( + Vec128<int64_t, N>{_mm_ternarylogic_epi64(a.raw, b.raw, sum.raw, 0x42)}); + const auto i64_max = Set(d, LimitsMax<int64_t>()); + const Vec128<int64_t, N> overflow_result{_mm_mask_ternarylogic_epi64( + i64_max.raw, MaskFromVec(a).raw, i64_max.raw, i64_max.raw, 0x55)}; + return IfThenElse(overflow_mask, overflow_result, sum); +} +#endif // HWY_TARGET <= HWY_AVX3 + +// ------------------------------ SaturatedSub + +// Returns a - b clamped to the destination range. + +// Unsigned +template <size_t N> +HWY_API Vec128<uint8_t, N> SaturatedSub(const Vec128<uint8_t, N> a, + const Vec128<uint8_t, N> b) { + return Vec128<uint8_t, N>{_mm_subs_epu8(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<uint16_t, N> SaturatedSub(const Vec128<uint16_t, N> a, + const Vec128<uint16_t, N> b) { + return Vec128<uint16_t, N>{_mm_subs_epu16(a.raw, b.raw)}; +} + +// Signed +template <size_t N> +HWY_API Vec128<int8_t, N> SaturatedSub(const Vec128<int8_t, N> a, + const Vec128<int8_t, N> b) { + return Vec128<int8_t, N>{_mm_subs_epi8(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<int16_t, N> SaturatedSub(const Vec128<int16_t, N> a, + const Vec128<int16_t, N> b) { + return Vec128<int16_t, N>{_mm_subs_epi16(a.raw, b.raw)}; +} + +#if HWY_TARGET <= HWY_AVX3 +template <size_t N> +HWY_API Vec128<int32_t, N> SaturatedSub(Vec128<int32_t, N> a, + Vec128<int32_t, N> b) { + const DFromV<decltype(a)> d; + const auto diff = a - b; + // 0x18: result bit is 1 only for (a, b, diff) bits (0,1,1) or (1,0,0), i.e. a + // and b differ and diff disagrees with a: signed overflow in the sign bit. + const auto overflow_mask = MaskFromVec( + Vec128<int32_t, N>{_mm_ternarylogic_epi32(a.raw, b.raw, diff.raw, 0x18)}); + const auto i32_max = Set(d, LimitsMax<int32_t>()); + const
Vec128<int32_t, N> overflow_result{_mm_mask_ternarylogic_epi32( + i32_max.raw, MaskFromVec(a).raw, i32_max.raw, i32_max.raw, 0x55)}; + return IfThenElse(overflow_mask, overflow_result, diff); +} + +template <size_t N> +HWY_API Vec128<int64_t, N> SaturatedSub(Vec128<int64_t, N> a, + Vec128<int64_t, N> b) { + const DFromV<decltype(a)> d; + const auto diff = a - b; + const auto overflow_mask = MaskFromVec( + Vec128<int64_t, N>{_mm_ternarylogic_epi64(a.raw, b.raw, diff.raw, 0x18)}); + const auto i64_max = Set(d, LimitsMax<int64_t>()); + const Vec128<int64_t, N> overflow_result{_mm_mask_ternarylogic_epi64( + i64_max.raw, MaskFromVec(a).raw, i64_max.raw, i64_max.raw, 0x55)}; + return IfThenElse(overflow_mask, overflow_result, diff); +} +#endif // HWY_TARGET <= HWY_AVX3 + +// ------------------------------ AverageRound + +// Returns (a + b + 1) / 2 + +// Unsigned +template <size_t N> +HWY_API Vec128<uint8_t, N> AverageRound(const Vec128<uint8_t, N> a, + const Vec128<uint8_t, N> b) { + return Vec128<uint8_t, N>{_mm_avg_epu8(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<uint16_t, N> AverageRound(const Vec128<uint16_t, N> a, + const Vec128<uint16_t, N> b) { + return Vec128<uint16_t, N>{_mm_avg_epu16(a.raw, b.raw)}; +} + +// ------------------------------ Integer multiplication + +template <size_t N> +HWY_API Vec128<uint16_t, N> operator*(const Vec128<uint16_t, N> a, + const Vec128<uint16_t, N> b) { + return Vec128<uint16_t, N>{_mm_mullo_epi16(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<int16_t, N> operator*(const Vec128<int16_t, N> a, + const Vec128<int16_t, N> b) { + return Vec128<int16_t, N>{_mm_mullo_epi16(a.raw, b.raw)}; +} + +// Returns the upper 16 bits of a * b in each lane.
+template <size_t N> +HWY_API Vec128<uint16_t, N> MulHigh(const Vec128<uint16_t, N> a, + const Vec128<uint16_t, N> b) { + return Vec128<uint16_t, N>{_mm_mulhi_epu16(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<int16_t, N> MulHigh(const Vec128<int16_t, N> a, + const Vec128<int16_t, N> b) { + return Vec128<int16_t, N>{_mm_mulhi_epi16(a.raw, b.raw)}; +} + +// Multiplies even lanes (0, 2 ..) and places the double-wide result into +// even and the upper half into its odd neighbor lane. +template <size_t N> +HWY_API Vec128<uint64_t, (N + 1) / 2> MulEven(const Vec128<uint32_t, N> a, + const Vec128<uint32_t, N> b) { + return Vec128<uint64_t, (N + 1) / 2>{_mm_mul_epu32(a.raw, b.raw)}; +} + +#if HWY_TARGET >= HWY_SSSE3 + +// This branch computes the signed products lane by lane in scalar code; the +// #else branch uses _mm_mul_epi32. +template <size_t N, HWY_IF_V_SIZE_LE(int32_t, N, 8)> // N=1 or 2 +HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a, + const Vec128<int32_t, N> b) { + const DFromV<decltype(a)> d; + const RepartitionToWide<decltype(d)> dw; + return Set(dw, static_cast<int64_t>(GetLane(a)) * GetLane(b)); +} +HWY_API Vec128<int64_t> MulEven(Vec128<int32_t> a, Vec128<int32_t> b) { + alignas(16) int32_t a_lanes[4]; + alignas(16) int32_t b_lanes[4]; + const DFromV<decltype(a)> di32; + const RepartitionToWide<decltype(di32)> di64; + Store(a, di32, a_lanes); + Store(b, di32, b_lanes); + alignas(16) int64_t mul[2]; + mul[0] = static_cast<int64_t>(a_lanes[0]) * b_lanes[0]; + mul[1] = static_cast<int64_t>(a_lanes[2]) * b_lanes[2]; + return Load(di64, mul); +} + +#else // HWY_TARGET < HWY_SSSE3 + +template <size_t N> +HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a, + const Vec128<int32_t, N> b) { + return Vec128<int64_t, (N + 1) / 2>{_mm_mul_epi32(a.raw, b.raw)}; +} + +#endif // HWY_TARGET >= HWY_SSSE3 + +template <size_t N> +HWY_API Vec128<uint32_t, N> operator*(const Vec128<uint32_t, N> a, + const Vec128<uint32_t, N> b) { +#if HWY_TARGET >= HWY_SSSE3 + // Not as inefficient as it looks: _mm_mullo_epi32 has 10 cycle latency.
+ // 64-bit right shift would also work but also needs port 5, so no benefit. + // Notation: x=don't care, z=0. + const __m128i a_x3x1 = _mm_shuffle_epi32(a.raw, _MM_SHUFFLE(3, 3, 1, 1)); + const auto mullo_x2x0 = MulEven(a, b); + const __m128i b_x3x1 = _mm_shuffle_epi32(b.raw, _MM_SHUFFLE(3, 3, 1, 1)); + const auto mullo_x3x1 = + MulEven(Vec128<uint32_t, N>{a_x3x1}, Vec128<uint32_t, N>{b_x3x1}); + // We could _mm_slli_epi64 by 32 to get 3z1z and OR with z2z0, but generating + // the latter requires one more instruction or a constant. + const __m128i mul_20 = + _mm_shuffle_epi32(mullo_x2x0.raw, _MM_SHUFFLE(2, 0, 2, 0)); + const __m128i mul_31 = + _mm_shuffle_epi32(mullo_x3x1.raw, _MM_SHUFFLE(2, 0, 2, 0)); + return Vec128<uint32_t, N>{_mm_unpacklo_epi32(mul_20, mul_31)}; +#else + return Vec128<uint32_t, N>{_mm_mullo_epi32(a.raw, b.raw)}; +#endif +} + +template <size_t N> +HWY_API Vec128<int32_t, N> operator*(const Vec128<int32_t, N> a, + const Vec128<int32_t, N> b) { + // Same as unsigned; avoid duplicating the SSSE3 code. + const DFromV<decltype(a)> d; + const RebindToUnsigned<decltype(d)> du; + return BitCast(d, BitCast(du, a) * BitCast(du, b)); +} + +// ------------------------------ RotateRight (ShiftRight, Or) + +template <int kBits, typename T, size_t N, + HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))> +HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) { + constexpr size_t kSizeInBits = sizeof(T) * 8; + static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count"); + if (kBits == 0) return v; + // AVX3 does not support 8/16-bit. + // HWY_MIN keeps the ShiftLeft count below the lane width when kBits == 0; that + // case returns above, but the template argument is still instantiated.
+ return Or(ShiftRight<kBits>(v), + ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v)); +} + +template <int kBits, size_t N> +HWY_API Vec128<uint32_t, N> RotateRight(const Vec128<uint32_t, N> v) { + static_assert(0 <= kBits && kBits < 32, "Invalid shift count"); +#if HWY_TARGET <= HWY_AVX3 + return Vec128<uint32_t, N>{_mm_ror_epi32(v.raw, kBits)}; +#else + if (kBits == 0) return v; + return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(31, 32 - kBits)>(v)); +#endif +} + +template <int kBits, size_t N> +HWY_API Vec128<uint64_t, N> RotateRight(const Vec128<uint64_t, N> v) { + static_assert(0 <= kBits && kBits < 64, "Invalid shift count"); +#if HWY_TARGET <= HWY_AVX3 + return Vec128<uint64_t, N>{_mm_ror_epi64(v.raw, kBits)}; +#else + if (kBits == 0) return v; + return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(63, 64 - kBits)>(v)); +#endif +} + +// ------------------------------ BroadcastSignBit (ShiftRight, compare, mask) + +// 8-bit lanes: derive the all-ones/all-zeros result from a compare with zero. +template <size_t N> +HWY_API Vec128<int8_t, N> BroadcastSignBit(const Vec128<int8_t, N> v) { + const DFromV<decltype(v)> d; + return VecFromMask(v < Zero(d)); +} + +template <size_t N> +HWY_API Vec128<int16_t, N> BroadcastSignBit(const Vec128<int16_t, N> v) { + return ShiftRight<15>(v); +} + +template <size_t N> +HWY_API Vec128<int32_t, N> BroadcastSignBit(const Vec128<int32_t, N> v) { + return ShiftRight<31>(v); +} + +template <size_t N> +HWY_API Vec128<int64_t, N> BroadcastSignBit(const Vec128<int64_t, N> v) { + const DFromV<decltype(v)> d; +#if HWY_TARGET <= HWY_AVX3 + (void)d; + return Vec128<int64_t, N>{_mm_srai_epi64(v.raw, 63)}; +#elif HWY_TARGET == HWY_AVX2 || HWY_TARGET == HWY_SSE4 + return VecFromMask(v < Zero(d)); +#else + // Efficient Lt() requires SSE4.2 and BLENDVPD requires SSE4.1. 32-bit shift + // avoids generating a zero.
+ const RepartitionToNarrow<decltype(d)> d32; + const auto sign = ShiftRight<31>(BitCast(d32, v)); + return Vec128<int64_t, N>{ + _mm_shuffle_epi32(sign.raw, _MM_SHUFFLE(3, 3, 1, 1))}; +#endif +} + +// ------------------------------ Integer Abs + +// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1. +template <size_t N> +HWY_API Vec128<int8_t, N> Abs(const Vec128<int8_t, N> v) { +#if HWY_COMPILER_MSVC || HWY_TARGET == HWY_SSE2 + const DFromV<decltype(v)> d; + const RebindToUnsigned<decltype(d)> du; + const auto zero = Zero(du); + const auto v_as_u8 = BitCast(du, v); + // Viewed as unsigned, whichever of v and -v is non-negative is the smaller. + return BitCast(d, Min(v_as_u8, zero - v_as_u8)); +#else + return Vec128<int8_t, N>{_mm_abs_epi8(v.raw)}; +#endif +} + +template <size_t N> +HWY_API Vec128<int16_t, N> Abs(const Vec128<int16_t, N> v) { +#if HWY_TARGET == HWY_SSE2 + const auto zero = Zero(DFromV<decltype(v)>()); + return Max(v, zero - v); +#else + return Vec128<int16_t, N>{_mm_abs_epi16(v.raw)}; +#endif +} + +template <size_t N> +HWY_API Vec128<int32_t, N> Abs(const Vec128<int32_t, N> v) { +#if HWY_TARGET <= HWY_SSSE3 + return Vec128<int32_t, N>{_mm_abs_epi32(v.raw)}; +#else + const auto zero = Zero(DFromV<decltype(v)>()); + return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v); +#endif +} + +template <size_t N> +HWY_API Vec128<int64_t, N> Abs(const Vec128<int64_t, N> v) { +#if HWY_TARGET <= HWY_AVX3 + return Vec128<int64_t, N>{_mm_abs_epi64(v.raw)}; +#else + const auto zero = Zero(DFromV<decltype(v)>()); + return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v); +#endif +} + +// GCC and older Clang do not follow the Intel documentation for AVX-512VL +// srli_epi64: the count should be unsigned int. Note that this is not the same +// as the Shift3264Count in x86_512-inl.h (GCC also requires int). +#if (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1100) || HWY_COMPILER_GCC_ACTUAL +using Shift64Count = int; +#else +// Assume documented behavior. Clang 12 and MSVC 14.28.29910 match this.
+using Shift64Count = unsigned int; +#endif + +template <int kBits, size_t N> +HWY_API Vec128<int64_t, N> ShiftRight(const Vec128<int64_t, N> v) { +#if HWY_TARGET <= HWY_AVX3 + return Vec128<int64_t, N>{ + _mm_srai_epi64(v.raw, static_cast<Shift64Count>(kBits))}; +#else + /* NOTE(review): kBits == 0 instantiates ShiftLeft<64> below - confirm OK. */ + const DFromV<decltype(v)> di; + const RebindToUnsigned<decltype(di)> du; + const auto right = BitCast(di, ShiftRight<kBits>(BitCast(du, v))); + const auto sign = ShiftLeft<64 - kBits>(BroadcastSignBit(v)); + return right | sign; +#endif +} + +// ------------------------------ ZeroIfNegative (BroadcastSignBit) +template <typename T, size_t N> +HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) { + static_assert(IsFloat<T>(), "Only works for float"); + const DFromV<decltype(v)> d; +#if HWY_TARGET >= HWY_SSSE3 + const RebindToSigned<decltype(d)> di; + const auto mask = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v)))); +#else + const auto mask = MaskFromVec(v); // MSB is sufficient for BLENDVPS +#endif + return IfThenElse(mask, Zero(d), v); +} + +// ------------------------------ IfNegativeThenElse +template <size_t N> +HWY_API Vec128<int8_t, N> IfNegativeThenElse(const Vec128<int8_t, N> v, + const Vec128<int8_t, N> yes, + const Vec128<int8_t, N> no) { + // int8: IfThenElse only looks at the MSB. + return IfThenElse(MaskFromVec(v), yes, no); +} + +template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)> +HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes, + Vec128<T, N> no) { + static_assert(IsSigned<T>(), "Only works for signed/float"); + const DFromV<decltype(v)> d; + const RebindToSigned<decltype(d)> di; + + // 16-bit: no native blendv, so copy sign to lower byte's MSB.
+ v = BitCast(d, BroadcastSignBit(BitCast(di, v))); + return IfThenElse(MaskFromVec(v), yes, no); +} + +template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 2)> +HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes, + Vec128<T, N> no) { + static_assert(IsSigned<T>(), "Only works for signed/float"); + const DFromV<decltype(v)> d; + const RebindToFloat<decltype(d)> df; + + // 32/64-bit: use float IfThenElse, which only looks at the MSB. + return BitCast(d, IfThenElse(MaskFromVec(BitCast(df, v)), BitCast(df, yes), + BitCast(df, no))); +} + +// ------------------------------ ShiftLeftSame + +template <size_t N> +HWY_API Vec128<uint16_t, N> ShiftLeftSame(const Vec128<uint16_t, N> v, + const int bits) { +#if HWY_COMPILER_GCC + if (__builtin_constant_p(bits)) { + return Vec128<uint16_t, N>{_mm_slli_epi16(v.raw, bits)}; + } +#endif + return Vec128<uint16_t, N>{_mm_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))}; +} +template <size_t N> +HWY_API Vec128<uint32_t, N> ShiftLeftSame(const Vec128<uint32_t, N> v, + const int bits) { +#if HWY_COMPILER_GCC + if (__builtin_constant_p(bits)) { + return Vec128<uint32_t, N>{_mm_slli_epi32(v.raw, bits)}; + } +#endif + return Vec128<uint32_t, N>{_mm_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))}; +} +template <size_t N> +HWY_API Vec128<uint64_t, N> ShiftLeftSame(const Vec128<uint64_t, N> v, + const int bits) { +#if HWY_COMPILER_GCC + if (__builtin_constant_p(bits)) { + return Vec128<uint64_t, N>{_mm_slli_epi64(v.raw, bits)}; + } +#endif + return Vec128<uint64_t, N>{_mm_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))}; +} + +template <size_t N> +HWY_API Vec128<int16_t, N> ShiftLeftSame(const Vec128<int16_t, N> v, + const int bits) { +#if HWY_COMPILER_GCC + if (__builtin_constant_p(bits)) { + return Vec128<int16_t, N>{_mm_slli_epi16(v.raw, bits)}; + } +#endif + return Vec128<int16_t, N>{_mm_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))}; +} + +template <size_t N> +HWY_API Vec128<int32_t, N> ShiftLeftSame(const Vec128<int32_t, N> v, +
const int bits) { +#if HWY_COMPILER_GCC + if (__builtin_constant_p(bits)) { + return Vec128<int32_t, N>{_mm_slli_epi32(v.raw, bits)}; + } +#endif + return Vec128<int32_t, N>{_mm_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))}; +} + +template <size_t N> +HWY_API Vec128<int64_t, N> ShiftLeftSame(const Vec128<int64_t, N> v, + const int bits) { +#if HWY_COMPILER_GCC + if (__builtin_constant_p(bits)) { + return Vec128<int64_t, N>{_mm_slli_epi64(v.raw, bits)}; + } +#endif + return Vec128<int64_t, N>{_mm_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))}; +} + +template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec128<T, N> ShiftLeftSame(const Vec128<T, N> v, const int bits) { + const DFromV<decltype(v)> d8; + // Use raw instead of BitCast to support N=1. + const Vec128<T, N> shifted{ + ShiftLeftSame(Vec128<MakeWide<T>>{v.raw}, bits).raw}; + // The wider shift carries bits in from the neighboring byte; clear them. + return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF)); +} + +// ------------------------------ ShiftRightSame (BroadcastSignBit) + +template <size_t N> +HWY_API Vec128<uint16_t, N> ShiftRightSame(const Vec128<uint16_t, N> v, + const int bits) { +#if HWY_COMPILER_GCC + if (__builtin_constant_p(bits)) { + return Vec128<uint16_t, N>{_mm_srli_epi16(v.raw, bits)}; + } +#endif + return Vec128<uint16_t, N>{_mm_srl_epi16(v.raw, _mm_cvtsi32_si128(bits))}; +} +template <size_t N> +HWY_API Vec128<uint32_t, N> ShiftRightSame(const Vec128<uint32_t, N> v, + const int bits) { +#if HWY_COMPILER_GCC + if (__builtin_constant_p(bits)) { + return Vec128<uint32_t, N>{_mm_srli_epi32(v.raw, bits)}; + } +#endif + return Vec128<uint32_t, N>{_mm_srl_epi32(v.raw, _mm_cvtsi32_si128(bits))}; +} +template <size_t N> +HWY_API Vec128<uint64_t, N> ShiftRightSame(const Vec128<uint64_t, N> v, + const int bits) { +#if HWY_COMPILER_GCC + if (__builtin_constant_p(bits)) { + return Vec128<uint64_t, N>{_mm_srli_epi64(v.raw, bits)}; + } +#endif + return Vec128<uint64_t, N>{_mm_srl_epi64(v.raw, _mm_cvtsi32_si128(bits))}; +} + +template <size_t N> +HWY_API
Vec128<uint8_t, N> ShiftRightSame(Vec128<uint8_t, N> v, + const int bits) { + const DFromV<decltype(v)> d8; + // Use raw instead of BitCast to support N=1. + const Vec128<uint8_t, N> shifted{ + ShiftRightSame(Vec128<uint16_t>{v.raw}, bits).raw}; + // The 16-bit shift carries bits in from the neighboring byte; clear them. + return shifted & Set(d8, static_cast<uint8_t>(0xFF >> bits)); +} + +template <size_t N> +HWY_API Vec128<int16_t, N> ShiftRightSame(const Vec128<int16_t, N> v, + const int bits) { +#if HWY_COMPILER_GCC + if (__builtin_constant_p(bits)) { + return Vec128<int16_t, N>{_mm_srai_epi16(v.raw, bits)}; + } +#endif + return Vec128<int16_t, N>{_mm_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))}; +} + +template <size_t N> +HWY_API Vec128<int32_t, N> ShiftRightSame(const Vec128<int32_t, N> v, + const int bits) { +#if HWY_COMPILER_GCC + if (__builtin_constant_p(bits)) { + return Vec128<int32_t, N>{_mm_srai_epi32(v.raw, bits)}; + } +#endif + return Vec128<int32_t, N>{_mm_sra_epi32(v.raw, _mm_cvtsi32_si128(bits))}; +} +template <size_t N> +HWY_API Vec128<int64_t, N> ShiftRightSame(const Vec128<int64_t, N> v, + const int bits) { +#if HWY_TARGET <= HWY_AVX3 +#if HWY_COMPILER_GCC + if (__builtin_constant_p(bits)) { + return Vec128<int64_t, N>{ + _mm_srai_epi64(v.raw, static_cast<Shift64Count>(bits))}; + } +#endif + return Vec128<int64_t, N>{_mm_sra_epi64(v.raw, _mm_cvtsi32_si128(bits))}; +#else + const DFromV<decltype(v)> di; + const RebindToUnsigned<decltype(di)> du; + // Logical shift, then OR in copies of the sign bit for the vacated top bits. + const auto right = BitCast(di, ShiftRightSame(BitCast(du, v), bits)); + const auto sign = ShiftLeftSame(BroadcastSignBit(v), 64 - bits); + return right | sign; +#endif +} + +template <size_t N> +HWY_API Vec128<int8_t, N> ShiftRightSame(Vec128<int8_t, N> v, const int bits) { + const DFromV<decltype(v)> di; + const RebindToUnsigned<decltype(di)> du; + const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits)); + const auto shifted_sign = + BitCast(di, Set(du, static_cast<uint8_t>(0x80 >> bits))); + // (x ^ s) - s sign-extends x from the bit position where the sign bit landed. + return (shifted ^ shifted_sign) - shifted_sign; +} + +//
------------------------------ Floating-point mul / div + +template <size_t N> +HWY_API Vec128<float, N> operator*(Vec128<float, N> a, Vec128<float, N> b) { + return Vec128<float, N>{_mm_mul_ps(a.raw, b.raw)}; +} +HWY_API Vec128<float, 1> operator*(const Vec128<float, 1> a, + const Vec128<float, 1> b) { + return Vec128<float, 1>{_mm_mul_ss(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<double, N> operator*(const Vec128<double, N> a, + const Vec128<double, N> b) { + return Vec128<double, N>{_mm_mul_pd(a.raw, b.raw)}; +} +HWY_API Vec64<double> operator*(const Vec64<double> a, const Vec64<double> b) { + return Vec64<double>{_mm_mul_sd(a.raw, b.raw)}; +} + +template <size_t N> +HWY_API Vec128<float, N> operator/(const Vec128<float, N> a, + const Vec128<float, N> b) { + return Vec128<float, N>{_mm_div_ps(a.raw, b.raw)}; +} +HWY_API Vec128<float, 1> operator/(const Vec128<float, 1> a, + const Vec128<float, 1> b) { + return Vec128<float, 1>{_mm_div_ss(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<double, N> operator/(const Vec128<double, N> a, + const Vec128<double, N> b) { + return Vec128<double, N>{_mm_div_pd(a.raw, b.raw)}; +} +HWY_API Vec64<double> operator/(const Vec64<double> a, const Vec64<double> b) { + return Vec64<double>{_mm_div_sd(a.raw, b.raw)}; +} + +// Approximate reciprocal +template <size_t N> +HWY_API Vec128<float, N> ApproximateReciprocal(const Vec128<float, N> v) { + return Vec128<float, N>{_mm_rcp_ps(v.raw)}; +} +HWY_API Vec128<float, 1> ApproximateReciprocal(const Vec128<float, 1> v) { + return Vec128<float, 1>{_mm_rcp_ss(v.raw)}; +} + +// Absolute value of difference.
+template <size_t N> +HWY_API Vec128<float, N> AbsDiff(Vec128<float, N> a, Vec128<float, N> b) { + return Abs(a - b); +} + +// ------------------------------ Floating-point multiply-add variants + +// Each function below uses plain operators when HWY_TARGET >= HWY_SSE4 and the +// corresponding fused intrinsic otherwise. + +// Returns mul * x + add +template <size_t N> +HWY_API Vec128<float, N> MulAdd(Vec128<float, N> mul, Vec128<float, N> x, + Vec128<float, N> add) { +#if HWY_TARGET >= HWY_SSE4 + return mul * x + add; +#else + return Vec128<float, N>{_mm_fmadd_ps(mul.raw, x.raw, add.raw)}; +#endif +} +template <size_t N> +HWY_API Vec128<double, N> MulAdd(Vec128<double, N> mul, Vec128<double, N> x, + Vec128<double, N> add) { +#if HWY_TARGET >= HWY_SSE4 + return mul * x + add; +#else + return Vec128<double, N>{_mm_fmadd_pd(mul.raw, x.raw, add.raw)}; +#endif +} + +// Returns add - mul * x +template <size_t N> +HWY_API Vec128<float, N> NegMulAdd(Vec128<float, N> mul, Vec128<float, N> x, + Vec128<float, N> add) { +#if HWY_TARGET >= HWY_SSE4 + return add - mul * x; +#else + return Vec128<float, N>{_mm_fnmadd_ps(mul.raw, x.raw, add.raw)}; +#endif +} +template <size_t N> +HWY_API Vec128<double, N> NegMulAdd(Vec128<double, N> mul, Vec128<double, N> x, + Vec128<double, N> add) { +#if HWY_TARGET >= HWY_SSE4 + return add - mul * x; +#else + return Vec128<double, N>{_mm_fnmadd_pd(mul.raw, x.raw, add.raw)}; +#endif +} + +// Returns mul * x - sub +template <size_t N> +HWY_API Vec128<float, N> MulSub(Vec128<float, N> mul, Vec128<float, N> x, + Vec128<float, N> sub) { +#if HWY_TARGET >= HWY_SSE4 + return mul * x - sub; +#else + return Vec128<float, N>{_mm_fmsub_ps(mul.raw, x.raw, sub.raw)}; +#endif +} +template <size_t N> +HWY_API Vec128<double, N> MulSub(Vec128<double, N> mul, Vec128<double, N> x, + Vec128<double, N> sub) { +#if HWY_TARGET >= HWY_SSE4 + return mul * x - sub; +#else + return Vec128<double, N>{_mm_fmsub_pd(mul.raw, x.raw, sub.raw)}; +#endif +} + +// Returns -mul * x - sub +template <size_t N> +HWY_API Vec128<float, N> NegMulSub(Vec128<float, N> mul, Vec128<float, N> x, + Vec128<float, N>
sub) { +#if HWY_TARGET >= HWY_SSE4 + return Neg(mul) * x - sub; +#else + return Vec128<float, N>{_mm_fnmsub_ps(mul.raw, x.raw, sub.raw)}; +#endif +} +template <size_t N> +HWY_API Vec128<double, N> NegMulSub(Vec128<double, N> mul, Vec128<double, N> x, + Vec128<double, N> sub) { +#if HWY_TARGET >= HWY_SSE4 + return Neg(mul) * x - sub; +#else + return Vec128<double, N>{_mm_fnmsub_pd(mul.raw, x.raw, sub.raw)}; +#endif +} + +// ------------------------------ Floating-point square root + +// Full precision square root +template <size_t N> +HWY_API Vec128<float, N> Sqrt(Vec128<float, N> v) { + return Vec128<float, N>{_mm_sqrt_ps(v.raw)}; +} +HWY_API Vec128<float, 1> Sqrt(Vec128<float, 1> v) { + return Vec128<float, 1>{_mm_sqrt_ss(v.raw)}; +} +template <size_t N> +HWY_API Vec128<double, N> Sqrt(Vec128<double, N> v) { + return Vec128<double, N>{_mm_sqrt_pd(v.raw)}; +} +HWY_API Vec64<double> Sqrt(Vec64<double> v) { + // The first operand only supplies the upper lane; lane 0 is sqrt of v. + return Vec64<double>{_mm_sqrt_sd(_mm_setzero_pd(), v.raw)}; +} + +// Approximate reciprocal square root +template <size_t N> +HWY_API Vec128<float, N> ApproximateReciprocalSqrt(Vec128<float, N> v) { + return Vec128<float, N>{_mm_rsqrt_ps(v.raw)}; +} +HWY_API Vec128<float, 1> ApproximateReciprocalSqrt(Vec128<float, 1> v) { + return Vec128<float, 1>{_mm_rsqrt_ss(v.raw)}; +} + +// ------------------------------ Min (Gt, IfThenElse) + +namespace detail { + +template <typename T, size_t N> +HWY_INLINE HWY_MAYBE_UNUSED Vec128<T, N> MinU(const Vec128<T, N> a, + const Vec128<T, N> b) { + const DFromV<decltype(a)> d; + const RebindToUnsigned<decltype(d)> du; + const RebindToSigned<decltype(d)> di; + // Flipping the MSB of both inputs lets the signed compare below produce the + // unsigned ordering. + const auto msb = Set(du, static_cast<T>(T(1) << (sizeof(T) * 8 - 1))); + const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb)); + return IfThenElse(gt, b, a); +} + +} // namespace detail + +// Unsigned +template <size_t N> +HWY_API Vec128<uint8_t, N> Min(Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) { + return Vec128<uint8_t, N>{_mm_min_epu8(a.raw,
b.raw)}; +} +template <size_t N> +HWY_API Vec128<uint16_t, N> Min(Vec128<uint16_t, N> a, Vec128<uint16_t, N> b) { +#if HWY_TARGET >= HWY_SSSE3 + return detail::MinU(a, b); +#else + return Vec128<uint16_t, N>{_mm_min_epu16(a.raw, b.raw)}; +#endif +} +template <size_t N> +HWY_API Vec128<uint32_t, N> Min(Vec128<uint32_t, N> a, Vec128<uint32_t, N> b) { +#if HWY_TARGET >= HWY_SSSE3 + return detail::MinU(a, b); +#else + return Vec128<uint32_t, N>{_mm_min_epu32(a.raw, b.raw)}; +#endif +} +template <size_t N> +HWY_API Vec128<uint64_t, N> Min(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) { +#if HWY_TARGET <= HWY_AVX3 + return Vec128<uint64_t, N>{_mm_min_epu64(a.raw, b.raw)}; +#else + return detail::MinU(a, b); +#endif +} + +// Signed +template <size_t N> +HWY_API Vec128<int8_t, N> Min(Vec128<int8_t, N> a, Vec128<int8_t, N> b) { +#if HWY_TARGET >= HWY_SSSE3 + return IfThenElse(a < b, a, b); +#else + return Vec128<int8_t, N>{_mm_min_epi8(a.raw, b.raw)}; +#endif +} +template <size_t N> +HWY_API Vec128<int16_t, N> Min(Vec128<int16_t, N> a, Vec128<int16_t, N> b) { + return Vec128<int16_t, N>{_mm_min_epi16(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<int32_t, N> Min(Vec128<int32_t, N> a, Vec128<int32_t, N> b) { +#if HWY_TARGET >= HWY_SSSE3 + return IfThenElse(a < b, a, b); +#else + return Vec128<int32_t, N>{_mm_min_epi32(a.raw, b.raw)}; +#endif +} +template <size_t N> +HWY_API Vec128<int64_t, N> Min(Vec128<int64_t, N> a, Vec128<int64_t, N> b) { +#if HWY_TARGET <= HWY_AVX3 + return Vec128<int64_t, N>{_mm_min_epi64(a.raw, b.raw)}; +#else + return IfThenElse(a < b, a, b); +#endif +} + +// Float +template <size_t N> +HWY_API Vec128<float, N> Min(Vec128<float, N> a, Vec128<float, N> b) { + return Vec128<float, N>{_mm_min_ps(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<double, N> Min(Vec128<double, N> a, Vec128<double, N> b) { + return Vec128<double, N>{_mm_min_pd(a.raw, b.raw)}; +} + +// ------------------------------ Max (Gt, IfThenElse) + +namespace detail { +// MaxU uses the same MSB-flip comparison as MinU but returns the larger input.
+template <typename T, size_t N> +HWY_INLINE HWY_MAYBE_UNUSED Vec128<T, N> MaxU(const Vec128<T, N> a, + const Vec128<T, N> b) { + const DFromV<decltype(a)> d; + const RebindToUnsigned<decltype(d)> du; + const RebindToSigned<decltype(d)> di; + const auto msb = Set(du, static_cast<T>(T(1) << (sizeof(T) * 8 - 1))); + const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb)); + return IfThenElse(gt, a, b); +} + +} // namespace detail + +// Unsigned +template <size_t N> +HWY_API Vec128<uint8_t, N> Max(Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) { + return Vec128<uint8_t, N>{_mm_max_epu8(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<uint16_t, N> Max(Vec128<uint16_t, N> a, Vec128<uint16_t, N> b) { +#if HWY_TARGET >= HWY_SSSE3 + return detail::MaxU(a, b); +#else + return Vec128<uint16_t, N>{_mm_max_epu16(a.raw, b.raw)}; +#endif +} +template <size_t N> +HWY_API Vec128<uint32_t, N> Max(Vec128<uint32_t, N> a, Vec128<uint32_t, N> b) { +#if HWY_TARGET >= HWY_SSSE3 + return detail::MaxU(a, b); +#else + return Vec128<uint32_t, N>{_mm_max_epu32(a.raw, b.raw)}; +#endif +} +template <size_t N> +HWY_API Vec128<uint64_t, N> Max(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) { +#if HWY_TARGET <= HWY_AVX3 + return Vec128<uint64_t, N>{_mm_max_epu64(a.raw, b.raw)}; +#else + return detail::MaxU(a, b); +#endif +} + +// Signed +template <size_t N> +HWY_API Vec128<int8_t, N> Max(Vec128<int8_t, N> a, Vec128<int8_t, N> b) { +#if HWY_TARGET >= HWY_SSSE3 + return IfThenElse(a < b, b, a); +#else + return Vec128<int8_t, N>{_mm_max_epi8(a.raw, b.raw)}; +#endif +} +template <size_t N> +HWY_API Vec128<int16_t, N> Max(Vec128<int16_t, N> a, Vec128<int16_t, N> b) { + return Vec128<int16_t, N>{_mm_max_epi16(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<int32_t, N> Max(Vec128<int32_t, N> a, Vec128<int32_t, N> b) { +#if HWY_TARGET >= HWY_SSSE3 + return IfThenElse(a < b, b, a); +#else + return Vec128<int32_t, N>{_mm_max_epi32(a.raw, b.raw)}; +#endif +} +template <size_t
N> +HWY_API Vec128<int64_t, N> Max(Vec128<int64_t, N> a, Vec128<int64_t, N> b) { +#if HWY_TARGET <= HWY_AVX3 + return Vec128<int64_t, N>{_mm_max_epi64(a.raw, b.raw)}; +#else + return IfThenElse(a < b, b, a); +#endif +} + +// Float +template <size_t N> +HWY_API Vec128<float, N> Max(Vec128<float, N> a, Vec128<float, N> b) { + return Vec128<float, N>{_mm_max_ps(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<double, N> Max(Vec128<double, N> a, Vec128<double, N> b) { + return Vec128<double, N>{_mm_max_pd(a.raw, b.raw)}; +} + +// ================================================== MEMORY (3) + +// ------------------------------ Non-temporal stores + +// On clang6, we see incorrect code generated for _mm_stream_pi, so +// round even partial vectors up to 16 bytes. +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_FLOAT_D(D)> +HWY_API void Stream(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT aligned) { + _mm_stream_si128(reinterpret_cast<__m128i*>(aligned), v.raw); +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_API void Stream(VFromD<D> v, D /* tag */, float* HWY_RESTRICT aligned) { + _mm_stream_ps(aligned, v.raw); +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_API void Stream(VFromD<D> v, D /* tag */, double* HWY_RESTRICT aligned) { + _mm_stream_pd(aligned, v.raw); +} + +// ------------------------------ Scatter + +// Work around warnings in the intrinsic definitions (passing -1 as a mask). +HWY_DIAGNOSTICS(push) +HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") + +// Unfortunately the GCC/Clang intrinsics do not accept int64_t*.
+using GatherIndex64 = long long int; // NOLINT(runtime/int) +static_assert(sizeof(GatherIndex64) == 8, "Must be 64-bit type"); + +#if HWY_TARGET <= HWY_AVX3 +namespace detail { + +template <int kScale, class D, class VI, HWY_IF_UI32_D(D)> +HWY_INLINE void NativeScatter128(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT base, + VI index) { + if (d.MaxBytes() == 16) { + _mm_i32scatter_epi32(base, index.raw, v.raw, kScale); + } else { + const __mmask8 mask = (1u << MaxLanes(d)) - 1; + _mm_mask_i32scatter_epi32(base, mask, index.raw, v.raw, kScale); + } +} + +template <int kScale, class D, class VI, HWY_IF_UI64_D(D)> +HWY_INLINE void NativeScatter128(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT base, + VI index) { + if (d.MaxBytes() == 16) { + _mm_i64scatter_epi64(base, index.raw, v.raw, kScale); + } else { + const __mmask8 mask = (1u << MaxLanes(d)) - 1; + _mm_mask_i64scatter_epi64(base, mask, index.raw, v.raw, kScale); + } +} + +template <int kScale, class D, class VI, HWY_IF_F32_D(D)> +HWY_INLINE void NativeScatter128(VFromD<D> v, D d, float* HWY_RESTRICT base, + VI index) { + if (d.MaxBytes() == 16) { + _mm_i32scatter_ps(base, index.raw, v.raw, kScale); + } else { + const __mmask8 mask = (1u << MaxLanes(d)) - 1; + _mm_mask_i32scatter_ps(base, mask, index.raw, v.raw, kScale); + } +} + +template <int kScale, class D, class VI, HWY_IF_F64_D(D)> +HWY_INLINE void NativeScatter128(VFromD<D> v, D d, double* HWY_RESTRICT base, + VI index) { + if (d.MaxBytes() == 16) { + _mm_i64scatter_pd(base, index.raw, v.raw, kScale); + } else { + const __mmask8 mask = (1u << MaxLanes(d)) - 1; + _mm_mask_i64scatter_pd(base, mask, index.raw, v.raw, kScale); + } +} + +} // namespace detail + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), typename T = TFromD<D>, class VI> +HWY_API void ScatterOffset(VFromD<D> v, D d, T* HWY_RESTRICT base, VI offset) { + static_assert(sizeof(T) == sizeof(TFromV<VI>), "Index/lane size must match"); + return detail::NativeScatter128<1>(v, d, base, offset); +} 
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), typename T = TFromD<D>, class VI> +HWY_API void ScatterIndex(VFromD<D> v, D d, T* HWY_RESTRICT base, VI index) { + static_assert(sizeof(T) == sizeof(TFromV<VI>), "Index/lane size must match"); + return detail::NativeScatter128<sizeof(T)>(v, d, base, index); +} + +#else // HWY_TARGET <= HWY_AVX3 + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), typename T = TFromD<D>, class VI> +HWY_API void ScatterOffset(VFromD<D> v, D d, T* HWY_RESTRICT base, VI offset) { + using TI = TFromV<VI>; + static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match"); + + alignas(16) T lanes[MaxLanes(d)]; + Store(v, d, lanes); + + alignas(16) TI offset_lanes[MaxLanes(d)]; + Store(offset, Rebind<TI, decltype(d)>(), offset_lanes); + + uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base); + for (size_t i = 0; i < MaxLanes(d); ++i) { + CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]); + } +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), typename T = TFromD<D>, class VI> +HWY_API void ScatterIndex(VFromD<D> v, D d, T* HWY_RESTRICT base, VI index) { + using TI = TFromV<VI>; + static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match"); + + alignas(16) T lanes[MaxLanes(d)]; + Store(v, d, lanes); + + alignas(16) TI index_lanes[MaxLanes(d)]; + Store(index, Rebind<TI, decltype(d)>(), index_lanes); + + for (size_t i = 0; i < MaxLanes(d); ++i) { + base[index_lanes[i]] = lanes[i]; + } +} + +#endif + +// ------------------------------ Gather (Load/Store) + +#if HWY_TARGET >= HWY_SSE4 + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), typename T = TFromD<D>, class VI> +HWY_API VFromD<D> GatherOffset(D d, const T* HWY_RESTRICT base, VI offset) { + using TI = TFromV<VI>; + static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match"); + + alignas(16) TI offset_lanes[MaxLanes(d)]; + Store(offset, Rebind<TI, decltype(d)>(), offset_lanes); + + alignas(16) T lanes[MaxLanes(d)]; + const uint8_t* base_bytes = 
reinterpret_cast<const uint8_t*>(base); + for (size_t i = 0; i < MaxLanes(d); ++i) { + CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]); + } + return Load(d, lanes); +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), typename T = TFromD<D>, class VI> +HWY_API VFromD<D> GatherIndex(D d, const T* HWY_RESTRICT base, VI index) { + using TI = TFromV<VI>; + static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match"); + + alignas(16) TI index_lanes[MaxLanes(d)]; + Store(index, Rebind<TI, decltype(d)>(), index_lanes); + + alignas(16) T lanes[MaxLanes(d)]; + for (size_t i = 0; i < MaxLanes(d); ++i) { + lanes[i] = base[index_lanes[i]]; + } + return Load(d, lanes); +} + +#else + +namespace detail { + +template <int kScale, class D, class VI, HWY_IF_UI32_D(D)> +HWY_INLINE VFromD<D> NativeGather128(D /* tag */, + const TFromD<D>* HWY_RESTRICT base, + VI index) { + return VFromD<D>{_mm_i32gather_epi32(reinterpret_cast<const int32_t*>(base), + index.raw, kScale)}; +} + +template <int kScale, class D, class VI, HWY_IF_UI64_D(D)> +HWY_INLINE VFromD<D> NativeGather128(D /* tag */, + const TFromD<D>* HWY_RESTRICT base, + VI index) { + return VFromD<D>{_mm_i64gather_epi64( + reinterpret_cast<const GatherIndex64*>(base), index.raw, kScale)}; +} + +template <int kScale, class D, class VI, HWY_IF_F32_D(D)> +HWY_INLINE VFromD<D> NativeGather128(D /* tag */, + const float* HWY_RESTRICT base, VI index) { + return VFromD<D>{_mm_i32gather_ps(base, index.raw, kScale)}; +} + +template <int kScale, class D, class VI, HWY_IF_F64_D(D)> +HWY_INLINE VFromD<D> NativeGather128(D /* tag */, + const double* HWY_RESTRICT base, + VI index) { + return VFromD<D>{_mm_i64gather_pd(base, index.raw, kScale)}; +} + +} // namespace detail + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), typename T = TFromD<D>, class VI> +HWY_API VFromD<D> GatherOffset(D d, const T* HWY_RESTRICT base, VI offset) { + static_assert(sizeof(T) == sizeof(TFromV<VI>), "Index/lane size must match"); + return 
detail::NativeGather128<1>(d, base, offset); +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), typename T = TFromD<D>, class VI> +HWY_API VFromD<D> GatherIndex(D d, const T* HWY_RESTRICT base, VI index) { + static_assert(sizeof(T) == sizeof(TFromV<VI>), "Index/lane size must match"); + return detail::NativeGather128<sizeof(T)>(d, base, index); +} + +#endif // HWY_TARGET >= HWY_SSE4 + +HWY_DIAGNOSTICS(pop) + +// ================================================== SWIZZLE (2) + +// ------------------------------ LowerHalf + +template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> +HWY_API VFromD<D> LowerHalf(D /* tag */, VFromD<Twice<D>> v) { + return VFromD<D>{v.raw}; +} +template <typename T, size_t N> +HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) { + return Vec128<T, N / 2>{v.raw}; +} + +// ------------------------------ ShiftLeftBytes + +template <int kBytes, class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_API VFromD<D> ShiftLeftBytes(D d, VFromD<D> v) { + static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); + const RebindToUnsigned<decltype(d)> du; + return BitCast( + d, VFromD<decltype(du)>{_mm_slli_si128(BitCast(du, v).raw, kBytes)}); +} + +template <int kBytes, typename T, size_t N> +HWY_API Vec128<T, N> ShiftLeftBytes(const Vec128<T, N> v) { + return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v); +} + +// ------------------------------ ShiftLeftLanes + +template <int kLanes, class D, typename T = TFromD<D>, + HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_API VFromD<D> ShiftLeftLanes(D d, const VFromD<D> v) { + const Repartition<uint8_t, decltype(d)> d8; + return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v))); +} + +template <int kLanes, typename T, size_t N> +HWY_API Vec128<T, N> ShiftLeftLanes(const Vec128<T, N> v) { + return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v); +} + +// ------------------------------ ShiftRightBytes +template <int kBytes, class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_API VFromD<D> ShiftRightBytes(D d, VFromD<D> v) { + 
static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); + const RebindToUnsigned<decltype(d)> du; + // For partial vectors, clear upper lanes so we shift in zeros. + if (d.MaxBytes() != 16) { + const Full128<TFromD<D>> dfull; + const VFromD<decltype(dfull)> vfull{v.raw}; + v = VFromD<D>{IfThenElseZero(FirstN(dfull, MaxLanes(d)), vfull).raw}; + } + return BitCast( + d, VFromD<decltype(du)>{_mm_srli_si128(BitCast(du, v).raw, kBytes)}); +} + +// ------------------------------ ShiftRightLanes +template <int kLanes, class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_API VFromD<D> ShiftRightLanes(D d, const VFromD<D> v) { + const Repartition<uint8_t, decltype(d)> d8; + constexpr size_t kBytes = kLanes * sizeof(TFromD<D>); + return BitCast(d, ShiftRightBytes<kBytes>(d8, BitCast(d8, v))); +} + +// ------------------------------ UpperHalf (ShiftRightBytes) + +// Full input: copy hi into lo (smaller instruction encoding than shifts). +template <class D, typename T = TFromD<D>> +HWY_API Vec64<T> UpperHalf(D /* tag */, Vec128<T> v) { + return Vec64<T>{_mm_unpackhi_epi64(v.raw, v.raw)}; +} +template <class D> +HWY_API Vec64<float> UpperHalf(D /* tag */, Vec128<float> v) { + return Vec64<float>{_mm_movehl_ps(v.raw, v.raw)}; +} +template <class D> +HWY_API Vec64<double> UpperHalf(D /* tag */, Vec128<double> v) { + return Vec64<double>{_mm_unpackhi_pd(v.raw, v.raw)}; +} + +// Partial +template <class D, HWY_IF_V_SIZE_LE_D(D, 4)> +HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) { + return LowerHalf(d, ShiftRightBytes<d.MaxBytes()>(Twice<D>(), v)); +} + +// ------------------------------ ExtractLane (UpperHalf) + +namespace detail { + +template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 1)> +HWY_INLINE T ExtractLane(const Vec128<T, N> v) { + static_assert(kLane < N, "Lane index out of bounds"); +#if HWY_TARGET >= HWY_SSSE3 + const int pair = _mm_extract_epi16(v.raw, kLane / 2); + constexpr int kShift = kLane & 1 ? 
8 : 0; + return static_cast<T>((pair >> kShift) & 0xFF); +#else + return static_cast<T>(_mm_extract_epi8(v.raw, kLane) & 0xFF); +#endif +} + +template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 2)> +HWY_INLINE T ExtractLane(const Vec128<T, N> v) { + static_assert(kLane < N, "Lane index out of bounds"); + return static_cast<T>(_mm_extract_epi16(v.raw, kLane) & 0xFFFF); +} + +template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 4)> +HWY_INLINE T ExtractLane(const Vec128<T, N> v) { + static_assert(kLane < N, "Lane index out of bounds"); +#if HWY_TARGET >= HWY_SSSE3 + alignas(16) T lanes[4]; + Store(v, DFromV<decltype(v)>(), lanes); + return lanes[kLane]; +#else + return static_cast<T>(_mm_extract_epi32(v.raw, kLane)); +#endif +} + +template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 8)> +HWY_INLINE T ExtractLane(const Vec128<T, N> v) { + static_assert(kLane < N, "Lane index out of bounds"); +#if HWY_ARCH_X86_32 + alignas(16) T lanes[2]; + Store(v, DFromV<decltype(v)>(), lanes); + return lanes[kLane]; +#elif HWY_TARGET >= HWY_SSSE3 + return static_cast<T>( + _mm_cvtsi128_si64((kLane == 0) ? v.raw : _mm_shuffle_epi32(v.raw, 0xEE))); +#else + return static_cast<T>(_mm_extract_epi64(v.raw, kLane)); +#endif +} + +template <size_t kLane, size_t N> +HWY_INLINE float ExtractLane(const Vec128<float, N> v) { + static_assert(kLane < N, "Lane index out of bounds"); +#if HWY_TARGET >= HWY_SSSE3 + alignas(16) float lanes[4]; + Store(v, DFromV<decltype(v)>(), lanes); + return lanes[kLane]; +#else + // Bug in the intrinsic, returns int but should be float. + const int32_t bits = _mm_extract_ps(v.raw, kLane); + float ret; + CopySameSize(&bits, &ret); + return ret; +#endif +} + +// There is no extract_pd; two overloads because there is no UpperHalf for N=1. 
+template <size_t kLane> +HWY_INLINE double ExtractLane(const Vec128<double, 1> v) { + static_assert(kLane == 0, "Lane index out of bounds"); + return GetLane(v); +} + +template <size_t kLane> +HWY_INLINE double ExtractLane(const Vec128<double> v) { + static_assert(kLane < 2, "Lane index out of bounds"); + const Half<DFromV<decltype(v)>> dh; + return kLane == 0 ? GetLane(v) : GetLane(UpperHalf(dh, v)); +} + +} // namespace detail + +// Requires one overload per vector length because ExtractLane<3> may be a +// compile error if it calls _mm_extract_epi64. +template <typename T> +HWY_API T ExtractLane(const Vec128<T, 1> v, size_t i) { + HWY_DASSERT(i == 0); + (void)i; + return GetLane(v); +} + +template <typename T> +HWY_API T ExtractLane(const Vec128<T, 2> v, size_t i) { +#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang + if (__builtin_constant_p(i)) { + switch (i) { + case 0: + return detail::ExtractLane<0>(v); + case 1: + return detail::ExtractLane<1>(v); + } + } +#endif + alignas(16) T lanes[2]; + Store(v, DFromV<decltype(v)>(), lanes); + return lanes[i]; +} + +template <typename T> +HWY_API T ExtractLane(const Vec128<T, 4> v, size_t i) { +#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang + if (__builtin_constant_p(i)) { + switch (i) { + case 0: + return detail::ExtractLane<0>(v); + case 1: + return detail::ExtractLane<1>(v); + case 2: + return detail::ExtractLane<2>(v); + case 3: + return detail::ExtractLane<3>(v); + } + } +#endif + alignas(16) T lanes[4]; + Store(v, DFromV<decltype(v)>(), lanes); + return lanes[i]; +} + +template <typename T> +HWY_API T ExtractLane(const Vec128<T, 8> v, size_t i) { +#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang + if (__builtin_constant_p(i)) { + switch (i) { + case 0: + return detail::ExtractLane<0>(v); + case 1: + return detail::ExtractLane<1>(v); + case 2: + return detail::ExtractLane<2>(v); + case 3: + return detail::ExtractLane<3>(v); + case 4: + return detail::ExtractLane<4>(v); + 
case 5: + return detail::ExtractLane<5>(v); + case 6: + return detail::ExtractLane<6>(v); + case 7: + return detail::ExtractLane<7>(v); + } + } +#endif + alignas(16) T lanes[8]; + Store(v, DFromV<decltype(v)>(), lanes); + return lanes[i]; +} + +template <typename T> +HWY_API T ExtractLane(const Vec128<T, 16> v, size_t i) { +#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang + if (__builtin_constant_p(i)) { + switch (i) { + case 0: + return detail::ExtractLane<0>(v); + case 1: + return detail::ExtractLane<1>(v); + case 2: + return detail::ExtractLane<2>(v); + case 3: + return detail::ExtractLane<3>(v); + case 4: + return detail::ExtractLane<4>(v); + case 5: + return detail::ExtractLane<5>(v); + case 6: + return detail::ExtractLane<6>(v); + case 7: + return detail::ExtractLane<7>(v); + case 8: + return detail::ExtractLane<8>(v); + case 9: + return detail::ExtractLane<9>(v); + case 10: + return detail::ExtractLane<10>(v); + case 11: + return detail::ExtractLane<11>(v); + case 12: + return detail::ExtractLane<12>(v); + case 13: + return detail::ExtractLane<13>(v); + case 14: + return detail::ExtractLane<14>(v); + case 15: + return detail::ExtractLane<15>(v); + } + } +#endif + alignas(16) T lanes[16]; + Store(v, DFromV<decltype(v)>(), lanes); + return lanes[i]; +} + +// ------------------------------ InsertLane (UpperHalf) + +namespace detail { + +template <class V> +HWY_INLINE V InsertLaneUsingBroadcastAndBlend(V v, size_t i, TFromV<V> t) { + const DFromV<decltype(v)> d; + +#if HWY_TARGET <= HWY_AVX3 + using RawMask = decltype(MaskFromVec(VFromD<decltype(d)>()).raw); + const auto mask = MFromD<decltype(d)>{static_cast<RawMask>(uint64_t{1} << i)}; +#else + const RebindToUnsigned<decltype(d)> du; + using TU = TFromD<decltype(du)>; + const auto mask = RebindMask(d, Iota(du, 0) == Set(du, static_cast<TU>(i))); +#endif + + return IfThenElse(mask, Set(d, t), v); +} + +template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 1)> +HWY_INLINE Vec128<T, N> 
InsertLane(const Vec128<T, N> v, T t) { + static_assert(kLane < N, "Lane index out of bounds"); +#if HWY_TARGET >= HWY_SSSE3 + return InsertLaneUsingBroadcastAndBlend(v, kLane, t); +#else + return Vec128<T, N>{_mm_insert_epi8(v.raw, t, kLane)}; +#endif +} + +template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 2)> +HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) { + static_assert(kLane < N, "Lane index out of bounds"); + return Vec128<T, N>{_mm_insert_epi16(v.raw, t, kLane)}; +} + +template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 4)> +HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) { + static_assert(kLane < N, "Lane index out of bounds"); +#if HWY_TARGET >= HWY_SSSE3 + return InsertLaneUsingBroadcastAndBlend(v, kLane, t); +#else + MakeSigned<T> ti; + CopySameSize(&t, &ti); // don't just cast because T might be float. + return Vec128<T, N>{_mm_insert_epi32(v.raw, ti, kLane)}; +#endif +} + +template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 8)> +HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) { + static_assert(kLane < N, "Lane index out of bounds"); +#if HWY_TARGET >= HWY_SSSE3 || HWY_ARCH_X86_32 + const DFromV<decltype(v)> d; + const RebindToFloat<decltype(d)> df; + const auto vt = BitCast(df, Set(d, t)); + if (kLane == 0) { + return BitCast( + d, Vec128<double, N>{_mm_shuffle_pd(vt.raw, BitCast(df, v).raw, 2)}); + } + return BitCast( + d, Vec128<double, N>{_mm_shuffle_pd(BitCast(df, v).raw, vt.raw, 0)}); +#else + MakeSigned<T> ti; + CopySameSize(&t, &ti); // don't just cast because T might be float. 
+ return Vec128<T, N>{_mm_insert_epi64(v.raw, ti, kLane)}; +#endif +} + +template <size_t kLane, size_t N> +HWY_INLINE Vec128<float, N> InsertLane(const Vec128<float, N> v, float t) { + static_assert(kLane < N, "Lane index out of bounds"); +#if HWY_TARGET >= HWY_SSSE3 + return InsertLaneUsingBroadcastAndBlend(v, kLane, t); +#else + return Vec128<float, N>{_mm_insert_ps(v.raw, _mm_set_ss(t), kLane << 4)}; +#endif +} + +// There is no insert_pd; two overloads because there is no UpperHalf for N=1. +template <size_t kLane> +HWY_INLINE Vec128<double, 1> InsertLane(const Vec128<double, 1> v, double t) { + static_assert(kLane == 0, "Lane index out of bounds"); + return Set(DFromV<decltype(v)>(), t); +} + +template <size_t kLane> +HWY_INLINE Vec128<double> InsertLane(const Vec128<double> v, double t) { + static_assert(kLane < 2, "Lane index out of bounds"); + const DFromV<decltype(v)> d; + const Vec128<double> vt = Set(d, t); + if (kLane == 0) { + return Vec128<double>{_mm_shuffle_pd(vt.raw, v.raw, 2)}; + } + return Vec128<double>{_mm_shuffle_pd(v.raw, vt.raw, 0)}; +} + +} // namespace detail + +// Requires one overload per vector length because InsertLane<3> may be a +// compile error if it calls _mm_insert_epi64. 
+ +template <typename T> +HWY_API Vec128<T, 1> InsertLane(const Vec128<T, 1> v, size_t i, T t) { + HWY_DASSERT(i == 0); + (void)i; + return Set(DFromV<decltype(v)>(), t); +} + +template <typename T> +HWY_API Vec128<T, 2> InsertLane(const Vec128<T, 2> v, size_t i, T t) { +#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang + if (__builtin_constant_p(i)) { + switch (i) { + case 0: + return detail::InsertLane<0>(v, t); + case 1: + return detail::InsertLane<1>(v, t); + } + } +#endif + return detail::InsertLaneUsingBroadcastAndBlend(v, i, t); +} + +template <typename T> +HWY_API Vec128<T, 4> InsertLane(const Vec128<T, 4> v, size_t i, T t) { +#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang + if (__builtin_constant_p(i)) { + switch (i) { + case 0: + return detail::InsertLane<0>(v, t); + case 1: + return detail::InsertLane<1>(v, t); + case 2: + return detail::InsertLane<2>(v, t); + case 3: + return detail::InsertLane<3>(v, t); + } + } +#endif + return detail::InsertLaneUsingBroadcastAndBlend(v, i, t); +} + +template <typename T> +HWY_API Vec128<T, 8> InsertLane(const Vec128<T, 8> v, size_t i, T t) { +#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang + if (__builtin_constant_p(i)) { + switch (i) { + case 0: + return detail::InsertLane<0>(v, t); + case 1: + return detail::InsertLane<1>(v, t); + case 2: + return detail::InsertLane<2>(v, t); + case 3: + return detail::InsertLane<3>(v, t); + case 4: + return detail::InsertLane<4>(v, t); + case 5: + return detail::InsertLane<5>(v, t); + case 6: + return detail::InsertLane<6>(v, t); + case 7: + return detail::InsertLane<7>(v, t); + } + } +#endif + return detail::InsertLaneUsingBroadcastAndBlend(v, i, t); +} + +template <typename T> +HWY_API Vec128<T, 16> InsertLane(const Vec128<T, 16> v, size_t i, T t) { +#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang + if (__builtin_constant_p(i)) { + switch (i) { + case 0: + return detail::InsertLane<0>(v, t); + case 1: + return 
detail::InsertLane<1>(v, t); + case 2: + return detail::InsertLane<2>(v, t); + case 3: + return detail::InsertLane<3>(v, t); + case 4: + return detail::InsertLane<4>(v, t); + case 5: + return detail::InsertLane<5>(v, t); + case 6: + return detail::InsertLane<6>(v, t); + case 7: + return detail::InsertLane<7>(v, t); + case 8: + return detail::InsertLane<8>(v, t); + case 9: + return detail::InsertLane<9>(v, t); + case 10: + return detail::InsertLane<10>(v, t); + case 11: + return detail::InsertLane<11>(v, t); + case 12: + return detail::InsertLane<12>(v, t); + case 13: + return detail::InsertLane<13>(v, t); + case 14: + return detail::InsertLane<14>(v, t); + case 15: + return detail::InsertLane<15>(v, t); + } + } +#endif + return detail::InsertLaneUsingBroadcastAndBlend(v, i, t); +} + +// ------------------------------ CombineShiftRightBytes + +#if HWY_TARGET == HWY_SSE2 +template <int kBytes, class D, typename T = TFromD<D>> +HWY_API Vec128<T> CombineShiftRightBytes(D d, Vec128<T> hi, Vec128<T> lo) { + static_assert(0 < kBytes && kBytes < 16, "kBytes invalid"); + return Or(ShiftRightBytes<kBytes>(d, lo), ShiftLeftBytes<16 - kBytes>(d, hi)); +} +template <int kBytes, class D, HWY_IF_V_SIZE_LE_D(D, 8)> +HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) { + constexpr size_t kSize = d.MaxBytes(); + static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid"); + + const Twice<decltype(d)> dt; + return VFromD<D>{ShiftRightBytes<kBytes>(dt, Combine(dt, hi, lo)).raw}; +} +#else +template <int kBytes, class D, typename T = TFromD<D>> +HWY_API Vec128<T> CombineShiftRightBytes(D d, Vec128<T> hi, Vec128<T> lo) { + const Repartition<uint8_t, decltype(d)> d8; + return BitCast(d, Vec128<uint8_t>{_mm_alignr_epi8( + BitCast(d8, hi).raw, BitCast(d8, lo).raw, kBytes)}); +} + +template <int kBytes, class D, HWY_IF_V_SIZE_LE_D(D, 8)> +HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) { + constexpr size_t kSize = d.MaxBytes(); + 
static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid"); + const Repartition<uint8_t, decltype(d)> d8; + using V8 = Vec128<uint8_t>; + const DFromV<V8> dfull8; + const Repartition<TFromD<D>, decltype(dfull8)> dfull; + const V8 hi8{BitCast(d8, hi).raw}; + // Move into most-significant bytes + const V8 lo8 = ShiftLeftBytes<16 - kSize>(V8{BitCast(d8, lo).raw}); + const V8 r = CombineShiftRightBytes<16 - kSize + kBytes>(dfull8, hi8, lo8); + return VFromD<D>{BitCast(dfull, r).raw}; +} +#endif + +// ------------------------------ Broadcast/splat any lane + +// Unsigned +template <int kLane, size_t N> +HWY_API Vec128<uint16_t, N> Broadcast(const Vec128<uint16_t, N> v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); + if (kLane < 4) { + const __m128i lo = _mm_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF); + return Vec128<uint16_t, N>{_mm_unpacklo_epi64(lo, lo)}; + } else { + const __m128i hi = _mm_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF); + return Vec128<uint16_t, N>{_mm_unpackhi_epi64(hi, hi)}; + } +} +template <int kLane, size_t N> +HWY_API Vec128<uint32_t, N> Broadcast(const Vec128<uint32_t, N> v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); + return Vec128<uint32_t, N>{_mm_shuffle_epi32(v.raw, 0x55 * kLane)}; +} +template <int kLane, size_t N> +HWY_API Vec128<uint64_t, N> Broadcast(const Vec128<uint64_t, N> v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); + return Vec128<uint64_t, N>{_mm_shuffle_epi32(v.raw, kLane ? 
0xEE : 0x44)}; +} + +// Signed +template <int kLane, size_t N> +HWY_API Vec128<int16_t, N> Broadcast(const Vec128<int16_t, N> v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); + if (kLane < 4) { + const __m128i lo = _mm_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF); + return Vec128<int16_t, N>{_mm_unpacklo_epi64(lo, lo)}; + } else { + const __m128i hi = _mm_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF); + return Vec128<int16_t, N>{_mm_unpackhi_epi64(hi, hi)}; + } +} +template <int kLane, size_t N> +HWY_API Vec128<int32_t, N> Broadcast(const Vec128<int32_t, N> v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); + return Vec128<int32_t, N>{_mm_shuffle_epi32(v.raw, 0x55 * kLane)}; +} +template <int kLane, size_t N> +HWY_API Vec128<int64_t, N> Broadcast(const Vec128<int64_t, N> v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); + return Vec128<int64_t, N>{_mm_shuffle_epi32(v.raw, kLane ? 0xEE : 0x44)}; +} + +// Float +template <int kLane, size_t N> +HWY_API Vec128<float, N> Broadcast(const Vec128<float, N> v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); + return Vec128<float, N>{_mm_shuffle_ps(v.raw, v.raw, 0x55 * kLane)}; +} +template <int kLane, size_t N> +HWY_API Vec128<double, N> Broadcast(const Vec128<double, N> v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); + return Vec128<double, N>{_mm_shuffle_pd(v.raw, v.raw, 3 * kLane)}; +} + +// ------------------------------ TableLookupLanes (Shuffle01) + +// Returned by SetTableIndices/IndicesFromVec for use by TableLookupLanes. 
+template <typename T, size_t N = 16 / sizeof(T)> +struct Indices128 { + __m128i raw; +}; + +template <class D, typename T = TFromD<D>, typename TI, size_t kN, + HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE(T, 1)> +HWY_API Indices128<T, kN> IndicesFromVec(D d, Vec128<TI, kN> vec) { + static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); +#if HWY_IS_DEBUG_BUILD + const Rebind<TI, decltype(d)> di; + HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) && + AllTrue(di, Lt(vec, Set(di, kN * 2)))); +#endif + + // No change as byte indices are always used for 8-bit lane types + (void)d; + return Indices128<T, kN>{vec.raw}; +} + +template <class D, typename T = TFromD<D>, typename TI, size_t kN, + HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE(T, 2)> +HWY_API Indices128<T, kN> IndicesFromVec(D d, Vec128<TI, kN> vec) { + static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); +#if HWY_IS_DEBUG_BUILD + const Rebind<TI, decltype(d)> di; + HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) && + AllTrue(di, Lt(vec, Set(di, kN * 2)))); +#endif + +#if HWY_TARGET <= HWY_AVX3 || HWY_TARGET == HWY_SSE2 + (void)d; + return Indices128<T, kN>{vec.raw}; +#else // SSSE3, SSE4, or AVX2 + const Repartition<uint8_t, decltype(d)> d8; + using V8 = VFromD<decltype(d8)>; + alignas(16) static constexpr uint8_t kByteOffsets[16] = { + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1}; + + // Broadcast each lane index to all 4 bytes of T + alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = { + 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}; + const V8 lane_indices = TableLookupBytes(vec, Load(d8, kBroadcastLaneBytes)); + + // Shift to bytes + const Repartition<uint16_t, decltype(d)> d16; + const V8 byte_indices = BitCast(d8, ShiftLeft<1>(BitCast(d16, lane_indices))); + + return Indices128<T, kN>{Add(byte_indices, Load(d8, kByteOffsets)).raw}; +#endif // HWY_TARGET <= HWY_AVX3 || HWY_TARGET == HWY_SSE2 +} + +template <class D, typename T = TFromD<D>, typename TI, size_t kN, + 
HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE(T, 4)> +HWY_API Indices128<T, kN> IndicesFromVec(D d, Vec128<TI, kN> vec) { + static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); +#if HWY_IS_DEBUG_BUILD + const Rebind<TI, decltype(d)> di; + HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) && + AllTrue(di, Lt(vec, Set(di, kN * 2)))); +#endif + +#if HWY_TARGET <= HWY_AVX2 || HWY_TARGET == HWY_SSE2 + (void)d; + return Indices128<T, kN>{vec.raw}; +#else + const Repartition<uint8_t, decltype(d)> d8; + using V8 = VFromD<decltype(d8)>; + alignas(16) static constexpr uint8_t kByteOffsets[16] = { + 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3}; + + // Broadcast each lane index to all 4 bytes of T + alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = { + 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12}; + const V8 lane_indices = TableLookupBytes(vec, Load(d8, kBroadcastLaneBytes)); + + // Shift to bytes + const Repartition<uint16_t, decltype(d)> d16; + const V8 byte_indices = BitCast(d8, ShiftLeft<2>(BitCast(d16, lane_indices))); + + return Indices128<T, kN>{Add(byte_indices, Load(d8, kByteOffsets)).raw}; +#endif +} + +template <class D, typename T = TFromD<D>, typename TI, size_t kN, + HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE(T, 8)> +HWY_API Indices128<T, kN> IndicesFromVec(D d, Vec128<TI, kN> vec) { + static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); +#if HWY_IS_DEBUG_BUILD + const Rebind<TI, decltype(d)> di; + HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) && + AllTrue(di, Lt(vec, Set(di, static_cast<TI>(kN * 2))))); +#else + (void)d; +#endif + + // No change - even without AVX3, we can shuffle+blend. 
+ return Indices128<T, kN>{vec.raw}; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), typename TI> +HWY_API Indices128<TFromD<D>, HWY_MAX_LANES_D(D)> SetTableIndices( + D d, const TI* idx) { + static_assert(sizeof(TFromD<D>) == sizeof(TI), "Index size must match lane"); + const Rebind<TI, decltype(d)> di; + return IndicesFromVec(d, LoadU(di, idx)); +} + +template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) { + return TableLookupBytes(v, Vec128<T, N>{idx.raw}); +} + +template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)> +HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) { +#if HWY_TARGET <= HWY_AVX3 + return {_mm_permutexvar_epi16(idx.raw, v.raw)}; +#elif HWY_TARGET == HWY_SSE2 +#if HWY_COMPILER_GCC_ACTUAL && HWY_HAS_BUILTIN(__builtin_shuffle) + typedef uint16_t GccU16RawVectType __attribute__((__vector_size__(16))); + return Vec128<T, N>{reinterpret_cast<typename detail::Raw128<T>::type>( + __builtin_shuffle(reinterpret_cast<GccU16RawVectType>(v.raw), + reinterpret_cast<GccU16RawVectType>(idx.raw)))}; +#else + const Full128<T> d_full; + alignas(16) T src_lanes[8]; + alignas(16) uint16_t indices[8]; + alignas(16) T result_lanes[8]; + + Store(Vec128<T>{v.raw}, d_full, src_lanes); + _mm_store_si128(reinterpret_cast<__m128i*>(indices), idx.raw); + + for (int i = 0; i < 8; i++) { + result_lanes[i] = src_lanes[indices[i] & 7u]; + } + + return Vec128<T, N>{Load(d_full, result_lanes).raw}; +#endif // HWY_COMPILER_GCC_ACTUAL && HWY_HAS_BUILTIN(__builtin_shuffle) +#else + return TableLookupBytes(v, Vec128<T, N>{idx.raw}); +#endif +} + +template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) { +#if HWY_TARGET <= HWY_AVX2 + const DFromV<decltype(v)> d; + const RebindToFloat<decltype(d)> df; + const Vec128<float, N> perm{_mm_permutevar_ps(BitCast(df, v).raw, idx.raw)}; + return BitCast(d, perm); +#elif 
HWY_TARGET == HWY_SSE2 +#if HWY_COMPILER_GCC_ACTUAL && HWY_HAS_BUILTIN(__builtin_shuffle) + typedef uint32_t GccU32RawVectType __attribute__((__vector_size__(16))); + return Vec128<T, N>{reinterpret_cast<typename detail::Raw128<T>::type>( + __builtin_shuffle(reinterpret_cast<GccU32RawVectType>(v.raw), + reinterpret_cast<GccU32RawVectType>(idx.raw)))}; +#else + const Full128<T> d_full; + alignas(16) T src_lanes[4]; + alignas(16) uint32_t indices[4]; + alignas(16) T result_lanes[4]; + + Store(Vec128<T>{v.raw}, d_full, src_lanes); + _mm_store_si128(reinterpret_cast<__m128i*>(indices), idx.raw); + + for (int i = 0; i < 4; i++) { + result_lanes[i] = src_lanes[indices[i] & 3u]; + } + + return Vec128<T, N>{Load(d_full, result_lanes).raw}; +#endif // HWY_COMPILER_GCC_ACTUAL && HWY_HAS_BUILTIN(__builtin_shuffle) +#else // SSSE3 or SSE4 + return TableLookupBytes(v, Vec128<T, N>{idx.raw}); +#endif +} + +#if HWY_TARGET <= HWY_SSSE3 +template <size_t N, HWY_IF_V_SIZE_GT(float, N, 4)> +HWY_API Vec128<float, N> TableLookupLanes(Vec128<float, N> v, + Indices128<float, N> idx) { +#if HWY_TARGET <= HWY_AVX2 + return Vec128<float, N>{_mm_permutevar_ps(v.raw, idx.raw)}; +#else // SSSE3 or SSE4 + const DFromV<decltype(v)> df; + const RebindToSigned<decltype(df)> di; + return BitCast(df, + TableLookupBytes(BitCast(di, v), Vec128<int32_t, N>{idx.raw})); +#endif // HWY_TARGET <= HWY_AVX2 +} +#endif // HWY_TARGET <= HWY_SSSE3 + +// Single lane: no change +template <typename T> +HWY_API Vec128<T, 1> TableLookupLanes(Vec128<T, 1> v, + Indices128<T, 1> /* idx */) { + return v; +} + +template <typename T, HWY_IF_T_SIZE(T, 8)> +HWY_API Vec128<T> TableLookupLanes(Vec128<T> v, Indices128<T> idx) { + const DFromV<decltype(v)> d; + Vec128<int64_t> vidx{idx.raw}; +#if HWY_TARGET <= HWY_AVX2 + // There is no _mm_permute[x]var_epi64. 
+ vidx += vidx; // bit1 is the decider (unusual) + const RebindToFloat<decltype(d)> df; + return BitCast( + d, Vec128<double>{_mm_permutevar_pd(BitCast(df, v).raw, vidx.raw)}); +#else + // Only 2 lanes: can swap+blend. Choose v if vidx == iota. To avoid a 64-bit + // comparison (expensive on SSSE3), just invert the upper lane and subtract 1 + // to obtain an all-zero or all-one mask. + const RebindToSigned<decltype(d)> di; + const Vec128<int64_t> same = (vidx ^ Iota(di, 0)) - Set(di, 1); + const Mask128<T> mask_same = RebindMask(d, MaskFromVec(same)); + return IfThenElse(mask_same, v, Shuffle01(v)); +#endif +} + +HWY_API Vec128<double> TableLookupLanes(Vec128<double> v, + Indices128<double> idx) { + Vec128<int64_t> vidx{idx.raw}; +#if HWY_TARGET <= HWY_AVX2 + vidx += vidx; // bit1 is the decider (unusual) + return Vec128<double>{_mm_permutevar_pd(v.raw, vidx.raw)}; +#else + // Only 2 lanes: can swap+blend. Choose v if vidx == iota. To avoid a 64-bit + // comparison (expensive on SSSE3), just invert the upper lane and subtract 1 + // to obtain an all-zero or all-one mask. 
+ const DFromV<decltype(v)> d; + const RebindToSigned<decltype(d)> di; + const Vec128<int64_t> same = (vidx ^ Iota(di, 0)) - Set(di, 1); + const Mask128<double> mask_same = RebindMask(d, MaskFromVec(same)); + return IfThenElse(mask_same, v, Shuffle01(v)); +#endif +} + +// ------------------------------ ReverseBlocks + +// Single block: no change +template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_API VFromD<D> ReverseBlocks(D /* tag */, VFromD<D> v) { + return v; +} + +// ------------------------------ Reverse (Shuffle0123, Shuffle2301) + +// Single lane: no change +template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 1)> +HWY_API Vec128<T, 1> Reverse(D /* tag */, Vec128<T, 1> v) { + return v; +} + +// 32-bit x2: shuffle +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec64<T> Reverse(D /* tag */, const Vec64<T> v) { + return Vec64<T>{Shuffle2301(Vec128<T>{v.raw}).raw}; +} + +// 64-bit x2: shuffle +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)> +HWY_API Vec128<T> Reverse(D /* tag */, const Vec128<T> v) { + return Shuffle01(v); +} + +// 32-bit x4: shuffle +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec128<T> Reverse(D /* tag */, const Vec128<T> v) { + return Shuffle0123(v); +} + +// 16-bit +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)> +HWY_API VFromD<D> Reverse(D d, const VFromD<D> v) { + constexpr size_t kN = MaxLanes(d); + if (kN == 1) return v; + if (kN == 2) { + return VFromD<D>{_mm_shufflelo_epi16(v.raw, _MM_SHUFFLE(0, 1, 0, 1))}; + } + if (kN == 4) { + return VFromD<D>{_mm_shufflelo_epi16(v.raw, _MM_SHUFFLE(0, 1, 2, 3))}; + } + +#if HWY_TARGET == HWY_SSE2 + const VFromD<D> rev4{ + _mm_shufflehi_epi16(_mm_shufflelo_epi16(v.raw, _MM_SHUFFLE(0, 1, 2, 3)), + _MM_SHUFFLE(0, 1, 2, 3))}; + return VFromD<D>{_mm_shuffle_epi32(rev4.raw, _MM_SHUFFLE(1, 0, 3, 2))}; +#else + const RebindToSigned<decltype(d)> di; + alignas(16) static constexpr int16_t kShuffle[8] = { + 
0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100}; + return BitCast(d, TableLookupBytes(v, LoadDup128(di, kShuffle))); +#endif +} + +// 8-bit: a single byte shuffle where available, otherwise swap the two bytes +// of each 16-bit lane and reverse the 16-bit lanes. +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)> +HWY_API VFromD<D> Reverse(D d, const VFromD<D> v) { + // Signed on purpose: the table below stores kN - i, which is negative for + // i > kN on partial vectors. An unsigned kN would wrap around instead and + // could not be narrowed to int8_t. + constexpr int kN = static_cast<int>(MaxLanes(d)); + if (kN == 1) return v; +#if HWY_TARGET <= HWY_SSSE3 + // _mm_shuffle_epi8 is an SSSE3 instruction, hence the HWY_SSSE3 guard. + // NOTE: Lanes with negative shuffle control mask values are set to zero. + alignas(16) static constexpr int8_t kReverse[16] = { + kN - 1, kN - 2, kN - 3, kN - 4, kN - 5, kN - 6, kN - 7, kN - 8, + kN - 9, kN - 10, kN - 11, kN - 12, kN - 13, kN - 14, kN - 15, kN - 16}; + const RebindToSigned<decltype(d)> di; + const VFromD<decltype(di)> idx = Load(di, kReverse); + return VFromD<D>{_mm_shuffle_epi8(BitCast(di, v).raw, idx.raw)}; +#else + // SSE2 fallback: RotateRight<8> swaps the bytes inside each 16-bit lane, + // then the 16-bit Reverse puts those lanes in reverse order. + const RepartitionToWide<decltype(d)> d16; + return BitCast(d, Reverse(d16, RotateRight<8>(BitCast(d16, v)))); +#endif +} + +// ------------------------------ Reverse2 + +// Single lane: no change +template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 1)> +HWY_API Vec128<T, 1> Reverse2(D /* tag */, Vec128<T, 1> v) { + return v; +} + +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)> +HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) { +#if HWY_TARGET <= HWY_AVX3 + const Repartition<uint32_t, decltype(d)> du32; + return BitCast(d, RotateRight<16>(BitCast(du32, v))); +#elif HWY_TARGET == HWY_SSE2 + constexpr size_t kN = MaxLanes(d); + __m128i shuf_result = _mm_shufflelo_epi16(v.raw, _MM_SHUFFLE(2, 3, 0, 1)); + if (kN > 4) { + shuf_result = _mm_shufflehi_epi16(shuf_result, _MM_SHUFFLE(2, 3, 0, 1)); + } + return VFromD<D>{shuf_result}; +#else + const RebindToSigned<decltype(d)> di; + alignas(16) static constexpr int16_t kShuffle[8] = { + 0x0302, 0x0100, 0x0706, 0x0504, 0x0B0A, 0x0908, 0x0F0E, 0x0D0C}; + return BitCast(d, TableLookupBytes(v, LoadDup128(di, kShuffle))); +#endif +} + +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)> +HWY_API VFromD<D> Reverse2(D
/* tag */, VFromD<D> v) { + return Shuffle2301(v); +} + +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)> +HWY_API VFromD<D> Reverse2(D /* tag */, VFromD<D> v) { + return Shuffle01(v); +} + +// ------------------------------ Reverse4 + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)> +HWY_API VFromD<D> Reverse4(D d, VFromD<D> v) { + // 4x 16-bit: a single shufflelo suffices. + constexpr size_t kN = MaxLanes(d); + if (kN <= 4) { + return VFromD<D>{_mm_shufflelo_epi16(v.raw, _MM_SHUFFLE(0, 1, 2, 3))}; + } + +#if HWY_TARGET == HWY_SSE2 + return VFromD<D>{ + _mm_shufflehi_epi16(_mm_shufflelo_epi16(v.raw, _MM_SHUFFLE(0, 1, 2, 3)), + _MM_SHUFFLE(0, 1, 2, 3))}; +#else + const RebindToSigned<decltype(d)> di; + alignas(16) static constexpr int16_t kShuffle[8] = { + 0x0706, 0x0504, 0x0302, 0x0100, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908}; + return BitCast(d, TableLookupBytes(v, LoadDup128(di, kShuffle))); +#endif +} + +// 32-bit, any vector size: use Shuffle0123 +template <class D, HWY_IF_T_SIZE_D(D, 4)> +HWY_API VFromD<D> Reverse4(D /* tag */, const VFromD<D> v) { + return Shuffle0123(v); +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 8)> +HWY_API VFromD<D> Reverse4(D /* tag */, VFromD<D> /* v */) { + HWY_ASSERT(0); // don't have 4 u64 lanes +} + +// ------------------------------ Reverse8 + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)> +HWY_API VFromD<D> Reverse8(D d, const VFromD<D> v) { +#if HWY_TARGET == HWY_SSE2 + const RepartitionToWide<decltype(d)> dw; + return Reverse2(d, BitCast(d, Shuffle0123(BitCast(dw, v)))); +#else + const RebindToSigned<decltype(d)> di; + alignas(16) static constexpr int16_t kShuffle[8] = { + 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100}; + return BitCast(d, TableLookupBytes(v, LoadDup128(di, kShuffle))); +#endif +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), + HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))> +HWY_API VFromD<D> Reverse8(D /* tag 
*/, VFromD<D> /* v */) { + HWY_ASSERT(0); // don't have 8 lanes if larger than 16-bit +} + +// ------------------------------ ReverseBits + +#if HWY_TARGET <= HWY_AVX3_DL + +#ifdef HWY_NATIVE_REVERSE_BITS_UI8 +#undef HWY_NATIVE_REVERSE_BITS_UI8 +#else +#define HWY_NATIVE_REVERSE_BITS_UI8 +#endif + +template <class V, HWY_IF_T_SIZE_V(V, 1), HWY_IF_V_SIZE_LE_D(DFromV<V>, 16)> +HWY_API V ReverseBits(V v) { + const Full128<uint64_t> du64_full; + const auto affine_matrix = Set(du64_full, 0x8040201008040201u); + return V{_mm_gf2p8affine_epi64_epi8(v.raw, affine_matrix.raw, 0)}; +} +#endif // HWY_TARGET <= HWY_AVX3_DL + +// ------------------------------ InterleaveLower + +// Interleaves lanes from halves of the 128-bit blocks of "a" (which provides +// the least-significant lane) and "b". To concatenate two half-width integers +// into one, use ZipLower/Upper instead (also works with scalar). + +template <size_t N> +HWY_API Vec128<uint8_t, N> InterleaveLower(Vec128<uint8_t, N> a, + Vec128<uint8_t, N> b) { + return Vec128<uint8_t, N>{_mm_unpacklo_epi8(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<uint16_t, N> InterleaveLower(Vec128<uint16_t, N> a, + Vec128<uint16_t, N> b) { + return Vec128<uint16_t, N>{_mm_unpacklo_epi16(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<uint32_t, N> InterleaveLower(Vec128<uint32_t, N> a, + Vec128<uint32_t, N> b) { + return Vec128<uint32_t, N>{_mm_unpacklo_epi32(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<uint64_t, N> InterleaveLower(Vec128<uint64_t, N> a, + Vec128<uint64_t, N> b) { + return Vec128<uint64_t, N>{_mm_unpacklo_epi64(a.raw, b.raw)}; +} + +template <size_t N> +HWY_API Vec128<int8_t, N> InterleaveLower(Vec128<int8_t, N> a, + Vec128<int8_t, N> b) { + return Vec128<int8_t, N>{_mm_unpacklo_epi8(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<int16_t, N> InterleaveLower(Vec128<int16_t, N> a, + Vec128<int16_t, N> b) { + return Vec128<int16_t, N>{_mm_unpacklo_epi16(a.raw, b.raw)}; +} +template 
<size_t N> +HWY_API Vec128<int32_t, N> InterleaveLower(Vec128<int32_t, N> a, + Vec128<int32_t, N> b) { + return Vec128<int32_t, N>{_mm_unpacklo_epi32(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<int64_t, N> InterleaveLower(Vec128<int64_t, N> a, + Vec128<int64_t, N> b) { + return Vec128<int64_t, N>{_mm_unpacklo_epi64(a.raw, b.raw)}; +} + +template <size_t N> +HWY_API Vec128<float, N> InterleaveLower(Vec128<float, N> a, + Vec128<float, N> b) { + return Vec128<float, N>{_mm_unpacklo_ps(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<double, N> InterleaveLower(Vec128<double, N> a, + Vec128<double, N> b) { + return Vec128<double, N>{_mm_unpacklo_pd(a.raw, b.raw)}; +} + +// Additional overload for the optional tag (also for 256/512). +template <class D> +HWY_API VFromD<D> InterleaveLower(D /* tag */, VFromD<D> a, VFromD<D> b) { + return InterleaveLower(a, b); +} + +// ------------------------------ InterleaveUpper (UpperHalf) + +// All functions inside detail lack the required D parameter. 
+namespace detail { + +HWY_API Vec128<uint8_t> InterleaveUpper(Vec128<uint8_t> a, Vec128<uint8_t> b) { + return Vec128<uint8_t>{_mm_unpackhi_epi8(a.raw, b.raw)}; +} +HWY_API Vec128<uint16_t> InterleaveUpper(Vec128<uint16_t> a, + Vec128<uint16_t> b) { + return Vec128<uint16_t>{_mm_unpackhi_epi16(a.raw, b.raw)}; +} +HWY_API Vec128<uint32_t> InterleaveUpper(Vec128<uint32_t> a, + Vec128<uint32_t> b) { + return Vec128<uint32_t>{_mm_unpackhi_epi32(a.raw, b.raw)}; +} +HWY_API Vec128<uint64_t> InterleaveUpper(Vec128<uint64_t> a, + Vec128<uint64_t> b) { + return Vec128<uint64_t>{_mm_unpackhi_epi64(a.raw, b.raw)}; +} + +HWY_API Vec128<int8_t> InterleaveUpper(Vec128<int8_t> a, Vec128<int8_t> b) { + return Vec128<int8_t>{_mm_unpackhi_epi8(a.raw, b.raw)}; +} +HWY_API Vec128<int16_t> InterleaveUpper(Vec128<int16_t> a, Vec128<int16_t> b) { + return Vec128<int16_t>{_mm_unpackhi_epi16(a.raw, b.raw)}; +} +HWY_API Vec128<int32_t> InterleaveUpper(Vec128<int32_t> a, Vec128<int32_t> b) { + return Vec128<int32_t>{_mm_unpackhi_epi32(a.raw, b.raw)}; +} +HWY_API Vec128<int64_t> InterleaveUpper(Vec128<int64_t> a, Vec128<int64_t> b) { + return Vec128<int64_t>{_mm_unpackhi_epi64(a.raw, b.raw)}; +} + +HWY_API Vec128<float> InterleaveUpper(Vec128<float> a, Vec128<float> b) { + return Vec128<float>{_mm_unpackhi_ps(a.raw, b.raw)}; +} +HWY_API Vec128<double> InterleaveUpper(Vec128<double> a, Vec128<double> b) { + return Vec128<double>{_mm_unpackhi_pd(a.raw, b.raw)}; +} + +} // namespace detail + +// Full +template <class D, typename T = TFromD<D>> +HWY_API Vec128<T> InterleaveUpper(D /* tag */, Vec128<T> a, Vec128<T> b) { + return detail::InterleaveUpper(a, b); +} + +// Partial +template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> +HWY_API VFromD<D> InterleaveUpper(D d, VFromD<D> a, VFromD<D> b) { + const Half<decltype(d)> d2; + return InterleaveLower(d, VFromD<D>{UpperHalf(d2, a).raw}, + VFromD<D>{UpperHalf(d2, b).raw}); +} + +// ------------------------------ ZipLower/ZipUpper (InterleaveLower) + +// 
Same as Interleave*, except that the return lanes are double-width integers; +// this is necessary because the single-lane scalar cannot return two values. +template <class V, class DW = RepartitionToWide<DFromV<V>>> +HWY_API VFromD<DW> ZipLower(V a, V b) { + return BitCast(DW(), InterleaveLower(a, b)); +} +template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>> +HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) { + return BitCast(dw, InterleaveLower(D(), a, b)); +} + +template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>> +HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) { + return BitCast(dw, InterleaveUpper(D(), a, b)); +} + +// ================================================== COMBINE + +// ------------------------------ Combine (InterleaveLower) + +// N = N/2 + N/2 (upper half undefined) +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), class VH = VFromD<Half<D>>> +HWY_API VFromD<D> Combine(D d, VH hi_half, VH lo_half) { + const Half<decltype(d)> dh; + const RebindToUnsigned<decltype(dh)> duh; + // Treat half-width input as one lane, and expand to two lanes. 
+ using VU = Vec128<UnsignedFromSize<dh.MaxBytes()>, 2>; + const VU lo{BitCast(duh, lo_half).raw}; + const VU hi{BitCast(duh, hi_half).raw}; + return BitCast(d, InterleaveLower(lo, hi)); +} + +// ------------------------------ ZeroExtendVector (Combine, IfThenElseZero) + +// Tag dispatch instead of SFINAE for MSVC 2017 compatibility +namespace detail { + +template <class D, typename T = TFromD<D>> +HWY_INLINE Vec128<T> ZeroExtendVector(hwy::NonFloatTag /*tag*/, D /* d */, + Vec64<T> lo) { + return Vec128<T>{_mm_move_epi64(lo.raw)}; +} + +template <class D, typename T = TFromD<D>> +HWY_INLINE Vec128<T> ZeroExtendVector(hwy::FloatTag /*tag*/, D d, Vec64<T> lo) { + const RebindToUnsigned<decltype(d)> du; + return BitCast(d, ZeroExtendVector(du, BitCast(Half<decltype(du)>(), lo))); +} + +} // namespace detail + +template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>> +HWY_API Vec128<T> ZeroExtendVector(D d, Vec64<T> lo) { + return detail::ZeroExtendVector(hwy::IsFloatTag<T>(), d, lo); +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> +HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> lo) { + const Half<D> dh; + return IfThenElseZero(FirstN(d, MaxLanes(dh)), VFromD<D>{lo.raw}); +} + +// ------------------------------ Concat full (InterleaveLower) + +// hiH,hiL loH,loL |-> hiL,loL (= lower halves) +template <class D, typename T = TFromD<D>> +HWY_API Vec128<T> ConcatLowerLower(D d, Vec128<T> hi, Vec128<T> lo) { + const Repartition<uint64_t, decltype(d)> d64; + return BitCast(d, InterleaveLower(BitCast(d64, lo), BitCast(d64, hi))); +} + +// hiH,hiL loH,loL |-> hiH,loH (= upper halves) +template <class D, typename T = TFromD<D>> +HWY_API Vec128<T> ConcatUpperUpper(D d, Vec128<T> hi, Vec128<T> lo) { + const Repartition<uint64_t, decltype(d)> d64; + return BitCast(d, InterleaveUpper(d64, BitCast(d64, lo), BitCast(d64, hi))); +} + +// hiH,hiL loH,loL |-> hiL,loH (= inner halves) +template <class D, typename T = TFromD<D>> +HWY_API Vec128<T> ConcatLowerUpper(D 
d, Vec128<T> hi, Vec128<T> lo) { + return CombineShiftRightBytes<8>(d, hi, lo); +} + +// hiH,hiL loH,loL |-> hiH,loL (= outer halves) +template <class D, typename T = TFromD<D>> +HWY_API Vec128<T> ConcatUpperLower(D d, Vec128<T> hi, Vec128<T> lo) { + const Repartition<double, decltype(d)> dd; +#if HWY_TARGET >= HWY_SSSE3 + return BitCast( + d, Vec128<double>{_mm_shuffle_pd(BitCast(dd, lo).raw, BitCast(dd, hi).raw, + _MM_SHUFFLE2(1, 0))}); +#else + // _mm_blend_epi16 has throughput 1/cycle on SKX, whereas _pd can do 3/cycle. + return BitCast(d, Vec128<double>{_mm_blend_pd(BitCast(dd, hi).raw, + BitCast(dd, lo).raw, 1)}); +#endif +} +template <class D> +HWY_API Vec128<float> ConcatUpperLower(D d, Vec128<float> hi, + Vec128<float> lo) { +#if HWY_TARGET >= HWY_SSSE3 + (void)d; + return Vec128<float>{_mm_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(3, 2, 1, 0))}; +#else + // _mm_shuffle_ps has throughput 1/cycle on SKX, whereas blend can do 3/cycle. + const RepartitionToWide<decltype(d)> dd; + return BitCast(d, Vec128<double>{_mm_blend_pd(BitCast(dd, hi).raw, + BitCast(dd, lo).raw, 1)}); +#endif +} +template <class D> +HWY_API Vec128<double> ConcatUpperLower(D /* tag */, Vec128<double> hi, + Vec128<double> lo) { +#if HWY_TARGET >= HWY_SSSE3 + return Vec128<double>{_mm_shuffle_pd(lo.raw, hi.raw, _MM_SHUFFLE2(1, 0))}; +#else + // _mm_shuffle_pd has throughput 1/cycle on SKX, whereas blend can do 3/cycle. 
+ return Vec128<double>{_mm_blend_pd(hi.raw, lo.raw, 1)}; +#endif +} + +// ------------------------------ Concat partial (Combine, LowerHalf) + +template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> +HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) { + const Half<decltype(d)> d2; + return Combine(d, LowerHalf(d2, hi), LowerHalf(d2, lo)); +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> +HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) { + const Half<decltype(d)> d2; + return Combine(d, UpperHalf(d2, hi), UpperHalf(d2, lo)); +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> +HWY_API VFromD<D> ConcatLowerUpper(D d, const VFromD<D> hi, + const VFromD<D> lo) { + const Half<decltype(d)> d2; + return Combine(d, LowerHalf(d2, hi), UpperHalf(d2, lo)); +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> +HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) { + const Half<decltype(d)> d2; + return Combine(d, UpperHalf(d2, hi), LowerHalf(d2, lo)); +} + +// ------------------------------ ConcatOdd + +// 8-bit full +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec128<T> ConcatOdd(D d, Vec128<T> hi, Vec128<T> lo) { + const Repartition<uint16_t, decltype(d)> dw; + // Right-shift 8 bits per u16 so we can pack. + const Vec128<uint16_t> uH = ShiftRight<8>(BitCast(dw, hi)); + const Vec128<uint16_t> uL = ShiftRight<8>(BitCast(dw, lo)); + return Vec128<T>{_mm_packus_epi16(uL.raw, uH.raw)}; +} + +// 8-bit x8 +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec64<T> ConcatOdd(D d, Vec64<T> hi, Vec64<T> lo) { +#if HWY_TARGET == HWY_SSE2 + const Repartition<uint16_t, decltype(d)> dw; + // Right-shift 8 bits per u16 so we can pack. 
+ const Vec64<uint16_t> uH = ShiftRight<8>(BitCast(dw, hi)); + const Vec64<uint16_t> uL = ShiftRight<8>(BitCast(dw, lo)); + return Vec64<T>{_mm_shuffle_epi32(_mm_packus_epi16(uL.raw, uH.raw), + _MM_SHUFFLE(2, 0, 2, 0))}; +#else + const Repartition<uint32_t, decltype(d)> du32; + // Don't care about upper half, no need to zero. + alignas(16) const uint8_t kCompactOddU8[8] = {1, 3, 5, 7}; + const Vec64<T> shuf = BitCast(d, Load(Full64<uint8_t>(), kCompactOddU8)); + const Vec64<T> L = TableLookupBytes(lo, shuf); + const Vec64<T> H = TableLookupBytes(hi, shuf); + return BitCast(d, InterleaveLower(du32, BitCast(du32, L), BitCast(du32, H))); +#endif +} + +// 8-bit x4 +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec32<T> ConcatOdd(D d, Vec32<T> hi, Vec32<T> lo) { +#if HWY_TARGET == HWY_SSE2 + const Repartition<uint16_t, decltype(d)> dw; + const Twice<decltype(dw)> dw_2; + // Right-shift 8 bits per u16 so we can pack. + const Vec32<uint16_t> uH = ShiftRight<8>(BitCast(dw, hi)); + const Vec32<uint16_t> uL = ShiftRight<8>(BitCast(dw, lo)); + const Vec64<uint16_t> uHL = Combine(dw_2, uH, uL); + return Vec32<T>{_mm_packus_epi16(uHL.raw, uHL.raw)}; +#else + const Repartition<uint16_t, decltype(d)> du16; + // Don't care about upper half, no need to zero. + alignas(16) const uint8_t kCompactOddU8[4] = {1, 3}; + const Vec32<T> shuf = BitCast(d, Load(Full32<uint8_t>(), kCompactOddU8)); + const Vec32<T> L = TableLookupBytes(lo, shuf); + const Vec32<T> H = TableLookupBytes(hi, shuf); + return BitCast(d, InterleaveLower(du16, BitCast(du16, L), BitCast(du16, H))); +#endif +} + +// 16-bit full +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)> +HWY_API Vec128<T> ConcatOdd(D d, Vec128<T> hi, Vec128<T> lo) { + // Right-shift 16 bits per i32 - a *signed* shift of 0x8000xxxx returns + // 0xFFFF8000, which correctly saturates to 0x8000. 
+ const Repartition<int32_t, decltype(d)> dw; + const Vec128<int32_t> uH = ShiftRight<16>(BitCast(dw, hi)); + const Vec128<int32_t> uL = ShiftRight<16>(BitCast(dw, lo)); + return Vec128<T>{_mm_packs_epi32(uL.raw, uH.raw)}; +} + +// 16-bit x4 +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)> +HWY_API Vec64<T> ConcatOdd(D d, Vec64<T> hi, Vec64<T> lo) { +#if HWY_TARGET == HWY_SSE2 + // Right-shift 16 bits per i32 - a *signed* shift of 0x8000xxxx returns + // 0xFFFF8000, which correctly saturates to 0x8000. + const Repartition<int32_t, decltype(d)> dw; + const Vec64<int32_t> uH = ShiftRight<16>(BitCast(dw, hi)); + const Vec64<int32_t> uL = ShiftRight<16>(BitCast(dw, lo)); + return Vec64<T>{_mm_shuffle_epi32(_mm_packs_epi32(uL.raw, uH.raw), + _MM_SHUFFLE(2, 0, 2, 0))}; +#else + const Repartition<uint32_t, decltype(d)> du32; + // Don't care about upper half, no need to zero. + alignas(16) const uint8_t kCompactOddU16[8] = {2, 3, 6, 7}; + const Vec64<T> shuf = BitCast(d, Load(Full64<uint8_t>(), kCompactOddU16)); + const Vec64<T> L = TableLookupBytes(lo, shuf); + const Vec64<T> H = TableLookupBytes(hi, shuf); + return BitCast(d, InterleaveLower(du32, BitCast(du32, L), BitCast(du32, H))); +#endif +} + +// 32-bit full +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec128<T> ConcatOdd(D d, Vec128<T> hi, Vec128<T> lo) { + const RebindToFloat<decltype(d)> df; + return BitCast( + d, Vec128<float>{_mm_shuffle_ps(BitCast(df, lo).raw, BitCast(df, hi).raw, + _MM_SHUFFLE(3, 1, 3, 1))}); +} +template <class D> +HWY_API Vec128<float> ConcatOdd(D /* d */, Vec128<float> hi, Vec128<float> lo) { + return Vec128<float>{_mm_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(3, 1, 3, 1))}; +} + +// Any type x2 +template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 2)> +HWY_API Vec128<T, 2> ConcatOdd(D d, Vec128<T, 2> hi, Vec128<T, 2> lo) { + return InterleaveUpper(d, lo, hi); +} + +// ------------------------------ ConcatEven (InterleaveLower) + +// 8-bit 
full +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec128<T> ConcatEven(D d, Vec128<T> hi, Vec128<T> lo) { + const Repartition<uint16_t, decltype(d)> dw; + // Isolate lower 8 bits per u16 so we can pack. + const Vec128<uint16_t> mask = Set(dw, 0x00FF); + const Vec128<uint16_t> uH = And(BitCast(dw, hi), mask); + const Vec128<uint16_t> uL = And(BitCast(dw, lo), mask); + return Vec128<T>{_mm_packus_epi16(uL.raw, uH.raw)}; +} + +// 8-bit x8 +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec64<T> ConcatEven(D d, Vec64<T> hi, Vec64<T> lo) { +#if HWY_TARGET == HWY_SSE2 + const Repartition<uint16_t, decltype(d)> dw; + // Isolate lower 8 bits per u16 so we can pack. + const Vec64<uint16_t> mask = Set(dw, 0x00FF); + const Vec64<uint16_t> uH = And(BitCast(dw, hi), mask); + const Vec64<uint16_t> uL = And(BitCast(dw, lo), mask); + return Vec64<T>{_mm_shuffle_epi32(_mm_packus_epi16(uL.raw, uH.raw), + _MM_SHUFFLE(2, 0, 2, 0))}; +#else + const Repartition<uint32_t, decltype(d)> du32; + // Don't care about upper half, no need to zero. + alignas(16) const uint8_t kCompactEvenU8[8] = {0, 2, 4, 6}; + const Vec64<T> shuf = BitCast(d, Load(Full64<uint8_t>(), kCompactEvenU8)); + const Vec64<T> L = TableLookupBytes(lo, shuf); + const Vec64<T> H = TableLookupBytes(hi, shuf); + return BitCast(d, InterleaveLower(du32, BitCast(du32, L), BitCast(du32, H))); +#endif +} + +// 8-bit x4 +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec32<T> ConcatEven(D d, Vec32<T> hi, Vec32<T> lo) { +#if HWY_TARGET == HWY_SSE2 + const Repartition<uint16_t, decltype(d)> dw; + const Twice<decltype(dw)> dw_2; + // Isolate lower 8 bits per u16 so we can pack. 
+ const Vec32<uint16_t> mask = Set(dw, 0x00FF); + const Vec32<uint16_t> uH = And(BitCast(dw, hi), mask); + const Vec32<uint16_t> uL = And(BitCast(dw, lo), mask); + const Vec64<uint16_t> uHL = Combine(dw_2, uH, uL); + return Vec32<T>{_mm_packus_epi16(uHL.raw, uHL.raw)}; +#else + const Repartition<uint16_t, decltype(d)> du16; + // Don't care about upper half, no need to zero. + alignas(16) const uint8_t kCompactEvenU8[4] = {0, 2}; + const Vec32<T> shuf = BitCast(d, Load(Full32<uint8_t>(), kCompactEvenU8)); + const Vec32<T> L = TableLookupBytes(lo, shuf); + const Vec32<T> H = TableLookupBytes(hi, shuf); + return BitCast(d, InterleaveLower(du16, BitCast(du16, L), BitCast(du16, H))); +#endif +} + +// 16-bit full +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)> +HWY_API Vec128<T> ConcatEven(D d, Vec128<T> hi, Vec128<T> lo) { +#if HWY_TARGET <= HWY_SSE4 + // Isolate lower 16 bits per u32 so we can pack. + const Repartition<uint32_t, decltype(d)> dw; + const Vec128<uint32_t> mask = Set(dw, 0x0000FFFF); + const Vec128<uint32_t> uH = And(BitCast(dw, hi), mask); + const Vec128<uint32_t> uL = And(BitCast(dw, lo), mask); + return Vec128<T>{_mm_packus_epi32(uL.raw, uH.raw)}; +#elif HWY_TARGET == HWY_SSE2 + const Repartition<uint32_t, decltype(d)> dw; + return ConcatOdd(d, BitCast(d, ShiftLeft<16>(BitCast(dw, hi))), + BitCast(d, ShiftLeft<16>(BitCast(dw, lo)))); +#else + // packs_epi32 saturates 0x8000 to 0x7FFF. Instead ConcatEven within the two + // inputs, then concatenate them. 
+ alignas(16) const T kCompactEvenU16[8] = {0x0100, 0x0504, 0x0908, 0x0D0C}; + const Vec128<T> shuf = BitCast(d, Load(d, kCompactEvenU16)); + const Vec128<T> L = TableLookupBytes(lo, shuf); + const Vec128<T> H = TableLookupBytes(hi, shuf); + return ConcatLowerLower(d, H, L); +#endif +} + +// 16-bit x4 +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)> +HWY_API Vec64<T> ConcatEven(D d, Vec64<T> hi, Vec64<T> lo) { +#if HWY_TARGET == HWY_SSE2 + const Repartition<uint32_t, decltype(d)> dw; + return ConcatOdd(d, BitCast(d, ShiftLeft<16>(BitCast(dw, hi))), + BitCast(d, ShiftLeft<16>(BitCast(dw, lo)))); +#else + const Repartition<uint32_t, decltype(d)> du32; + // Don't care about upper half, no need to zero. + alignas(16) const uint8_t kCompactEvenU16[8] = {0, 1, 4, 5}; + const Vec64<T> shuf = BitCast(d, Load(Full64<uint8_t>(), kCompactEvenU16)); + const Vec64<T> L = TableLookupBytes(lo, shuf); + const Vec64<T> H = TableLookupBytes(hi, shuf); + return BitCast(d, InterleaveLower(du32, BitCast(du32, L), BitCast(du32, H))); +#endif +} + +// 32-bit full +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec128<T> ConcatEven(D d, Vec128<T> hi, Vec128<T> lo) { + const RebindToFloat<decltype(d)> df; + return BitCast( + d, Vec128<float>{_mm_shuffle_ps(BitCast(df, lo).raw, BitCast(df, hi).raw, + _MM_SHUFFLE(2, 0, 2, 0))}); +} +template <class D> +HWY_API Vec128<float> ConcatEven(D /* d */, Vec128<float> hi, + Vec128<float> lo) { + return Vec128<float>{_mm_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(2, 0, 2, 0))}; +} + +// Any T x2 +template <typename D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 2)> +HWY_API Vec128<T, 2> ConcatEven(D d, Vec128<T, 2> hi, Vec128<T, 2> lo) { + return InterleaveLower(d, lo, hi); +} + +// ------------------------------ DupEven (InterleaveLower) + +template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) { + return Vec128<T, N>{_mm_shuffle_epi32(v.raw, _MM_SHUFFLE(2, 2, 0, 0))}; +} 
+template <size_t N> +HWY_API Vec128<float, N> DupEven(Vec128<float, N> v) { + return Vec128<float, N>{ + _mm_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(2, 2, 0, 0))}; +} + +template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)> +HWY_API Vec128<T, N> DupEven(const Vec128<T, N> v) { + return InterleaveLower(DFromV<decltype(v)>(), v, v); +} + +// ------------------------------ DupOdd (InterleaveUpper) + +template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) { + return Vec128<T, N>{_mm_shuffle_epi32(v.raw, _MM_SHUFFLE(3, 3, 1, 1))}; +} +template <size_t N> +HWY_API Vec128<float, N> DupOdd(Vec128<float, N> v) { + return Vec128<float, N>{ + _mm_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(3, 3, 1, 1))}; +} + +template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)> +HWY_API Vec128<T, N> DupOdd(const Vec128<T, N> v) { + return InterleaveUpper(DFromV<decltype(v)>(), v, v); +} + +// ------------------------------ TwoTablesLookupLanes (DupEven) + +template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)> +HWY_API Vec128<T, N> TwoTablesLookupLanes(Vec128<T, N> a, Vec128<T, N> b, + Indices128<T, N> idx) { + const DFromV<decltype(a)> d; + const Twice<decltype(d)> dt; +// TableLookupLanes currently requires table and index vectors to be the same +// size, though a half-length index vector would be sufficient here. +#if HWY_IS_MSAN + const Vec128<T, N> idx_vec{idx.raw}; + const Indices128<T, N * 2> idx2{Combine(dt, idx_vec, idx_vec).raw}; +#else + // We only keep LowerHalf of the result, which is valid in idx. 
+ const Indices128<T, N * 2> idx2{idx.raw}; +#endif + return LowerHalf(d, TableLookupLanes(Combine(dt, b, a), idx2)); +} + +template <typename T, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec128<T> TwoTablesLookupLanes(Vec128<T> a, Vec128<T> b, + Indices128<T> idx) { +#if HWY_TARGET <= HWY_AVX3_DL + return Vec128<T>{_mm_permutex2var_epi8(a.raw, idx.raw, b.raw)}; +#else // AVX3 or below + const DFromV<decltype(a)> d; + const Vec128<T> idx_vec{idx.raw}; + +#if HWY_TARGET <= HWY_SSE4 + const Repartition<uint16_t, decltype(d)> du16; + const auto sel_hi_mask = + MaskFromVec(BitCast(d, ShiftLeft<3>(BitCast(du16, idx_vec)))); +#else + const RebindToSigned<decltype(d)> di; + const auto sel_hi_mask = + RebindMask(d, BitCast(di, idx_vec) > Set(di, int8_t{15})); +#endif + + const auto lo_lookup_result = TableLookupBytes(a, idx_vec); +#if HWY_TARGET <= HWY_AVX3 + const Vec128<T> lookup_result{_mm_mask_shuffle_epi8( + lo_lookup_result.raw, sel_hi_mask.raw, b.raw, idx_vec.raw)}; + return lookup_result; +#else + const auto hi_lookup_result = TableLookupBytes(b, idx_vec); + return IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result); +#endif // HWY_TARGET <= HWY_AVX3 +#endif // HWY_TARGET <= HWY_AVX3_DL +} + +template <typename T, HWY_IF_T_SIZE(T, 2)> +HWY_API Vec128<T> TwoTablesLookupLanes(Vec128<T> a, Vec128<T> b, + Indices128<T> idx) { +#if HWY_TARGET <= HWY_AVX3 + return Vec128<T>{_mm_permutex2var_epi16(a.raw, idx.raw, b.raw)}; +#elif HWY_TARGET == HWY_SSE2 + const DFromV<decltype(a)> d; + const RebindToSigned<decltype(d)> di; + const Vec128<T> idx_vec{idx.raw}; + const auto sel_hi_mask = + RebindMask(d, BitCast(di, idx_vec) > Set(di, int16_t{7})); + const auto lo_lookup_result = TableLookupLanes(a, idx); + const auto hi_lookup_result = TableLookupLanes(b, idx); + return IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result); +#else + const DFromV<decltype(a)> d; + const Repartition<uint8_t, decltype(d)> du8; + return BitCast(d, TwoTablesLookupLanes(BitCast(du8, a), 
BitCast(du8, b), + Indices128<uint8_t>{idx.raw})); +#endif +} + +template <typename T, HWY_IF_UI32(T)> +HWY_API Vec128<T> TwoTablesLookupLanes(Vec128<T> a, Vec128<T> b, + Indices128<T> idx) { +#if HWY_TARGET <= HWY_AVX3 + return Vec128<T>{_mm_permutex2var_epi32(a.raw, idx.raw, b.raw)}; +#else // AVX2 or below + const DFromV<decltype(a)> d; + +#if HWY_TARGET <= HWY_AVX2 || HWY_TARGET == HWY_SSE2 + const Vec128<T> idx_vec{idx.raw}; + +#if HWY_TARGET <= HWY_AVX2 + const RebindToFloat<decltype(d)> d_sel; + const auto sel_hi_mask = MaskFromVec(BitCast(d_sel, ShiftLeft<29>(idx_vec))); +#else + const RebindToSigned<decltype(d)> d_sel; + const auto sel_hi_mask = BitCast(d_sel, idx_vec) > Set(d_sel, int32_t{3}); +#endif + + const auto lo_lookup_result = BitCast(d_sel, TableLookupLanes(a, idx)); + const auto hi_lookup_result = BitCast(d_sel, TableLookupLanes(b, idx)); + return BitCast(d, + IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result)); +#else // SSSE3 or SSE4 + const Repartition<uint8_t, decltype(d)> du8; + return BitCast(d, TwoTablesLookupLanes(BitCast(du8, a), BitCast(du8, b), + Indices128<uint8_t>{idx.raw})); +#endif // HWY_TARGET <= HWY_AVX2 || HWY_TARGET == HWY_SSE2 +#endif // HWY_TARGET <= HWY_AVX3 +} + +HWY_API Vec128<float> TwoTablesLookupLanes(Vec128<float> a, Vec128<float> b, + Indices128<float> idx) { +#if HWY_TARGET <= HWY_AVX3 + return Vec128<float>{_mm_permutex2var_ps(a.raw, idx.raw, b.raw)}; +#elif HWY_TARGET <= HWY_AVX2 || HWY_TARGET == HWY_SSE2 + const DFromV<decltype(a)> d; + +#if HWY_TARGET <= HWY_AVX2 + const auto sel_hi_mask = + MaskFromVec(BitCast(d, ShiftLeft<29>(Vec128<int32_t>{idx.raw}))); +#else + const RebindToSigned<decltype(d)> di; + const auto sel_hi_mask = + RebindMask(d, Vec128<int32_t>{idx.raw} > Set(di, int32_t{3})); +#endif + + const auto lo_lookup_result = TableLookupLanes(a, idx); + const auto hi_lookup_result = TableLookupLanes(b, idx); + return IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result); +#else // SSSE3 
or SSE4 + const DFromV<decltype(a)> d; + const Repartition<uint8_t, decltype(d)> du8; + return BitCast(d, TwoTablesLookupLanes(BitCast(du8, a), BitCast(du8, b), + Indices128<uint8_t>{idx.raw})); +#endif +} + +template <typename T, HWY_IF_UI64(T)> +HWY_API Vec128<T> TwoTablesLookupLanes(Vec128<T> a, Vec128<T> b, + Indices128<T> idx) { +#if HWY_TARGET <= HWY_AVX3 + return Vec128<T>{_mm_permutex2var_epi64(a.raw, idx.raw, b.raw)}; +#else + const DFromV<decltype(a)> d; + const Vec128<T> idx_vec{idx.raw}; + const Indices128<T> idx_mod{And(idx_vec, Set(d, T{1})).raw}; + +#if HWY_TARGET <= HWY_SSE4 + const RebindToFloat<decltype(d)> d_sel; + const auto sel_hi_mask = MaskFromVec(BitCast(d_sel, ShiftLeft<62>(idx_vec))); +#else // SSE2 or SSSE3 + const Repartition<int32_t, decltype(d)> di32; + const RebindToSigned<decltype(d)> d_sel; + const auto sel_hi_mask = MaskFromVec( + BitCast(d_sel, VecFromMask(di32, DupEven(BitCast(di32, idx_vec)) > + Set(di32, int32_t{1})))); +#endif // HWY_TARGET <= HWY_SSE4 + + const auto lo_lookup_result = BitCast(d_sel, TableLookupLanes(a, idx_mod)); + const auto hi_lookup_result = BitCast(d_sel, TableLookupLanes(b, idx_mod)); + return BitCast(d, + IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result)); +#endif // HWY_TARGET <= HWY_AVX3 +} + +HWY_API Vec128<double> TwoTablesLookupLanes(Vec128<double> a, Vec128<double> b, + Indices128<double> idx) { +#if HWY_TARGET <= HWY_AVX3 + return Vec128<double>{_mm_permutex2var_pd(a.raw, idx.raw, b.raw)}; +#else + const DFromV<decltype(a)> d; + const RebindToSigned<decltype(d)> di; + const Vec128<int64_t> idx_vec{idx.raw}; + const Indices128<double> idx_mod{And(idx_vec, Set(di, int64_t{1})).raw}; + +#if HWY_TARGET <= HWY_SSE4 + const auto sel_hi_mask = MaskFromVec(BitCast(d, ShiftLeft<62>(idx_vec))); +#else // SSE2 or SSSE3 + const Repartition<int32_t, decltype(d)> di32; + const auto sel_hi_mask = + MaskFromVec(BitCast(d, VecFromMask(di32, DupEven(BitCast(di32, idx_vec)) > + Set(di32, int32_t{1})))); 
+#endif // HWY_TARGET <= HWY_SSE4 + + const auto lo_lookup_result = TableLookupLanes(a, idx_mod); + const auto hi_lookup_result = TableLookupLanes(b, idx_mod); + return IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result); +#endif // HWY_TARGET <= HWY_AVX3 +} + +// ------------------------------ OddEven (IfThenElse) + +template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)> +HWY_INLINE Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) { + const DFromV<decltype(a)> d; + const Repartition<uint8_t, decltype(d)> d8; + alignas(16) static constexpr uint8_t mask[16] = { + 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0}; + return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a); +} + +template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)> +HWY_INLINE Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) { +#if HWY_TARGET >= HWY_SSSE3 + const DFromV<decltype(a)> d; + const Repartition<uint8_t, decltype(d)> d8; + alignas(16) static constexpr uint8_t mask[16] = { + 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0}; + return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a); +#else + return Vec128<T, N>{_mm_blend_epi16(a.raw, b.raw, 0x55)}; +#endif +} + +template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)> +HWY_INLINE Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) { +#if HWY_TARGET >= HWY_SSSE3 + const __m128i odd = _mm_shuffle_epi32(a.raw, _MM_SHUFFLE(3, 1, 3, 1)); + const __m128i even = _mm_shuffle_epi32(b.raw, _MM_SHUFFLE(2, 0, 2, 0)); + return Vec128<T, N>{_mm_unpacklo_epi32(even, odd)}; +#else + // _mm_blend_epi16 has throughput 1/cycle on SKX, whereas _ps can do 3/cycle. 
+ const DFromV<decltype(a)> d; + const RebindToFloat<decltype(d)> df; + return BitCast(d, Vec128<float, N>{_mm_blend_ps(BitCast(df, a).raw, + BitCast(df, b).raw, 5)}); +#endif +} + +template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)> +HWY_INLINE Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) { + // Same as ConcatUpperLower for full vectors; do not call that because this + // is more efficient for 64x1 vectors. + const DFromV<decltype(a)> d; + const RebindToFloat<decltype(d)> dd; +#if HWY_TARGET >= HWY_SSSE3 + return BitCast( + d, Vec128<double, N>{_mm_shuffle_pd( + BitCast(dd, b).raw, BitCast(dd, a).raw, _MM_SHUFFLE2(1, 0))}); +#else + // _mm_shuffle_pd has throughput 1/cycle on SKX, whereas blend can do 3/cycle. + return BitCast(d, Vec128<double, N>{_mm_blend_pd(BitCast(dd, a).raw, + BitCast(dd, b).raw, 1)}); +#endif +} + +template <size_t N> +HWY_API Vec128<float, N> OddEven(Vec128<float, N> a, Vec128<float, N> b) { +#if HWY_TARGET >= HWY_SSSE3 + // SHUFPS must fill the lower half of the output from one input, so we + // need another shuffle. Unpack avoids another immediate byte. 
+ const __m128 odd = _mm_shuffle_ps(a.raw, a.raw, _MM_SHUFFLE(3, 1, 3, 1)); + const __m128 even = _mm_shuffle_ps(b.raw, b.raw, _MM_SHUFFLE(2, 0, 2, 0)); + return Vec128<float, N>{_mm_unpacklo_ps(even, odd)}; +#else + return Vec128<float, N>{_mm_blend_ps(a.raw, b.raw, 5)}; +#endif +} + +// ------------------------------ OddEvenBlocks +template <typename T, size_t N> +HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) { + return even; +} + +// ------------------------------ SwapAdjacentBlocks + +template <typename T, size_t N> +HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) { + return v; +} + +// ------------------------------ Shl (ZipLower, Mul) + +// Use AVX2/3 variable shifts where available, otherwise multiply by powers of +// two from loading float exponents, which is considerably faster (according +// to LLVM-MCA) than scalar or testing bits: https://gcc.godbolt.org/z/9G7Y9v. + +namespace detail { +#if HWY_TARGET > HWY_AVX3 // Unused for AVX3 - we use sllv directly + +// Returns 2^v for use as per-lane multipliers to emulate 16-bit shifts. +template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)> +HWY_INLINE Vec128<MakeUnsigned<T>, N> Pow2(const Vec128<T, N> v) { + const DFromV<decltype(v)> d; + const RebindToUnsigned<decltype(d)> du; + const RepartitionToWide<decltype(d)> dw; + const Rebind<float, decltype(dw)> df; + const auto zero = Zero(d); + // Move into exponent (this u16 will become the upper half of an f32) + const auto exp = ShiftLeft<23 - 16>(v); + const auto upper = exp + Set(d, 0x3F80); // upper half of 1.0f + // Insert 0 into lower halves for reinterpreting as binary32. + const auto f0 = ZipLower(dw, zero, upper); + const auto f1 = ZipUpper(dw, zero, upper); + // See cvtps comment below. 
+ const VFromD<decltype(dw)> bits0{_mm_cvtps_epi32(BitCast(df, f0).raw)}; + const VFromD<decltype(dw)> bits1{_mm_cvtps_epi32(BitCast(df, f1).raw)}; +#if HWY_TARGET <= HWY_SSE4 + return VFromD<decltype(du)>{_mm_packus_epi32(bits0.raw, bits1.raw)}; +#else + return ConcatEven(du, BitCast(du, bits1), BitCast(du, bits0)); +#endif +} + +// Same, for 32-bit shifts. +template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)> +HWY_INLINE Vec128<MakeUnsigned<T>, N> Pow2(const Vec128<T, N> v) { + const DFromV<decltype(v)> d; + const auto exp = ShiftLeft<23>(v); + const auto f = exp + Set(d, 0x3F800000); // 1.0f + // Do not use ConvertTo because we rely on the native 0x80..00 overflow + // behavior. + return Vec128<MakeUnsigned<T>, N>{_mm_cvtps_epi32(_mm_castsi128_ps(f.raw))}; +} + +#endif // HWY_TARGET > HWY_AVX3 + +template <size_t N> +HWY_API Vec128<uint16_t, N> Shl(hwy::UnsignedTag /*tag*/, Vec128<uint16_t, N> v, + Vec128<uint16_t, N> bits) { +#if HWY_TARGET <= HWY_AVX3 + return Vec128<uint16_t, N>{_mm_sllv_epi16(v.raw, bits.raw)}; +#else + return v * Pow2(bits); +#endif +} +HWY_API Vec128<uint16_t, 1> Shl(hwy::UnsignedTag /*tag*/, Vec128<uint16_t, 1> v, + Vec128<uint16_t, 1> bits) { + return Vec128<uint16_t, 1>{_mm_sll_epi16(v.raw, bits.raw)}; +} + +// 8-bit: may use the Shl overload for uint16_t. 
+template <size_t N> +HWY_API Vec128<uint8_t, N> Shl(hwy::UnsignedTag tag, Vec128<uint8_t, N> v, + Vec128<uint8_t, N> bits) { + const DFromV<decltype(v)> d; +#if HWY_TARGET <= HWY_AVX3_DL + (void)tag; + // kMask[i] = 0xFF >> i + alignas(16) static constexpr uint8_t kMasks[16] = { + 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01, 0x00}; + // kShl[i] = 1 << i + alignas(16) static constexpr uint8_t kShl[16] = {1, 2, 4, 8, 0x10, + 0x20, 0x40, 0x80, 0x00}; + v = And(v, TableLookupBytes(Load(d, kMasks), bits)); + const VFromD<decltype(d)> mul = TableLookupBytes(Load(d, kShl), bits); + return VFromD<decltype(d)>{_mm_gf2p8mul_epi8(v.raw, mul.raw)}; +#else + const Repartition<uint16_t, decltype(d)> dw; + using VW = VFromD<decltype(dw)>; + const VW mask = Set(dw, 0x00FF); + const VW vw = BitCast(dw, v); + const VW bits16 = BitCast(dw, bits); + const VW evens = Shl(tag, And(vw, mask), And(bits16, mask)); + // Shift odd lanes in-place + const VW odds = Shl(tag, vw, ShiftRight<8>(bits16)); + return BitCast(d, IfVecThenElse(Set(dw, 0xFF00), odds, evens)); +#endif +} +HWY_API Vec128<uint8_t, 1> Shl(hwy::UnsignedTag /*tag*/, Vec128<uint8_t, 1> v, + Vec128<uint8_t, 1> bits) { + const Full16<uint16_t> d16; + const Vec16<uint16_t> bits16{bits.raw}; + const Vec16<uint16_t> bits8 = And(bits16, Set(d16, 0xFF)); + return Vec128<uint8_t, 1>{_mm_sll_epi16(v.raw, bits8.raw)}; +} + +template <size_t N> +HWY_API Vec128<uint32_t, N> Shl(hwy::UnsignedTag /*tag*/, Vec128<uint32_t, N> v, + Vec128<uint32_t, N> bits) { +#if HWY_TARGET >= HWY_SSE4 + return v * Pow2(bits); +#else + return Vec128<uint32_t, N>{_mm_sllv_epi32(v.raw, bits.raw)}; +#endif +} +HWY_API Vec128<uint32_t, 1> Shl(hwy::UnsignedTag /*tag*/, Vec128<uint32_t, 1> v, + const Vec128<uint32_t, 1> bits) { + return Vec128<uint32_t, 1>{_mm_sll_epi32(v.raw, bits.raw)}; +} + +HWY_API Vec128<uint64_t> Shl(hwy::UnsignedTag /*tag*/, Vec128<uint64_t> v, + Vec128<uint64_t> bits) { +#if HWY_TARGET >= HWY_SSE4 + const DFromV<decltype(v)> d; + // 
Individual shifts and combine + const Vec128<uint64_t> out0{_mm_sll_epi64(v.raw, bits.raw)}; + const __m128i bits1 = _mm_unpackhi_epi64(bits.raw, bits.raw); + const Vec128<uint64_t> out1{_mm_sll_epi64(v.raw, bits1)}; + return ConcatUpperLower(d, out1, out0); +#else + return Vec128<uint64_t>{_mm_sllv_epi64(v.raw, bits.raw)}; +#endif +} +HWY_API Vec64<uint64_t> Shl(hwy::UnsignedTag /*tag*/, Vec64<uint64_t> v, + Vec64<uint64_t> bits) { + return Vec64<uint64_t>{_mm_sll_epi64(v.raw, bits.raw)}; +} + +// Signed left shift is the same as unsigned. +template <typename T, size_t N> +HWY_API Vec128<T, N> Shl(hwy::SignedTag /*tag*/, Vec128<T, N> v, + Vec128<T, N> bits) { + const DFromV<decltype(v)> di; + const RebindToUnsigned<decltype(di)> du; + return BitCast(di, + Shl(hwy::UnsignedTag(), BitCast(du, v), BitCast(du, bits))); +} + +} // namespace detail + +template <typename T, size_t N> +HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, Vec128<T, N> bits) { + return detail::Shl(hwy::TypeTag<T>(), v, bits); +} + +// ------------------------------ Shr (mul, mask, BroadcastSignBit) + +// Use AVX2+ variable shifts except for SSSE3/SSE4 or 16-bit. There, we use +// widening multiplication by powers of two obtained by loading float exponents, +// followed by a constant right-shift. This is still faster than a scalar or +// bit-test approach: https://gcc.godbolt.org/z/9G7Y9v. + +template <size_t N> +HWY_API Vec128<uint16_t, N> operator>>(Vec128<uint16_t, N> in, + const Vec128<uint16_t, N> bits) { +#if HWY_TARGET <= HWY_AVX3 + return Vec128<uint16_t, N>{_mm_srlv_epi16(in.raw, bits.raw)}; +#else + const DFromV<decltype(in)> d; + // For bits=0, we cannot mul by 2^16, so fix the result later. + const auto out = MulHigh(in, detail::Pow2(Set(d, 16) - bits)); + // Replace output with input where bits == 0. 
+ return IfThenElse(bits == Zero(d), in, out); +#endif +} +HWY_API Vec128<uint16_t, 1> operator>>(const Vec128<uint16_t, 1> in, + const Vec128<uint16_t, 1> bits) { + return Vec128<uint16_t, 1>{_mm_srl_epi16(in.raw, bits.raw)}; +} + +// 8-bit uses 16-bit shifts. +template <size_t N> +HWY_API Vec128<uint8_t, N> operator>>(Vec128<uint8_t, N> in, + const Vec128<uint8_t, N> bits) { + const DFromV<decltype(in)> d; + const Repartition<uint16_t, decltype(d)> dw; + using VW = VFromD<decltype(dw)>; + const VW mask = Set(dw, 0x00FF); + const VW vw = BitCast(dw, in); + const VW bits16 = BitCast(dw, bits); + const VW evens = And(vw, mask) >> And(bits16, mask); + // Shift odd lanes in-place + const VW odds = vw >> ShiftRight<8>(bits16); + return OddEven(BitCast(d, odds), BitCast(d, evens)); +} +HWY_API Vec128<uint8_t, 1> operator>>(const Vec128<uint8_t, 1> in, + const Vec128<uint8_t, 1> bits) { + const Full16<uint16_t> d16; + const Vec16<uint16_t> bits16{bits.raw}; + const Vec16<uint16_t> bits8 = And(bits16, Set(d16, 0xFF)); + return Vec128<uint8_t, 1>{_mm_srl_epi16(in.raw, bits8.raw)}; +} + +template <size_t N> +HWY_API Vec128<uint32_t, N> operator>>(const Vec128<uint32_t, N> in, + const Vec128<uint32_t, N> bits) { +#if HWY_TARGET >= HWY_SSE4 + // 32x32 -> 64 bit mul, then shift right by 32. + const DFromV<decltype(in)> d32; + // Move odd lanes into position for the second mul. Shuffle more gracefully + // handles N=1 than repartitioning to u64 and shifting 32 bits right. + const Vec128<uint32_t, N> in31{_mm_shuffle_epi32(in.raw, 0x31)}; + // For bits=0, we cannot mul by 2^32, so fix the result later. + const auto mul = detail::Pow2(Set(d32, 32) - bits); + const auto out20 = ShiftRight<32>(MulEven(in, mul)); // z 2 z 0 + const Vec128<uint32_t, N> mul31{_mm_shuffle_epi32(mul.raw, 0x31)}; + // No need to shift right, already in the correct position. + const auto out31 = BitCast(d32, MulEven(in31, mul31)); // 3 ? 1 ? 
+ const Vec128<uint32_t, N> out = OddEven(out31, BitCast(d32, out20)); + // Replace output with input where bits == 0. + return IfThenElse(bits == Zero(d32), in, out); +#else + return Vec128<uint32_t, N>{_mm_srlv_epi32(in.raw, bits.raw)}; +#endif +} +HWY_API Vec128<uint32_t, 1> operator>>(const Vec128<uint32_t, 1> in, + const Vec128<uint32_t, 1> bits) { + return Vec128<uint32_t, 1>{_mm_srl_epi32(in.raw, bits.raw)}; +} + +HWY_API Vec128<uint64_t> operator>>(const Vec128<uint64_t> v, + const Vec128<uint64_t> bits) { +#if HWY_TARGET >= HWY_SSE4 + const DFromV<decltype(v)> d; + // Individual shifts and combine + const Vec128<uint64_t> out0{_mm_srl_epi64(v.raw, bits.raw)}; + const __m128i bits1 = _mm_unpackhi_epi64(bits.raw, bits.raw); + const Vec128<uint64_t> out1{_mm_srl_epi64(v.raw, bits1)}; + return ConcatUpperLower(d, out1, out0); +#else + return Vec128<uint64_t>{_mm_srlv_epi64(v.raw, bits.raw)}; +#endif +} +HWY_API Vec64<uint64_t> operator>>(const Vec64<uint64_t> v, + const Vec64<uint64_t> bits) { + return Vec64<uint64_t>{_mm_srl_epi64(v.raw, bits.raw)}; +} + +#if HWY_TARGET > HWY_AVX3 // AVX2 or older +namespace detail { + +// Also used in x86_256-inl.h. +template <class DI, class V> +HWY_INLINE V SignedShr(const DI di, const V v, const V count_i) { + const RebindToUnsigned<DI> du; + const auto count = BitCast(du, count_i); // same type as value to shift + // Clear sign and restore afterwards. This is preferable to shifting the MSB + // downwards because Shr is somewhat more expensive than Shl. 
+ const auto sign = BroadcastSignBit(v); + const auto abs = BitCast(du, v ^ sign); // off by one, but fixed below + return BitCast(di, abs >> count) ^ sign; +} + +} // namespace detail +#endif // HWY_TARGET > HWY_AVX3 + +template <size_t N> +HWY_API Vec128<int16_t, N> operator>>(Vec128<int16_t, N> v, + Vec128<int16_t, N> bits) { +#if HWY_TARGET <= HWY_AVX3 + return Vec128<int16_t, N>{_mm_srav_epi16(v.raw, bits.raw)}; +#else + const DFromV<decltype(v)> d; + return detail::SignedShr(d, v, bits); +#endif +} +HWY_API Vec128<int16_t, 1> operator>>(Vec128<int16_t, 1> v, + Vec128<int16_t, 1> bits) { + return Vec128<int16_t, 1>{_mm_sra_epi16(v.raw, bits.raw)}; +} + +template <size_t N> +HWY_API Vec128<int32_t, N> operator>>(Vec128<int32_t, N> v, + Vec128<int32_t, N> bits) { +#if HWY_TARGET <= HWY_AVX2 + return Vec128<int32_t, N>{_mm_srav_epi32(v.raw, bits.raw)}; +#else + const DFromV<decltype(v)> d; + return detail::SignedShr(d, v, bits); +#endif +} +HWY_API Vec128<int32_t, 1> operator>>(Vec128<int32_t, 1> v, + Vec128<int32_t, 1> bits) { + return Vec128<int32_t, 1>{_mm_sra_epi32(v.raw, bits.raw)}; +} + +template <size_t N> +HWY_API Vec128<int64_t, N> operator>>(Vec128<int64_t, N> v, + Vec128<int64_t, N> bits) { +#if HWY_TARGET <= HWY_AVX3 + return Vec128<int64_t, N>{_mm_srav_epi64(v.raw, bits.raw)}; +#else + const DFromV<decltype(v)> d; + return detail::SignedShr(d, v, bits); +#endif +} + +// ------------------------------ MulEven/Odd 64x64 (UpperHalf) + +HWY_INLINE Vec128<uint64_t> MulEven(Vec128<uint64_t> a, Vec128<uint64_t> b) { + const DFromV<decltype(a)> d; + alignas(16) uint64_t mul[2]; + mul[0] = Mul128(GetLane(a), GetLane(b), &mul[1]); + return Load(d, mul); +} + +HWY_INLINE Vec128<uint64_t> MulOdd(Vec128<uint64_t> a, Vec128<uint64_t> b) { + const DFromV<decltype(a)> d; + const Half<decltype(d)> d2; + alignas(16) uint64_t mul[2]; + const uint64_t a1 = GetLane(UpperHalf(d2, a)); + const uint64_t b1 = GetLane(UpperHalf(d2, b)); + mul[0] = Mul128(a1, b1, &mul[1]); + 
return Load(d, mul); +} + +// ------------------------------ WidenMulPairwiseAdd + +// Generic for all vector lengths. +template <class D32, HWY_IF_F32_D(D32), + class V16 = VFromD<Repartition<bfloat16_t, D32>>> +HWY_API VFromD<D32> WidenMulPairwiseAdd(D32 df32, V16 a, V16 b) { + // TODO(janwas): _mm_dpbf16_ps when available + const RebindToUnsigned<decltype(df32)> du32; + // Lane order within sum0/1 is undefined, hence we can avoid the + // longer-latency lane-crossing PromoteTo. Using shift/and instead of Zip + // leads to the odd/even order that RearrangeToOddPlusEven prefers. + using VU32 = VFromD<decltype(du32)>; + const VU32 odd = Set(du32, 0xFFFF0000u); + const VU32 ae = ShiftLeft<16>(BitCast(du32, a)); + const VU32 ao = And(BitCast(du32, a), odd); + const VU32 be = ShiftLeft<16>(BitCast(du32, b)); + const VU32 bo = And(BitCast(du32, b), odd); + return MulAdd(BitCast(df32, ae), BitCast(df32, be), + Mul(BitCast(df32, ao), BitCast(df32, bo))); +} + +// Even if N=1, the input is always at least 2 lanes, hence madd_epi16 is safe. +template <class D32, HWY_IF_I32_D(D32), HWY_IF_V_SIZE_LE_D(D32, 16), + class V16 = VFromD<RepartitionToNarrow<D32>>> +HWY_API VFromD<D32> WidenMulPairwiseAdd(D32 /* tag */, V16 a, V16 b) { + return VFromD<D32>{_mm_madd_epi16(a.raw, b.raw)}; +} + + +// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ShiftLeft) + +// Generic for all vector lengths. +template <class D32, HWY_IF_F32_D(D32), + class V16 = VFromD<Repartition<bfloat16_t, D32>>> +HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 df32, V16 a, V16 b, + const VFromD<D32> sum0, + VFromD<D32>& sum1) { + // TODO(janwas): _mm_dpbf16_ps when available + const RebindToUnsigned<decltype(df32)> du32; + // Lane order within sum0/1 is undefined, hence we can avoid the + // longer-latency lane-crossing PromoteTo. Using shift/and instead of Zip + // leads to the odd/even order that RearrangeToOddPlusEven prefers. 
+ using VU32 = VFromD<decltype(du32)>; + const VU32 odd = Set(du32, 0xFFFF0000u); + const VU32 ae = ShiftLeft<16>(BitCast(du32, a)); + const VU32 ao = And(BitCast(du32, a), odd); + const VU32 be = ShiftLeft<16>(BitCast(du32, b)); + const VU32 bo = And(BitCast(du32, b), odd); + sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1); + return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0); +} + +// Even if N=1, the input is always at least 2 lanes, hence madd_epi16 is safe. +template <class D32, HWY_IF_I32_D(D32), HWY_IF_V_SIZE_LE_D(D32, 16), + class V16 = VFromD<RepartitionToNarrow<D32>>> +HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 d, V16 a, V16 b, + const VFromD<D32> sum0, + VFromD<D32>& /*sum1*/) { + (void)d; +#if HWY_TARGET <= HWY_AVX3_DL + return VFromD<D32>{_mm_dpwssd_epi32(sum0.raw, a.raw, b.raw)}; +#else + return sum0 + WidenMulPairwiseAdd(d, a, b); +#endif +} + +// ------------------------------ RearrangeToOddPlusEven +template <size_t N> +HWY_API Vec128<int32_t, N> RearrangeToOddPlusEven(const Vec128<int32_t, N> sum0, + Vec128<int32_t, N> /*sum1*/) { + return sum0; // invariant already holds +} + +template <class VW> +HWY_API VW RearrangeToOddPlusEven(const VW sum0, const VW sum1) { + return Add(sum0, sum1); +} + +// ================================================== CONVERT + +// ------------------------------ Promotions (part w/ narrow lanes -> full) + +// Unsigned: zero-extend. 
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U16_D(D)> +HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint8_t, D>> v) { +#if HWY_TARGET >= HWY_SSSE3 + const __m128i zero = _mm_setzero_si128(); + return VFromD<D>{_mm_unpacklo_epi8(v.raw, zero)}; +#else + return VFromD<D>{_mm_cvtepu8_epi16(v.raw)}; +#endif +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U32_D(D)> +HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) { +#if HWY_TARGET >= HWY_SSSE3 + return VFromD<D>{_mm_unpacklo_epi16(v.raw, _mm_setzero_si128())}; +#else + return VFromD<D>{_mm_cvtepu16_epi32(v.raw)}; +#endif +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)> +HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) { +#if HWY_TARGET >= HWY_SSSE3 + return VFromD<D>{_mm_unpacklo_epi32(v.raw, _mm_setzero_si128())}; +#else + return VFromD<D>{_mm_cvtepu32_epi64(v.raw)}; +#endif +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U32_D(D)> +HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint8_t, D>> v) { +#if HWY_TARGET >= HWY_SSSE3 + const __m128i zero = _mm_setzero_si128(); + const __m128i u16 = _mm_unpacklo_epi8(v.raw, zero); + return VFromD<D>{_mm_unpacklo_epi16(u16, zero)}; +#else + return VFromD<D>{_mm_cvtepu8_epi32(v.raw)}; +#endif +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)> +HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint8_t, D>> v) { +#if HWY_TARGET > HWY_SSSE3 + const Rebind<uint32_t, decltype(d)> du32; + return PromoteTo(d, PromoteTo(du32, v)); +#elif HWY_TARGET == HWY_SSSE3 + alignas(16) static constexpr int8_t kShuffle[16] = { + 0, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1}; + const Repartition<int8_t, decltype(d)> di8; + return TableLookupBytesOr0(v, BitCast(d, Load(di8, kShuffle))); +#else + (void)d; + return VFromD<D>{_mm_cvtepu8_epi64(v.raw)}; +#endif +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)> +HWY_API VFromD<D> PromoteTo(D d, 
VFromD<Rebind<uint16_t, D>> v) { +#if HWY_TARGET > HWY_SSSE3 + const Rebind<uint32_t, decltype(d)> du32; + return PromoteTo(d, PromoteTo(du32, v)); +#elif HWY_TARGET == HWY_SSSE3 + alignas(16) static constexpr int8_t kShuffle[16] = { + 0, 1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1}; + const Repartition<int8_t, decltype(d)> di8; + return TableLookupBytesOr0(v, BitCast(d, Load(di8, kShuffle))); +#else + (void)d; + return VFromD<D>{_mm_cvtepu16_epi64(v.raw)}; +#endif +} + +// Unsigned to signed: same plus cast. +template <class D, class V, HWY_IF_SIGNED_D(D), HWY_IF_UNSIGNED_V(V), + HWY_IF_LANES_GT(sizeof(TFromD<D>), sizeof(TFromV<V>)), + HWY_IF_LANES_D(D, HWY_MAX_LANES_V(V))> +HWY_API VFromD<D> PromoteTo(D di, V v) { + const RebindToUnsigned<decltype(di)> du; + return BitCast(di, PromoteTo(du, v)); +} + +// Signed: replicate sign bit. +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I16_D(D)> +HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int8_t, D>> v) { +#if HWY_TARGET >= HWY_SSSE3 + return ShiftRight<8>(VFromD<D>{_mm_unpacklo_epi8(v.raw, v.raw)}); +#else + return VFromD<D>{_mm_cvtepi8_epi16(v.raw)}; +#endif +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)> +HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) { +#if HWY_TARGET >= HWY_SSSE3 + return ShiftRight<16>(VFromD<D>{_mm_unpacklo_epi16(v.raw, v.raw)}); +#else + return VFromD<D>{_mm_cvtepi16_epi32(v.raw)}; +#endif +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)> +HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) { +#if HWY_TARGET >= HWY_SSSE3 + return ShiftRight<32>(VFromD<D>{_mm_unpacklo_epi32(v.raw, v.raw)}); +#else + return VFromD<D>{_mm_cvtepi32_epi64(v.raw)}; +#endif +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)> +HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int8_t, D>> v) { +#if HWY_TARGET >= HWY_SSSE3 + const __m128i x2 = _mm_unpacklo_epi8(v.raw, v.raw); + const __m128i x4 
= _mm_unpacklo_epi16(x2, x2); + return ShiftRight<24>(VFromD<D>{x4}); +#else + return VFromD<D>{_mm_cvtepi8_epi32(v.raw)}; +#endif +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)> +HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<int8_t, D>> v) { +#if HWY_TARGET >= HWY_SSSE3 + const Repartition<int32_t, decltype(d)> di32; + const Half<decltype(di32)> dh_i32; + const VFromD<decltype(di32)> x4{PromoteTo(dh_i32, v).raw}; + const VFromD<decltype(di32)> s4{ + _mm_shufflelo_epi16(x4.raw, _MM_SHUFFLE(3, 3, 1, 1))}; + return ZipLower(d, x4, s4); +#else + (void)d; + return VFromD<D>{_mm_cvtepi8_epi64(v.raw)}; +#endif +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)> +HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<int16_t, D>> v) { +#if HWY_TARGET >= HWY_SSSE3 + const Repartition<int32_t, decltype(d)> di32; + const Half<decltype(di32)> dh_i32; + const VFromD<decltype(di32)> x2{PromoteTo(dh_i32, v).raw}; + const VFromD<decltype(di32)> s2{ + _mm_shufflelo_epi16(x2.raw, _MM_SHUFFLE(3, 3, 1, 1))}; + return ZipLower(d, x2, s2); +#else + (void)d; + return VFromD<D>{_mm_cvtepi16_epi64(v.raw)}; +#endif +} + +// Workaround for origin tracking bug in Clang msan prior to 11.0 +// (spurious "uninitialized memory" for TestF16 with "ORIGIN: invalid") +#if HWY_IS_MSAN && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 1100) +#define HWY_INLINE_F16 HWY_NOINLINE +#else +#define HWY_INLINE_F16 HWY_INLINE +#endif +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)> +HWY_INLINE_F16 VFromD<D> PromoteTo(D df32, VFromD<Rebind<float16_t, D>> v) { +#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_F16C) + const RebindToSigned<decltype(df32)> di32; + const RebindToUnsigned<decltype(df32)> du32; + // Expand to u32 so we can shift. 
+ const auto bits16 = PromoteTo(du32, VFromD<Rebind<uint16_t, D>>{v.raw}); + const auto sign = ShiftRight<15>(bits16); + const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F); + const auto mantissa = bits16 & Set(du32, 0x3FF); + const auto subnormal = + BitCast(du32, ConvertTo(df32, BitCast(di32, mantissa)) * + Set(df32, 1.0f / 16384 / 1024)); + + const auto biased_exp32 = biased_exp + Set(du32, 127 - 15); + const auto mantissa32 = ShiftLeft<23 - 10>(mantissa); + const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32; + const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal); + return BitCast(df32, ShiftLeft<31>(sign) | bits32); +#else + (void)df32; + return VFromD<D>{_mm_cvtph_ps(v.raw)}; +#endif +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)> +HWY_API VFromD<D> PromoteTo(D df32, VFromD<Rebind<bfloat16_t, D>> v) { + const Rebind<uint16_t, decltype(df32)> du16; + const RebindToSigned<decltype(df32)> di32; + return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v)))); +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)> +HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<float, D>> v) { + return VFromD<D>{_mm_cvtps_pd(v.raw)}; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)> +HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) { + return VFromD<D>{_mm_cvtepi32_pd(v.raw)}; +} + +// ------------------------------ Demotions (full -> part w/ narrow lanes) + +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I16_D(D)> +HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) { + return VFromD<D>{_mm_packs_epi32(v.raw, v.raw)}; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)> +HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) { +#if HWY_TARGET >= HWY_SSSE3 + const Rebind<int32_t, D> di32; + const auto zero_if_neg = AndNot(ShiftRight<31>(v), v); + const auto too_big = VecFromMask(di32, Gt(v, 
Set(di32, 0xFFFF))); + const auto clamped = Or(zero_if_neg, too_big); +#if HWY_TARGET == HWY_SSE2 + const Rebind<uint16_t, decltype(di32)> du16; + const RebindToSigned<decltype(du16)> di16; + return BitCast(du16, DemoteTo(di16, ShiftRight<16>(ShiftLeft<16>(clamped)))); +#else + const Repartition<uint16_t, decltype(di32)> du16; + // Lower 2 bytes from each 32-bit lane; same as return type for fewer casts. + alignas(16) static constexpr uint16_t kLower2Bytes[16] = { + 0x0100, 0x0504, 0x0908, 0x0D0C, 0x8080, 0x8080, 0x8080, 0x8080}; + const auto lo2 = Load(du16, kLower2Bytes); + return VFromD<D>{TableLookupBytes(BitCast(du16, clamped), lo2).raw}; +#endif +#else + return VFromD<D>{_mm_packus_epi32(v.raw, v.raw)}; +#endif +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)> +HWY_API VFromD<D> DemoteTo(D du16, VFromD<Rebind<uint32_t, D>> v) { + const DFromV<decltype(v)> du32; + const RebindToSigned<decltype(du32)> di32; +#if HWY_TARGET >= HWY_SSSE3 + const auto too_big = + VecFromMask(di32, Gt(BitCast(di32, ShiftRight<16>(v)), Zero(di32))); + const auto clamped = Or(BitCast(di32, v), too_big); +#if HWY_TARGET == HWY_SSE2 + const RebindToSigned<decltype(du16)> di16; + return BitCast(du16, DemoteTo(di16, ShiftRight<16>(ShiftLeft<16>(clamped)))); +#else + (void)du16; + const Repartition<uint16_t, decltype(di32)> du16_full; + // Lower 2 bytes from each 32-bit lane; same as return type for fewer casts. 
+ alignas(16) static constexpr uint16_t kLower2Bytes[16] = { + 0x0100, 0x0504, 0x0908, 0x0D0C, 0x8080, 0x8080, 0x8080, 0x8080}; + const auto lo2 = Load(du16_full, kLower2Bytes); + return VFromD<D>{TableLookupBytes(BitCast(du16_full, clamped), lo2).raw}; +#endif +#else + return DemoteTo(du16, BitCast(di32, Min(v, Set(du32, 0x7FFFFFFF)))); +#endif +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U8_D(D)> +HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) { + const __m128i i16 = _mm_packs_epi32(v.raw, v.raw); + return VFromD<D>{_mm_packus_epi16(i16, i16)}; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U8_D(D)> +HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) { + return VFromD<D>{_mm_packus_epi16(v.raw, v.raw)}; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_I8_D(D)> +HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) { + const __m128i i16 = _mm_packs_epi32(v.raw, v.raw); + return VFromD<D>{_mm_packs_epi16(i16, i16)}; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I8_D(D)> +HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) { + return VFromD<D>{_mm_packs_epi16(v.raw, v.raw)}; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U8_D(D)> +HWY_API VFromD<D> DemoteTo(D du8, VFromD<Rebind<uint32_t, D>> v) { +#if HWY_TARGET <= HWY_AVX3 + // NOTE: _mm_cvtusepi32_epi8 is a saturated conversion of 32-bit unsigned + // integers to 8-bit unsigned integers + (void)du8; + return VFromD<D>{_mm_cvtusepi32_epi8(v.raw)}; +#else + const DFromV<decltype(v)> du32; + const RebindToSigned<decltype(du32)> di32; + const auto max_i32 = Set(du32, 0x7FFFFFFFu); + +#if HWY_TARGET >= HWY_SSSE3 + // On SSE2/SSSE3, clamp u32 values to an i32 using the u8 Min operation + // as SSE2/SSSE3 can do an u8 Min operation in a single instruction. + + // The u8 Min operation below leaves the lower 24 bits of each 32-bit + // lane unchanged. 
+ + // The u8 Min operation below will leave any values that are less than or + // equal to 0x7FFFFFFF unchanged. + + // For values that are greater than or equal to 0x80000000, the u8 Min + // operation below will force the upper 8 bits to 0x7F and leave the lower + // 24 bits unchanged. + + // An u8 Min operation is okay here as any clamped value that is greater than + // or equal to 0x80000000 will be clamped to a value between 0x7F000000 and + // 0x7FFFFFFF through the u8 Min operation below, which will then be converted + // to 0xFF through the i32->u8 demotion. + const Repartition<uint8_t, decltype(du32)> du32_as_du8; + const auto clamped = BitCast( + di32, Min(BitCast(du32_as_du8, v), BitCast(du32_as_du8, max_i32))); +#else + const auto clamped = BitCast(di32, Min(v, max_i32)); +#endif + + return DemoteTo(du8, clamped); +#endif +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U8_D(D)> +HWY_API VFromD<D> DemoteTo(D du8, VFromD<Rebind<uint16_t, D>> v) { + const DFromV<decltype(v)> du16; + const RebindToSigned<decltype(du16)> di16; + const auto max_i16 = Set(du16, 0x7FFF); + +#if HWY_TARGET >= HWY_SSSE3 + // On SSE2/SSSE3, clamp u16 values to an i16 using the u8 Min operation + // as SSE2/SSSE3 can do an u8 Min operation in a single instruction. + + // The u8 Min operation below leaves the lower 8 bits of each 16-bit + // lane unchanged. + + // The u8 Min operation below will leave any values that are less than or + // equal to 0x7FFF unchanged. + + // For values that are greater than or equal to 0x8000, the u8 Min + // operation below will force the upper 8 bits to 0x7F and leave the lower + // 8 bits unchanged. + + // An u8 Min operation is okay here as any clamped value that is greater than + // or equal to 0x8000 will be clamped to a value between 0x7F00 and + // 0x7FFF through the u8 Min operation below, which will then be converted + // to 0xFF through the i16->u8 demotion. 
+ const Repartition<uint8_t, decltype(du16)> du16_as_du8; + const auto clamped = BitCast( + di16, Min(BitCast(du16_as_du8, v), BitCast(du16_as_du8, max_i16))); +#else + const auto clamped = BitCast(di16, Min(v, max_i16)); +#endif + + return DemoteTo(du8, clamped); +} + +// Work around MSVC warning for _mm_cvtps_ph (8 is actually a valid immediate). +// clang-cl requires a non-empty string, so we 'ignore' the irrelevant -Wmain. +HWY_DIAGNOSTICS(push) +HWY_DIAGNOSTICS_OFF(disable : 4556, ignored "-Wmain") + +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F16_D(D)> +HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<float, D>> v) { +#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_F16C) + const RebindToUnsigned<decltype(df16)> du16; + const Rebind<uint32_t, decltype(df16)> du; + const RebindToSigned<decltype(du)> di; + const auto bits32 = BitCast(du, v); + const auto sign = ShiftRight<31>(bits32); + const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF); + const auto mantissa32 = bits32 & Set(du, 0x7FFFFF); + + const auto k15 = Set(di, 15); + const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15); + const auto is_tiny = exp < Set(di, -24); + + const auto is_subnormal = exp < Set(di, -14); + const auto biased_exp16 = + BitCast(du, IfThenZeroElse(is_subnormal, exp + k15)); + const auto sub_exp = BitCast(du, Set(di, -14) - exp); // [1, 11) + const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) + + (mantissa32 >> (Set(du, 13) + sub_exp)); + const auto mantissa16 = IfThenElse(RebindMask(du, is_subnormal), sub_m, + ShiftRight<13>(mantissa32)); // <1024 + + const auto sign16 = ShiftLeft<15>(sign); + const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16; + const auto bits16 = IfThenZeroElse(is_tiny, BitCast(di, normal16)); + return BitCast(df16, DemoteTo(du16, bits16)); +#else + (void)df16; + return VFromD<D>{_mm_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)}; +#endif +} + +HWY_DIAGNOSTICS(pop) + +template <class D, 
HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_BF16_D(D)> +HWY_API VFromD<D> DemoteTo(D dbf16, VFromD<Rebind<float, D>> v) { + // TODO(janwas): _mm_cvtneps_pbh once we have avx512bf16. + const Rebind<int32_t, decltype(dbf16)> di32; + const Rebind<uint32_t, decltype(dbf16)> du32; // for logical shift right + const Rebind<uint16_t, decltype(dbf16)> du16; + const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v))); + return BitCast(dbf16, DemoteTo(du16, bits_in_32)); +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_BF16_D(D), + class V32 = VFromD<Repartition<float, D>>> +HWY_API VFromD<D> ReorderDemote2To(D dbf16, V32 a, V32 b) { + // TODO(janwas): _mm_cvtne2ps_pbh once we have avx512bf16. + const RebindToUnsigned<decltype(dbf16)> du16; + const Repartition<uint32_t, decltype(dbf16)> du32; + const VFromD<decltype(du32)> b_in_even = ShiftRight<16>(BitCast(du32, b)); + return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even))); +} + +// Specializations for partial vectors because packs_epi32 sets lanes above 2*N. 
+template <class D, HWY_IF_I16_D(D)> +HWY_API Vec32<int16_t> ReorderDemote2To(D dn, Vec32<int32_t> a, + Vec32<int32_t> b) { + const DFromV<decltype(a)> d; + const Twice<decltype(d)> dt; + return DemoteTo(dn, Combine(dt, b, a)); +} +template <class D, HWY_IF_I16_D(D)> +HWY_API Vec64<int16_t> ReorderDemote2To(D /* tag */, Vec64<int32_t> a, + Vec64<int32_t> b) { + return Vec64<int16_t>{_mm_shuffle_epi32(_mm_packs_epi32(a.raw, b.raw), + _MM_SHUFFLE(2, 0, 2, 0))}; +} +template <class D, HWY_IF_I16_D(D)> +HWY_API Vec128<int16_t> ReorderDemote2To(D /* tag */, Vec128<int32_t> a, + Vec128<int32_t> b) { + return Vec128<int16_t>{_mm_packs_epi32(a.raw, b.raw)}; +} + +template <class D, HWY_IF_U16_D(D)> +HWY_API Vec32<uint16_t> ReorderDemote2To(D dn, Vec32<int32_t> a, + Vec32<int32_t> b) { + const DFromV<decltype(a)> d; + const Twice<decltype(d)> dt; + return DemoteTo(dn, Combine(dt, b, a)); +} +template <class D, HWY_IF_U16_D(D)> +HWY_API Vec64<uint16_t> ReorderDemote2To(D dn, Vec64<int32_t> a, + Vec64<int32_t> b) { +#if HWY_TARGET >= HWY_SSSE3 + const DFromV<decltype(a)> d; + const Twice<decltype(d)> dt; + return DemoteTo(dn, Combine(dt, b, a)); +#else + (void)dn; + return Vec64<uint16_t>{_mm_shuffle_epi32(_mm_packus_epi32(a.raw, b.raw), + _MM_SHUFFLE(2, 0, 2, 0))}; +#endif +} +template <class D, HWY_IF_U16_D(D)> +HWY_API Vec128<uint16_t> ReorderDemote2To(D dn, Vec128<int32_t> a, + Vec128<int32_t> b) { +#if HWY_TARGET >= HWY_SSSE3 + const Half<decltype(dn)> dnh; + const auto u16_a = DemoteTo(dnh, a); + const auto u16_b = DemoteTo(dnh, b); + return Combine(dn, u16_b, u16_a); +#else + (void)dn; + return Vec128<uint16_t>{_mm_packus_epi32(a.raw, b.raw)}; +#endif +} + +template <class D, HWY_IF_U16_D(D)> +HWY_API VFromD<D> ReorderDemote2To(D dn, Vec128<uint32_t> a, + Vec128<uint32_t> b) { + const DFromV<decltype(a)> du32; + const RebindToSigned<decltype(du32)> di32; + const auto max_i32 = Set(du32, 0x7FFFFFFFu); + +#if HWY_TARGET >= HWY_SSSE3 + const Repartition<uint8_t, 
decltype(du32)> du32_as_du8; + // On SSE2/SSSE3, clamp a and b using u8 Min operation + const auto clamped_a = BitCast( + di32, Min(BitCast(du32_as_du8, a), BitCast(du32_as_du8, max_i32))); + const auto clamped_b = BitCast( + di32, Min(BitCast(du32_as_du8, b), BitCast(du32_as_du8, max_i32))); +#else + const auto clamped_a = BitCast(di32, Min(a, max_i32)); + const auto clamped_b = BitCast(di32, Min(b, max_i32)); +#endif + + return ReorderDemote2To(dn, clamped_a, clamped_b); +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)> +HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<uint32_t, D>> a, + VFromD<Repartition<uint32_t, D>> b) { + const DFromV<decltype(a)> d; + const Twice<decltype(d)> dt; + return DemoteTo(dn, Combine(dt, b, a)); +} + +// Specializations for partial vectors because packs_epi32 sets lanes above 2*N. +template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_I8_D(D)> +HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<int16_t, D>> a, + VFromD<Repartition<int16_t, D>> b) { + const DFromV<decltype(a)> d; + const Twice<decltype(d)> dt; + return DemoteTo(dn, Combine(dt, b, a)); +} +template <class D, HWY_IF_I8_D(D)> +HWY_API Vec64<int8_t> ReorderDemote2To(D /* tag */, Vec64<int16_t> a, + Vec64<int16_t> b) { + return Vec64<int8_t>{_mm_shuffle_epi32(_mm_packs_epi16(a.raw, b.raw), + _MM_SHUFFLE(2, 0, 2, 0))}; +} +template <class D, HWY_IF_I8_D(D)> +HWY_API Vec128<int8_t> ReorderDemote2To(D /* tag */, Vec128<int16_t> a, + Vec128<int16_t> b) { + return Vec128<int8_t>{_mm_packs_epi16(a.raw, b.raw)}; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U8_D(D)> +HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<int16_t, D>> a, + VFromD<Repartition<int16_t, D>> b) { + const DFromV<decltype(a)> d; + const Twice<decltype(d)> dt; + return DemoteTo(dn, Combine(dt, b, a)); +} +template <class D, HWY_IF_U8_D(D)> +HWY_API Vec64<uint8_t> ReorderDemote2To(D /* tag */, Vec64<int16_t> a, + Vec64<int16_t> b) { + return 
Vec64<uint8_t>{_mm_shuffle_epi32(_mm_packus_epi16(a.raw, b.raw), + _MM_SHUFFLE(2, 0, 2, 0))}; +} +template <class D, HWY_IF_U8_D(D)> +HWY_API Vec128<uint8_t> ReorderDemote2To(D /* tag */, Vec128<int16_t> a, + Vec128<int16_t> b) { + return Vec128<uint8_t>{_mm_packus_epi16(a.raw, b.raw)}; +} + +template <class D, HWY_IF_U8_D(D)> +HWY_API VFromD<D> ReorderDemote2To(D dn, Vec128<uint16_t> a, + Vec128<uint16_t> b) { + const DFromV<decltype(a)> du16; + const RebindToSigned<decltype(du16)> di16; + const auto max_i16 = Set(du16, 0x7FFFu); + +#if HWY_TARGET >= HWY_SSSE3 + const Repartition<uint8_t, decltype(du16)> du16_as_du8; + // On SSE2/SSSE3, clamp a and b using u8 Min operation + const auto clamped_a = BitCast( + di16, Min(BitCast(du16_as_du8, a), BitCast(du16_as_du8, max_i16))); + const auto clamped_b = BitCast( + di16, Min(BitCast(du16_as_du8, b), BitCast(du16_as_du8, max_i16))); +#else + const auto clamped_a = BitCast(di16, Min(a, max_i16)); + const auto clamped_b = BitCast(di16, Min(b, max_i16)); +#endif + + return ReorderDemote2To(dn, clamped_a, clamped_b); +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U8_D(D)> +HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<uint16_t, D>> a, + VFromD<Repartition<uint16_t, D>> b) { + const DFromV<decltype(a)> d; + const Twice<decltype(d)> dt; + return DemoteTo(dn, Combine(dt, b, a)); +} + +template <class D, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>), + HWY_IF_V_SIZE_LE_D(D, 16), class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), + HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2), + HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2)> +HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) { + return ReorderDemote2To(d, a, b); +} + +template <class D, HWY_IF_BF16_D(D), class V32 = VFromD<Repartition<float, D>>> +HWY_API VFromD<D> OrderedDemote2To(D dbf16, V32 a, V32 b) { + const RebindToUnsigned<decltype(dbf16)> du16; + return BitCast(dbf16, ConcatOdd(du16, BitCast(du16, b), BitCast(du16, a))); +} + +template <class D, 
HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)> +HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<double, D>> v) { + return VFromD<D>{_mm_cvtpd_ps(v.raw)}; +} + +namespace detail { + +// For well-defined float->int demotion in all x86_*-inl.h. +template <class D> +HWY_INLINE VFromD<D> ClampF64ToI32Max(D d, VFromD<D> v) { + // The max can be exactly represented in binary64, so clamping beforehand + // prevents x86 conversion from raising an exception and returning 80..00. + return Min(v, Set(d, 2147483647.0)); +} + +// For ConvertTo float->int of same size, clamping before conversion would +// change the result because the max integer value is not exactly representable. +// Instead detect the overflow result after conversion and fix it. +template <class DI, class DF = RebindToFloat<DI>> +HWY_INLINE VFromD<DI> FixConversionOverflow( + DI di, VFromD<DF> original, decltype(Zero(DI()).raw) converted_raw) { + // Combinations of original and output sign: + // --: normal <0 or -huge_val to 80..00: OK + // -+: -0 to 0 : OK + // +-: +huge_val to 80..00 : xor with FF..FF to get 7F..FF + // ++: normal >0 : OK + const VFromD<DI> converted{converted_raw}; + const VFromD<DI> sign_wrong = AndNot(BitCast(di, original), converted); +#if HWY_COMPILER_GCC_ACTUAL + // Critical GCC 11 compiler bug (possibly also GCC 10): omits the Xor; also + // Add() if using that instead. Work around with one more instruction. 
+ const RebindToUnsigned<DI> du; + const VFromD<DI> mask = BroadcastSignBit(sign_wrong); + const VFromD<DI> max = BitCast(di, ShiftRight<1>(BitCast(du, mask))); + return IfVecThenElse(mask, max, converted); +#else + return Xor(converted, BroadcastSignBit(sign_wrong)); +#endif +} + +} // namespace detail + +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D), + class DF = Rebind<double, D>> +HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<DF> v) { + const VFromD<DF> clamped = detail::ClampF64ToI32Max(DF(), v); + return VFromD<D>{_mm_cvttpd_epi32(clamped.raw)}; +} + +// For already range-limited input [0, 255]. +template <size_t N> +HWY_API Vec128<uint8_t, N> U8FromU32(const Vec128<uint32_t, N> v) { +#if HWY_TARGET == HWY_SSE2 + const RebindToSigned<DFromV<decltype(v)>> di32; + const Rebind<uint8_t, decltype(di32)> du8; + return DemoteTo(du8, BitCast(di32, v)); +#else + const DFromV<decltype(v)> d32; + const Repartition<uint8_t, decltype(d32)> d8; + alignas(16) static constexpr uint32_t k8From32[4] = { + 0x0C080400u, 0x0C080400u, 0x0C080400u, 0x0C080400u}; + // Also replicate bytes into all 32 bit lanes for safety. 
+ const auto quad = TableLookupBytes(v, Load(d32, k8From32)); + return LowerHalf(LowerHalf(BitCast(d8, quad))); +#endif +} + +// ------------------------------ MulFixedPoint15 + +#if HWY_TARGET == HWY_SSE2 +HWY_API Vec128<int16_t> MulFixedPoint15(const Vec128<int16_t> a, + const Vec128<int16_t> b) { + const DFromV<decltype(a)> d; + const Repartition<int32_t, decltype(d)> di32; + + auto lo_product = a * b; + auto hi_product = MulHigh(a, b); + + const VFromD<decltype(di32)> i32_product_lo{ + _mm_unpacklo_epi16(lo_product.raw, hi_product.raw)}; + const VFromD<decltype(di32)> i32_product_hi{ + _mm_unpackhi_epi16(lo_product.raw, hi_product.raw)}; + + const auto round_up_incr = Set(di32, 0x4000); + return ReorderDemote2To(d, ShiftRight<15>(i32_product_lo + round_up_incr), + ShiftRight<15>(i32_product_hi + round_up_incr)); +} + +template <size_t N, HWY_IF_V_SIZE_LE(int16_t, N, 8)> +HWY_API Vec128<int16_t, N> MulFixedPoint15(const Vec128<int16_t, N> a, + const Vec128<int16_t, N> b) { + const DFromV<decltype(a)> d; + const Rebind<int32_t, decltype(d)> di32; + + const auto lo_product = a * b; + const auto hi_product = MulHigh(a, b); + const VFromD<decltype(di32)> i32_product{ + _mm_unpacklo_epi16(lo_product.raw, hi_product.raw)}; + + return DemoteTo(d, ShiftRight<15>(i32_product + Set(di32, 0x4000))); +} +#else +template <size_t N> +HWY_API Vec128<int16_t, N> MulFixedPoint15(const Vec128<int16_t, N> a, + const Vec128<int16_t, N> b) { + return Vec128<int16_t, N>{_mm_mulhrs_epi16(a.raw, b.raw)}; +} +#endif + +// ------------------------------ Truncations + +template <typename From, class DTo, HWY_IF_LANES_D(DTo, 1)> +HWY_API VFromD<DTo> TruncateTo(DTo /* tag */, Vec128<From, 1> v) { + // BitCast requires the same size; DTo might be u8x1 and v u16x1. 
+ const Repartition<TFromD<DTo>, DFromV<decltype(v)>> dto; + return VFromD<DTo>{BitCast(dto, v).raw}; +} + +template <class D, HWY_IF_U8_D(D)> +HWY_API Vec16<uint8_t> TruncateTo(D d, Vec128<uint64_t> v) { +#if HWY_TARGET == HWY_SSE2 + const Vec128<uint8_t, 1> lo{v.raw}; + const Vec128<uint8_t, 1> hi{_mm_unpackhi_epi64(v.raw, v.raw)}; + return Combine(d, hi, lo); +#else + const Repartition<uint8_t, DFromV<decltype(v)>> d8; + (void)d; + alignas(16) static constexpr uint8_t kIdx[16] = {0, 8, 0, 8, 0, 8, 0, 8, + 0, 8, 0, 8, 0, 8, 0, 8}; + const Vec128<uint8_t> v8 = TableLookupBytes(v, Load(d8, kIdx)); + return LowerHalf(LowerHalf(LowerHalf(v8))); +#endif +} + +template <class D, HWY_IF_U16_D(D)> +HWY_API Vec32<uint16_t> TruncateTo(D d, Vec128<uint64_t> v) { +#if HWY_TARGET == HWY_SSE2 + const Vec128<uint16_t, 1> lo{v.raw}; + const Vec128<uint16_t, 1> hi{_mm_unpackhi_epi64(v.raw, v.raw)}; + return Combine(d, hi, lo); +#else + (void)d; + const Repartition<uint16_t, DFromV<decltype(v)>> d16; + alignas(16) static constexpr uint16_t kIdx[8] = { + 0x100u, 0x908u, 0x100u, 0x908u, 0x100u, 0x908u, 0x100u, 0x908u}; + const Vec128<uint16_t> v16 = TableLookupBytes(v, Load(d16, kIdx)); + return LowerHalf(LowerHalf(v16)); +#endif +} + +template <class D, HWY_IF_U32_D(D)> +HWY_API Vec64<uint32_t> TruncateTo(D /* tag */, Vec128<uint64_t> v) { + return Vec64<uint32_t>{_mm_shuffle_epi32(v.raw, 0x88)}; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U8_D(D)> +HWY_API VFromD<D> TruncateTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) { + const DFromV<decltype(v)> du32; +#if HWY_TARGET == HWY_SSE2 + const RebindToSigned<decltype(du32)> di32; + const Rebind<uint8_t, decltype(di32)> du8; + return DemoteTo(du8, BitCast(di32, ShiftRight<24>(ShiftLeft<24>(v)))); +#else + const Repartition<uint8_t, decltype(du32)> d; + alignas(16) static constexpr uint8_t kIdx[16] = { + 0x0u, 0x4u, 0x8u, 0xCu, 0x0u, 0x4u, 0x8u, 0xCu, + 0x0u, 0x4u, 0x8u, 0xCu, 0x0u, 0x4u, 0x8u, 0xCu}; + return 
LowerHalf(LowerHalf(TableLookupBytes(v, Load(d, kIdx)))); +#endif +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)> +HWY_API VFromD<D> TruncateTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) { + const DFromV<decltype(v)> du32; +#if HWY_TARGET == HWY_SSE2 + const RebindToSigned<decltype(du32)> di32; + const Rebind<uint16_t, decltype(di32)> du16; + const RebindToSigned<decltype(du16)> di16; + return BitCast( + du16, DemoteTo(di16, ShiftRight<16>(BitCast(di32, ShiftLeft<16>(v))))); +#else + const Repartition<uint16_t, decltype(du32)> d; + return LowerHalf(ConcatEven(d, BitCast(d, v), BitCast(d, v))); +#endif +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U8_D(D)> +HWY_API VFromD<D> TruncateTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) { + const DFromV<decltype(v)> du16; +#if HWY_TARGET == HWY_SSE2 + const RebindToSigned<decltype(du16)> di16; + const Rebind<uint8_t, decltype(di16)> du8; + const RebindToSigned<decltype(du8)> di8; + return BitCast(du8, + DemoteTo(di8, ShiftRight<8>(BitCast(di16, ShiftLeft<8>(v))))); +#else + const Repartition<uint8_t, decltype(du16)> d; + return LowerHalf(ConcatEven(d, BitCast(d, v), BitCast(d, v))); +#endif +} + +// ------------------------------ Demotions to/from i64 + +#if HWY_TARGET <= HWY_AVX3 +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)> +HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) { + return VFromD<D>{_mm_cvtsepi64_epi32(v.raw)}; +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_I16_D(D)> +HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) { + return VFromD<D>{_mm_cvtsepi64_epi16(v.raw)}; +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 2), HWY_IF_I8_D(D)> +HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) { + return VFromD<D>{_mm_cvtsepi64_epi8(v.raw)}; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)> +HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) { + const auto neg_mask = 
MaskFromVec(v); +#if HWY_COMPILER_HAS_MASK_INTRINSICS + const __mmask8 non_neg_mask = _knot_mask8(neg_mask.raw); +#else + const __mmask8 non_neg_mask = static_cast<__mmask8>(~neg_mask.raw); +#endif + return VFromD<D>{_mm_maskz_cvtusepi64_epi32(non_neg_mask, v.raw)}; +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U16_D(D)> +HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) { + const auto neg_mask = MaskFromVec(v); +#if HWY_COMPILER_HAS_MASK_INTRINSICS + const __mmask8 non_neg_mask = _knot_mask8(neg_mask.raw); +#else + const __mmask8 non_neg_mask = static_cast<__mmask8>(~neg_mask.raw); +#endif + return VFromD<D>{_mm_maskz_cvtusepi64_epi16(non_neg_mask, v.raw)}; +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 2), HWY_IF_U8_D(D)> +HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) { + const auto neg_mask = MaskFromVec(v); +#if HWY_COMPILER_HAS_MASK_INTRINSICS + const __mmask8 non_neg_mask = _knot_mask8(neg_mask.raw); +#else + const __mmask8 non_neg_mask = static_cast<__mmask8>(~neg_mask.raw); +#endif + return VFromD<D>{_mm_maskz_cvtusepi64_epi8(non_neg_mask, v.raw)}; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)> +HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<uint64_t, D>> v) { + return VFromD<D>{_mm_cvtusepi64_epi32(v.raw)}; +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U16_D(D)> +HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<uint64_t, D>> v) { + return VFromD<D>{_mm_cvtusepi64_epi16(v.raw)}; +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 2), HWY_IF_U8_D(D)> +HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<uint64_t, D>> v) { + return VFromD<D>{_mm_cvtusepi64_epi8(v.raw)}; +} +#else // AVX2 or below +namespace detail { +template <class D, HWY_IF_UNSIGNED_D(D)> +HWY_INLINE VFromD<Rebind<uint64_t, D>> DemoteFromU64MaskOutResult( + D /*dn*/, VFromD<Rebind<uint64_t, D>> v) { + return v; +} + +template <class D, HWY_IF_SIGNED_D(D)> +HWY_INLINE VFromD<Rebind<uint64_t, D>> 
DemoteFromU64MaskOutResult( + D /*dn*/, VFromD<Rebind<uint64_t, D>> v) { + const DFromV<decltype(v)> du64; + return And(v, + Set(du64, static_cast<uint64_t>(hwy::HighestValue<TFromD<D>>()))); +} + +template <class D> +HWY_INLINE VFromD<Rebind<uint64_t, D>> DemoteFromU64Saturate( + D dn, VFromD<Rebind<uint64_t, D>> v) { + const Rebind<uint64_t, D> du64; + const RebindToSigned<decltype(du64)> di64; + constexpr int kShiftAmt = static_cast<int>(sizeof(TFromD<D>) * 8) - + static_cast<int>(hwy::IsSigned<TFromD<D>>()); + + const auto too_big = BitCast( + du64, VecFromMask( + di64, Gt(BitCast(di64, ShiftRight<kShiftAmt>(v)), Zero(di64)))); + return DemoteFromU64MaskOutResult(dn, Or(v, too_big)); +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), class V> +HWY_INLINE VFromD<D> ReorderDemote2From64To32Combine(D dn, V a, V b) { + return ConcatEven(dn, BitCast(dn, b), BitCast(dn, a)); +} + +} // namespace detail + +template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)), + HWY_IF_SIGNED_D(D)> +HWY_API VFromD<D> DemoteTo(D dn, VFromD<Rebind<int64_t, D>> v) { + const DFromV<decltype(v)> di64; + const RebindToUnsigned<decltype(di64)> du64; + const RebindToUnsigned<decltype(dn)> dn_u; + + // Negative values are saturated by first saturating their bitwise inverse + // and then inverting the saturation result + const auto invert_mask = BitCast(du64, BroadcastSignBit(v)); + const auto saturated_vals = Xor( + invert_mask, + detail::DemoteFromU64Saturate(dn, Xor(invert_mask, BitCast(du64, v)))); + return BitCast(dn, TruncateTo(dn_u, saturated_vals)); +} + +template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)), + HWY_IF_UNSIGNED_D(D)> +HWY_API VFromD<D> DemoteTo(D dn, VFromD<Rebind<int64_t, D>> v) { + const DFromV<decltype(v)> di64; + const RebindToUnsigned<decltype(di64)> du64; + + const auto non_neg_vals = BitCast(du64, AndNot(BroadcastSignBit(v), v)); + return TruncateTo(dn, detail::DemoteFromU64Saturate(dn, non_neg_vals)); +} + +template 
<class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)), + HWY_IF_UNSIGNED_D(D)> +HWY_API VFromD<D> DemoteTo(D dn, VFromD<Rebind<uint64_t, D>> v) { + return TruncateTo(dn, detail::DemoteFromU64Saturate(dn, v)); +} +#endif // HWY_TARGET <= HWY_AVX3 + +template <class D, HWY_IF_V_SIZE_LE_D(D, HWY_MAX_BYTES / 2), + HWY_IF_T_SIZE_D(D, 4), HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>)> +HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<int64_t, D>> a, + VFromD<Repartition<int64_t, D>> b) { + const DFromV<decltype(a)> d; + const Twice<decltype(d)> dt; + return DemoteTo(dn, Combine(dt, b, a)); +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, HWY_MAX_BYTES / 2), HWY_IF_U32_D(D)> +HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<uint64_t, D>> a, + VFromD<Repartition<uint64_t, D>> b) { + const DFromV<decltype(a)> d; + const Twice<decltype(d)> dt; + return DemoteTo(dn, Combine(dt, b, a)); +} + +#if HWY_TARGET > HWY_AVX2 +template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)> +HWY_API Vec128<int32_t> ReorderDemote2To(D dn, Vec128<int64_t> a, + Vec128<int64_t> b) { + const DFromV<decltype(a)> di64; + const RebindToUnsigned<decltype(di64)> du64; + const Half<decltype(dn)> dnh; + + // Negative values are saturated by first saturating their bitwise inverse + // and then inverting the saturation result + const auto invert_mask_a = BitCast(du64, BroadcastSignBit(a)); + const auto invert_mask_b = BitCast(du64, BroadcastSignBit(b)); + const auto saturated_a = Xor( + invert_mask_a, + detail::DemoteFromU64Saturate(dnh, Xor(invert_mask_a, BitCast(du64, a)))); + const auto saturated_b = Xor( + invert_mask_b, + detail::DemoteFromU64Saturate(dnh, Xor(invert_mask_b, BitCast(du64, b)))); + + return ConcatEven(dn, BitCast(dn, saturated_b), BitCast(dn, saturated_a)); +} + +template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)> +HWY_API Vec128<uint32_t> ReorderDemote2To(D dn, Vec128<int64_t> a, + Vec128<int64_t> b) { + const DFromV<decltype(a)> di64; + const 
RebindToUnsigned<decltype(di64)> du64; + const Half<decltype(dn)> dnh; + + const auto saturated_a = detail::DemoteFromU64Saturate( + dnh, BitCast(du64, AndNot(BroadcastSignBit(a), a))); + const auto saturated_b = detail::DemoteFromU64Saturate( + dnh, BitCast(du64, AndNot(BroadcastSignBit(b), b))); + + return ConcatEven(dn, BitCast(dn, saturated_b), BitCast(dn, saturated_a)); +} + +template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)> +HWY_API Vec128<uint32_t> ReorderDemote2To(D dn, Vec128<uint64_t> a, + Vec128<uint64_t> b) { + const Half<decltype(dn)> dnh; + + const auto saturated_a = detail::DemoteFromU64Saturate(dnh, a); + const auto saturated_b = detail::DemoteFromU64Saturate(dnh, b); + + return ConcatEven(dn, BitCast(dn, saturated_b), BitCast(dn, saturated_a)); +} +#endif // HWY_TARGET > HWY_AVX2 + +// ------------------------------ Integer <=> fp (ShiftRight, OddEven) + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)> +HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) { + return VFromD<D>{_mm_cvtepi32_ps(v.raw)}; +} + +#if HWY_TARGET <= HWY_AVX3 +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)> +HWY_API VFromD<D> ConvertTo(D /*df*/, VFromD<Rebind<uint32_t, D>> v) { + return VFromD<D>{_mm_cvtepu32_ps(v.raw)}; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)> +HWY_API VFromD<D> ConvertTo(D /*dd*/, VFromD<Rebind<int64_t, D>> v) { + return VFromD<D>{_mm_cvtepi64_pd(v.raw)}; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)> +HWY_API VFromD<D> ConvertTo(D /*dd*/, VFromD<Rebind<uint64_t, D>> v) { + return VFromD<D>{_mm_cvtepu64_pd(v.raw)}; +} +#else // AVX2 or below +template <class D, HWY_IF_F32_D(D)> +HWY_API VFromD<D> ConvertTo(D df, VFromD<Rebind<uint32_t, D>> v) { + // Based on wim's approach (https://stackoverflow.com/questions/34066228/) + const RebindToUnsigned<decltype(df)> du32; + const RebindToSigned<decltype(df)> d32; + + const auto msk_lo = Set(du32, 0xFFFF); + 
const auto cnst2_16_flt = Set(df, 65536.0f); // 2^16 + + // Extract the 16 lowest/highest significant bits of v and cast to signed int + const auto v_lo = BitCast(d32, And(v, msk_lo)); + const auto v_hi = BitCast(d32, ShiftRight<16>(v)); + return MulAdd(cnst2_16_flt, ConvertTo(df, v_hi), ConvertTo(df, v_lo)); +} + +template <class D, HWY_IF_F64_D(D)> +HWY_API VFromD<D> ConvertTo(D dd, VFromD<Rebind<int64_t, D>> v) { + // Based on wim's approach (https://stackoverflow.com/questions/41144668/) + const Repartition<uint32_t, decltype(dd)> d32; + const Repartition<uint64_t, decltype(dd)> d64; + + // Toggle MSB of lower 32-bits and insert exponent for 2^84 + 2^63 + const auto k84_63 = Set(d64, 0x4530000080000000ULL); + const auto v_upper = BitCast(dd, ShiftRight<32>(BitCast(d64, v)) ^ k84_63); + + // Exponent is 2^52, lower 32 bits from v (=> 32-bit OddEven) + const auto k52 = Set(d32, 0x43300000); + const auto v_lower = BitCast(dd, OddEven(k52, BitCast(d32, v))); + + const auto k84_63_52 = BitCast(dd, Set(d64, 0x4530000080100000ULL)); + return (v_upper - k84_63_52) + v_lower; // order matters! 
+} + +namespace detail { +template <class VW> +HWY_INLINE VFromD<Rebind<double, DFromV<VW>>> U64ToF64VecFast(VW w) { + const DFromV<decltype(w)> d64; + const RebindToFloat<decltype(d64)> dd; + const auto cnst2_52_dbl = Set(dd, 0x0010000000000000); // 2^52 + return BitCast(dd, Or(w, BitCast(d64, cnst2_52_dbl))) - cnst2_52_dbl; +} +} // namespace detail + +template <class D, HWY_IF_F64_D(D)> +HWY_API VFromD<D> ConvertTo(D dd, VFromD<Rebind<uint64_t, D>> v) { + // Based on wim's approach (https://stackoverflow.com/questions/41144668/) + const RebindToUnsigned<decltype(dd)> d64; + using VU = VFromD<decltype(d64)>; + + const VU msk_lo = Set(d64, 0xFFFFFFFF); + const auto cnst2_32_dbl = Set(dd, 4294967296.0); // 2^32 + + // Extract the 32 lowest/highest significant bits of v + const VU v_lo = And(v, msk_lo); + const VU v_hi = ShiftRight<32>(v); + + const auto v_lo_dbl = detail::U64ToF64VecFast(v_lo); + return MulAdd(cnst2_32_dbl, detail::U64ToF64VecFast(v_hi), v_lo_dbl); +} +#endif // HWY_TARGET <= HWY_AVX3 + +// Truncates (rounds toward zero). 
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)> +HWY_API VFromD<D> ConvertTo(D di, VFromD<Rebind<float, D>> v) { + return detail::FixConversionOverflow(di, v, _mm_cvttps_epi32(v.raw)); +} + +#if HWY_TARGET <= HWY_AVX3 +template <class DI, HWY_IF_V_SIZE_LE_D(DI, 16), HWY_IF_I64_D(DI)> +HWY_API VFromD<DI> ConvertTo(DI di, VFromD<Rebind<double, DI>> v) { + return detail::FixConversionOverflow(di, v, _mm_cvttpd_epi64(v.raw)); +} + +#else // AVX2 or below + +#if HWY_ARCH_X86_64 +template <class DI, HWY_IF_V_SIZE_D(DI, 8), HWY_IF_I64_D(DI)> +HWY_API VFromD<DI> ConvertTo(DI di, Vec64<double> v) { + const Vec64<int64_t> i0{_mm_cvtsi64_si128(_mm_cvttsd_si64(v.raw))}; + return detail::FixConversionOverflow(di, v, i0.raw); +} +template <class DI, HWY_IF_V_SIZE_D(DI, 16), HWY_IF_I64_D(DI)> +HWY_API VFromD<DI> ConvertTo(DI di, Vec128<double> v) { + const __m128i i0 = _mm_cvtsi64_si128(_mm_cvttsd_si64(v.raw)); + const Full64<double> dd2; + const __m128i i1 = _mm_cvtsi64_si128(_mm_cvttsd_si64(UpperHalf(dd2, v).raw)); + return detail::FixConversionOverflow(di, v, _mm_unpacklo_epi64(i0, i1)); +} +#endif // HWY_ARCH_X86_64 + +#if !HWY_ARCH_X86_64 || HWY_TARGET <= HWY_AVX2 +template <class DI, HWY_IF_V_SIZE_GT_D(DI, (HWY_ARCH_X86_64 ? 16 : 0)), + HWY_IF_I64_D(DI)> +HWY_API VFromD<DI> ConvertTo(DI di, VFromD<Rebind<double, DI>> v) { + using VI = VFromD<decltype(di)>; + const RebindToUnsigned<decltype(di)> du; + using VU = VFromD<decltype(du)>; + const Repartition<uint16_t, decltype(di)> du16; + const VI k1075 = Set(di, 1075); /* biased exponent of 2^52 */ + + // Exponent indicates whether the number can be represented as int64_t. 
+ const VU biased_exp = ShiftRight<52>(BitCast(du, v)) & Set(du, 0x7FF); +#if HWY_TARGET <= HWY_SSE4 + const auto in_range = BitCast(di, biased_exp) < Set(di, 1086); +#else + const Repartition<int32_t, decltype(di)> di32; + const auto in_range = MaskFromVec(BitCast( + di, + VecFromMask(di32, DupEven(BitCast(di32, biased_exp)) < Set(di32, 1086)))); +#endif + + // If we were to cap the exponent at 51 and add 2^52, the number would be in + // [2^52, 2^53) and mantissa bits could be read out directly. We need to + // round-to-0 (truncate), but changing rounding mode in MXCSR hits a + // compiler reordering bug: https://gcc.godbolt.org/z/4hKj6c6qc . We instead + // manually shift the mantissa into place (we already have many of the + // inputs anyway). + + // Use 16-bit saturated unsigned subtraction to compute shift_mnt and + // shift_int since biased_exp[i] is a non-negative integer that is less than + // or equal to 2047. + + // 16-bit saturated unsigned subtraction is also more efficient than a + // 64-bit subtraction followed by a 64-bit signed Max operation on + // SSE2/SSSE3/SSE4/AVX2. + + // The upper 48 bits of both shift_mnt and shift_int are guaranteed to be + // zero as the upper 48 bits of both k1075 and biased_exp are zero. + + const VU shift_mnt = BitCast( + du, SaturatedSub(BitCast(du16, k1075), BitCast(du16, biased_exp))); + const VU shift_int = BitCast( + du, SaturatedSub(BitCast(du16, biased_exp), BitCast(du16, k1075))); + const VU mantissa = BitCast(du, v) & Set(du, (1ULL << 52) - 1); + // Include implicit 1-bit + const VU int53 = (mantissa | Set(du, 1ULL << 52)) >> shift_mnt; + + // For inputs larger than 2^53 - 1, insert zeros at the bottom. + + // For inputs less than 2^63, the implicit 1-bit is guaranteed not to be + // shifted out of the left shift result below as shift_int[i] <= 10 is true + // for any inputs that are less than 2^63. 
+ + const VU shifted = int53 << shift_int; + + // Saturate to LimitsMin (unchanged when negating below) or LimitsMax. + const VI sign_mask = BroadcastSignBit(BitCast(di, v)); + const VI limit = Set(di, LimitsMax<int64_t>()) - sign_mask; + const VI magnitude = IfThenElse(in_range, BitCast(di, shifted), limit); + + // If the input was negative, negate the integer (two's complement). + return (magnitude ^ sign_mask) - sign_mask; +} +#endif // !HWY_ARCH_X86_64 || HWY_TARGET <= HWY_AVX2 +#endif // HWY_TARGET <= HWY_AVX3 + +template <size_t N> +HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) { + const RebindToSigned<DFromV<decltype(v)>> di; + return detail::FixConversionOverflow(di, v, _mm_cvtps_epi32(v.raw)); +} + +// ------------------------------ Floating-point rounding (ConvertTo) + +#if HWY_TARGET >= HWY_SSSE3 + +// Toward nearest integer, ties to even +template <typename T, size_t N> +HWY_API Vec128<T, N> Round(const Vec128<T, N> v) { + static_assert(IsFloat<T>(), "Only for float"); + // Rely on rounding after addition with a large value such that no mantissa + // bits remain (assuming the current mode is nearest-even). We may need a + // compiler flag for precise floating-point to prevent "optimizing" this out. + const DFromV<decltype(v)> df; + const auto max = Set(df, MantissaEnd<T>()); + const auto large = CopySignToAbs(max, v); + const auto added = large + v; + const auto rounded = added - large; + // Keep original if NaN or the magnitude is large (already an int). + return IfThenElse(Abs(v) < max, rounded, v); +} + +namespace detail { + +// Truncating to integer and converting back to float is correct except when the +// input magnitude is large, in which case the input was already an integer +// (because mantissa >> exponent is zero). 
+template <typename T, size_t N> +HWY_INLINE Mask128<T, N> UseInt(const Vec128<T, N> v) { + static_assert(IsFloat<T>(), "Only for float"); + const DFromV<decltype(v)> d; + return Abs(v) < Set(d, MantissaEnd<T>()); +} + +} // namespace detail + +// Toward zero, aka truncate +template <typename T, size_t N> +HWY_API Vec128<T, N> Trunc(const Vec128<T, N> v) { + static_assert(IsFloat<T>(), "Only for float"); + const DFromV<decltype(v)> df; + const RebindToSigned<decltype(df)> di; + + const auto integer = ConvertTo(di, v); // round toward 0 + const auto int_f = ConvertTo(df, integer); + + return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v); +} + +// Toward +infinity, aka ceiling +template <typename T, size_t N> +HWY_API Vec128<T, N> Ceil(const Vec128<T, N> v) { + static_assert(IsFloat<T>(), "Only for float"); + const DFromV<decltype(v)> df; + const RebindToSigned<decltype(df)> di; + + const auto integer = ConvertTo(di, v); // round toward 0 + const auto int_f = ConvertTo(df, integer); + + // Truncating a positive non-integer ends up smaller; if so, add 1. + const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f < v))); + + return IfThenElse(detail::UseInt(v), int_f - neg1, v); +} + +// Toward -infinity, aka floor +template <typename T, size_t N> +HWY_API Vec128<T, N> Floor(const Vec128<T, N> v) { + static_assert(IsFloat<T>(), "Only for float"); + const DFromV<decltype(v)> df; + const RebindToSigned<decltype(df)> di; + + const auto integer = ConvertTo(di, v); // round toward 0 + const auto int_f = ConvertTo(df, integer); + + // Truncating a negative non-integer ends up larger; if so, subtract 1. 
+ const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f > v))); + + return IfThenElse(detail::UseInt(v), int_f + neg1, v); +} + +#else + +// Toward nearest integer, ties to even +template <size_t N> +HWY_API Vec128<float, N> Round(const Vec128<float, N> v) { + return Vec128<float, N>{ + _mm_round_ps(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)}; +} +template <size_t N> +HWY_API Vec128<double, N> Round(const Vec128<double, N> v) { + return Vec128<double, N>{ + _mm_round_pd(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)}; +} + +// Toward zero, aka truncate +template <size_t N> +HWY_API Vec128<float, N> Trunc(const Vec128<float, N> v) { + return Vec128<float, N>{ + _mm_round_ps(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)}; +} +template <size_t N> +HWY_API Vec128<double, N> Trunc(const Vec128<double, N> v) { + return Vec128<double, N>{ + _mm_round_pd(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)}; +} + +// Toward +infinity, aka ceiling +template <size_t N> +HWY_API Vec128<float, N> Ceil(const Vec128<float, N> v) { + return Vec128<float, N>{ + _mm_round_ps(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)}; +} +template <size_t N> +HWY_API Vec128<double, N> Ceil(const Vec128<double, N> v) { + return Vec128<double, N>{ + _mm_round_pd(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)}; +} + +// Toward -infinity, aka floor +template <size_t N> +HWY_API Vec128<float, N> Floor(const Vec128<float, N> v) { + return Vec128<float, N>{ + _mm_round_ps(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)}; +} +template <size_t N> +HWY_API Vec128<double, N> Floor(const Vec128<double, N> v) { + return Vec128<double, N>{ + _mm_round_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)}; +} + +#endif // !HWY_SSSE3 + +// ------------------------------ Floating-point classification + +template <size_t N> +HWY_API Mask128<float, N> IsNaN(const Vec128<float, N> v) { +#if HWY_TARGET <= HWY_AVX3 + return Mask128<float, N>{_mm_fpclass_ps_mask(v.raw, 0x81)}; 
+#else + return Mask128<float, N>{_mm_cmpunord_ps(v.raw, v.raw)}; +#endif +} +template <size_t N> +HWY_API Mask128<double, N> IsNaN(const Vec128<double, N> v) { +#if HWY_TARGET <= HWY_AVX3 + return Mask128<double, N>{_mm_fpclass_pd_mask(v.raw, 0x81)}; +#else + return Mask128<double, N>{_mm_cmpunord_pd(v.raw, v.raw)}; +#endif +} + +#if HWY_TARGET <= HWY_AVX3 + +template <size_t N> +HWY_API Mask128<float, N> IsInf(const Vec128<float, N> v) { + return Mask128<float, N>{_mm_fpclass_ps_mask(v.raw, 0x18)}; +} +template <size_t N> +HWY_API Mask128<double, N> IsInf(const Vec128<double, N> v) { + return Mask128<double, N>{_mm_fpclass_pd_mask(v.raw, 0x18)}; +} + +// Returns whether normal/subnormal/zero. +template <size_t N> +HWY_API Mask128<float, N> IsFinite(const Vec128<float, N> v) { + // fpclass doesn't have a flag for positive, so we have to check for inf/NaN + // and negate the mask. + return Not(Mask128<float, N>{_mm_fpclass_ps_mask(v.raw, 0x99)}); +} +template <size_t N> +HWY_API Mask128<double, N> IsFinite(const Vec128<double, N> v) { + return Not(Mask128<double, N>{_mm_fpclass_pd_mask(v.raw, 0x99)}); +} + +#else + +template <typename T, size_t N> +HWY_API Mask128<T, N> IsInf(const Vec128<T, N> v) { + static_assert(IsFloat<T>(), "Only for float"); + const DFromV<decltype(v)> d; + const RebindToSigned<decltype(d)> di; + const VFromD<decltype(di)> vi = BitCast(di, v); + // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0. + return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2<T>()))); +} + +// Returns whether normal/subnormal/zero. 
+template <typename T, size_t N> +HWY_API Mask128<T, N> IsFinite(const Vec128<T, N> v) { + static_assert(IsFloat<T>(), "Only for float"); + const DFromV<decltype(v)> d; + const RebindToUnsigned<decltype(d)> du; + const RebindToSigned<decltype(d)> di; // cheaper than unsigned comparison + const VFromD<decltype(du)> vu = BitCast(du, v); + // Shift left to clear the sign bit, then right so we can compare with the + // max exponent (cannot compare with MaxExponentTimes2 directly because it is + // negative and non-negative floats would be greater). MSVC seems to generate + // incorrect code if we instead add vu + vu. + const VFromD<decltype(di)> exp = + BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(ShiftLeft<1>(vu))); + return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>()))); +} + +#endif // HWY_TARGET <= HWY_AVX3 + +// ================================================== CRYPTO + +#if !defined(HWY_DISABLE_PCLMUL_AES) && HWY_TARGET <= HWY_SSE4 + +// Per-target flag to prevent generic_ops-inl.h from defining AESRound. 
+#ifdef HWY_NATIVE_AES +#undef HWY_NATIVE_AES +#else +#define HWY_NATIVE_AES +#endif + +HWY_API Vec128<uint8_t> AESRound(Vec128<uint8_t> state, + Vec128<uint8_t> round_key) { + return Vec128<uint8_t>{_mm_aesenc_si128(state.raw, round_key.raw)}; +} + +HWY_API Vec128<uint8_t> AESLastRound(Vec128<uint8_t> state, + Vec128<uint8_t> round_key) { + return Vec128<uint8_t>{_mm_aesenclast_si128(state.raw, round_key.raw)}; +} + +HWY_API Vec128<uint8_t> AESInvMixColumns(Vec128<uint8_t> state) { + return Vec128<uint8_t>{_mm_aesimc_si128(state.raw)}; +} + +HWY_API Vec128<uint8_t> AESRoundInv(Vec128<uint8_t> state, + Vec128<uint8_t> round_key) { + return Vec128<uint8_t>{_mm_aesdec_si128(state.raw, round_key.raw)}; +} + +HWY_API Vec128<uint8_t> AESLastRoundInv(Vec128<uint8_t> state, + Vec128<uint8_t> round_key) { + return Vec128<uint8_t>{_mm_aesdeclast_si128(state.raw, round_key.raw)}; +} + +template <uint8_t kRcon> +HWY_API Vec128<uint8_t> AESKeyGenAssist(Vec128<uint8_t> v) { + return Vec128<uint8_t>{_mm_aeskeygenassist_si128(v.raw, kRcon)}; +} + +template <size_t N> +HWY_API Vec128<uint64_t, N> CLMulLower(Vec128<uint64_t, N> a, + Vec128<uint64_t, N> b) { + return Vec128<uint64_t, N>{_mm_clmulepi64_si128(a.raw, b.raw, 0x00)}; +} + +template <size_t N> +HWY_API Vec128<uint64_t, N> CLMulUpper(Vec128<uint64_t, N> a, + Vec128<uint64_t, N> b) { + return Vec128<uint64_t, N>{_mm_clmulepi64_si128(a.raw, b.raw, 0x11)}; +} + +#endif // !defined(HWY_DISABLE_PCLMUL_AES) && HWY_TARGET <= HWY_SSE4 + +// ================================================== MISC + +// ------------------------------ LoadMaskBits (TestBit) + +#if HWY_TARGET > HWY_AVX3 +namespace detail { + +template <class D, HWY_IF_T_SIZE_D(D, 1)> +HWY_INLINE MFromD<D> LoadMaskBits128(D d, uint64_t mask_bits) { + const RebindToUnsigned<decltype(d)> du; + // Easier than Set(), which would require an >8-bit type, which would not + // compile for T=uint8_t, kN=1. 
+ const VFromD<D> vbits{_mm_cvtsi32_si128(static_cast<int>(mask_bits))}; + +#if HWY_TARGET == HWY_SSE2 + // {b0, b1, ...} ===> {b0, b0, b1, b1, ...} + __m128i unpacked_vbits = _mm_unpacklo_epi8(vbits.raw, vbits.raw); + // {b0, b0, b1, b1, ...} ==> {b0, b0, b0, b0, b1, b1, b1, b1, ...} + unpacked_vbits = _mm_unpacklo_epi16(unpacked_vbits, unpacked_vbits); + // {b0, b0, b0, b0, b1, b1, b1, b1, ...} ==> + // {b0, b0, b0, b0, b0, b0, b0, b0, b1, b1, b1, b1, b1, b1, b1, b1} + const VFromD<decltype(du)> rep8{ + _mm_unpacklo_epi32(unpacked_vbits, unpacked_vbits)}; +#else + // Replicate bytes 8x such that each byte contains the bit that governs it. + alignas(16) static constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1}; + const auto rep8 = TableLookupBytes(vbits, Load(du, kRep8)); +#endif + + alignas(16) static constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128, + 1, 2, 4, 8, 16, 32, 64, 128}; + return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit))); +} + +template <class D, HWY_IF_T_SIZE_D(D, 2)> +HWY_INLINE MFromD<D> LoadMaskBits128(D d, uint64_t mask_bits) { + const RebindToUnsigned<decltype(d)> du; + alignas(16) static constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128}; + const auto vmask_bits = Set(du, static_cast<uint16_t>(mask_bits)); + return RebindMask(d, TestBit(vmask_bits, Load(du, kBit))); +} + +template <class D, HWY_IF_T_SIZE_D(D, 4)> +HWY_INLINE MFromD<D> LoadMaskBits128(D d, uint64_t mask_bits) { + const RebindToUnsigned<decltype(d)> du; + alignas(16) static constexpr uint32_t kBit[8] = {1, 2, 4, 8}; + const auto vmask_bits = Set(du, static_cast<uint32_t>(mask_bits)); + return RebindMask(d, TestBit(vmask_bits, Load(du, kBit))); +} + +template <class D, HWY_IF_T_SIZE_D(D, 8)> +HWY_INLINE MFromD<D> LoadMaskBits128(D d, uint64_t mask_bits) { + const RebindToUnsigned<decltype(d)> du; + alignas(16) static constexpr uint64_t kBit[8] = {1, 2}; + return RebindMask(d, TestBit(Set(du, mask_bits), Load(du, kBit))); +} + 
+} // namespace detail +#endif // HWY_TARGET > HWY_AVX3 + +// `p` points to at least 8 readable bytes, not all of which need be valid. +template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) { + constexpr size_t kN = MaxLanes(d); +#if HWY_TARGET <= HWY_AVX3 + (void)d; + uint64_t mask_bits = 0; + constexpr size_t kNumBytes = (kN + 7) / 8; + CopyBytes<kNumBytes>(bits, &mask_bits); + if (kN < 8) { + mask_bits &= (1ull << kN) - 1; + } + + return MFromD<D>::FromBits(mask_bits); +#else + uint64_t mask_bits = 0; + constexpr size_t kNumBytes = (kN + 7) / 8; + CopyBytes<kNumBytes>(bits, &mask_bits); + if (kN < 8) { + mask_bits &= (1ull << kN) - 1; + } + + return detail::LoadMaskBits128(d, mask_bits); +#endif +} + +template <typename T> +struct CompressIsPartition { +#if HWY_TARGET <= HWY_AVX3 + // AVX3 supports native compress, but a table-based approach allows + // 'partitioning' (also moving mask=false lanes to the top), which helps + // vqsort. This is only feasible for eight or less lanes, i.e. sizeof(T) == 8 + // on AVX3. For simplicity, we only use tables for 64-bit lanes (not AVX3 + // u32x8 etc.). + enum { value = (sizeof(T) == 8) }; +#else + // generic_ops-inl does not guarantee IsPartition for 8-bit. + enum { value = (sizeof(T) != 1) }; +#endif +}; + +#if HWY_TARGET <= HWY_AVX3 + +// ------------------------------ StoreMaskBits + +// `p` points to at least 8 writable bytes. +template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_API size_t StoreMaskBits(D d, MFromD<D> mask, uint8_t* bits) { + constexpr size_t kN = MaxLanes(d); + constexpr size_t kNumBytes = (kN + 7) / 8; + CopyBytes<kNumBytes>(&mask.raw, bits); + + // Non-full byte, need to clear the undefined upper bits. 
+ if (kN < 8) { + const int mask_bits = (1 << kN) - 1; + bits[0] = static_cast<uint8_t>(bits[0] & mask_bits); + } + + return kNumBytes; +} + +// ------------------------------ Mask testing + +// Beware: the suffix indicates the number of mask bits, not lane size! + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_API size_t CountTrue(D d, MFromD<D> mask) { + constexpr size_t kN = MaxLanes(d); + const uint64_t mask_bits = uint64_t{mask.raw} & ((1ull << kN) - 1); + return PopCount(mask_bits); +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_API size_t FindKnownFirstTrue(D d, MFromD<D> mask) { + constexpr size_t kN = MaxLanes(d); + const uint32_t mask_bits = uint32_t{mask.raw} & ((1u << kN) - 1); + return Num0BitsBelowLS1Bit_Nonzero32(mask_bits); +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_API intptr_t FindFirstTrue(D d, MFromD<D> mask) { + constexpr size_t kN = MaxLanes(d); + const uint32_t mask_bits = uint32_t{mask.raw} & ((1u << kN) - 1); + return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask_bits)) : -1; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_API size_t FindKnownLastTrue(D d, MFromD<D> mask) { + constexpr size_t kN = MaxLanes(d); + const uint32_t mask_bits = uint32_t{mask.raw} & ((1u << kN) - 1); + return 31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits); +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_API intptr_t FindLastTrue(D d, MFromD<D> mask) { + constexpr size_t kN = MaxLanes(d); + const uint32_t mask_bits = uint32_t{mask.raw} & ((1u << kN) - 1); + return mask_bits ? 
intptr_t(31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits)) + : -1; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_API bool AllFalse(D d, MFromD<D> mask) { + constexpr size_t kN = MaxLanes(d); + const uint64_t mask_bits = uint64_t{mask.raw} & ((1ull << kN) - 1); + return mask_bits == 0; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_API bool AllTrue(D d, MFromD<D> mask) { + constexpr size_t kN = MaxLanes(d); + const uint64_t mask_bits = uint64_t{mask.raw} & ((1ull << kN) - 1); + // Cannot use _kortestc because we may have less than 8 mask bits. + return mask_bits == (1ull << kN) - 1; +} + +// ------------------------------ Compress + +// 8-16 bit Compress, CompressStore defined in x86_512 because they use Vec512. + +// Single lane: no-op +template <typename T> +HWY_API Vec128<T, 1> Compress(Vec128<T, 1> v, Mask128<T, 1> /*m*/) { + return v; +} + +template <size_t N, HWY_IF_V_SIZE_GT(float, N, 4)> +HWY_API Vec128<float, N> Compress(Vec128<float, N> v, Mask128<float, N> mask) { + return Vec128<float, N>{_mm_maskz_compress_ps(mask.raw, v.raw)}; +} + +template <typename T, HWY_IF_T_SIZE(T, 8)> +HWY_API Vec128<T> Compress(Vec128<T> v, Mask128<T> mask) { + HWY_DASSERT(mask.raw < 4); + + // There are only 2 lanes, so we can afford to load the index vector directly. 
+ alignas(16) static constexpr uint8_t u8_indices[64] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + + const DFromV<decltype(v)> d; + const Repartition<uint8_t, decltype(d)> d8; + const auto index = Load(d8, u8_indices + 16 * mask.raw); + return BitCast(d, TableLookupBytes(BitCast(d8, v), index)); +} + +// ------------------------------ CompressNot (Compress) + +// Single lane: no-op +template <typename T> +HWY_API Vec128<T, 1> CompressNot(Vec128<T, 1> v, Mask128<T, 1> /*m*/) { + return v; +} + +template <typename T, HWY_IF_T_SIZE(T, 8)> +HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) { + // See CompressIsPartition, PrintCompressNot64x2NibbleTables + alignas(16) static constexpr uint64_t packed_array[16] = { + 0x00000010, 0x00000001, 0x00000010, 0x00000010}; + + // For lane i, shift the i-th 4-bit index down to bits [0, 2) - + // _mm_permutexvar_epi64 will ignore the upper bits. + const DFromV<decltype(v)> d; + const RebindToUnsigned<decltype(d)> du64; + const auto packed = Set(du64, packed_array[mask.raw]); + alignas(16) static constexpr uint64_t shifts[2] = {0, 4}; + const auto indices = Indices128<T>{(packed >> Load(du64, shifts)).raw}; + return TableLookupLanes(v, indices); +} + +// ------------------------------ CompressBlocksNot +HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v, + Mask128<uint64_t> /* m */) { + return v; +} + +// ------------------------------ CompressStore (defined in x86_512) + +// ------------------------------ CompressBlendedStore (CompressStore) +template <class D, typename T = TFromD<D>, HWY_IF_V_SIZE_LE_D(D, 8)> +HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d, + T* HWY_RESTRICT unaligned) { + // AVX-512 already does the blending at no extra cost (latency 11, + // rthroughput 2 - same as compress plus store). 
+ if (HWY_TARGET == HWY_AVX3_DL || + (HWY_TARGET != HWY_AVX3_ZEN4 && sizeof(T) > 2)) { + // We're relying on the mask to blend. Clear the undefined upper bits. + constexpr size_t kN = MaxLanes(d); + if (kN != 16 / sizeof(T)) { + m = And(m, FirstN(d, kN)); + } + return CompressStore(v, m, d, unaligned); + } else { + const size_t count = CountTrue(d, m); + const VFromD<D> compressed = Compress(v, m); +#if HWY_MEM_OPS_MIGHT_FAULT + // BlendedStore tests mask for each lane, but we know that the mask is + // FirstN, so we can just copy. + alignas(16) T buf[MaxLanes(d)]; + Store(compressed, d, buf); + memcpy(unaligned, buf, count * sizeof(T)); +#else + BlendedStore(compressed, FirstN(d, count), d, unaligned); +#endif + detail::MaybeUnpoison(unaligned, count); + return count; + } +} + +// ------------------------------ CompressBitsStore (defined in x86_512) + +#else // AVX2 or below + +// ------------------------------ StoreMaskBits + +namespace detail { + +constexpr HWY_INLINE uint64_t U64FromInt(int mask_bits) { + return static_cast<uint64_t>(static_cast<unsigned>(mask_bits)); +} + +template <typename T, size_t N> +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, + const Mask128<T, N> mask) { + const Simd<T, N, 0> d; + const auto sign_bits = BitCast(d, VecFromMask(d, mask)).raw; + return U64FromInt(_mm_movemask_epi8(sign_bits)); +} + +template <typename T, size_t N> +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, + const Mask128<T, N> mask) { + // Remove useless lower half of each u16 while preserving the sign bit. 
+ const auto sign_bits = _mm_packs_epi16(mask.raw, _mm_setzero_si128()); + return U64FromInt(_mm_movemask_epi8(sign_bits)); +} + +template <typename T, size_t N> +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, Mask128<T, N> mask) { + const Simd<T, N, 0> d; + const Simd<float, N, 0> df; + const auto sign_bits = BitCast(df, VecFromMask(d, mask)); + return U64FromInt(_mm_movemask_ps(sign_bits.raw)); +} + +template <typename T, size_t N> +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, Mask128<T, N> mask) { + const Simd<T, N, 0> d; + const Simd<double, N, 0> df; + const auto sign_bits = BitCast(df, VecFromMask(d, mask)); + return U64FromInt(_mm_movemask_pd(sign_bits.raw)); +} + +template <typename T, size_t N> +HWY_INLINE uint64_t BitsFromMask(const Mask128<T, N> mask) { + return OnlyActive<T, N>(BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask)); +} + +} // namespace detail + +// `p` points to at least 8 writable bytes. +template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_API size_t StoreMaskBits(D d, MFromD<D> mask, uint8_t* bits) { + constexpr size_t kNumBytes = (MaxLanes(d) + 7) / 8; + const uint64_t mask_bits = detail::BitsFromMask(mask); + CopyBytes<kNumBytes>(&mask_bits, bits); + return kNumBytes; +} + +// ------------------------------ Mask testing + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_API bool AllFalse(D /* tag */, MFromD<D> mask) { + // Cheaper than PTEST, which is 2 uop / 3L. 
+ return detail::BitsFromMask(mask) == 0; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_API bool AllTrue(D d, MFromD<D> mask) { + constexpr uint64_t kAllBits = (1ull << MaxLanes(d)) - 1; + return detail::BitsFromMask(mask) == kAllBits; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_API size_t CountTrue(D /* tag */, MFromD<D> mask) { + return PopCount(detail::BitsFromMask(mask)); +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_API size_t FindKnownFirstTrue(D /* tag */, MFromD<D> mask) { + return Num0BitsBelowLS1Bit_Nonzero32( + static_cast<uint32_t>(detail::BitsFromMask(mask))); +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_API intptr_t FindFirstTrue(D /* tag */, MFromD<D> mask) { + const uint32_t mask_bits = static_cast<uint32_t>(detail::BitsFromMask(mask)); + return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask_bits)) : -1; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_API size_t FindKnownLastTrue(D /* tag */, MFromD<D> mask) { + return 31 - Num0BitsAboveMS1Bit_Nonzero32( + static_cast<uint32_t>(detail::BitsFromMask(mask))); +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_API intptr_t FindLastTrue(D /* tag */, MFromD<D> mask) { + const uint32_t mask_bits = static_cast<uint32_t>(detail::BitsFromMask(mask)); + return mask_bits ? intptr_t(31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits)) + : -1; +} + +// ------------------------------ Compress, CompressBits + +namespace detail { + +// Also works for N < 8 because the first 16 4-tuples only reference bytes 0-6. +template <class D, HWY_IF_T_SIZE_D(D, 2)> +HWY_INLINE VFromD<D> IndicesFromBits128(D d, uint64_t mask_bits) { + HWY_DASSERT(mask_bits < 256); + const Rebind<uint8_t, decltype(d)> d8; + const Twice<decltype(d8)> d8t; + const RebindToUnsigned<decltype(d)> du; + + // compress_epi16 requires VBMI2 and there is no permutevar_epi16, so we need + // byte indices for PSHUFB (one vector's worth for each of 256 combinations of + // 8 mask bits). 
Loading them directly would require 4 KiB. We can instead + // store lane indices and convert to byte indices (2*lane + 0..1), with the + // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane + // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts. + // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles + // is likely more costly than the higher cache footprint from storing bytes. + alignas(16) static constexpr uint8_t table[2048] = { + // PrintCompress16x8Tables + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // + 2, 0, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // + 4, 0, 2, 6, 8, 10, 12, 14, /**/ 0, 4, 2, 6, 8, 10, 12, 14, // + 2, 4, 0, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // + 6, 0, 2, 4, 8, 10, 12, 14, /**/ 0, 6, 2, 4, 8, 10, 12, 14, // + 2, 6, 0, 4, 8, 10, 12, 14, /**/ 0, 2, 6, 4, 8, 10, 12, 14, // + 4, 6, 0, 2, 8, 10, 12, 14, /**/ 0, 4, 6, 2, 8, 10, 12, 14, // + 2, 4, 6, 0, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // + 8, 0, 2, 4, 6, 10, 12, 14, /**/ 0, 8, 2, 4, 6, 10, 12, 14, // + 2, 8, 0, 4, 6, 10, 12, 14, /**/ 0, 2, 8, 4, 6, 10, 12, 14, // + 4, 8, 0, 2, 6, 10, 12, 14, /**/ 0, 4, 8, 2, 6, 10, 12, 14, // + 2, 4, 8, 0, 6, 10, 12, 14, /**/ 0, 2, 4, 8, 6, 10, 12, 14, // + 6, 8, 0, 2, 4, 10, 12, 14, /**/ 0, 6, 8, 2, 4, 10, 12, 14, // + 2, 6, 8, 0, 4, 10, 12, 14, /**/ 0, 2, 6, 8, 4, 10, 12, 14, // + 4, 6, 8, 0, 2, 10, 12, 14, /**/ 0, 4, 6, 8, 2, 10, 12, 14, // + 2, 4, 6, 8, 0, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // + 10, 0, 2, 4, 6, 8, 12, 14, /**/ 0, 10, 2, 4, 6, 8, 12, 14, // + 2, 10, 0, 4, 6, 8, 12, 14, /**/ 0, 2, 10, 4, 6, 8, 12, 14, // + 4, 10, 0, 2, 6, 8, 12, 14, /**/ 0, 4, 10, 2, 6, 8, 12, 14, // + 2, 4, 10, 0, 6, 8, 12, 14, /**/ 0, 2, 4, 10, 6, 8, 12, 14, // + 6, 10, 0, 2, 4, 8, 12, 14, /**/ 0, 6, 10, 2, 4, 8, 12, 14, // + 2, 6, 10, 0, 4, 8, 12, 14, /**/ 0, 2, 6, 10, 4, 8, 12, 14, // + 4, 6, 10, 0, 2, 8, 12, 14, /**/ 0, 4, 6, 10, 2, 8, 12, 
14, // + 2, 4, 6, 10, 0, 8, 12, 14, /**/ 0, 2, 4, 6, 10, 8, 12, 14, // + 8, 10, 0, 2, 4, 6, 12, 14, /**/ 0, 8, 10, 2, 4, 6, 12, 14, // + 2, 8, 10, 0, 4, 6, 12, 14, /**/ 0, 2, 8, 10, 4, 6, 12, 14, // + 4, 8, 10, 0, 2, 6, 12, 14, /**/ 0, 4, 8, 10, 2, 6, 12, 14, // + 2, 4, 8, 10, 0, 6, 12, 14, /**/ 0, 2, 4, 8, 10, 6, 12, 14, // + 6, 8, 10, 0, 2, 4, 12, 14, /**/ 0, 6, 8, 10, 2, 4, 12, 14, // + 2, 6, 8, 10, 0, 4, 12, 14, /**/ 0, 2, 6, 8, 10, 4, 12, 14, // + 4, 6, 8, 10, 0, 2, 12, 14, /**/ 0, 4, 6, 8, 10, 2, 12, 14, // + 2, 4, 6, 8, 10, 0, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // + 12, 0, 2, 4, 6, 8, 10, 14, /**/ 0, 12, 2, 4, 6, 8, 10, 14, // + 2, 12, 0, 4, 6, 8, 10, 14, /**/ 0, 2, 12, 4, 6, 8, 10, 14, // + 4, 12, 0, 2, 6, 8, 10, 14, /**/ 0, 4, 12, 2, 6, 8, 10, 14, // + 2, 4, 12, 0, 6, 8, 10, 14, /**/ 0, 2, 4, 12, 6, 8, 10, 14, // + 6, 12, 0, 2, 4, 8, 10, 14, /**/ 0, 6, 12, 2, 4, 8, 10, 14, // + 2, 6, 12, 0, 4, 8, 10, 14, /**/ 0, 2, 6, 12, 4, 8, 10, 14, // + 4, 6, 12, 0, 2, 8, 10, 14, /**/ 0, 4, 6, 12, 2, 8, 10, 14, // + 2, 4, 6, 12, 0, 8, 10, 14, /**/ 0, 2, 4, 6, 12, 8, 10, 14, // + 8, 12, 0, 2, 4, 6, 10, 14, /**/ 0, 8, 12, 2, 4, 6, 10, 14, // + 2, 8, 12, 0, 4, 6, 10, 14, /**/ 0, 2, 8, 12, 4, 6, 10, 14, // + 4, 8, 12, 0, 2, 6, 10, 14, /**/ 0, 4, 8, 12, 2, 6, 10, 14, // + 2, 4, 8, 12, 0, 6, 10, 14, /**/ 0, 2, 4, 8, 12, 6, 10, 14, // + 6, 8, 12, 0, 2, 4, 10, 14, /**/ 0, 6, 8, 12, 2, 4, 10, 14, // + 2, 6, 8, 12, 0, 4, 10, 14, /**/ 0, 2, 6, 8, 12, 4, 10, 14, // + 4, 6, 8, 12, 0, 2, 10, 14, /**/ 0, 4, 6, 8, 12, 2, 10, 14, // + 2, 4, 6, 8, 12, 0, 10, 14, /**/ 0, 2, 4, 6, 8, 12, 10, 14, // + 10, 12, 0, 2, 4, 6, 8, 14, /**/ 0, 10, 12, 2, 4, 6, 8, 14, // + 2, 10, 12, 0, 4, 6, 8, 14, /**/ 0, 2, 10, 12, 4, 6, 8, 14, // + 4, 10, 12, 0, 2, 6, 8, 14, /**/ 0, 4, 10, 12, 2, 6, 8, 14, // + 2, 4, 10, 12, 0, 6, 8, 14, /**/ 0, 2, 4, 10, 12, 6, 8, 14, // + 6, 10, 12, 0, 2, 4, 8, 14, /**/ 0, 6, 10, 12, 2, 4, 8, 14, // + 2, 6, 10, 12, 0, 4, 8, 14, /**/ 0, 2, 6, 10, 12, 4, 8, 14, // + 4, 6, 
10, 12, 0, 2, 8, 14, /**/ 0, 4, 6, 10, 12, 2, 8, 14, // + 2, 4, 6, 10, 12, 0, 8, 14, /**/ 0, 2, 4, 6, 10, 12, 8, 14, // + 8, 10, 12, 0, 2, 4, 6, 14, /**/ 0, 8, 10, 12, 2, 4, 6, 14, // + 2, 8, 10, 12, 0, 4, 6, 14, /**/ 0, 2, 8, 10, 12, 4, 6, 14, // + 4, 8, 10, 12, 0, 2, 6, 14, /**/ 0, 4, 8, 10, 12, 2, 6, 14, // + 2, 4, 8, 10, 12, 0, 6, 14, /**/ 0, 2, 4, 8, 10, 12, 6, 14, // + 6, 8, 10, 12, 0, 2, 4, 14, /**/ 0, 6, 8, 10, 12, 2, 4, 14, // + 2, 6, 8, 10, 12, 0, 4, 14, /**/ 0, 2, 6, 8, 10, 12, 4, 14, // + 4, 6, 8, 10, 12, 0, 2, 14, /**/ 0, 4, 6, 8, 10, 12, 2, 14, // + 2, 4, 6, 8, 10, 12, 0, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // + 14, 0, 2, 4, 6, 8, 10, 12, /**/ 0, 14, 2, 4, 6, 8, 10, 12, // + 2, 14, 0, 4, 6, 8, 10, 12, /**/ 0, 2, 14, 4, 6, 8, 10, 12, // + 4, 14, 0, 2, 6, 8, 10, 12, /**/ 0, 4, 14, 2, 6, 8, 10, 12, // + 2, 4, 14, 0, 6, 8, 10, 12, /**/ 0, 2, 4, 14, 6, 8, 10, 12, // + 6, 14, 0, 2, 4, 8, 10, 12, /**/ 0, 6, 14, 2, 4, 8, 10, 12, // + 2, 6, 14, 0, 4, 8, 10, 12, /**/ 0, 2, 6, 14, 4, 8, 10, 12, // + 4, 6, 14, 0, 2, 8, 10, 12, /**/ 0, 4, 6, 14, 2, 8, 10, 12, // + 2, 4, 6, 14, 0, 8, 10, 12, /**/ 0, 2, 4, 6, 14, 8, 10, 12, // + 8, 14, 0, 2, 4, 6, 10, 12, /**/ 0, 8, 14, 2, 4, 6, 10, 12, // + 2, 8, 14, 0, 4, 6, 10, 12, /**/ 0, 2, 8, 14, 4, 6, 10, 12, // + 4, 8, 14, 0, 2, 6, 10, 12, /**/ 0, 4, 8, 14, 2, 6, 10, 12, // + 2, 4, 8, 14, 0, 6, 10, 12, /**/ 0, 2, 4, 8, 14, 6, 10, 12, // + 6, 8, 14, 0, 2, 4, 10, 12, /**/ 0, 6, 8, 14, 2, 4, 10, 12, // + 2, 6, 8, 14, 0, 4, 10, 12, /**/ 0, 2, 6, 8, 14, 4, 10, 12, // + 4, 6, 8, 14, 0, 2, 10, 12, /**/ 0, 4, 6, 8, 14, 2, 10, 12, // + 2, 4, 6, 8, 14, 0, 10, 12, /**/ 0, 2, 4, 6, 8, 14, 10, 12, // + 10, 14, 0, 2, 4, 6, 8, 12, /**/ 0, 10, 14, 2, 4, 6, 8, 12, // + 2, 10, 14, 0, 4, 6, 8, 12, /**/ 0, 2, 10, 14, 4, 6, 8, 12, // + 4, 10, 14, 0, 2, 6, 8, 12, /**/ 0, 4, 10, 14, 2, 6, 8, 12, // + 2, 4, 10, 14, 0, 6, 8, 12, /**/ 0, 2, 4, 10, 14, 6, 8, 12, // + 6, 10, 14, 0, 2, 4, 8, 12, /**/ 0, 6, 10, 14, 2, 4, 8, 12, // + 2, 6, 10, 14, 0, 4, 
8, 12, /**/ 0, 2, 6, 10, 14, 4, 8, 12, // + 4, 6, 10, 14, 0, 2, 8, 12, /**/ 0, 4, 6, 10, 14, 2, 8, 12, // + 2, 4, 6, 10, 14, 0, 8, 12, /**/ 0, 2, 4, 6, 10, 14, 8, 12, // + 8, 10, 14, 0, 2, 4, 6, 12, /**/ 0, 8, 10, 14, 2, 4, 6, 12, // + 2, 8, 10, 14, 0, 4, 6, 12, /**/ 0, 2, 8, 10, 14, 4, 6, 12, // + 4, 8, 10, 14, 0, 2, 6, 12, /**/ 0, 4, 8, 10, 14, 2, 6, 12, // + 2, 4, 8, 10, 14, 0, 6, 12, /**/ 0, 2, 4, 8, 10, 14, 6, 12, // + 6, 8, 10, 14, 0, 2, 4, 12, /**/ 0, 6, 8, 10, 14, 2, 4, 12, // + 2, 6, 8, 10, 14, 0, 4, 12, /**/ 0, 2, 6, 8, 10, 14, 4, 12, // + 4, 6, 8, 10, 14, 0, 2, 12, /**/ 0, 4, 6, 8, 10, 14, 2, 12, // + 2, 4, 6, 8, 10, 14, 0, 12, /**/ 0, 2, 4, 6, 8, 10, 14, 12, // + 12, 14, 0, 2, 4, 6, 8, 10, /**/ 0, 12, 14, 2, 4, 6, 8, 10, // + 2, 12, 14, 0, 4, 6, 8, 10, /**/ 0, 2, 12, 14, 4, 6, 8, 10, // + 4, 12, 14, 0, 2, 6, 8, 10, /**/ 0, 4, 12, 14, 2, 6, 8, 10, // + 2, 4, 12, 14, 0, 6, 8, 10, /**/ 0, 2, 4, 12, 14, 6, 8, 10, // + 6, 12, 14, 0, 2, 4, 8, 10, /**/ 0, 6, 12, 14, 2, 4, 8, 10, // + 2, 6, 12, 14, 0, 4, 8, 10, /**/ 0, 2, 6, 12, 14, 4, 8, 10, // + 4, 6, 12, 14, 0, 2, 8, 10, /**/ 0, 4, 6, 12, 14, 2, 8, 10, // + 2, 4, 6, 12, 14, 0, 8, 10, /**/ 0, 2, 4, 6, 12, 14, 8, 10, // + 8, 12, 14, 0, 2, 4, 6, 10, /**/ 0, 8, 12, 14, 2, 4, 6, 10, // + 2, 8, 12, 14, 0, 4, 6, 10, /**/ 0, 2, 8, 12, 14, 4, 6, 10, // + 4, 8, 12, 14, 0, 2, 6, 10, /**/ 0, 4, 8, 12, 14, 2, 6, 10, // + 2, 4, 8, 12, 14, 0, 6, 10, /**/ 0, 2, 4, 8, 12, 14, 6, 10, // + 6, 8, 12, 14, 0, 2, 4, 10, /**/ 0, 6, 8, 12, 14, 2, 4, 10, // + 2, 6, 8, 12, 14, 0, 4, 10, /**/ 0, 2, 6, 8, 12, 14, 4, 10, // + 4, 6, 8, 12, 14, 0, 2, 10, /**/ 0, 4, 6, 8, 12, 14, 2, 10, // + 2, 4, 6, 8, 12, 14, 0, 10, /**/ 0, 2, 4, 6, 8, 12, 14, 10, // + 10, 12, 14, 0, 2, 4, 6, 8, /**/ 0, 10, 12, 14, 2, 4, 6, 8, // + 2, 10, 12, 14, 0, 4, 6, 8, /**/ 0, 2, 10, 12, 14, 4, 6, 8, // + 4, 10, 12, 14, 0, 2, 6, 8, /**/ 0, 4, 10, 12, 14, 2, 6, 8, // + 2, 4, 10, 12, 14, 0, 6, 8, /**/ 0, 2, 4, 10, 12, 14, 6, 8, // + 6, 10, 12, 14, 0, 2, 4, 8, /**/ 0, 
6, 10, 12, 14, 2, 4, 8, // + 2, 6, 10, 12, 14, 0, 4, 8, /**/ 0, 2, 6, 10, 12, 14, 4, 8, // + 4, 6, 10, 12, 14, 0, 2, 8, /**/ 0, 4, 6, 10, 12, 14, 2, 8, // + 2, 4, 6, 10, 12, 14, 0, 8, /**/ 0, 2, 4, 6, 10, 12, 14, 8, // + 8, 10, 12, 14, 0, 2, 4, 6, /**/ 0, 8, 10, 12, 14, 2, 4, 6, // + 2, 8, 10, 12, 14, 0, 4, 6, /**/ 0, 2, 8, 10, 12, 14, 4, 6, // + 4, 8, 10, 12, 14, 0, 2, 6, /**/ 0, 4, 8, 10, 12, 14, 2, 6, // + 2, 4, 8, 10, 12, 14, 0, 6, /**/ 0, 2, 4, 8, 10, 12, 14, 6, // + 6, 8, 10, 12, 14, 0, 2, 4, /**/ 0, 6, 8, 10, 12, 14, 2, 4, // + 2, 6, 8, 10, 12, 14, 0, 4, /**/ 0, 2, 6, 8, 10, 12, 14, 4, // + 4, 6, 8, 10, 12, 14, 0, 2, /**/ 0, 4, 6, 8, 10, 12, 14, 2, // + 2, 4, 6, 8, 10, 12, 14, 0, /**/ 0, 2, 4, 6, 8, 10, 12, 14}; + + const VFromD<decltype(d8t)> byte_idx{Load(d8, table + mask_bits * 8).raw}; + const VFromD<decltype(du)> pairs = ZipLower(byte_idx, byte_idx); + return BitCast(d, pairs + Set(du, 0x0100)); +} + +template <class D, HWY_IF_T_SIZE_D(D, 2)> +HWY_INLINE VFromD<D> IndicesFromNotBits128(D d, uint64_t mask_bits) { + HWY_DASSERT(mask_bits < 256); + const Rebind<uint8_t, decltype(d)> d8; + const Twice<decltype(d8)> d8t; + const RebindToUnsigned<decltype(d)> du; + + // compress_epi16 requires VBMI2 and there is no permutevar_epi16, so we need + // byte indices for PSHUFB (one vector's worth for each of 256 combinations of + // 8 mask bits). Loading them directly would require 4 KiB. We can instead + // store lane indices and convert to byte indices (2*lane + 0..1), with the + // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane + // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts. + // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles + // is likely more costly than the higher cache footprint from storing bytes. 
+ alignas(16) static constexpr uint8_t table[2048] = { + // PrintCompressNot16x8Tables + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 14, 0, // + 0, 4, 6, 8, 10, 12, 14, 2, /**/ 4, 6, 8, 10, 12, 14, 0, 2, // + 0, 2, 6, 8, 10, 12, 14, 4, /**/ 2, 6, 8, 10, 12, 14, 0, 4, // + 0, 6, 8, 10, 12, 14, 2, 4, /**/ 6, 8, 10, 12, 14, 0, 2, 4, // + 0, 2, 4, 8, 10, 12, 14, 6, /**/ 2, 4, 8, 10, 12, 14, 0, 6, // + 0, 4, 8, 10, 12, 14, 2, 6, /**/ 4, 8, 10, 12, 14, 0, 2, 6, // + 0, 2, 8, 10, 12, 14, 4, 6, /**/ 2, 8, 10, 12, 14, 0, 4, 6, // + 0, 8, 10, 12, 14, 2, 4, 6, /**/ 8, 10, 12, 14, 0, 2, 4, 6, // + 0, 2, 4, 6, 10, 12, 14, 8, /**/ 2, 4, 6, 10, 12, 14, 0, 8, // + 0, 4, 6, 10, 12, 14, 2, 8, /**/ 4, 6, 10, 12, 14, 0, 2, 8, // + 0, 2, 6, 10, 12, 14, 4, 8, /**/ 2, 6, 10, 12, 14, 0, 4, 8, // + 0, 6, 10, 12, 14, 2, 4, 8, /**/ 6, 10, 12, 14, 0, 2, 4, 8, // + 0, 2, 4, 10, 12, 14, 6, 8, /**/ 2, 4, 10, 12, 14, 0, 6, 8, // + 0, 4, 10, 12, 14, 2, 6, 8, /**/ 4, 10, 12, 14, 0, 2, 6, 8, // + 0, 2, 10, 12, 14, 4, 6, 8, /**/ 2, 10, 12, 14, 0, 4, 6, 8, // + 0, 10, 12, 14, 2, 4, 6, 8, /**/ 10, 12, 14, 0, 2, 4, 6, 8, // + 0, 2, 4, 6, 8, 12, 14, 10, /**/ 2, 4, 6, 8, 12, 14, 0, 10, // + 0, 4, 6, 8, 12, 14, 2, 10, /**/ 4, 6, 8, 12, 14, 0, 2, 10, // + 0, 2, 6, 8, 12, 14, 4, 10, /**/ 2, 6, 8, 12, 14, 0, 4, 10, // + 0, 6, 8, 12, 14, 2, 4, 10, /**/ 6, 8, 12, 14, 0, 2, 4, 10, // + 0, 2, 4, 8, 12, 14, 6, 10, /**/ 2, 4, 8, 12, 14, 0, 6, 10, // + 0, 4, 8, 12, 14, 2, 6, 10, /**/ 4, 8, 12, 14, 0, 2, 6, 10, // + 0, 2, 8, 12, 14, 4, 6, 10, /**/ 2, 8, 12, 14, 0, 4, 6, 10, // + 0, 8, 12, 14, 2, 4, 6, 10, /**/ 8, 12, 14, 0, 2, 4, 6, 10, // + 0, 2, 4, 6, 12, 14, 8, 10, /**/ 2, 4, 6, 12, 14, 0, 8, 10, // + 0, 4, 6, 12, 14, 2, 8, 10, /**/ 4, 6, 12, 14, 0, 2, 8, 10, // + 0, 2, 6, 12, 14, 4, 8, 10, /**/ 2, 6, 12, 14, 0, 4, 8, 10, // + 0, 6, 12, 14, 2, 4, 8, 10, /**/ 6, 12, 14, 0, 2, 4, 8, 10, // + 0, 2, 4, 12, 14, 6, 8, 10, /**/ 2, 4, 12, 14, 0, 6, 8, 10, // + 0, 4, 12, 14, 2, 6, 8, 10, /**/ 4, 12, 14, 0, 2, 6, 8, 
10, // + 0, 2, 12, 14, 4, 6, 8, 10, /**/ 2, 12, 14, 0, 4, 6, 8, 10, // + 0, 12, 14, 2, 4, 6, 8, 10, /**/ 12, 14, 0, 2, 4, 6, 8, 10, // + 0, 2, 4, 6, 8, 10, 14, 12, /**/ 2, 4, 6, 8, 10, 14, 0, 12, // + 0, 4, 6, 8, 10, 14, 2, 12, /**/ 4, 6, 8, 10, 14, 0, 2, 12, // + 0, 2, 6, 8, 10, 14, 4, 12, /**/ 2, 6, 8, 10, 14, 0, 4, 12, // + 0, 6, 8, 10, 14, 2, 4, 12, /**/ 6, 8, 10, 14, 0, 2, 4, 12, // + 0, 2, 4, 8, 10, 14, 6, 12, /**/ 2, 4, 8, 10, 14, 0, 6, 12, // + 0, 4, 8, 10, 14, 2, 6, 12, /**/ 4, 8, 10, 14, 0, 2, 6, 12, // + 0, 2, 8, 10, 14, 4, 6, 12, /**/ 2, 8, 10, 14, 0, 4, 6, 12, // + 0, 8, 10, 14, 2, 4, 6, 12, /**/ 8, 10, 14, 0, 2, 4, 6, 12, // + 0, 2, 4, 6, 10, 14, 8, 12, /**/ 2, 4, 6, 10, 14, 0, 8, 12, // + 0, 4, 6, 10, 14, 2, 8, 12, /**/ 4, 6, 10, 14, 0, 2, 8, 12, // + 0, 2, 6, 10, 14, 4, 8, 12, /**/ 2, 6, 10, 14, 0, 4, 8, 12, // + 0, 6, 10, 14, 2, 4, 8, 12, /**/ 6, 10, 14, 0, 2, 4, 8, 12, // + 0, 2, 4, 10, 14, 6, 8, 12, /**/ 2, 4, 10, 14, 0, 6, 8, 12, // + 0, 4, 10, 14, 2, 6, 8, 12, /**/ 4, 10, 14, 0, 2, 6, 8, 12, // + 0, 2, 10, 14, 4, 6, 8, 12, /**/ 2, 10, 14, 0, 4, 6, 8, 12, // + 0, 10, 14, 2, 4, 6, 8, 12, /**/ 10, 14, 0, 2, 4, 6, 8, 12, // + 0, 2, 4, 6, 8, 14, 10, 12, /**/ 2, 4, 6, 8, 14, 0, 10, 12, // + 0, 4, 6, 8, 14, 2, 10, 12, /**/ 4, 6, 8, 14, 0, 2, 10, 12, // + 0, 2, 6, 8, 14, 4, 10, 12, /**/ 2, 6, 8, 14, 0, 4, 10, 12, // + 0, 6, 8, 14, 2, 4, 10, 12, /**/ 6, 8, 14, 0, 2, 4, 10, 12, // + 0, 2, 4, 8, 14, 6, 10, 12, /**/ 2, 4, 8, 14, 0, 6, 10, 12, // + 0, 4, 8, 14, 2, 6, 10, 12, /**/ 4, 8, 14, 0, 2, 6, 10, 12, // + 0, 2, 8, 14, 4, 6, 10, 12, /**/ 2, 8, 14, 0, 4, 6, 10, 12, // + 0, 8, 14, 2, 4, 6, 10, 12, /**/ 8, 14, 0, 2, 4, 6, 10, 12, // + 0, 2, 4, 6, 14, 8, 10, 12, /**/ 2, 4, 6, 14, 0, 8, 10, 12, // + 0, 4, 6, 14, 2, 8, 10, 12, /**/ 4, 6, 14, 0, 2, 8, 10, 12, // + 0, 2, 6, 14, 4, 8, 10, 12, /**/ 2, 6, 14, 0, 4, 8, 10, 12, // + 0, 6, 14, 2, 4, 8, 10, 12, /**/ 6, 14, 0, 2, 4, 8, 10, 12, // + 0, 2, 4, 14, 6, 8, 10, 12, /**/ 2, 4, 14, 0, 6, 8, 10, 12, // + 0, 4, 
14, 2, 6, 8, 10, 12, /**/ 4, 14, 0, 2, 6, 8, 10, 12, // + 0, 2, 14, 4, 6, 8, 10, 12, /**/ 2, 14, 0, 4, 6, 8, 10, 12, // + 0, 14, 2, 4, 6, 8, 10, 12, /**/ 14, 0, 2, 4, 6, 8, 10, 12, // + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 0, 14, // + 0, 4, 6, 8, 10, 12, 2, 14, /**/ 4, 6, 8, 10, 12, 0, 2, 14, // + 0, 2, 6, 8, 10, 12, 4, 14, /**/ 2, 6, 8, 10, 12, 0, 4, 14, // + 0, 6, 8, 10, 12, 2, 4, 14, /**/ 6, 8, 10, 12, 0, 2, 4, 14, // + 0, 2, 4, 8, 10, 12, 6, 14, /**/ 2, 4, 8, 10, 12, 0, 6, 14, // + 0, 4, 8, 10, 12, 2, 6, 14, /**/ 4, 8, 10, 12, 0, 2, 6, 14, // + 0, 2, 8, 10, 12, 4, 6, 14, /**/ 2, 8, 10, 12, 0, 4, 6, 14, // + 0, 8, 10, 12, 2, 4, 6, 14, /**/ 8, 10, 12, 0, 2, 4, 6, 14, // + 0, 2, 4, 6, 10, 12, 8, 14, /**/ 2, 4, 6, 10, 12, 0, 8, 14, // + 0, 4, 6, 10, 12, 2, 8, 14, /**/ 4, 6, 10, 12, 0, 2, 8, 14, // + 0, 2, 6, 10, 12, 4, 8, 14, /**/ 2, 6, 10, 12, 0, 4, 8, 14, // + 0, 6, 10, 12, 2, 4, 8, 14, /**/ 6, 10, 12, 0, 2, 4, 8, 14, // + 0, 2, 4, 10, 12, 6, 8, 14, /**/ 2, 4, 10, 12, 0, 6, 8, 14, // + 0, 4, 10, 12, 2, 6, 8, 14, /**/ 4, 10, 12, 0, 2, 6, 8, 14, // + 0, 2, 10, 12, 4, 6, 8, 14, /**/ 2, 10, 12, 0, 4, 6, 8, 14, // + 0, 10, 12, 2, 4, 6, 8, 14, /**/ 10, 12, 0, 2, 4, 6, 8, 14, // + 0, 2, 4, 6, 8, 12, 10, 14, /**/ 2, 4, 6, 8, 12, 0, 10, 14, // + 0, 4, 6, 8, 12, 2, 10, 14, /**/ 4, 6, 8, 12, 0, 2, 10, 14, // + 0, 2, 6, 8, 12, 4, 10, 14, /**/ 2, 6, 8, 12, 0, 4, 10, 14, // + 0, 6, 8, 12, 2, 4, 10, 14, /**/ 6, 8, 12, 0, 2, 4, 10, 14, // + 0, 2, 4, 8, 12, 6, 10, 14, /**/ 2, 4, 8, 12, 0, 6, 10, 14, // + 0, 4, 8, 12, 2, 6, 10, 14, /**/ 4, 8, 12, 0, 2, 6, 10, 14, // + 0, 2, 8, 12, 4, 6, 10, 14, /**/ 2, 8, 12, 0, 4, 6, 10, 14, // + 0, 8, 12, 2, 4, 6, 10, 14, /**/ 8, 12, 0, 2, 4, 6, 10, 14, // + 0, 2, 4, 6, 12, 8, 10, 14, /**/ 2, 4, 6, 12, 0, 8, 10, 14, // + 0, 4, 6, 12, 2, 8, 10, 14, /**/ 4, 6, 12, 0, 2, 8, 10, 14, // + 0, 2, 6, 12, 4, 8, 10, 14, /**/ 2, 6, 12, 0, 4, 8, 10, 14, // + 0, 6, 12, 2, 4, 8, 10, 14, /**/ 6, 12, 0, 2, 4, 8, 10, 14, // + 0, 2, 4, 12, 6, 8, 
10, 14, /**/ 2, 4, 12, 0, 6, 8, 10, 14, // + 0, 4, 12, 2, 6, 8, 10, 14, /**/ 4, 12, 0, 2, 6, 8, 10, 14, // + 0, 2, 12, 4, 6, 8, 10, 14, /**/ 2, 12, 0, 4, 6, 8, 10, 14, // + 0, 12, 2, 4, 6, 8, 10, 14, /**/ 12, 0, 2, 4, 6, 8, 10, 14, // + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 0, 12, 14, // + 0, 4, 6, 8, 10, 2, 12, 14, /**/ 4, 6, 8, 10, 0, 2, 12, 14, // + 0, 2, 6, 8, 10, 4, 12, 14, /**/ 2, 6, 8, 10, 0, 4, 12, 14, // + 0, 6, 8, 10, 2, 4, 12, 14, /**/ 6, 8, 10, 0, 2, 4, 12, 14, // + 0, 2, 4, 8, 10, 6, 12, 14, /**/ 2, 4, 8, 10, 0, 6, 12, 14, // + 0, 4, 8, 10, 2, 6, 12, 14, /**/ 4, 8, 10, 0, 2, 6, 12, 14, // + 0, 2, 8, 10, 4, 6, 12, 14, /**/ 2, 8, 10, 0, 4, 6, 12, 14, // + 0, 8, 10, 2, 4, 6, 12, 14, /**/ 8, 10, 0, 2, 4, 6, 12, 14, // + 0, 2, 4, 6, 10, 8, 12, 14, /**/ 2, 4, 6, 10, 0, 8, 12, 14, // + 0, 4, 6, 10, 2, 8, 12, 14, /**/ 4, 6, 10, 0, 2, 8, 12, 14, // + 0, 2, 6, 10, 4, 8, 12, 14, /**/ 2, 6, 10, 0, 4, 8, 12, 14, // + 0, 6, 10, 2, 4, 8, 12, 14, /**/ 6, 10, 0, 2, 4, 8, 12, 14, // + 0, 2, 4, 10, 6, 8, 12, 14, /**/ 2, 4, 10, 0, 6, 8, 12, 14, // + 0, 4, 10, 2, 6, 8, 12, 14, /**/ 4, 10, 0, 2, 6, 8, 12, 14, // + 0, 2, 10, 4, 6, 8, 12, 14, /**/ 2, 10, 0, 4, 6, 8, 12, 14, // + 0, 10, 2, 4, 6, 8, 12, 14, /**/ 10, 0, 2, 4, 6, 8, 12, 14, // + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 0, 10, 12, 14, // + 0, 4, 6, 8, 2, 10, 12, 14, /**/ 4, 6, 8, 0, 2, 10, 12, 14, // + 0, 2, 6, 8, 4, 10, 12, 14, /**/ 2, 6, 8, 0, 4, 10, 12, 14, // + 0, 6, 8, 2, 4, 10, 12, 14, /**/ 6, 8, 0, 2, 4, 10, 12, 14, // + 0, 2, 4, 8, 6, 10, 12, 14, /**/ 2, 4, 8, 0, 6, 10, 12, 14, // + 0, 4, 8, 2, 6, 10, 12, 14, /**/ 4, 8, 0, 2, 6, 10, 12, 14, // + 0, 2, 8, 4, 6, 10, 12, 14, /**/ 2, 8, 0, 4, 6, 10, 12, 14, // + 0, 8, 2, 4, 6, 10, 12, 14, /**/ 8, 0, 2, 4, 6, 10, 12, 14, // + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 0, 8, 10, 12, 14, // + 0, 4, 6, 2, 8, 10, 12, 14, /**/ 4, 6, 0, 2, 8, 10, 12, 14, // + 0, 2, 6, 4, 8, 10, 12, 14, /**/ 2, 6, 0, 4, 8, 10, 12, 14, // + 0, 6, 2, 4, 8, 10, 12, 14, /**/ 6, 
0, 2, 4, 8, 10, 12, 14, // + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 0, 6, 8, 10, 12, 14, // + 0, 4, 2, 6, 8, 10, 12, 14, /**/ 4, 0, 2, 6, 8, 10, 12, 14, // + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 0, 4, 6, 8, 10, 12, 14, // + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14}; + + const VFromD<decltype(d8t)> byte_idx{Load(d8, table + mask_bits * 8).raw}; + const VFromD<decltype(du)> pairs = ZipLower(byte_idx, byte_idx); + return BitCast(d, pairs + Set(du, 0x0100)); +} + +template <class D, HWY_IF_T_SIZE_D(D, 4)> +HWY_INLINE VFromD<D> IndicesFromBits128(D d, uint64_t mask_bits) { + HWY_DASSERT(mask_bits < 16); + + // There are only 4 lanes, so we can afford to load the index vector directly. + alignas(16) static constexpr uint8_t u8_indices[256] = { + // PrintCompress32x4Tables + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // + 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, // + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // + 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, // + 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, // + 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, // + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // + 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, // + 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, // + 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, // + 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, // + 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, // + 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, // + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, // + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + + const Repartition<uint8_t, decltype(d)> d8; + return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); +} + +template <class D, HWY_IF_T_SIZE_D(D, 4)> +HWY_INLINE VFromD<D> IndicesFromNotBits128(D d, uint64_t mask_bits) { + HWY_DASSERT(mask_bits < 16); + + // 
There are only 4 lanes, so we can afford to load the index vector directly. + alignas(16) static constexpr uint8_t u8_indices[256] = { + // PrintCompressNot32x4Tables + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, + 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, + 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, + 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1, + 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, + 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3, + 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15}; + + const Repartition<uint8_t, decltype(d)> d8; + return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); +} + +template <class D, HWY_IF_T_SIZE_D(D, 8)> +HWY_INLINE VFromD<D> IndicesFromBits128(D d, uint64_t mask_bits) { + HWY_DASSERT(mask_bits < 4); + + // There are only 2 lanes, so we can afford to load the index vector directly. 
+ alignas(16) static constexpr uint8_t u8_indices[64] = { + // PrintCompress64x2Tables + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + + const Repartition<uint8_t, decltype(d)> d8; + return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); +} + +template <class D, HWY_IF_T_SIZE_D(D, 8)> +HWY_INLINE VFromD<D> IndicesFromNotBits128(D d, uint64_t mask_bits) { + HWY_DASSERT(mask_bits < 4); + + // There are only 2 lanes, so we can afford to load the index vector directly. + alignas(16) static constexpr uint8_t u8_indices[64] = { + // PrintCompressNot64x2Tables + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + + const Repartition<uint8_t, decltype(d)> d8; + return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); +} + +template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)> +HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v, uint64_t mask_bits) { + const DFromV<decltype(v)> d; + const RebindToUnsigned<decltype(d)> du; + + HWY_DASSERT(mask_bits < (1ull << N)); + const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits)); + return BitCast(d, TableLookupBytes(BitCast(du, v), indices)); +} + +template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)> +HWY_API Vec128<T, N> CompressNotBits(Vec128<T, N> v, uint64_t mask_bits) { + const DFromV<decltype(v)> d; + const RebindToUnsigned<decltype(d)> du; + + HWY_DASSERT(mask_bits < (1ull << N)); + const auto indices = BitCast(du, detail::IndicesFromNotBits128(d, mask_bits)); + return BitCast(d, TableLookupBytes(BitCast(du, v), indices)); +} + +} // namespace detail + +// Single lane: no-op +template <typename T> +HWY_API Vec128<T, 1> Compress(Vec128<T, 1> v, 
Mask128<T, 1> /*m*/) { + return v; +} + +// Two lanes: conditional swap +template <typename T, HWY_IF_T_SIZE(T, 8)> +HWY_API Vec128<T> Compress(Vec128<T> v, Mask128<T> mask) { + // If mask[1] = 1 and mask[0] = 0, then swap both halves, else keep. + const DFromV<decltype(v)> d; + const Vec128<T> m = VecFromMask(d, mask); + const Vec128<T> maskL = DupEven(m); + const Vec128<T> maskH = DupOdd(m); + const Vec128<T> swap = AndNot(maskL, maskH); + return IfVecThenElse(swap, Shuffle01(v), v); +} + +// General case, 2 or 4 bytes +template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))> +HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) { + return detail::CompressBits(v, detail::BitsFromMask(mask)); +} + +// ------------------------------ CompressNot + +// Single lane: no-op +template <typename T> +HWY_API Vec128<T, 1> CompressNot(Vec128<T, 1> v, Mask128<T, 1> /*m*/) { + return v; +} + +// Two lanes: conditional swap +template <typename T, HWY_IF_T_SIZE(T, 8)> +HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) { + // If mask[1] = 0 and mask[0] = 1, then swap both halves, else keep. + const DFromV<decltype(v)> d; + const Vec128<T> m = VecFromMask(d, mask); + const Vec128<T> maskL = DupEven(m); + const Vec128<T> maskH = DupOdd(m); + const Vec128<T> swap = AndNot(maskH, maskL); + return IfVecThenElse(swap, Shuffle01(v), v); +} + +template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))> +HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) { + // For partial vectors, we cannot pull the Not() into the table because + // BitsFromMask clears the upper bits. 
+ if (N < 16 / sizeof(T)) { + return detail::CompressBits(v, detail::BitsFromMask(Not(mask))); + } + return detail::CompressNotBits(v, detail::BitsFromMask(mask)); +} + +// ------------------------------ CompressBlocksNot +HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v, + Mask128<uint64_t> /* m */) { + return v; +} + +template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)> +HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v, + const uint8_t* HWY_RESTRICT bits) { + uint64_t mask_bits = 0; + constexpr size_t kNumBytes = (N + 7) / 8; + CopyBytes<kNumBytes>(bits, &mask_bits); + if (N < 8) { + mask_bits &= (1ull << N) - 1; + } + + return detail::CompressBits(v, mask_bits); +} + +// ------------------------------ CompressStore, CompressBitsStore + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_T_SIZE_D(D, 1)> +HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> m, D d, + TFromD<D>* HWY_RESTRICT unaligned) { + const RebindToUnsigned<decltype(d)> du; + + const uint64_t mask_bits = detail::BitsFromMask(m); + HWY_DASSERT(mask_bits < (1ull << MaxLanes(d))); + const size_t count = PopCount(mask_bits); + + // Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches). + const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits)); + const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices)); + StoreU(compressed, d, unaligned); + detail::MaybeUnpoison(unaligned, count); + return count; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_T_SIZE_D(D, 1)> +HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d, + TFromD<D>* HWY_RESTRICT unaligned) { + const RebindToUnsigned<decltype(d)> du; + + const uint64_t mask_bits = detail::BitsFromMask(m); + HWY_DASSERT(mask_bits < (1ull << MaxLanes(d))); + const size_t count = PopCount(mask_bits); + + // Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches). 
+ const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits)); + const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices)); + BlendedStore(compressed, FirstN(d, count), d, unaligned); + detail::MaybeUnpoison(unaligned, count); + return count; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_T_SIZE_D(D, 1)> +HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits, + D d, TFromD<D>* HWY_RESTRICT unaligned) { + const RebindToUnsigned<decltype(d)> du; + + uint64_t mask_bits = 0; + constexpr size_t kN = MaxLanes(d); + constexpr size_t kNumBytes = (kN + 7) / 8; + CopyBytes<kNumBytes>(bits, &mask_bits); + if (kN < 8) { + mask_bits &= (1ull << kN) - 1; + } + const size_t count = PopCount(mask_bits); + + // Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches). + const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits)); + const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices)); + StoreU(compressed, d, unaligned); + + detail::MaybeUnpoison(unaligned, count); + return count; +} + +#endif // HWY_TARGET <= HWY_AVX3 + +// ------------------------------ Expand + +// Otherwise, use the generic_ops-inl.h fallback. +#if HWY_TARGET <= HWY_AVX3 || HWY_IDE + +// The native instructions for 8/16-bit actually require VBMI2 (HWY_AVX3_DL), +// but we still want to override generic_ops-inl's table-based implementation +// whenever we have the 32-bit expand provided by AVX3. 
+#ifdef HWY_NATIVE_EXPAND +#undef HWY_NATIVE_EXPAND +#else +#define HWY_NATIVE_EXPAND +#endif + +namespace detail { + +#if HWY_TARGET <= HWY_AVX3_DL || HWY_IDE // VBMI2 + +template <size_t N> +HWY_INLINE Vec128<uint8_t, N> NativeExpand(Vec128<uint8_t, N> v, + Mask128<uint8_t, N> mask) { + return Vec128<uint8_t, N>{_mm_maskz_expand_epi8(mask.raw, v.raw)}; +} + +template <size_t N> +HWY_INLINE Vec128<uint16_t, N> NativeExpand(Vec128<uint16_t, N> v, + Mask128<uint16_t, N> mask) { + return Vec128<uint16_t, N>{_mm_maskz_expand_epi16(mask.raw, v.raw)}; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U8_D(D)> +HWY_INLINE VFromD<D> NativeLoadExpand(MFromD<D> mask, D /* d */, + const uint8_t* HWY_RESTRICT unaligned) { + return VFromD<D>{_mm_maskz_expandloadu_epi8(mask.raw, unaligned)}; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U16_D(D)> +HWY_INLINE VFromD<D> NativeLoadExpand(MFromD<D> mask, D /* d */, + const uint16_t* HWY_RESTRICT unaligned) { + return VFromD<D>{_mm_maskz_expandloadu_epi16(mask.raw, unaligned)}; +} + +#endif // HWY_TARGET <= HWY_AVX3_DL + +template <size_t N> +HWY_INLINE Vec128<uint32_t, N> NativeExpand(Vec128<uint32_t, N> v, + Mask128<uint32_t, N> mask) { + return Vec128<uint32_t, N>{_mm_maskz_expand_epi32(mask.raw, v.raw)}; +} + +template <size_t N> +HWY_INLINE Vec128<uint64_t, N> NativeExpand(Vec128<uint64_t, N> v, + Mask128<uint64_t, N> mask) { + return Vec128<uint64_t, N>{_mm_maskz_expand_epi64(mask.raw, v.raw)}; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U32_D(D)> +HWY_INLINE VFromD<D> NativeLoadExpand(MFromD<D> mask, D /* d */, + const uint32_t* HWY_RESTRICT unaligned) { + return VFromD<D>{_mm_maskz_expandloadu_epi32(mask.raw, unaligned)}; +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)> +HWY_INLINE VFromD<D> NativeLoadExpand(MFromD<D> mask, D /* d */, + const uint64_t* HWY_RESTRICT unaligned) { + return VFromD<D>{_mm_maskz_expandloadu_epi64(mask.raw, unaligned)}; +} + +} // namespace 
detail + +// Otherwise, 8/16-bit are implemented in x86_512 using PromoteTo. +#if HWY_TARGET <= HWY_AVX3_DL || HWY_IDE // VBMI2 + +template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))> +HWY_API Vec128<T, N> Expand(Vec128<T, N> v, Mask128<T, N> mask) { + const DFromV<decltype(v)> d; + const RebindToUnsigned<decltype(d)> du; + const MFromD<decltype(du)> mu = RebindMask(du, mask); + return BitCast(d, detail::NativeExpand(BitCast(du, v), mu)); +} + +#endif // HWY_TARGET <= HWY_AVX3_DL + +template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 8))> +HWY_API Vec128<T, N> Expand(Vec128<T, N> v, Mask128<T, N> mask) { + const DFromV<decltype(v)> d; + const RebindToUnsigned<decltype(d)> du; + const MFromD<decltype(du)> mu = RebindMask(du, mask); + return BitCast(d, detail::NativeExpand(BitCast(du, v), mu)); +} + +// ------------------------------ LoadExpand + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), + HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))> +HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d, + const TFromD<D>* HWY_RESTRICT unaligned) { +#if HWY_TARGET <= HWY_AVX3_DL // VBMI2 + const RebindToUnsigned<decltype(d)> du; + using TU = TFromD<decltype(du)>; + const TU* HWY_RESTRICT pu = reinterpret_cast<const TU*>(unaligned); + const MFromD<decltype(du)> mu = RebindMask(du, mask); + return BitCast(d, detail::NativeLoadExpand(mu, du, pu)); +#else + return Expand(LoadU(d, unaligned), mask); +#endif +} + +template <class D, HWY_IF_V_SIZE_LE_D(D, 16), + HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))> +HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d, + const TFromD<D>* HWY_RESTRICT unaligned) { +#if HWY_TARGET <= HWY_AVX3 + const RebindToUnsigned<decltype(d)> du; + using TU = TFromD<decltype(du)>; + const TU* HWY_RESTRICT pu = reinterpret_cast<const TU*>(unaligned); + const MFromD<decltype(du)> mu = RebindMask(du, mask); + return BitCast(d, detail::NativeLoadExpand(mu, du, pu)); +#else + return Expand(LoadU(d, unaligned), mask); 
+#endif +} + +#endif // HWY_TARGET <= HWY_AVX3 + +// ------------------------------ StoreInterleaved2/3/4 + +// HWY_NATIVE_LOAD_STORE_INTERLEAVED not set, hence defined in +// generic_ops-inl.h. + +// ------------------------------ Reductions + +namespace detail { + +// N=1 for any T: no-op +template <typename T> +HWY_INLINE Vec128<T, 1> SumOfLanes(Vec128<T, 1> v) { + return v; +} +template <typename T> +HWY_INLINE T ReduceSum(Vec128<T, 1> v) { + return GetLane(v); +} +template <typename T> +HWY_INLINE Vec128<T, 1> MinOfLanes(Vec128<T, 1> v) { + return v; +} +template <typename T> +HWY_INLINE Vec128<T, 1> MaxOfLanes(Vec128<T, 1> v) { + return v; +} + +// u32/i32/f32: + +// N=2 +template <typename T, HWY_IF_T_SIZE(T, 4)> +HWY_INLINE Vec128<T, 2> SumOfLanes(Vec128<T, 2> v10) { + return v10 + Shuffle2301(v10); +} +template <typename T, HWY_IF_T_SIZE(T, 4)> +HWY_INLINE T ReduceSum(Vec128<T, 2> v10) { + return GetLane(SumOfLanes(v10)); +} +template <typename T, HWY_IF_T_SIZE(T, 4)> +HWY_INLINE Vec128<T, 2> MinOfLanes(Vec128<T, 2> v10) { + return Min(v10, Shuffle2301(v10)); +} +template <typename T, HWY_IF_T_SIZE(T, 4)> +HWY_INLINE Vec128<T, 2> MaxOfLanes(Vec128<T, 2> v10) { + return Max(v10, Shuffle2301(v10)); +} + +// N=4 (full) +template <typename T, HWY_IF_T_SIZE(T, 4)> +HWY_INLINE Vec128<T> SumOfLanes(Vec128<T> v3210) { + const Vec128<T> v1032 = Shuffle1032(v3210); + const Vec128<T> v31_20_31_20 = v3210 + v1032; + const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20); + return v20_31_20_31 + v31_20_31_20; +} +template <typename T, HWY_IF_T_SIZE(T, 4)> +HWY_INLINE T ReduceSum(Vec128<T> v3210) { + return GetLane(SumOfLanes(v3210)); +} +template <typename T, HWY_IF_T_SIZE(T, 4)> +HWY_INLINE Vec128<T> MinOfLanes(Vec128<T> v3210) { + const Vec128<T> v1032 = Shuffle1032(v3210); + const Vec128<T> v31_20_31_20 = Min(v3210, v1032); + const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20); + return Min(v20_31_20_31, v31_20_31_20); +} +template <typename T, 
HWY_IF_T_SIZE(T, 4)> +HWY_INLINE Vec128<T> MaxOfLanes(Vec128<T> v3210) { + const Vec128<T> v1032 = Shuffle1032(v3210); + const Vec128<T> v31_20_31_20 = Max(v3210, v1032); + const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20); + return Max(v20_31_20_31, v31_20_31_20); +} + +// u64/i64/f64: + +// N=2 (full) +template <typename T, HWY_IF_T_SIZE(T, 8)> +HWY_INLINE Vec128<T> SumOfLanes(Vec128<T> v10) { + const Vec128<T> v01 = Shuffle01(v10); + return v10 + v01; +} +template <typename T, HWY_IF_T_SIZE(T, 8)> +HWY_INLINE T ReduceSum(Vec128<T> v10) { + return GetLane(SumOfLanes(v10)); +} +template <typename T, HWY_IF_T_SIZE(T, 8)> +HWY_INLINE Vec128<T> MinOfLanes(Vec128<T> v10) { + const Vec128<T> v01 = Shuffle01(v10); + return Min(v10, v01); +} +template <typename T, HWY_IF_T_SIZE(T, 8)> +HWY_INLINE Vec128<T> MaxOfLanes(Vec128<T> v10) { + const Vec128<T> v01 = Shuffle01(v10); + return Max(v10, v01); +} + +template <size_t N, HWY_IF_V_SIZE_GT(uint16_t, N, 2)> +HWY_INLINE uint16_t ReduceSum(Vec128<uint16_t, N> v) { + const DFromV<decltype(v)> d; + const RepartitionToWide<decltype(d)> d32; + const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF)); + const auto odd = ShiftRight<16>(BitCast(d32, v)); + const auto sum = ReduceSum(even + odd); + return static_cast<uint16_t>(sum); +} +template <size_t N, HWY_IF_V_SIZE_GT(uint16_t, N, 2)> +HWY_INLINE Vec128<uint16_t, N> SumOfLanes(Vec128<uint16_t, N> v) { + const DFromV<decltype(v)> d; + return Set(d, ReduceSum(v)); +} +template <size_t N, HWY_IF_V_SIZE_GT(uint16_t, N, 2)> +HWY_INLINE int16_t ReduceSum(Vec128<int16_t, N> v) { + const DFromV<decltype(v)> d; + const RepartitionToWide<decltype(d)> d32; + // Sign-extend + const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v))); + const auto odd = ShiftRight<16>(BitCast(d32, v)); + const auto sum = ReduceSum(even + odd); + return static_cast<int16_t>(sum); +} +template <size_t N, HWY_IF_V_SIZE_GT(uint16_t, N, 2)> +HWY_INLINE Vec128<int16_t, N> SumOfLanes(Vec128<int16_t, 
N> v) { + const DFromV<decltype(v)> d; + return Set(d, ReduceSum(v)); +} +// u8, N=8, N=16: +HWY_INLINE uint8_t ReduceSum(Vec64<uint8_t> v) { + return static_cast<uint8_t>(GetLane(SumsOf8(v)) & 0xFF); +} +HWY_INLINE Vec64<uint8_t> SumOfLanes(Vec64<uint8_t> v) { + const Full64<uint8_t> d; + return Set(d, ReduceSum(v)); +} +HWY_INLINE uint8_t ReduceSum(Vec128<uint8_t> v) { + uint64_t sums = ReduceSum(SumsOf8(v)); + return static_cast<uint8_t>(sums & 0xFF); +} +HWY_INLINE Vec128<uint8_t> SumOfLanes(Vec128<uint8_t> v) { + const DFromV<decltype(v)> d; + return Set(d, ReduceSum(v)); +} +template <size_t N, HWY_IF_V_SIZE_GT(int8_t, N, 4)> +HWY_INLINE int8_t ReduceSum(const Vec128<int8_t, N> v) { + const DFromV<decltype(v)> d; + const RebindToUnsigned<decltype(d)> du; + const auto is_neg = v < Zero(d); + + // Sum positive and negative lanes separately, then combine to get the result. + const auto positive = SumsOf8(BitCast(du, IfThenZeroElse(is_neg, v))); + const auto negative = SumsOf8(BitCast(du, IfThenElseZero(is_neg, Abs(v)))); + return static_cast<int8_t>(ReduceSum(positive - negative) & 0xFF); +} +template <size_t N, HWY_IF_V_SIZE_GT(int8_t, N, 4)> +HWY_INLINE Vec128<int8_t, N> SumOfLanes(const Vec128<int8_t, N> v) { + const DFromV<decltype(v)> d; + return Set(d, ReduceSum(v)); +} + +#if HWY_TARGET <= HWY_SSE4 +HWY_INLINE Vec128<uint16_t> MinOfLanes(Vec128<uint16_t> v) { + using V = decltype(v); + return Broadcast<0>(V{_mm_minpos_epu16(v.raw)}); +} +HWY_INLINE Vec64<uint8_t> MinOfLanes(Vec64<uint8_t> v) { + const DFromV<decltype(v)> d; + const Rebind<uint16_t, decltype(d)> d16; + return TruncateTo(d, MinOfLanes(PromoteTo(d16, v))); +} +HWY_INLINE Vec128<uint8_t> MinOfLanes(Vec128<uint8_t> v) { + const Half<DFromV<decltype(v)>> d; + Vec64<uint8_t> result = + Min(MinOfLanes(UpperHalf(d, v)), MinOfLanes(LowerHalf(d, v))); + return Combine(DFromV<decltype(v)>(), result, result); +} + +HWY_INLINE Vec128<uint16_t> MaxOfLanes(Vec128<uint16_t> v) { + const Vec128<uint16_t> 
m(Set(DFromV<decltype(v)>(), LimitsMax<uint16_t>())); + return m - MinOfLanes(m - v); +} +HWY_INLINE Vec64<uint8_t> MaxOfLanes(Vec64<uint8_t> v) { + const Vec64<uint8_t> m(Set(DFromV<decltype(v)>(), LimitsMax<uint8_t>())); + return m - MinOfLanes(m - v); +} +HWY_INLINE Vec128<uint8_t> MaxOfLanes(Vec128<uint8_t> v) { + const Vec128<uint8_t> m(Set(DFromV<decltype(v)>(), LimitsMax<uint8_t>())); + return m - MinOfLanes(m - v); +} +#elif HWY_TARGET >= HWY_SSSE3 +template <size_t N, HWY_IF_V_SIZE_GT(uint8_t, N, 4)> +HWY_API Vec128<uint8_t, N> MaxOfLanes(Vec128<uint8_t, N> v) { + const DFromV<decltype(v)> d; + const RepartitionToWide<decltype(d)> d16; + const RepartitionToWide<decltype(d16)> d32; + Vec128<uint8_t, N> vm = Max(v, Reverse2(d, v)); + vm = Max(vm, BitCast(d, Reverse2(d16, BitCast(d16, vm)))); + vm = Max(vm, BitCast(d, Reverse2(d32, BitCast(d32, vm)))); + if (N > 8) { + const RepartitionToWide<decltype(d32)> d64; + vm = Max(vm, BitCast(d, Reverse2(d64, BitCast(d64, vm)))); + } + return vm; +} + +template <size_t N, HWY_IF_V_SIZE_GT(uint8_t, N, 4)> +HWY_API Vec128<uint8_t, N> MinOfLanes(Vec128<uint8_t, N> v) { + const DFromV<decltype(v)> d; + const RepartitionToWide<decltype(d)> d16; + const RepartitionToWide<decltype(d16)> d32; + Vec128<uint8_t, N> vm = Min(v, Reverse2(d, v)); + vm = Min(vm, BitCast(d, Reverse2(d16, BitCast(d16, vm)))); + vm = Min(vm, BitCast(d, Reverse2(d32, BitCast(d32, vm)))); + if (N > 8) { + const RepartitionToWide<decltype(d32)> d64; + vm = Min(vm, BitCast(d, Reverse2(d64, BitCast(d64, vm)))); + } + return vm; +} +#endif + +// Implement min/max of i8 in terms of u8 by toggling the sign bit. 
+template <size_t N, HWY_IF_V_SIZE_GT(int8_t, N, 4)> +HWY_INLINE Vec128<int8_t, N> MinOfLanes(Vec128<int8_t, N> v) { + const DFromV<decltype(v)> d; + const RebindToUnsigned<decltype(d)> du; + const auto mask = SignBit(du); + const auto vu = Xor(BitCast(du, v), mask); + return BitCast(d, Xor(MinOfLanes(vu), mask)); +} +template <size_t N, HWY_IF_V_SIZE_GT(int8_t, N, 4)> +HWY_INLINE Vec128<int8_t, N> MaxOfLanes(Vec128<int8_t, N> v) { + const DFromV<decltype(v)> d; + const RebindToUnsigned<decltype(d)> du; + const auto mask = SignBit(du); + const auto vu = Xor(BitCast(du, v), mask); + return BitCast(d, Xor(MaxOfLanes(vu), mask)); +} + +template <size_t N, HWY_IF_V_SIZE_GT(uint16_t, N, 2)> +HWY_INLINE Vec128<uint16_t, N> MinOfLanes(Vec128<uint16_t, N> v) { + const DFromV<decltype(v)> d; + const RepartitionToWide<decltype(d)> d32; + const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF)); + const auto odd = ShiftRight<16>(BitCast(d32, v)); + const auto min = MinOfLanes(Min(even, odd)); + // Also broadcast into odd lanes. + return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min)); +} +template <size_t N, HWY_IF_V_SIZE_GT(int16_t, N, 2)> +HWY_INLINE Vec128<int16_t, N> MinOfLanes(Vec128<int16_t, N> v) { + const DFromV<decltype(v)> d; + const RepartitionToWide<decltype(d)> d32; + // Sign-extend + const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v))); + const auto odd = ShiftRight<16>(BitCast(d32, v)); + const auto min = MinOfLanes(Min(even, odd)); + // Also broadcast into odd lanes. + return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min)); +} + +template <size_t N, HWY_IF_V_SIZE_GT(uint16_t, N, 2)> +HWY_INLINE Vec128<uint16_t, N> MaxOfLanes(Vec128<uint16_t, N> v) { + const DFromV<decltype(v)> d; + const RepartitionToWide<decltype(d)> d32; + const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF)); + const auto odd = ShiftRight<16>(BitCast(d32, v)); + const auto min = MaxOfLanes(Max(even, odd)); + // Also broadcast into odd lanes. 
+ return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min)); +} +template <size_t N, HWY_IF_V_SIZE_GT(int16_t, N, 2)> +HWY_INLINE Vec128<int16_t, N> MaxOfLanes(Vec128<int16_t, N> v) { + const DFromV<decltype(v)> d; + const RepartitionToWide<decltype(d)> d32; + // Sign-extend + const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v))); + const auto odd = ShiftRight<16>(BitCast(d32, v)); + const auto min = MaxOfLanes(Max(even, odd)); + // Also broadcast into odd lanes. + return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min)); +} + +} // namespace detail + +// Supported for u/i/f 32/64. Returns the same value in each lane. +template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_API VFromD<D> SumOfLanes(D /* tag */, VFromD<D> v) { + return detail::SumOfLanes(v); +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_API TFromD<D> ReduceSum(D /* tag */, VFromD<D> v) { + return detail::ReduceSum(v); +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_API VFromD<D> MinOfLanes(D /* tag */, VFromD<D> v) { + return detail::MinOfLanes(v); +} +template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_API VFromD<D> MaxOfLanes(D /* tag */, VFromD<D> v) { + return detail::MaxOfLanes(v); +} + +// ------------------------------ Lt128 + +namespace detail { + +// Returns vector-mask for Lt128. Also used by x86_256/x86_512. +template <class D, class V = VFromD<D>> +HWY_INLINE V Lt128Vec(const D d, const V a, const V b) { + static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64"); + // Truth table of Eq and Lt for Hi and Lo u64. 
+ // (removed lines with (=H && cH) or (=L && cL) - cannot both be true) + // =H =L cH cL | out = cH | (=H & cL) + // 0 0 0 0 | 0 + // 0 0 0 1 | 0 + // 0 0 1 0 | 1 + // 0 0 1 1 | 1 + // 0 1 0 0 | 0 + // 0 1 0 1 | 0 + // 0 1 1 0 | 1 + // 1 0 0 0 | 0 + // 1 0 0 1 | 1 + // 1 1 0 0 | 0 + const auto eqHL = Eq(a, b); + const V ltHL = VecFromMask(d, Lt(a, b)); + const V ltLX = ShiftLeftLanes<1>(ltHL); + const V vecHx = IfThenElse(eqHL, ltLX, ltHL); + return InterleaveUpper(d, vecHx, vecHx); +} + +// Returns vector-mask for Eq128. Also used by x86_256/x86_512. +template <class D, class V = VFromD<D>> +HWY_INLINE V Eq128Vec(const D d, const V a, const V b) { + static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64"); + const auto eqHL = VecFromMask(d, Eq(a, b)); + const auto eqLH = Reverse2(d, eqHL); + return And(eqHL, eqLH); +} + +template <class D, class V = VFromD<D>> +HWY_INLINE V Ne128Vec(const D d, const V a, const V b) { + static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64"); + const auto neHL = VecFromMask(d, Ne(a, b)); + const auto neLH = Reverse2(d, neHL); + return Or(neHL, neLH); +} + +template <class D, class V = VFromD<D>> +HWY_INLINE V Lt128UpperVec(const D d, const V a, const V b) { + // No specialization required for AVX-512: Mask <-> Vec is fast, and + // copying mask bits to their neighbor seems infeasible. + const V ltHL = VecFromMask(d, Lt(a, b)); + return InterleaveUpper(d, ltHL, ltHL); +} + +template <class D, class V = VFromD<D>> +HWY_INLINE V Eq128UpperVec(const D d, const V a, const V b) { + // No specialization required for AVX-512: Mask <-> Vec is fast, and + // copying mask bits to their neighbor seems infeasible. + const V eqHL = VecFromMask(d, Eq(a, b)); + return InterleaveUpper(d, eqHL, eqHL); +} + +template <class D, class V = VFromD<D>> +HWY_INLINE V Ne128UpperVec(const D d, const V a, const V b) { + // No specialization required for AVX-512: Mask <-> Vec is fast, and + // copying mask bits to their neighbor seems infeasible. 
+ const V neHL = VecFromMask(d, Ne(a, b)); + return InterleaveUpper(d, neHL, neHL); +} + +} // namespace detail + +template <class D, class V = VFromD<D>> +HWY_API MFromD<D> Lt128(D d, const V a, const V b) { + return MaskFromVec(detail::Lt128Vec(d, a, b)); +} + +template <class D, class V = VFromD<D>> +HWY_API MFromD<D> Eq128(D d, const V a, const V b) { + return MaskFromVec(detail::Eq128Vec(d, a, b)); +} + +template <class D, class V = VFromD<D>> +HWY_API MFromD<D> Ne128(D d, const V a, const V b) { + return MaskFromVec(detail::Ne128Vec(d, a, b)); +} + +template <class D, class V = VFromD<D>> +HWY_API MFromD<D> Lt128Upper(D d, const V a, const V b) { + return MaskFromVec(detail::Lt128UpperVec(d, a, b)); +} + +template <class D, class V = VFromD<D>> +HWY_API MFromD<D> Eq128Upper(D d, const V a, const V b) { + return MaskFromVec(detail::Eq128UpperVec(d, a, b)); +} + +template <class D, class V = VFromD<D>> +HWY_API MFromD<D> Ne128Upper(D d, const V a, const V b) { + return MaskFromVec(detail::Ne128UpperVec(d, a, b)); +} + +// ------------------------------ Min128, Max128 (Lt128) + +// Avoids the extra MaskFromVec in Lt128. 
+template <class D, class V = VFromD<D>> +HWY_API V Min128(D d, const V a, const V b) { + return IfVecThenElse(detail::Lt128Vec(d, a, b), a, b); +} + +template <class D, class V = VFromD<D>> +HWY_API V Max128(D d, const V a, const V b) { + return IfVecThenElse(detail::Lt128Vec(d, b, a), a, b); +} + +template <class D, class V = VFromD<D>> +HWY_API V Min128Upper(D d, const V a, const V b) { + return IfVecThenElse(detail::Lt128UpperVec(d, a, b), a, b); +} + +template <class D, class V = VFromD<D>> +HWY_API V Max128Upper(D d, const V a, const V b) { + return IfVecThenElse(detail::Lt128UpperVec(d, b, a), a, b); +} + +// -------------------- LeadingZeroCount, TrailingZeroCount, HighestSetBitIndex + +#if HWY_TARGET <= HWY_AVX3 + +#ifdef HWY_NATIVE_LEADING_ZERO_COUNT +#undef HWY_NATIVE_LEADING_ZERO_COUNT +#else +#define HWY_NATIVE_LEADING_ZERO_COUNT +#endif + +template <class V, HWY_IF_UI32(TFromV<V>), HWY_IF_V_SIZE_LE_D(DFromV<V>, 16)> +HWY_API V LeadingZeroCount(V v) { + return V{_mm_lzcnt_epi32(v.raw)}; +} + +template <class V, HWY_IF_UI64(TFromV<V>), HWY_IF_V_SIZE_LE_D(DFromV<V>, 16)> +HWY_API V LeadingZeroCount(V v) { + return V{_mm_lzcnt_epi64(v.raw)}; +} + +// HighestSetBitIndex and TrailingZeroCount is implemented in x86_512-inl.h +// for AVX3 targets + +#endif // HWY_TARGET <= HWY_AVX3 + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +// Note that the GCC warnings are not suppressed if we only wrap the *intrin.h - +// the warning seems to be issued at the call site of intrinsics, i.e. our code. 
+HWY_DIAGNOSTICS(pop) diff --git a/third_party/highway/hwy/ops/x86_256-inl.h b/third_party/highway/hwy/ops/x86_256-inl.h new file mode 100644 index 0000000000..8350607db3 --- /dev/null +++ b/third_party/highway/hwy/ops/x86_256-inl.h @@ -0,0 +1,6476 @@ +// Copyright 2019 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// 256-bit vectors and AVX2 instructions, plus some AVX512-VL operations when +// compiling for that target. +// External include guard in highway.h - see comment there. + +// WARNING: most operations do not cross 128-bit block boundaries. In +// particular, "Broadcast", pack and zip behavior may be surprising. + +// Must come before HWY_DIAGNOSTICS and HWY_COMPILER_CLANGCL +#include "hwy/base.h" + +// Avoid uninitialized warnings in GCC's avx512fintrin.h - see +// https://github.com/google/highway/issues/710) +HWY_DIAGNOSTICS(push) +#if HWY_COMPILER_GCC_ACTUAL +HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized") +HWY_DIAGNOSTICS_OFF(disable : 4701 4703 6001 26494, + ignored "-Wmaybe-uninitialized") +#endif + +// Must come before HWY_COMPILER_CLANGCL +#include <immintrin.h> // AVX2+ + +#if HWY_COMPILER_CLANGCL +// Including <immintrin.h> should be enough, but Clang's headers helpfully skip +// including these headers when _MSC_VER is defined, like when using clang-cl. +// Include these directly here. 
+#include <avxintrin.h> +// avxintrin defines __m256i and must come before avx2intrin. +#include <avx2intrin.h> +#include <bmi2intrin.h> // _pext_u64 +#include <f16cintrin.h> +#include <fmaintrin.h> +#include <smmintrin.h> +#endif // HWY_COMPILER_CLANGCL + +#include <string.h> // memcpy + +// For half-width vectors. Already includes base.h. +#include "hwy/ops/shared-inl.h" +// Already included by shared-inl, but do it again to avoid IDE warnings. +#include "hwy/ops/x86_128-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { +namespace detail { + +template <typename T> +struct Raw256 { + using type = __m256i; +}; +template <> +struct Raw256<float> { + using type = __m256; +}; +template <> +struct Raw256<double> { + using type = __m256d; +}; + +} // namespace detail + +template <typename T> +class Vec256 { + using Raw = typename detail::Raw256<T>::type; + + public: + using PrivateT = T; // only for DFromV + static constexpr size_t kPrivateN = 32 / sizeof(T); // only for DFromV + + // Compound assignment. Only usable if there is a corresponding non-member + // binary operator overload. For example, only f32 and f64 support division. 
+ HWY_INLINE Vec256& operator*=(const Vec256 other) { + return *this = (*this * other); + } + HWY_INLINE Vec256& operator/=(const Vec256 other) { + return *this = (*this / other); + } + HWY_INLINE Vec256& operator+=(const Vec256 other) { + return *this = (*this + other); + } + HWY_INLINE Vec256& operator-=(const Vec256 other) { + return *this = (*this - other); + } + HWY_INLINE Vec256& operator&=(const Vec256 other) { + return *this = (*this & other); + } + HWY_INLINE Vec256& operator|=(const Vec256 other) { + return *this = (*this | other); + } + HWY_INLINE Vec256& operator^=(const Vec256 other) { + return *this = (*this ^ other); + } + + Raw raw; +}; + +#if HWY_TARGET <= HWY_AVX3 + +namespace detail { + +// Template arg: sizeof(lane type) +template <size_t size> +struct RawMask256 {}; +template <> +struct RawMask256<1> { + using type = __mmask32; +}; +template <> +struct RawMask256<2> { + using type = __mmask16; +}; +template <> +struct RawMask256<4> { + using type = __mmask8; +}; +template <> +struct RawMask256<8> { + using type = __mmask8; +}; + +} // namespace detail + +template <typename T> +struct Mask256 { + using Raw = typename detail::RawMask256<sizeof(T)>::type; + + static Mask256<T> FromBits(uint64_t mask_bits) { + return Mask256<T>{static_cast<Raw>(mask_bits)}; + } + + Raw raw; +}; + +#else // AVX2 + +// FF..FF or 0. +template <typename T> +struct Mask256 { + typename detail::Raw256<T>::type raw; +}; + +#endif // AVX2 + +#if HWY_TARGET <= HWY_AVX3 +namespace detail { + +// Used by Expand() emulation, which is required for both AVX3 and AVX2. 
+template <typename T> +HWY_INLINE uint64_t BitsFromMask(const Mask256<T> mask) { + return mask.raw; +} + +} // namespace detail +#endif // HWY_TARGET <= HWY_AVX3 + +template <typename T> +using Full256 = Simd<T, 32 / sizeof(T), 0>; + +// ------------------------------ BitCast + +namespace detail { + +HWY_INLINE __m256i BitCastToInteger(__m256i v) { return v; } +HWY_INLINE __m256i BitCastToInteger(__m256 v) { return _mm256_castps_si256(v); } +HWY_INLINE __m256i BitCastToInteger(__m256d v) { + return _mm256_castpd_si256(v); +} + +template <typename T> +HWY_INLINE Vec256<uint8_t> BitCastToByte(Vec256<T> v) { + return Vec256<uint8_t>{BitCastToInteger(v.raw)}; +} + +// Cannot rely on function overloading because return types differ. +template <typename T> +struct BitCastFromInteger256 { + HWY_INLINE __m256i operator()(__m256i v) { return v; } +}; +template <> +struct BitCastFromInteger256<float> { + HWY_INLINE __m256 operator()(__m256i v) { return _mm256_castsi256_ps(v); } +}; +template <> +struct BitCastFromInteger256<double> { + HWY_INLINE __m256d operator()(__m256i v) { return _mm256_castsi256_pd(v); } +}; + +template <class D, typename T = TFromD<D>> +HWY_INLINE Vec256<T> BitCastFromByte(D /* tag */, Vec256<uint8_t> v) { + return Vec256<T>{BitCastFromInteger256<T>()(v.raw)}; +} + +} // namespace detail + +template <class D, typename FromT> +HWY_API Vec256<TFromD<D>> BitCast(D d, Vec256<FromT> v) { + return detail::BitCastFromByte(d, detail::BitCastToByte(v)); +} + +// ------------------------------ Zero + +template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT_D(D)> +HWY_API Vec256<TFromD<D>> Zero(D /* tag */) { + return Vec256<TFromD<D>>{_mm256_setzero_si256()}; +} +template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)> +HWY_API Vec256<float> Zero(D /* tag */) { + return Vec256<float>{_mm256_setzero_ps()}; +} +template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)> +HWY_API Vec256<double> Zero(D /* tag */) { + return 
Vec256<double>{_mm256_setzero_pd()}; +} + +// ------------------------------ Set + +template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)> +HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) { + return VFromD<D>{_mm256_set1_epi8(static_cast<char>(t))}; // NOLINT +} +template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2), + HWY_IF_NOT_SPECIAL_FLOAT_D(D)> +HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) { + return VFromD<D>{_mm256_set1_epi16(static_cast<short>(t))}; // NOLINT +} +template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)> +HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) { + return VFromD<D>{_mm256_set1_epi32(static_cast<int>(t))}; +} +template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)> +HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) { + return VFromD<D>{_mm256_set1_epi64x(static_cast<long long>(t))}; // NOLINT +} +template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)> +HWY_API Vec256<float> Set(D /* tag */, float t) { + return Vec256<float>{_mm256_set1_ps(t)}; +} +template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)> +HWY_API Vec256<double> Set(D /* tag */, double t) { + return Vec256<double>{_mm256_set1_pd(t)}; +} + +HWY_DIAGNOSTICS(push) +HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized") + +// Returns a vector with uninitialized elements. +template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT_D(D)> +HWY_API Vec256<TFromD<D>> Undefined(D /* tag */) { + // Available on Clang 6.0, GCC 6.2, ICC 16.03, MSVC 19.14. All but ICC + // generate an XOR instruction. 
+ return Vec256<TFromD<D>>{_mm256_undefined_si256()}; +} +template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)> +HWY_API Vec256<float> Undefined(D /* tag */) { + return Vec256<float>{_mm256_undefined_ps()}; +} +template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)> +HWY_API Vec256<double> Undefined(D /* tag */) { + return Vec256<double>{_mm256_undefined_pd()}; +} + +HWY_DIAGNOSTICS(pop) + +// ------------------------------ ResizeBitCast + +// 32-byte vector to 32-byte vector (or 64-byte vector to 64-byte vector on +// AVX3) +template <class D, class FromV, HWY_IF_V_SIZE_GT_V(FromV, 16), + HWY_IF_V_SIZE_D(D, HWY_MAX_LANES_V(FromV) * sizeof(TFromV<FromV>))> +HWY_API VFromD<D> ResizeBitCast(D d, FromV v) { + return BitCast(d, v); +} + +// 32-byte vector to 16-byte vector (or 64-byte vector to 32-byte vector on +// AVX3) +template <class D, class FromV, HWY_IF_V_SIZE_GT_V(FromV, 16), + HWY_IF_V_SIZE_D(D, + (HWY_MAX_LANES_V(FromV) * sizeof(TFromV<FromV>)) / 2)> +HWY_API VFromD<D> ResizeBitCast(D d, FromV v) { + const DFromV<decltype(v)> d_from; + const Half<decltype(d_from)> dh_from; + return BitCast(d, LowerHalf(dh_from, v)); +} + +// 32-byte vector (or 64-byte vector on AVX3) to <= 8-byte vector +template <class D, class FromV, HWY_IF_V_SIZE_GT_V(FromV, 16), + HWY_IF_V_SIZE_LE_D(D, 8)> +HWY_API VFromD<D> ResizeBitCast(D /*d*/, FromV v) { + return VFromD<D>{ResizeBitCast(Full128<TFromD<D>>(), v).raw}; +} + +// <= 16-byte vector to 32-byte vector +template <class D, class FromV, HWY_IF_V_SIZE_LE_V(FromV, 16), + HWY_IF_V_SIZE_D(D, 32)> +HWY_API VFromD<D> ResizeBitCast(D d, FromV v) { + return BitCast(d, Vec256<uint8_t>{_mm256_castsi128_si256( + ResizeBitCast(Full128<uint8_t>(), v).raw)}); +} + +// ================================================== LOGICAL + +// ------------------------------ And + +template <typename T> +HWY_API Vec256<T> And(Vec256<T> a, Vec256<T> b) { + return Vec256<T>{_mm256_and_si256(a.raw, b.raw)}; +} + +HWY_API Vec256<float> 
And(Vec256<float> a, Vec256<float> b) { + return Vec256<float>{_mm256_and_ps(a.raw, b.raw)}; +} +HWY_API Vec256<double> And(Vec256<double> a, Vec256<double> b) { + return Vec256<double>{_mm256_and_pd(a.raw, b.raw)}; +} + +// ------------------------------ AndNot + +// Returns ~not_mask & mask. +template <typename T> +HWY_API Vec256<T> AndNot(Vec256<T> not_mask, Vec256<T> mask) { + return Vec256<T>{_mm256_andnot_si256(not_mask.raw, mask.raw)}; +} +HWY_API Vec256<float> AndNot(Vec256<float> not_mask, Vec256<float> mask) { + return Vec256<float>{_mm256_andnot_ps(not_mask.raw, mask.raw)}; +} +HWY_API Vec256<double> AndNot(Vec256<double> not_mask, Vec256<double> mask) { + return Vec256<double>{_mm256_andnot_pd(not_mask.raw, mask.raw)}; +} + +// ------------------------------ Or + +template <typename T> +HWY_API Vec256<T> Or(Vec256<T> a, Vec256<T> b) { + return Vec256<T>{_mm256_or_si256(a.raw, b.raw)}; +} + +HWY_API Vec256<float> Or(Vec256<float> a, Vec256<float> b) { + return Vec256<float>{_mm256_or_ps(a.raw, b.raw)}; +} +HWY_API Vec256<double> Or(Vec256<double> a, Vec256<double> b) { + return Vec256<double>{_mm256_or_pd(a.raw, b.raw)}; +} + +// ------------------------------ Xor + +template <typename T> +HWY_API Vec256<T> Xor(Vec256<T> a, Vec256<T> b) { + return Vec256<T>{_mm256_xor_si256(a.raw, b.raw)}; +} + +HWY_API Vec256<float> Xor(Vec256<float> a, Vec256<float> b) { + return Vec256<float>{_mm256_xor_ps(a.raw, b.raw)}; +} +HWY_API Vec256<double> Xor(Vec256<double> a, Vec256<double> b) { + return Vec256<double>{_mm256_xor_pd(a.raw, b.raw)}; +} + +// ------------------------------ Not +template <typename T> +HWY_API Vec256<T> Not(const Vec256<T> v) { + const DFromV<decltype(v)> d; + using TU = MakeUnsigned<T>; +#if HWY_TARGET <= HWY_AVX3 + const __m256i vu = BitCast(RebindToUnsigned<decltype(d)>(), v).raw; + return BitCast(d, Vec256<TU>{_mm256_ternarylogic_epi32(vu, vu, vu, 0x55)}); +#else + return Xor(v, BitCast(d, Vec256<TU>{_mm256_set1_epi32(-1)})); +#endif +} + 
+// ------------------------------ Xor3 +template <typename T> +HWY_API Vec256<T> Xor3(Vec256<T> x1, Vec256<T> x2, Vec256<T> x3) { +#if HWY_TARGET <= HWY_AVX3 + const DFromV<decltype(x1)> d; + const RebindToUnsigned<decltype(d)> du; + using VU = VFromD<decltype(du)>; + const __m256i ret = _mm256_ternarylogic_epi64( + BitCast(du, x1).raw, BitCast(du, x2).raw, BitCast(du, x3).raw, 0x96); + return BitCast(d, VU{ret}); +#else + return Xor(x1, Xor(x2, x3)); +#endif +} + +// ------------------------------ Or3 +template <typename T> +HWY_API Vec256<T> Or3(Vec256<T> o1, Vec256<T> o2, Vec256<T> o3) { +#if HWY_TARGET <= HWY_AVX3 + const DFromV<decltype(o1)> d; + const RebindToUnsigned<decltype(d)> du; + using VU = VFromD<decltype(du)>; + const __m256i ret = _mm256_ternarylogic_epi64( + BitCast(du, o1).raw, BitCast(du, o2).raw, BitCast(du, o3).raw, 0xFE); + return BitCast(d, VU{ret}); +#else + return Or(o1, Or(o2, o3)); +#endif +} + +// ------------------------------ OrAnd +template <typename T> +HWY_API Vec256<T> OrAnd(Vec256<T> o, Vec256<T> a1, Vec256<T> a2) { +#if HWY_TARGET <= HWY_AVX3 + const DFromV<decltype(o)> d; + const RebindToUnsigned<decltype(d)> du; + using VU = VFromD<decltype(du)>; + const __m256i ret = _mm256_ternarylogic_epi64( + BitCast(du, o).raw, BitCast(du, a1).raw, BitCast(du, a2).raw, 0xF8); + return BitCast(d, VU{ret}); +#else + return Or(o, And(a1, a2)); +#endif +} + +// ------------------------------ IfVecThenElse +template <typename T> +HWY_API Vec256<T> IfVecThenElse(Vec256<T> mask, Vec256<T> yes, Vec256<T> no) { +#if HWY_TARGET <= HWY_AVX3 + const DFromV<decltype(yes)> d; + const RebindToUnsigned<decltype(d)> du; + using VU = VFromD<decltype(du)>; + return BitCast(d, VU{_mm256_ternarylogic_epi64(BitCast(du, mask).raw, + BitCast(du, yes).raw, + BitCast(du, no).raw, 0xCA)}); +#else + return IfThenElse(MaskFromVec(mask), yes, no); +#endif +} + +// ------------------------------ Operator overloads (internal-only if float) + +template <typename T> 
+HWY_API Vec256<T> operator&(const Vec256<T> a, const Vec256<T> b) { + return And(a, b); +} + +template <typename T> +HWY_API Vec256<T> operator|(const Vec256<T> a, const Vec256<T> b) { + return Or(a, b); +} + +template <typename T> +HWY_API Vec256<T> operator^(const Vec256<T> a, const Vec256<T> b) { + return Xor(a, b); +} + +// ------------------------------ PopulationCount + +// 8/16 require BITALG, 32/64 require VPOPCNTDQ. +#if HWY_TARGET <= HWY_AVX3_DL + +#ifdef HWY_NATIVE_POPCNT +#undef HWY_NATIVE_POPCNT +#else +#define HWY_NATIVE_POPCNT +#endif + +namespace detail { + +template <typename T> +HWY_INLINE Vec256<T> PopulationCount(hwy::SizeTag<1> /* tag */, Vec256<T> v) { + return Vec256<T>{_mm256_popcnt_epi8(v.raw)}; +} +template <typename T> +HWY_INLINE Vec256<T> PopulationCount(hwy::SizeTag<2> /* tag */, Vec256<T> v) { + return Vec256<T>{_mm256_popcnt_epi16(v.raw)}; +} +template <typename T> +HWY_INLINE Vec256<T> PopulationCount(hwy::SizeTag<4> /* tag */, Vec256<T> v) { + return Vec256<T>{_mm256_popcnt_epi32(v.raw)}; +} +template <typename T> +HWY_INLINE Vec256<T> PopulationCount(hwy::SizeTag<8> /* tag */, Vec256<T> v) { + return Vec256<T>{_mm256_popcnt_epi64(v.raw)}; +} + +} // namespace detail + +template <typename T> +HWY_API Vec256<T> PopulationCount(Vec256<T> v) { + return detail::PopulationCount(hwy::SizeTag<sizeof(T)>(), v); +} + +#endif // HWY_TARGET <= HWY_AVX3_DL + +// ================================================== SIGN + +// ------------------------------ CopySign + +template <typename T> +HWY_API Vec256<T> CopySign(const Vec256<T> magn, const Vec256<T> sign) { + static_assert(IsFloat<T>(), "Only makes sense for floating-point"); + + const DFromV<decltype(magn)> d; + const auto msb = SignBit(d); + +#if HWY_TARGET <= HWY_AVX3 + const Rebind<MakeUnsigned<T>, decltype(d)> du; + // Truth table for msb, magn, sign | bitwise msb ? 
sign : mag + // 0 0 0 | 0 + // 0 0 1 | 0 + // 0 1 0 | 1 + // 0 1 1 | 1 + // 1 0 0 | 0 + // 1 0 1 | 1 + // 1 1 0 | 0 + // 1 1 1 | 1 + // The lane size does not matter because we are not using predication. + const __m256i out = _mm256_ternarylogic_epi32( + BitCast(du, msb).raw, BitCast(du, magn).raw, BitCast(du, sign).raw, 0xAC); + return BitCast(d, decltype(Zero(du)){out}); +#else + return Or(AndNot(msb, magn), And(msb, sign)); +#endif +} + +template <typename T> +HWY_API Vec256<T> CopySignToAbs(const Vec256<T> abs, const Vec256<T> sign) { +#if HWY_TARGET <= HWY_AVX3 + // AVX3 can also handle abs < 0, so no extra action needed. + return CopySign(abs, sign); +#else + const DFromV<decltype(abs)> d; + return Or(abs, And(SignBit(d), sign)); +#endif +} + +// ================================================== MASK + +#if HWY_TARGET <= HWY_AVX3 + +// ------------------------------ IfThenElse + +// Returns mask ? b : a. + +namespace detail { + +// Templates for signed/unsigned integer of a particular size. 
+template <typename T> +HWY_INLINE Vec256<T> IfThenElse(hwy::SizeTag<1> /* tag */, Mask256<T> mask, + Vec256<T> yes, Vec256<T> no) { + return Vec256<T>{_mm256_mask_mov_epi8(no.raw, mask.raw, yes.raw)}; +} +template <typename T> +HWY_INLINE Vec256<T> IfThenElse(hwy::SizeTag<2> /* tag */, Mask256<T> mask, + Vec256<T> yes, Vec256<T> no) { + return Vec256<T>{_mm256_mask_mov_epi16(no.raw, mask.raw, yes.raw)}; +} +template <typename T> +HWY_INLINE Vec256<T> IfThenElse(hwy::SizeTag<4> /* tag */, Mask256<T> mask, + Vec256<T> yes, Vec256<T> no) { + return Vec256<T>{_mm256_mask_mov_epi32(no.raw, mask.raw, yes.raw)}; +} +template <typename T> +HWY_INLINE Vec256<T> IfThenElse(hwy::SizeTag<8> /* tag */, Mask256<T> mask, + Vec256<T> yes, Vec256<T> no) { + return Vec256<T>{_mm256_mask_mov_epi64(no.raw, mask.raw, yes.raw)}; +} + +} // namespace detail + +template <typename T> +HWY_API Vec256<T> IfThenElse(Mask256<T> mask, Vec256<T> yes, Vec256<T> no) { + return detail::IfThenElse(hwy::SizeTag<sizeof(T)>(), mask, yes, no); +} +HWY_API Vec256<float> IfThenElse(Mask256<float> mask, Vec256<float> yes, + Vec256<float> no) { + return Vec256<float>{_mm256_mask_mov_ps(no.raw, mask.raw, yes.raw)}; +} +HWY_API Vec256<double> IfThenElse(Mask256<double> mask, Vec256<double> yes, + Vec256<double> no) { + return Vec256<double>{_mm256_mask_mov_pd(no.raw, mask.raw, yes.raw)}; +} + +namespace detail { + +template <typename T> +HWY_INLINE Vec256<T> IfThenElseZero(hwy::SizeTag<1> /* tag */, Mask256<T> mask, + Vec256<T> yes) { + return Vec256<T>{_mm256_maskz_mov_epi8(mask.raw, yes.raw)}; +} +template <typename T> +HWY_INLINE Vec256<T> IfThenElseZero(hwy::SizeTag<2> /* tag */, Mask256<T> mask, + Vec256<T> yes) { + return Vec256<T>{_mm256_maskz_mov_epi16(mask.raw, yes.raw)}; +} +template <typename T> +HWY_INLINE Vec256<T> IfThenElseZero(hwy::SizeTag<4> /* tag */, Mask256<T> mask, + Vec256<T> yes) { + return Vec256<T>{_mm256_maskz_mov_epi32(mask.raw, yes.raw)}; +} +template <typename T> +HWY_INLINE 
Vec256<T> IfThenElseZero(hwy::SizeTag<8> /* tag */, Mask256<T> mask, + Vec256<T> yes) { + return Vec256<T>{_mm256_maskz_mov_epi64(mask.raw, yes.raw)}; +} + +} // namespace detail + +template <typename T> +HWY_API Vec256<T> IfThenElseZero(Mask256<T> mask, Vec256<T> yes) { + return detail::IfThenElseZero(hwy::SizeTag<sizeof(T)>(), mask, yes); +} +HWY_API Vec256<float> IfThenElseZero(Mask256<float> mask, Vec256<float> yes) { + return Vec256<float>{_mm256_maskz_mov_ps(mask.raw, yes.raw)}; +} +HWY_API Vec256<double> IfThenElseZero(Mask256<double> mask, + Vec256<double> yes) { + return Vec256<double>{_mm256_maskz_mov_pd(mask.raw, yes.raw)}; +} + +namespace detail { + +template <typename T> +HWY_INLINE Vec256<T> IfThenZeroElse(hwy::SizeTag<1> /* tag */, Mask256<T> mask, + Vec256<T> no) { + // xor_epi8/16 are missing, but we have sub, which is just as fast for u8/16. + return Vec256<T>{_mm256_mask_sub_epi8(no.raw, mask.raw, no.raw, no.raw)}; +} +template <typename T> +HWY_INLINE Vec256<T> IfThenZeroElse(hwy::SizeTag<2> /* tag */, Mask256<T> mask, + Vec256<T> no) { + return Vec256<T>{_mm256_mask_sub_epi16(no.raw, mask.raw, no.raw, no.raw)}; +} +template <typename T> +HWY_INLINE Vec256<T> IfThenZeroElse(hwy::SizeTag<4> /* tag */, Mask256<T> mask, + Vec256<T> no) { + return Vec256<T>{_mm256_mask_xor_epi32(no.raw, mask.raw, no.raw, no.raw)}; +} +template <typename T> +HWY_INLINE Vec256<T> IfThenZeroElse(hwy::SizeTag<8> /* tag */, Mask256<T> mask, + Vec256<T> no) { + return Vec256<T>{_mm256_mask_xor_epi64(no.raw, mask.raw, no.raw, no.raw)}; +} + +} // namespace detail + +template <typename T> +HWY_API Vec256<T> IfThenZeroElse(Mask256<T> mask, Vec256<T> no) { + return detail::IfThenZeroElse(hwy::SizeTag<sizeof(T)>(), mask, no); +} +HWY_API Vec256<float> IfThenZeroElse(Mask256<float> mask, Vec256<float> no) { + return Vec256<float>{_mm256_mask_xor_ps(no.raw, mask.raw, no.raw, no.raw)}; +} +HWY_API Vec256<double> IfThenZeroElse(Mask256<double> mask, Vec256<double> no) { + return 
Vec256<double>{_mm256_mask_xor_pd(no.raw, mask.raw, no.raw, no.raw)}; +} + +template <typename T> +HWY_API Vec256<T> ZeroIfNegative(const Vec256<T> v) { + static_assert(IsSigned<T>(), "Only for float"); + // AVX3 MaskFromVec only looks at the MSB + return IfThenZeroElse(MaskFromVec(v), v); +} + +// ------------------------------ Mask logical + +namespace detail { + +template <typename T> +HWY_INLINE Mask256<T> And(hwy::SizeTag<1> /*tag*/, const Mask256<T> a, + const Mask256<T> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask256<T>{_kand_mask32(a.raw, b.raw)}; +#else + return Mask256<T>{static_cast<__mmask32>(a.raw & b.raw)}; +#endif +} +template <typename T> +HWY_INLINE Mask256<T> And(hwy::SizeTag<2> /*tag*/, const Mask256<T> a, + const Mask256<T> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask256<T>{_kand_mask16(a.raw, b.raw)}; +#else + return Mask256<T>{static_cast<__mmask16>(a.raw & b.raw)}; +#endif +} +template <typename T> +HWY_INLINE Mask256<T> And(hwy::SizeTag<4> /*tag*/, const Mask256<T> a, + const Mask256<T> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask256<T>{_kand_mask8(a.raw, b.raw)}; +#else + return Mask256<T>{static_cast<__mmask8>(a.raw & b.raw)}; +#endif +} +template <typename T> +HWY_INLINE Mask256<T> And(hwy::SizeTag<8> /*tag*/, const Mask256<T> a, + const Mask256<T> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask256<T>{_kand_mask8(a.raw, b.raw)}; +#else + return Mask256<T>{static_cast<__mmask8>(a.raw & b.raw)}; +#endif +} + +template <typename T> +HWY_INLINE Mask256<T> AndNot(hwy::SizeTag<1> /*tag*/, const Mask256<T> a, + const Mask256<T> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask256<T>{_kandn_mask32(a.raw, b.raw)}; +#else + return Mask256<T>{static_cast<__mmask32>(~a.raw & b.raw)}; +#endif +} +template <typename T> +HWY_INLINE Mask256<T> AndNot(hwy::SizeTag<2> /*tag*/, const Mask256<T> a, + const Mask256<T> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask256<T>{_kandn_mask16(a.raw, 
b.raw)}; +#else + return Mask256<T>{static_cast<__mmask16>(~a.raw & b.raw)}; +#endif +} +template <typename T> +HWY_INLINE Mask256<T> AndNot(hwy::SizeTag<4> /*tag*/, const Mask256<T> a, + const Mask256<T> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask256<T>{_kandn_mask8(a.raw, b.raw)}; +#else + return Mask256<T>{static_cast<__mmask8>(~a.raw & b.raw)}; +#endif +} +template <typename T> +HWY_INLINE Mask256<T> AndNot(hwy::SizeTag<8> /*tag*/, const Mask256<T> a, + const Mask256<T> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask256<T>{_kandn_mask8(a.raw, b.raw)}; +#else + return Mask256<T>{static_cast<__mmask8>(~a.raw & b.raw)}; +#endif +} + +template <typename T> +HWY_INLINE Mask256<T> Or(hwy::SizeTag<1> /*tag*/, const Mask256<T> a, + const Mask256<T> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask256<T>{_kor_mask32(a.raw, b.raw)}; +#else + return Mask256<T>{static_cast<__mmask32>(a.raw | b.raw)}; +#endif +} +template <typename T> +HWY_INLINE Mask256<T> Or(hwy::SizeTag<2> /*tag*/, const Mask256<T> a, + const Mask256<T> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask256<T>{_kor_mask16(a.raw, b.raw)}; +#else + return Mask256<T>{static_cast<__mmask16>(a.raw | b.raw)}; +#endif +} +template <typename T> +HWY_INLINE Mask256<T> Or(hwy::SizeTag<4> /*tag*/, const Mask256<T> a, + const Mask256<T> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask256<T>{_kor_mask8(a.raw, b.raw)}; +#else + return Mask256<T>{static_cast<__mmask8>(a.raw | b.raw)}; +#endif +} +template <typename T> +HWY_INLINE Mask256<T> Or(hwy::SizeTag<8> /*tag*/, const Mask256<T> a, + const Mask256<T> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask256<T>{_kor_mask8(a.raw, b.raw)}; +#else + return Mask256<T>{static_cast<__mmask8>(a.raw | b.raw)}; +#endif +} + +template <typename T> +HWY_INLINE Mask256<T> Xor(hwy::SizeTag<1> /*tag*/, const Mask256<T> a, + const Mask256<T> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask256<T>{_kxor_mask32(a.raw, b.raw)}; 
+#else + return Mask256<T>{static_cast<__mmask32>(a.raw ^ b.raw)}; +#endif +} +template <typename T> +HWY_INLINE Mask256<T> Xor(hwy::SizeTag<2> /*tag*/, const Mask256<T> a, + const Mask256<T> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask256<T>{_kxor_mask16(a.raw, b.raw)}; +#else + return Mask256<T>{static_cast<__mmask16>(a.raw ^ b.raw)}; +#endif +} +template <typename T> +HWY_INLINE Mask256<T> Xor(hwy::SizeTag<4> /*tag*/, const Mask256<T> a, + const Mask256<T> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask256<T>{_kxor_mask8(a.raw, b.raw)}; +#else + return Mask256<T>{static_cast<__mmask8>(a.raw ^ b.raw)}; +#endif +} +template <typename T> +HWY_INLINE Mask256<T> Xor(hwy::SizeTag<8> /*tag*/, const Mask256<T> a, + const Mask256<T> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask256<T>{_kxor_mask8(a.raw, b.raw)}; +#else + return Mask256<T>{static_cast<__mmask8>(a.raw ^ b.raw)}; +#endif +} + +template <typename T> +HWY_INLINE Mask256<T> ExclusiveNeither(hwy::SizeTag<1> /*tag*/, + const Mask256<T> a, const Mask256<T> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask256<T>{_kxnor_mask32(a.raw, b.raw)}; +#else + return Mask256<T>{static_cast<__mmask32>(~(a.raw ^ b.raw) & 0xFFFFFFFF)}; +#endif +} +template <typename T> +HWY_INLINE Mask256<T> ExclusiveNeither(hwy::SizeTag<2> /*tag*/, + const Mask256<T> a, const Mask256<T> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask256<T>{_kxnor_mask16(a.raw, b.raw)}; +#else + return Mask256<T>{static_cast<__mmask16>(~(a.raw ^ b.raw) & 0xFFFF)}; +#endif +} +template <typename T> +HWY_INLINE Mask256<T> ExclusiveNeither(hwy::SizeTag<4> /*tag*/, + const Mask256<T> a, const Mask256<T> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask256<T>{_kxnor_mask8(a.raw, b.raw)}; +#else + return Mask256<T>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xFF)}; +#endif +} +template <typename T> +HWY_INLINE Mask256<T> ExclusiveNeither(hwy::SizeTag<8> /*tag*/, + const Mask256<T> a, const Mask256<T> b) { +#if 
HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask256<T>{static_cast<__mmask8>(_kxnor_mask8(a.raw, b.raw) & 0xF)}; +#else + return Mask256<T>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xF)}; +#endif +} + +} // namespace detail + +template <typename T> +HWY_API Mask256<T> And(const Mask256<T> a, Mask256<T> b) { + return detail::And(hwy::SizeTag<sizeof(T)>(), a, b); +} + +template <typename T> +HWY_API Mask256<T> AndNot(const Mask256<T> a, Mask256<T> b) { + return detail::AndNot(hwy::SizeTag<sizeof(T)>(), a, b); +} + +template <typename T> +HWY_API Mask256<T> Or(const Mask256<T> a, Mask256<T> b) { + return detail::Or(hwy::SizeTag<sizeof(T)>(), a, b); +} + +template <typename T> +HWY_API Mask256<T> Xor(const Mask256<T> a, Mask256<T> b) { + return detail::Xor(hwy::SizeTag<sizeof(T)>(), a, b); +} + +template <typename T> +HWY_API Mask256<T> Not(const Mask256<T> m) { + // Flip only the valid bits. + constexpr size_t N = 32 / sizeof(T); + return Xor(m, Mask256<T>::FromBits((1ull << N) - 1)); +} + +template <typename T> +HWY_API Mask256<T> ExclusiveNeither(const Mask256<T> a, Mask256<T> b) { + return detail::ExclusiveNeither(hwy::SizeTag<sizeof(T)>(), a, b); +} + +#else // AVX2 + +// ------------------------------ Mask + +// Mask and Vec are the same (true = FF..FF). +template <typename T> +HWY_API Mask256<T> MaskFromVec(const Vec256<T> v) { + return Mask256<T>{v.raw}; +} + +template <typename T> +HWY_API Vec256<T> VecFromMask(const Mask256<T> v) { + return Vec256<T>{v.raw}; +} + +template <class D, typename T = TFromD<D>> +HWY_API Vec256<T> VecFromMask(D /* tag */, const Mask256<T> v) { + return Vec256<T>{v.raw}; +} + +// ------------------------------ IfThenElse + +// mask ? 
yes : no +template <typename T> +HWY_API Vec256<T> IfThenElse(const Mask256<T> mask, const Vec256<T> yes, + const Vec256<T> no) { + return Vec256<T>{_mm256_blendv_epi8(no.raw, yes.raw, mask.raw)}; +} +HWY_API Vec256<float> IfThenElse(const Mask256<float> mask, + const Vec256<float> yes, + const Vec256<float> no) { + return Vec256<float>{_mm256_blendv_ps(no.raw, yes.raw, mask.raw)}; +} +HWY_API Vec256<double> IfThenElse(const Mask256<double> mask, + const Vec256<double> yes, + const Vec256<double> no) { + return Vec256<double>{_mm256_blendv_pd(no.raw, yes.raw, mask.raw)}; +} + +// mask ? yes : 0 +template <typename T> +HWY_API Vec256<T> IfThenElseZero(Mask256<T> mask, Vec256<T> yes) { + const DFromV<decltype(yes)> d; + return yes & VecFromMask(d, mask); +} + +// mask ? 0 : no +template <typename T> +HWY_API Vec256<T> IfThenZeroElse(Mask256<T> mask, Vec256<T> no) { + const DFromV<decltype(no)> d; + return AndNot(VecFromMask(d, mask), no); +} + +template <typename T> +HWY_API Vec256<T> ZeroIfNegative(Vec256<T> v) { + static_assert(IsSigned<T>(), "Only for float"); + const DFromV<decltype(v)> d; + const auto zero = Zero(d); + // AVX2 IfThenElse only looks at the MSB for 32/64-bit lanes + return IfThenElse(MaskFromVec(v), zero, v); +} + +// ------------------------------ Mask logical + +template <typename T> +HWY_API Mask256<T> Not(const Mask256<T> m) { + const Full256<T> d; + return MaskFromVec(Not(VecFromMask(d, m))); +} + +template <typename T> +HWY_API Mask256<T> And(const Mask256<T> a, Mask256<T> b) { + const Full256<T> d; + return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b))); +} + +template <typename T> +HWY_API Mask256<T> AndNot(const Mask256<T> a, Mask256<T> b) { + const Full256<T> d; + return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b))); +} + +template <typename T> +HWY_API Mask256<T> Or(const Mask256<T> a, Mask256<T> b) { + const Full256<T> d; + return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b))); +} + +template <typename T> 
+HWY_API Mask256<T> Xor(const Mask256<T> a, Mask256<T> b) { + const Full256<T> d; + return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b))); +} + +template <typename T> +HWY_API Mask256<T> ExclusiveNeither(const Mask256<T> a, Mask256<T> b) { + const Full256<T> d; + return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b)))); +} + +#endif // HWY_TARGET <= HWY_AVX3 + +// ================================================== COMPARE + +#if HWY_TARGET <= HWY_AVX3 + +// Comparisons set a mask bit to 1 if the condition is true, else 0. + +template <typename TFrom, class DTo, typename TTo = TFromD<DTo>> +HWY_API Mask256<TTo> RebindMask(DTo /*tag*/, Mask256<TFrom> m) { + static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size"); + return Mask256<TTo>{m.raw}; +} + +namespace detail { + +template <typename T> +HWY_INLINE Mask256<T> TestBit(hwy::SizeTag<1> /*tag*/, const Vec256<T> v, + const Vec256<T> bit) { + return Mask256<T>{_mm256_test_epi8_mask(v.raw, bit.raw)}; +} +template <typename T> +HWY_INLINE Mask256<T> TestBit(hwy::SizeTag<2> /*tag*/, const Vec256<T> v, + const Vec256<T> bit) { + return Mask256<T>{_mm256_test_epi16_mask(v.raw, bit.raw)}; +} +template <typename T> +HWY_INLINE Mask256<T> TestBit(hwy::SizeTag<4> /*tag*/, const Vec256<T> v, + const Vec256<T> bit) { + return Mask256<T>{_mm256_test_epi32_mask(v.raw, bit.raw)}; +} +template <typename T> +HWY_INLINE Mask256<T> TestBit(hwy::SizeTag<8> /*tag*/, const Vec256<T> v, + const Vec256<T> bit) { + return Mask256<T>{_mm256_test_epi64_mask(v.raw, bit.raw)}; +} + +} // namespace detail + +template <typename T> +HWY_API Mask256<T> TestBit(const Vec256<T> v, const Vec256<T> bit) { + static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported"); + return detail::TestBit(hwy::SizeTag<sizeof(T)>(), v, bit); +} + +// ------------------------------ Equality + +template <typename T, HWY_IF_T_SIZE(T, 1)> +HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) { + return 
Mask256<T>{_mm256_cmpeq_epi8_mask(a.raw, b.raw)}; +} +template <typename T, HWY_IF_T_SIZE(T, 2)> +HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) { + return Mask256<T>{_mm256_cmpeq_epi16_mask(a.raw, b.raw)}; +} +template <typename T, HWY_IF_UI32(T)> +HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) { + return Mask256<T>{_mm256_cmpeq_epi32_mask(a.raw, b.raw)}; +} +template <typename T, HWY_IF_UI64(T)> +HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) { + return Mask256<T>{_mm256_cmpeq_epi64_mask(a.raw, b.raw)}; +} + +HWY_API Mask256<float> operator==(Vec256<float> a, Vec256<float> b) { + return Mask256<float>{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_EQ_OQ)}; +} + +HWY_API Mask256<double> operator==(Vec256<double> a, Vec256<double> b) { + return Mask256<double>{_mm256_cmp_pd_mask(a.raw, b.raw, _CMP_EQ_OQ)}; +} + +// ------------------------------ Inequality + +template <typename T, HWY_IF_T_SIZE(T, 1)> +HWY_API Mask256<T> operator!=(const Vec256<T> a, const Vec256<T> b) { + return Mask256<T>{_mm256_cmpneq_epi8_mask(a.raw, b.raw)}; +} +template <typename T, HWY_IF_T_SIZE(T, 2)> +HWY_API Mask256<T> operator!=(const Vec256<T> a, const Vec256<T> b) { + return Mask256<T>{_mm256_cmpneq_epi16_mask(a.raw, b.raw)}; +} +template <typename T, HWY_IF_UI32(T)> +HWY_API Mask256<T> operator!=(const Vec256<T> a, const Vec256<T> b) { + return Mask256<T>{_mm256_cmpneq_epi32_mask(a.raw, b.raw)}; +} +template <typename T, HWY_IF_UI64(T)> +HWY_API Mask256<T> operator!=(const Vec256<T> a, const Vec256<T> b) { + return Mask256<T>{_mm256_cmpneq_epi64_mask(a.raw, b.raw)}; +} + +HWY_API Mask256<float> operator!=(Vec256<float> a, Vec256<float> b) { + return Mask256<float>{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_NEQ_OQ)}; +} + +HWY_API Mask256<double> operator!=(Vec256<double> a, Vec256<double> b) { + return Mask256<double>{_mm256_cmp_pd_mask(a.raw, b.raw, _CMP_NEQ_OQ)}; +} + +// ------------------------------ Strict inequality + +HWY_API 
Mask256<int8_t> operator>(Vec256<int8_t> a, Vec256<int8_t> b) { + return Mask256<int8_t>{_mm256_cmpgt_epi8_mask(a.raw, b.raw)}; +} +HWY_API Mask256<int16_t> operator>(Vec256<int16_t> a, Vec256<int16_t> b) { + return Mask256<int16_t>{_mm256_cmpgt_epi16_mask(a.raw, b.raw)}; +} +HWY_API Mask256<int32_t> operator>(Vec256<int32_t> a, Vec256<int32_t> b) { + return Mask256<int32_t>{_mm256_cmpgt_epi32_mask(a.raw, b.raw)}; +} +HWY_API Mask256<int64_t> operator>(Vec256<int64_t> a, Vec256<int64_t> b) { + return Mask256<int64_t>{_mm256_cmpgt_epi64_mask(a.raw, b.raw)}; +} + +HWY_API Mask256<uint8_t> operator>(Vec256<uint8_t> a, Vec256<uint8_t> b) { + return Mask256<uint8_t>{_mm256_cmpgt_epu8_mask(a.raw, b.raw)}; +} +HWY_API Mask256<uint16_t> operator>(const Vec256<uint16_t> a, + const Vec256<uint16_t> b) { + return Mask256<uint16_t>{_mm256_cmpgt_epu16_mask(a.raw, b.raw)}; +} +HWY_API Mask256<uint32_t> operator>(const Vec256<uint32_t> a, + const Vec256<uint32_t> b) { + return Mask256<uint32_t>{_mm256_cmpgt_epu32_mask(a.raw, b.raw)}; +} +HWY_API Mask256<uint64_t> operator>(const Vec256<uint64_t> a, + const Vec256<uint64_t> b) { + return Mask256<uint64_t>{_mm256_cmpgt_epu64_mask(a.raw, b.raw)}; +} + +HWY_API Mask256<float> operator>(Vec256<float> a, Vec256<float> b) { + return Mask256<float>{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_GT_OQ)}; +} +HWY_API Mask256<double> operator>(Vec256<double> a, Vec256<double> b) { + return Mask256<double>{_mm256_cmp_pd_mask(a.raw, b.raw, _CMP_GT_OQ)}; +} + +// ------------------------------ Weak inequality + +HWY_API Mask256<float> operator>=(Vec256<float> a, Vec256<float> b) { + return Mask256<float>{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_GE_OQ)}; +} +HWY_API Mask256<double> operator>=(Vec256<double> a, Vec256<double> b) { + return Mask256<double>{_mm256_cmp_pd_mask(a.raw, b.raw, _CMP_GE_OQ)}; +} + +HWY_API Mask256<int8_t> operator>=(Vec256<int8_t> a, Vec256<int8_t> b) { + return Mask256<int8_t>{_mm256_cmpge_epi8_mask(a.raw, b.raw)}; +} +HWY_API 
Mask256<int16_t> operator>=(Vec256<int16_t> a, Vec256<int16_t> b) { + return Mask256<int16_t>{_mm256_cmpge_epi16_mask(a.raw, b.raw)}; +} +HWY_API Mask256<int32_t> operator>=(Vec256<int32_t> a, Vec256<int32_t> b) { + return Mask256<int32_t>{_mm256_cmpge_epi32_mask(a.raw, b.raw)}; +} +HWY_API Mask256<int64_t> operator>=(Vec256<int64_t> a, Vec256<int64_t> b) { + return Mask256<int64_t>{_mm256_cmpge_epi64_mask(a.raw, b.raw)}; +} + +HWY_API Mask256<uint8_t> operator>=(Vec256<uint8_t> a, Vec256<uint8_t> b) { + return Mask256<uint8_t>{_mm256_cmpge_epu8_mask(a.raw, b.raw)}; +} +HWY_API Mask256<uint16_t> operator>=(const Vec256<uint16_t> a, + const Vec256<uint16_t> b) { + return Mask256<uint16_t>{_mm256_cmpge_epu16_mask(a.raw, b.raw)}; +} +HWY_API Mask256<uint32_t> operator>=(const Vec256<uint32_t> a, + const Vec256<uint32_t> b) { + return Mask256<uint32_t>{_mm256_cmpge_epu32_mask(a.raw, b.raw)}; +} +HWY_API Mask256<uint64_t> operator>=(const Vec256<uint64_t> a, + const Vec256<uint64_t> b) { + return Mask256<uint64_t>{_mm256_cmpge_epu64_mask(a.raw, b.raw)}; +} + +// ------------------------------ Mask + +namespace detail { + +template <typename T> +HWY_INLINE Mask256<T> MaskFromVec(hwy::SizeTag<1> /*tag*/, const Vec256<T> v) { + return Mask256<T>{_mm256_movepi8_mask(v.raw)}; +} +template <typename T> +HWY_INLINE Mask256<T> MaskFromVec(hwy::SizeTag<2> /*tag*/, const Vec256<T> v) { + return Mask256<T>{_mm256_movepi16_mask(v.raw)}; +} +template <typename T> +HWY_INLINE Mask256<T> MaskFromVec(hwy::SizeTag<4> /*tag*/, const Vec256<T> v) { + return Mask256<T>{_mm256_movepi32_mask(v.raw)}; +} +template <typename T> +HWY_INLINE Mask256<T> MaskFromVec(hwy::SizeTag<8> /*tag*/, const Vec256<T> v) { + return Mask256<T>{_mm256_movepi64_mask(v.raw)}; +} + +} // namespace detail + +template <typename T> +HWY_API Mask256<T> MaskFromVec(const Vec256<T> v) { + return detail::MaskFromVec(hwy::SizeTag<sizeof(T)>(), v); +} +// There do not seem to be native floating-point versions of these 
instructions. +HWY_API Mask256<float> MaskFromVec(const Vec256<float> v) { + const Full256<int32_t> di; + return Mask256<float>{MaskFromVec(BitCast(di, v)).raw}; +} +HWY_API Mask256<double> MaskFromVec(const Vec256<double> v) { + const Full256<int64_t> di; + return Mask256<double>{MaskFromVec(BitCast(di, v)).raw}; +} + +template <typename T, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec256<T> VecFromMask(const Mask256<T> v) { + return Vec256<T>{_mm256_movm_epi8(v.raw)}; +} + +template <typename T, HWY_IF_T_SIZE(T, 2)> +HWY_API Vec256<T> VecFromMask(const Mask256<T> v) { + return Vec256<T>{_mm256_movm_epi16(v.raw)}; +} + +template <typename T, HWY_IF_UI32(T)> +HWY_API Vec256<T> VecFromMask(const Mask256<T> v) { + return Vec256<T>{_mm256_movm_epi32(v.raw)}; +} + +template <typename T, HWY_IF_UI64(T)> +HWY_API Vec256<T> VecFromMask(const Mask256<T> v) { + return Vec256<T>{_mm256_movm_epi64(v.raw)}; +} + +HWY_API Vec256<float> VecFromMask(const Mask256<float> v) { + return Vec256<float>{_mm256_castsi256_ps(_mm256_movm_epi32(v.raw))}; +} + +HWY_API Vec256<double> VecFromMask(const Mask256<double> v) { + return Vec256<double>{_mm256_castsi256_pd(_mm256_movm_epi64(v.raw))}; +} + +template <class D, typename T = TFromD<D>> +HWY_API Vec256<T> VecFromMask(D /* tag */, const Mask256<T> v) { + return VecFromMask(v); +} + +#else // AVX2 + +// Comparisons fill a lane with 1-bits if the condition is true, else 0. 
+ +template <typename TFrom, class DTo, typename TTo = TFromD<DTo>> +HWY_API Mask256<TTo> RebindMask(DTo d_to, Mask256<TFrom> m) { + static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size"); + const Full256<TFrom> dfrom; + return MaskFromVec(BitCast(d_to, VecFromMask(dfrom, m))); +} + +template <typename T> +HWY_API Mask256<T> TestBit(const Vec256<T> v, const Vec256<T> bit) { + static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported"); + return (v & bit) == bit; +} + +// ------------------------------ Equality + +template <typename T, HWY_IF_T_SIZE(T, 1)> +HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) { + return Mask256<T>{_mm256_cmpeq_epi8(a.raw, b.raw)}; +} + +template <typename T, HWY_IF_T_SIZE(T, 2)> +HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) { + return Mask256<T>{_mm256_cmpeq_epi16(a.raw, b.raw)}; +} + +template <typename T, HWY_IF_UI32(T)> +HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) { + return Mask256<T>{_mm256_cmpeq_epi32(a.raw, b.raw)}; +} + +template <typename T, HWY_IF_UI64(T)> +HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) { + return Mask256<T>{_mm256_cmpeq_epi64(a.raw, b.raw)}; +} + +HWY_API Mask256<float> operator==(const Vec256<float> a, + const Vec256<float> b) { + return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_EQ_OQ)}; +} + +HWY_API Mask256<double> operator==(const Vec256<double> a, + const Vec256<double> b) { + return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_EQ_OQ)}; +} + +// ------------------------------ Inequality + +template <typename T> +HWY_API Mask256<T> operator!=(const Vec256<T> a, const Vec256<T> b) { + return Not(a == b); +} +HWY_API Mask256<float> operator!=(const Vec256<float> a, + const Vec256<float> b) { + return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_NEQ_OQ)}; +} +HWY_API Mask256<double> operator!=(const Vec256<double> a, + const Vec256<double> b) { + return Mask256<double>{_mm256_cmp_pd(a.raw, 
b.raw, _CMP_NEQ_OQ)}; +} + +// ------------------------------ Strict inequality + +// Tag dispatch instead of SFINAE for MSVC 2017 compatibility +namespace detail { + +// Pre-9.3 GCC immintrin.h uses char, which may be unsigned, causing cmpgt_epi8 +// to perform an unsigned comparison instead of the intended signed. Workaround +// is to cast to an explicitly signed type. See https://godbolt.org/z/PL7Ujy +#if HWY_COMPILER_GCC_ACTUAL != 0 && HWY_COMPILER_GCC_ACTUAL < 903 +#define HWY_AVX2_GCC_CMPGT8_WORKAROUND 1 +#else +#define HWY_AVX2_GCC_CMPGT8_WORKAROUND 0 +#endif + +HWY_API Mask256<int8_t> Gt(hwy::SignedTag /*tag*/, Vec256<int8_t> a, + Vec256<int8_t> b) { +#if HWY_AVX2_GCC_CMPGT8_WORKAROUND + using i8x32 = signed char __attribute__((__vector_size__(32))); + return Mask256<int8_t>{static_cast<__m256i>(reinterpret_cast<i8x32>(a.raw) > + reinterpret_cast<i8x32>(b.raw))}; +#else + return Mask256<int8_t>{_mm256_cmpgt_epi8(a.raw, b.raw)}; +#endif +} +HWY_API Mask256<int16_t> Gt(hwy::SignedTag /*tag*/, Vec256<int16_t> a, + Vec256<int16_t> b) { + return Mask256<int16_t>{_mm256_cmpgt_epi16(a.raw, b.raw)}; +} +HWY_API Mask256<int32_t> Gt(hwy::SignedTag /*tag*/, Vec256<int32_t> a, + Vec256<int32_t> b) { + return Mask256<int32_t>{_mm256_cmpgt_epi32(a.raw, b.raw)}; +} +HWY_API Mask256<int64_t> Gt(hwy::SignedTag /*tag*/, Vec256<int64_t> a, + Vec256<int64_t> b) { + return Mask256<int64_t>{_mm256_cmpgt_epi64(a.raw, b.raw)}; +} + +template <typename T> +HWY_INLINE Mask256<T> Gt(hwy::UnsignedTag /*tag*/, Vec256<T> a, Vec256<T> b) { + const Full256<T> du; + const RebindToSigned<decltype(du)> di; + const Vec256<T> msb = Set(du, (LimitsMax<T>() >> 1) + 1); + return RebindMask(du, BitCast(di, Xor(a, msb)) > BitCast(di, Xor(b, msb))); +} + +HWY_API Mask256<float> Gt(hwy::FloatTag /*tag*/, Vec256<float> a, + Vec256<float> b) { + return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_GT_OQ)}; +} +HWY_API Mask256<double> Gt(hwy::FloatTag /*tag*/, Vec256<double> a, + Vec256<double> b) { + 
return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_GT_OQ)}; +} + +} // namespace detail + +template <typename T> +HWY_API Mask256<T> operator>(Vec256<T> a, Vec256<T> b) { + return detail::Gt(hwy::TypeTag<T>(), a, b); +} + +// ------------------------------ Weak inequality + +namespace detail { + +template <typename T> +HWY_INLINE Mask256<T> Ge(hwy::SignedTag tag, Vec256<T> a, Vec256<T> b) { + return Not(Gt(tag, b, a)); +} + +template <typename T> +HWY_INLINE Mask256<T> Ge(hwy::UnsignedTag tag, Vec256<T> a, Vec256<T> b) { + return Not(Gt(tag, b, a)); +} + +HWY_INLINE Mask256<float> Ge(hwy::FloatTag /*tag*/, Vec256<float> a, + Vec256<float> b) { + return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_GE_OQ)}; +} +HWY_INLINE Mask256<double> Ge(hwy::FloatTag /*tag*/, Vec256<double> a, + Vec256<double> b) { + return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_GE_OQ)}; +} + +} // namespace detail + +template <typename T> +HWY_API Mask256<T> operator>=(Vec256<T> a, Vec256<T> b) { + return detail::Ge(hwy::TypeTag<T>(), a, b); +} + +#endif // HWY_TARGET <= HWY_AVX3 + +// ------------------------------ Reversed comparisons + +template <typename T> +HWY_API Mask256<T> operator<(const Vec256<T> a, const Vec256<T> b) { + return b > a; +} + +template <typename T> +HWY_API Mask256<T> operator<=(const Vec256<T> a, const Vec256<T> b) { + return b >= a; +} + +// ------------------------------ Min (Gt, IfThenElse) + +// Unsigned +HWY_API Vec256<uint8_t> Min(const Vec256<uint8_t> a, const Vec256<uint8_t> b) { + return Vec256<uint8_t>{_mm256_min_epu8(a.raw, b.raw)}; +} +HWY_API Vec256<uint16_t> Min(const Vec256<uint16_t> a, + const Vec256<uint16_t> b) { + return Vec256<uint16_t>{_mm256_min_epu16(a.raw, b.raw)}; +} +HWY_API Vec256<uint32_t> Min(const Vec256<uint32_t> a, + const Vec256<uint32_t> b) { + return Vec256<uint32_t>{_mm256_min_epu32(a.raw, b.raw)}; +} +HWY_API Vec256<uint64_t> Min(const Vec256<uint64_t> a, + const Vec256<uint64_t> b) { +#if HWY_TARGET <= HWY_AVX3 + 
return Vec256<uint64_t>{_mm256_min_epu64(a.raw, b.raw)}; +#else + const Full256<uint64_t> du; + const Full256<int64_t> di; + const auto msb = Set(du, 1ull << 63); + const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb)); + return IfThenElse(gt, b, a); +#endif +} + +// Signed +HWY_API Vec256<int8_t> Min(const Vec256<int8_t> a, const Vec256<int8_t> b) { + return Vec256<int8_t>{_mm256_min_epi8(a.raw, b.raw)}; +} +HWY_API Vec256<int16_t> Min(const Vec256<int16_t> a, const Vec256<int16_t> b) { + return Vec256<int16_t>{_mm256_min_epi16(a.raw, b.raw)}; +} +HWY_API Vec256<int32_t> Min(const Vec256<int32_t> a, const Vec256<int32_t> b) { + return Vec256<int32_t>{_mm256_min_epi32(a.raw, b.raw)}; +} +HWY_API Vec256<int64_t> Min(const Vec256<int64_t> a, const Vec256<int64_t> b) { +#if HWY_TARGET <= HWY_AVX3 + return Vec256<int64_t>{_mm256_min_epi64(a.raw, b.raw)}; +#else + return IfThenElse(a < b, a, b); +#endif +} + +// Float +HWY_API Vec256<float> Min(const Vec256<float> a, const Vec256<float> b) { + return Vec256<float>{_mm256_min_ps(a.raw, b.raw)}; +} +HWY_API Vec256<double> Min(const Vec256<double> a, const Vec256<double> b) { + return Vec256<double>{_mm256_min_pd(a.raw, b.raw)}; +} + +// ------------------------------ Max (Gt, IfThenElse) + +// Unsigned +HWY_API Vec256<uint8_t> Max(const Vec256<uint8_t> a, const Vec256<uint8_t> b) { + return Vec256<uint8_t>{_mm256_max_epu8(a.raw, b.raw)}; +} +HWY_API Vec256<uint16_t> Max(const Vec256<uint16_t> a, + const Vec256<uint16_t> b) { + return Vec256<uint16_t>{_mm256_max_epu16(a.raw, b.raw)}; +} +HWY_API Vec256<uint32_t> Max(const Vec256<uint32_t> a, + const Vec256<uint32_t> b) { + return Vec256<uint32_t>{_mm256_max_epu32(a.raw, b.raw)}; +} +HWY_API Vec256<uint64_t> Max(const Vec256<uint64_t> a, + const Vec256<uint64_t> b) { +#if HWY_TARGET <= HWY_AVX3 + return Vec256<uint64_t>{_mm256_max_epu64(a.raw, b.raw)}; +#else + const Full256<uint64_t> du; + const Full256<int64_t> di; + const auto msb = Set(du, 1ull << 
63); + const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb)); + return IfThenElse(gt, a, b); +#endif +} + +// Signed +HWY_API Vec256<int8_t> Max(const Vec256<int8_t> a, const Vec256<int8_t> b) { + return Vec256<int8_t>{_mm256_max_epi8(a.raw, b.raw)}; +} +HWY_API Vec256<int16_t> Max(const Vec256<int16_t> a, const Vec256<int16_t> b) { + return Vec256<int16_t>{_mm256_max_epi16(a.raw, b.raw)}; +} +HWY_API Vec256<int32_t> Max(const Vec256<int32_t> a, const Vec256<int32_t> b) { + return Vec256<int32_t>{_mm256_max_epi32(a.raw, b.raw)}; +} +HWY_API Vec256<int64_t> Max(const Vec256<int64_t> a, const Vec256<int64_t> b) { +#if HWY_TARGET <= HWY_AVX3 + return Vec256<int64_t>{_mm256_max_epi64(a.raw, b.raw)}; +#else + return IfThenElse(a < b, b, a); +#endif +} + +// Float +HWY_API Vec256<float> Max(const Vec256<float> a, const Vec256<float> b) { + return Vec256<float>{_mm256_max_ps(a.raw, b.raw)}; +} +HWY_API Vec256<double> Max(const Vec256<double> a, const Vec256<double> b) { + return Vec256<double>{_mm256_max_pd(a.raw, b.raw)}; +} + +// ------------------------------ Iota + +namespace detail { + +template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)> +HWY_INLINE VFromD<D> Iota0(D /*d*/) { + return VFromD<D>{_mm256_set_epi8( + static_cast<char>(31), static_cast<char>(30), static_cast<char>(29), + static_cast<char>(28), static_cast<char>(27), static_cast<char>(26), + static_cast<char>(25), static_cast<char>(24), static_cast<char>(23), + static_cast<char>(22), static_cast<char>(21), static_cast<char>(20), + static_cast<char>(19), static_cast<char>(18), static_cast<char>(17), + static_cast<char>(16), static_cast<char>(15), static_cast<char>(14), + static_cast<char>(13), static_cast<char>(12), static_cast<char>(11), + static_cast<char>(10), static_cast<char>(9), static_cast<char>(8), + static_cast<char>(7), static_cast<char>(6), static_cast<char>(5), + static_cast<char>(4), static_cast<char>(3), static_cast<char>(2), + static_cast<char>(1), 
static_cast<char>(0))}; +} + +template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2), + HWY_IF_NOT_SPECIAL_FLOAT_D(D)> +HWY_INLINE VFromD<D> Iota0(D /*d*/) { + return VFromD<D>{_mm256_set_epi16( + int16_t{15}, int16_t{14}, int16_t{13}, int16_t{12}, int16_t{11}, + int16_t{10}, int16_t{9}, int16_t{8}, int16_t{7}, int16_t{6}, int16_t{5}, + int16_t{4}, int16_t{3}, int16_t{2}, int16_t{1}, int16_t{0})}; +} + +template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)> +HWY_INLINE VFromD<D> Iota0(D /*d*/) { + return VFromD<D>{_mm256_set_epi32(int32_t{7}, int32_t{6}, int32_t{5}, + int32_t{4}, int32_t{3}, int32_t{2}, + int32_t{1}, int32_t{0})}; +} + +template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)> +HWY_INLINE VFromD<D> Iota0(D /*d*/) { + return VFromD<D>{ + _mm256_set_epi64x(int64_t{3}, int64_t{2}, int64_t{1}, int64_t{0})}; +} + +template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)> +HWY_INLINE VFromD<D> Iota0(D /*d*/) { + return VFromD<D>{ + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)}; +} + +template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)> +HWY_INLINE VFromD<D> Iota0(D /*d*/) { + return VFromD<D>{_mm256_set_pd(3.0, 2.0, 1.0, 0.0)}; +} + +} // namespace detail + +template <class D, typename T2, HWY_IF_V_SIZE_D(D, 32)> +HWY_API VFromD<D> Iota(D d, const T2 first) { + return detail::Iota0(d) + Set(d, static_cast<TFromD<D>>(first)); +} + +// ------------------------------ FirstN (Iota, Lt) + +template <class D, class M = MFromD<D>, HWY_IF_V_SIZE_D(D, 32)> +HWY_API M FirstN(const D d, size_t n) { +#if HWY_TARGET <= HWY_AVX3 + (void)d; + constexpr size_t kN = MaxLanes(d); +#if HWY_ARCH_X86_64 + const uint64_t all = (1ull << kN) - 1; + // BZHI only looks at the lower 8 bits of n! + return M::FromBits((n > 255) ? all : _bzhi_u64(all, n)); +#else + const uint32_t all = static_cast<uint32_t>((1ull << kN) - 1); + // BZHI only looks at the lower 8 bits of n! + return M::FromBits((n > 255) ? 
all + : _bzhi_u32(all, static_cast<uint32_t>(n))); +#endif // HWY_ARCH_X86_64 +#else + const RebindToSigned<decltype(d)> di; // Signed comparisons are cheaper. + using TI = TFromD<decltype(di)>; + return RebindMask(d, detail::Iota0(di) < Set(di, static_cast<TI>(n))); +#endif +} + +// ================================================== ARITHMETIC + +// ------------------------------ Addition + +// Unsigned +HWY_API Vec256<uint8_t> operator+(Vec256<uint8_t> a, Vec256<uint8_t> b) { + return Vec256<uint8_t>{_mm256_add_epi8(a.raw, b.raw)}; +} +HWY_API Vec256<uint16_t> operator+(Vec256<uint16_t> a, Vec256<uint16_t> b) { + return Vec256<uint16_t>{_mm256_add_epi16(a.raw, b.raw)}; +} +HWY_API Vec256<uint32_t> operator+(Vec256<uint32_t> a, Vec256<uint32_t> b) { + return Vec256<uint32_t>{_mm256_add_epi32(a.raw, b.raw)}; +} +HWY_API Vec256<uint64_t> operator+(Vec256<uint64_t> a, Vec256<uint64_t> b) { + return Vec256<uint64_t>{_mm256_add_epi64(a.raw, b.raw)}; +} + +// Signed +HWY_API Vec256<int8_t> operator+(Vec256<int8_t> a, Vec256<int8_t> b) { + return Vec256<int8_t>{_mm256_add_epi8(a.raw, b.raw)}; +} +HWY_API Vec256<int16_t> operator+(Vec256<int16_t> a, Vec256<int16_t> b) { + return Vec256<int16_t>{_mm256_add_epi16(a.raw, b.raw)}; +} +HWY_API Vec256<int32_t> operator+(Vec256<int32_t> a, Vec256<int32_t> b) { + return Vec256<int32_t>{_mm256_add_epi32(a.raw, b.raw)}; +} +HWY_API Vec256<int64_t> operator+(Vec256<int64_t> a, Vec256<int64_t> b) { + return Vec256<int64_t>{_mm256_add_epi64(a.raw, b.raw)}; +} + +// Float +HWY_API Vec256<float> operator+(Vec256<float> a, Vec256<float> b) { + return Vec256<float>{_mm256_add_ps(a.raw, b.raw)}; +} +HWY_API Vec256<double> operator+(Vec256<double> a, Vec256<double> b) { + return Vec256<double>{_mm256_add_pd(a.raw, b.raw)}; +} + +// ------------------------------ Subtraction + +// Unsigned +HWY_API Vec256<uint8_t> operator-(Vec256<uint8_t> a, Vec256<uint8_t> b) { + return Vec256<uint8_t>{_mm256_sub_epi8(a.raw, b.raw)}; +} +HWY_API 
Vec256<uint16_t> operator-(Vec256<uint16_t> a, Vec256<uint16_t> b) { + return Vec256<uint16_t>{_mm256_sub_epi16(a.raw, b.raw)}; +} +HWY_API Vec256<uint32_t> operator-(Vec256<uint32_t> a, Vec256<uint32_t> b) { + return Vec256<uint32_t>{_mm256_sub_epi32(a.raw, b.raw)}; +} +HWY_API Vec256<uint64_t> operator-(Vec256<uint64_t> a, Vec256<uint64_t> b) { + return Vec256<uint64_t>{_mm256_sub_epi64(a.raw, b.raw)}; +} + +// Signed +HWY_API Vec256<int8_t> operator-(Vec256<int8_t> a, Vec256<int8_t> b) { + return Vec256<int8_t>{_mm256_sub_epi8(a.raw, b.raw)}; +} +HWY_API Vec256<int16_t> operator-(Vec256<int16_t> a, Vec256<int16_t> b) { + return Vec256<int16_t>{_mm256_sub_epi16(a.raw, b.raw)}; +} +HWY_API Vec256<int32_t> operator-(Vec256<int32_t> a, Vec256<int32_t> b) { + return Vec256<int32_t>{_mm256_sub_epi32(a.raw, b.raw)}; +} +HWY_API Vec256<int64_t> operator-(Vec256<int64_t> a, Vec256<int64_t> b) { + return Vec256<int64_t>{_mm256_sub_epi64(a.raw, b.raw)}; +} + +// Float +HWY_API Vec256<float> operator-(Vec256<float> a, Vec256<float> b) { + return Vec256<float>{_mm256_sub_ps(a.raw, b.raw)}; +} +HWY_API Vec256<double> operator-(Vec256<double> a, Vec256<double> b) { + return Vec256<double>{_mm256_sub_pd(a.raw, b.raw)}; +} + +// ------------------------------ SumsOf8 +HWY_API Vec256<uint64_t> SumsOf8(Vec256<uint8_t> v) { + return Vec256<uint64_t>{_mm256_sad_epu8(v.raw, _mm256_setzero_si256())}; +} + +HWY_API Vec256<uint64_t> SumsOf8AbsDiff(Vec256<uint8_t> a, Vec256<uint8_t> b) { + return Vec256<uint64_t>{_mm256_sad_epu8(a.raw, b.raw)}; +} + +// ------------------------------ SaturatedAdd + +// Returns a + b clamped to the destination range. 
+ +// Unsigned +HWY_API Vec256<uint8_t> SaturatedAdd(Vec256<uint8_t> a, Vec256<uint8_t> b) { + return Vec256<uint8_t>{_mm256_adds_epu8(a.raw, b.raw)}; +} +HWY_API Vec256<uint16_t> SaturatedAdd(Vec256<uint16_t> a, Vec256<uint16_t> b) { + return Vec256<uint16_t>{_mm256_adds_epu16(a.raw, b.raw)}; +} + +// Signed +HWY_API Vec256<int8_t> SaturatedAdd(Vec256<int8_t> a, Vec256<int8_t> b) { + return Vec256<int8_t>{_mm256_adds_epi8(a.raw, b.raw)}; +} +HWY_API Vec256<int16_t> SaturatedAdd(Vec256<int16_t> a, Vec256<int16_t> b) { + return Vec256<int16_t>{_mm256_adds_epi16(a.raw, b.raw)}; +} + +#if HWY_TARGET <= HWY_AVX3 +HWY_API Vec256<int32_t> SaturatedAdd(Vec256<int32_t> a, Vec256<int32_t> b) { + const DFromV<decltype(a)> d; + const auto sum = a + b; + const auto overflow_mask = MaskFromVec( + Vec256<int32_t>{_mm256_ternarylogic_epi32(a.raw, b.raw, sum.raw, 0x42)}); + const auto i32_max = Set(d, LimitsMax<int32_t>()); + const Vec256<int32_t> overflow_result{_mm256_mask_ternarylogic_epi32( + i32_max.raw, MaskFromVec(a).raw, i32_max.raw, i32_max.raw, 0x55)}; + return IfThenElse(overflow_mask, overflow_result, sum); +} + +HWY_API Vec256<int64_t> SaturatedAdd(Vec256<int64_t> a, Vec256<int64_t> b) { + const DFromV<decltype(a)> d; + const auto sum = a + b; + const auto overflow_mask = MaskFromVec( + Vec256<int64_t>{_mm256_ternarylogic_epi64(a.raw, b.raw, sum.raw, 0x42)}); + const auto i64_max = Set(d, LimitsMax<int64_t>()); + const Vec256<int64_t> overflow_result{_mm256_mask_ternarylogic_epi64( + i64_max.raw, MaskFromVec(a).raw, i64_max.raw, i64_max.raw, 0x55)}; + return IfThenElse(overflow_mask, overflow_result, sum); +} +#endif // HWY_TARGET <= HWY_AVX3 + +// ------------------------------ SaturatedSub + +// Returns a - b clamped to the destination range. 
+ +// Unsigned +HWY_API Vec256<uint8_t> SaturatedSub(Vec256<uint8_t> a, Vec256<uint8_t> b) { + return Vec256<uint8_t>{_mm256_subs_epu8(a.raw, b.raw)}; +} +HWY_API Vec256<uint16_t> SaturatedSub(Vec256<uint16_t> a, Vec256<uint16_t> b) { + return Vec256<uint16_t>{_mm256_subs_epu16(a.raw, b.raw)}; +} + +// Signed +HWY_API Vec256<int8_t> SaturatedSub(Vec256<int8_t> a, Vec256<int8_t> b) { + return Vec256<int8_t>{_mm256_subs_epi8(a.raw, b.raw)}; +} +HWY_API Vec256<int16_t> SaturatedSub(Vec256<int16_t> a, Vec256<int16_t> b) { + return Vec256<int16_t>{_mm256_subs_epi16(a.raw, b.raw)}; +} + +#if HWY_TARGET <= HWY_AVX3 +HWY_API Vec256<int32_t> SaturatedSub(Vec256<int32_t> a, Vec256<int32_t> b) { + const DFromV<decltype(a)> d; + const auto diff = a - b; + const auto overflow_mask = MaskFromVec( + Vec256<int32_t>{_mm256_ternarylogic_epi32(a.raw, b.raw, diff.raw, 0x18)}); + const auto i32_max = Set(d, LimitsMax<int32_t>()); + const Vec256<int32_t> overflow_result{_mm256_mask_ternarylogic_epi32( + i32_max.raw, MaskFromVec(a).raw, i32_max.raw, i32_max.raw, 0x55)}; + return IfThenElse(overflow_mask, overflow_result, diff); +} + +HWY_API Vec256<int64_t> SaturatedSub(Vec256<int64_t> a, Vec256<int64_t> b) { + const DFromV<decltype(a)> d; + const auto diff = a - b; + const auto overflow_mask = MaskFromVec( + Vec256<int64_t>{_mm256_ternarylogic_epi64(a.raw, b.raw, diff.raw, 0x18)}); + const auto i64_max = Set(d, LimitsMax<int64_t>()); + const Vec256<int64_t> overflow_result{_mm256_mask_ternarylogic_epi64( + i64_max.raw, MaskFromVec(a).raw, i64_max.raw, i64_max.raw, 0x55)}; + return IfThenElse(overflow_mask, overflow_result, diff); +} +#endif // HWY_TARGET <= HWY_AVX3 + +// ------------------------------ Average + +// Returns (a + b + 1) / 2 + +// Unsigned +HWY_API Vec256<uint8_t> AverageRound(Vec256<uint8_t> a, Vec256<uint8_t> b) { + return Vec256<uint8_t>{_mm256_avg_epu8(a.raw, b.raw)}; +} +HWY_API Vec256<uint16_t> AverageRound(Vec256<uint16_t> a, Vec256<uint16_t> b) { + return 
Vec256<uint16_t>{_mm256_avg_epu16(a.raw, b.raw)}; +} + +// ------------------------------ Abs (Sub) + +// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1. +HWY_API Vec256<int8_t> Abs(Vec256<int8_t> v) { +#if HWY_COMPILER_MSVC + // Workaround for incorrect codegen? (wrong result) + const DFromV<decltype(v)> d; + const auto zero = Zero(d); + return Vec256<int8_t>{_mm256_max_epi8(v.raw, (zero - v).raw)}; +#else + return Vec256<int8_t>{_mm256_abs_epi8(v.raw)}; +#endif +} +HWY_API Vec256<int16_t> Abs(const Vec256<int16_t> v) { + return Vec256<int16_t>{_mm256_abs_epi16(v.raw)}; +} +HWY_API Vec256<int32_t> Abs(const Vec256<int32_t> v) { + return Vec256<int32_t>{_mm256_abs_epi32(v.raw)}; +} +// i64 is implemented after BroadcastSignBit. + +HWY_API Vec256<float> Abs(const Vec256<float> v) { + const DFromV<decltype(v)> d; + const Vec256<int32_t> mask{_mm256_set1_epi32(0x7FFFFFFF)}; + return v & BitCast(d, mask); +} +HWY_API Vec256<double> Abs(const Vec256<double> v) { + const DFromV<decltype(v)> d; + const Vec256<int64_t> mask{_mm256_set1_epi64x(0x7FFFFFFFFFFFFFFFLL)}; + return v & BitCast(d, mask); +} + +// ------------------------------ Integer multiplication + +// Unsigned +HWY_API Vec256<uint16_t> operator*(Vec256<uint16_t> a, Vec256<uint16_t> b) { + return Vec256<uint16_t>{_mm256_mullo_epi16(a.raw, b.raw)}; +} +HWY_API Vec256<uint32_t> operator*(Vec256<uint32_t> a, Vec256<uint32_t> b) { + return Vec256<uint32_t>{_mm256_mullo_epi32(a.raw, b.raw)}; +} + +// Signed +HWY_API Vec256<int16_t> operator*(Vec256<int16_t> a, Vec256<int16_t> b) { + return Vec256<int16_t>{_mm256_mullo_epi16(a.raw, b.raw)}; +} +HWY_API Vec256<int32_t> operator*(Vec256<int32_t> a, Vec256<int32_t> b) { + return Vec256<int32_t>{_mm256_mullo_epi32(a.raw, b.raw)}; +} + +// Returns the upper 16 bits of a * b in each lane. 
+HWY_API Vec256<uint16_t> MulHigh(Vec256<uint16_t> a, Vec256<uint16_t> b) { + return Vec256<uint16_t>{_mm256_mulhi_epu16(a.raw, b.raw)}; +} +HWY_API Vec256<int16_t> MulHigh(Vec256<int16_t> a, Vec256<int16_t> b) { + return Vec256<int16_t>{_mm256_mulhi_epi16(a.raw, b.raw)}; +} + +HWY_API Vec256<int16_t> MulFixedPoint15(Vec256<int16_t> a, Vec256<int16_t> b) { + return Vec256<int16_t>{_mm256_mulhrs_epi16(a.raw, b.raw)}; +} + +// Multiplies even lanes (0, 2 ..) and places the double-wide result into +// even and the upper half into its odd neighbor lane. +HWY_API Vec256<int64_t> MulEven(Vec256<int32_t> a, Vec256<int32_t> b) { + return Vec256<int64_t>{_mm256_mul_epi32(a.raw, b.raw)}; +} +HWY_API Vec256<uint64_t> MulEven(Vec256<uint32_t> a, Vec256<uint32_t> b) { + return Vec256<uint64_t>{_mm256_mul_epu32(a.raw, b.raw)}; +} + +// ------------------------------ ShiftLeft + +template <int kBits> +HWY_API Vec256<uint16_t> ShiftLeft(Vec256<uint16_t> v) { + return Vec256<uint16_t>{_mm256_slli_epi16(v.raw, kBits)}; +} + +template <int kBits> +HWY_API Vec256<uint32_t> ShiftLeft(Vec256<uint32_t> v) { + return Vec256<uint32_t>{_mm256_slli_epi32(v.raw, kBits)}; +} + +template <int kBits> +HWY_API Vec256<uint64_t> ShiftLeft(Vec256<uint64_t> v) { + return Vec256<uint64_t>{_mm256_slli_epi64(v.raw, kBits)}; +} + +template <int kBits> +HWY_API Vec256<int16_t> ShiftLeft(Vec256<int16_t> v) { + return Vec256<int16_t>{_mm256_slli_epi16(v.raw, kBits)}; +} + +template <int kBits> +HWY_API Vec256<int32_t> ShiftLeft(Vec256<int32_t> v) { + return Vec256<int32_t>{_mm256_slli_epi32(v.raw, kBits)}; +} + +template <int kBits> +HWY_API Vec256<int64_t> ShiftLeft(Vec256<int64_t> v) { + return Vec256<int64_t>{_mm256_slli_epi64(v.raw, kBits)}; +} + +template <int kBits, typename T, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec256<T> ShiftLeft(const Vec256<T> v) { + const Full256<T> d8; + const RepartitionToWide<decltype(d8)> d16; + const auto shifted = BitCast(d8, ShiftLeft<kBits>(BitCast(d16, v))); + return kBits == 
1 + ? (v + v) + : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF))); +} + +// ------------------------------ ShiftRight + +template <int kBits> +HWY_API Vec256<uint16_t> ShiftRight(Vec256<uint16_t> v) { + return Vec256<uint16_t>{_mm256_srli_epi16(v.raw, kBits)}; +} + +template <int kBits> +HWY_API Vec256<uint32_t> ShiftRight(Vec256<uint32_t> v) { + return Vec256<uint32_t>{_mm256_srli_epi32(v.raw, kBits)}; +} + +template <int kBits> +HWY_API Vec256<uint64_t> ShiftRight(Vec256<uint64_t> v) { + return Vec256<uint64_t>{_mm256_srli_epi64(v.raw, kBits)}; +} + +template <int kBits> +HWY_API Vec256<uint8_t> ShiftRight(Vec256<uint8_t> v) { + const Full256<uint8_t> d8; + // Use raw instead of BitCast to support N=1. + const Vec256<uint8_t> shifted{ShiftRight<kBits>(Vec256<uint16_t>{v.raw}).raw}; + return shifted & Set(d8, 0xFF >> kBits); +} + +template <int kBits> +HWY_API Vec256<int16_t> ShiftRight(Vec256<int16_t> v) { + return Vec256<int16_t>{_mm256_srai_epi16(v.raw, kBits)}; +} + +template <int kBits> +HWY_API Vec256<int32_t> ShiftRight(Vec256<int32_t> v) { + return Vec256<int32_t>{_mm256_srai_epi32(v.raw, kBits)}; +} + +template <int kBits> +HWY_API Vec256<int8_t> ShiftRight(Vec256<int8_t> v) { + const Full256<int8_t> di; + const Full256<uint8_t> du; + const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v))); + const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits)); + return (shifted ^ shifted_sign) - shifted_sign; +} + +// i64 is implemented after BroadcastSignBit. + +// ------------------------------ RotateRight + +template <int kBits, typename T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))> +HWY_API Vec256<T> RotateRight(const Vec256<T> v) { + constexpr size_t kSizeInBits = sizeof(T) * 8; + static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count"); + if (kBits == 0) return v; + // AVX3 does not support 8/16-bit. 
+ return Or(ShiftRight<kBits>(v), + ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v)); +} + +template <int kBits> +HWY_API Vec256<uint32_t> RotateRight(const Vec256<uint32_t> v) { + static_assert(0 <= kBits && kBits < 32, "Invalid shift count"); +#if HWY_TARGET <= HWY_AVX3 + return Vec256<uint32_t>{_mm256_ror_epi32(v.raw, kBits)}; +#else + if (kBits == 0) return v; + return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(31, 32 - kBits)>(v)); +#endif +} + +template <int kBits> +HWY_API Vec256<uint64_t> RotateRight(const Vec256<uint64_t> v) { + static_assert(0 <= kBits && kBits < 64, "Invalid shift count"); +#if HWY_TARGET <= HWY_AVX3 + return Vec256<uint64_t>{_mm256_ror_epi64(v.raw, kBits)}; +#else + if (kBits == 0) return v; + return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(63, 64 - kBits)>(v)); +#endif +} + +// ------------------------------ BroadcastSignBit (ShiftRight, compare, mask) + +HWY_API Vec256<int8_t> BroadcastSignBit(const Vec256<int8_t> v) { + const DFromV<decltype(v)> d; + return VecFromMask(v < Zero(d)); +} + +HWY_API Vec256<int16_t> BroadcastSignBit(const Vec256<int16_t> v) { + return ShiftRight<15>(v); +} + +HWY_API Vec256<int32_t> BroadcastSignBit(const Vec256<int32_t> v) { + return ShiftRight<31>(v); +} + +HWY_API Vec256<int64_t> BroadcastSignBit(const Vec256<int64_t> v) { +#if HWY_TARGET == HWY_AVX2 + const DFromV<decltype(v)> d; + return VecFromMask(v < Zero(d)); +#else + return Vec256<int64_t>{_mm256_srai_epi64(v.raw, 63)}; +#endif +} + +template <int kBits> +HWY_API Vec256<int64_t> ShiftRight(const Vec256<int64_t> v) { +#if HWY_TARGET <= HWY_AVX3 + return Vec256<int64_t>{ + _mm256_srai_epi64(v.raw, static_cast<Shift64Count>(kBits))}; +#else + const Full256<int64_t> di; + const Full256<uint64_t> du; + const auto right = BitCast(di, ShiftRight<kBits>(BitCast(du, v))); + const auto sign = ShiftLeft<64 - kBits>(BroadcastSignBit(v)); + return right | sign; +#endif +} + +HWY_API Vec256<int64_t> Abs(const Vec256<int64_t> v) { +#if 
HWY_TARGET <= HWY_AVX3 + return Vec256<int64_t>{_mm256_abs_epi64(v.raw)}; +#else + const DFromV<decltype(v)> d; + const auto zero = Zero(d); + return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v); +#endif +} + +// ------------------------------ IfNegativeThenElse (BroadcastSignBit) +HWY_API Vec256<int8_t> IfNegativeThenElse(Vec256<int8_t> v, Vec256<int8_t> yes, + Vec256<int8_t> no) { + // int8: AVX2 IfThenElse only looks at the MSB. + return IfThenElse(MaskFromVec(v), yes, no); +} + +template <typename T, HWY_IF_T_SIZE(T, 2)> +HWY_API Vec256<T> IfNegativeThenElse(Vec256<T> v, Vec256<T> yes, Vec256<T> no) { + static_assert(IsSigned<T>(), "Only works for signed/float"); + const DFromV<decltype(v)> d; + const RebindToSigned<decltype(d)> di; + + // 16-bit: no native blendv, so copy sign to lower byte's MSB. + v = BitCast(d, BroadcastSignBit(BitCast(di, v))); + return IfThenElse(MaskFromVec(v), yes, no); +} + +template <typename T, HWY_IF_NOT_T_SIZE(T, 2)> +HWY_API Vec256<T> IfNegativeThenElse(Vec256<T> v, Vec256<T> yes, Vec256<T> no) { + static_assert(IsSigned<T>(), "Only works for signed/float"); + const DFromV<decltype(v)> d; + const RebindToFloat<decltype(d)> df; + + // 32/64-bit: use float IfThenElse, which only looks at the MSB. 
+ const MFromD<decltype(df)> msb = MaskFromVec(BitCast(df, v)); + return BitCast(d, IfThenElse(msb, BitCast(df, yes), BitCast(df, no))); +} + +// ------------------------------ ShiftLeftSame + +HWY_API Vec256<uint16_t> ShiftLeftSame(const Vec256<uint16_t> v, + const int bits) { +#if HWY_COMPILER_GCC + if (__builtin_constant_p(bits)) { + return Vec256<uint16_t>{_mm256_slli_epi16(v.raw, bits)}; + } +#endif + return Vec256<uint16_t>{_mm256_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))}; +} +HWY_API Vec256<uint32_t> ShiftLeftSame(const Vec256<uint32_t> v, + const int bits) { +#if HWY_COMPILER_GCC + if (__builtin_constant_p(bits)) { + return Vec256<uint32_t>{_mm256_slli_epi32(v.raw, bits)}; + } +#endif + return Vec256<uint32_t>{_mm256_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))}; +} +HWY_API Vec256<uint64_t> ShiftLeftSame(const Vec256<uint64_t> v, + const int bits) { +#if HWY_COMPILER_GCC + if (__builtin_constant_p(bits)) { + return Vec256<uint64_t>{_mm256_slli_epi64(v.raw, bits)}; + } +#endif + return Vec256<uint64_t>{_mm256_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))}; +} + +HWY_API Vec256<int16_t> ShiftLeftSame(const Vec256<int16_t> v, const int bits) { +#if HWY_COMPILER_GCC + if (__builtin_constant_p(bits)) { + return Vec256<int16_t>{_mm256_slli_epi16(v.raw, bits)}; + } +#endif + return Vec256<int16_t>{_mm256_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))}; +} + +HWY_API Vec256<int32_t> ShiftLeftSame(const Vec256<int32_t> v, const int bits) { +#if HWY_COMPILER_GCC + if (__builtin_constant_p(bits)) { + return Vec256<int32_t>{_mm256_slli_epi32(v.raw, bits)}; + } +#endif + return Vec256<int32_t>{_mm256_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))}; +} + +HWY_API Vec256<int64_t> ShiftLeftSame(const Vec256<int64_t> v, const int bits) { +#if HWY_COMPILER_GCC + if (__builtin_constant_p(bits)) { + return Vec256<int64_t>{_mm256_slli_epi64(v.raw, bits)}; + } +#endif + return Vec256<int64_t>{_mm256_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))}; +} + +template <typename T, HWY_IF_T_SIZE(T, 1)> 
+HWY_API Vec256<T> ShiftLeftSame(const Vec256<T> v, const int bits) { + const Full256<T> d8; + const RepartitionToWide<decltype(d8)> d16; + const auto shifted = BitCast(d8, ShiftLeftSame(BitCast(d16, v), bits)); + return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF)); +} + +// ------------------------------ ShiftRightSame (BroadcastSignBit) + +HWY_API Vec256<uint16_t> ShiftRightSame(const Vec256<uint16_t> v, + const int bits) { +#if HWY_COMPILER_GCC + if (__builtin_constant_p(bits)) { + return Vec256<uint16_t>{_mm256_srli_epi16(v.raw, bits)}; + } +#endif + return Vec256<uint16_t>{_mm256_srl_epi16(v.raw, _mm_cvtsi32_si128(bits))}; +} +HWY_API Vec256<uint32_t> ShiftRightSame(const Vec256<uint32_t> v, + const int bits) { +#if HWY_COMPILER_GCC + if (__builtin_constant_p(bits)) { + return Vec256<uint32_t>{_mm256_srli_epi32(v.raw, bits)}; + } +#endif + return Vec256<uint32_t>{_mm256_srl_epi32(v.raw, _mm_cvtsi32_si128(bits))}; +} +HWY_API Vec256<uint64_t> ShiftRightSame(const Vec256<uint64_t> v, + const int bits) { +#if HWY_COMPILER_GCC + if (__builtin_constant_p(bits)) { + return Vec256<uint64_t>{_mm256_srli_epi64(v.raw, bits)}; + } +#endif + return Vec256<uint64_t>{_mm256_srl_epi64(v.raw, _mm_cvtsi32_si128(bits))}; +} + +HWY_API Vec256<uint8_t> ShiftRightSame(Vec256<uint8_t> v, const int bits) { + const Full256<uint8_t> d8; + const RepartitionToWide<decltype(d8)> d16; + const auto shifted = BitCast(d8, ShiftRightSame(BitCast(d16, v), bits)); + return shifted & Set(d8, static_cast<uint8_t>(0xFF >> bits)); +} + +HWY_API Vec256<int16_t> ShiftRightSame(const Vec256<int16_t> v, + const int bits) { +#if HWY_COMPILER_GCC + if (__builtin_constant_p(bits)) { + return Vec256<int16_t>{_mm256_srai_epi16(v.raw, bits)}; + } +#endif + return Vec256<int16_t>{_mm256_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))}; +} + +HWY_API Vec256<int32_t> ShiftRightSame(const Vec256<int32_t> v, + const int bits) { +#if HWY_COMPILER_GCC + if (__builtin_constant_p(bits)) { + return 
Vec256<int32_t>{_mm256_srai_epi32(v.raw, bits)}; + } +#endif + return Vec256<int32_t>{_mm256_sra_epi32(v.raw, _mm_cvtsi32_si128(bits))}; +} +HWY_API Vec256<int64_t> ShiftRightSame(const Vec256<int64_t> v, + const int bits) { +#if HWY_TARGET <= HWY_AVX3 +#if HWY_COMPILER_GCC + if (__builtin_constant_p(bits)) { + return Vec256<int64_t>{ + _mm256_srai_epi64(v.raw, static_cast<Shift64Count>(bits))}; + } +#endif + return Vec256<int64_t>{_mm256_sra_epi64(v.raw, _mm_cvtsi32_si128(bits))}; +#else + const Full256<int64_t> di; + const Full256<uint64_t> du; + const auto right = BitCast(di, ShiftRightSame(BitCast(du, v), bits)); + const auto sign = ShiftLeftSame(BroadcastSignBit(v), 64 - bits); + return right | sign; +#endif +} + +HWY_API Vec256<int8_t> ShiftRightSame(Vec256<int8_t> v, const int bits) { + const Full256<int8_t> di; + const Full256<uint8_t> du; + const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits)); + const auto shifted_sign = + BitCast(di, Set(du, static_cast<uint8_t>(0x80 >> bits))); + return (shifted ^ shifted_sign) - shifted_sign; +} + +// ------------------------------ Neg (Xor, Sub) + +// Tag dispatch instead of SFINAE for MSVC 2017 compatibility +namespace detail { + +template <typename T> +HWY_INLINE Vec256<T> Neg(hwy::FloatTag /*tag*/, const Vec256<T> v) { + const DFromV<decltype(v)> d; + return Xor(v, SignBit(d)); +} + +// Not floating-point +template <typename T> +HWY_INLINE Vec256<T> Neg(hwy::NonFloatTag /*tag*/, const Vec256<T> v) { + const DFromV<decltype(v)> d; + return Zero(d) - v; +} + +} // namespace detail + +template <typename T> +HWY_API Vec256<T> Neg(const Vec256<T> v) { + return detail::Neg(hwy::IsFloatTag<T>(), v); +} + +// ------------------------------ Floating-point mul / div + +HWY_API Vec256<float> operator*(Vec256<float> a, Vec256<float> b) { + return Vec256<float>{_mm256_mul_ps(a.raw, b.raw)}; +} +HWY_API Vec256<double> operator*(Vec256<double> a, Vec256<double> b) { + return Vec256<double>{_mm256_mul_pd(a.raw, 
b.raw)}; +} + +HWY_API Vec256<float> operator/(Vec256<float> a, Vec256<float> b) { + return Vec256<float>{_mm256_div_ps(a.raw, b.raw)}; +} +HWY_API Vec256<double> operator/(Vec256<double> a, Vec256<double> b) { + return Vec256<double>{_mm256_div_pd(a.raw, b.raw)}; +} + +// Approximate reciprocal +HWY_API Vec256<float> ApproximateReciprocal(Vec256<float> v) { + return Vec256<float>{_mm256_rcp_ps(v.raw)}; +} + +// Absolute value of difference. +HWY_API Vec256<float> AbsDiff(Vec256<float> a, Vec256<float> b) { + return Abs(a - b); +} + +// ------------------------------ Floating-point multiply-add variants + +// Returns mul * x + add +HWY_API Vec256<float> MulAdd(Vec256<float> mul, Vec256<float> x, + Vec256<float> add) { +#ifdef HWY_DISABLE_BMI2_FMA + return mul * x + add; +#else + return Vec256<float>{_mm256_fmadd_ps(mul.raw, x.raw, add.raw)}; +#endif +} +HWY_API Vec256<double> MulAdd(Vec256<double> mul, Vec256<double> x, + Vec256<double> add) { +#ifdef HWY_DISABLE_BMI2_FMA + return mul * x + add; +#else + return Vec256<double>{_mm256_fmadd_pd(mul.raw, x.raw, add.raw)}; +#endif +} + +// Returns add - mul * x +HWY_API Vec256<float> NegMulAdd(Vec256<float> mul, Vec256<float> x, + Vec256<float> add) { +#ifdef HWY_DISABLE_BMI2_FMA + return add - mul * x; +#else + return Vec256<float>{_mm256_fnmadd_ps(mul.raw, x.raw, add.raw)}; +#endif +} +HWY_API Vec256<double> NegMulAdd(Vec256<double> mul, Vec256<double> x, + Vec256<double> add) { +#ifdef HWY_DISABLE_BMI2_FMA + return add - mul * x; +#else + return Vec256<double>{_mm256_fnmadd_pd(mul.raw, x.raw, add.raw)}; +#endif +} + +// Returns mul * x - sub +HWY_API Vec256<float> MulSub(Vec256<float> mul, Vec256<float> x, + Vec256<float> sub) { +#ifdef HWY_DISABLE_BMI2_FMA + return mul * x - sub; +#else + return Vec256<float>{_mm256_fmsub_ps(mul.raw, x.raw, sub.raw)}; +#endif +} +HWY_API Vec256<double> MulSub(Vec256<double> mul, Vec256<double> x, + Vec256<double> sub) { +#ifdef HWY_DISABLE_BMI2_FMA + return mul * x - sub; +#else + 
return Vec256<double>{_mm256_fmsub_pd(mul.raw, x.raw, sub.raw)}; +#endif +} + +// Returns -mul * x - sub +HWY_API Vec256<float> NegMulSub(Vec256<float> mul, Vec256<float> x, + Vec256<float> sub) { +#ifdef HWY_DISABLE_BMI2_FMA + return Neg(mul * x) - sub; +#else + return Vec256<float>{_mm256_fnmsub_ps(mul.raw, x.raw, sub.raw)}; +#endif +} +HWY_API Vec256<double> NegMulSub(Vec256<double> mul, Vec256<double> x, + Vec256<double> sub) { +#ifdef HWY_DISABLE_BMI2_FMA + return Neg(mul * x) - sub; +#else + return Vec256<double>{_mm256_fnmsub_pd(mul.raw, x.raw, sub.raw)}; +#endif +} + +// ------------------------------ Floating-point square root + +// Full precision square root +HWY_API Vec256<float> Sqrt(Vec256<float> v) { + return Vec256<float>{_mm256_sqrt_ps(v.raw)}; +} +HWY_API Vec256<double> Sqrt(Vec256<double> v) { + return Vec256<double>{_mm256_sqrt_pd(v.raw)}; +} + +// Approximate reciprocal square root +HWY_API Vec256<float> ApproximateReciprocalSqrt(Vec256<float> v) { + return Vec256<float>{_mm256_rsqrt_ps(v.raw)}; +} + +// ------------------------------ Floating-point rounding + +// Toward nearest integer, tie to even +HWY_API Vec256<float> Round(Vec256<float> v) { + return Vec256<float>{ + _mm256_round_ps(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)}; +} +HWY_API Vec256<double> Round(Vec256<double> v) { + return Vec256<double>{ + _mm256_round_pd(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)}; +} + +// Toward zero, aka truncate +HWY_API Vec256<float> Trunc(Vec256<float> v) { + return Vec256<float>{ + _mm256_round_ps(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)}; +} +HWY_API Vec256<double> Trunc(Vec256<double> v) { + return Vec256<double>{ + _mm256_round_pd(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)}; +} + +// Toward +infinity, aka ceiling +HWY_API Vec256<float> Ceil(Vec256<float> v) { + return Vec256<float>{ + _mm256_round_ps(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)}; +} +HWY_API Vec256<double> Ceil(Vec256<double> v) { + return 
Vec256<double>{ + _mm256_round_pd(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)}; +} + +// Toward -infinity, aka floor +HWY_API Vec256<float> Floor(Vec256<float> v) { + return Vec256<float>{ + _mm256_round_ps(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)}; +} +HWY_API Vec256<double> Floor(Vec256<double> v) { + return Vec256<double>{ + _mm256_round_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)}; +} + +// ------------------------------ Floating-point classification + +HWY_API Mask256<float> IsNaN(Vec256<float> v) { +#if HWY_TARGET <= HWY_AVX3 + return Mask256<float>{_mm256_fpclass_ps_mask(v.raw, 0x81)}; +#else + return Mask256<float>{_mm256_cmp_ps(v.raw, v.raw, _CMP_UNORD_Q)}; +#endif +} +HWY_API Mask256<double> IsNaN(Vec256<double> v) { +#if HWY_TARGET <= HWY_AVX3 + return Mask256<double>{_mm256_fpclass_pd_mask(v.raw, 0x81)}; +#else + return Mask256<double>{_mm256_cmp_pd(v.raw, v.raw, _CMP_UNORD_Q)}; +#endif +} + +#if HWY_TARGET <= HWY_AVX3 + +HWY_API Mask256<float> IsInf(Vec256<float> v) { + return Mask256<float>{_mm256_fpclass_ps_mask(v.raw, 0x18)}; +} +HWY_API Mask256<double> IsInf(Vec256<double> v) { + return Mask256<double>{_mm256_fpclass_pd_mask(v.raw, 0x18)}; +} + +HWY_API Mask256<float> IsFinite(Vec256<float> v) { + // fpclass doesn't have a flag for positive, so we have to check for inf/NaN + // and negate the mask. + return Not(Mask256<float>{_mm256_fpclass_ps_mask(v.raw, 0x99)}); +} +HWY_API Mask256<double> IsFinite(Vec256<double> v) { + return Not(Mask256<double>{_mm256_fpclass_pd_mask(v.raw, 0x99)}); +} + +#else + +template <typename T> +HWY_API Mask256<T> IsInf(const Vec256<T> v) { + static_assert(IsFloat<T>(), "Only for float"); + const DFromV<decltype(v)> d; + const RebindToSigned<decltype(d)> di; + const VFromD<decltype(di)> vi = BitCast(di, v); + // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0. 
+ return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2<T>()))); +} + +// Returns whether normal/subnormal/zero. +template <typename T> +HWY_API Mask256<T> IsFinite(const Vec256<T> v) { + static_assert(IsFloat<T>(), "Only for float"); + const DFromV<decltype(v)> d; + const RebindToUnsigned<decltype(d)> du; + const RebindToSigned<decltype(d)> di; // cheaper than unsigned comparison + const VFromD<decltype(du)> vu = BitCast(du, v); + // Shift left to clear the sign bit, then right so we can compare with the + // max exponent (cannot compare with MaxExponentTimes2 directly because it is + // negative and non-negative floats would be greater). MSVC seems to generate + // incorrect code if we instead add vu + vu. + const VFromD<decltype(di)> exp = + BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(ShiftLeft<1>(vu))); + return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>()))); +} + +#endif // HWY_TARGET <= HWY_AVX3 + +// ================================================== MEMORY + +// ------------------------------ Load + +template <class D, HWY_IF_V_SIZE_D(D, 32), typename T = TFromD<D>> +HWY_API Vec256<T> Load(D /* tag */, const T* HWY_RESTRICT aligned) { + return Vec256<T>{ + _mm256_load_si256(reinterpret_cast<const __m256i*>(aligned))}; +} +template <class D, HWY_IF_V_SIZE_D(D, 32)> +HWY_API Vec256<float> Load(D /* tag */, const float* HWY_RESTRICT aligned) { + return Vec256<float>{_mm256_load_ps(aligned)}; +} +template <class D, HWY_IF_V_SIZE_D(D, 32)> +HWY_API Vec256<double> Load(D /* tag */, const double* HWY_RESTRICT aligned) { + return Vec256<double>{_mm256_load_pd(aligned)}; +} + +template <class D, HWY_IF_V_SIZE_D(D, 32), typename T = TFromD<D>> +HWY_API Vec256<T> LoadU(D /* tag */, const T* HWY_RESTRICT p) { + return Vec256<T>{_mm256_loadu_si256(reinterpret_cast<const __m256i*>(p))}; +} + +template <class D, HWY_IF_V_SIZE_D(D, 32)> +HWY_API Vec256<float> LoadU(D /* tag */, const float* HWY_RESTRICT p) { + return 
Vec256<float>{_mm256_loadu_ps(p)}; +} +template <class D, HWY_IF_V_SIZE_D(D, 32)> +HWY_API Vec256<double> LoadU(D /* tag */, const double* HWY_RESTRICT p) { + return Vec256<double>{_mm256_loadu_pd(p)}; +} + +// ------------------------------ MaskedLoad + +#if HWY_TARGET <= HWY_AVX3 + +template <class D, HWY_IF_V_SIZE_D(D, 32), typename T = TFromD<D>, + HWY_IF_T_SIZE(T, 1)> +HWY_API Vec256<T> MaskedLoad(Mask256<T> m, D /* tag */, + const T* HWY_RESTRICT p) { + return Vec256<T>{_mm256_maskz_loadu_epi8(m.raw, p)}; +} + +template <class D, HWY_IF_V_SIZE_D(D, 32), typename T = TFromD<D>, + HWY_IF_T_SIZE(T, 2)> +HWY_API Vec256<T> MaskedLoad(Mask256<T> m, D /* tag */, + const T* HWY_RESTRICT p) { + return Vec256<T>{_mm256_maskz_loadu_epi16(m.raw, p)}; +} + +template <class D, HWY_IF_V_SIZE_D(D, 32), typename T = TFromD<D>, + HWY_IF_UI32(T)> +HWY_API Vec256<T> MaskedLoad(Mask256<T> m, D /* tag */, + const T* HWY_RESTRICT p) { + return Vec256<T>{_mm256_maskz_loadu_epi32(m.raw, p)}; +} + +template <class D, HWY_IF_V_SIZE_D(D, 32), typename T = TFromD<D>, + HWY_IF_UI64(T)> +HWY_API Vec256<T> MaskedLoad(Mask256<T> m, D /* tag */, + const T* HWY_RESTRICT p) { + return Vec256<T>{_mm256_maskz_loadu_epi64(m.raw, p)}; +} + +template <class D, HWY_IF_V_SIZE_D(D, 32)> +HWY_API Vec256<float> MaskedLoad(Mask256<float> m, D /* tag */, + const float* HWY_RESTRICT p) { + return Vec256<float>{_mm256_maskz_loadu_ps(m.raw, p)}; +} + +template <class D, HWY_IF_V_SIZE_D(D, 32)> +HWY_API Vec256<double> MaskedLoad(Mask256<double> m, D /* tag */, + const double* HWY_RESTRICT p) { + return Vec256<double>{_mm256_maskz_loadu_pd(m.raw, p)}; +} + +template <class D, HWY_IF_V_SIZE_D(D, 32), typename T = TFromD<D>, + HWY_IF_T_SIZE(T, 1)> +HWY_API Vec256<T> MaskedLoadOr(VFromD<D> v, Mask256<T> m, D /* tag */, + const T* HWY_RESTRICT p) { + return Vec256<T>{_mm256_mask_loadu_epi8(v.raw, m.raw, p)}; +} + +template <class D, HWY_IF_V_SIZE_D(D, 32), typename T = TFromD<D>, + HWY_IF_T_SIZE(T, 2)> +HWY_API 
Vec256<T> MaskedLoadOr(VFromD<D> v, Mask256<T> m, D /* tag */, + const T* HWY_RESTRICT p) { + return Vec256<T>{_mm256_mask_loadu_epi16(v.raw, m.raw, p)}; +} + +template <class D, HWY_IF_V_SIZE_D(D, 32), typename T = TFromD<D>, + HWY_IF_UI32(T)> +HWY_API Vec256<T> MaskedLoadOr(VFromD<D> v, Mask256<T> m, D /* tag */, + const T* HWY_RESTRICT p) { + return Vec256<T>{_mm256_mask_loadu_epi32(v.raw, m.raw, p)}; +} + +template <class D, HWY_IF_V_SIZE_D(D, 32), typename T = TFromD<D>, + HWY_IF_UI64(T)> +HWY_API Vec256<T> MaskedLoadOr(VFromD<D> v, Mask256<T> m, D /* tag */, + const T* HWY_RESTRICT p) { + return Vec256<T>{_mm256_mask_loadu_epi64(v.raw, m.raw, p)}; +} + +template <class D, HWY_IF_V_SIZE_D(D, 32)> +HWY_API Vec256<float> MaskedLoadOr(VFromD<D> v, Mask256<float> m, D /* tag */, + const float* HWY_RESTRICT p) { + return Vec256<float>{_mm256_mask_loadu_ps(v.raw, m.raw, p)}; +} + +template <class D, HWY_IF_V_SIZE_D(D, 32)> +HWY_API Vec256<double> MaskedLoadOr(VFromD<D> v, Mask256<double> m, D /* tag */, + const double* HWY_RESTRICT p) { + return Vec256<double>{_mm256_mask_loadu_pd(v.raw, m.raw, p)}; +} + +#else // AVX2 + +// There is no maskload_epi8/16, so blend instead. 
+template <class D, HWY_IF_V_SIZE_D(D, 32), typename T = TFromD<D>, + HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))> +HWY_API Vec256<T> MaskedLoad(Mask256<T> m, D d, const T* HWY_RESTRICT p) { + return IfThenElseZero(m, LoadU(d, p)); +} + +template <class D, HWY_IF_V_SIZE_D(D, 32), typename T = TFromD<D>, + HWY_IF_UI32(T)> +HWY_API Vec256<T> MaskedLoad(Mask256<T> m, D /* tag */, + const T* HWY_RESTRICT p) { + auto pi = reinterpret_cast<const int*>(p); // NOLINT + return Vec256<T>{_mm256_maskload_epi32(pi, m.raw)}; +} + +template <class D, HWY_IF_V_SIZE_D(D, 32), typename T = TFromD<D>, + HWY_IF_UI64(T)> +HWY_API Vec256<T> MaskedLoad(Mask256<T> m, D /* tag */, + const T* HWY_RESTRICT p) { + auto pi = reinterpret_cast<const long long*>(p); // NOLINT + return Vec256<T>{_mm256_maskload_epi64(pi, m.raw)}; +} + +template <class D, HWY_IF_V_SIZE_D(D, 32)> +HWY_API Vec256<float> MaskedLoad(Mask256<float> m, D d, + const float* HWY_RESTRICT p) { + const Vec256<int32_t> mi = + BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m)); + return Vec256<float>{_mm256_maskload_ps(p, mi.raw)}; +} + +template <class D, HWY_IF_V_SIZE_D(D, 32)> +HWY_API Vec256<double> MaskedLoad(Mask256<double> m, D d, + const double* HWY_RESTRICT p) { + const Vec256<int64_t> mi = + BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m)); + return Vec256<double>{_mm256_maskload_pd(p, mi.raw)}; +} + +#endif + +// ------------------------------ LoadDup128 + +// Loads 128 bit and duplicates into both 128-bit halves. This avoids the +// 3-cycle cost of moving data between 128-bit halves and avoids port 5. +template <class D, HWY_IF_V_SIZE_D(D, 32), typename T = TFromD<D>> +HWY_API Vec256<T> LoadDup128(D /* tag */, const T* HWY_RESTRICT p) { + const Full128<T> d128; +#if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1931 + // Workaround for incorrect results with _mm256_broadcastsi128_si256. 
Note + // that MSVC also lacks _mm256_zextsi128_si256, but cast (which leaves the + // upper half undefined) is fine because we're overwriting that anyway. + // This workaround seems in turn to generate incorrect code in MSVC 2022 + // (19.31), so use broadcastsi128 there. + const __m128i v128 = LoadU(d128, p).raw; + return Vec256<T>{ + _mm256_inserti128_si256(_mm256_castsi128_si256(v128), v128, 1)}; +#else + // The preferred path. This is perhaps surprising, because vbroadcasti128 + // with xmm input has 7 cycle latency on Intel, but Clang >= 7 is able to + // pattern-match this to vbroadcastf128 with a memory operand as desired. + return Vec256<T>{_mm256_broadcastsi128_si256(LoadU(d128, p).raw)}; +#endif +} +template <class D, HWY_IF_V_SIZE_D(D, 32)> +HWY_API Vec256<float> LoadDup128(D /* tag */, const float* HWY_RESTRICT p) { +#if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1931 + const Full128<float> d128; + const __m128 v128 = LoadU(d128, p).raw; + return Vec256<float>{ + _mm256_insertf128_ps(_mm256_castps128_ps256(v128), v128, 1)}; +#else + return Vec256<float>{_mm256_broadcast_ps(reinterpret_cast<const __m128*>(p))}; +#endif +} +template <class D, HWY_IF_V_SIZE_D(D, 32)> +HWY_API Vec256<double> LoadDup128(D /* tag */, const double* HWY_RESTRICT p) { +#if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1931 + const Full128<double> d128; + const __m128d v128 = LoadU(d128, p).raw; + return Vec256<double>{ + _mm256_insertf128_pd(_mm256_castpd128_pd256(v128), v128, 1)}; +#else + return Vec256<double>{ + _mm256_broadcast_pd(reinterpret_cast<const __m128d*>(p))}; +#endif +} + +// ------------------------------ Store + +template <class D, HWY_IF_V_SIZE_D(D, 32), typename T = TFromD<D>> +HWY_API void Store(Vec256<T> v, D /* tag */, T* HWY_RESTRICT aligned) { + _mm256_store_si256(reinterpret_cast<__m256i*>(aligned), v.raw); +} +template <class D, HWY_IF_V_SIZE_D(D, 32)> +HWY_API void Store(Vec256<float> v, D /* tag */, float* HWY_RESTRICT aligned) { + _mm256_store_ps(aligned, 
v.raw); +} +template <class D, HWY_IF_V_SIZE_D(D, 32)> +HWY_API void Store(Vec256<double> v, D /* tag */, + double* HWY_RESTRICT aligned) { + _mm256_store_pd(aligned, v.raw); +} + +template <class D, HWY_IF_V_SIZE_D(D, 32), typename T = TFromD<D>> +HWY_API void StoreU(Vec256<T> v, D /* tag */, T* HWY_RESTRICT p) { + _mm256_storeu_si256(reinterpret_cast<__m256i*>(p), v.raw); +} +template <class D, HWY_IF_V_SIZE_D(D, 32)> +HWY_API void StoreU(Vec256<float> v, D /* tag */, float* HWY_RESTRICT p) { + _mm256_storeu_ps(p, v.raw); +} +template <class D, HWY_IF_V_SIZE_D(D, 32)> +HWY_API void StoreU(Vec256<double> v, D /* tag */, double* HWY_RESTRICT p) { + _mm256_storeu_pd(p, v.raw); +} + +// ------------------------------ BlendedStore + +#if HWY_TARGET <= HWY_AVX3 + +template <class D, HWY_IF_V_SIZE_D(D, 32), typename T = TFromD<D>, + HWY_IF_T_SIZE(T, 1)> +HWY_API void BlendedStore(Vec256<T> v, Mask256<T> m, D /* tag */, + T* HWY_RESTRICT p) { + _mm256_mask_storeu_epi8(p, m.raw, v.raw); +} + +template <class D, HWY_IF_V_SIZE_D(D, 32), typename T = TFromD<D>, + HWY_IF_T_SIZE(T, 2)> +HWY_API void BlendedStore(Vec256<T> v, Mask256<T> m, D /* tag */, + T* HWY_RESTRICT p) { + _mm256_mask_storeu_epi16(p, m.raw, v.raw); +} + +template <class D, HWY_IF_V_SIZE_D(D, 32), typename T = TFromD<D>, + HWY_IF_UI32(T)> +HWY_API void BlendedStore(Vec256<T> v, Mask256<T> m, D /* tag */, + T* HWY_RESTRICT p) { + _mm256_mask_storeu_epi32(p, m.raw, v.raw); +} + +template <class D, HWY_IF_V_SIZE_D(D, 32), typename T = TFromD<D>, + HWY_IF_UI64(T)> +HWY_API void BlendedStore(Vec256<T> v, Mask256<T> m, D /* tag */, + T* HWY_RESTRICT p) { + _mm256_mask_storeu_epi64(p, m.raw, v.raw); +} + +template <class D, HWY_IF_V_SIZE_D(D, 32)> +HWY_API void BlendedStore(Vec256<float> v, Mask256<float> m, D /* tag */, + float* HWY_RESTRICT p) { + _mm256_mask_storeu_ps(p, m.raw, v.raw); +} + +template <class D, HWY_IF_V_SIZE_D(D, 32)> +HWY_API void BlendedStore(Vec256<double> v, Mask256<double> m, D /* tag */, + 
double* HWY_RESTRICT p) { + _mm256_mask_storeu_pd(p, m.raw, v.raw); +} + +#else // AVX2 + +// Intel SDM says "No AC# reported for any mask bit combinations". However, AMD +// allows AC# if "Alignment checking enabled and: 256-bit memory operand not +// 32-byte aligned". Fortunately AC# is not enabled by default and requires both +// OS support (CR0) and the application to set rflags.AC. We assume these remain +// disabled because x86/x64 code and compiler output often contain misaligned +// scalar accesses, which would also fault. +// +// Caveat: these are slow on AMD Jaguar/Bulldozer. + +template <class D, HWY_IF_V_SIZE_D(D, 32), typename T = TFromD<D>, + HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))> +HWY_API void BlendedStore(Vec256<T> v, Mask256<T> m, D d, T* HWY_RESTRICT p) { + // There is no maskload_epi8/16. Blending is also unsafe because loading a + // full vector that crosses the array end causes asan faults. Resort to scalar + // code; the caller should instead use memcpy, assuming m is FirstN(d, n). 
+ const RebindToUnsigned<decltype(d)> du; + using TU = TFromD<decltype(du)>; + alignas(32) TU buf[MaxLanes(d)]; + alignas(32) TU mask[MaxLanes(d)]; + Store(BitCast(du, v), du, buf); + Store(BitCast(du, VecFromMask(d, m)), du, mask); + for (size_t i = 0; i < MaxLanes(d); ++i) { + if (mask[i]) { + CopySameSize(buf + i, p + i); + } + } +} + +template <class D, HWY_IF_V_SIZE_D(D, 32), typename T = TFromD<D>, + HWY_IF_UI32(T)> +HWY_API void BlendedStore(Vec256<T> v, Mask256<T> m, D /* tag */, + T* HWY_RESTRICT p) { + auto pi = reinterpret_cast<int*>(p); // NOLINT + _mm256_maskstore_epi32(pi, m.raw, v.raw); +} + +template <class D, HWY_IF_V_SIZE_D(D, 32), typename T = TFromD<D>, + HWY_IF_UI64(T)> +HWY_API void BlendedStore(Vec256<T> v, Mask256<T> m, D /* tag */, + T* HWY_RESTRICT p) { + auto pi = reinterpret_cast<long long*>(p); // NOLINT + _mm256_maskstore_epi64(pi, m.raw, v.raw); +} + +template <class D, HWY_IF_V_SIZE_D(D, 32)> +HWY_API void BlendedStore(Vec256<float> v, Mask256<float> m, D d, + float* HWY_RESTRICT p) { + const Vec256<int32_t> mi = + BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m)); + _mm256_maskstore_ps(p, mi.raw, v.raw); +} + +template <class D, HWY_IF_V_SIZE_D(D, 32)> +HWY_API void BlendedStore(Vec256<double> v, Mask256<double> m, D d, + double* HWY_RESTRICT p) { + const Vec256<int64_t> mi = + BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m)); + _mm256_maskstore_pd(p, mi.raw, v.raw); +} + +#endif + +// ------------------------------ Non-temporal stores + +template <class D, HWY_IF_V_SIZE_D(D, 32), typename T = TFromD<D>, + HWY_IF_NOT_FLOAT(T)> +HWY_API void Stream(Vec256<T> v, D /* tag */, T* HWY_RESTRICT aligned) { + _mm256_stream_si256(reinterpret_cast<__m256i*>(aligned), v.raw); +} +template <class D, HWY_IF_V_SIZE_D(D, 32)> +HWY_API void Stream(Vec256<float> v, D /* tag */, float* HWY_RESTRICT aligned) { + _mm256_stream_ps(aligned, v.raw); +} +template <class D, HWY_IF_V_SIZE_D(D, 32)> +HWY_API void Stream(Vec256<double> v, D /* 
tag */, + double* HWY_RESTRICT aligned) { + _mm256_stream_pd(aligned, v.raw); +} + +// ------------------------------ Scatter + +// Work around warnings in the intrinsic definitions (passing -1 as a mask). +HWY_DIAGNOSTICS(push) +HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") + +#if HWY_TARGET <= HWY_AVX3 + +template <class D, HWY_IF_V_SIZE_D(D, 32), typename T = TFromD<D>, + HWY_IF_UI32(T)> +HWY_API void ScatterOffset(Vec256<T> v, D /* tag */, T* HWY_RESTRICT base, + Vec256<int32_t> offset) { + _mm256_i32scatter_epi32(base, offset.raw, v.raw, 1); +} + +template <class D, HWY_IF_V_SIZE_D(D, 32), typename T = TFromD<D>, + HWY_IF_UI32(T)> +HWY_API void ScatterIndex(Vec256<T> v, D /* tag */, T* HWY_RESTRICT base, + Vec256<int32_t> index) { + _mm256_i32scatter_epi32(base, index.raw, v.raw, 4); +} + +template <class D, HWY_IF_V_SIZE_D(D, 32), typename T = TFromD<D>, + HWY_IF_UI64(T)> +HWY_API void ScatterOffset(Vec256<T> v, D /* tag */, T* HWY_RESTRICT base, + Vec256<int64_t> offset) { + _mm256_i64scatter_epi64(base, offset.raw, v.raw, 1); +} + +template <class D, HWY_IF_V_SIZE_D(D, 32), typename T = TFromD<D>, + HWY_IF_UI64(T)> +HWY_API void ScatterIndex(Vec256<T> v, D /* tag */, T* HWY_RESTRICT base, + Vec256<int64_t> index) { + _mm256_i64scatter_epi64(base, index.raw, v.raw, 8); +} + +template <class D, HWY_IF_V_SIZE_D(D, 32)> +HWY_API void ScatterOffset(Vec256<float> v, D /* tag */, + float* HWY_RESTRICT base, + const Vec256<int32_t> offset) { + _mm256_i32scatter_ps(base, offset.raw, v.raw, 1); +} + +template <class D, HWY_IF_V_SIZE_D(D, 32)> +HWY_API void ScatterIndex(Vec256<float> v, D /* tag */, + float* HWY_RESTRICT base, + const Vec256<int32_t> index) { + _mm256_i32scatter_ps(base, index.raw, v.raw, 4); +} + +template <class D, HWY_IF_V_SIZE_D(D, 32)> +HWY_API void ScatterOffset(Vec256<double> v, D /* tag */, + double* HWY_RESTRICT base, + const Vec256<int64_t> offset) { + _mm256_i64scatter_pd(base, offset.raw, v.raw, 1); +} + +template 
<class D, HWY_IF_V_SIZE_D(D, 32)> +HWY_API void ScatterIndex(Vec256<double> v, D /* tag */, + double* HWY_RESTRICT base, + const Vec256<int64_t> index) { + _mm256_i64scatter_pd(base, index.raw, v.raw, 8); +} + +#else + +template <class D, HWY_IF_V_SIZE_D(D, 32), typename T = TFromD<D>, + typename Offset> +HWY_API void ScatterOffset(Vec256<T> v, D d, T* HWY_RESTRICT base, + const Vec256<Offset> offset) { + static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); + + alignas(32) T lanes[MaxLanes(d)]; + Store(v, d, lanes); + + alignas(32) Offset offset_lanes[MaxLanes(d)]; + Store(offset, Full256<Offset>(), offset_lanes); + + uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base); + for (size_t i = 0; i < MaxLanes(d); ++i) { + CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]); + } +} + +template <class D, HWY_IF_V_SIZE_D(D, 32), typename T = TFromD<D>, + typename Index> +HWY_API void ScatterIndex(Vec256<T> v, D d, T* HWY_RESTRICT base, + const Vec256<Index> index) { + static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); + + alignas(32) T lanes[MaxLanes(d)]; + Store(v, d, lanes); + + alignas(32) Index index_lanes[MaxLanes(d)]; + Store(index, Full256<Index>(), index_lanes); + + for (size_t i = 0; i < MaxLanes(d); ++i) { + base[index_lanes[i]] = lanes[i]; + } +} + +#endif + +// ------------------------------ Gather + +template <class D, HWY_IF_V_SIZE_D(D, 32), typename T = TFromD<D>, + HWY_IF_UI32(T)> +HWY_INLINE Vec256<T> GatherOffset(D /* tag */, const T* HWY_RESTRICT base, + Vec256<int32_t> offset) { + return Vec256<T>{_mm256_i32gather_epi32( + reinterpret_cast<const int32_t*>(base), offset.raw, 1)}; +} +template <class D, HWY_IF_V_SIZE_D(D, 32), typename T = TFromD<D>, + HWY_IF_UI32(T)> +HWY_INLINE Vec256<T> GatherIndex(D /* tag */, const T* HWY_RESTRICT base, + Vec256<int32_t> index) { + return Vec256<T>{_mm256_i32gather_epi32( + reinterpret_cast<const int32_t*>(base), index.raw, 4)}; +} + +template <class D, 
HWY_IF_V_SIZE_D(D, 32), typename T = TFromD<D>, + HWY_IF_UI64(T)> +HWY_INLINE Vec256<T> GatherOffset(D /* tag */, const T* HWY_RESTRICT base, + Vec256<int64_t> offset) { + return Vec256<T>{_mm256_i64gather_epi64( + reinterpret_cast<const GatherIndex64*>(base), offset.raw, 1)}; +} +template <class D, HWY_IF_V_SIZE_D(D, 32), typename T = TFromD<D>, + HWY_IF_UI64(T)> +HWY_INLINE Vec256<T> GatherIndex(D /* tag */, const T* HWY_RESTRICT base, + Vec256<int64_t> index) { + return Vec256<T>{_mm256_i64gather_epi64( + reinterpret_cast<const GatherIndex64*>(base), index.raw, 8)}; +} + +template <class D, HWY_IF_V_SIZE_D(D, 32)> +HWY_API Vec256<float> GatherOffset(D /* tag */, const float* HWY_RESTRICT base, + Vec256<int32_t> offset) { + return Vec256<float>{_mm256_i32gather_ps(base, offset.raw, 1)}; +} +template <class D, HWY_IF_V_SIZE_D(D, 32)> +HWY_API Vec256<float> GatherIndex(D /* tag */, const float* HWY_RESTRICT base, + Vec256<int32_t> index) { + return Vec256<float>{_mm256_i32gather_ps(base, index.raw, 4)}; +} +template <class D, HWY_IF_V_SIZE_D(D, 32)> +HWY_API Vec256<double> GatherOffset(D /* tag */, + const double* HWY_RESTRICT base, + Vec256<int64_t> offset) { + return Vec256<double>{_mm256_i64gather_pd(base, offset.raw, 1)}; +} +template <class D, HWY_IF_V_SIZE_D(D, 32)> +HWY_API Vec256<double> GatherIndex(D /* tag */, const double* HWY_RESTRICT base, + Vec256<int64_t> index) { + return Vec256<double>{_mm256_i64gather_pd(base, index.raw, 8)}; +} + +HWY_DIAGNOSTICS(pop) + +// ================================================== SWIZZLE + +// ------------------------------ LowerHalf + +template <class D, typename T = TFromD<D>, HWY_IF_NOT_FLOAT(T)> +HWY_API Vec128<T> LowerHalf(D /* tag */, Vec256<T> v) { + return Vec128<T>{_mm256_castsi256_si128(v.raw)}; +} +template <class D> +HWY_API Vec128<float> LowerHalf(D /* tag */, Vec256<float> v) { + return Vec128<float>{_mm256_castps256_ps128(v.raw)}; +} +template <class D> +HWY_API Vec128<double> LowerHalf(D /* tag */, 
Vec256<double> v) { + return Vec128<double>{_mm256_castpd256_pd128(v.raw)}; +} + +template <typename T> +HWY_API Vec128<T> LowerHalf(Vec256<T> v) { + const Full128<T> dh; + return LowerHalf(dh, v); +} + +// ------------------------------ UpperHalf + +template <class D, typename T = TFromD<D>> +HWY_API Vec128<T> UpperHalf(D /* tag */, Vec256<T> v) { + return Vec128<T>{_mm256_extracti128_si256(v.raw, 1)}; +} +template <class D> +HWY_API Vec128<float> UpperHalf(D /* tag */, Vec256<float> v) { + return Vec128<float>{_mm256_extractf128_ps(v.raw, 1)}; +} +template <class D> +HWY_API Vec128<double> UpperHalf(D /* tag */, Vec256<double> v) { + return Vec128<double>{_mm256_extractf128_pd(v.raw, 1)}; +} + +// ------------------------------ ExtractLane (Store) +template <typename T> +HWY_API T ExtractLane(const Vec256<T> v, size_t i) { + const DFromV<decltype(v)> d; + HWY_DASSERT(i < Lanes(d)); + alignas(32) T lanes[32 / sizeof(T)]; + Store(v, d, lanes); + return lanes[i]; +} + +// ------------------------------ InsertLane (Store) +template <typename T> +HWY_API Vec256<T> InsertLane(const Vec256<T> v, size_t i, T t) { + return detail::InsertLaneUsingBroadcastAndBlend(v, i, t); +} + +// ------------------------------ GetLane (LowerHalf) +template <typename T> +HWY_API T GetLane(const Vec256<T> v) { + return GetLane(LowerHalf(v)); +} + +// ------------------------------ ZeroExtendVector + +// Unfortunately the initial _mm256_castsi128_si256 intrinsic leaves the upper +// bits undefined. Although it makes sense for them to be zero (VEX encoded +// 128-bit instructions zero the upper lanes to avoid large penalties), a +// compiler could decide to optimize out code that relies on this. +// +// The newer _mm256_zextsi128_si256 intrinsic fixes this by specifying the +// zeroing, but it is not available on MSVC until 15.7 nor GCC until 10.1. 
For +// older GCC, we can still obtain the desired code thanks to pattern +// recognition; note that the expensive insert instruction is not actually +// generated, see https://gcc.godbolt.org/z/1MKGaP. + +#if !defined(HWY_HAVE_ZEXT) +#if (HWY_COMPILER_MSVC && HWY_COMPILER_MSVC >= 1915) || \ + (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG >= 500) || \ + (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL >= 1000) +#define HWY_HAVE_ZEXT 1 +#else +#define HWY_HAVE_ZEXT 0 +#endif +#endif // defined(HWY_HAVE_ZEXT) + +template <class D, typename T = TFromD<D>> +HWY_API Vec256<T> ZeroExtendVector(D /* tag */, Vec128<T> lo) { +#if HWY_HAVE_ZEXT + return Vec256<T>{_mm256_zextsi128_si256(lo.raw)}; +#else + return Vec256<T>{_mm256_inserti128_si256(_mm256_setzero_si256(), lo.raw, 0)}; +#endif +} +template <class D> +HWY_API Vec256<float> ZeroExtendVector(D /* tag */, Vec128<float> lo) { +#if HWY_HAVE_ZEXT + return Vec256<float>{_mm256_zextps128_ps256(lo.raw)}; +#else + return Vec256<float>{_mm256_insertf128_ps(_mm256_setzero_ps(), lo.raw, 0)}; +#endif +} +template <class D> +HWY_API Vec256<double> ZeroExtendVector(D /* tag */, Vec128<double> lo) { +#if HWY_HAVE_ZEXT + return Vec256<double>{_mm256_zextpd128_pd256(lo.raw)}; +#else + return Vec256<double>{_mm256_insertf128_pd(_mm256_setzero_pd(), lo.raw, 0)}; +#endif +} + +// ------------------------------ ZeroExtendResizeBitCast + +namespace detail { + +template <class DTo, class DFrom> +HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast( + hwy::SizeTag<8> /* from_size_tag */, hwy::SizeTag<32> /* to_size_tag */, + DTo d_to, DFrom d_from, VFromD<DFrom> v) { + const Twice<decltype(d_from)> dt_from; + const Twice<decltype(dt_from)> dq_from; + return BitCast(d_to, ZeroExtendVector(dq_from, ZeroExtendVector(dt_from, v))); +} + +} // namespace detail + +// ------------------------------ Combine + +template <class D, typename T = TFromD<D>> +HWY_API Vec256<T> Combine(D d, Vec128<T> hi, Vec128<T> lo) { + const auto lo256 = ZeroExtendVector(d, 
lo); + return Vec256<T>{_mm256_inserti128_si256(lo256.raw, hi.raw, 1)}; +} +template <class D> +HWY_API Vec256<float> Combine(D d, Vec128<float> hi, Vec128<float> lo) { + const auto lo256 = ZeroExtendVector(d, lo); + return Vec256<float>{_mm256_insertf128_ps(lo256.raw, hi.raw, 1)}; +} +template <class D> +HWY_API Vec256<double> Combine(D d, Vec128<double> hi, Vec128<double> lo) { + const auto lo256 = ZeroExtendVector(d, lo); + return Vec256<double>{_mm256_insertf128_pd(lo256.raw, hi.raw, 1)}; +} + +// ------------------------------ ShiftLeftBytes + +template <int kBytes, class D, typename T = TFromD<D>> +HWY_API Vec256<T> ShiftLeftBytes(D /* tag */, const Vec256<T> v) { + static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); + // This is the same operation as _mm256_bslli_epi128. + return Vec256<T>{_mm256_slli_si256(v.raw, kBytes)}; +} + +template <int kBytes, typename T> +HWY_API Vec256<T> ShiftLeftBytes(const Vec256<T> v) { + const DFromV<decltype(v)> d; + return ShiftLeftBytes<kBytes>(d, v); +} + +// ------------------------------ ShiftLeftLanes + +template <int kLanes, class D, typename T = TFromD<D>> +HWY_API Vec256<T> ShiftLeftLanes(D d, const Vec256<T> v) { + const Repartition<uint8_t, decltype(d)> d8; + return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v))); +} + +template <int kLanes, typename T> +HWY_API Vec256<T> ShiftLeftLanes(const Vec256<T> v) { + const DFromV<decltype(v)> d; + return ShiftLeftLanes<kLanes>(d, v); +} + +// ------------------------------ ShiftRightBytes +template <int kBytes, class D, typename T = TFromD<D>> +HWY_API Vec256<T> ShiftRightBytes(D /* tag */, const Vec256<T> v) { + static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); + // This is the same operation as _mm256_bsrli_epi128. 
+ return Vec256<T>{_mm256_srli_si256(v.raw, kBytes)}; +} + +// ------------------------------ ShiftRightLanes +template <int kLanes, class D, typename T = TFromD<D>> +HWY_API Vec256<T> ShiftRightLanes(D d, const Vec256<T> v) { + const Repartition<uint8_t, decltype(d)> d8; + return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(d8, BitCast(d8, v))); +} + +// ------------------------------ CombineShiftRightBytes +template <int kBytes, class D, typename T = TFromD<D>> +HWY_API Vec256<T> CombineShiftRightBytes(D d, Vec256<T> hi, Vec256<T> lo) { + const Repartition<uint8_t, decltype(d)> d8; + return BitCast(d, Vec256<uint8_t>{_mm256_alignr_epi8( + BitCast(d8, hi).raw, BitCast(d8, lo).raw, kBytes)}); +} + +// ------------------------------ Broadcast + +// Unsigned +template <int kLane> +HWY_API Vec256<uint16_t> Broadcast(const Vec256<uint16_t> v) { + static_assert(0 <= kLane && kLane < 8, "Invalid lane"); + if (kLane < 4) { + const __m256i lo = _mm256_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF); + return Vec256<uint16_t>{_mm256_unpacklo_epi64(lo, lo)}; + } else { + const __m256i hi = + _mm256_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF); + return Vec256<uint16_t>{_mm256_unpackhi_epi64(hi, hi)}; + } +} +template <int kLane> +HWY_API Vec256<uint32_t> Broadcast(const Vec256<uint32_t> v) { + static_assert(0 <= kLane && kLane < 4, "Invalid lane"); + return Vec256<uint32_t>{_mm256_shuffle_epi32(v.raw, 0x55 * kLane)}; +} +template <int kLane> +HWY_API Vec256<uint64_t> Broadcast(const Vec256<uint64_t> v) { + static_assert(0 <= kLane && kLane < 2, "Invalid lane"); + return Vec256<uint64_t>{_mm256_shuffle_epi32(v.raw, kLane ? 
0xEE : 0x44)}; +} + +// Signed +template <int kLane> +HWY_API Vec256<int16_t> Broadcast(const Vec256<int16_t> v) { + static_assert(0 <= kLane && kLane < 8, "Invalid lane"); + if (kLane < 4) { + const __m256i lo = _mm256_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF); + return Vec256<int16_t>{_mm256_unpacklo_epi64(lo, lo)}; + } else { + const __m256i hi = + _mm256_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF); + return Vec256<int16_t>{_mm256_unpackhi_epi64(hi, hi)}; + } +} +template <int kLane> +HWY_API Vec256<int32_t> Broadcast(const Vec256<int32_t> v) { + static_assert(0 <= kLane && kLane < 4, "Invalid lane"); + return Vec256<int32_t>{_mm256_shuffle_epi32(v.raw, 0x55 * kLane)}; +} +template <int kLane> +HWY_API Vec256<int64_t> Broadcast(const Vec256<int64_t> v) { + static_assert(0 <= kLane && kLane < 2, "Invalid lane"); + return Vec256<int64_t>{_mm256_shuffle_epi32(v.raw, kLane ? 0xEE : 0x44)}; +} + +// Float +template <int kLane> +HWY_API Vec256<float> Broadcast(Vec256<float> v) { + static_assert(0 <= kLane && kLane < 4, "Invalid lane"); + return Vec256<float>{_mm256_shuffle_ps(v.raw, v.raw, 0x55 * kLane)}; +} +template <int kLane> +HWY_API Vec256<double> Broadcast(const Vec256<double> v) { + static_assert(0 <= kLane && kLane < 2, "Invalid lane"); + return Vec256<double>{_mm256_shuffle_pd(v.raw, v.raw, 15 * kLane)}; +} + +// ------------------------------ Hard-coded shuffles + +// Notation: let Vec256<int32_t> have lanes 7,6,5,4,3,2,1,0 (0 is +// least-significant). Shuffle0321 rotates four-lane blocks one lane to the +// right (the previous least-significant lane is now most-significant => +// 47650321). These could also be implemented via CombineShiftRightBytes but +// the shuffle_abcd notation is more convenient. + +// Swap 32-bit halves in 64-bit halves. 
+template <typename T, HWY_IF_UI32(T)> +HWY_API Vec256<T> Shuffle2301(const Vec256<T> v) { + return Vec256<T>{_mm256_shuffle_epi32(v.raw, 0xB1)}; +} +HWY_API Vec256<float> Shuffle2301(const Vec256<float> v) { + return Vec256<float>{_mm256_shuffle_ps(v.raw, v.raw, 0xB1)}; +} + +// Used by generic_ops-inl.h +namespace detail { + +template <typename T, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec256<T> ShuffleTwo2301(const Vec256<T> a, const Vec256<T> b) { + const DFromV<decltype(a)> d; + const RebindToFloat<decltype(d)> df; + constexpr int m = _MM_SHUFFLE(2, 3, 0, 1); + return BitCast(d, Vec256<float>{_mm256_shuffle_ps(BitCast(df, a).raw, + BitCast(df, b).raw, m)}); +} +template <typename T, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec256<T> ShuffleTwo1230(const Vec256<T> a, const Vec256<T> b) { + const DFromV<decltype(a)> d; + const RebindToFloat<decltype(d)> df; + constexpr int m = _MM_SHUFFLE(1, 2, 3, 0); + return BitCast(d, Vec256<float>{_mm256_shuffle_ps(BitCast(df, a).raw, + BitCast(df, b).raw, m)}); +} +template <typename T, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec256<T> ShuffleTwo3012(const Vec256<T> a, const Vec256<T> b) { + const DFromV<decltype(a)> d; + const RebindToFloat<decltype(d)> df; + constexpr int m = _MM_SHUFFLE(3, 0, 1, 2); + return BitCast(d, Vec256<float>{_mm256_shuffle_ps(BitCast(df, a).raw, + BitCast(df, b).raw, m)}); +} + +} // namespace detail + +// Swap 64-bit halves +HWY_API Vec256<uint32_t> Shuffle1032(const Vec256<uint32_t> v) { + return Vec256<uint32_t>{_mm256_shuffle_epi32(v.raw, 0x4E)}; +} +HWY_API Vec256<int32_t> Shuffle1032(const Vec256<int32_t> v) { + return Vec256<int32_t>{_mm256_shuffle_epi32(v.raw, 0x4E)}; +} +HWY_API Vec256<float> Shuffle1032(const Vec256<float> v) { + // Shorter encoding than _mm256_permute_ps. 
+ return Vec256<float>{_mm256_shuffle_ps(v.raw, v.raw, 0x4E)}; +} +HWY_API Vec256<uint64_t> Shuffle01(const Vec256<uint64_t> v) { + return Vec256<uint64_t>{_mm256_shuffle_epi32(v.raw, 0x4E)}; +} +HWY_API Vec256<int64_t> Shuffle01(const Vec256<int64_t> v) { + return Vec256<int64_t>{_mm256_shuffle_epi32(v.raw, 0x4E)}; +} +HWY_API Vec256<double> Shuffle01(const Vec256<double> v) { + // Shorter encoding than _mm256_permute_pd. + return Vec256<double>{_mm256_shuffle_pd(v.raw, v.raw, 5)}; +} + +// Rotate right 32 bits +HWY_API Vec256<uint32_t> Shuffle0321(const Vec256<uint32_t> v) { + return Vec256<uint32_t>{_mm256_shuffle_epi32(v.raw, 0x39)}; +} +HWY_API Vec256<int32_t> Shuffle0321(const Vec256<int32_t> v) { + return Vec256<int32_t>{_mm256_shuffle_epi32(v.raw, 0x39)}; +} +HWY_API Vec256<float> Shuffle0321(const Vec256<float> v) { + return Vec256<float>{_mm256_shuffle_ps(v.raw, v.raw, 0x39)}; +} +// Rotate left 32 bits +HWY_API Vec256<uint32_t> Shuffle2103(const Vec256<uint32_t> v) { + return Vec256<uint32_t>{_mm256_shuffle_epi32(v.raw, 0x93)}; +} +HWY_API Vec256<int32_t> Shuffle2103(const Vec256<int32_t> v) { + return Vec256<int32_t>{_mm256_shuffle_epi32(v.raw, 0x93)}; +} +HWY_API Vec256<float> Shuffle2103(const Vec256<float> v) { + return Vec256<float>{_mm256_shuffle_ps(v.raw, v.raw, 0x93)}; +} + +// Reverse +HWY_API Vec256<uint32_t> Shuffle0123(const Vec256<uint32_t> v) { + return Vec256<uint32_t>{_mm256_shuffle_epi32(v.raw, 0x1B)}; +} +HWY_API Vec256<int32_t> Shuffle0123(const Vec256<int32_t> v) { + return Vec256<int32_t>{_mm256_shuffle_epi32(v.raw, 0x1B)}; +} +HWY_API Vec256<float> Shuffle0123(const Vec256<float> v) { + return Vec256<float>{_mm256_shuffle_ps(v.raw, v.raw, 0x1B)}; +} + +// ------------------------------ TableLookupLanes + +// Returned by SetTableIndices/IndicesFromVec for use by TableLookupLanes. 
+template <typename T> +struct Indices256 { + __m256i raw; +}; + +// 8-bit lanes: indices remain unchanged +template <class D, typename T = TFromD<D>, typename TI, HWY_IF_T_SIZE(T, 1)> +HWY_API Indices256<T> IndicesFromVec(D /* tag */, Vec256<TI> vec) { + static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); +#if HWY_IS_DEBUG_BUILD + const Full256<TI> di; + HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) && + AllTrue(di, Lt(vec, Set(di, static_cast<TI>(64 / sizeof(T)))))); +#endif + return Indices256<T>{vec.raw}; +} + +// 16-bit lanes: convert indices to 32x8 unless AVX3 is available +template <class D, typename T = TFromD<D>, typename TI, HWY_IF_T_SIZE(T, 2)> +HWY_API Indices256<T> IndicesFromVec(D /* tag */, Vec256<TI> vec) { + static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); + const Full256<TI> di; +#if HWY_IS_DEBUG_BUILD + HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) && + AllTrue(di, Lt(vec, Set(di, static_cast<TI>(64 / sizeof(T)))))); +#endif + +#if HWY_TARGET <= HWY_AVX3 + (void)di; + return Indices256<T>{vec.raw}; +#else + const Repartition<uint8_t, decltype(di)> d8; + using V8 = VFromD<decltype(d8)>; + alignas(32) static constexpr uint8_t kByteOffsets[32] = { + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1}; + + // Broadcast each lane index to all 2 bytes of T + alignas(32) static constexpr uint8_t kBroadcastLaneBytes[32] = { + 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14, + 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}; + const V8 lane_indices = TableLookupBytes(vec, Load(d8, kBroadcastLaneBytes)); + + // Shift to bytes + const Repartition<uint16_t, decltype(di)> d16; + const V8 byte_indices = BitCast(d8, ShiftLeft<1>(BitCast(d16, lane_indices))); + + return Indices256<T>{Add(byte_indices, Load(d8, kByteOffsets)).raw}; +#endif // HWY_TARGET <= HWY_AVX3 +} + +// Native 8x32 instruction: indices remain unchanged +template <class D, typename T = TFromD<D>, 
typename TI, HWY_IF_T_SIZE(T, 4)> +HWY_API Indices256<T> IndicesFromVec(D /* tag */, Vec256<TI> vec) { + static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); +#if HWY_IS_DEBUG_BUILD + const Full256<TI> di; + HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) && + AllTrue(di, Lt(vec, Set(di, static_cast<TI>(64 / sizeof(T)))))); +#endif + return Indices256<T>{vec.raw}; +} + +// 64-bit lanes: convert indices to 8x32 unless AVX3 is available +template <class D, typename T = TFromD<D>, typename TI, HWY_IF_T_SIZE(T, 8)> +HWY_API Indices256<T> IndicesFromVec(D d, Vec256<TI> idx64) { + static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); + const Rebind<TI, decltype(d)> di; + (void)di; // potentially unused +#if HWY_IS_DEBUG_BUILD + HWY_DASSERT(AllFalse(di, Lt(idx64, Zero(di))) && + AllTrue(di, Lt(idx64, Set(di, static_cast<TI>(64 / sizeof(T)))))); +#endif + +#if HWY_TARGET <= HWY_AVX3 + (void)d; + return Indices256<T>{idx64.raw}; +#else + const Repartition<float, decltype(d)> df; // 32-bit! + // Replicate 64-bit index into upper 32 bits + const Vec256<TI> dup = + BitCast(di, Vec256<float>{_mm256_moveldup_ps(BitCast(df, idx64).raw)}); + // For each idx64 i, idx32 are 2*i and 2*i+1. 
+ const Vec256<TI> idx32 = dup + dup + Set(di, TI(1) << 32); + return Indices256<T>{idx32.raw}; +#endif +} + +template <class D, HWY_IF_V_SIZE_D(D, 32), typename T = TFromD<D>, typename TI> +HWY_API Indices256<T> SetTableIndices(D d, const TI* idx) { + const Rebind<TI, decltype(d)> di; + return IndicesFromVec(d, LoadU(di, idx)); +} + +template <typename T, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec256<T> TableLookupLanes(Vec256<T> v, Indices256<T> idx) { +#if HWY_TARGET <= HWY_AVX3_DL + return Vec256<T>{_mm256_permutexvar_epi8(idx.raw, v.raw)}; +#else + const Vec256<T> idx_vec{idx.raw}; + const DFromV<decltype(v)> d; + const Repartition<uint16_t, decltype(d)> du16; + const auto sel_hi_mask = + MaskFromVec(BitCast(d, ShiftLeft<3>(BitCast(du16, idx_vec)))); + + const auto a = ConcatLowerLower(d, v, v); + const auto b = ConcatUpperUpper(d, v, v); + const auto lo_lookup_result = TableLookupBytes(a, idx_vec); + +#if HWY_TARGET <= HWY_AVX3 + return Vec256<T>{_mm256_mask_shuffle_epi8( + lo_lookup_result.raw, sel_hi_mask.raw, b.raw, idx_vec.raw)}; +#else + const auto hi_lookup_result = TableLookupBytes(b, idx_vec); + return IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result); +#endif // HWY_TARGET <= HWY_AVX3 +#endif // HWY_TARGET <= HWY_AVX3_DL +} + +template <typename T, HWY_IF_T_SIZE(T, 2)> +HWY_API Vec256<T> TableLookupLanes(Vec256<T> v, Indices256<T> idx) { +#if HWY_TARGET <= HWY_AVX3 + return Vec256<T>{_mm256_permutexvar_epi16(idx.raw, v.raw)}; +#else + const DFromV<decltype(v)> d; + const Repartition<uint8_t, decltype(d)> du8; + return BitCast( + d, TableLookupLanes(BitCast(du8, v), Indices256<uint8_t>{idx.raw})); +#endif +} + +template <typename T, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec256<T> TableLookupLanes(Vec256<T> v, Indices256<T> idx) { + return Vec256<T>{_mm256_permutevar8x32_epi32(v.raw, idx.raw)}; +} + +template <typename T, HWY_IF_T_SIZE(T, 8)> +HWY_API Vec256<T> TableLookupLanes(Vec256<T> v, Indices256<T> idx) { +#if HWY_TARGET <= HWY_AVX3 + return 
Vec256<T>{_mm256_permutexvar_epi64(idx.raw, v.raw)}; +#else + return Vec256<T>{_mm256_permutevar8x32_epi32(v.raw, idx.raw)}; +#endif +} + +HWY_API Vec256<float> TableLookupLanes(const Vec256<float> v, + const Indices256<float> idx) { + return Vec256<float>{_mm256_permutevar8x32_ps(v.raw, idx.raw)}; +} + +HWY_API Vec256<double> TableLookupLanes(const Vec256<double> v, + const Indices256<double> idx) { +#if HWY_TARGET <= HWY_AVX3 + return Vec256<double>{_mm256_permutexvar_pd(idx.raw, v.raw)}; +#else + const Full256<double> df; + const Full256<uint64_t> du; + return BitCast(df, Vec256<uint64_t>{_mm256_permutevar8x32_epi32( + BitCast(du, v).raw, idx.raw)}); +#endif +} + +template <typename T, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec256<T> TwoTablesLookupLanes(Vec256<T> a, Vec256<T> b, + Indices256<T> idx) { +#if HWY_TARGET <= HWY_AVX3_DL + return Vec256<T>{_mm256_permutex2var_epi8(a.raw, idx.raw, b.raw)}; +#else + const DFromV<decltype(a)> d; + const auto sel_hi_mask = + MaskFromVec(BitCast(d, ShiftLeft<2>(Vec256<uint16_t>{idx.raw}))); + const auto lo_lookup_result = TableLookupLanes(a, idx); + const auto hi_lookup_result = TableLookupLanes(b, idx); + return IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result); +#endif +} + +template <typename T, HWY_IF_T_SIZE(T, 2)> +HWY_API Vec256<T> TwoTablesLookupLanes(Vec256<T> a, Vec256<T> b, + Indices256<T> idx) { +#if HWY_TARGET <= HWY_AVX3 + return Vec256<T>{_mm256_permutex2var_epi16(a.raw, idx.raw, b.raw)}; +#else + const DFromV<decltype(a)> d; + const Repartition<uint8_t, decltype(d)> du8; + return BitCast(d, TwoTablesLookupLanes(BitCast(du8, a), BitCast(du8, b), + Indices256<uint8_t>{idx.raw})); +#endif +} + +template <typename T, HWY_IF_UI32(T)> +HWY_API Vec256<T> TwoTablesLookupLanes(Vec256<T> a, Vec256<T> b, + Indices256<T> idx) { +#if HWY_TARGET <= HWY_AVX3 + return Vec256<T>{_mm256_permutex2var_epi32(a.raw, idx.raw, b.raw)}; +#else + const DFromV<decltype(a)> d; + const RebindToFloat<decltype(d)> df; + const Vec256<T> 
idx_vec{idx.raw}; + + const auto sel_hi_mask = MaskFromVec(BitCast(df, ShiftLeft<28>(idx_vec))); + const auto lo_lookup_result = BitCast(df, TableLookupLanes(a, idx)); + const auto hi_lookup_result = BitCast(df, TableLookupLanes(b, idx)); + return BitCast(d, + IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result)); +#endif +} + +HWY_API Vec256<float> TwoTablesLookupLanes(Vec256<float> a, Vec256<float> b, + Indices256<float> idx) { +#if HWY_TARGET <= HWY_AVX3 + return Vec256<float>{_mm256_permutex2var_ps(a.raw, idx.raw, b.raw)}; +#else + const DFromV<decltype(a)> d; + const auto sel_hi_mask = + MaskFromVec(BitCast(d, ShiftLeft<28>(Vec256<uint32_t>{idx.raw}))); + const auto lo_lookup_result = TableLookupLanes(a, idx); + const auto hi_lookup_result = TableLookupLanes(b, idx); + return IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result); +#endif +} + +template <typename T, HWY_IF_UI64(T)> +HWY_API Vec256<T> TwoTablesLookupLanes(Vec256<T> a, Vec256<T> b, + Indices256<T> idx) { +#if HWY_TARGET <= HWY_AVX3 + return Vec256<T>{_mm256_permutex2var_epi64(a.raw, idx.raw, b.raw)}; +#else + const DFromV<decltype(a)> d; + const Repartition<uint32_t, decltype(d)> du32; + return BitCast(d, TwoTablesLookupLanes(BitCast(du32, a), BitCast(du32, b), + Indices256<uint32_t>{idx.raw})); +#endif +} + +HWY_API Vec256<double> TwoTablesLookupLanes(Vec256<double> a, Vec256<double> b, + Indices256<double> idx) { +#if HWY_TARGET <= HWY_AVX3 + return Vec256<double>{_mm256_permutex2var_pd(a.raw, idx.raw, b.raw)}; +#else + const DFromV<decltype(a)> d; + const Repartition<uint32_t, decltype(d)> du32; + return BitCast(d, TwoTablesLookupLanes(BitCast(du32, a), BitCast(du32, b), + Indices256<uint32_t>{idx.raw})); +#endif +} + +// ------------------------------ SwapAdjacentBlocks + +template <typename T> +HWY_API Vec256<T> SwapAdjacentBlocks(Vec256<T> v) { + return Vec256<T>{_mm256_permute4x64_epi64(v.raw, _MM_SHUFFLE(1, 0, 3, 2))}; +} + +HWY_API Vec256<double> 
SwapAdjacentBlocks(Vec256<double> v) { + return Vec256<double>{_mm256_permute4x64_pd(v.raw, _MM_SHUFFLE(1, 0, 3, 2))}; +} + +HWY_API Vec256<float> SwapAdjacentBlocks(Vec256<float> v) { + // Assume no domain-crossing penalty between float/double (true on SKX). + const DFromV<decltype(v)> d; + const RepartitionToWide<decltype(d)> dw; + return BitCast(d, SwapAdjacentBlocks(BitCast(dw, v))); +} + +// ------------------------------ Reverse (RotateRight) + +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec256<T> Reverse(D d, const Vec256<T> v) { + alignas(32) static constexpr int32_t kReverse[8] = {7, 6, 5, 4, 3, 2, 1, 0}; + return TableLookupLanes(v, SetTableIndices(d, kReverse)); +} + +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)> +HWY_API Vec256<T> Reverse(D d, const Vec256<T> v) { + alignas(32) static constexpr int64_t kReverse[4] = {3, 2, 1, 0}; + return TableLookupLanes(v, SetTableIndices(d, kReverse)); +} + +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)> +HWY_API Vec256<T> Reverse(D d, const Vec256<T> v) { +#if HWY_TARGET <= HWY_AVX3 + const RebindToSigned<decltype(d)> di; + alignas(32) static constexpr int16_t kReverse[16] = { + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; + const Vec256<int16_t> idx = Load(di, kReverse); + return BitCast(d, Vec256<int16_t>{ + _mm256_permutexvar_epi16(idx.raw, BitCast(di, v).raw)}); +#else + const RebindToSigned<decltype(d)> di; + alignas(16) static constexpr int16_t kShuffle[8] = { + 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100}; + const auto rev128 = TableLookupBytes(v, LoadDup128(di, kShuffle)); + return Vec256<T>{ + _mm256_permute4x64_epi64(rev128.raw, _MM_SHUFFLE(1, 0, 3, 2))}; +#endif +} + +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec256<T> Reverse(D d, const Vec256<T> v) { +#if HWY_TARGET <= HWY_AVX3_DL + alignas(32) static constexpr T kReverse[32] = { + 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 
19, 18, 17, 16, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; + return TableLookupLanes(v, SetTableIndices(d, kReverse)); +#else + // First reverse bytes within blocks via PSHUFB, then swap blocks. + alignas(32) static constexpr T kReverse[32] = { + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; + return SwapAdjacentBlocks(TableLookupBytes(v, Load(d, kReverse))); +#endif +} + +// ------------------------------ Reverse2 (in x86_128) + +// ------------------------------ Reverse4 (SwapAdjacentBlocks) + +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)> +HWY_API Vec256<T> Reverse4(D d, const Vec256<T> v) { + const RebindToSigned<decltype(d)> di; + alignas(16) static constexpr int16_t kShuffle[8] = { + 0x0706, 0x0504, 0x0302, 0x0100, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908}; + return BitCast(d, TableLookupBytes(v, LoadDup128(di, kShuffle))); +} + +// 32 bit Reverse4 defined in x86_128. + +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)> +HWY_API Vec256<T> Reverse4(D /* tag */, const Vec256<T> v) { + // Could also use _mm256_permute4x64_epi64. 
+ return SwapAdjacentBlocks(Shuffle01(v)); +} + +// ------------------------------ Reverse8 + +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)> +HWY_API Vec256<T> Reverse8(D d, const Vec256<T> v) { + const RebindToSigned<decltype(d)> di; + alignas(16) static constexpr int16_t kShuffle[8] = { + 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100}; + return BitCast(d, TableLookupBytes(v, LoadDup128(di, kShuffle))); +} + +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec256<T> Reverse8(D d, const Vec256<T> v) { + return Reverse(d, v); +} + +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)> +HWY_API Vec256<T> Reverse8(D /* tag */, const Vec256<T> /* v */) { + HWY_ASSERT(0); // AVX2 does not have 8 64-bit lanes +} + +// ------------------------------ ReverseBits + +#if HWY_TARGET <= HWY_AVX3_DL +template <class V, HWY_IF_T_SIZE_V(V, 1), HWY_IF_V_SIZE_D(DFromV<V>, 32)> +HWY_API V ReverseBits(V v) { + const Full256<uint64_t> du64; + const auto affine_matrix = Set(du64, 0x8040201008040201u); + return V{_mm256_gf2p8affine_epi64_epi8(v.raw, affine_matrix.raw, 0)}; +} +#endif // HWY_TARGET <= HWY_AVX3_DL + +// ------------------------------ InterleaveLower + +// Interleaves lanes from halves of the 128-bit blocks of "a" (which provides +// the least-significant lane) and "b". To concatenate two half-width integers +// into one, use ZipLower/Upper instead (also works with scalar). 
+ +HWY_API Vec256<uint8_t> InterleaveLower(Vec256<uint8_t> a, Vec256<uint8_t> b) { + return Vec256<uint8_t>{_mm256_unpacklo_epi8(a.raw, b.raw)}; +} +HWY_API Vec256<uint16_t> InterleaveLower(Vec256<uint16_t> a, + Vec256<uint16_t> b) { + return Vec256<uint16_t>{_mm256_unpacklo_epi16(a.raw, b.raw)}; +} +HWY_API Vec256<uint32_t> InterleaveLower(Vec256<uint32_t> a, + Vec256<uint32_t> b) { + return Vec256<uint32_t>{_mm256_unpacklo_epi32(a.raw, b.raw)}; +} +HWY_API Vec256<uint64_t> InterleaveLower(Vec256<uint64_t> a, + Vec256<uint64_t> b) { + return Vec256<uint64_t>{_mm256_unpacklo_epi64(a.raw, b.raw)}; +} + +HWY_API Vec256<int8_t> InterleaveLower(Vec256<int8_t> a, Vec256<int8_t> b) { + return Vec256<int8_t>{_mm256_unpacklo_epi8(a.raw, b.raw)}; +} +HWY_API Vec256<int16_t> InterleaveLower(Vec256<int16_t> a, Vec256<int16_t> b) { + return Vec256<int16_t>{_mm256_unpacklo_epi16(a.raw, b.raw)}; +} +HWY_API Vec256<int32_t> InterleaveLower(Vec256<int32_t> a, Vec256<int32_t> b) { + return Vec256<int32_t>{_mm256_unpacklo_epi32(a.raw, b.raw)}; +} +HWY_API Vec256<int64_t> InterleaveLower(Vec256<int64_t> a, Vec256<int64_t> b) { + return Vec256<int64_t>{_mm256_unpacklo_epi64(a.raw, b.raw)}; +} + +HWY_API Vec256<float> InterleaveLower(Vec256<float> a, Vec256<float> b) { + return Vec256<float>{_mm256_unpacklo_ps(a.raw, b.raw)}; +} +HWY_API Vec256<double> InterleaveLower(Vec256<double> a, Vec256<double> b) { + return Vec256<double>{_mm256_unpacklo_pd(a.raw, b.raw)}; +} + +// ------------------------------ InterleaveUpper + +// All functions inside detail lack the required D parameter. 
+namespace detail { + +HWY_API Vec256<uint8_t> InterleaveUpper(Vec256<uint8_t> a, Vec256<uint8_t> b) { + return Vec256<uint8_t>{_mm256_unpackhi_epi8(a.raw, b.raw)}; +} +HWY_API Vec256<uint16_t> InterleaveUpper(Vec256<uint16_t> a, + Vec256<uint16_t> b) { + return Vec256<uint16_t>{_mm256_unpackhi_epi16(a.raw, b.raw)}; +} +HWY_API Vec256<uint32_t> InterleaveUpper(Vec256<uint32_t> a, + Vec256<uint32_t> b) { + return Vec256<uint32_t>{_mm256_unpackhi_epi32(a.raw, b.raw)}; +} +HWY_API Vec256<uint64_t> InterleaveUpper(Vec256<uint64_t> a, + Vec256<uint64_t> b) { + return Vec256<uint64_t>{_mm256_unpackhi_epi64(a.raw, b.raw)}; +} + +HWY_API Vec256<int8_t> InterleaveUpper(Vec256<int8_t> a, Vec256<int8_t> b) { + return Vec256<int8_t>{_mm256_unpackhi_epi8(a.raw, b.raw)}; +} +HWY_API Vec256<int16_t> InterleaveUpper(Vec256<int16_t> a, Vec256<int16_t> b) { + return Vec256<int16_t>{_mm256_unpackhi_epi16(a.raw, b.raw)}; +} +HWY_API Vec256<int32_t> InterleaveUpper(Vec256<int32_t> a, Vec256<int32_t> b) { + return Vec256<int32_t>{_mm256_unpackhi_epi32(a.raw, b.raw)}; +} +HWY_API Vec256<int64_t> InterleaveUpper(Vec256<int64_t> a, Vec256<int64_t> b) { + return Vec256<int64_t>{_mm256_unpackhi_epi64(a.raw, b.raw)}; +} + +HWY_API Vec256<float> InterleaveUpper(Vec256<float> a, Vec256<float> b) { + return Vec256<float>{_mm256_unpackhi_ps(a.raw, b.raw)}; +} +HWY_API Vec256<double> InterleaveUpper(Vec256<double> a, Vec256<double> b) { + return Vec256<double>{_mm256_unpackhi_pd(a.raw, b.raw)}; +} + +} // namespace detail + +template <class D, typename T = TFromD<D>> +HWY_API Vec256<T> InterleaveUpper(D /* tag */, Vec256<T> a, Vec256<T> b) { + return detail::InterleaveUpper(a, b); +} + +// ------------------------------ ZipLower/ZipUpper (InterleaveLower) + +// Same as Interleave*, except that the return lanes are double-width integers; +// this is necessary because the single-lane scalar cannot return two values. 
+template <typename T, typename TW = MakeWide<T>> +HWY_API Vec256<TW> ZipLower(Vec256<T> a, Vec256<T> b) { + const Full256<TW> dw; + return BitCast(dw, InterleaveLower(a, b)); +} +template <class DW, typename TN = MakeNarrow<TFromD<DW>>> +HWY_API VFromD<DW> ZipLower(DW dw, Vec256<TN> a, Vec256<TN> b) { + return BitCast(dw, InterleaveLower(a, b)); +} + +template <class DW, typename TN = MakeNarrow<TFromD<DW>>> +HWY_API VFromD<DW> ZipUpper(DW dw, Vec256<TN> a, Vec256<TN> b) { + const RepartitionToNarrow<decltype(dw)> dn; + return BitCast(dw, InterleaveUpper(dn, a, b)); +} + +// ------------------------------ Blocks (LowerHalf, ZeroExtendVector) + +// _mm256_broadcastsi128_si256 has 7 cycle latency on ICL. +// _mm256_permute2x128_si256 is slow on Zen1 (8 uops), so we avoid it (at no +// extra cost) for LowerLower and UpperLower. + +// hiH,hiL loH,loL |-> hiL,loL (= lower halves) +template <class D, typename T = TFromD<D>> +HWY_API Vec256<T> ConcatLowerLower(D d, Vec256<T> hi, Vec256<T> lo) { + const Half<decltype(d)> d2; + return Vec256<T>{_mm256_inserti128_si256(lo.raw, LowerHalf(d2, hi).raw, 1)}; +} +template <class D> +HWY_API Vec256<float> ConcatLowerLower(D d, Vec256<float> hi, + Vec256<float> lo) { + const Half<decltype(d)> d2; + return Vec256<float>{_mm256_insertf128_ps(lo.raw, LowerHalf(d2, hi).raw, 1)}; +} +template <class D> +HWY_API Vec256<double> ConcatLowerLower(D d, Vec256<double> hi, + Vec256<double> lo) { + const Half<decltype(d)> d2; + return Vec256<double>{_mm256_insertf128_pd(lo.raw, LowerHalf(d2, hi).raw, 1)}; +} + +// hiH,hiL loH,loL |-> hiL,loH (= inner halves / swap blocks) +template <class D, typename T = TFromD<D>> +HWY_API Vec256<T> ConcatLowerUpper(D /* tag */, Vec256<T> hi, Vec256<T> lo) { + return Vec256<T>{_mm256_permute2x128_si256(lo.raw, hi.raw, 0x21)}; +} +template <class D> +HWY_API Vec256<float> ConcatLowerUpper(D /* tag */, Vec256<float> hi, + Vec256<float> lo) { + return Vec256<float>{_mm256_permute2f128_ps(lo.raw, hi.raw, 0x21)}; 
+} +template <class D> +HWY_API Vec256<double> ConcatLowerUpper(D /* tag */, Vec256<double> hi, + Vec256<double> lo) { + return Vec256<double>{_mm256_permute2f128_pd(lo.raw, hi.raw, 0x21)}; +} + +// hiH,hiL loH,loL |-> hiH,loL (= outer halves) +template <class D, typename T = TFromD<D>> +HWY_API Vec256<T> ConcatUpperLower(D /* tag */, Vec256<T> hi, Vec256<T> lo) { + return Vec256<T>{_mm256_blend_epi32(hi.raw, lo.raw, 0x0F)}; +} +template <class D> +HWY_API Vec256<float> ConcatUpperLower(D /* tag */, Vec256<float> hi, + Vec256<float> lo) { + return Vec256<float>{_mm256_blend_ps(hi.raw, lo.raw, 0x0F)}; +} +template <class D> +HWY_API Vec256<double> ConcatUpperLower(D /* tag */, Vec256<double> hi, + Vec256<double> lo) { + return Vec256<double>{_mm256_blend_pd(hi.raw, lo.raw, 3)}; +} + +// hiH,hiL loH,loL |-> hiH,loH (= upper halves) +template <class D, typename T = TFromD<D>> +HWY_API Vec256<T> ConcatUpperUpper(D /* tag */, Vec256<T> hi, Vec256<T> lo) { + return Vec256<T>{_mm256_permute2x128_si256(lo.raw, hi.raw, 0x31)}; +} +template <class D> +HWY_API Vec256<float> ConcatUpperUpper(D /* tag */, Vec256<float> hi, + Vec256<float> lo) { + return Vec256<float>{_mm256_permute2f128_ps(lo.raw, hi.raw, 0x31)}; +} +template <class D> +HWY_API Vec256<double> ConcatUpperUpper(D /* tag */, Vec256<double> hi, + Vec256<double> lo) { + return Vec256<double>{_mm256_permute2f128_pd(lo.raw, hi.raw, 0x31)}; +} + +// ------------------------------ ConcatOdd + +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec256<T> ConcatOdd(D d, Vec256<T> hi, Vec256<T> lo) { + const RebindToUnsigned<decltype(d)> du; +#if HWY_TARGET <= HWY_AVX3_DL + alignas(32) static constexpr uint8_t kIdx[32] = { + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, + 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63}; + return BitCast( + d, Vec256<uint16_t>{_mm256_permutex2var_epi8( + BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)}); +#else + const 
RepartitionToWide<decltype(du)> dw; + // Unsigned 8-bit shift so we can pack. + const Vec256<uint16_t> uH = ShiftRight<8>(BitCast(dw, hi)); + const Vec256<uint16_t> uL = ShiftRight<8>(BitCast(dw, lo)); + const __m256i u8 = _mm256_packus_epi16(uL.raw, uH.raw); + return Vec256<T>{_mm256_permute4x64_epi64(u8, _MM_SHUFFLE(3, 1, 2, 0))}; +#endif +} + +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)> +HWY_API Vec256<T> ConcatOdd(D d, Vec256<T> hi, Vec256<T> lo) { + const RebindToUnsigned<decltype(d)> du; +#if HWY_TARGET <= HWY_AVX3 + alignas(32) static constexpr uint16_t kIdx[16] = { + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}; + return BitCast( + d, Vec256<uint16_t>{_mm256_permutex2var_epi16( + BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)}); +#else + const RepartitionToWide<decltype(du)> dw; + // Unsigned 16-bit shift so we can pack. + const Vec256<uint32_t> uH = ShiftRight<16>(BitCast(dw, hi)); + const Vec256<uint32_t> uL = ShiftRight<16>(BitCast(dw, lo)); + const __m256i u16 = _mm256_packus_epi32(uL.raw, uH.raw); + return Vec256<T>{_mm256_permute4x64_epi64(u16, _MM_SHUFFLE(3, 1, 2, 0))}; +#endif +} + +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec256<T> ConcatOdd(D d, Vec256<T> hi, Vec256<T> lo) { + const RebindToUnsigned<decltype(d)> du; +#if HWY_TARGET <= HWY_AVX3 + alignas(32) static constexpr uint32_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15}; + return BitCast( + d, Vec256<uint32_t>{_mm256_permutex2var_epi32( + BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)}); +#else + const RebindToFloat<decltype(d)> df; + const Vec256<float> v3131{_mm256_shuffle_ps( + BitCast(df, lo).raw, BitCast(df, hi).raw, _MM_SHUFFLE(3, 1, 3, 1))}; + return Vec256<T>{_mm256_permute4x64_epi64(BitCast(du, v3131).raw, + _MM_SHUFFLE(3, 1, 2, 0))}; +#endif +} + +template <class D> +HWY_API Vec256<float> ConcatOdd(D d, Vec256<float> hi, Vec256<float> lo) { + const RebindToUnsigned<decltype(d)> du; +#if 
HWY_TARGET <= HWY_AVX3 + alignas(32) static constexpr uint32_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15}; + return Vec256<float>{ + _mm256_permutex2var_ps(lo.raw, Load(du, kIdx).raw, hi.raw)}; +#else + const Vec256<float> v3131{ + _mm256_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(3, 1, 3, 1))}; + return BitCast(d, Vec256<uint32_t>{_mm256_permute4x64_epi64( + BitCast(du, v3131).raw, _MM_SHUFFLE(3, 1, 2, 0))}); +#endif +} + +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)> +HWY_API Vec256<T> ConcatOdd(D d, Vec256<T> hi, Vec256<T> lo) { + const RebindToUnsigned<decltype(d)> du; +#if HWY_TARGET <= HWY_AVX3 + alignas(64) static constexpr uint64_t kIdx[4] = {1, 3, 5, 7}; + return BitCast( + d, Vec256<uint64_t>{_mm256_permutex2var_epi64( + BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)}); +#else + const RebindToFloat<decltype(d)> df; + const Vec256<double> v31{ + _mm256_shuffle_pd(BitCast(df, lo).raw, BitCast(df, hi).raw, 15)}; + return Vec256<T>{ + _mm256_permute4x64_epi64(BitCast(du, v31).raw, _MM_SHUFFLE(3, 1, 2, 0))}; +#endif +} + +template <class D> +HWY_API Vec256<double> ConcatOdd(D d, Vec256<double> hi, Vec256<double> lo) { +#if HWY_TARGET <= HWY_AVX3 + const RebindToUnsigned<decltype(d)> du; + alignas(64) static constexpr uint64_t kIdx[4] = {1, 3, 5, 7}; + return Vec256<double>{ + _mm256_permutex2var_pd(lo.raw, Load(du, kIdx).raw, hi.raw)}; +#else + (void)d; + const Vec256<double> v31{_mm256_shuffle_pd(lo.raw, hi.raw, 15)}; + return Vec256<double>{ + _mm256_permute4x64_pd(v31.raw, _MM_SHUFFLE(3, 1, 2, 0))}; +#endif +} + +// ------------------------------ ConcatEven + +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec256<T> ConcatEven(D d, Vec256<T> hi, Vec256<T> lo) { + const RebindToUnsigned<decltype(d)> du; +#if HWY_TARGET <= HWY_AVX3_DL + alignas(64) static constexpr uint8_t kIdx[32] = { + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 
62}; + return BitCast( + d, Vec256<uint32_t>{_mm256_permutex2var_epi8( + BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)}); +#else + const RepartitionToWide<decltype(du)> dw; + // Isolate lower 8 bits per u16 so we can pack. + const Vec256<uint16_t> mask = Set(dw, 0x00FF); + const Vec256<uint16_t> uH = And(BitCast(dw, hi), mask); + const Vec256<uint16_t> uL = And(BitCast(dw, lo), mask); + const __m256i u8 = _mm256_packus_epi16(uL.raw, uH.raw); + return Vec256<T>{_mm256_permute4x64_epi64(u8, _MM_SHUFFLE(3, 1, 2, 0))}; +#endif +} + +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)> +HWY_API Vec256<T> ConcatEven(D d, Vec256<T> hi, Vec256<T> lo) { + const RebindToUnsigned<decltype(d)> du; +#if HWY_TARGET <= HWY_AVX3 + alignas(64) static constexpr uint16_t kIdx[16] = { + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30}; + return BitCast( + d, Vec256<uint32_t>{_mm256_permutex2var_epi16( + BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)}); +#else + const RepartitionToWide<decltype(du)> dw; + // Isolate lower 16 bits per u32 so we can pack. 
+ const Vec256<uint32_t> mask = Set(dw, 0x0000FFFF); + const Vec256<uint32_t> uH = And(BitCast(dw, hi), mask); + const Vec256<uint32_t> uL = And(BitCast(dw, lo), mask); + const __m256i u16 = _mm256_packus_epi32(uL.raw, uH.raw); + return Vec256<T>{_mm256_permute4x64_epi64(u16, _MM_SHUFFLE(3, 1, 2, 0))}; +#endif +} + +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec256<T> ConcatEven(D d, Vec256<T> hi, Vec256<T> lo) { + const RebindToUnsigned<decltype(d)> du; +#if HWY_TARGET <= HWY_AVX3 + alignas(64) static constexpr uint32_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14}; + return BitCast( + d, Vec256<uint32_t>{_mm256_permutex2var_epi32( + BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)}); +#else + const RebindToFloat<decltype(d)> df; + const Vec256<float> v2020{_mm256_shuffle_ps( + BitCast(df, lo).raw, BitCast(df, hi).raw, _MM_SHUFFLE(2, 0, 2, 0))}; + return Vec256<T>{_mm256_permute4x64_epi64(BitCast(du, v2020).raw, + _MM_SHUFFLE(3, 1, 2, 0))}; + +#endif +} + +template <class D> +HWY_API Vec256<float> ConcatEven(D d, Vec256<float> hi, Vec256<float> lo) { + const RebindToUnsigned<decltype(d)> du; +#if HWY_TARGET <= HWY_AVX3 + alignas(64) static constexpr uint32_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14}; + return Vec256<float>{ + _mm256_permutex2var_ps(lo.raw, Load(du, kIdx).raw, hi.raw)}; +#else + const Vec256<float> v2020{ + _mm256_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(2, 0, 2, 0))}; + return BitCast(d, Vec256<uint32_t>{_mm256_permute4x64_epi64( + BitCast(du, v2020).raw, _MM_SHUFFLE(3, 1, 2, 0))}); + +#endif +} + +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)> +HWY_API Vec256<T> ConcatEven(D d, Vec256<T> hi, Vec256<T> lo) { + const RebindToUnsigned<decltype(d)> du; +#if HWY_TARGET <= HWY_AVX3 + alignas(64) static constexpr uint64_t kIdx[4] = {0, 2, 4, 6}; + return BitCast( + d, Vec256<uint64_t>{_mm256_permutex2var_epi64( + BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)}); +#else + const 
RebindToFloat<decltype(d)> df; + const Vec256<double> v20{ + _mm256_shuffle_pd(BitCast(df, lo).raw, BitCast(df, hi).raw, 0)}; + return Vec256<T>{ + _mm256_permute4x64_epi64(BitCast(du, v20).raw, _MM_SHUFFLE(3, 1, 2, 0))}; + +#endif +} + +template <class D> +HWY_API Vec256<double> ConcatEven(D d, Vec256<double> hi, Vec256<double> lo) { +#if HWY_TARGET <= HWY_AVX3 + const RebindToUnsigned<decltype(d)> du; + alignas(64) static constexpr uint64_t kIdx[4] = {0, 2, 4, 6}; + return Vec256<double>{ + _mm256_permutex2var_pd(lo.raw, Load(du, kIdx).raw, hi.raw)}; +#else + (void)d; + const Vec256<double> v20{_mm256_shuffle_pd(lo.raw, hi.raw, 0)}; + return Vec256<double>{ + _mm256_permute4x64_pd(v20.raw, _MM_SHUFFLE(3, 1, 2, 0))}; +#endif +} + +// ------------------------------ DupEven (InterleaveLower) + +template <typename T, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec256<T> DupEven(Vec256<T> v) { + return Vec256<T>{_mm256_shuffle_epi32(v.raw, _MM_SHUFFLE(2, 2, 0, 0))}; +} +HWY_API Vec256<float> DupEven(Vec256<float> v) { + return Vec256<float>{ + _mm256_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(2, 2, 0, 0))}; +} + +template <typename T, HWY_IF_T_SIZE(T, 8)> +HWY_API Vec256<T> DupEven(const Vec256<T> v) { + const DFromV<decltype(v)> d; + return InterleaveLower(d, v, v); +} + +// ------------------------------ DupOdd (InterleaveUpper) + +template <typename T, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec256<T> DupOdd(Vec256<T> v) { + return Vec256<T>{_mm256_shuffle_epi32(v.raw, _MM_SHUFFLE(3, 3, 1, 1))}; +} +HWY_API Vec256<float> DupOdd(Vec256<float> v) { + return Vec256<float>{ + _mm256_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(3, 3, 1, 1))}; +} + +template <typename T, HWY_IF_T_SIZE(T, 8)> +HWY_API Vec256<T> DupOdd(const Vec256<T> v) { + const DFromV<decltype(v)> d; + return InterleaveUpper(d, v, v); +} + +// ------------------------------ OddEven + +namespace detail { + +template <typename T> +HWY_INLINE Vec256<T> OddEven(hwy::SizeTag<1> /* tag */, const Vec256<T> a, + const Vec256<T> b) { + const 
DFromV<decltype(a)> d; + const Full256<uint8_t> d8; + alignas(32) static constexpr uint8_t mask[16] = { + 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0}; + return IfThenElse(MaskFromVec(BitCast(d, LoadDup128(d8, mask))), b, a); +} +template <typename T> +HWY_INLINE Vec256<T> OddEven(hwy::SizeTag<2> /* tag */, const Vec256<T> a, + const Vec256<T> b) { + return Vec256<T>{_mm256_blend_epi16(a.raw, b.raw, 0x55)}; +} +template <typename T> +HWY_INLINE Vec256<T> OddEven(hwy::SizeTag<4> /* tag */, const Vec256<T> a, + const Vec256<T> b) { + return Vec256<T>{_mm256_blend_epi32(a.raw, b.raw, 0x55)}; +} +template <typename T> +HWY_INLINE Vec256<T> OddEven(hwy::SizeTag<8> /* tag */, const Vec256<T> a, + const Vec256<T> b) { + return Vec256<T>{_mm256_blend_epi32(a.raw, b.raw, 0x33)}; +} + +} // namespace detail + +template <typename T> +HWY_API Vec256<T> OddEven(const Vec256<T> a, const Vec256<T> b) { + return detail::OddEven(hwy::SizeTag<sizeof(T)>(), a, b); +} +HWY_API Vec256<float> OddEven(const Vec256<float> a, const Vec256<float> b) { + return Vec256<float>{_mm256_blend_ps(a.raw, b.raw, 0x55)}; +} + +HWY_API Vec256<double> OddEven(const Vec256<double> a, const Vec256<double> b) { + return Vec256<double>{_mm256_blend_pd(a.raw, b.raw, 5)}; +} + +// ------------------------------ OddEvenBlocks + +template <typename T> +Vec256<T> OddEvenBlocks(Vec256<T> odd, Vec256<T> even) { + return Vec256<T>{_mm256_blend_epi32(odd.raw, even.raw, 0xFu)}; +} + +HWY_API Vec256<float> OddEvenBlocks(Vec256<float> odd, Vec256<float> even) { + return Vec256<float>{_mm256_blend_ps(odd.raw, even.raw, 0xFu)}; +} + +HWY_API Vec256<double> OddEvenBlocks(Vec256<double> odd, Vec256<double> even) { + return Vec256<double>{_mm256_blend_pd(odd.raw, even.raw, 0x3u)}; +} + +// ------------------------------ ReverseBlocks (SwapAdjacentBlocks) + +template <class D, typename T = TFromD<D>> +HWY_API Vec256<T> ReverseBlocks(D /*d*/, Vec256<T> v) { + return SwapAdjacentBlocks(v); +} + +// 
------------------------------ TableLookupBytes (ZeroExtendVector) + +// Both full +template <typename T, typename TI> +HWY_API Vec256<TI> TableLookupBytes(Vec256<T> bytes, Vec256<TI> from) { + return Vec256<TI>{_mm256_shuffle_epi8(bytes.raw, from.raw)}; +} + +// Partial index vector +template <typename T, typename TI, size_t NI> +HWY_API Vec128<TI, NI> TableLookupBytes(Vec256<T> bytes, Vec128<TI, NI> from) { + const Full256<TI> di; + const Half<decltype(di)> dih; + // First expand to full 128, then 256. + const auto from_256 = ZeroExtendVector(di, Vec128<TI>{from.raw}); + const auto tbl_full = TableLookupBytes(bytes, from_256); + // Shrink to 128, then partial. + return Vec128<TI, NI>{LowerHalf(dih, tbl_full).raw}; +} + +// Partial table vector +template <typename T, size_t N, typename TI> +HWY_API Vec256<TI> TableLookupBytes(Vec128<T, N> bytes, Vec256<TI> from) { + const Full256<T> d; + // First expand to full 128, then 256. + const auto bytes_256 = ZeroExtendVector(d, Vec128<T>{bytes.raw}); + return TableLookupBytes(bytes_256, from); +} + +// Partial both are handled by x86_128. + +// ------------------------------ Shl (Mul, ZipLower) + +namespace detail { + +#if HWY_TARGET > HWY_AVX3 && !HWY_IDE // AVX2 or older + +// Returns 2^v for use as per-lane multipliers to emulate 16-bit shifts. +template <typename T> +HWY_INLINE Vec256<MakeUnsigned<T>> Pow2(const Vec256<T> v) { + static_assert(sizeof(T) == 2, "Only for 16-bit"); + const DFromV<decltype(v)> d; + const RepartitionToWide<decltype(d)> dw; + const Rebind<float, decltype(dw)> df; + const auto zero = Zero(d); + // Move into exponent (this u16 will become the upper half of an f32) + const auto exp = ShiftLeft<23 - 16>(v); + const auto upper = exp + Set(d, 0x3F80); // upper half of 1.0f + // Insert 0 into lower halves for reinterpreting as binary32. 
+ const auto f0 = ZipLower(dw, zero, upper); + const auto f1 = ZipUpper(dw, zero, upper); + // Do not use ConvertTo because it checks for overflow, which is redundant + // because we only care about v in [0, 16). + const Vec256<int32_t> bits0{_mm256_cvttps_epi32(BitCast(df, f0).raw)}; + const Vec256<int32_t> bits1{_mm256_cvttps_epi32(BitCast(df, f1).raw)}; + return Vec256<MakeUnsigned<T>>{_mm256_packus_epi32(bits0.raw, bits1.raw)}; +} + +#endif // HWY_TARGET > HWY_AVX3 + +HWY_INLINE Vec256<uint16_t> Shl(hwy::UnsignedTag /*tag*/, Vec256<uint16_t> v, + Vec256<uint16_t> bits) { +#if HWY_TARGET <= HWY_AVX3 || HWY_IDE + return Vec256<uint16_t>{_mm256_sllv_epi16(v.raw, bits.raw)}; +#else + return v * Pow2(bits); +#endif +} + +// 8-bit: may use the Shl overload for uint16_t. +HWY_API Vec256<uint8_t> Shl(hwy::UnsignedTag tag, Vec256<uint8_t> v, + Vec256<uint8_t> bits) { + const DFromV<decltype(v)> d; +#if HWY_TARGET <= HWY_AVX3_DL + (void)tag; + // kMask[i] = 0xFF >> i + alignas(16) static constexpr uint8_t kMasks[16] = { + 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01, 0x00}; + // kShl[i] = 1 << i + alignas(16) static constexpr uint8_t kShl[16] = {1, 2, 4, 8, 0x10, + 0x20, 0x40, 0x80, 0x00}; + v = And(v, TableLookupBytes(LoadDup128(d, kMasks), bits)); + const VFromD<decltype(d)> mul = TableLookupBytes(LoadDup128(d, kShl), bits); + return VFromD<decltype(d)>{_mm256_gf2p8mul_epi8(v.raw, mul.raw)}; +#else + const Repartition<uint16_t, decltype(d)> dw; + using VW = VFromD<decltype(dw)>; + const VW mask = Set(dw, 0x00FF); + const VW vw = BitCast(dw, v); + const VW bits16 = BitCast(dw, bits); + const VW evens = Shl(tag, And(vw, mask), And(bits16, mask)); + // Shift odd lanes in-place + const VW odds = Shl(tag, vw, ShiftRight<8>(bits16)); + return BitCast(d, IfVecThenElse(Set(dw, 0xFF00), odds, evens)); +#endif +} + +HWY_INLINE Vec256<uint32_t> Shl(hwy::UnsignedTag /*tag*/, Vec256<uint32_t> v, + Vec256<uint32_t> bits) { + return Vec256<uint32_t>{_mm256_sllv_epi32(v.raw, 
bits.raw)}; +} + +HWY_INLINE Vec256<uint64_t> Shl(hwy::UnsignedTag /*tag*/, Vec256<uint64_t> v, + Vec256<uint64_t> bits) { + return Vec256<uint64_t>{_mm256_sllv_epi64(v.raw, bits.raw)}; +} + +template <typename T> +HWY_INLINE Vec256<T> Shl(hwy::SignedTag /*tag*/, Vec256<T> v, Vec256<T> bits) { + // Signed left shifts are the same as unsigned. + const Full256<T> di; + const Full256<MakeUnsigned<T>> du; + return BitCast(di, + Shl(hwy::UnsignedTag(), BitCast(du, v), BitCast(du, bits))); +} + +} // namespace detail + +template <typename T> +HWY_API Vec256<T> operator<<(Vec256<T> v, Vec256<T> bits) { + return detail::Shl(hwy::TypeTag<T>(), v, bits); +} + +// ------------------------------ Shr (MulHigh, IfThenElse, Not) + +HWY_API Vec256<uint16_t> operator>>(Vec256<uint16_t> v, Vec256<uint16_t> bits) { +#if HWY_TARGET <= HWY_AVX3 || HWY_IDE + return Vec256<uint16_t>{_mm256_srlv_epi16(v.raw, bits.raw)}; +#else + Full256<uint16_t> d; + // For bits=0, we cannot mul by 2^16, so fix the result later. + auto out = MulHigh(v, detail::Pow2(Set(d, 16) - bits)); + // Replace output with input where bits == 0. + return IfThenElse(bits == Zero(d), v, out); +#endif +} + +// 8-bit uses 16-bit shifts. 
+HWY_API Vec256<uint8_t> operator>>(Vec256<uint8_t> v, Vec256<uint8_t> bits) { + const DFromV<decltype(v)> d; + const RepartitionToWide<decltype(d)> dw; + using VW = VFromD<decltype(dw)>; + const VW mask = Set(dw, 0x00FF); + const VW vw = BitCast(dw, v); + const VW bits16 = BitCast(dw, bits); + const VW evens = And(vw, mask) >> And(bits16, mask); + // Shift odd lanes in-place + const VW odds = vw >> ShiftRight<8>(bits16); + return BitCast(d, IfVecThenElse(Set(dw, 0xFF00), odds, evens)); +} + +HWY_API Vec256<uint32_t> operator>>(Vec256<uint32_t> v, Vec256<uint32_t> bits) { + return Vec256<uint32_t>{_mm256_srlv_epi32(v.raw, bits.raw)}; +} + +HWY_API Vec256<uint64_t> operator>>(Vec256<uint64_t> v, Vec256<uint64_t> bits) { + return Vec256<uint64_t>{_mm256_srlv_epi64(v.raw, bits.raw)}; +} + +HWY_API Vec256<int16_t> operator>>(Vec256<int16_t> v, Vec256<int16_t> bits) { +#if HWY_TARGET <= HWY_AVX3 + return Vec256<int16_t>{_mm256_srav_epi16(v.raw, bits.raw)}; +#else + const DFromV<decltype(v)> d; + return detail::SignedShr(d, v, bits); +#endif +} + +HWY_API Vec256<int32_t> operator>>(Vec256<int32_t> v, Vec256<int32_t> bits) { + return Vec256<int32_t>{_mm256_srav_epi32(v.raw, bits.raw)}; +} + +HWY_API Vec256<int64_t> operator>>(Vec256<int64_t> v, Vec256<int64_t> bits) { +#if HWY_TARGET <= HWY_AVX3 + return Vec256<int64_t>{_mm256_srav_epi64(v.raw, bits.raw)}; +#else + const DFromV<decltype(v)> d; + return detail::SignedShr(d, v, bits); +#endif +} + +HWY_INLINE Vec256<uint64_t> MulEven(const Vec256<uint64_t> a, + const Vec256<uint64_t> b) { + const Full256<uint64_t> du64; + const RepartitionToNarrow<decltype(du64)> du32; + const auto maskL = Set(du64, 0xFFFFFFFFULL); + const auto a32 = BitCast(du32, a); + const auto b32 = BitCast(du32, b); + // Inputs for MulEven: we only need the lower 32 bits + const auto aH = Shuffle2301(a32); + const auto bH = Shuffle2301(b32); + + // Knuth double-word multiplication. 
We use 32x32 = 64 MulEven and only need + // the even (lower 64 bits of every 128-bit block) results. See + // https://github.com/hcs0/Hackers-Delight/blob/master/muldwu.c.txt + const auto aLbL = MulEven(a32, b32); + const auto w3 = aLbL & maskL; + + const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL); + const auto w2 = t2 & maskL; + const auto w1 = ShiftRight<32>(t2); + + const auto t = MulEven(a32, bH) + w2; + const auto k = ShiftRight<32>(t); + + const auto mulH = MulEven(aH, bH) + w1 + k; + const auto mulL = ShiftLeft<32>(t) + w3; + return InterleaveLower(mulL, mulH); +} + +HWY_INLINE Vec256<uint64_t> MulOdd(const Vec256<uint64_t> a, + const Vec256<uint64_t> b) { + const Full256<uint64_t> du64; + const RepartitionToNarrow<decltype(du64)> du32; + const auto maskL = Set(du64, 0xFFFFFFFFULL); + const auto a32 = BitCast(du32, a); + const auto b32 = BitCast(du32, b); + // Inputs for MulEven: we only need bits [95:64] (= upper half of input) + const auto aH = Shuffle2301(a32); + const auto bH = Shuffle2301(b32); + + // Same as above, but we're using the odd results (upper 64 bits per block). 
+ const auto aLbL = MulEven(a32, b32); + const auto w3 = aLbL & maskL; + + const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL); + const auto w2 = t2 & maskL; + const auto w1 = ShiftRight<32>(t2); + + const auto t = MulEven(a32, bH) + w2; + const auto k = ShiftRight<32>(t); + + const auto mulH = MulEven(aH, bH) + w1 + k; + const auto mulL = ShiftLeft<32>(t) + w3; + return InterleaveUpper(du64, mulL, mulH); +} + +// ------------------------------ WidenMulPairwiseAdd +template <class D, HWY_IF_SIGNED_D(D)> +HWY_API Vec256<int32_t> WidenMulPairwiseAdd(D /*d32*/, Vec256<int16_t> a, + Vec256<int16_t> b) { + return Vec256<int32_t>{_mm256_madd_epi16(a.raw, b.raw)}; +} + +// ------------------------------ ReorderWidenMulAccumulate +template <class D, HWY_IF_SIGNED_D(D)> +HWY_API Vec256<int32_t> ReorderWidenMulAccumulate(D d, Vec256<int16_t> a, + Vec256<int16_t> b, + const Vec256<int32_t> sum0, + Vec256<int32_t>& /*sum1*/) { + (void)d; +#if HWY_TARGET <= HWY_AVX3_DL + return Vec256<int32_t>{_mm256_dpwssd_epi32(sum0.raw, a.raw, b.raw)}; +#else + return sum0 + WidenMulPairwiseAdd(d, a, b); +#endif +} + +// ------------------------------ RearrangeToOddPlusEven +HWY_API Vec256<int32_t> RearrangeToOddPlusEven(const Vec256<int32_t> sum0, + Vec256<int32_t> /*sum1*/) { + return sum0; // invariant already holds +} + +// ================================================== CONVERT + +// ------------------------------ Promotions (part w/ narrow lanes -> full) + +template <class D, HWY_IF_F64_D(D)> +HWY_API Vec256<double> PromoteTo(D /* tag */, Vec128<float> v) { + return Vec256<double>{_mm256_cvtps_pd(v.raw)}; +} + +template <class D, HWY_IF_F64_D(D)> +HWY_API Vec256<double> PromoteTo(D /* tag */, Vec128<int32_t> v) { + return Vec256<double>{_mm256_cvtepi32_pd(v.raw)}; +} + +// Unsigned: zero-extend. +// Note: these have 3 cycle latency; if inputs are already split across the +// 128 bit blocks (in their upper/lower halves), then Zip* would be faster. 
+template <class D, HWY_IF_U16_D(D)> +HWY_API Vec256<uint16_t> PromoteTo(D /* tag */, Vec128<uint8_t> v) { + return Vec256<uint16_t>{_mm256_cvtepu8_epi16(v.raw)}; +} +template <class D, HWY_IF_U32_D(D)> +HWY_API Vec256<uint32_t> PromoteTo(D /* tag */, Vec128<uint8_t, 8> v) { + return Vec256<uint32_t>{_mm256_cvtepu8_epi32(v.raw)}; +} +template <class D, HWY_IF_U32_D(D)> +HWY_API Vec256<uint32_t> PromoteTo(D /* tag */, Vec128<uint16_t> v) { + return Vec256<uint32_t>{_mm256_cvtepu16_epi32(v.raw)}; +} +template <class D, HWY_IF_U64_D(D)> +HWY_API Vec256<uint64_t> PromoteTo(D /* tag */, Vec128<uint32_t> v) { + return Vec256<uint64_t>{_mm256_cvtepu32_epi64(v.raw)}; +} +template <class D, HWY_IF_U64_D(D)> +HWY_API Vec256<uint64_t> PromoteTo(D /* tag */, Vec64<uint16_t> v) { + return Vec256<uint64_t>{_mm256_cvtepu16_epi64(v.raw)}; +} +template <class D, HWY_IF_U64_D(D)> +HWY_API Vec256<uint64_t> PromoteTo(D /* tag */, Vec32<uint8_t> v) { + return Vec256<uint64_t>{_mm256_cvtepu8_epi64(v.raw)}; +} + + +// Signed: replicate sign bit. +// Note: these have 3 cycle latency; if inputs are already split across the +// 128 bit blocks (in their upper/lower halves), then ZipUpper/lo followed by +// signed shift would be faster. 
+template <class D, HWY_IF_I16_D(D)> +HWY_API Vec256<int16_t> PromoteTo(D /* tag */, Vec128<int8_t> v) { + return Vec256<int16_t>{_mm256_cvtepi8_epi16(v.raw)}; +} +template <class D, HWY_IF_I32_D(D)> +HWY_API Vec256<int32_t> PromoteTo(D /* tag */, Vec128<int8_t, 8> v) { + return Vec256<int32_t>{_mm256_cvtepi8_epi32(v.raw)}; +} +template <class D, HWY_IF_I32_D(D)> +HWY_API Vec256<int32_t> PromoteTo(D /* tag */, Vec128<int16_t> v) { + return Vec256<int32_t>{_mm256_cvtepi16_epi32(v.raw)}; +} +template <class D, HWY_IF_I64_D(D)> +HWY_API Vec256<int64_t> PromoteTo(D /* tag */, Vec128<int32_t> v) { + return Vec256<int64_t>{_mm256_cvtepi32_epi64(v.raw)}; +} +template <class D, HWY_IF_I64_D(D)> +HWY_API Vec256<int64_t> PromoteTo(D /* tag */, Vec64<int16_t> v) { + return Vec256<int64_t>{_mm256_cvtepi16_epi64(v.raw)}; +} +template <class D, HWY_IF_I64_D(D)> +HWY_API Vec256<int64_t> PromoteTo(D /* tag */, Vec32<int8_t> v) { + return Vec256<int64_t>{_mm256_cvtepi8_epi64(v.raw)}; +} + +// ------------------------------ Demotions (full -> part w/ narrow lanes) + +template <class D, HWY_IF_U16_D(D)> +HWY_API Vec128<uint16_t> DemoteTo(D /* tag */, Vec256<int32_t> v) { + const __m256i u16 = _mm256_packus_epi32(v.raw, v.raw); + // Concatenating lower halves of both 128-bit blocks afterward is more + // efficient than an extra input with low block = high block of v. 
+ return Vec128<uint16_t>{ + _mm256_castsi256_si128(_mm256_permute4x64_epi64(u16, 0x88))}; +} + +template <class D, HWY_IF_U16_D(D)> +HWY_API Vec128<uint16_t> DemoteTo(D dn, Vec256<uint32_t> v) { + const DFromV<decltype(v)> d; + const RebindToSigned<decltype(d)> di; + return DemoteTo(dn, BitCast(di, Min(v, Set(d, 0x7FFFFFFFu)))); +} + +template <class D, HWY_IF_I16_D(D)> +HWY_API Vec128<int16_t> DemoteTo(D /* tag */, Vec256<int32_t> v) { + const __m256i i16 = _mm256_packs_epi32(v.raw, v.raw); + return Vec128<int16_t>{ + _mm256_castsi256_si128(_mm256_permute4x64_epi64(i16, 0x88))}; +} + +template <class D, HWY_IF_U8_D(D)> +HWY_API Vec64<uint8_t> DemoteTo(D /* tag */, Vec256<int32_t> v) { + const __m256i i16_blocks = _mm256_packs_epi32(v.raw, v.raw); + // Concatenate lower 64 bits of each 128-bit block + const __m256i i16_concat = _mm256_permute4x64_epi64(i16_blocks, 0x88); + const __m128i i16 = _mm256_castsi256_si128(i16_concat); + return Vec64<uint8_t>{_mm_packus_epi16(i16, i16)}; +} + +template <class D, HWY_IF_U8_D(D)> +HWY_API Vec64<uint8_t> DemoteTo(D dn, Vec256<uint32_t> v) { +#if HWY_TARGET <= HWY_AVX3 + (void)dn; + return Vec64<uint8_t>{_mm256_cvtusepi32_epi8(v.raw)}; +#else + const DFromV<decltype(v)> d; + const RebindToSigned<decltype(d)> di; + return DemoteTo(dn, BitCast(di, Min(v, Set(d, 0x7FFFFFFFu)))); +#endif +} + +template <class D, HWY_IF_U8_D(D)> +HWY_API Vec128<uint8_t> DemoteTo(D /* tag */, Vec256<int16_t> v) { + const __m256i u8 = _mm256_packus_epi16(v.raw, v.raw); + return Vec128<uint8_t>{ + _mm256_castsi256_si128(_mm256_permute4x64_epi64(u8, 0x88))}; +} + +template <class D, HWY_IF_U8_D(D)> +HWY_API Vec128<uint8_t> DemoteTo(D dn, Vec256<uint16_t> v) { + const DFromV<decltype(v)> d; + const RebindToSigned<decltype(d)> di; + return DemoteTo(dn, BitCast(di, Min(v, Set(d, 0x7FFFu)))); +} + +template <class D, HWY_IF_I8_D(D)> +HWY_API Vec64<int8_t> DemoteTo(D /* tag */, Vec256<int32_t> v) { + const __m256i i16_blocks = _mm256_packs_epi32(v.raw, 
v.raw); + // Concatenate lower 64 bits of each 128-bit block + const __m256i i16_concat = _mm256_permute4x64_epi64(i16_blocks, 0x88); + const __m128i i16 = _mm256_castsi256_si128(i16_concat); + return Vec128<int8_t, 8>{_mm_packs_epi16(i16, i16)}; +} + +template <class D, HWY_IF_I8_D(D)> +HWY_API Vec128<int8_t> DemoteTo(D /* tag */, Vec256<int16_t> v) { + const __m256i i8 = _mm256_packs_epi16(v.raw, v.raw); + return Vec128<int8_t>{ + _mm256_castsi256_si128(_mm256_permute4x64_epi64(i8, 0x88))}; +} + +#if HWY_TARGET <= HWY_AVX3 +template <class D, HWY_IF_I32_D(D)> +HWY_API Vec128<int32_t> DemoteTo(D /* tag */, Vec256<int64_t> v) { + return Vec128<int32_t>{_mm256_cvtsepi64_epi32(v.raw)}; +} +template <class D, HWY_IF_I16_D(D)> +HWY_API Vec64<int16_t> DemoteTo(D /* tag */, Vec256<int64_t> v) { + return Vec64<int16_t>{_mm256_cvtsepi64_epi16(v.raw)}; +} +template <class D, HWY_IF_I8_D(D)> +HWY_API Vec32<int8_t> DemoteTo(D /* tag */, Vec256<int64_t> v) { + return Vec32<int8_t>{_mm256_cvtsepi64_epi8(v.raw)}; +} + +template <class D, HWY_IF_U32_D(D)> +HWY_API Vec128<uint32_t> DemoteTo(D /* tag */, Vec256<int64_t> v) { + const auto neg_mask = MaskFromVec(v); +#if HWY_COMPILER_HAS_MASK_INTRINSICS + const __mmask8 non_neg_mask = _knot_mask8(neg_mask.raw); +#else + const __mmask8 non_neg_mask = static_cast<__mmask8>(~neg_mask.raw); +#endif + return Vec128<uint32_t>{_mm256_maskz_cvtusepi64_epi32(non_neg_mask, v.raw)}; +} +template <class D, HWY_IF_U16_D(D)> +HWY_API Vec64<uint16_t> DemoteTo(D /* tag */, Vec256<int64_t> v) { + const auto neg_mask = MaskFromVec(v); +#if HWY_COMPILER_HAS_MASK_INTRINSICS + const __mmask8 non_neg_mask = _knot_mask8(neg_mask.raw); +#else + const __mmask8 non_neg_mask = static_cast<__mmask8>(~neg_mask.raw); +#endif + return Vec64<uint16_t>{_mm256_maskz_cvtusepi64_epi16(non_neg_mask, v.raw)}; +} +template <class D, HWY_IF_U8_D(D)> +HWY_API Vec32<uint8_t> DemoteTo(D /* tag */, Vec256<int64_t> v) { + const auto neg_mask = MaskFromVec(v); +#if 
HWY_COMPILER_HAS_MASK_INTRINSICS + const __mmask8 non_neg_mask = _knot_mask8(neg_mask.raw); +#else + const __mmask8 non_neg_mask = static_cast<__mmask8>(~neg_mask.raw); +#endif + return Vec32<uint8_t>{_mm256_maskz_cvtusepi64_epi8(non_neg_mask, v.raw)}; +} + +template <class D, HWY_IF_U32_D(D)> +HWY_API Vec128<uint32_t> DemoteTo(D /* tag */, Vec256<uint64_t> v) { + return Vec128<uint32_t>{_mm256_cvtusepi64_epi32(v.raw)}; +} +template <class D, HWY_IF_U16_D(D)> +HWY_API Vec64<uint16_t> DemoteTo(D /* tag */, Vec256<uint64_t> v) { + return Vec64<uint16_t>{_mm256_cvtusepi64_epi16(v.raw)}; +} +template <class D, HWY_IF_U8_D(D)> +HWY_API Vec32<uint8_t> DemoteTo(D /* tag */, Vec256<uint64_t> v) { + return Vec32<uint8_t>{_mm256_cvtusepi64_epi8(v.raw)}; +} +#endif // HWY_TARGET <= HWY_AVX3 + +// Avoid "value of intrinsic immediate argument '8' is out of range '0 - 7'". +// 8 is the correct value of _MM_FROUND_NO_EXC, which is allowed here. +HWY_DIAGNOSTICS(push) +HWY_DIAGNOSTICS_OFF(disable : 4556, ignored "-Wsign-conversion") + +template <class D, HWY_IF_F16_D(D)> +HWY_API Vec128<float16_t> DemoteTo(D df16, Vec256<float> v) { +#ifdef HWY_DISABLE_F16C + const RebindToUnsigned<decltype(df16)> du16; + const Rebind<uint32_t, decltype(df16)> du; + const RebindToSigned<decltype(du)> di; + const auto bits32 = BitCast(du, v); + const auto sign = ShiftRight<31>(bits32); + const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF); + const auto mantissa32 = bits32 & Set(du, 0x7FFFFF); + + const auto k15 = Set(di, 15); + const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15); + const auto is_tiny = exp < Set(di, -24); + + const auto is_subnormal = exp < Set(di, -14); + const auto biased_exp16 = + BitCast(du, IfThenZeroElse(is_subnormal, exp + k15)); + const auto sub_exp = BitCast(du, Set(di, -14) - exp); // [1, 11) + const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) + + (mantissa32 >> (Set(du, 13) + sub_exp)); + const auto mantissa16 = 
IfThenElse(RebindMask(du, is_subnormal), sub_m, + ShiftRight<13>(mantissa32)); // <1024 + + const auto sign16 = ShiftLeft<15>(sign); + const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16; + const auto bits16 = IfThenZeroElse(is_tiny, BitCast(di, normal16)); + return BitCast(df16, DemoteTo(du16, bits16)); +#else + (void)df16; + return Vec128<float16_t>{_mm256_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)}; +#endif +} + +HWY_DIAGNOSTICS(pop) + +template <class D, HWY_IF_BF16_D(D)> +HWY_API Vec128<bfloat16_t> DemoteTo(D dbf16, Vec256<float> v) { + // TODO(janwas): _mm256_cvtneps_pbh once we have avx512bf16. + const Rebind<int32_t, decltype(dbf16)> di32; + const Rebind<uint32_t, decltype(dbf16)> du32; // for logical shift right + const Rebind<uint16_t, decltype(dbf16)> du16; + const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v))); + return BitCast(dbf16, DemoteTo(du16, bits_in_32)); +} + +HWY_API Vec256<bfloat16_t> ReorderDemote2To(Full256<bfloat16_t> dbf16, + Vec256<float> a, Vec256<float> b) { + // TODO(janwas): _mm256_cvtne2ps_pbh once we have avx512bf16. 
+ const RebindToUnsigned<decltype(dbf16)> du16; + const Repartition<uint32_t, decltype(dbf16)> du32; + const Vec256<uint32_t> b_in_even = ShiftRight<16>(BitCast(du32, b)); + return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even))); +} + +template <class D, HWY_IF_I16_D(D)> +HWY_API Vec256<int16_t> ReorderDemote2To(D /*d16*/, Vec256<int32_t> a, + Vec256<int32_t> b) { + return Vec256<int16_t>{_mm256_packs_epi32(a.raw, b.raw)}; +} + +template <class D, HWY_IF_U16_D(D)> +HWY_API Vec256<uint16_t> ReorderDemote2To(D /*d16*/, Vec256<int32_t> a, + Vec256<int32_t> b) { + return Vec256<uint16_t>{_mm256_packus_epi32(a.raw, b.raw)}; +} + +template <class D, HWY_IF_U16_D(D)> +HWY_API Vec256<uint16_t> ReorderDemote2To(D dn, Vec256<uint32_t> a, + Vec256<uint32_t> b) { + const DFromV<decltype(a)> d; + const RebindToSigned<decltype(d)> di; + const auto max_i32 = Set(d, 0x7FFFFFFFu); + return ReorderDemote2To(dn, BitCast(di, Min(a, max_i32)), + BitCast(di, Min(b, max_i32))); +} + +template <class D, HWY_IF_I8_D(D)> +HWY_API Vec256<int8_t> ReorderDemote2To(D /*d16*/, Vec256<int16_t> a, + Vec256<int16_t> b) { + return Vec256<int8_t>{_mm256_packs_epi16(a.raw, b.raw)}; +} + +template <class D, HWY_IF_U8_D(D)> +HWY_API Vec256<uint8_t> ReorderDemote2To(D /*d16*/, Vec256<int16_t> a, + Vec256<int16_t> b) { + return Vec256<uint8_t>{_mm256_packus_epi16(a.raw, b.raw)}; +} + +template <class D, HWY_IF_U8_D(D)> +HWY_API Vec256<uint8_t> ReorderDemote2To(D dn, Vec256<uint16_t> a, + Vec256<uint16_t> b) { + const DFromV<decltype(a)> d; + const RebindToSigned<decltype(d)> di; + const auto max_i16 = Set(d, 0x7FFFu); + return ReorderDemote2To(dn, BitCast(di, Min(a, max_i16)), + BitCast(di, Min(b, max_i16))); +} + +#if HWY_TARGET > HWY_AVX3 +template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)> +HWY_API Vec256<int32_t> ReorderDemote2To(D dn, Vec256<int64_t> a, + Vec256<int64_t> b) { + const DFromV<decltype(a)> di64; + const RebindToUnsigned<decltype(di64)> du64; + const 
Half<decltype(dn)> dnh; + const Repartition<float, decltype(dn)> dn_f; + + // Negative values are saturated by first saturating their bitwise inverse + // and then inverting the saturation result + const auto invert_mask_a = BitCast(du64, BroadcastSignBit(a)); + const auto invert_mask_b = BitCast(du64, BroadcastSignBit(b)); + const auto saturated_a = Xor( + invert_mask_a, + detail::DemoteFromU64Saturate(dnh, Xor(invert_mask_a, BitCast(du64, a)))); + const auto saturated_b = Xor( + invert_mask_b, + detail::DemoteFromU64Saturate(dnh, Xor(invert_mask_b, BitCast(du64, b)))); + + return BitCast(dn, + Vec256<float>{_mm256_shuffle_ps(BitCast(dn_f, saturated_a).raw, + BitCast(dn_f, saturated_b).raw, + _MM_SHUFFLE(2, 0, 2, 0))}); +} + +template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U32_D(D)> +HWY_API Vec256<uint32_t> ReorderDemote2To(D dn, Vec256<int64_t> a, + Vec256<int64_t> b) { + const DFromV<decltype(a)> di64; + const RebindToUnsigned<decltype(di64)> du64; + const Half<decltype(dn)> dnh; + const Repartition<float, decltype(dn)> dn_f; + + const auto saturated_a = detail::DemoteFromU64Saturate( + dnh, BitCast(du64, AndNot(BroadcastSignBit(a), a))); + const auto saturated_b = detail::DemoteFromU64Saturate( + dnh, BitCast(du64, AndNot(BroadcastSignBit(b), b))); + + return BitCast(dn, + Vec256<float>{_mm256_shuffle_ps(BitCast(dn_f, saturated_a).raw, + BitCast(dn_f, saturated_b).raw, + _MM_SHUFFLE(2, 0, 2, 0))}); +} + +template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U32_D(D)> +HWY_API Vec256<uint32_t> ReorderDemote2To(D dn, Vec256<uint64_t> a, + Vec256<uint64_t> b) { + const Half<decltype(dn)> dnh; + const Repartition<float, decltype(dn)> dn_f; + + const auto saturated_a = detail::DemoteFromU64Saturate(dnh, a); + const auto saturated_b = detail::DemoteFromU64Saturate(dnh, b); + + return BitCast(dn, + Vec256<float>{_mm256_shuffle_ps(BitCast(dn_f, saturated_a).raw, + BitCast(dn_f, saturated_b).raw, + _MM_SHUFFLE(2, 0, 2, 0))}); +} +#endif // HWY_TARGET > HWY_AVX3 + 
+template <class D, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>), + HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), + HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2), + HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2), + HWY_IF_T_SIZE_ONE_OF_V(V, + (1 << 1) | (1 << 2) | (1 << 4) | + ((HWY_TARGET > HWY_AVX3) ? (1 << 8) : 0))> +HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) { + return VFromD<D>{_mm256_permute4x64_epi64(ReorderDemote2To(d, a, b).raw, + _MM_SHUFFLE(3, 1, 2, 0))}; +} + +template <class D, HWY_IF_F32_D(D)> +HWY_API Vec128<float> DemoteTo(D /* tag */, Vec256<double> v) { + return Vec128<float>{_mm256_cvtpd_ps(v.raw)}; +} + +template <class D, HWY_IF_I32_D(D)> +HWY_API Vec128<int32_t> DemoteTo(D /* tag */, Vec256<double> v) { + const Full256<double> d64; + const auto clamped = detail::ClampF64ToI32Max(d64, v); + return Vec128<int32_t>{_mm256_cvttpd_epi32(clamped.raw)}; +} + +// For already range-limited input [0, 255]. +HWY_API Vec128<uint8_t, 8> U8FromU32(const Vec256<uint32_t> v) { + const Full256<uint32_t> d32; + const Full64<uint8_t> d8; + alignas(32) static constexpr uint32_t k8From32[8] = { + 0x0C080400u, ~0u, ~0u, ~0u, ~0u, 0x0C080400u, ~0u, ~0u}; + // Place first four bytes in lo[0], remaining 4 in hi[1]. + const auto quad = TableLookupBytes(v, Load(d32, k8From32)); + // Interleave both quadruplets - OR instead of unpack reduces port5 pressure. + const auto lo = LowerHalf(quad); + const auto hi = UpperHalf(Half<decltype(d32)>(), quad); + return BitCast(d8, LowerHalf(lo | hi)); +} + +// ------------------------------ Truncations + +namespace detail { + +// LO and HI each hold four indices of bytes within a 128-bit block. 
+template <uint32_t LO, uint32_t HI, typename T> +HWY_INLINE Vec128<uint32_t> LookupAndConcatHalves(Vec256<T> v) { + const Full256<uint32_t> d32; + +#if HWY_TARGET <= HWY_AVX3_DL + alignas(32) static constexpr uint32_t kMap[8] = { + LO, HI, 0x10101010 + LO, 0x10101010 + HI, 0, 0, 0, 0}; + const auto result = _mm256_permutexvar_epi8(Load(d32, kMap).raw, v.raw); +#else + alignas(32) static constexpr uint32_t kMap[8] = {LO, HI, ~0u, ~0u, + ~0u, ~0u, LO, HI}; + const auto quad = TableLookupBytes(v, Load(d32, kMap)); + const auto result = _mm256_permute4x64_epi64(quad.raw, 0xCC); + // Possible alternative: + // const auto lo = LowerHalf(quad); + // const auto hi = UpperHalf(Half<decltype(d32)>(), quad); + // const auto result = lo | hi; +#endif + + return Vec128<uint32_t>{_mm256_castsi256_si128(result)}; +} + +// LO and HI each hold two indices of bytes within a 128-bit block. +template <uint16_t LO, uint16_t HI, typename T> +HWY_INLINE Vec128<uint32_t, 2> LookupAndConcatQuarters(Vec256<T> v) { + const Full256<uint16_t> d16; + +#if HWY_TARGET <= HWY_AVX3_DL + alignas(32) static constexpr uint16_t kMap[16] = { + LO, HI, 0x1010 + LO, 0x1010 + HI, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + const auto result = _mm256_permutexvar_epi8(Load(d16, kMap).raw, v.raw); + return LowerHalf(Vec128<uint32_t>{_mm256_castsi256_si128(result)}); +#else + constexpr uint16_t ff = static_cast<uint16_t>(~0u); + alignas(32) static constexpr uint16_t kMap[16] = { + LO, ff, HI, ff, ff, ff, ff, ff, ff, ff, ff, ff, LO, ff, HI, ff}; + const auto quad = TableLookupBytes(v, Load(d16, kMap)); + const auto mixed = _mm256_permute4x64_epi64(quad.raw, 0xCC); + const auto half = _mm256_castsi256_si128(mixed); + return LowerHalf(Vec128<uint32_t>{_mm_packus_epi32(half, half)}); +#endif +} + +} // namespace detail + +template <class D, HWY_IF_U8_D(D)> +HWY_API Vec32<uint8_t> TruncateTo(D /* tag */, Vec256<uint64_t> v) { + const Full256<uint32_t> d32; +#if HWY_TARGET <= HWY_AVX3_DL + alignas(32) static constexpr 
uint32_t kMap[8] = {0x18100800u, 0, 0, 0, + 0, 0, 0, 0}; + const auto result = _mm256_permutexvar_epi8(Load(d32, kMap).raw, v.raw); + return LowerHalf(LowerHalf(LowerHalf(Vec256<uint8_t>{result}))); +#else + alignas(32) static constexpr uint32_t kMap[8] = {0xFFFF0800u, ~0u, ~0u, ~0u, + 0x0800FFFFu, ~0u, ~0u, ~0u}; + const auto quad = TableLookupBytes(v, Load(d32, kMap)); + const auto lo = LowerHalf(quad); + const auto hi = UpperHalf(Half<decltype(d32)>(), quad); + const auto result = lo | hi; + return LowerHalf(LowerHalf(Vec128<uint8_t>{result.raw})); +#endif +} + +template <class D, HWY_IF_U16_D(D)> +HWY_API Vec64<uint16_t> TruncateTo(D /* tag */, Vec256<uint64_t> v) { + const auto result = detail::LookupAndConcatQuarters<0x100, 0x908>(v); + return Vec64<uint16_t>{result.raw}; +} + +template <class D, HWY_IF_U32_D(D)> +HWY_API Vec128<uint32_t> TruncateTo(D /* tag */, Vec256<uint64_t> v) { + const Full256<uint32_t> d32; + alignas(32) static constexpr uint32_t kEven[8] = {0, 2, 4, 6, 0, 2, 4, 6}; + const auto v32 = + TableLookupLanes(BitCast(d32, v), SetTableIndices(d32, kEven)); + return LowerHalf(Vec256<uint32_t>{v32.raw}); +} + +template <class D, HWY_IF_U8_D(D)> +HWY_API Vec64<uint8_t> TruncateTo(D /* tag */, Vec256<uint32_t> v) { + const auto full = detail::LookupAndConcatQuarters<0x400, 0xC08>(v); + return Vec64<uint8_t>{full.raw}; +} + +template <class D, HWY_IF_U16_D(D)> +HWY_API Vec128<uint16_t> TruncateTo(D /* tag */, Vec256<uint32_t> v) { + const auto full = detail::LookupAndConcatHalves<0x05040100, 0x0D0C0908>(v); + return Vec128<uint16_t>{full.raw}; +} + +template <class D, HWY_IF_U8_D(D)> +HWY_API Vec128<uint8_t> TruncateTo(D /* tag */, Vec256<uint16_t> v) { + const auto full = detail::LookupAndConcatHalves<0x06040200, 0x0E0C0A08>(v); + return Vec128<uint8_t>{full.raw}; +} + +// ------------------------------ Integer <=> fp (ShiftRight, OddEven) + +template <class D, HWY_IF_F32_D(D)> +HWY_API Vec256<float> ConvertTo(D /* tag */, Vec256<int32_t> v) { + 
return Vec256<float>{_mm256_cvtepi32_ps(v.raw)}; +} + +#if HWY_TARGET <= HWY_AVX3 +template <class D, HWY_IF_F32_D(D)> +HWY_API Vec256<float> ConvertTo(D /*df*/, Vec256<uint32_t> v) { + return Vec256<float>{_mm256_cvtepu32_ps(v.raw)}; +} + +template <class D, HWY_IF_F64_D(D)> +HWY_API Vec256<double> ConvertTo(D /*dd*/, Vec256<int64_t> v) { + return Vec256<double>{_mm256_cvtepi64_pd(v.raw)}; +} + +template <class D, HWY_IF_F64_D(D)> +HWY_API Vec256<double> ConvertTo(D /*dd*/, Vec256<uint64_t> v) { + return Vec256<double>{_mm256_cvtepu64_pd(v.raw)}; +} +#endif // HWY_TARGET <= HWY_AVX3 + +// Truncates (rounds toward zero). +template <class D, HWY_IF_I32_D(D)> +HWY_API Vec256<int32_t> ConvertTo(D d, Vec256<float> v) { + return detail::FixConversionOverflow(d, v, _mm256_cvttps_epi32(v.raw)); +} + +#if HWY_TARGET <= HWY_AVX3 +template <class D, HWY_IF_I64_D(D)> +HWY_API Vec256<int64_t> ConvertTo(D di, Vec256<double> v) { + return detail::FixConversionOverflow(di, v, _mm256_cvttpd_epi64(v.raw)); +} +#endif + +HWY_API Vec256<int32_t> NearestInt(const Vec256<float> v) { + const Full256<int32_t> di; + return detail::FixConversionOverflow(di, v, _mm256_cvtps_epi32(v.raw)); +} + +template <class D, HWY_IF_F32_D(D)> +HWY_API Vec256<float> PromoteTo(D df32, Vec128<float16_t> v) { +#ifdef HWY_DISABLE_F16C + const RebindToSigned<decltype(df32)> di32; + const RebindToUnsigned<decltype(df32)> du32; + // Expand to u32 so we can shift. 
+ const auto bits16 = PromoteTo(du32, Vec128<uint16_t>{v.raw}); + const auto sign = ShiftRight<15>(bits16); + const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F); + const auto mantissa = bits16 & Set(du32, 0x3FF); + const auto subnormal = + BitCast(du32, ConvertTo(df32, BitCast(di32, mantissa)) * + Set(df32, 1.0f / 16384 / 1024)); + + const auto biased_exp32 = biased_exp + Set(du32, 127 - 15); + const auto mantissa32 = ShiftLeft<23 - 10>(mantissa); + const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32; + const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal); + return BitCast(df32, ShiftLeft<31>(sign) | bits32); +#else + (void)df32; + return Vec256<float>{_mm256_cvtph_ps(v.raw)}; +#endif +} + +template <class D, HWY_IF_F32_D(D)> +HWY_API Vec256<float> PromoteTo(D df32, Vec128<bfloat16_t> v) { + const Rebind<uint16_t, decltype(df32)> du16; + const RebindToSigned<decltype(df32)> di32; + return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v)))); +} + +// ================================================== CRYPTO + +#if !defined(HWY_DISABLE_PCLMUL_AES) + +HWY_API Vec256<uint8_t> AESRound(Vec256<uint8_t> state, + Vec256<uint8_t> round_key) { +#if HWY_TARGET <= HWY_AVX3_DL + return Vec256<uint8_t>{_mm256_aesenc_epi128(state.raw, round_key.raw)}; +#else + const Full256<uint8_t> d; + const Half<decltype(d)> d2; + return Combine(d, AESRound(UpperHalf(d2, state), UpperHalf(d2, round_key)), + AESRound(LowerHalf(state), LowerHalf(round_key))); +#endif +} + +HWY_API Vec256<uint8_t> AESLastRound(Vec256<uint8_t> state, + Vec256<uint8_t> round_key) { +#if HWY_TARGET <= HWY_AVX3_DL + return Vec256<uint8_t>{_mm256_aesenclast_epi128(state.raw, round_key.raw)}; +#else + const Full256<uint8_t> d; + const Half<decltype(d)> d2; + return Combine(d, + AESLastRound(UpperHalf(d2, state), UpperHalf(d2, round_key)), + AESLastRound(LowerHalf(state), LowerHalf(round_key))); +#endif +} + +HWY_API Vec256<uint8_t> AESRoundInv(Vec256<uint8_t> 
state, + Vec256<uint8_t> round_key) { +#if HWY_TARGET <= HWY_AVX3_DL + return Vec256<uint8_t>{_mm256_aesdec_epi128(state.raw, round_key.raw)}; +#else + const Full256<uint8_t> d; + const Half<decltype(d)> d2; + return Combine(d, AESRoundInv(UpperHalf(d2, state), UpperHalf(d2, round_key)), + AESRoundInv(LowerHalf(state), LowerHalf(round_key))); +#endif +} + +HWY_API Vec256<uint8_t> AESLastRoundInv(Vec256<uint8_t> state, + Vec256<uint8_t> round_key) { +#if HWY_TARGET <= HWY_AVX3_DL + return Vec256<uint8_t>{_mm256_aesdeclast_epi128(state.raw, round_key.raw)}; +#else + const Full256<uint8_t> d; + const Half<decltype(d)> d2; + return Combine( + d, AESLastRoundInv(UpperHalf(d2, state), UpperHalf(d2, round_key)), + AESLastRoundInv(LowerHalf(state), LowerHalf(round_key))); +#endif +} + +template <class V, HWY_IF_V_SIZE_GT_V(V, 16), HWY_IF_U8_D(DFromV<V>)> +HWY_API V AESInvMixColumns(V state) { + const DFromV<decltype(state)> d; +#if HWY_TARGET <= HWY_AVX3_DL + // On AVX3_DL, it is more efficient to do an InvMixColumns operation for a + // 256-bit or 512-bit vector by doing a AESLastRound operation + // (_mm256_aesenclast_epi128/_mm512_aesenclast_epi128) followed by a + // AESRoundInv operation (_mm256_aesdec_epi128/_mm512_aesdec_epi128) than to + // split the vector into 128-bit vectors, carrying out multiple + // _mm_aesimc_si128 operations, and then combining the _mm_aesimc_si128 + // results back into a 256-bit or 512-bit vector. 
+ const auto zero = Zero(d); + return AESRoundInv(AESLastRound(state, zero), zero); +#else + const Half<decltype(d)> dh; + return Combine(d, AESInvMixColumns(UpperHalf(dh, state)), + AESInvMixColumns(LowerHalf(dh, state))); +#endif +} + +template <uint8_t kRcon> +HWY_API Vec256<uint8_t> AESKeyGenAssist(Vec256<uint8_t> v) { + const Full256<uint8_t> d; +#if HWY_TARGET <= HWY_AVX3_DL + alignas(16) static constexpr uint8_t kRconXorMask[16] = { + 0, kRcon, 0, 0, 0, 0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0, 0}; + alignas(16) static constexpr uint8_t kRotWordShuffle[16] = { + 0, 13, 10, 7, 1, 14, 11, 4, 8, 5, 2, 15, 9, 6, 3, 12}; + const Repartition<uint32_t, decltype(d)> du32; + const auto w13 = BitCast(d, DupOdd(BitCast(du32, v))); + const auto sub_word_result = AESLastRound(w13, Load(d, kRconXorMask)); + return TableLookupBytes(sub_word_result, Load(d, kRotWordShuffle)); +#else + const Half<decltype(d)> d2; + return Combine(d, AESKeyGenAssist<kRcon>(UpperHalf(d2, v)), + AESKeyGenAssist<kRcon>(LowerHalf(v))); +#endif +} + +HWY_API Vec256<uint64_t> CLMulLower(Vec256<uint64_t> a, Vec256<uint64_t> b) { +#if HWY_TARGET <= HWY_AVX3_DL + return Vec256<uint64_t>{_mm256_clmulepi64_epi128(a.raw, b.raw, 0x00)}; +#else + const Full256<uint64_t> d; + const Half<decltype(d)> d2; + return Combine(d, CLMulLower(UpperHalf(d2, a), UpperHalf(d2, b)), + CLMulLower(LowerHalf(a), LowerHalf(b))); +#endif +} + +HWY_API Vec256<uint64_t> CLMulUpper(Vec256<uint64_t> a, Vec256<uint64_t> b) { +#if HWY_TARGET <= HWY_AVX3_DL + return Vec256<uint64_t>{_mm256_clmulepi64_epi128(a.raw, b.raw, 0x11)}; +#else + const Full256<uint64_t> d; + const Half<decltype(d)> d2; + return Combine(d, CLMulUpper(UpperHalf(d2, a), UpperHalf(d2, b)), + CLMulUpper(LowerHalf(a), LowerHalf(b))); +#endif +} + +#endif // HWY_DISABLE_PCLMUL_AES + +// ================================================== MISC + +#if HWY_TARGET <= HWY_AVX3 + +// ------------------------------ LoadMaskBits + +// `p` points to at least 8 readable bytes, not 
all of which need be valid. +template <class D, HWY_IF_V_SIZE_D(D, 32), typename T = TFromD<D>> +HWY_API Mask256<T> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) { + constexpr size_t kN = MaxLanes(d); + constexpr size_t kNumBytes = (kN + 7) / 8; + + uint64_t mask_bits = 0; + CopyBytes<kNumBytes>(bits, &mask_bits); + + if (kN < 8) { + mask_bits &= (1ull << kN) - 1; + } + + return Mask256<T>::FromBits(mask_bits); +} + +// ------------------------------ StoreMaskBits + +// `bits` points to at least 8 writable bytes. +template <class D, typename T = TFromD<D>> +HWY_API size_t StoreMaskBits(D d, Mask256<T> mask, uint8_t* bits) { + constexpr size_t kN = MaxLanes(d); + constexpr size_t kNumBytes = (kN + 7) / 8; + + CopyBytes<kNumBytes>(&mask.raw, bits); + + // Non-full byte, need to clear the undefined upper bits. + if (kN < 8) { + const int mask_bits = static_cast<int>((1ull << kN) - 1); + bits[0] = static_cast<uint8_t>(bits[0] & mask_bits); + } + return kNumBytes; +} + +// ------------------------------ Mask testing + +template <class D, typename T = TFromD<D>> +HWY_API size_t CountTrue(D /* tag */, Mask256<T> mask) { + return PopCount(static_cast<uint64_t>(mask.raw)); +} + +template <class D, typename T = TFromD<D>> +HWY_API size_t FindKnownFirstTrue(D /* tag */, Mask256<T> mask) { + return Num0BitsBelowLS1Bit_Nonzero32(mask.raw); +} + +template <class D, typename T = TFromD<D>> +HWY_API intptr_t FindFirstTrue(D d, Mask256<T> mask) { + return mask.raw ? static_cast<intptr_t>(FindKnownFirstTrue(d, mask)) + : intptr_t{-1}; +} + +template <class D, typename T = TFromD<D>> +HWY_API size_t FindKnownLastTrue(D /* tag */, Mask256<T> mask) { + return 31 - Num0BitsAboveMS1Bit_Nonzero32(mask.raw); +} + +template <class D, typename T = TFromD<D>> +HWY_API intptr_t FindLastTrue(D d, Mask256<T> mask) { + return mask.raw ? static_cast<intptr_t>(FindKnownLastTrue(d, mask)) + : intptr_t{-1}; +} + +// Beware: the suffix indicates the number of mask bits, not lane size! 
+ +namespace detail { + +template <typename T> +HWY_INLINE bool AllFalse(hwy::SizeTag<1> /*tag*/, const Mask256<T> mask) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return _kortestz_mask32_u8(mask.raw, mask.raw); +#else + return mask.raw == 0; +#endif +} +template <typename T> +HWY_INLINE bool AllFalse(hwy::SizeTag<2> /*tag*/, const Mask256<T> mask) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return _kortestz_mask16_u8(mask.raw, mask.raw); +#else + return mask.raw == 0; +#endif +} +template <typename T> +HWY_INLINE bool AllFalse(hwy::SizeTag<4> /*tag*/, const Mask256<T> mask) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return _kortestz_mask8_u8(mask.raw, mask.raw); +#else + return mask.raw == 0; +#endif +} +template <typename T> +HWY_INLINE bool AllFalse(hwy::SizeTag<8> /*tag*/, const Mask256<T> mask) { + return (uint64_t{mask.raw} & 0xF) == 0; +} + +} // namespace detail + +template <class D, typename T = TFromD<D>> +HWY_API bool AllFalse(D /* tag */, Mask256<T> mask) { + return detail::AllFalse(hwy::SizeTag<sizeof(T)>(), mask); +} + +namespace detail { + +template <typename T> +HWY_INLINE bool AllTrue(hwy::SizeTag<1> /*tag*/, const Mask256<T> mask) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return _kortestc_mask32_u8(mask.raw, mask.raw); +#else + return mask.raw == 0xFFFFFFFFu; +#endif +} +template <typename T> +HWY_INLINE bool AllTrue(hwy::SizeTag<2> /*tag*/, const Mask256<T> mask) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return _kortestc_mask16_u8(mask.raw, mask.raw); +#else + return mask.raw == 0xFFFFu; +#endif +} +template <typename T> +HWY_INLINE bool AllTrue(hwy::SizeTag<4> /*tag*/, const Mask256<T> mask) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return _kortestc_mask8_u8(mask.raw, mask.raw); +#else + return mask.raw == 0xFFu; +#endif +} +template <typename T> +HWY_INLINE bool AllTrue(hwy::SizeTag<8> /*tag*/, const Mask256<T> mask) { + // Cannot use _kortestc because we have less than 8 mask bits. 
+ return mask.raw == 0xFu; +} + +} // namespace detail + +template <class D, typename T = TFromD<D>> +HWY_API bool AllTrue(D /* tag */, const Mask256<T> mask) { + return detail::AllTrue(hwy::SizeTag<sizeof(T)>(), mask); +} + +// ------------------------------ Compress + +// 16-bit is defined in x86_512 so we can use 512-bit vectors. + +template <typename T, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec256<T> Compress(Vec256<T> v, Mask256<T> mask) { + return Vec256<T>{_mm256_maskz_compress_epi32(mask.raw, v.raw)}; +} + +HWY_API Vec256<float> Compress(Vec256<float> v, Mask256<float> mask) { + return Vec256<float>{_mm256_maskz_compress_ps(mask.raw, v.raw)}; +} + +template <typename T, HWY_IF_T_SIZE(T, 8)> +HWY_API Vec256<T> Compress(Vec256<T> v, Mask256<T> mask) { + // See CompressIsPartition. + alignas(16) static constexpr uint64_t packed_array[16] = { + // PrintCompress64x4NibbleTables + 0x00003210, 0x00003210, 0x00003201, 0x00003210, 0x00003102, 0x00003120, + 0x00003021, 0x00003210, 0x00002103, 0x00002130, 0x00002031, 0x00002310, + 0x00001032, 0x00001320, 0x00000321, 0x00003210}; + + // For lane i, shift the i-th 4-bit index down to bits [0, 2) - + // _mm256_permutexvar_epi64 will ignore the upper bits. + const DFromV<decltype(v)> d; + const RebindToUnsigned<decltype(d)> du64; + const auto packed = Set(du64, packed_array[mask.raw]); + alignas(64) static constexpr uint64_t shifts[4] = {0, 4, 8, 12}; + const auto indices = Indices256<T>{(packed >> Load(du64, shifts)).raw}; + return TableLookupLanes(v, indices); +} + +// ------------------------------ CompressNot (Compress) + +// Implemented in x86_512 for lane size != 8. + +template <typename T, HWY_IF_T_SIZE(T, 8)> +HWY_API Vec256<T> CompressNot(Vec256<T> v, Mask256<T> mask) { + // See CompressIsPartition. 
+ alignas(16) static constexpr uint64_t packed_array[16] = { + // PrintCompressNot64x4NibbleTables + 0x00003210, 0x00000321, 0x00001320, 0x00001032, 0x00002310, 0x00002031, + 0x00002130, 0x00002103, 0x00003210, 0x00003021, 0x00003120, 0x00003102, + 0x00003210, 0x00003201, 0x00003210, 0x00003210}; + + // For lane i, shift the i-th 4-bit index down to bits [0, 2) - + // _mm256_permutexvar_epi64 will ignore the upper bits. + const DFromV<decltype(v)> d; + const RebindToUnsigned<decltype(d)> du64; + const auto packed = Set(du64, packed_array[mask.raw]); + alignas(32) static constexpr uint64_t shifts[4] = {0, 4, 8, 12}; + const auto indices = Indices256<T>{(packed >> Load(du64, shifts)).raw}; + return TableLookupLanes(v, indices); +} + +// ------------------------------ CompressStore (defined in x86_512) +// ------------------------------ CompressBlendedStore (defined in x86_512) +// ------------------------------ CompressBitsStore (defined in x86_512) + +#else // AVX2 + +// ------------------------------ LoadMaskBits (TestBit) + +namespace detail { + +// 256 suffix avoids ambiguity with x86_128 without needing HWY_IF_V_SIZE. +template <typename T, HWY_IF_T_SIZE(T, 1)> +HWY_INLINE Mask256<T> LoadMaskBits256(uint64_t mask_bits) { + const Full256<T> d; + const RebindToUnsigned<decltype(d)> du; + const Repartition<uint32_t, decltype(d)> du32; + const auto vbits = BitCast(du, Set(du32, static_cast<uint32_t>(mask_bits))); + + // Replicate bytes 8x such that each byte contains the bit that governs it. 
+ const Repartition<uint64_t, decltype(d)> du64; + alignas(32) static constexpr uint64_t kRep8[4] = { + 0x0000000000000000ull, 0x0101010101010101ull, 0x0202020202020202ull, + 0x0303030303030303ull}; + const auto rep8 = TableLookupBytes(vbits, BitCast(du, Load(du64, kRep8))); + + alignas(32) static constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128, + 1, 2, 4, 8, 16, 32, 64, 128}; + return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit))); +} + +template <typename T, HWY_IF_T_SIZE(T, 2)> +HWY_INLINE Mask256<T> LoadMaskBits256(uint64_t mask_bits) { + const Full256<T> d; + const RebindToUnsigned<decltype(d)> du; + alignas(32) static constexpr uint16_t kBit[16] = { + 1, 2, 4, 8, 16, 32, 64, 128, + 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000}; + const auto vmask_bits = Set(du, static_cast<uint16_t>(mask_bits)); + return RebindMask(d, TestBit(vmask_bits, Load(du, kBit))); +} + +template <typename T, HWY_IF_T_SIZE(T, 4)> +HWY_INLINE Mask256<T> LoadMaskBits256(uint64_t mask_bits) { + const Full256<T> d; + const RebindToUnsigned<decltype(d)> du; + alignas(32) static constexpr uint32_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128}; + const auto vmask_bits = Set(du, static_cast<uint32_t>(mask_bits)); + return RebindMask(d, TestBit(vmask_bits, Load(du, kBit))); +} + +template <typename T, HWY_IF_T_SIZE(T, 8)> +HWY_INLINE Mask256<T> LoadMaskBits256(uint64_t mask_bits) { + const Full256<T> d; + const RebindToUnsigned<decltype(d)> du; + alignas(32) static constexpr uint64_t kBit[8] = {1, 2, 4, 8}; + return RebindMask(d, TestBit(Set(du, mask_bits), Load(du, kBit))); +} + +} // namespace detail + +// `p` points to at least 8 readable bytes, not all of which need be valid. 
+template <class D, HWY_IF_V_SIZE_D(D, 32), typename T = TFromD<D>> +HWY_API Mask256<T> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) { + constexpr size_t kN = MaxLanes(d); + constexpr size_t kNumBytes = (kN + 7) / 8; + + uint64_t mask_bits = 0; + CopyBytes<kNumBytes>(bits, &mask_bits); + + if (kN < 8) { + mask_bits &= (1ull << kN) - 1; + } + + return detail::LoadMaskBits256<T>(mask_bits); +} + +// ------------------------------ StoreMaskBits + +namespace detail { + +template <typename T, HWY_IF_T_SIZE(T, 1)> +HWY_INLINE uint64_t BitsFromMask(const Mask256<T> mask) { + const Full256<T> d; + const Full256<uint8_t> d8; + const auto sign_bits = BitCast(d8, VecFromMask(d, mask)).raw; + // Prevent sign-extension of 32-bit masks because the intrinsic returns int. + return static_cast<uint32_t>(_mm256_movemask_epi8(sign_bits)); +} + +template <typename T, HWY_IF_T_SIZE(T, 2)> +HWY_INLINE uint64_t BitsFromMask(const Mask256<T> mask) { +#if !defined(HWY_DISABLE_BMI2_FMA) && !defined(HWY_DISABLE_PEXT_ON_AVX2) + const Full256<T> d; + const Full256<uint8_t> d8; + const Mask256<uint8_t> mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask))); + const uint64_t sign_bits8 = BitsFromMask(mask8); + // Skip the bits from the lower byte of each u16 (better not to use the + // same packs_epi16 as SSE4, because that requires an extra swizzle here). + return _pext_u32(static_cast<uint32_t>(sign_bits8), 0xAAAAAAAAu); +#else + // Slow workaround for when BMI2 is disabled + // Remove useless lower half of each u16 while preserving the sign bit. + // Bytes [0, 8) and [16, 24) have the same sign bits as the input lanes. + const auto sign_bits = _mm256_packs_epi16(mask.raw, _mm256_setzero_si256()); + // Move odd qwords (value zero) to top so they don't affect the mask value. 
+ const auto compressed = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(sign_bits, _MM_SHUFFLE(3, 1, 2, 0))); + return static_cast<unsigned>(_mm_movemask_epi8(compressed)); +#endif // HWY_ARCH_X86_64 +} + +template <typename T, HWY_IF_T_SIZE(T, 4)> +HWY_INLINE uint64_t BitsFromMask(const Mask256<T> mask) { + const Full256<T> d; + const Full256<float> df; + const auto sign_bits = BitCast(df, VecFromMask(d, mask)).raw; + return static_cast<unsigned>(_mm256_movemask_ps(sign_bits)); +} + +template <typename T, HWY_IF_T_SIZE(T, 8)> +HWY_INLINE uint64_t BitsFromMask(const Mask256<T> mask) { + const Full256<T> d; + const Full256<double> df; + const auto sign_bits = BitCast(df, VecFromMask(d, mask)).raw; + return static_cast<unsigned>(_mm256_movemask_pd(sign_bits)); +} + +} // namespace detail + +// `p` points to at least 8 writable bytes. +template <class D, typename T = TFromD<D>> +HWY_API size_t StoreMaskBits(D /* tag */, Mask256<T> mask, uint8_t* bits) { + constexpr size_t N = 32 / sizeof(T); + constexpr size_t kNumBytes = (N + 7) / 8; + + const uint64_t mask_bits = detail::BitsFromMask(mask); + CopyBytes<kNumBytes>(&mask_bits, bits); + return kNumBytes; +} + +// ------------------------------ Mask testing + +// Specialize for 16-bit lanes to avoid unnecessary pext. This assumes each mask +// lane is 0 or ~0. +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)> +HWY_API bool AllFalse(D d, const Mask256<T> mask) { + const Repartition<uint8_t, decltype(d)> d8; + const Mask256<uint8_t> mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask))); + return detail::BitsFromMask(mask8) == 0; +} + +template <class D, typename T = TFromD<D>, HWY_IF_NOT_T_SIZE(T, 2)> +HWY_API bool AllFalse(D /* tag */, const Mask256<T> mask) { + // Cheaper than PTEST, which is 2 uop / 3L. 
+ return detail::BitsFromMask(mask) == 0; +} + +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)> +HWY_API bool AllTrue(D d, const Mask256<T> mask) { + const Repartition<uint8_t, decltype(d)> d8; + const Mask256<uint8_t> mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask))); + return detail::BitsFromMask(mask8) == (1ull << 32) - 1; +} +template <class D, typename T = TFromD<D>, HWY_IF_NOT_T_SIZE(T, 2)> +HWY_API bool AllTrue(D /* tag */, const Mask256<T> mask) { + constexpr uint64_t kAllBits = (1ull << (32 / sizeof(T))) - 1; + return detail::BitsFromMask(mask) == kAllBits; +} + +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)> +HWY_API size_t CountTrue(D d, const Mask256<T> mask) { + const Repartition<uint8_t, decltype(d)> d8; + const Mask256<uint8_t> mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask))); + return PopCount(detail::BitsFromMask(mask8)) >> 1; +} +template <class D, typename T = TFromD<D>, HWY_IF_NOT_T_SIZE(T, 2)> +HWY_API size_t CountTrue(D /* tag */, const Mask256<T> mask) { + return PopCount(detail::BitsFromMask(mask)); +} + +template <class D, typename T = TFromD<D>> +HWY_API size_t FindKnownFirstTrue(D /* tag */, Mask256<T> mask) { + const uint32_t mask_bits = static_cast<uint32_t>(detail::BitsFromMask(mask)); + return Num0BitsBelowLS1Bit_Nonzero32(mask_bits); +} + +template <class D, typename T = TFromD<D>> +HWY_API intptr_t FindFirstTrue(D /* tag */, Mask256<T> mask) { + const uint32_t mask_bits = static_cast<uint32_t>(detail::BitsFromMask(mask)); + return mask_bits ? 
intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask_bits)) : -1; +} + +template <class D, typename T = TFromD<D>> +HWY_API size_t FindKnownLastTrue(D /* tag */, Mask256<T> mask) { + const uint32_t mask_bits = static_cast<uint32_t>(detail::BitsFromMask(mask)); + return 31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits); +} + +template <class D, typename T = TFromD<D>> +HWY_API intptr_t FindLastTrue(D /* tag */, Mask256<T> mask) { + const uint32_t mask_bits = static_cast<uint32_t>(detail::BitsFromMask(mask)); + return mask_bits ? intptr_t(31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits)) + : -1; +} + +// ------------------------------ Compress, CompressBits + +namespace detail { + +template <typename T, HWY_IF_T_SIZE(T, 4)> +HWY_INLINE Vec256<uint32_t> IndicesFromBits256(uint64_t mask_bits) { + const Full256<uint32_t> d32; + // We need a masked Iota(). With 8 lanes, there are 256 combinations and a LUT + // of SetTableIndices would require 8 KiB, a large part of L1D. The other + // alternative is _pext_u64, but this is extremely slow on Zen2 (18 cycles) + // and unavailable in 32-bit builds. We instead compress each index into 4 + // bits, for a total of 1 KiB. 
+ alignas(16) static constexpr uint32_t packed_array[256] = { + // PrintCompress32x8Tables + 0x76543210, 0x76543218, 0x76543209, 0x76543298, 0x7654310a, 0x765431a8, + 0x765430a9, 0x76543a98, 0x7654210b, 0x765421b8, 0x765420b9, 0x76542b98, + 0x765410ba, 0x76541ba8, 0x76540ba9, 0x7654ba98, 0x7653210c, 0x765321c8, + 0x765320c9, 0x76532c98, 0x765310ca, 0x76531ca8, 0x76530ca9, 0x7653ca98, + 0x765210cb, 0x76521cb8, 0x76520cb9, 0x7652cb98, 0x76510cba, 0x7651cba8, + 0x7650cba9, 0x765cba98, 0x7643210d, 0x764321d8, 0x764320d9, 0x76432d98, + 0x764310da, 0x76431da8, 0x76430da9, 0x7643da98, 0x764210db, 0x76421db8, + 0x76420db9, 0x7642db98, 0x76410dba, 0x7641dba8, 0x7640dba9, 0x764dba98, + 0x763210dc, 0x76321dc8, 0x76320dc9, 0x7632dc98, 0x76310dca, 0x7631dca8, + 0x7630dca9, 0x763dca98, 0x76210dcb, 0x7621dcb8, 0x7620dcb9, 0x762dcb98, + 0x7610dcba, 0x761dcba8, 0x760dcba9, 0x76dcba98, 0x7543210e, 0x754321e8, + 0x754320e9, 0x75432e98, 0x754310ea, 0x75431ea8, 0x75430ea9, 0x7543ea98, + 0x754210eb, 0x75421eb8, 0x75420eb9, 0x7542eb98, 0x75410eba, 0x7541eba8, + 0x7540eba9, 0x754eba98, 0x753210ec, 0x75321ec8, 0x75320ec9, 0x7532ec98, + 0x75310eca, 0x7531eca8, 0x7530eca9, 0x753eca98, 0x75210ecb, 0x7521ecb8, + 0x7520ecb9, 0x752ecb98, 0x7510ecba, 0x751ecba8, 0x750ecba9, 0x75ecba98, + 0x743210ed, 0x74321ed8, 0x74320ed9, 0x7432ed98, 0x74310eda, 0x7431eda8, + 0x7430eda9, 0x743eda98, 0x74210edb, 0x7421edb8, 0x7420edb9, 0x742edb98, + 0x7410edba, 0x741edba8, 0x740edba9, 0x74edba98, 0x73210edc, 0x7321edc8, + 0x7320edc9, 0x732edc98, 0x7310edca, 0x731edca8, 0x730edca9, 0x73edca98, + 0x7210edcb, 0x721edcb8, 0x720edcb9, 0x72edcb98, 0x710edcba, 0x71edcba8, + 0x70edcba9, 0x7edcba98, 0x6543210f, 0x654321f8, 0x654320f9, 0x65432f98, + 0x654310fa, 0x65431fa8, 0x65430fa9, 0x6543fa98, 0x654210fb, 0x65421fb8, + 0x65420fb9, 0x6542fb98, 0x65410fba, 0x6541fba8, 0x6540fba9, 0x654fba98, + 0x653210fc, 0x65321fc8, 0x65320fc9, 0x6532fc98, 0x65310fca, 0x6531fca8, + 0x6530fca9, 0x653fca98, 0x65210fcb, 0x6521fcb8, 
0x6520fcb9, 0x652fcb98, + 0x6510fcba, 0x651fcba8, 0x650fcba9, 0x65fcba98, 0x643210fd, 0x64321fd8, + 0x64320fd9, 0x6432fd98, 0x64310fda, 0x6431fda8, 0x6430fda9, 0x643fda98, + 0x64210fdb, 0x6421fdb8, 0x6420fdb9, 0x642fdb98, 0x6410fdba, 0x641fdba8, + 0x640fdba9, 0x64fdba98, 0x63210fdc, 0x6321fdc8, 0x6320fdc9, 0x632fdc98, + 0x6310fdca, 0x631fdca8, 0x630fdca9, 0x63fdca98, 0x6210fdcb, 0x621fdcb8, + 0x620fdcb9, 0x62fdcb98, 0x610fdcba, 0x61fdcba8, 0x60fdcba9, 0x6fdcba98, + 0x543210fe, 0x54321fe8, 0x54320fe9, 0x5432fe98, 0x54310fea, 0x5431fea8, + 0x5430fea9, 0x543fea98, 0x54210feb, 0x5421feb8, 0x5420feb9, 0x542feb98, + 0x5410feba, 0x541feba8, 0x540feba9, 0x54feba98, 0x53210fec, 0x5321fec8, + 0x5320fec9, 0x532fec98, 0x5310feca, 0x531feca8, 0x530feca9, 0x53feca98, + 0x5210fecb, 0x521fecb8, 0x520fecb9, 0x52fecb98, 0x510fecba, 0x51fecba8, + 0x50fecba9, 0x5fecba98, 0x43210fed, 0x4321fed8, 0x4320fed9, 0x432fed98, + 0x4310feda, 0x431feda8, 0x430feda9, 0x43feda98, 0x4210fedb, 0x421fedb8, + 0x420fedb9, 0x42fedb98, 0x410fedba, 0x41fedba8, 0x40fedba9, 0x4fedba98, + 0x3210fedc, 0x321fedc8, 0x320fedc9, 0x32fedc98, 0x310fedca, 0x31fedca8, + 0x30fedca9, 0x3fedca98, 0x210fedcb, 0x21fedcb8, 0x20fedcb9, 0x2fedcb98, + 0x10fedcba, 0x1fedcba8, 0x0fedcba9, 0xfedcba98}; + + // No need to mask because _mm256_permutevar8x32_epi32 ignores bits 3..31. + // Just shift each copy of the 32 bit LUT to extract its 4-bit fields. + // If broadcasting 32-bit from memory incurs the 3-cycle block-crossing + // latency, it may be faster to use LoadDup128 and PSHUFB. 
+ const auto packed = Set(d32, packed_array[mask_bits]); + alignas(32) static constexpr uint32_t shifts[8] = {0, 4, 8, 12, + 16, 20, 24, 28}; + return packed >> Load(d32, shifts); +} + +template <typename T, HWY_IF_T_SIZE(T, 8)> +HWY_INLINE Vec256<uint32_t> IndicesFromBits256(uint64_t mask_bits) { + const Full256<uint32_t> d32; + + // For 64-bit, we still need 32-bit indices because there is no 64-bit + // permutevar, but there are only 4 lanes, so we can afford to skip the + // unpacking and load the entire index vector directly. + alignas(32) static constexpr uint32_t u32_indices[128] = { + // PrintCompress64x4PairTables + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, + 10, 11, 0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 4, 5, 6, 7, + 12, 13, 0, 1, 2, 3, 6, 7, 8, 9, 12, 13, 2, 3, 6, 7, + 10, 11, 12, 13, 0, 1, 6, 7, 8, 9, 10, 11, 12, 13, 6, 7, + 14, 15, 0, 1, 2, 3, 4, 5, 8, 9, 14, 15, 2, 3, 4, 5, + 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 10, 11, 14, 15, 4, 5, + 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 12, 13, 14, 15, 2, 3, + 10, 11, 12, 13, 14, 15, 0, 1, 8, 9, 10, 11, 12, 13, 14, 15}; + return Load(d32, u32_indices + 8 * mask_bits); +} + +template <typename T, HWY_IF_T_SIZE(T, 4)> +HWY_INLINE Vec256<uint32_t> IndicesFromNotBits256(uint64_t mask_bits) { + const Full256<uint32_t> d32; + // We need a masked Iota(). With 8 lanes, there are 256 combinations and a LUT + // of SetTableIndices would require 8 KiB, a large part of L1D. The other + // alternative is _pext_u64, but this is extremely slow on Zen2 (18 cycles) + // and unavailable in 32-bit builds. We instead compress each index into 4 + // bits, for a total of 1 KiB. 
+ alignas(16) static constexpr uint32_t packed_array[256] = { + // PrintCompressNot32x8Tables + 0xfedcba98, 0x8fedcba9, 0x9fedcba8, 0x98fedcba, 0xafedcb98, 0xa8fedcb9, + 0xa9fedcb8, 0xa98fedcb, 0xbfedca98, 0xb8fedca9, 0xb9fedca8, 0xb98fedca, + 0xbafedc98, 0xba8fedc9, 0xba9fedc8, 0xba98fedc, 0xcfedba98, 0xc8fedba9, + 0xc9fedba8, 0xc98fedba, 0xcafedb98, 0xca8fedb9, 0xca9fedb8, 0xca98fedb, + 0xcbfeda98, 0xcb8feda9, 0xcb9feda8, 0xcb98feda, 0xcbafed98, 0xcba8fed9, + 0xcba9fed8, 0xcba98fed, 0xdfecba98, 0xd8fecba9, 0xd9fecba8, 0xd98fecba, + 0xdafecb98, 0xda8fecb9, 0xda9fecb8, 0xda98fecb, 0xdbfeca98, 0xdb8feca9, + 0xdb9feca8, 0xdb98feca, 0xdbafec98, 0xdba8fec9, 0xdba9fec8, 0xdba98fec, + 0xdcfeba98, 0xdc8feba9, 0xdc9feba8, 0xdc98feba, 0xdcafeb98, 0xdca8feb9, + 0xdca9feb8, 0xdca98feb, 0xdcbfea98, 0xdcb8fea9, 0xdcb9fea8, 0xdcb98fea, + 0xdcbafe98, 0xdcba8fe9, 0xdcba9fe8, 0xdcba98fe, 0xefdcba98, 0xe8fdcba9, + 0xe9fdcba8, 0xe98fdcba, 0xeafdcb98, 0xea8fdcb9, 0xea9fdcb8, 0xea98fdcb, + 0xebfdca98, 0xeb8fdca9, 0xeb9fdca8, 0xeb98fdca, 0xebafdc98, 0xeba8fdc9, + 0xeba9fdc8, 0xeba98fdc, 0xecfdba98, 0xec8fdba9, 0xec9fdba8, 0xec98fdba, + 0xecafdb98, 0xeca8fdb9, 0xeca9fdb8, 0xeca98fdb, 0xecbfda98, 0xecb8fda9, + 0xecb9fda8, 0xecb98fda, 0xecbafd98, 0xecba8fd9, 0xecba9fd8, 0xecba98fd, + 0xedfcba98, 0xed8fcba9, 0xed9fcba8, 0xed98fcba, 0xedafcb98, 0xeda8fcb9, + 0xeda9fcb8, 0xeda98fcb, 0xedbfca98, 0xedb8fca9, 0xedb9fca8, 0xedb98fca, + 0xedbafc98, 0xedba8fc9, 0xedba9fc8, 0xedba98fc, 0xedcfba98, 0xedc8fba9, + 0xedc9fba8, 0xedc98fba, 0xedcafb98, 0xedca8fb9, 0xedca9fb8, 0xedca98fb, + 0xedcbfa98, 0xedcb8fa9, 0xedcb9fa8, 0xedcb98fa, 0xedcbaf98, 0xedcba8f9, + 0xedcba9f8, 0xedcba98f, 0xfedcba98, 0xf8edcba9, 0xf9edcba8, 0xf98edcba, + 0xfaedcb98, 0xfa8edcb9, 0xfa9edcb8, 0xfa98edcb, 0xfbedca98, 0xfb8edca9, + 0xfb9edca8, 0xfb98edca, 0xfbaedc98, 0xfba8edc9, 0xfba9edc8, 0xfba98edc, + 0xfcedba98, 0xfc8edba9, 0xfc9edba8, 0xfc98edba, 0xfcaedb98, 0xfca8edb9, + 0xfca9edb8, 0xfca98edb, 0xfcbeda98, 0xfcb8eda9, 
0xfcb9eda8, 0xfcb98eda, + 0xfcbaed98, 0xfcba8ed9, 0xfcba9ed8, 0xfcba98ed, 0xfdecba98, 0xfd8ecba9, + 0xfd9ecba8, 0xfd98ecba, 0xfdaecb98, 0xfda8ecb9, 0xfda9ecb8, 0xfda98ecb, + 0xfdbeca98, 0xfdb8eca9, 0xfdb9eca8, 0xfdb98eca, 0xfdbaec98, 0xfdba8ec9, + 0xfdba9ec8, 0xfdba98ec, 0xfdceba98, 0xfdc8eba9, 0xfdc9eba8, 0xfdc98eba, + 0xfdcaeb98, 0xfdca8eb9, 0xfdca9eb8, 0xfdca98eb, 0xfdcbea98, 0xfdcb8ea9, + 0xfdcb9ea8, 0xfdcb98ea, 0xfdcbae98, 0xfdcba8e9, 0xfdcba9e8, 0xfdcba98e, + 0xfedcba98, 0xfe8dcba9, 0xfe9dcba8, 0xfe98dcba, 0xfeadcb98, 0xfea8dcb9, + 0xfea9dcb8, 0xfea98dcb, 0xfebdca98, 0xfeb8dca9, 0xfeb9dca8, 0xfeb98dca, + 0xfebadc98, 0xfeba8dc9, 0xfeba9dc8, 0xfeba98dc, 0xfecdba98, 0xfec8dba9, + 0xfec9dba8, 0xfec98dba, 0xfecadb98, 0xfeca8db9, 0xfeca9db8, 0xfeca98db, + 0xfecbda98, 0xfecb8da9, 0xfecb9da8, 0xfecb98da, 0xfecbad98, 0xfecba8d9, + 0xfecba9d8, 0xfecba98d, 0xfedcba98, 0xfed8cba9, 0xfed9cba8, 0xfed98cba, + 0xfedacb98, 0xfeda8cb9, 0xfeda9cb8, 0xfeda98cb, 0xfedbca98, 0xfedb8ca9, + 0xfedb9ca8, 0xfedb98ca, 0xfedbac98, 0xfedba8c9, 0xfedba9c8, 0xfedba98c, + 0xfedcba98, 0xfedc8ba9, 0xfedc9ba8, 0xfedc98ba, 0xfedcab98, 0xfedca8b9, + 0xfedca9b8, 0xfedca98b, 0xfedcba98, 0xfedcb8a9, 0xfedcb9a8, 0xfedcb98a, + 0xfedcba98, 0xfedcba89, 0xfedcba98, 0xfedcba98}; + + // No need to mask because <_mm256_permutevar8x32_epi32> ignores bits 3..31. + // Just shift each copy of the 32 bit LUT to extract its 4-bit fields. + // If broadcasting 32-bit from memory incurs the 3-cycle block-crossing + // latency, it may be faster to use LoadDup128 and PSHUFB. 
+ const Vec256<uint32_t> packed = Set(d32, packed_array[mask_bits]); + alignas(32) static constexpr uint32_t shifts[8] = {0, 4, 8, 12, + 16, 20, 24, 28}; + return packed >> Load(d32, shifts); +} + +template <typename T, HWY_IF_T_SIZE(T, 8)> +HWY_INLINE Vec256<uint32_t> IndicesFromNotBits256(uint64_t mask_bits) { + const Full256<uint32_t> d32; + + // For 64-bit, we still need 32-bit indices because there is no 64-bit + // permutevar, but there are only 4 lanes, so we can afford to skip the + // unpacking and load the entire index vector directly. + alignas(32) static constexpr uint32_t u32_indices[128] = { + // PrintCompressNot64x4PairTables + 8, 9, 10, 11, 12, 13, 14, 15, 10, 11, 12, 13, 14, 15, 8, 9, + 8, 9, 12, 13, 14, 15, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, + 8, 9, 10, 11, 14, 15, 12, 13, 10, 11, 14, 15, 8, 9, 12, 13, + 8, 9, 14, 15, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, + 8, 9, 10, 11, 12, 13, 14, 15, 10, 11, 12, 13, 8, 9, 14, 15, + 8, 9, 12, 13, 10, 11, 14, 15, 12, 13, 8, 9, 10, 11, 14, 15, + 8, 9, 10, 11, 12, 13, 14, 15, 10, 11, 8, 9, 12, 13, 14, 15, + 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15}; + return Load(d32, u32_indices + 8 * mask_bits); +} + +template <typename T, HWY_IF_NOT_T_SIZE(T, 2)> +HWY_INLINE Vec256<T> Compress(Vec256<T> v, const uint64_t mask_bits) { + const DFromV<decltype(v)> d; + const Repartition<uint32_t, decltype(d)> du32; + + HWY_DASSERT(mask_bits < (1ull << (32 / sizeof(T)))); + // 32-bit indices because we only have _mm256_permutevar8x32_epi32 (there is + // no instruction for 4x64). + const Indices256<uint32_t> indices{IndicesFromBits256<T>(mask_bits).raw}; + return BitCast(d, TableLookupLanes(BitCast(du32, v), indices)); +} + +// LUTs are infeasible for 2^16 possible masks, so splice together two +// half-vector Compress. 
+template <typename T, HWY_IF_T_SIZE(T, 2)> +HWY_INLINE Vec256<T> Compress(Vec256<T> v, const uint64_t mask_bits) { + const DFromV<decltype(v)> d; + const RebindToUnsigned<decltype(d)> du; + const auto vu16 = BitCast(du, v); // (required for float16_t inputs) + const Half<decltype(du)> duh; + const auto half0 = LowerHalf(duh, vu16); + const auto half1 = UpperHalf(duh, vu16); + + const uint64_t mask_bits0 = mask_bits & 0xFF; + const uint64_t mask_bits1 = mask_bits >> 8; + const auto compressed0 = detail::CompressBits(half0, mask_bits0); + const auto compressed1 = detail::CompressBits(half1, mask_bits1); + + alignas(32) uint16_t all_true[16] = {}; + // Store mask=true lanes, left to right. + const size_t num_true0 = PopCount(mask_bits0); + Store(compressed0, duh, all_true); + StoreU(compressed1, duh, all_true + num_true0); + + if (hwy::HWY_NAMESPACE::CompressIsPartition<T>::value) { + // Store mask=false lanes, right to left. The second vector fills the upper + // half with right-aligned false lanes. The first vector is shifted + // rightwards to overwrite the true lanes of the second. + alignas(32) uint16_t all_false[16] = {}; + const size_t num_true1 = PopCount(mask_bits1); + Store(compressed1, duh, all_false + 8); + StoreU(compressed0, duh, all_false + num_true1); + + const auto mask = FirstN(du, num_true0 + num_true1); + return BitCast(d, + IfThenElse(mask, Load(du, all_true), Load(du, all_false))); + } else { + // Only care about the mask=true lanes. + return BitCast(d, Load(du, all_true)); + } +} + +template <typename T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 8))> +HWY_INLINE Vec256<T> CompressNot(Vec256<T> v, const uint64_t mask_bits) { + const DFromV<decltype(v)> d; + const Repartition<uint32_t, decltype(d)> du32; + + HWY_DASSERT(mask_bits < (1ull << (32 / sizeof(T)))); + // 32-bit indices because we only have _mm256_permutevar8x32_epi32 (there is + // no instruction for 4x64). 
+ const Indices256<uint32_t> indices{IndicesFromNotBits256<T>(mask_bits).raw}; + return BitCast(d, TableLookupLanes(BitCast(du32, v), indices)); +} + +// LUTs are infeasible for 2^16 possible masks, so splice together two +// half-vector Compress. +template <typename T, HWY_IF_T_SIZE(T, 2)> +HWY_INLINE Vec256<T> CompressNot(Vec256<T> v, const uint64_t mask_bits) { + // Compress ensures only the lower 16 bits are set, so flip those. + return Compress(v, mask_bits ^ 0xFFFF); +} + +} // namespace detail + +template <typename T, HWY_IF_NOT_T_SIZE(T, 1)> +HWY_API Vec256<T> Compress(Vec256<T> v, Mask256<T> m) { + return detail::Compress(v, detail::BitsFromMask(m)); +} + +template <typename T, HWY_IF_NOT_T_SIZE(T, 1)> +HWY_API Vec256<T> CompressNot(Vec256<T> v, Mask256<T> m) { + return detail::CompressNot(v, detail::BitsFromMask(m)); +} + +HWY_API Vec256<uint64_t> CompressBlocksNot(Vec256<uint64_t> v, + Mask256<uint64_t> mask) { + return CompressNot(v, mask); +} + +template <typename T, HWY_IF_NOT_T_SIZE(T, 1)> +HWY_API Vec256<T> CompressBits(Vec256<T> v, const uint8_t* HWY_RESTRICT bits) { + constexpr size_t N = 32 / sizeof(T); + constexpr size_t kNumBytes = (N + 7) / 8; + + uint64_t mask_bits = 0; + CopyBytes<kNumBytes>(bits, &mask_bits); + + if (N < 8) { + mask_bits &= (1ull << N) - 1; + } + + return detail::Compress(v, mask_bits); +} + +// ------------------------------ CompressStore, CompressBitsStore + +template <class D, typename T = TFromD<D>, HWY_IF_NOT_T_SIZE(T, 1)> +HWY_API size_t CompressStore(Vec256<T> v, Mask256<T> m, D d, + T* HWY_RESTRICT unaligned) { + const uint64_t mask_bits = detail::BitsFromMask(m); + const size_t count = PopCount(mask_bits); + StoreU(detail::Compress(v, mask_bits), d, unaligned); + detail::MaybeUnpoison(unaligned, count); + return count; +} + +template <class D, typename T = TFromD<D>, + HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 8))> +HWY_API size_t CompressBlendedStore(Vec256<T> v, Mask256<T> m, D d, + T* HWY_RESTRICT unaligned) { + 
const uint64_t mask_bits = detail::BitsFromMask(m); + const size_t count = PopCount(mask_bits); + + const Repartition<uint32_t, decltype(d)> du32; + HWY_DASSERT(mask_bits < (1ull << (32 / sizeof(T)))); + // 32-bit indices because we only have _mm256_permutevar8x32_epi32 (there is + // no instruction for 4x64). Nibble MSB encodes FirstN. + const Vec256<uint32_t> idx_mask = detail::IndicesFromBits256<T>(mask_bits); + // Shift nibble MSB into MSB + const Mask256<uint32_t> mask32 = MaskFromVec(ShiftLeft<28>(idx_mask)); + // First cast to unsigned (RebindMask cannot change lane size) + const Mask256<MakeUnsigned<T>> mask_u{mask32.raw}; + const Mask256<T> mask = RebindMask(d, mask_u); + const Vec256<T> compressed = BitCast( + d, + TableLookupLanes(BitCast(du32, v), Indices256<uint32_t>{idx_mask.raw})); + + BlendedStore(compressed, mask, d, unaligned); + detail::MaybeUnpoison(unaligned, count); + return count; +} + +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)> +HWY_API size_t CompressBlendedStore(Vec256<T> v, Mask256<T> m, D d, + T* HWY_RESTRICT unaligned) { + const uint64_t mask_bits = detail::BitsFromMask(m); + const size_t count = PopCount(mask_bits); + const Vec256<T> compressed = detail::Compress(v, mask_bits); + +#if HWY_MEM_OPS_MIGHT_FAULT // true if HWY_IS_MSAN + // BlendedStore tests mask for each lane, but we know that the mask is + // FirstN, so we can just copy. 
+ alignas(32) T buf[16]; + Store(compressed, d, buf); + memcpy(unaligned, buf, count * sizeof(T)); +#else + BlendedStore(compressed, FirstN(d, count), d, unaligned); +#endif + return count; +} + +template <typename T, HWY_IF_NOT_T_SIZE(T, 1)> +HWY_API size_t CompressBitsStore(Vec256<T> v, const uint8_t* HWY_RESTRICT bits, + Full256<T> d, T* HWY_RESTRICT unaligned) { + constexpr size_t N = 32 / sizeof(T); + constexpr size_t kNumBytes = (N + 7) / 8; + + uint64_t mask_bits = 0; + CopyBytes<kNumBytes>(bits, &mask_bits); + + if (N < 8) { + mask_bits &= (1ull << N) - 1; + } + const size_t count = PopCount(mask_bits); + + StoreU(detail::Compress(v, mask_bits), d, unaligned); + detail::MaybeUnpoison(unaligned, count); + return count; +} + +#endif // HWY_TARGET <= HWY_AVX3 + +// ------------------------------ Expand + +// Always define Expand/LoadExpand because generic_ops only does so for Vec128. + +namespace detail { + +#if HWY_TARGET <= HWY_AVX3_DL || HWY_IDE // VBMI2 + +HWY_INLINE Vec256<uint8_t> NativeExpand(Vec256<uint8_t> v, + Mask256<uint8_t> mask) { + return Vec256<uint8_t>{_mm256_maskz_expand_epi8(mask.raw, v.raw)}; +} + +HWY_INLINE Vec256<uint16_t> NativeExpand(Vec256<uint16_t> v, + Mask256<uint16_t> mask) { + return Vec256<uint16_t>{_mm256_maskz_expand_epi16(mask.raw, v.raw)}; +} + +template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U8_D(D)> +HWY_INLINE Vec256<uint8_t> NativeLoadExpand( + Mask256<uint8_t> mask, D /* d */, const uint8_t* HWY_RESTRICT unaligned) { + return Vec256<uint8_t>{_mm256_maskz_expandloadu_epi8(mask.raw, unaligned)}; +} + +template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U16_D(D)> +HWY_INLINE Vec256<uint16_t> NativeLoadExpand( + Mask256<uint16_t> mask, D /* d */, const uint16_t* HWY_RESTRICT unaligned) { + return Vec256<uint16_t>{_mm256_maskz_expandloadu_epi16(mask.raw, unaligned)}; +} + +#endif // HWY_TARGET <= HWY_AVX3_DL +#if HWY_TARGET <= HWY_AVX3 || HWY_IDE + +HWY_INLINE Vec256<uint32_t> NativeExpand(Vec256<uint32_t> v, + 
Mask256<uint32_t> mask) { + return Vec256<uint32_t>{_mm256_maskz_expand_epi32(mask.raw, v.raw)}; +} + +HWY_INLINE Vec256<uint64_t> NativeExpand(Vec256<uint64_t> v, + Mask256<uint64_t> mask) { + return Vec256<uint64_t>{_mm256_maskz_expand_epi64(mask.raw, v.raw)}; +} + +template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U32_D(D)> +HWY_INLINE Vec256<uint32_t> NativeLoadExpand( + Mask256<uint32_t> mask, D /* d */, const uint32_t* HWY_RESTRICT unaligned) { + return Vec256<uint32_t>{_mm256_maskz_expandloadu_epi32(mask.raw, unaligned)}; +} + +template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U64_D(D)> +HWY_INLINE Vec256<uint64_t> NativeLoadExpand( + Mask256<uint64_t> mask, D /* d */, const uint64_t* HWY_RESTRICT unaligned) { + return Vec256<uint64_t>{_mm256_maskz_expandloadu_epi64(mask.raw, unaligned)}; +} + +#endif // HWY_TARGET <= HWY_AVX3 + +} // namespace detail + +template <typename T, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec256<T> Expand(Vec256<T> v, Mask256<T> mask) { + const DFromV<decltype(v)> d; +#if HWY_TARGET <= HWY_AVX3_DL // VBMI2 + const RebindToUnsigned<decltype(d)> du; + const MFromD<decltype(du)> mu = RebindMask(du, mask); + return BitCast(d, detail::NativeExpand(BitCast(du, v), mu)); +#else + // LUTs are infeasible for so many mask combinations, so Combine two + // half-vector Expand. + const Half<decltype(d)> dh; + const uint64_t mask_bits = detail::BitsFromMask(mask); + constexpr size_t N = 32 / sizeof(T); + const size_t countL = PopCount(mask_bits & ((1 << (N / 2)) - 1)); + const Mask128<T> maskL = MaskFromVec(LowerHalf(VecFromMask(d, mask))); + const Vec128<T> expandL = Expand(LowerHalf(v), maskL); + // We have to shift the input by a variable number of bytes, but there isn't + // a table-driven option for that until VBMI, and CPUs with that likely also + // have VBMI2 and thus native Expand. 
+ alignas(32) T lanes[N]; + Store(v, d, lanes); + const Mask128<T> maskH = MaskFromVec(UpperHalf(dh, VecFromMask(d, mask))); + const Vec128<T> expandH = Expand(LoadU(dh, lanes + countL), maskH); + return Combine(d, expandH, expandL); +#endif +} + +// If AVX3, this is already implemented by x86_512. +#if HWY_TARGET != HWY_AVX3 + +template <typename T, HWY_IF_T_SIZE(T, 2)> +HWY_API Vec256<T> Expand(Vec256<T> v, Mask256<T> mask) { + const Full256<T> d; +#if HWY_TARGET <= HWY_AVX3_DL // VBMI2 + const RebindToUnsigned<decltype(d)> du; + return BitCast(d, detail::NativeExpand(BitCast(du, v), RebindMask(du, mask))); +#else // AVX2 + // LUTs are infeasible for 2^16 possible masks, so splice together two + // half-vector Expand. + const Half<decltype(d)> dh; + const Mask128<T> maskL = MaskFromVec(LowerHalf(VecFromMask(d, mask))); + const Vec128<T> expandL = Expand(LowerHalf(v), maskL); + // We have to shift the input by a variable number of u16. permutevar_epi16 + // requires AVX3 and if we had that, we'd use native u32 Expand. The only + // alternative is re-loading, which incurs a store to load forwarding stall. + alignas(32) T lanes[32 / sizeof(T)]; + Store(v, d, lanes); + const Vec128<T> vH = LoadU(dh, lanes + CountTrue(dh, maskL)); + const Mask128<T> maskH = MaskFromVec(UpperHalf(dh, VecFromMask(d, mask))); + const Vec128<T> expandH = Expand(vH, maskH); + return Combine(d, expandH, expandL); +#endif // AVX2 +} + +#endif // HWY_TARGET != HWY_AVX3 + +template <typename T, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec256<T> Expand(Vec256<T> v, Mask256<T> mask) { + const Full256<T> d; +#if HWY_TARGET <= HWY_AVX3 + const RebindToUnsigned<decltype(d)> du; + const MFromD<decltype(du)> mu = RebindMask(du, mask); + return BitCast(d, detail::NativeExpand(BitCast(du, v), mu)); +#else + const RebindToUnsigned<decltype(d)> du; + const uint64_t mask_bits = detail::BitsFromMask(mask); + + alignas(16) constexpr uint32_t packed_array[256] = { + // PrintExpand32x8Nibble. 
+ 0xffffffff, 0xfffffff0, 0xffffff0f, 0xffffff10, 0xfffff0ff, 0xfffff1f0, + 0xfffff10f, 0xfffff210, 0xffff0fff, 0xffff1ff0, 0xffff1f0f, 0xffff2f10, + 0xffff10ff, 0xffff21f0, 0xffff210f, 0xffff3210, 0xfff0ffff, 0xfff1fff0, + 0xfff1ff0f, 0xfff2ff10, 0xfff1f0ff, 0xfff2f1f0, 0xfff2f10f, 0xfff3f210, + 0xfff10fff, 0xfff21ff0, 0xfff21f0f, 0xfff32f10, 0xfff210ff, 0xfff321f0, + 0xfff3210f, 0xfff43210, 0xff0fffff, 0xff1ffff0, 0xff1fff0f, 0xff2fff10, + 0xff1ff0ff, 0xff2ff1f0, 0xff2ff10f, 0xff3ff210, 0xff1f0fff, 0xff2f1ff0, + 0xff2f1f0f, 0xff3f2f10, 0xff2f10ff, 0xff3f21f0, 0xff3f210f, 0xff4f3210, + 0xff10ffff, 0xff21fff0, 0xff21ff0f, 0xff32ff10, 0xff21f0ff, 0xff32f1f0, + 0xff32f10f, 0xff43f210, 0xff210fff, 0xff321ff0, 0xff321f0f, 0xff432f10, + 0xff3210ff, 0xff4321f0, 0xff43210f, 0xff543210, 0xf0ffffff, 0xf1fffff0, + 0xf1ffff0f, 0xf2ffff10, 0xf1fff0ff, 0xf2fff1f0, 0xf2fff10f, 0xf3fff210, + 0xf1ff0fff, 0xf2ff1ff0, 0xf2ff1f0f, 0xf3ff2f10, 0xf2ff10ff, 0xf3ff21f0, + 0xf3ff210f, 0xf4ff3210, 0xf1f0ffff, 0xf2f1fff0, 0xf2f1ff0f, 0xf3f2ff10, + 0xf2f1f0ff, 0xf3f2f1f0, 0xf3f2f10f, 0xf4f3f210, 0xf2f10fff, 0xf3f21ff0, + 0xf3f21f0f, 0xf4f32f10, 0xf3f210ff, 0xf4f321f0, 0xf4f3210f, 0xf5f43210, + 0xf10fffff, 0xf21ffff0, 0xf21fff0f, 0xf32fff10, 0xf21ff0ff, 0xf32ff1f0, + 0xf32ff10f, 0xf43ff210, 0xf21f0fff, 0xf32f1ff0, 0xf32f1f0f, 0xf43f2f10, + 0xf32f10ff, 0xf43f21f0, 0xf43f210f, 0xf54f3210, 0xf210ffff, 0xf321fff0, + 0xf321ff0f, 0xf432ff10, 0xf321f0ff, 0xf432f1f0, 0xf432f10f, 0xf543f210, + 0xf3210fff, 0xf4321ff0, 0xf4321f0f, 0xf5432f10, 0xf43210ff, 0xf54321f0, + 0xf543210f, 0xf6543210, 0x0fffffff, 0x1ffffff0, 0x1fffff0f, 0x2fffff10, + 0x1ffff0ff, 0x2ffff1f0, 0x2ffff10f, 0x3ffff210, 0x1fff0fff, 0x2fff1ff0, + 0x2fff1f0f, 0x3fff2f10, 0x2fff10ff, 0x3fff21f0, 0x3fff210f, 0x4fff3210, + 0x1ff0ffff, 0x2ff1fff0, 0x2ff1ff0f, 0x3ff2ff10, 0x2ff1f0ff, 0x3ff2f1f0, + 0x3ff2f10f, 0x4ff3f210, 0x2ff10fff, 0x3ff21ff0, 0x3ff21f0f, 0x4ff32f10, + 0x3ff210ff, 0x4ff321f0, 0x4ff3210f, 0x5ff43210, 0x1f0fffff, 0x2f1ffff0, + 
0x2f1fff0f, 0x3f2fff10, 0x2f1ff0ff, 0x3f2ff1f0, 0x3f2ff10f, 0x4f3ff210, + 0x2f1f0fff, 0x3f2f1ff0, 0x3f2f1f0f, 0x4f3f2f10, 0x3f2f10ff, 0x4f3f21f0, + 0x4f3f210f, 0x5f4f3210, 0x2f10ffff, 0x3f21fff0, 0x3f21ff0f, 0x4f32ff10, + 0x3f21f0ff, 0x4f32f1f0, 0x4f32f10f, 0x5f43f210, 0x3f210fff, 0x4f321ff0, + 0x4f321f0f, 0x5f432f10, 0x4f3210ff, 0x5f4321f0, 0x5f43210f, 0x6f543210, + 0x10ffffff, 0x21fffff0, 0x21ffff0f, 0x32ffff10, 0x21fff0ff, 0x32fff1f0, + 0x32fff10f, 0x43fff210, 0x21ff0fff, 0x32ff1ff0, 0x32ff1f0f, 0x43ff2f10, + 0x32ff10ff, 0x43ff21f0, 0x43ff210f, 0x54ff3210, 0x21f0ffff, 0x32f1fff0, + 0x32f1ff0f, 0x43f2ff10, 0x32f1f0ff, 0x43f2f1f0, 0x43f2f10f, 0x54f3f210, + 0x32f10fff, 0x43f21ff0, 0x43f21f0f, 0x54f32f10, 0x43f210ff, 0x54f321f0, + 0x54f3210f, 0x65f43210, 0x210fffff, 0x321ffff0, 0x321fff0f, 0x432fff10, + 0x321ff0ff, 0x432ff1f0, 0x432ff10f, 0x543ff210, 0x321f0fff, 0x432f1ff0, + 0x432f1f0f, 0x543f2f10, 0x432f10ff, 0x543f21f0, 0x543f210f, 0x654f3210, + 0x3210ffff, 0x4321fff0, 0x4321ff0f, 0x5432ff10, 0x4321f0ff, 0x5432f1f0, + 0x5432f10f, 0x6543f210, 0x43210fff, 0x54321ff0, 0x54321f0f, 0x65432f10, + 0x543210ff, 0x654321f0, 0x6543210f, 0x76543210, + }; + + // For lane i, shift the i-th 4-bit index down to bits [0, 3). + const Vec256<uint32_t> packed = Set(du, packed_array[mask_bits]); + alignas(32) constexpr uint32_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28}; + // TableLookupLanes ignores upper bits; avoid bounds-check in IndicesFromVec. + const Indices256<uint32_t> indices{(packed >> Load(du, shifts)).raw}; + const Vec256<uint32_t> expand = TableLookupLanes(BitCast(du, v), indices); + // TableLookupLanes cannot also zero masked-off lanes, so do that now. 
+ return IfThenElseZero(mask, BitCast(d, expand)); +#endif +} + +template <typename T, HWY_IF_T_SIZE(T, 8)> +HWY_API Vec256<T> Expand(Vec256<T> v, Mask256<T> mask) { + const Full256<T> d; +#if HWY_TARGET <= HWY_AVX3 + const RebindToUnsigned<decltype(d)> du; + const MFromD<decltype(du)> mu = RebindMask(du, mask); + return BitCast(d, detail::NativeExpand(BitCast(du, v), mu)); +#else + const RebindToUnsigned<decltype(d)> du; + const uint64_t mask_bits = detail::BitsFromMask(mask); + + alignas(16) constexpr uint64_t packed_array[16] = { + // PrintExpand64x4Nibble. + 0x0000ffff, 0x0000fff0, 0x0000ff0f, 0x0000ff10, 0x0000f0ff, 0x0000f1f0, + 0x0000f10f, 0x0000f210, 0x00000fff, 0x00001ff0, 0x00001f0f, 0x00002f10, + 0x000010ff, 0x000021f0, 0x0000210f, 0x00003210}; + + // For lane i, shift the i-th 4-bit index down to bits [0, 2). + const Vec256<uint64_t> packed = Set(du, packed_array[mask_bits]); + alignas(32) constexpr uint64_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28}; +#if HWY_TARGET <= HWY_AVX3 // native 64-bit TableLookupLanes + // TableLookupLanes ignores upper bits; avoid bounds-check in IndicesFromVec. + const Indices256<uint64_t> indices{(packed >> Load(du, shifts)).raw}; +#else + // 64-bit TableLookupLanes on AVX2 requires IndicesFromVec, which checks + // bounds, so clear the upper bits. + const Vec256<uint64_t> masked = And(packed >> Load(du, shifts), Set(du, 3)); + const Indices256<uint64_t> indices = IndicesFromVec(du, masked); +#endif + const Vec256<uint64_t> expand = TableLookupLanes(BitCast(du, v), indices); + // TableLookupLanes cannot also zero masked-off lanes, so do that now. 
+ return IfThenElseZero(mask, BitCast(d, expand)); +#endif +} + +// ------------------------------ LoadExpand + +template <class D, HWY_IF_V_SIZE_D(D, 32), + HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))> +HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d, + const TFromD<D>* HWY_RESTRICT unaligned) { +#if HWY_TARGET <= HWY_AVX3_DL // VBMI2 + const RebindToUnsigned<decltype(d)> du; + using TU = TFromD<decltype(du)>; + const TU* HWY_RESTRICT pu = reinterpret_cast<const TU*>(unaligned); + const MFromD<decltype(du)> mu = RebindMask(du, mask); + return BitCast(d, detail::NativeLoadExpand(mu, du, pu)); +#else + return Expand(LoadU(d, unaligned), mask); +#endif +} + +template <class D, HWY_IF_V_SIZE_D(D, 32), + HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))> +HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d, + const TFromD<D>* HWY_RESTRICT unaligned) { +#if HWY_TARGET <= HWY_AVX3 + const RebindToUnsigned<decltype(d)> du; + using TU = TFromD<decltype(du)>; + const TU* HWY_RESTRICT pu = reinterpret_cast<const TU*>(unaligned); + const MFromD<decltype(du)> mu = RebindMask(du, mask); + return BitCast(d, detail::NativeLoadExpand(mu, du, pu)); +#else + return Expand(LoadU(d, unaligned), mask); +#endif +} + +// ------------------------------ LoadInterleaved3/4 + +// Implemented in generic_ops, we just overload LoadTransposedBlocks3/4. 
+ +namespace detail { +// Input: +// 1 0 (<- first block of unaligned) +// 3 2 +// 5 4 +// Output: +// 3 0 +// 4 1 +// 5 2 +template <class D, typename T = TFromD<D>> +HWY_API void LoadTransposedBlocks3(D d, const T* HWY_RESTRICT unaligned, + Vec256<T>& A, Vec256<T>& B, Vec256<T>& C) { + constexpr size_t N = 32 / sizeof(T); + const Vec256<T> v10 = LoadU(d, unaligned + 0 * N); // 1 0 + const Vec256<T> v32 = LoadU(d, unaligned + 1 * N); + const Vec256<T> v54 = LoadU(d, unaligned + 2 * N); + + A = ConcatUpperLower(d, v32, v10); + B = ConcatLowerUpper(d, v54, v10); + C = ConcatUpperLower(d, v54, v32); +} + +// Input (128-bit blocks): +// 1 0 (first block of unaligned) +// 3 2 +// 5 4 +// 7 6 +// Output: +// 4 0 (LSB of vA) +// 5 1 +// 6 2 +// 7 3 +template <class D, typename T = TFromD<D>> +HWY_API void LoadTransposedBlocks4(D d, const T* HWY_RESTRICT unaligned, + Vec256<T>& vA, Vec256<T>& vB, Vec256<T>& vC, + Vec256<T>& vD) { + constexpr size_t N = 32 / sizeof(T); + const Vec256<T> v10 = LoadU(d, unaligned + 0 * N); + const Vec256<T> v32 = LoadU(d, unaligned + 1 * N); + const Vec256<T> v54 = LoadU(d, unaligned + 2 * N); + const Vec256<T> v76 = LoadU(d, unaligned + 3 * N); + + vA = ConcatLowerLower(d, v54, v10); + vB = ConcatUpperUpper(d, v54, v10); + vC = ConcatLowerLower(d, v76, v32); + vD = ConcatUpperUpper(d, v76, v32); +} +} // namespace detail + +// ------------------------------ StoreInterleaved2/3/4 (ConcatUpperLower) + +// Implemented in generic_ops, we just overload StoreTransposedBlocks2/3/4. 
+ +namespace detail { +// Input (128-bit blocks): +// 2 0 (LSB of i) +// 3 1 +// Output: +// 1 0 +// 3 2 +template <class D, typename T = TFromD<D>> +HWY_API void StoreTransposedBlocks2(Vec256<T> i, Vec256<T> j, D d, + T* HWY_RESTRICT unaligned) { + constexpr size_t N = 32 / sizeof(T); + const auto out0 = ConcatLowerLower(d, j, i); + const auto out1 = ConcatUpperUpper(d, j, i); + StoreU(out0, d, unaligned + 0 * N); + StoreU(out1, d, unaligned + 1 * N); +} + +// Input (128-bit blocks): +// 3 0 (LSB of i) +// 4 1 +// 5 2 +// Output: +// 1 0 +// 3 2 +// 5 4 +template <class D, typename T = TFromD<D>> +HWY_API void StoreTransposedBlocks3(Vec256<T> i, Vec256<T> j, Vec256<T> k, D d, + T* HWY_RESTRICT unaligned) { + constexpr size_t N = 32 / sizeof(T); + const auto out0 = ConcatLowerLower(d, j, i); + const auto out1 = ConcatUpperLower(d, i, k); + const auto out2 = ConcatUpperUpper(d, k, j); + StoreU(out0, d, unaligned + 0 * N); + StoreU(out1, d, unaligned + 1 * N); + StoreU(out2, d, unaligned + 2 * N); +} + +// Input (128-bit blocks): +// 4 0 (LSB of i) +// 5 1 +// 6 2 +// 7 3 +// Output: +// 1 0 +// 3 2 +// 5 4 +// 7 6 +template <class D, typename T = TFromD<D>> +HWY_API void StoreTransposedBlocks4(Vec256<T> i, Vec256<T> j, Vec256<T> k, + Vec256<T> l, D d, + T* HWY_RESTRICT unaligned) { + constexpr size_t N = 32 / sizeof(T); + // Write lower halves, then upper. + const auto out0 = ConcatLowerLower(d, j, i); + const auto out1 = ConcatLowerLower(d, l, k); + StoreU(out0, d, unaligned + 0 * N); + StoreU(out1, d, unaligned + 1 * N); + const auto out2 = ConcatUpperUpper(d, j, i); + const auto out3 = ConcatUpperUpper(d, l, k); + StoreU(out2, d, unaligned + 2 * N); + StoreU(out3, d, unaligned + 3 * N); +} +} // namespace detail + +// ------------------------------ Reductions + +namespace detail { + +// Returns sum{lane[i]} in each lane. "v3210" is a replicated 128-bit block. +// Same logic as x86/128.h, but with Vec256 arguments. 
+template <typename T> +HWY_INLINE Vec256<T> SumOfLanes(hwy::SizeTag<4> /* tag */, + const Vec256<T> v3210) { + const auto v1032 = Shuffle1032(v3210); + const auto v31_20_31_20 = v3210 + v1032; + const auto v20_31_20_31 = Shuffle0321(v31_20_31_20); + return v20_31_20_31 + v31_20_31_20; +} +template <typename T> +HWY_INLINE T ReduceSum(hwy::SizeTag<4> /* tag */, + const Vec256<T> v3210) { + return GetLane(SumOfLanes(hwy::SizeTag<4>(), v3210)); +} +template <typename T> +HWY_INLINE Vec256<T> MinOfLanes(hwy::SizeTag<4> /* tag */, + const Vec256<T> v3210) { + const auto v1032 = Shuffle1032(v3210); + const auto v31_20_31_20 = Min(v3210, v1032); + const auto v20_31_20_31 = Shuffle0321(v31_20_31_20); + return Min(v20_31_20_31, v31_20_31_20); +} +template <typename T> +HWY_INLINE Vec256<T> MaxOfLanes(hwy::SizeTag<4> /* tag */, + const Vec256<T> v3210) { + const auto v1032 = Shuffle1032(v3210); + const auto v31_20_31_20 = Max(v3210, v1032); + const auto v20_31_20_31 = Shuffle0321(v31_20_31_20); + return Max(v20_31_20_31, v31_20_31_20); +} + +template <typename T> +HWY_INLINE Vec256<T> SumOfLanes(hwy::SizeTag<8> /* tag */, + const Vec256<T> v10) { + const auto v01 = Shuffle01(v10); + return v10 + v01; +} +template <typename T> +HWY_INLINE T ReduceSum(hwy::SizeTag<8> /* tag */, + const Vec256<T> v10) { + return GetLane(SumOfLanes(hwy::SizeTag<8>(), v10)); +} +template <typename T> +HWY_INLINE Vec256<T> MinOfLanes(hwy::SizeTag<8> /* tag */, + const Vec256<T> v10) { + const auto v01 = Shuffle01(v10); + return Min(v10, v01); +} +template <typename T> +HWY_INLINE Vec256<T> MaxOfLanes(hwy::SizeTag<8> /* tag */, + const Vec256<T> v10) { + const auto v01 = Shuffle01(v10); + return Max(v10, v01); +} + +HWY_API uint16_t ReduceSum(hwy::SizeTag<2> /* tag */, + Vec256<uint16_t> v) { + const Full256<uint16_t> d; + const RepartitionToWide<decltype(d)> d32; + const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF)); + const auto odd = ShiftRight<16>(BitCast(d32, v)); + const auto sum = 
ReduceSum(hwy::SizeTag<4>(), even + odd); + return static_cast<uint16_t>(sum); +} + +HWY_API Vec256<uint16_t> SumOfLanes(hwy::SizeTag<2> /* tag */, + Vec256<uint16_t> v) { + const DFromV<decltype(v)> d; + return Set(d, ReduceSum(hwy::SizeTag<2>(), v)); +} + +HWY_API int16_t ReduceSum(hwy::SizeTag<2> /* tag */, + Vec256<int16_t> v) { + const Full256<int16_t> d; + const RepartitionToWide<decltype(d)> d32; + // Sign-extend + const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v))); + const auto odd = ShiftRight<16>(BitCast(d32, v)); + const auto sum = ReduceSum(hwy::SizeTag<4>(), even + odd); + return static_cast<int16_t>(sum); +} + +HWY_API Vec256<int16_t> SumOfLanes(hwy::SizeTag<2> /* tag */, + Vec256<int16_t> v) { + const DFromV<decltype(v)> d; + return Set(d, ReduceSum(hwy::SizeTag<2>(), v)); +} + +HWY_API Vec256<uint16_t> MinOfLanes(hwy::SizeTag<2> /* tag */, + Vec256<uint16_t> v) { + const Full256<uint16_t> d; + const RepartitionToWide<decltype(d)> d32; + const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF)); + const auto odd = ShiftRight<16>(BitCast(d32, v)); + const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd)); + // Also broadcast into odd lanes. + return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min)); +} +HWY_API Vec256<int16_t> MinOfLanes(hwy::SizeTag<2> /* tag */, + Vec256<int16_t> v) { + const Full256<int16_t> d; + const RepartitionToWide<decltype(d)> d32; + // Sign-extend + const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v))); + const auto odd = ShiftRight<16>(BitCast(d32, v)); + const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd)); + // Also broadcast into odd lanes. 
+ return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min)); +} + +HWY_API Vec256<uint16_t> MaxOfLanes(hwy::SizeTag<2> /* tag */, + Vec256<uint16_t> v) { + const Full256<uint16_t> d; + const RepartitionToWide<decltype(d)> d32; + const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF)); + const auto odd = ShiftRight<16>(BitCast(d32, v)); + const auto min = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd)); + // Also broadcast into odd lanes. + return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min)); +} +HWY_API Vec256<int16_t> MaxOfLanes(hwy::SizeTag<2> /* tag */, + Vec256<int16_t> v) { + const Full256<int16_t> d; + const RepartitionToWide<decltype(d)> d32; + // Sign-extend + const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v))); + const auto odd = ShiftRight<16>(BitCast(d32, v)); + const auto min = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd)); + // Also broadcast into odd lanes. + return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min)); +} +} // namespace detail + +// Supported for {uif}{32,64},{ui}16. Returns the broadcasted result. 
+template <class D, typename T = TFromD<D>> +HWY_API Vec256<T> SumOfLanes(D /*d*/, const Vec256<T> vHL) { + const Vec256<T> vLH = SwapAdjacentBlocks(vHL); + return detail::SumOfLanes(hwy::SizeTag<sizeof(T)>(), vLH + vHL); +} +template <class D, typename T = TFromD<D>> +HWY_API T ReduceSum(D /*d*/, const Vec256<T> vHL) { + const Vec256<T> vLH = SwapAdjacentBlocks(vHL); + return detail::ReduceSum(hwy::SizeTag<sizeof(T)>(), vLH + vHL); +} +template <class D, typename T = TFromD<D>> +HWY_API Vec256<T> MinOfLanes(D /*d*/, const Vec256<T> vHL) { + const Vec256<T> vLH = SwapAdjacentBlocks(vHL); + return detail::MinOfLanes(hwy::SizeTag<sizeof(T)>(), Min(vLH, vHL)); +} +template <class D, typename T = TFromD<D>> +HWY_API Vec256<T> MaxOfLanes(D /*d*/, const Vec256<T> vHL) { + const Vec256<T> vLH = SwapAdjacentBlocks(vHL); + return detail::MaxOfLanes(hwy::SizeTag<sizeof(T)>(), Max(vLH, vHL)); +} + +// -------------------- LeadingZeroCount, TrailingZeroCount, HighestSetBitIndex + +#if HWY_TARGET <= HWY_AVX3 +template <class V, HWY_IF_UI32(TFromV<V>), HWY_IF_V_SIZE_D(DFromV<V>, 32)> +HWY_API V LeadingZeroCount(V v) { + return V{_mm256_lzcnt_epi32(v.raw)}; +} + +template <class V, HWY_IF_UI64(TFromV<V>), HWY_IF_V_SIZE_D(DFromV<V>, 32)> +HWY_API V LeadingZeroCount(V v) { + return V{_mm256_lzcnt_epi64(v.raw)}; +} +#endif // HWY_TARGET <= HWY_AVX3 + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +// Note that the GCC warnings are not suppressed if we only wrap the *intrin.h - +// the warning seems to be issued at the call site of intrinsics, i.e. our code. 
+HWY_DIAGNOSTICS(pop) diff --git a/third_party/highway/hwy/ops/x86_512-inl.h b/third_party/highway/hwy/ops/x86_512-inl.h new file mode 100644 index 0000000000..3f62b12754 --- /dev/null +++ b/third_party/highway/hwy/ops/x86_512-inl.h @@ -0,0 +1,5733 @@ +// Copyright 2019 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// 512-bit AVX512 vectors and operations. +// External include guard in highway.h - see comment there. + +// WARNING: most operations do not cross 128-bit block boundaries. In +// particular, "Broadcast", pack and zip behavior may be surprising. + +// Must come before HWY_DIAGNOSTICS and HWY_COMPILER_CLANGCL +#include "hwy/base.h" + +// Avoid uninitialized warnings in GCC's avx512fintrin.h - see +// https://github.com/google/highway/issues/710) +HWY_DIAGNOSTICS(push) +#if HWY_COMPILER_GCC_ACTUAL +HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized") +HWY_DIAGNOSTICS_OFF(disable : 4701 4703 6001 26494, + ignored "-Wmaybe-uninitialized") +#endif + +#include <immintrin.h> // AVX2+ + +#if HWY_COMPILER_CLANGCL +// Including <immintrin.h> should be enough, but Clang's headers helpfully skip +// including these headers when _MSC_VER is defined, like when using clang-cl. +// Include these directly here. +// clang-format off +#include <smmintrin.h> + +#include <avxintrin.h> +// avxintrin defines __m256i and must come before avx2intrin. 
+#include <avx2intrin.h> +#include <f16cintrin.h> +#include <fmaintrin.h> + +#include <avx512fintrin.h> +#include <avx512vlintrin.h> +#include <avx512bwintrin.h> +#include <avx512vlbwintrin.h> +#include <avx512dqintrin.h> +#include <avx512vldqintrin.h> +#include <avx512cdintrin.h> +#include <avx512vlcdintrin.h> + +#if HWY_TARGET <= HWY_AVX3_DL +#include <avx512bitalgintrin.h> +#include <avx512vlbitalgintrin.h> +#include <avx512vbmiintrin.h> +#include <avx512vbmivlintrin.h> +#include <avx512vbmi2intrin.h> +#include <avx512vlvbmi2intrin.h> +#include <avx512vpopcntdqintrin.h> +#include <avx512vpopcntdqvlintrin.h> +#include <avx512vnniintrin.h> +#include <avx512vlvnniintrin.h> +// Must come after avx512fintrin, else will not define 512-bit intrinsics. +#include <vaesintrin.h> +#include <vpclmulqdqintrin.h> +#include <gfniintrin.h> +#endif // HWY_TARGET <= HWY_AVX3_DL +// clang-format on +#endif // HWY_COMPILER_CLANGCL + +// For half-width vectors. Already includes base.h and shared-inl.h. +#include "hwy/ops/x86_256-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +namespace detail { + +template <typename T> +struct Raw512 { + using type = __m512i; +}; +template <> +struct Raw512<float> { + using type = __m512; +}; +template <> +struct Raw512<double> { + using type = __m512d; +}; + +// Template arg: sizeof(lane type) +template <size_t size> +struct RawMask512 {}; +template <> +struct RawMask512<1> { + using type = __mmask64; +}; +template <> +struct RawMask512<2> { + using type = __mmask32; +}; +template <> +struct RawMask512<4> { + using type = __mmask16; +}; +template <> +struct RawMask512<8> { + using type = __mmask8; +}; + +} // namespace detail + +template <typename T> +class Vec512 { + using Raw = typename detail::Raw512<T>::type; + + public: + using PrivateT = T; // only for DFromV + static constexpr size_t kPrivateN = 64 / sizeof(T); // only for DFromV + + // Compound assignment. 
Only usable if there is a corresponding non-member + // binary operator overload. For example, only f32 and f64 support division. + HWY_INLINE Vec512& operator*=(const Vec512 other) { + return *this = (*this * other); + } + HWY_INLINE Vec512& operator/=(const Vec512 other) { + return *this = (*this / other); + } + HWY_INLINE Vec512& operator+=(const Vec512 other) { + return *this = (*this + other); + } + HWY_INLINE Vec512& operator-=(const Vec512 other) { + return *this = (*this - other); + } + HWY_INLINE Vec512& operator&=(const Vec512 other) { + return *this = (*this & other); + } + HWY_INLINE Vec512& operator|=(const Vec512 other) { + return *this = (*this | other); + } + HWY_INLINE Vec512& operator^=(const Vec512 other) { + return *this = (*this ^ other); + } + + Raw raw; +}; + +// Mask register: one bit per lane. +template <typename T> +struct Mask512 { + using Raw = typename detail::RawMask512<sizeof(T)>::type; + Raw raw; +}; + +template <typename T> +using Full512 = Simd<T, 64 / sizeof(T), 0>; + +// ------------------------------ BitCast + +namespace detail { + +HWY_INLINE __m512i BitCastToInteger(__m512i v) { return v; } +HWY_INLINE __m512i BitCastToInteger(__m512 v) { return _mm512_castps_si512(v); } +HWY_INLINE __m512i BitCastToInteger(__m512d v) { + return _mm512_castpd_si512(v); +} + +template <typename T> +HWY_INLINE Vec512<uint8_t> BitCastToByte(Vec512<T> v) { + return Vec512<uint8_t>{BitCastToInteger(v.raw)}; +} + +// Cannot rely on function overloading because return types differ. 
+template <typename T> +struct BitCastFromInteger512 { + HWY_INLINE __m512i operator()(__m512i v) { return v; } +}; +template <> +struct BitCastFromInteger512<float> { + HWY_INLINE __m512 operator()(__m512i v) { return _mm512_castsi512_ps(v); } +}; +template <> +struct BitCastFromInteger512<double> { + HWY_INLINE __m512d operator()(__m512i v) { return _mm512_castsi512_pd(v); } +}; + +template <class D, typename T = TFromD<D>> +HWY_INLINE Vec512<T> BitCastFromByte(D /* tag */, Vec512<uint8_t> v) { + return Vec512<T>{BitCastFromInteger512<T>()(v.raw)}; +} + +} // namespace detail + +template <class D, typename T = TFromD<D>, typename FromT> +HWY_API Vec512<T> BitCast(D d, Vec512<FromT> v) { + return detail::BitCastFromByte(d, detail::BitCastToByte(v)); +} + +// ------------------------------ Set + +template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 1)> +HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) { + return VFromD<D>{_mm512_set1_epi8(static_cast<char>(t))}; // NOLINT +} +template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 2), + HWY_IF_NOT_SPECIAL_FLOAT_D(D)> +HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) { + return VFromD<D>{_mm512_set1_epi16(static_cast<short>(t))}; // NOLINT +} +template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI32_D(D)> +HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) { + return VFromD<D>{_mm512_set1_epi32(static_cast<int>(t))}; +} +template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI64_D(D)> +HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) { + return VFromD<D>{_mm512_set1_epi64(static_cast<long long>(t))}; // NOLINT +} +template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)> +HWY_API Vec512<float> Set(D /* tag */, float t) { + return Vec512<float>{_mm512_set1_ps(t)}; +} +template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)> +HWY_API Vec512<double> Set(D /* tag */, double t) { + return Vec512<double>{_mm512_set1_pd(t)}; +} + +// ------------------------------ Zero (Set) + +// GCC pre-9.1 lacked setzero, so use 
Set instead. +#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 900 + +template <class D, HWY_IF_V_SIZE_D(D, 64)> +HWY_API Vec512<TFromD<D>> Zero(D d) { + return Set(d, TFromD<D>{0}); +} + +#else + +template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT_D(D)> +HWY_API Vec512<TFromD<D>> Zero(D /* tag */) { + return Vec512<TFromD<D>>{_mm512_setzero_si512()}; +} +template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)> +HWY_API Vec512<float> Zero(D /* tag */) { + return Vec512<float>{_mm512_setzero_ps()}; +} +template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)> +HWY_API Vec512<double> Zero(D /* tag */) { + return Vec512<double>{_mm512_setzero_pd()}; +} + +#endif // HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 900 + +// ------------------------------ Undefined + +HWY_DIAGNOSTICS(push) +HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized") + +// Returns a vector with uninitialized elements. +template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT_D(D)> +HWY_API Vec512<TFromD<D>> Undefined(D /* tag */) { + // Available on Clang 6.0, GCC 6.2, ICC 16.03, MSVC 19.14. All but ICC + // generate an XOR instruction. 
+ return Vec512<TFromD<D>>{_mm512_undefined_epi32()}; +} +template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)> +HWY_API Vec512<float> Undefined(D /* tag */) { + return Vec512<float>{_mm512_undefined_ps()}; +} +template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)> +HWY_API Vec512<double> Undefined(D /* tag */) { + return Vec512<double>{_mm512_undefined_pd()}; +} + +HWY_DIAGNOSTICS(pop) + +// ------------------------------ ResizeBitCast + +// 64-byte vector to 16-byte vector +template <class D, class FromV, HWY_IF_V_SIZE_V(FromV, 64), + HWY_IF_V_SIZE_D(D, 16)> +HWY_API VFromD<D> ResizeBitCast(D d, FromV v) { + return BitCast(d, Vec128<uint8_t>{_mm512_castsi512_si128( + BitCast(Full512<uint8_t>(), v).raw)}); +} + +// <= 16-byte vector to 64-byte vector +template <class D, class FromV, HWY_IF_V_SIZE_LE_V(FromV, 16), + HWY_IF_V_SIZE_D(D, 64)> +HWY_API VFromD<D> ResizeBitCast(D d, FromV v) { + return BitCast(d, Vec512<uint8_t>{_mm512_castsi128_si512( + ResizeBitCast(Full128<uint8_t>(), v).raw)}); +} + +// 32-byte vector to 64-byte vector +template <class D, class FromV, HWY_IF_V_SIZE_V(FromV, 32), + HWY_IF_V_SIZE_D(D, 64)> +HWY_API VFromD<D> ResizeBitCast(D d, FromV v) { + return BitCast(d, Vec512<uint8_t>{_mm512_castsi256_si512( + BitCast(Full256<uint8_t>(), v).raw)}); +} + +// ----------------------------- Iota + +namespace detail { + +template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 1)> +HWY_INLINE VFromD<D> Iota0(D d) { +#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 900 + // Missing set_epi8/16. 
+ alignas(64) static constexpr TFromD<D> kIota[64] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63}; + return Load(d, kIota); +#else + (void)d; + return VFromD<D>{_mm512_set_epi8( + static_cast<char>(63), static_cast<char>(62), static_cast<char>(61), + static_cast<char>(60), static_cast<char>(59), static_cast<char>(58), + static_cast<char>(57), static_cast<char>(56), static_cast<char>(55), + static_cast<char>(54), static_cast<char>(53), static_cast<char>(52), + static_cast<char>(51), static_cast<char>(50), static_cast<char>(49), + static_cast<char>(48), static_cast<char>(47), static_cast<char>(46), + static_cast<char>(45), static_cast<char>(44), static_cast<char>(43), + static_cast<char>(42), static_cast<char>(41), static_cast<char>(40), + static_cast<char>(39), static_cast<char>(38), static_cast<char>(37), + static_cast<char>(36), static_cast<char>(35), static_cast<char>(34), + static_cast<char>(33), static_cast<char>(32), static_cast<char>(31), + static_cast<char>(30), static_cast<char>(29), static_cast<char>(28), + static_cast<char>(27), static_cast<char>(26), static_cast<char>(25), + static_cast<char>(24), static_cast<char>(23), static_cast<char>(22), + static_cast<char>(21), static_cast<char>(20), static_cast<char>(19), + static_cast<char>(18), static_cast<char>(17), static_cast<char>(16), + static_cast<char>(15), static_cast<char>(14), static_cast<char>(13), + static_cast<char>(12), static_cast<char>(11), static_cast<char>(10), + static_cast<char>(9), static_cast<char>(8), static_cast<char>(7), + static_cast<char>(6), static_cast<char>(5), static_cast<char>(4), + static_cast<char>(3), static_cast<char>(2), static_cast<char>(1), + static_cast<char>(0))}; +#endif +} + +template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 2), + 
HWY_IF_NOT_SPECIAL_FLOAT_D(D)> +HWY_INLINE VFromD<D> Iota0(D d) { +#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 900 + // Missing set_epi8/16. + alignas(64) static constexpr TFromD<D> kIota[32] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}; + return Load(d, kIota); +#else + (void)d; + return VFromD<D>{_mm512_set_epi16( + int16_t{31}, int16_t{30}, int16_t{29}, int16_t{28}, int16_t{27}, + int16_t{26}, int16_t{25}, int16_t{24}, int16_t{23}, int16_t{22}, + int16_t{21}, int16_t{20}, int16_t{19}, int16_t{18}, int16_t{17}, + int16_t{16}, int16_t{15}, int16_t{14}, int16_t{13}, int16_t{12}, + int16_t{11}, int16_t{10}, int16_t{9}, int16_t{8}, int16_t{7}, int16_t{6}, + int16_t{5}, int16_t{4}, int16_t{3}, int16_t{2}, int16_t{1}, int16_t{0})}; +#endif +} + +template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI32_D(D)> +HWY_INLINE VFromD<D> Iota0(D /*d*/) { + return VFromD<D>{_mm512_set_epi32( + int32_t{15}, int32_t{14}, int32_t{13}, int32_t{12}, int32_t{11}, + int32_t{10}, int32_t{9}, int32_t{8}, int32_t{7}, int32_t{6}, int32_t{5}, + int32_t{4}, int32_t{3}, int32_t{2}, int32_t{1}, int32_t{0})}; +} + +template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI64_D(D)> +HWY_INLINE VFromD<D> Iota0(D /*d*/) { + return VFromD<D>{_mm512_set_epi64(int64_t{7}, int64_t{6}, int64_t{5}, + int64_t{4}, int64_t{3}, int64_t{2}, + int64_t{1}, int64_t{0})}; +} + +template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)> +HWY_INLINE VFromD<D> Iota0(D /*d*/) { + return VFromD<D>{_mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, + 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, + 0.0f)}; +} + +template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)> +HWY_INLINE VFromD<D> Iota0(D /*d*/) { + return VFromD<D>{_mm512_set_pd(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0)}; +} + +} // namespace detail + +template <class D, typename T2, HWY_IF_V_SIZE_D(D, 64)> +HWY_API VFromD<D> Iota(D d, const T2 first) { + 
return detail::Iota0(d) + Set(d, static_cast<TFromD<D>>(first)); +} + +// ================================================== LOGICAL + +// ------------------------------ Not + +template <typename T> +HWY_API Vec512<T> Not(const Vec512<T> v) { + const DFromV<decltype(v)> d; + const RebindToUnsigned<decltype(d)> du; + using VU = VFromD<decltype(du)>; + const __m512i vu = BitCast(du, v).raw; + return BitCast(d, VU{_mm512_ternarylogic_epi32(vu, vu, vu, 0x55)}); +} + +// ------------------------------ And + +template <typename T> +HWY_API Vec512<T> And(const Vec512<T> a, const Vec512<T> b) { + return Vec512<T>{_mm512_and_si512(a.raw, b.raw)}; +} + +HWY_API Vec512<float> And(const Vec512<float> a, const Vec512<float> b) { + return Vec512<float>{_mm512_and_ps(a.raw, b.raw)}; +} +HWY_API Vec512<double> And(const Vec512<double> a, const Vec512<double> b) { + return Vec512<double>{_mm512_and_pd(a.raw, b.raw)}; +} + +// ------------------------------ AndNot + +// Returns ~not_mask & mask. +template <typename T> +HWY_API Vec512<T> AndNot(const Vec512<T> not_mask, const Vec512<T> mask) { + return Vec512<T>{_mm512_andnot_si512(not_mask.raw, mask.raw)}; +} +HWY_API Vec512<float> AndNot(const Vec512<float> not_mask, + const Vec512<float> mask) { + return Vec512<float>{_mm512_andnot_ps(not_mask.raw, mask.raw)}; +} +HWY_API Vec512<double> AndNot(const Vec512<double> not_mask, + const Vec512<double> mask) { + return Vec512<double>{_mm512_andnot_pd(not_mask.raw, mask.raw)}; +} + +// ------------------------------ Or + +template <typename T> +HWY_API Vec512<T> Or(const Vec512<T> a, const Vec512<T> b) { + return Vec512<T>{_mm512_or_si512(a.raw, b.raw)}; +} + +HWY_API Vec512<float> Or(const Vec512<float> a, const Vec512<float> b) { + return Vec512<float>{_mm512_or_ps(a.raw, b.raw)}; +} +HWY_API Vec512<double> Or(const Vec512<double> a, const Vec512<double> b) { + return Vec512<double>{_mm512_or_pd(a.raw, b.raw)}; +} + +// ------------------------------ Xor + +template <typename T> 
+HWY_API Vec512<T> Xor(const Vec512<T> a, const Vec512<T> b) { + return Vec512<T>{_mm512_xor_si512(a.raw, b.raw)}; +} + +HWY_API Vec512<float> Xor(const Vec512<float> a, const Vec512<float> b) { + return Vec512<float>{_mm512_xor_ps(a.raw, b.raw)}; +} +HWY_API Vec512<double> Xor(const Vec512<double> a, const Vec512<double> b) { + return Vec512<double>{_mm512_xor_pd(a.raw, b.raw)}; +} + +// ------------------------------ Xor3 +template <typename T> +HWY_API Vec512<T> Xor3(Vec512<T> x1, Vec512<T> x2, Vec512<T> x3) { + const DFromV<decltype(x1)> d; + const RebindToUnsigned<decltype(d)> du; + using VU = VFromD<decltype(du)>; + const __m512i ret = _mm512_ternarylogic_epi64( + BitCast(du, x1).raw, BitCast(du, x2).raw, BitCast(du, x3).raw, 0x96); + return BitCast(d, VU{ret}); +} + +// ------------------------------ Or3 +template <typename T> +HWY_API Vec512<T> Or3(Vec512<T> o1, Vec512<T> o2, Vec512<T> o3) { + const DFromV<decltype(o1)> d; + const RebindToUnsigned<decltype(d)> du; + using VU = VFromD<decltype(du)>; + const __m512i ret = _mm512_ternarylogic_epi64( + BitCast(du, o1).raw, BitCast(du, o2).raw, BitCast(du, o3).raw, 0xFE); + return BitCast(d, VU{ret}); +} + +// ------------------------------ OrAnd +template <typename T> +HWY_API Vec512<T> OrAnd(Vec512<T> o, Vec512<T> a1, Vec512<T> a2) { + const DFromV<decltype(o)> d; + const RebindToUnsigned<decltype(d)> du; + using VU = VFromD<decltype(du)>; + const __m512i ret = _mm512_ternarylogic_epi64( + BitCast(du, o).raw, BitCast(du, a1).raw, BitCast(du, a2).raw, 0xF8); + return BitCast(d, VU{ret}); +} + +// ------------------------------ IfVecThenElse +template <typename T> +HWY_API Vec512<T> IfVecThenElse(Vec512<T> mask, Vec512<T> yes, Vec512<T> no) { + const DFromV<decltype(yes)> d; + const RebindToUnsigned<decltype(d)> du; + using VU = VFromD<decltype(du)>; + return BitCast(d, VU{_mm512_ternarylogic_epi64(BitCast(du, mask).raw, + BitCast(du, yes).raw, + BitCast(du, no).raw, 0xCA)}); +} + +// 
------------------------------ Operator overloads (internal-only if float) + +template <typename T> +HWY_API Vec512<T> operator&(const Vec512<T> a, const Vec512<T> b) { + return And(a, b); +} + +template <typename T> +HWY_API Vec512<T> operator|(const Vec512<T> a, const Vec512<T> b) { + return Or(a, b); +} + +template <typename T> +HWY_API Vec512<T> operator^(const Vec512<T> a, const Vec512<T> b) { + return Xor(a, b); +} + +// ------------------------------ PopulationCount + +// 8/16 require BITALG, 32/64 require VPOPCNTDQ. +#if HWY_TARGET <= HWY_AVX3_DL + +#ifdef HWY_NATIVE_POPCNT +#undef HWY_NATIVE_POPCNT +#else +#define HWY_NATIVE_POPCNT +#endif + +namespace detail { + +template <typename T> +HWY_INLINE Vec512<T> PopulationCount(hwy::SizeTag<1> /* tag */, Vec512<T> v) { + return Vec512<T>{_mm512_popcnt_epi8(v.raw)}; +} +template <typename T> +HWY_INLINE Vec512<T> PopulationCount(hwy::SizeTag<2> /* tag */, Vec512<T> v) { + return Vec512<T>{_mm512_popcnt_epi16(v.raw)}; +} +template <typename T> +HWY_INLINE Vec512<T> PopulationCount(hwy::SizeTag<4> /* tag */, Vec512<T> v) { + return Vec512<T>{_mm512_popcnt_epi32(v.raw)}; +} +template <typename T> +HWY_INLINE Vec512<T> PopulationCount(hwy::SizeTag<8> /* tag */, Vec512<T> v) { + return Vec512<T>{_mm512_popcnt_epi64(v.raw)}; +} + +} // namespace detail + +template <typename T> +HWY_API Vec512<T> PopulationCount(Vec512<T> v) { + return detail::PopulationCount(hwy::SizeTag<sizeof(T)>(), v); +} + +#endif // HWY_TARGET <= HWY_AVX3_DL + +// ================================================== SIGN + +// ------------------------------ CopySign + +template <typename T> +HWY_API Vec512<T> CopySign(const Vec512<T> magn, const Vec512<T> sign) { + static_assert(IsFloat<T>(), "Only makes sense for floating-point"); + + const DFromV<decltype(magn)> d; + const auto msb = SignBit(d); + + const RebindToUnsigned<decltype(d)> du; + // Truth table for msb, magn, sign | bitwise msb ? 
sign : mag + // 0 0 0 | 0 + // 0 0 1 | 0 + // 0 1 0 | 1 + // 0 1 1 | 1 + // 1 0 0 | 0 + // 1 0 1 | 1 + // 1 1 0 | 0 + // 1 1 1 | 1 + // The lane size does not matter because we are not using predication. + const __m512i out = _mm512_ternarylogic_epi32( + BitCast(du, msb).raw, BitCast(du, magn).raw, BitCast(du, sign).raw, 0xAC); + return BitCast(d, decltype(Zero(du)){out}); +} + +template <typename T> +HWY_API Vec512<T> CopySignToAbs(const Vec512<T> abs, const Vec512<T> sign) { + // AVX3 can also handle abs < 0, so no extra action needed. + return CopySign(abs, sign); +} + +// ================================================== MASK + +// ------------------------------ FirstN + +// Possibilities for constructing a bitmask of N ones: +// - kshift* only consider the lowest byte of the shift count, so they would +// not correctly handle large n. +// - Scalar shifts >= 64 are UB. +// - BZHI has the desired semantics; we assume AVX-512 implies BMI2. However, +// we need 64-bit masks for sizeof(T) == 1, so special-case 32-bit builds. + +#if HWY_ARCH_X86_32 +namespace detail { + +// 32 bit mask is sufficient for lane size >= 2. +template <typename T, HWY_IF_NOT_T_SIZE(T, 1)> +HWY_INLINE Mask512<T> FirstN(size_t n) { + Mask512<T> m; + const uint32_t all = ~uint32_t{0}; + // BZHI only looks at the lower 8 bits of n! + m.raw = static_cast<decltype(m.raw)>((n > 255) ? all : _bzhi_u32(all, n)); + return m; +} + +#if HWY_COMPILER_MSVC >= 1920 || HWY_COMPILER_GCC_ACTUAL >= 900 || \ + HWY_COMPILER_CLANG || HWY_COMPILER_ICC +template <typename T, HWY_IF_T_SIZE(T, 1)> +HWY_INLINE Mask512<T> FirstN(size_t n) { + uint32_t lo_mask; + uint32_t hi_mask; + uint32_t hi_mask_len; +#if HWY_COMPILER_GCC + if (__builtin_constant_p(n >= 32) && n >= 32) { + if (__builtin_constant_p(n >= 64) && n >= 64) { + hi_mask_len = 32u; + } else { + hi_mask_len = ((n <= 287) ? 
static_cast<uint32_t>(n) : 287u) - 32u; + } + lo_mask = hi_mask = 0xFFFFFFFFu; + } else // NOLINT(readability/braces) +#endif + { + const uint32_t lo_mask_len = (n <= 255) ? static_cast<uint32_t>(n) : 255u; + lo_mask = _bzhi_u32(0xFFFFFFFFu, lo_mask_len); + +#if HWY_COMPILER_GCC + if (__builtin_constant_p(lo_mask_len <= 32) && lo_mask_len <= 32) { + return Mask512<T>{static_cast<__mmask64>(lo_mask)}; + } +#endif + + _addcarry_u32(_subborrow_u32(0, lo_mask_len, 32u, &hi_mask_len), + 0xFFFFFFFFu, 0u, &hi_mask); + } + hi_mask = _bzhi_u32(hi_mask, hi_mask_len); +#if HWY_COMPILER_GCC && !HWY_COMPILER_ICC + if (__builtin_constant_p((static_cast<uint64_t>(hi_mask) << 32) | lo_mask)) +#endif + return Mask512<T>{static_cast<__mmask64>( + (static_cast<uint64_t>(hi_mask) << 32) | lo_mask)}; +#if HWY_COMPILER_GCC && !HWY_COMPILER_ICC + else + return Mask512<T>{_mm512_kunpackd(static_cast<__mmask64>(hi_mask), + static_cast<__mmask64>(lo_mask))}; +#endif +} +#else +template <typename T, HWY_IF_T_SIZE(T, 1)> +HWY_INLINE Mask512<T> FirstN(size_t n) { + const uint64_t bits = n < 64 ? ((1ULL << n) - 1) : ~uint64_t{0}; + return Mask512<T>{static_cast<__mmask64>(bits)}; +} +#endif +} // namespace detail +#endif // HWY_ARCH_X86_32 + +template <class D, HWY_IF_V_SIZE_D(D, 64), typename T = TFromD<D>> +HWY_API Mask512<T> FirstN(D /* tag */, size_t n) { +#if HWY_ARCH_X86_64 + Mask512<T> m; + const uint64_t all = ~uint64_t{0}; + // BZHI only looks at the lower 8 bits of n! + m.raw = static_cast<decltype(m.raw)>((n > 255) ? all : _bzhi_u64(all, n)); + return m; +#else + return detail::FirstN<T>(n); +#endif // HWY_ARCH_X86_64 +} + +// ------------------------------ IfThenElse + +// Returns mask ? b : a. + +namespace detail { + +// Templates for signed/unsigned integer of a particular size. 
+template <typename T> +HWY_INLINE Vec512<T> IfThenElse(hwy::SizeTag<1> /* tag */, + const Mask512<T> mask, const Vec512<T> yes, + const Vec512<T> no) { + return Vec512<T>{_mm512_mask_mov_epi8(no.raw, mask.raw, yes.raw)}; +} +template <typename T> +HWY_INLINE Vec512<T> IfThenElse(hwy::SizeTag<2> /* tag */, + const Mask512<T> mask, const Vec512<T> yes, + const Vec512<T> no) { + return Vec512<T>{_mm512_mask_mov_epi16(no.raw, mask.raw, yes.raw)}; +} +template <typename T> +HWY_INLINE Vec512<T> IfThenElse(hwy::SizeTag<4> /* tag */, + const Mask512<T> mask, const Vec512<T> yes, + const Vec512<T> no) { + return Vec512<T>{_mm512_mask_mov_epi32(no.raw, mask.raw, yes.raw)}; +} +template <typename T> +HWY_INLINE Vec512<T> IfThenElse(hwy::SizeTag<8> /* tag */, + const Mask512<T> mask, const Vec512<T> yes, + const Vec512<T> no) { + return Vec512<T>{_mm512_mask_mov_epi64(no.raw, mask.raw, yes.raw)}; +} + +} // namespace detail + +template <typename T> +HWY_API Vec512<T> IfThenElse(const Mask512<T> mask, const Vec512<T> yes, + const Vec512<T> no) { + return detail::IfThenElse(hwy::SizeTag<sizeof(T)>(), mask, yes, no); +} +HWY_API Vec512<float> IfThenElse(const Mask512<float> mask, + const Vec512<float> yes, + const Vec512<float> no) { + return Vec512<float>{_mm512_mask_mov_ps(no.raw, mask.raw, yes.raw)}; +} +HWY_API Vec512<double> IfThenElse(const Mask512<double> mask, + const Vec512<double> yes, + const Vec512<double> no) { + return Vec512<double>{_mm512_mask_mov_pd(no.raw, mask.raw, yes.raw)}; +} + +namespace detail { + +template <typename T> +HWY_INLINE Vec512<T> IfThenElseZero(hwy::SizeTag<1> /* tag */, + const Mask512<T> mask, + const Vec512<T> yes) { + return Vec512<T>{_mm512_maskz_mov_epi8(mask.raw, yes.raw)}; +} +template <typename T> +HWY_INLINE Vec512<T> IfThenElseZero(hwy::SizeTag<2> /* tag */, + const Mask512<T> mask, + const Vec512<T> yes) { + return Vec512<T>{_mm512_maskz_mov_epi16(mask.raw, yes.raw)}; +} +template <typename T> +HWY_INLINE Vec512<T> 
IfThenElseZero(hwy::SizeTag<4> /* tag */, + const Mask512<T> mask, + const Vec512<T> yes) { + return Vec512<T>{_mm512_maskz_mov_epi32(mask.raw, yes.raw)}; +} +template <typename T> +HWY_INLINE Vec512<T> IfThenElseZero(hwy::SizeTag<8> /* tag */, + const Mask512<T> mask, + const Vec512<T> yes) { + return Vec512<T>{_mm512_maskz_mov_epi64(mask.raw, yes.raw)}; +} + +} // namespace detail + +template <typename T> +HWY_API Vec512<T> IfThenElseZero(const Mask512<T> mask, const Vec512<T> yes) { + return detail::IfThenElseZero(hwy::SizeTag<sizeof(T)>(), mask, yes); +} +HWY_API Vec512<float> IfThenElseZero(const Mask512<float> mask, + const Vec512<float> yes) { + return Vec512<float>{_mm512_maskz_mov_ps(mask.raw, yes.raw)}; +} +HWY_API Vec512<double> IfThenElseZero(const Mask512<double> mask, + const Vec512<double> yes) { + return Vec512<double>{_mm512_maskz_mov_pd(mask.raw, yes.raw)}; +} + +namespace detail { + +template <typename T> +HWY_INLINE Vec512<T> IfThenZeroElse(hwy::SizeTag<1> /* tag */, + const Mask512<T> mask, const Vec512<T> no) { + // xor_epi8/16 are missing, but we have sub, which is just as fast for u8/16. 
+ return Vec512<T>{_mm512_mask_sub_epi8(no.raw, mask.raw, no.raw, no.raw)}; +} +template <typename T> +HWY_INLINE Vec512<T> IfThenZeroElse(hwy::SizeTag<2> /* tag */, + const Mask512<T> mask, const Vec512<T> no) { + return Vec512<T>{_mm512_mask_sub_epi16(no.raw, mask.raw, no.raw, no.raw)}; +} +template <typename T> +HWY_INLINE Vec512<T> IfThenZeroElse(hwy::SizeTag<4> /* tag */, + const Mask512<T> mask, const Vec512<T> no) { + return Vec512<T>{_mm512_mask_xor_epi32(no.raw, mask.raw, no.raw, no.raw)}; +} +template <typename T> +HWY_INLINE Vec512<T> IfThenZeroElse(hwy::SizeTag<8> /* tag */, + const Mask512<T> mask, const Vec512<T> no) { + return Vec512<T>{_mm512_mask_xor_epi64(no.raw, mask.raw, no.raw, no.raw)}; +} + +} // namespace detail + +template <typename T> +HWY_API Vec512<T> IfThenZeroElse(const Mask512<T> mask, const Vec512<T> no) { + return detail::IfThenZeroElse(hwy::SizeTag<sizeof(T)>(), mask, no); +} +HWY_API Vec512<float> IfThenZeroElse(const Mask512<float> mask, + const Vec512<float> no) { + return Vec512<float>{_mm512_mask_xor_ps(no.raw, mask.raw, no.raw, no.raw)}; +} +HWY_API Vec512<double> IfThenZeroElse(const Mask512<double> mask, + const Vec512<double> no) { + return Vec512<double>{_mm512_mask_xor_pd(no.raw, mask.raw, no.raw, no.raw)}; +} + +template <typename T> +HWY_API Vec512<T> IfNegativeThenElse(Vec512<T> v, Vec512<T> yes, Vec512<T> no) { + static_assert(IsSigned<T>(), "Only works for signed/float"); + // AVX3 MaskFromVec only looks at the MSB + return IfThenElse(MaskFromVec(v), yes, no); +} + +template <typename T, HWY_IF_FLOAT(T)> +HWY_API Vec512<T> ZeroIfNegative(const Vec512<T> v) { + // AVX3 MaskFromVec only looks at the MSB + return IfThenZeroElse(MaskFromVec(v), v); +} + +// ================================================== ARITHMETIC + +// ------------------------------ Addition + +// Unsigned +HWY_API Vec512<uint8_t> operator+(const Vec512<uint8_t> a, + const Vec512<uint8_t> b) { + return Vec512<uint8_t>{_mm512_add_epi8(a.raw, 
b.raw)}; +} +HWY_API Vec512<uint16_t> operator+(const Vec512<uint16_t> a, + const Vec512<uint16_t> b) { + return Vec512<uint16_t>{_mm512_add_epi16(a.raw, b.raw)}; +} +HWY_API Vec512<uint32_t> operator+(const Vec512<uint32_t> a, + const Vec512<uint32_t> b) { + return Vec512<uint32_t>{_mm512_add_epi32(a.raw, b.raw)}; +} +HWY_API Vec512<uint64_t> operator+(const Vec512<uint64_t> a, + const Vec512<uint64_t> b) { + return Vec512<uint64_t>{_mm512_add_epi64(a.raw, b.raw)}; +} + +// Signed +HWY_API Vec512<int8_t> operator+(const Vec512<int8_t> a, + const Vec512<int8_t> b) { + return Vec512<int8_t>{_mm512_add_epi8(a.raw, b.raw)}; +} +HWY_API Vec512<int16_t> operator+(const Vec512<int16_t> a, + const Vec512<int16_t> b) { + return Vec512<int16_t>{_mm512_add_epi16(a.raw, b.raw)}; +} +HWY_API Vec512<int32_t> operator+(const Vec512<int32_t> a, + const Vec512<int32_t> b) { + return Vec512<int32_t>{_mm512_add_epi32(a.raw, b.raw)}; +} +HWY_API Vec512<int64_t> operator+(const Vec512<int64_t> a, + const Vec512<int64_t> b) { + return Vec512<int64_t>{_mm512_add_epi64(a.raw, b.raw)}; +} + +// Float +HWY_API Vec512<float> operator+(const Vec512<float> a, const Vec512<float> b) { + return Vec512<float>{_mm512_add_ps(a.raw, b.raw)}; +} +HWY_API Vec512<double> operator+(const Vec512<double> a, + const Vec512<double> b) { + return Vec512<double>{_mm512_add_pd(a.raw, b.raw)}; +} + +// ------------------------------ Subtraction + +// Unsigned +HWY_API Vec512<uint8_t> operator-(const Vec512<uint8_t> a, + const Vec512<uint8_t> b) { + return Vec512<uint8_t>{_mm512_sub_epi8(a.raw, b.raw)}; +} +HWY_API Vec512<uint16_t> operator-(const Vec512<uint16_t> a, + const Vec512<uint16_t> b) { + return Vec512<uint16_t>{_mm512_sub_epi16(a.raw, b.raw)}; +} +HWY_API Vec512<uint32_t> operator-(const Vec512<uint32_t> a, + const Vec512<uint32_t> b) { + return Vec512<uint32_t>{_mm512_sub_epi32(a.raw, b.raw)}; +} +HWY_API Vec512<uint64_t> operator-(const Vec512<uint64_t> a, + const Vec512<uint64_t> b) { + return 
Vec512<uint64_t>{_mm512_sub_epi64(a.raw, b.raw)}; +} + +// Signed +HWY_API Vec512<int8_t> operator-(const Vec512<int8_t> a, + const Vec512<int8_t> b) { + return Vec512<int8_t>{_mm512_sub_epi8(a.raw, b.raw)}; +} +HWY_API Vec512<int16_t> operator-(const Vec512<int16_t> a, + const Vec512<int16_t> b) { + return Vec512<int16_t>{_mm512_sub_epi16(a.raw, b.raw)}; +} +HWY_API Vec512<int32_t> operator-(const Vec512<int32_t> a, + const Vec512<int32_t> b) { + return Vec512<int32_t>{_mm512_sub_epi32(a.raw, b.raw)}; +} +HWY_API Vec512<int64_t> operator-(const Vec512<int64_t> a, + const Vec512<int64_t> b) { + return Vec512<int64_t>{_mm512_sub_epi64(a.raw, b.raw)}; +} + +// Float +HWY_API Vec512<float> operator-(const Vec512<float> a, const Vec512<float> b) { + return Vec512<float>{_mm512_sub_ps(a.raw, b.raw)}; +} +HWY_API Vec512<double> operator-(const Vec512<double> a, + const Vec512<double> b) { + return Vec512<double>{_mm512_sub_pd(a.raw, b.raw)}; +} + +// ------------------------------ SumsOf8 +HWY_API Vec512<uint64_t> SumsOf8(const Vec512<uint8_t> v) { + const Full512<uint8_t> d; + return Vec512<uint64_t>{_mm512_sad_epu8(v.raw, Zero(d).raw)}; +} + +HWY_API Vec512<uint64_t> SumsOf8AbsDiff(const Vec512<uint8_t> a, + const Vec512<uint8_t> b) { + return Vec512<uint64_t>{_mm512_sad_epu8(a.raw, b.raw)}; +} + +// ------------------------------ SaturatedAdd + +// Returns a + b clamped to the destination range. 
+ +// Unsigned +HWY_API Vec512<uint8_t> SaturatedAdd(const Vec512<uint8_t> a, + const Vec512<uint8_t> b) { + return Vec512<uint8_t>{_mm512_adds_epu8(a.raw, b.raw)}; +} +HWY_API Vec512<uint16_t> SaturatedAdd(const Vec512<uint16_t> a, + const Vec512<uint16_t> b) { + return Vec512<uint16_t>{_mm512_adds_epu16(a.raw, b.raw)}; +} + +// Signed +HWY_API Vec512<int8_t> SaturatedAdd(const Vec512<int8_t> a, + const Vec512<int8_t> b) { + return Vec512<int8_t>{_mm512_adds_epi8(a.raw, b.raw)}; +} +HWY_API Vec512<int16_t> SaturatedAdd(const Vec512<int16_t> a, + const Vec512<int16_t> b) { + return Vec512<int16_t>{_mm512_adds_epi16(a.raw, b.raw)}; +} + +// ------------------------------ SaturatedSub + +// Returns a - b clamped to the destination range. + +// Unsigned +HWY_API Vec512<uint8_t> SaturatedSub(const Vec512<uint8_t> a, + const Vec512<uint8_t> b) { + return Vec512<uint8_t>{_mm512_subs_epu8(a.raw, b.raw)}; +} +HWY_API Vec512<uint16_t> SaturatedSub(const Vec512<uint16_t> a, + const Vec512<uint16_t> b) { + return Vec512<uint16_t>{_mm512_subs_epu16(a.raw, b.raw)}; +} + +// Signed +HWY_API Vec512<int8_t> SaturatedSub(const Vec512<int8_t> a, + const Vec512<int8_t> b) { + return Vec512<int8_t>{_mm512_subs_epi8(a.raw, b.raw)}; +} +HWY_API Vec512<int16_t> SaturatedSub(const Vec512<int16_t> a, + const Vec512<int16_t> b) { + return Vec512<int16_t>{_mm512_subs_epi16(a.raw, b.raw)}; +} + +// ------------------------------ Average + +// Returns (a + b + 1) / 2 + +// Unsigned +HWY_API Vec512<uint8_t> AverageRound(const Vec512<uint8_t> a, + const Vec512<uint8_t> b) { + return Vec512<uint8_t>{_mm512_avg_epu8(a.raw, b.raw)}; +} +HWY_API Vec512<uint16_t> AverageRound(const Vec512<uint16_t> a, + const Vec512<uint16_t> b) { + return Vec512<uint16_t>{_mm512_avg_epu16(a.raw, b.raw)}; +} + +// ------------------------------ Abs (Sub) + +// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1. 
+HWY_API Vec512<int8_t> Abs(const Vec512<int8_t> v) { +#if HWY_COMPILER_MSVC + // Workaround for incorrect codegen? (untested due to internal compiler error) + const DFromV<decltype(v)> d; + const auto zero = Zero(d); + return Vec512<int8_t>{_mm512_max_epi8(v.raw, (zero - v).raw)}; +#else + return Vec512<int8_t>{_mm512_abs_epi8(v.raw)}; +#endif +} +HWY_API Vec512<int16_t> Abs(const Vec512<int16_t> v) { + return Vec512<int16_t>{_mm512_abs_epi16(v.raw)}; +} +HWY_API Vec512<int32_t> Abs(const Vec512<int32_t> v) { + return Vec512<int32_t>{_mm512_abs_epi32(v.raw)}; +} +HWY_API Vec512<int64_t> Abs(const Vec512<int64_t> v) { + return Vec512<int64_t>{_mm512_abs_epi64(v.raw)}; +} + +// These aren't native instructions, they also involve AND with constant. +HWY_API Vec512<float> Abs(const Vec512<float> v) { + return Vec512<float>{_mm512_abs_ps(v.raw)}; +} +HWY_API Vec512<double> Abs(const Vec512<double> v) { +// Workaround: _mm512_abs_pd expects __m512, so implement it ourselves. +#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 803 + const DFromV<decltype(v)> d; + const RebindToUnsigned<decltype(d)> du; + return And(v, BitCast(d, Set(du, 0x7FFFFFFFFFFFFFFFULL))); +#else + return Vec512<double>{_mm512_abs_pd(v.raw)}; +#endif +} +// ------------------------------ ShiftLeft + +template <int kBits> +HWY_API Vec512<uint16_t> ShiftLeft(const Vec512<uint16_t> v) { + return Vec512<uint16_t>{_mm512_slli_epi16(v.raw, kBits)}; +} + +template <int kBits> +HWY_API Vec512<uint32_t> ShiftLeft(const Vec512<uint32_t> v) { + return Vec512<uint32_t>{_mm512_slli_epi32(v.raw, kBits)}; +} + +template <int kBits> +HWY_API Vec512<uint64_t> ShiftLeft(const Vec512<uint64_t> v) { + return Vec512<uint64_t>{_mm512_slli_epi64(v.raw, kBits)}; +} + +template <int kBits> +HWY_API Vec512<int16_t> ShiftLeft(const Vec512<int16_t> v) { + return Vec512<int16_t>{_mm512_slli_epi16(v.raw, kBits)}; +} + +template <int kBits> +HWY_API Vec512<int32_t> ShiftLeft(const Vec512<int32_t> v) { + return 
Vec512<int32_t>{_mm512_slli_epi32(v.raw, kBits)}; +} + +template <int kBits> +HWY_API Vec512<int64_t> ShiftLeft(const Vec512<int64_t> v) { + return Vec512<int64_t>{_mm512_slli_epi64(v.raw, kBits)}; +} + +template <int kBits, typename T, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec512<T> ShiftLeft(const Vec512<T> v) { + const DFromV<decltype(v)> d8; + const RepartitionToWide<decltype(d8)> d16; + const auto shifted = BitCast(d8, ShiftLeft<kBits>(BitCast(d16, v))); + return kBits == 1 + ? (v + v) + : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF))); +} + +// ------------------------------ ShiftRight + +template <int kBits> +HWY_API Vec512<uint16_t> ShiftRight(const Vec512<uint16_t> v) { + return Vec512<uint16_t>{_mm512_srli_epi16(v.raw, kBits)}; +} + +template <int kBits> +HWY_API Vec512<uint32_t> ShiftRight(const Vec512<uint32_t> v) { + return Vec512<uint32_t>{_mm512_srli_epi32(v.raw, kBits)}; +} + +template <int kBits> +HWY_API Vec512<uint64_t> ShiftRight(const Vec512<uint64_t> v) { + return Vec512<uint64_t>{_mm512_srli_epi64(v.raw, kBits)}; +} + +template <int kBits> +HWY_API Vec512<uint8_t> ShiftRight(const Vec512<uint8_t> v) { + const DFromV<decltype(v)> d8; + // Use raw instead of BitCast to support N=1. 
+ const Vec512<uint8_t> shifted{ShiftRight<kBits>(Vec512<uint16_t>{v.raw}).raw}; + return shifted & Set(d8, 0xFF >> kBits); +} + +template <int kBits> +HWY_API Vec512<int16_t> ShiftRight(const Vec512<int16_t> v) { + return Vec512<int16_t>{_mm512_srai_epi16(v.raw, kBits)}; +} + +template <int kBits> +HWY_API Vec512<int32_t> ShiftRight(const Vec512<int32_t> v) { + return Vec512<int32_t>{_mm512_srai_epi32(v.raw, kBits)}; +} + +template <int kBits> +HWY_API Vec512<int64_t> ShiftRight(const Vec512<int64_t> v) { + return Vec512<int64_t>{_mm512_srai_epi64(v.raw, kBits)}; +} + +template <int kBits> +HWY_API Vec512<int8_t> ShiftRight(const Vec512<int8_t> v) { + const DFromV<decltype(v)> di; + const RebindToUnsigned<decltype(di)> du; + const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v))); + const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits)); + return (shifted ^ shifted_sign) - shifted_sign; +} + +// ------------------------------ RotateRight + +template <int kBits, typename T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))> +HWY_API Vec512<T> RotateRight(const Vec512<T> v) { + constexpr size_t kSizeInBits = sizeof(T) * 8; + static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count"); + if (kBits == 0) return v; + // AVX3 does not support 8/16-bit. 
+ return Or(ShiftRight<kBits>(v), + ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v)); +} + +template <int kBits> +HWY_API Vec512<uint32_t> RotateRight(const Vec512<uint32_t> v) { + static_assert(0 <= kBits && kBits < 32, "Invalid shift count"); + if (kBits == 0) return v; + return Vec512<uint32_t>{_mm512_ror_epi32(v.raw, kBits)}; +} + +template <int kBits> +HWY_API Vec512<uint64_t> RotateRight(const Vec512<uint64_t> v) { + static_assert(0 <= kBits && kBits < 64, "Invalid shift count"); + if (kBits == 0) return v; + return Vec512<uint64_t>{_mm512_ror_epi64(v.raw, kBits)}; +} + +// ------------------------------ ShiftLeftSame + +// GCC and older Clang do not follow the Intel documentation for AVX-512 +// shift-with-immediate: the counts should all be unsigned int. +#if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1100 +using Shift16Count = int; +using Shift3264Count = int; +#elif HWY_COMPILER_GCC_ACTUAL +// GCC 11.0 requires these, prior versions used a macro+cast and don't care. +using Shift16Count = int; +using Shift3264Count = unsigned int; +#else +// Assume documented behavior. Clang 11 and MSVC 14.28.29910 match this. 
+using Shift16Count = unsigned int; +using Shift3264Count = unsigned int; +#endif + +HWY_API Vec512<uint16_t> ShiftLeftSame(const Vec512<uint16_t> v, + const int bits) { +#if HWY_COMPILER_GCC + if (__builtin_constant_p(bits)) { + return Vec512<uint16_t>{ + _mm512_slli_epi16(v.raw, static_cast<Shift16Count>(bits))}; + } +#endif + return Vec512<uint16_t>{_mm512_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))}; +} +HWY_API Vec512<uint32_t> ShiftLeftSame(const Vec512<uint32_t> v, + const int bits) { +#if HWY_COMPILER_GCC + if (__builtin_constant_p(bits)) { + return Vec512<uint32_t>{ + _mm512_slli_epi32(v.raw, static_cast<Shift3264Count>(bits))}; + } +#endif + return Vec512<uint32_t>{_mm512_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))}; +} +HWY_API Vec512<uint64_t> ShiftLeftSame(const Vec512<uint64_t> v, + const int bits) { +#if HWY_COMPILER_GCC + if (__builtin_constant_p(bits)) { + return Vec512<uint64_t>{ + _mm512_slli_epi64(v.raw, static_cast<Shift3264Count>(bits))}; + } +#endif + return Vec512<uint64_t>{_mm512_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))}; +} + +HWY_API Vec512<int16_t> ShiftLeftSame(const Vec512<int16_t> v, const int bits) { +#if HWY_COMPILER_GCC + if (__builtin_constant_p(bits)) { + return Vec512<int16_t>{ + _mm512_slli_epi16(v.raw, static_cast<Shift16Count>(bits))}; + } +#endif + return Vec512<int16_t>{_mm512_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))}; +} + +HWY_API Vec512<int32_t> ShiftLeftSame(const Vec512<int32_t> v, const int bits) { +#if HWY_COMPILER_GCC + if (__builtin_constant_p(bits)) { + return Vec512<int32_t>{ + _mm512_slli_epi32(v.raw, static_cast<Shift3264Count>(bits))}; + } +#endif + return Vec512<int32_t>{_mm512_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))}; +} + +HWY_API Vec512<int64_t> ShiftLeftSame(const Vec512<int64_t> v, const int bits) { +#if HWY_COMPILER_GCC + if (__builtin_constant_p(bits)) { + return Vec512<int64_t>{ + _mm512_slli_epi64(v.raw, static_cast<Shift3264Count>(bits))}; + } +#endif + return Vec512<int64_t>{_mm512_sll_epi64(v.raw, 
_mm_cvtsi32_si128(bits))}; +} + +template <typename T, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec512<T> ShiftLeftSame(const Vec512<T> v, const int bits) { + const DFromV<decltype(v)> d8; + const RepartitionToWide<decltype(d8)> d16; + const auto shifted = BitCast(d8, ShiftLeftSame(BitCast(d16, v), bits)); + return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF)); +} + +// ------------------------------ ShiftRightSame + +HWY_API Vec512<uint16_t> ShiftRightSame(const Vec512<uint16_t> v, + const int bits) { +#if HWY_COMPILER_GCC + if (__builtin_constant_p(bits)) { + return Vec512<uint16_t>{ + _mm512_srli_epi16(v.raw, static_cast<Shift16Count>(bits))}; + } +#endif + return Vec512<uint16_t>{_mm512_srl_epi16(v.raw, _mm_cvtsi32_si128(bits))}; +} +HWY_API Vec512<uint32_t> ShiftRightSame(const Vec512<uint32_t> v, + const int bits) { +#if HWY_COMPILER_GCC + if (__builtin_constant_p(bits)) { + return Vec512<uint32_t>{ + _mm512_srli_epi32(v.raw, static_cast<Shift3264Count>(bits))}; + } +#endif + return Vec512<uint32_t>{_mm512_srl_epi32(v.raw, _mm_cvtsi32_si128(bits))}; +} +HWY_API Vec512<uint64_t> ShiftRightSame(const Vec512<uint64_t> v, + const int bits) { +#if HWY_COMPILER_GCC + if (__builtin_constant_p(bits)) { + return Vec512<uint64_t>{ + _mm512_srli_epi64(v.raw, static_cast<Shift3264Count>(bits))}; + } +#endif + return Vec512<uint64_t>{_mm512_srl_epi64(v.raw, _mm_cvtsi32_si128(bits))}; +} + +HWY_API Vec512<uint8_t> ShiftRightSame(Vec512<uint8_t> v, const int bits) { + const DFromV<decltype(v)> d8; + const RepartitionToWide<decltype(d8)> d16; + const auto shifted = BitCast(d8, ShiftRightSame(BitCast(d16, v), bits)); + return shifted & Set(d8, static_cast<uint8_t>(0xFF >> bits)); +} + +HWY_API Vec512<int16_t> ShiftRightSame(const Vec512<int16_t> v, + const int bits) { +#if HWY_COMPILER_GCC + if (__builtin_constant_p(bits)) { + return Vec512<int16_t>{ + _mm512_srai_epi16(v.raw, static_cast<Shift16Count>(bits))}; + } +#endif + return Vec512<int16_t>{_mm512_sra_epi16(v.raw, 
_mm_cvtsi32_si128(bits))}; +} + +HWY_API Vec512<int32_t> ShiftRightSame(const Vec512<int32_t> v, + const int bits) { +#if HWY_COMPILER_GCC + if (__builtin_constant_p(bits)) { + return Vec512<int32_t>{ + _mm512_srai_epi32(v.raw, static_cast<Shift3264Count>(bits))}; + } +#endif + return Vec512<int32_t>{_mm512_sra_epi32(v.raw, _mm_cvtsi32_si128(bits))}; +} +HWY_API Vec512<int64_t> ShiftRightSame(const Vec512<int64_t> v, + const int bits) { +#if HWY_COMPILER_GCC + if (__builtin_constant_p(bits)) { + return Vec512<int64_t>{ + _mm512_srai_epi64(v.raw, static_cast<Shift3264Count>(bits))}; + } +#endif + return Vec512<int64_t>{_mm512_sra_epi64(v.raw, _mm_cvtsi32_si128(bits))}; +} + +HWY_API Vec512<int8_t> ShiftRightSame(Vec512<int8_t> v, const int bits) { + const DFromV<decltype(v)> di; + const RebindToUnsigned<decltype(di)> du; + const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits)); + const auto shifted_sign = + BitCast(di, Set(du, static_cast<uint8_t>(0x80 >> bits))); + return (shifted ^ shifted_sign) - shifted_sign; +} + +// ------------------------------ Minimum + +// Unsigned +HWY_API Vec512<uint8_t> Min(const Vec512<uint8_t> a, const Vec512<uint8_t> b) { + return Vec512<uint8_t>{_mm512_min_epu8(a.raw, b.raw)}; +} +HWY_API Vec512<uint16_t> Min(const Vec512<uint16_t> a, + const Vec512<uint16_t> b) { + return Vec512<uint16_t>{_mm512_min_epu16(a.raw, b.raw)}; +} +HWY_API Vec512<uint32_t> Min(const Vec512<uint32_t> a, + const Vec512<uint32_t> b) { + return Vec512<uint32_t>{_mm512_min_epu32(a.raw, b.raw)}; +} +HWY_API Vec512<uint64_t> Min(const Vec512<uint64_t> a, + const Vec512<uint64_t> b) { + return Vec512<uint64_t>{_mm512_min_epu64(a.raw, b.raw)}; +} + +// Signed +HWY_API Vec512<int8_t> Min(const Vec512<int8_t> a, const Vec512<int8_t> b) { + return Vec512<int8_t>{_mm512_min_epi8(a.raw, b.raw)}; +} +HWY_API Vec512<int16_t> Min(const Vec512<int16_t> a, const Vec512<int16_t> b) { + return Vec512<int16_t>{_mm512_min_epi16(a.raw, b.raw)}; +} +HWY_API 
Vec512<int32_t> Min(const Vec512<int32_t> a, const Vec512<int32_t> b) { + return Vec512<int32_t>{_mm512_min_epi32(a.raw, b.raw)}; +} +HWY_API Vec512<int64_t> Min(const Vec512<int64_t> a, const Vec512<int64_t> b) { + return Vec512<int64_t>{_mm512_min_epi64(a.raw, b.raw)}; +} + +// Float +HWY_API Vec512<float> Min(const Vec512<float> a, const Vec512<float> b) { + return Vec512<float>{_mm512_min_ps(a.raw, b.raw)}; +} +HWY_API Vec512<double> Min(const Vec512<double> a, const Vec512<double> b) { + return Vec512<double>{_mm512_min_pd(a.raw, b.raw)}; +} + +// ------------------------------ Maximum + +// Unsigned +HWY_API Vec512<uint8_t> Max(const Vec512<uint8_t> a, const Vec512<uint8_t> b) { + return Vec512<uint8_t>{_mm512_max_epu8(a.raw, b.raw)}; +} +HWY_API Vec512<uint16_t> Max(const Vec512<uint16_t> a, + const Vec512<uint16_t> b) { + return Vec512<uint16_t>{_mm512_max_epu16(a.raw, b.raw)}; +} +HWY_API Vec512<uint32_t> Max(const Vec512<uint32_t> a, + const Vec512<uint32_t> b) { + return Vec512<uint32_t>{_mm512_max_epu32(a.raw, b.raw)}; +} +HWY_API Vec512<uint64_t> Max(const Vec512<uint64_t> a, + const Vec512<uint64_t> b) { + return Vec512<uint64_t>{_mm512_max_epu64(a.raw, b.raw)}; +} + +// Signed +HWY_API Vec512<int8_t> Max(const Vec512<int8_t> a, const Vec512<int8_t> b) { + return Vec512<int8_t>{_mm512_max_epi8(a.raw, b.raw)}; +} +HWY_API Vec512<int16_t> Max(const Vec512<int16_t> a, const Vec512<int16_t> b) { + return Vec512<int16_t>{_mm512_max_epi16(a.raw, b.raw)}; +} +HWY_API Vec512<int32_t> Max(const Vec512<int32_t> a, const Vec512<int32_t> b) { + return Vec512<int32_t>{_mm512_max_epi32(a.raw, b.raw)}; +} +HWY_API Vec512<int64_t> Max(const Vec512<int64_t> a, const Vec512<int64_t> b) { + return Vec512<int64_t>{_mm512_max_epi64(a.raw, b.raw)}; +} + +// Float +HWY_API Vec512<float> Max(const Vec512<float> a, const Vec512<float> b) { + return Vec512<float>{_mm512_max_ps(a.raw, b.raw)}; +} +HWY_API Vec512<double> Max(const Vec512<double> a, const Vec512<double> b) { + 
return Vec512<double>{_mm512_max_pd(a.raw, b.raw)}; +} + +// ------------------------------ Integer multiplication + +// Per-target flag to prevent generic_ops-inl.h from defining 64-bit operator*. +#ifdef HWY_NATIVE_MUL_64 +#undef HWY_NATIVE_MUL_64 +#else +#define HWY_NATIVE_MUL_64 +#endif + +// Unsigned +HWY_API Vec512<uint16_t> operator*(Vec512<uint16_t> a, Vec512<uint16_t> b) { + return Vec512<uint16_t>{_mm512_mullo_epi16(a.raw, b.raw)}; +} +HWY_API Vec512<uint32_t> operator*(Vec512<uint32_t> a, Vec512<uint32_t> b) { + return Vec512<uint32_t>{_mm512_mullo_epi32(a.raw, b.raw)}; +} +HWY_API Vec512<uint64_t> operator*(Vec512<uint64_t> a, Vec512<uint64_t> b) { + return Vec512<uint64_t>{_mm512_mullo_epi64(a.raw, b.raw)}; +} +HWY_API Vec256<uint64_t> operator*(Vec256<uint64_t> a, Vec256<uint64_t> b) { + return Vec256<uint64_t>{_mm256_mullo_epi64(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<uint64_t, N> operator*(Vec128<uint64_t, N> a, + Vec128<uint64_t, N> b) { + return Vec128<uint64_t, N>{_mm_mullo_epi64(a.raw, b.raw)}; +} + +// Signed +HWY_API Vec512<int16_t> operator*(Vec512<int16_t> a, Vec512<int16_t> b) { + return Vec512<int16_t>{_mm512_mullo_epi16(a.raw, b.raw)}; +} +HWY_API Vec512<int32_t> operator*(Vec512<int32_t> a, Vec512<int32_t> b) { + return Vec512<int32_t>{_mm512_mullo_epi32(a.raw, b.raw)}; +} +HWY_API Vec512<int64_t> operator*(Vec512<int64_t> a, Vec512<int64_t> b) { + return Vec512<int64_t>{_mm512_mullo_epi64(a.raw, b.raw)}; +} +HWY_API Vec256<int64_t> operator*(Vec256<int64_t> a, Vec256<int64_t> b) { + return Vec256<int64_t>{_mm256_mullo_epi64(a.raw, b.raw)}; +} +template <size_t N> +HWY_API Vec128<int64_t, N> operator*(Vec128<int64_t, N> a, + Vec128<int64_t, N> b) { + return Vec128<int64_t, N>{_mm_mullo_epi64(a.raw, b.raw)}; +} +// Returns the upper 16 bits of a * b in each lane. 
+HWY_API Vec512<uint16_t> MulHigh(Vec512<uint16_t> a, Vec512<uint16_t> b) { + return Vec512<uint16_t>{_mm512_mulhi_epu16(a.raw, b.raw)}; +} +HWY_API Vec512<int16_t> MulHigh(Vec512<int16_t> a, Vec512<int16_t> b) { + return Vec512<int16_t>{_mm512_mulhi_epi16(a.raw, b.raw)}; +} + +HWY_API Vec512<int16_t> MulFixedPoint15(Vec512<int16_t> a, Vec512<int16_t> b) { + return Vec512<int16_t>{_mm512_mulhrs_epi16(a.raw, b.raw)}; +} + +// Multiplies even lanes (0, 2 ..) and places the double-wide result into +// even and the upper half into its odd neighbor lane. +// Takes 32-bit lanes and returns 64-bit lanes (signed or unsigned). +HWY_API Vec512<int64_t> MulEven(Vec512<int32_t> a, Vec512<int32_t> b) { + return Vec512<int64_t>{_mm512_mul_epi32(a.raw, b.raw)}; +} +HWY_API Vec512<uint64_t> MulEven(Vec512<uint32_t> a, Vec512<uint32_t> b) { + return Vec512<uint64_t>{_mm512_mul_epu32(a.raw, b.raw)}; +} + +// ------------------------------ Neg (Sub) + +// Floating-point lanes: toggles the sign bit by Xor-ing with SignBit(d). +template <typename T, HWY_IF_FLOAT(T)> +HWY_API Vec512<T> Neg(const Vec512<T> v) { + const DFromV<decltype(v)> d; + return Xor(v, SignBit(d)); +} + +// All other lane types: computes Zero(d) - v. +template <typename T, HWY_IF_NOT_FLOAT(T)> +HWY_API Vec512<T> Neg(const Vec512<T> v) { + const DFromV<decltype(v)> d; + return Zero(d) - v; +} + +// ------------------------------ Floating-point mul / div + +HWY_API Vec512<float> operator*(const Vec512<float> a, const Vec512<float> b) { + return Vec512<float>{_mm512_mul_ps(a.raw, b.raw)}; +} +HWY_API Vec512<double> operator*(const Vec512<double> a, + const Vec512<double> b) { + return Vec512<double>{_mm512_mul_pd(a.raw, b.raw)}; +} + +HWY_API Vec512<float> operator/(const Vec512<float> a, const Vec512<float> b) { + return Vec512<float>{_mm512_div_ps(a.raw, b.raw)}; +} +HWY_API Vec512<double> operator/(const Vec512<double> a, + const Vec512<double> b) { + return Vec512<double>{_mm512_div_pd(a.raw, b.raw)}; +} + +// Approximate reciprocal (float lanes only), via _mm512_rcp14_ps. +HWY_API Vec512<float> ApproximateReciprocal(const Vec512<float> v) { + return Vec512<float>{_mm512_rcp14_ps(v.raw)}; +} + +// Absolute value of difference. 
+HWY_API Vec512<float> AbsDiff(const Vec512<float> a, const Vec512<float> b) { + return Abs(a - b); +} + +// ------------------------------ Floating-point multiply-add variants +// Each variant forwards its three operands, in order, to one +// _mm512_f[n]m{add,sub}_{ps,pd} intrinsic. + +// Returns mul * x + add +HWY_API Vec512<float> MulAdd(const Vec512<float> mul, const Vec512<float> x, + const Vec512<float> add) { + return Vec512<float>{_mm512_fmadd_ps(mul.raw, x.raw, add.raw)}; +} +HWY_API Vec512<double> MulAdd(const Vec512<double> mul, const Vec512<double> x, + const Vec512<double> add) { + return Vec512<double>{_mm512_fmadd_pd(mul.raw, x.raw, add.raw)}; +} + +// Returns add - mul * x +HWY_API Vec512<float> NegMulAdd(const Vec512<float> mul, const Vec512<float> x, + const Vec512<float> add) { + return Vec512<float>{_mm512_fnmadd_ps(mul.raw, x.raw, add.raw)}; +} +HWY_API Vec512<double> NegMulAdd(const Vec512<double> mul, + const Vec512<double> x, + const Vec512<double> add) { + return Vec512<double>{_mm512_fnmadd_pd(mul.raw, x.raw, add.raw)}; +} + +// Returns mul * x - sub +HWY_API Vec512<float> MulSub(const Vec512<float> mul, const Vec512<float> x, + const Vec512<float> sub) { + return Vec512<float>{_mm512_fmsub_ps(mul.raw, x.raw, sub.raw)}; +} +HWY_API Vec512<double> MulSub(const Vec512<double> mul, const Vec512<double> x, + const Vec512<double> sub) { + return Vec512<double>{_mm512_fmsub_pd(mul.raw, x.raw, sub.raw)}; +} + +// Returns -mul * x - sub +HWY_API Vec512<float> NegMulSub(const Vec512<float> mul, const Vec512<float> x, + const Vec512<float> sub) { + return Vec512<float>{_mm512_fnmsub_ps(mul.raw, x.raw, sub.raw)}; +} +HWY_API Vec512<double> NegMulSub(const Vec512<double> mul, + const Vec512<double> x, + const Vec512<double> sub) { + return Vec512<double>{_mm512_fnmsub_pd(mul.raw, x.raw, sub.raw)}; +} + +// ------------------------------ Floating-point square root + +// Full precision square root +HWY_API Vec512<float> Sqrt(const Vec512<float> v) { + return Vec512<float>{_mm512_sqrt_ps(v.raw)}; +} +HWY_API Vec512<double> Sqrt(const Vec512<double> v) { + 
return Vec512<double>{_mm512_sqrt_pd(v.raw)}; +} + +// Approximate reciprocal square root +HWY_API Vec512<float> ApproximateReciprocalSqrt(const Vec512<float> v) { + return Vec512<float>{_mm512_rsqrt14_ps(v.raw)}; +} + +// ------------------------------ Floating-point rounding + +// Work around warnings in the intrinsic definitions (passing -1 as a mask). +HWY_DIAGNOSTICS(push) +HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") + +// Toward nearest integer, tie to even +HWY_API Vec512<float> Round(const Vec512<float> v) { + return Vec512<float>{_mm512_roundscale_ps( + v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)}; +} +HWY_API Vec512<double> Round(const Vec512<double> v) { + return Vec512<double>{_mm512_roundscale_pd( + v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)}; +} + +// Toward zero, aka truncate +HWY_API Vec512<float> Trunc(const Vec512<float> v) { + return Vec512<float>{ + _mm512_roundscale_ps(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)}; +} +HWY_API Vec512<double> Trunc(const Vec512<double> v) { + return Vec512<double>{ + _mm512_roundscale_pd(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)}; +} + +// Toward +infinity, aka ceiling +HWY_API Vec512<float> Ceil(const Vec512<float> v) { + return Vec512<float>{ + _mm512_roundscale_ps(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)}; +} +HWY_API Vec512<double> Ceil(const Vec512<double> v) { + return Vec512<double>{ + _mm512_roundscale_pd(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)}; +} + +// Toward -infinity, aka floor +HWY_API Vec512<float> Floor(const Vec512<float> v) { + return Vec512<float>{ + _mm512_roundscale_ps(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)}; +} +HWY_API Vec512<double> Floor(const Vec512<double> v) { + return Vec512<double>{ + _mm512_roundscale_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)}; +} + +HWY_DIAGNOSTICS(pop) + +// ================================================== COMPARE + +// Comparisons set a mask bit to 1 if the condition 
is true, else 0. + +template <typename TFrom, class DTo, typename TTo = TFromD<DTo>> +HWY_API Mask512<TTo> RebindMask(DTo /*tag*/, Mask512<TFrom> m) { + static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size"); + return Mask512<TTo>{m.raw}; +} + +namespace detail { + +template <typename T> +HWY_INLINE Mask512<T> TestBit(hwy::SizeTag<1> /*tag*/, const Vec512<T> v, + const Vec512<T> bit) { + return Mask512<T>{_mm512_test_epi8_mask(v.raw, bit.raw)}; +} +template <typename T> +HWY_INLINE Mask512<T> TestBit(hwy::SizeTag<2> /*tag*/, const Vec512<T> v, + const Vec512<T> bit) { + return Mask512<T>{_mm512_test_epi16_mask(v.raw, bit.raw)}; +} +template <typename T> +HWY_INLINE Mask512<T> TestBit(hwy::SizeTag<4> /*tag*/, const Vec512<T> v, + const Vec512<T> bit) { + return Mask512<T>{_mm512_test_epi32_mask(v.raw, bit.raw)}; +} +template <typename T> +HWY_INLINE Mask512<T> TestBit(hwy::SizeTag<8> /*tag*/, const Vec512<T> v, + const Vec512<T> bit) { + return Mask512<T>{_mm512_test_epi64_mask(v.raw, bit.raw)}; +} + +} // namespace detail + +// Integer lanes only (enforced by the static_assert). Dispatches on sizeof(T) +// to the detail:: overload wrapping the matching _mm512_test_epi*_mask. +template <typename T> +HWY_API Mask512<T> TestBit(const Vec512<T> v, const Vec512<T> bit) { + static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported"); + return detail::TestBit(hwy::SizeTag<sizeof(T)>(), v, bit); +} + +// ------------------------------ Equality +// Integer overloads are selected by lane size; float and double have +// dedicated overloads using the _CMP_EQ_OQ predicate. + +template <typename T, HWY_IF_T_SIZE(T, 1)> +HWY_API Mask512<T> operator==(Vec512<T> a, Vec512<T> b) { + return Mask512<T>{_mm512_cmpeq_epi8_mask(a.raw, b.raw)}; +} +template <typename T, HWY_IF_T_SIZE(T, 2)> +HWY_API Mask512<T> operator==(Vec512<T> a, Vec512<T> b) { + return Mask512<T>{_mm512_cmpeq_epi16_mask(a.raw, b.raw)}; +} +template <typename T, HWY_IF_UI32(T)> +HWY_API Mask512<T> operator==(Vec512<T> a, Vec512<T> b) { + return Mask512<T>{_mm512_cmpeq_epi32_mask(a.raw, b.raw)}; +} +template <typename T, HWY_IF_UI64(T)> +HWY_API Mask512<T> operator==(Vec512<T> a, Vec512<T> b) { + return Mask512<T>{_mm512_cmpeq_epi64_mask(a.raw, b.raw)}; +} + +HWY_API 
Mask512<float> operator==(Vec512<float> a, Vec512<float> b) { + return Mask512<float>{_mm512_cmp_ps_mask(a.raw, b.raw, _CMP_EQ_OQ)}; +} + +HWY_API Mask512<double> operator==(Vec512<double> a, Vec512<double> b) { + return Mask512<double>{_mm512_cmp_pd_mask(a.raw, b.raw, _CMP_EQ_OQ)}; +} + +// ------------------------------ Inequality + +template <typename T, HWY_IF_T_SIZE(T, 1)> +HWY_API Mask512<T> operator!=(Vec512<T> a, Vec512<T> b) { + return Mask512<T>{_mm512_cmpneq_epi8_mask(a.raw, b.raw)}; +} +template <typename T, HWY_IF_T_SIZE(T, 2)> +HWY_API Mask512<T> operator!=(Vec512<T> a, Vec512<T> b) { + return Mask512<T>{_mm512_cmpneq_epi16_mask(a.raw, b.raw)}; +} +template <typename T, HWY_IF_UI32(T)> +HWY_API Mask512<T> operator!=(Vec512<T> a, Vec512<T> b) { + return Mask512<T>{_mm512_cmpneq_epi32_mask(a.raw, b.raw)}; +} +template <typename T, HWY_IF_UI64(T)> +HWY_API Mask512<T> operator!=(Vec512<T> a, Vec512<T> b) { + return Mask512<T>{_mm512_cmpneq_epi64_mask(a.raw, b.raw)}; +} + +// NOTE(review): _CMP_NEQ_OQ is the *ordered* not-equal predicate, so a lane +// with a NaN operand presumably yields false here, unlike C++ `!=` (which +// would correspond to _CMP_NEQ_UQ). Confirm this is intended. +HWY_API Mask512<float> operator!=(Vec512<float> a, Vec512<float> b) { + return Mask512<float>{_mm512_cmp_ps_mask(a.raw, b.raw, _CMP_NEQ_OQ)}; +} + +HWY_API Mask512<double> operator!=(Vec512<double> a, Vec512<double> b) { + return Mask512<double>{_mm512_cmp_pd_mask(a.raw, b.raw, _CMP_NEQ_OQ)}; +} + +// ------------------------------ Strict inequality + +HWY_API Mask512<uint8_t> operator>(Vec512<uint8_t> a, Vec512<uint8_t> b) { + return Mask512<uint8_t>{_mm512_cmpgt_epu8_mask(a.raw, b.raw)}; +} +HWY_API Mask512<uint16_t> operator>(Vec512<uint16_t> a, Vec512<uint16_t> b) { + return Mask512<uint16_t>{_mm512_cmpgt_epu16_mask(a.raw, b.raw)}; +} +HWY_API Mask512<uint32_t> operator>(Vec512<uint32_t> a, Vec512<uint32_t> b) { + return Mask512<uint32_t>{_mm512_cmpgt_epu32_mask(a.raw, b.raw)}; +} +HWY_API Mask512<uint64_t> operator>(Vec512<uint64_t> a, Vec512<uint64_t> b) { + return Mask512<uint64_t>{_mm512_cmpgt_epu64_mask(a.raw, b.raw)}; +} + +HWY_API Mask512<int8_t> operator>(Vec512<int8_t> a, 
Vec512<int8_t> b) { + return Mask512<int8_t>{_mm512_cmpgt_epi8_mask(a.raw, b.raw)}; +} +HWY_API Mask512<int16_t> operator>(Vec512<int16_t> a, Vec512<int16_t> b) { + return Mask512<int16_t>{_mm512_cmpgt_epi16_mask(a.raw, b.raw)}; +} +HWY_API Mask512<int32_t> operator>(Vec512<int32_t> a, Vec512<int32_t> b) { + return Mask512<int32_t>{_mm512_cmpgt_epi32_mask(a.raw, b.raw)}; +} +HWY_API Mask512<int64_t> operator>(Vec512<int64_t> a, Vec512<int64_t> b) { + return Mask512<int64_t>{_mm512_cmpgt_epi64_mask(a.raw, b.raw)}; +} + +HWY_API Mask512<float> operator>(Vec512<float> a, Vec512<float> b) { + return Mask512<float>{_mm512_cmp_ps_mask(a.raw, b.raw, _CMP_GT_OQ)}; +} +HWY_API Mask512<double> operator>(Vec512<double> a, Vec512<double> b) { + return Mask512<double>{_mm512_cmp_pd_mask(a.raw, b.raw, _CMP_GT_OQ)}; +} + +// ------------------------------ Weak inequality + +HWY_API Mask512<float> operator>=(Vec512<float> a, Vec512<float> b) { + return Mask512<float>{_mm512_cmp_ps_mask(a.raw, b.raw, _CMP_GE_OQ)}; +} +HWY_API Mask512<double> operator>=(Vec512<double> a, Vec512<double> b) { + return Mask512<double>{_mm512_cmp_pd_mask(a.raw, b.raw, _CMP_GE_OQ)}; +} + +HWY_API Mask512<uint8_t> operator>=(Vec512<uint8_t> a, Vec512<uint8_t> b) { + return Mask512<uint8_t>{_mm512_cmpge_epu8_mask(a.raw, b.raw)}; +} +HWY_API Mask512<uint16_t> operator>=(Vec512<uint16_t> a, Vec512<uint16_t> b) { + return Mask512<uint16_t>{_mm512_cmpge_epu16_mask(a.raw, b.raw)}; +} +HWY_API Mask512<uint32_t> operator>=(Vec512<uint32_t> a, Vec512<uint32_t> b) { + return Mask512<uint32_t>{_mm512_cmpge_epu32_mask(a.raw, b.raw)}; +} +HWY_API Mask512<uint64_t> operator>=(Vec512<uint64_t> a, Vec512<uint64_t> b) { + return Mask512<uint64_t>{_mm512_cmpge_epu64_mask(a.raw, b.raw)}; +} + +HWY_API Mask512<int8_t> operator>=(Vec512<int8_t> a, Vec512<int8_t> b) { + return Mask512<int8_t>{_mm512_cmpge_epi8_mask(a.raw, b.raw)}; +} +HWY_API Mask512<int16_t> operator>=(Vec512<int16_t> a, Vec512<int16_t> b) { + return 
Mask512<int16_t>{_mm512_cmpge_epi16_mask(a.raw, b.raw)}; +} +HWY_API Mask512<int32_t> operator>=(Vec512<int32_t> a, Vec512<int32_t> b) { + return Mask512<int32_t>{_mm512_cmpge_epi32_mask(a.raw, b.raw)}; +} +HWY_API Mask512<int64_t> operator>=(Vec512<int64_t> a, Vec512<int64_t> b) { + return Mask512<int64_t>{_mm512_cmpge_epi64_mask(a.raw, b.raw)}; +} + +// ------------------------------ Reversed comparisons +// a < b and a <= b are implemented by swapping the operands of > and >=. + +template <typename T> +HWY_API Mask512<T> operator<(Vec512<T> a, Vec512<T> b) { + return b > a; +} + +template <typename T> +HWY_API Mask512<T> operator<=(Vec512<T> a, Vec512<T> b) { + return b >= a; +} + +// ------------------------------ Mask +// MaskFromVec dispatches on sizeof(T) to the _mm512_movepi*_mask intrinsic of +// matching lane width. Presumably this takes the most-significant bit of each +// lane - confirm against the intrinsic documentation. + +namespace detail { + +template <typename T> +HWY_INLINE Mask512<T> MaskFromVec(hwy::SizeTag<1> /*tag*/, const Vec512<T> v) { + return Mask512<T>{_mm512_movepi8_mask(v.raw)}; +} +template <typename T> +HWY_INLINE Mask512<T> MaskFromVec(hwy::SizeTag<2> /*tag*/, const Vec512<T> v) { + return Mask512<T>{_mm512_movepi16_mask(v.raw)}; +} +template <typename T> +HWY_INLINE Mask512<T> MaskFromVec(hwy::SizeTag<4> /*tag*/, const Vec512<T> v) { + return Mask512<T>{_mm512_movepi32_mask(v.raw)}; +} +template <typename T> +HWY_INLINE Mask512<T> MaskFromVec(hwy::SizeTag<8> /*tag*/, const Vec512<T> v) { + return Mask512<T>{_mm512_movepi64_mask(v.raw)}; +} + +} // namespace detail + +template <typename T> +HWY_API Mask512<T> MaskFromVec(const Vec512<T> v) { + return detail::MaskFromVec(hwy::SizeTag<sizeof(T)>(), v); +} +// There do not seem to be native floating-point versions of these instructions. 
+HWY_API Mask512<float> MaskFromVec(const Vec512<float> v) { + const RebindToSigned<DFromV<decltype(v)>> di; + return Mask512<float>{MaskFromVec(BitCast(di, v)).raw}; +} +HWY_API Mask512<double> MaskFromVec(const Vec512<double> v) { + const RebindToSigned<DFromV<decltype(v)>> di; + return Mask512<double>{MaskFromVec(BitCast(di, v)).raw}; +} + +HWY_API Vec512<uint8_t> VecFromMask(const Mask512<uint8_t> v) { + return Vec512<uint8_t>{_mm512_movm_epi8(v.raw)}; +} +HWY_API Vec512<int8_t> VecFromMask(const Mask512<int8_t> v) { + return Vec512<int8_t>{_mm512_movm_epi8(v.raw)}; +} + +HWY_API Vec512<uint16_t> VecFromMask(const Mask512<uint16_t> v) { + return Vec512<uint16_t>{_mm512_movm_epi16(v.raw)}; +} +HWY_API Vec512<int16_t> VecFromMask(const Mask512<int16_t> v) { + return Vec512<int16_t>{_mm512_movm_epi16(v.raw)}; +} + +HWY_API Vec512<uint32_t> VecFromMask(const Mask512<uint32_t> v) { + return Vec512<uint32_t>{_mm512_movm_epi32(v.raw)}; +} +HWY_API Vec512<int32_t> VecFromMask(const Mask512<int32_t> v) { + return Vec512<int32_t>{_mm512_movm_epi32(v.raw)}; +} +HWY_API Vec512<float> VecFromMask(const Mask512<float> v) { + return Vec512<float>{_mm512_castsi512_ps(_mm512_movm_epi32(v.raw))}; +} + +HWY_API Vec512<uint64_t> VecFromMask(const Mask512<uint64_t> v) { + return Vec512<uint64_t>{_mm512_movm_epi64(v.raw)}; +} +HWY_API Vec512<int64_t> VecFromMask(const Mask512<int64_t> v) { + return Vec512<int64_t>{_mm512_movm_epi64(v.raw)}; +} +HWY_API Vec512<double> VecFromMask(const Mask512<double> v) { + return Vec512<double>{_mm512_castsi512_pd(_mm512_movm_epi64(v.raw))}; +} + +template <class D, typename T = TFromD<D>> +HWY_API Vec512<T> VecFromMask(D /* tag */, const Mask512<T> v) { + return VecFromMask(v); +} + +// ------------------------------ Mask logical +// The detail:: helpers have one overload per lane size (SizeTag<1/2/4/8>), +// using the 64/32/16/8-bit mask intrinsic respectively. When +// HWY_COMPILER_HAS_MASK_INTRINSICS is false they fall back to integer +// operators on the raw mask, masked or cast to the mask width where needed. + +namespace detail { + +template <typename T> +HWY_INLINE Mask512<T> Not(hwy::SizeTag<1> /*tag*/, const Mask512<T> m) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask512<T>{_knot_mask64(m.raw)}; +#else + return 
Mask512<T>{~m.raw}; +#endif +} +template <typename T> +HWY_INLINE Mask512<T> Not(hwy::SizeTag<2> /*tag*/, const Mask512<T> m) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask512<T>{_knot_mask32(m.raw)}; +#else + return Mask512<T>{~m.raw}; +#endif +} +template <typename T> +HWY_INLINE Mask512<T> Not(hwy::SizeTag<4> /*tag*/, const Mask512<T> m) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask512<T>{_knot_mask16(m.raw)}; +#else + return Mask512<T>{static_cast<uint16_t>(~m.raw & 0xFFFF)}; +#endif +} +template <typename T> +HWY_INLINE Mask512<T> Not(hwy::SizeTag<8> /*tag*/, const Mask512<T> m) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask512<T>{_knot_mask8(m.raw)}; +#else + return Mask512<T>{static_cast<uint8_t>(~m.raw & 0xFF)}; +#endif +} + +template <typename T> +HWY_INLINE Mask512<T> And(hwy::SizeTag<1> /*tag*/, const Mask512<T> a, + const Mask512<T> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask512<T>{_kand_mask64(a.raw, b.raw)}; +#else + return Mask512<T>{a.raw & b.raw}; +#endif +} +template <typename T> +HWY_INLINE Mask512<T> And(hwy::SizeTag<2> /*tag*/, const Mask512<T> a, + const Mask512<T> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask512<T>{_kand_mask32(a.raw, b.raw)}; +#else + return Mask512<T>{a.raw & b.raw}; +#endif +} +template <typename T> +HWY_INLINE Mask512<T> And(hwy::SizeTag<4> /*tag*/, const Mask512<T> a, + const Mask512<T> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask512<T>{_kand_mask16(a.raw, b.raw)}; +#else + return Mask512<T>{static_cast<uint16_t>(a.raw & b.raw)}; +#endif +} +template <typename T> +HWY_INLINE Mask512<T> And(hwy::SizeTag<8> /*tag*/, const Mask512<T> a, + const Mask512<T> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask512<T>{_kand_mask8(a.raw, b.raw)}; +#else + return Mask512<T>{static_cast<uint8_t>(a.raw & b.raw)}; +#endif +} + +template <typename T> +HWY_INLINE Mask512<T> AndNot(hwy::SizeTag<1> /*tag*/, const Mask512<T> a, + const Mask512<T> b) { +#if 
HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask512<T>{_kandn_mask64(a.raw, b.raw)}; +#else + return Mask512<T>{~a.raw & b.raw}; +#endif +} +template <typename T> +HWY_INLINE Mask512<T> AndNot(hwy::SizeTag<2> /*tag*/, const Mask512<T> a, + const Mask512<T> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask512<T>{_kandn_mask32(a.raw, b.raw)}; +#else + return Mask512<T>{~a.raw & b.raw}; +#endif +} +template <typename T> +HWY_INLINE Mask512<T> AndNot(hwy::SizeTag<4> /*tag*/, const Mask512<T> a, + const Mask512<T> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask512<T>{_kandn_mask16(a.raw, b.raw)}; +#else + return Mask512<T>{static_cast<uint16_t>(~a.raw & b.raw)}; +#endif +} +template <typename T> +HWY_INLINE Mask512<T> AndNot(hwy::SizeTag<8> /*tag*/, const Mask512<T> a, + const Mask512<T> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask512<T>{_kandn_mask8(a.raw, b.raw)}; +#else + return Mask512<T>{static_cast<uint8_t>(~a.raw & b.raw)}; +#endif +} + +template <typename T> +HWY_INLINE Mask512<T> Or(hwy::SizeTag<1> /*tag*/, const Mask512<T> a, + const Mask512<T> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask512<T>{_kor_mask64(a.raw, b.raw)}; +#else + return Mask512<T>{a.raw | b.raw}; +#endif +} +template <typename T> +HWY_INLINE Mask512<T> Or(hwy::SizeTag<2> /*tag*/, const Mask512<T> a, + const Mask512<T> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask512<T>{_kor_mask32(a.raw, b.raw)}; +#else + return Mask512<T>{a.raw | b.raw}; +#endif +} +template <typename T> +HWY_INLINE Mask512<T> Or(hwy::SizeTag<4> /*tag*/, const Mask512<T> a, + const Mask512<T> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask512<T>{_kor_mask16(a.raw, b.raw)}; +#else + return Mask512<T>{static_cast<uint16_t>(a.raw | b.raw)}; +#endif +} +template <typename T> +HWY_INLINE Mask512<T> Or(hwy::SizeTag<8> /*tag*/, const Mask512<T> a, + const Mask512<T> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask512<T>{_kor_mask8(a.raw, b.raw)}; +#else + return 
Mask512<T>{static_cast<uint8_t>(a.raw | b.raw)}; +#endif +} + +template <typename T> +HWY_INLINE Mask512<T> Xor(hwy::SizeTag<1> /*tag*/, const Mask512<T> a, + const Mask512<T> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask512<T>{_kxor_mask64(a.raw, b.raw)}; +#else + return Mask512<T>{a.raw ^ b.raw}; +#endif +} +template <typename T> +HWY_INLINE Mask512<T> Xor(hwy::SizeTag<2> /*tag*/, const Mask512<T> a, + const Mask512<T> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask512<T>{_kxor_mask32(a.raw, b.raw)}; +#else + return Mask512<T>{a.raw ^ b.raw}; +#endif +} +template <typename T> +HWY_INLINE Mask512<T> Xor(hwy::SizeTag<4> /*tag*/, const Mask512<T> a, + const Mask512<T> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask512<T>{_kxor_mask16(a.raw, b.raw)}; +#else + return Mask512<T>{static_cast<uint16_t>(a.raw ^ b.raw)}; +#endif +} +template <typename T> +HWY_INLINE Mask512<T> Xor(hwy::SizeTag<8> /*tag*/, const Mask512<T> a, + const Mask512<T> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask512<T>{_kxor_mask8(a.raw, b.raw)}; +#else + return Mask512<T>{static_cast<uint8_t>(a.raw ^ b.raw)}; +#endif +} + +template <typename T> +HWY_INLINE Mask512<T> ExclusiveNeither(hwy::SizeTag<1> /*tag*/, + const Mask512<T> a, const Mask512<T> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask512<T>{_kxnor_mask64(a.raw, b.raw)}; +#else + return Mask512<T>{~(a.raw ^ b.raw)}; +#endif +} +template <typename T> +HWY_INLINE Mask512<T> ExclusiveNeither(hwy::SizeTag<2> /*tag*/, + const Mask512<T> a, const Mask512<T> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask512<T>{_kxnor_mask32(a.raw, b.raw)}; +#else + return Mask512<T>{static_cast<__mmask32>(~(a.raw ^ b.raw) & 0xFFFFFFFF)}; +#endif +} +template <typename T> +HWY_INLINE Mask512<T> ExclusiveNeither(hwy::SizeTag<4> /*tag*/, + const Mask512<T> a, const Mask512<T> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask512<T>{_kxnor_mask16(a.raw, b.raw)}; +#else + return 
Mask512<T>{static_cast<__mmask16>(~(a.raw ^ b.raw) & 0xFFFF)}; +#endif +} +template <typename T> +HWY_INLINE Mask512<T> ExclusiveNeither(hwy::SizeTag<8> /*tag*/, + const Mask512<T> a, const Mask512<T> b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask512<T>{_kxnor_mask8(a.raw, b.raw)}; +#else + return Mask512<T>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xFF)}; +#endif +} + +} // namespace detail + +// Public wrappers: each dispatches on sizeof(T) to the detail:: overload. +template <typename T> +HWY_API Mask512<T> Not(const Mask512<T> m) { + return detail::Not(hwy::SizeTag<sizeof(T)>(), m); +} + +template <typename T> +HWY_API Mask512<T> And(const Mask512<T> a, Mask512<T> b) { + return detail::And(hwy::SizeTag<sizeof(T)>(), a, b); +} + +// Returns ~a & b: the FIRST argument is the one that is complemented. +template <typename T> +HWY_API Mask512<T> AndNot(const Mask512<T> a, Mask512<T> b) { + return detail::AndNot(hwy::SizeTag<sizeof(T)>(), a, b); +} + +template <typename T> +HWY_API Mask512<T> Or(const Mask512<T> a, Mask512<T> b) { + return detail::Or(hwy::SizeTag<sizeof(T)>(), a, b); +} + +template <typename T> +HWY_API Mask512<T> Xor(const Mask512<T> a, Mask512<T> b) { + return detail::Xor(hwy::SizeTag<sizeof(T)>(), a, b); +} + +// Returns ~(a ^ b), limited to the bits of the mask type. +template <typename T> +HWY_API Mask512<T> ExclusiveNeither(const Mask512<T> a, Mask512<T> b) { + return detail::ExclusiveNeither(hwy::SizeTag<sizeof(T)>(), a, b); +} + +// ------------------------------ BroadcastSignBit (ShiftRight, compare, mask) +// int8_t lanes use a compare against zero plus VecFromMask; wider lanes use a +// right shift by (lane bits - 1). + +HWY_API Vec512<int8_t> BroadcastSignBit(const Vec512<int8_t> v) { + const DFromV<decltype(v)> d; + return VecFromMask(v < Zero(d)); +} + +HWY_API Vec512<int16_t> BroadcastSignBit(const Vec512<int16_t> v) { + return ShiftRight<15>(v); +} + +HWY_API Vec512<int32_t> BroadcastSignBit(const Vec512<int32_t> v) { + return ShiftRight<31>(v); +} + +HWY_API Vec512<int64_t> BroadcastSignBit(const Vec512<int64_t> v) { + return Vec512<int64_t>{_mm512_srai_epi64(v.raw, 63)}; +} + +// ------------------------------ Floating-point classification (Not) + +HWY_API Mask512<float> IsNaN(const Vec512<float> v) { + return 
Mask512<float>{_mm512_fpclass_ps_mask(v.raw, 0x81)}; +} +HWY_API Mask512<double> IsNaN(const Vec512<double> v) { + return Mask512<double>{_mm512_fpclass_pd_mask(v.raw, 0x81)}; +} + +HWY_API Mask512<float> IsInf(const Vec512<float> v) { + return Mask512<float>{_mm512_fpclass_ps_mask(v.raw, 0x18)}; +} +HWY_API Mask512<double> IsInf(const Vec512<double> v) { + return Mask512<double>{_mm512_fpclass_pd_mask(v.raw, 0x18)}; +} + +// Returns whether normal/subnormal/zero. fpclass doesn't have a flag for +// positive, so we have to check for inf/NaN and negate. +// 0x99 is the union of the category masks used by IsNaN (0x81) and IsInf +// (0x18) above. +HWY_API Mask512<float> IsFinite(const Vec512<float> v) { + return Not(Mask512<float>{_mm512_fpclass_ps_mask(v.raw, 0x99)}); +} +HWY_API Mask512<double> IsFinite(const Vec512<double> v) { + return Not(Mask512<double>{_mm512_fpclass_pd_mask(v.raw, 0x99)}); +} + +// ================================================== MEMORY + +// ------------------------------ Load +// Load takes a pointer named `aligned` and uses the _mm512_load_* forms; LoadU +// uses the _mm512_loadu_* forms. Presumably Load requires vector alignment - +// confirm against the intrinsic documentation. + +template <class D, HWY_IF_V_SIZE_D(D, 64), typename T = TFromD<D>> +HWY_API Vec512<T> Load(D /* tag */, const T* HWY_RESTRICT aligned) { + return Vec512<T>{_mm512_load_si512(aligned)}; +} +template <class D, HWY_IF_V_SIZE_D(D, 64)> +HWY_API Vec512<float> Load(D /* tag */, const float* HWY_RESTRICT aligned) { + return Vec512<float>{_mm512_load_ps(aligned)}; +} +template <class D, HWY_IF_V_SIZE_D(D, 64)> +HWY_API Vec512<double> Load(D /* tag */, const double* HWY_RESTRICT aligned) { + return Vec512<double>{_mm512_load_pd(aligned)}; +} + +template <class D, HWY_IF_V_SIZE_D(D, 64), typename T = TFromD<D>> +HWY_API Vec512<T> LoadU(D /* tag */, const T* HWY_RESTRICT p) { + return Vec512<T>{_mm512_loadu_si512(p)}; +} +template <class D, HWY_IF_V_SIZE_D(D, 64)> +HWY_API Vec512<float> LoadU(D /* tag */, const float* HWY_RESTRICT p) { + return Vec512<float>{_mm512_loadu_ps(p)}; +} +template <class D, HWY_IF_V_SIZE_D(D, 64)> +HWY_API Vec512<double> LoadU(D /* tag */, const double* HWY_RESTRICT p) { + return Vec512<double>{_mm512_loadu_pd(p)}; +} + +// 
------------------------------ MaskedLoad + +template <class D, HWY_IF_V_SIZE_D(D, 64), typename T = TFromD<D>, + HWY_IF_T_SIZE(T, 1)> +HWY_API Vec512<T> MaskedLoad(Mask512<T> m, D /* tag */, + const T* HWY_RESTRICT p) { + return Vec512<T>{_mm512_maskz_loadu_epi8(m.raw, p)}; +} + +template <class D, HWY_IF_V_SIZE_D(D, 64), typename T = TFromD<D>, + HWY_IF_T_SIZE(T, 2)> +HWY_API Vec512<T> MaskedLoad(Mask512<T> m, D /* tag */, + const T* HWY_RESTRICT p) { + return Vec512<T>{_mm512_maskz_loadu_epi16(m.raw, p)}; +} + +template <class D, HWY_IF_V_SIZE_D(D, 64), typename T = TFromD<D>, + HWY_IF_UI32(T)> +HWY_API Vec512<T> MaskedLoad(Mask512<T> m, D /* tag */, + const T* HWY_RESTRICT p) { + return Vec512<T>{_mm512_maskz_loadu_epi32(m.raw, p)}; +} + +template <class D, HWY_IF_V_SIZE_D(D, 64), typename T = TFromD<D>, + HWY_IF_UI64(T)> +HWY_API Vec512<T> MaskedLoad(Mask512<T> m, D /* tag */, + const T* HWY_RESTRICT p) { + return Vec512<T>{_mm512_maskz_loadu_epi64(m.raw, p)}; +} + +template <class D, HWY_IF_V_SIZE_D(D, 64)> +HWY_API Vec512<float> MaskedLoad(Mask512<float> m, D /* tag */, + const float* HWY_RESTRICT p) { + return Vec512<float>{_mm512_maskz_loadu_ps(m.raw, p)}; +} + +template <class D, HWY_IF_V_SIZE_D(D, 64)> +HWY_API Vec512<double> MaskedLoad(Mask512<double> m, D /* tag */, + const double* HWY_RESTRICT p) { + return Vec512<double>{_mm512_maskz_loadu_pd(m.raw, p)}; +} + +// ------------------------------ MaskedLoadOr +// Same shape as MaskedLoad above, but forwards v as the first argument of the +// _mm512_mask_loadu_* forms instead of using the _mm512_maskz_loadu_* forms. +// Presumably v supplies the lanes whose mask bit is clear - confirm. + +template <class D, HWY_IF_V_SIZE_D(D, 64), typename T = TFromD<D>, + HWY_IF_T_SIZE(T, 1)> +HWY_API Vec512<T> MaskedLoadOr(VFromD<D> v, Mask512<T> m, D /* tag */, + const T* HWY_RESTRICT p) { + return Vec512<T>{_mm512_mask_loadu_epi8(v.raw, m.raw, p)}; +} + +template <class D, HWY_IF_V_SIZE_D(D, 64), typename T = TFromD<D>, + HWY_IF_T_SIZE(T, 2)> +HWY_API Vec512<T> MaskedLoadOr(VFromD<D> v, Mask512<T> m, D /* tag */, + const T* HWY_RESTRICT p) { + return Vec512<T>{_mm512_mask_loadu_epi16(v.raw, m.raw, p)}; +} + +template <class D, 
HWY_IF_V_SIZE_D(D, 64), typename T = TFromD<D>, + HWY_IF_UI32(T)> +HWY_API Vec512<T> MaskedLoadOr(VFromD<D> v, Mask512<T> m, D /* tag */, + const T* HWY_RESTRICT p) { + return Vec512<T>{_mm512_mask_loadu_epi32(v.raw, m.raw, p)}; +} + +template <class D, HWY_IF_V_SIZE_D(D, 64), typename T = TFromD<D>, + HWY_IF_UI64(T)> +HWY_API Vec512<T> MaskedLoadOr(VFromD<D> v, Mask512<T> m, D /* tag */, + const T* HWY_RESTRICT p) { + return Vec512<T>{_mm512_mask_loadu_epi64(v.raw, m.raw, p)}; +} + +template <class D, HWY_IF_V_SIZE_D(D, 64)> +HWY_API Vec512<float> MaskedLoadOr(VFromD<D> v, Mask512<float> m, D /* tag */, + const float* HWY_RESTRICT p) { + return Vec512<float>{_mm512_mask_loadu_ps(v.raw, m.raw, p)}; +} + +template <class D, HWY_IF_V_SIZE_D(D, 64)> +HWY_API Vec512<double> MaskedLoadOr(VFromD<D> v, Mask512<double> m, D /* tag */, + const double* HWY_RESTRICT p) { + return Vec512<double>{_mm512_mask_loadu_pd(v.raw, m.raw, p)}; +} + +// ------------------------------ LoadDup128 + +// Loads 128 bits and broadcasts them to every 128-bit block of the 512-bit +// vector. This avoids the 3-cycle cost of moving data between 128-bit halves +// and avoids port 5. 
+template <class D, HWY_IF_V_SIZE_D(D, 64), typename T = TFromD<D>> +HWY_API Vec512<T> LoadDup128(D /* tag */, const T* const HWY_RESTRICT p) { + const Full128<T> d128; + return Vec512<T>{_mm512_broadcast_i32x4(LoadU(d128, p).raw)}; +} +template <class D, HWY_IF_V_SIZE_D(D, 64)> +HWY_API Vec512<float> LoadDup128(D /* tag */, const float* HWY_RESTRICT p) { + const __m128 x4 = _mm_loadu_ps(p); + return Vec512<float>{_mm512_broadcast_f32x4(x4)}; +} + +template <class D, HWY_IF_V_SIZE_D(D, 64)> +HWY_API Vec512<double> LoadDup128(D /* tag */, const double* HWY_RESTRICT p) { + const __m128d x2 = _mm_loadu_pd(p); + return Vec512<double>{_mm512_broadcast_f64x2(x2)}; +} + +// ------------------------------ Store +// Store takes a pointer named `aligned` and uses the _mm512_store_* forms; +// StoreU uses the _mm512_storeu_* forms. The generic overloads store through +// a reinterpret_cast to __m512i*. + +template <class D, HWY_IF_V_SIZE_D(D, 64), typename T = TFromD<D>> +HWY_API void Store(Vec512<T> v, D /* tag */, T* HWY_RESTRICT aligned) { + _mm512_store_si512(reinterpret_cast<__m512i*>(aligned), v.raw); +} +template <class D, HWY_IF_V_SIZE_D(D, 64)> +HWY_API void Store(Vec512<float> v, D /* tag */, float* HWY_RESTRICT aligned) { + _mm512_store_ps(aligned, v.raw); +} +template <class D, HWY_IF_V_SIZE_D(D, 64)> +HWY_API void Store(Vec512<double> v, D /* tag */, + double* HWY_RESTRICT aligned) { + _mm512_store_pd(aligned, v.raw); +} + +template <class D, HWY_IF_V_SIZE_D(D, 64), typename T = TFromD<D>> +HWY_API void StoreU(Vec512<T> v, D /* tag */, T* HWY_RESTRICT p) { + _mm512_storeu_si512(reinterpret_cast<__m512i*>(p), v.raw); +} +template <class D, HWY_IF_V_SIZE_D(D, 64)> +HWY_API void StoreU(Vec512<float> v, D /* tag */, float* HWY_RESTRICT p) { + _mm512_storeu_ps(p, v.raw); +} +template <class D, HWY_IF_V_SIZE_D(D, 64)> +HWY_API void StoreU(Vec512<double> v, D /* tag */, double* HWY_RESTRICT p) { + _mm512_storeu_pd(p, v.raw); +} + +// ------------------------------ BlendedStore +// Forwards p, m and v to the _mm512_mask_storeu_* intrinsic matching the lane +// type. Presumably only lanes whose mask bit is set are written - confirm. + +template <class D, HWY_IF_V_SIZE_D(D, 64), typename T = TFromD<D>, + HWY_IF_T_SIZE(T, 1)> +HWY_API void BlendedStore(Vec512<T> v, Mask512<T> m, D /* tag */, + T* HWY_RESTRICT p) { + 
_mm512_mask_storeu_epi8(p, m.raw, v.raw); +} + +template <class D, HWY_IF_V_SIZE_D(D, 64), typename T = TFromD<D>, + HWY_IF_T_SIZE(T, 2)> +HWY_API void BlendedStore(Vec512<T> v, Mask512<T> m, D /* tag */, + T* HWY_RESTRICT p) { + _mm512_mask_storeu_epi16(p, m.raw, v.raw); +} + +template <class D, HWY_IF_V_SIZE_D(D, 64), typename T = TFromD<D>, + HWY_IF_UI32(T)> +HWY_API void BlendedStore(Vec512<T> v, Mask512<T> m, D /* tag */, + T* HWY_RESTRICT p) { + _mm512_mask_storeu_epi32(p, m.raw, v.raw); +} + +template <class D, HWY_IF_V_SIZE_D(D, 64), typename T = TFromD<D>, + HWY_IF_UI64(T)> +HWY_API void BlendedStore(Vec512<T> v, Mask512<T> m, D /* tag */, + T* HWY_RESTRICT p) { + _mm512_mask_storeu_epi64(p, m.raw, v.raw); +} + +template <class D, HWY_IF_V_SIZE_D(D, 64)> +HWY_API void BlendedStore(Vec512<float> v, Mask512<float> m, D /* tag */, + float* HWY_RESTRICT p) { + _mm512_mask_storeu_ps(p, m.raw, v.raw); +} + +template <class D, HWY_IF_V_SIZE_D(D, 64)> +HWY_API void BlendedStore(Vec512<double> v, Mask512<double> m, D /* tag */, + double* HWY_RESTRICT p) { + _mm512_mask_storeu_pd(p, m.raw, v.raw); +} + +// ------------------------------ Non-temporal stores +// The generic overload is restricted to non-float T via HWY_IF_NOT_FLOAT; +// float and double have dedicated overloads using _mm512_stream_ps/pd. + +template <class D, HWY_IF_V_SIZE_D(D, 64), typename T = TFromD<D>, + HWY_IF_NOT_FLOAT(T)> +HWY_API void Stream(Vec512<T> v, D /* tag */, T* HWY_RESTRICT aligned) { + _mm512_stream_si512(reinterpret_cast<__m512i*>(aligned), v.raw); +} +template <class D, HWY_IF_V_SIZE_D(D, 64)> +HWY_API void Stream(Vec512<float> v, D /* tag */, float* HWY_RESTRICT aligned) { + _mm512_stream_ps(aligned, v.raw); +} +template <class D, HWY_IF_V_SIZE_D(D, 64)> +HWY_API void Stream(Vec512<double> v, D /* tag */, + double* HWY_RESTRICT aligned) { + _mm512_stream_pd(aligned, v.raw); +} + +// ------------------------------ Scatter + +// Work around warnings in the intrinsic definitions (passing -1 as a mask). 
+HWY_DIAGNOSTICS(push) +HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") + +namespace detail { + +template <typename T> +HWY_INLINE void ScatterOffset(hwy::SizeTag<4> /* tag */, Vec512<T> v, + T* HWY_RESTRICT base, Vec512<int32_t> offset) { + _mm512_i32scatter_epi32(base, offset.raw, v.raw, 1); +} +template <typename T> +HWY_INLINE void ScatterIndex(hwy::SizeTag<4> /* tag */, Vec512<T> v, + T* HWY_RESTRICT base, Vec512<int32_t> index) { + _mm512_i32scatter_epi32(base, index.raw, v.raw, 4); +} + +template <typename T> +HWY_INLINE void ScatterOffset(hwy::SizeTag<8> /* tag */, Vec512<T> v, + T* HWY_RESTRICT base, Vec512<int64_t> offset) { + _mm512_i64scatter_epi64(base, offset.raw, v.raw, 1); +} +template <typename T> +HWY_INLINE void ScatterIndex(hwy::SizeTag<8> /* tag */, Vec512<T> v, + T* HWY_RESTRICT base, Vec512<int64_t> index) { + _mm512_i64scatter_epi64(base, index.raw, v.raw, 8); +} + +} // namespace detail + +template <class D, HWY_IF_V_SIZE_D(D, 64), typename TI, typename T = TFromD<D>> +HWY_API void ScatterOffset(Vec512<T> v, D /* tag */, T* HWY_RESTRICT base, + Vec512<TI> offset) { + static_assert(sizeof(T) == sizeof(TI), "Must match for portability"); + return detail::ScatterOffset(hwy::SizeTag<sizeof(T)>(), v, base, offset); +} +template <class D, HWY_IF_V_SIZE_D(D, 64), typename TI, typename T = TFromD<D>> +HWY_API void ScatterIndex(Vec512<T> v, D /* tag */, T* HWY_RESTRICT base, + Vec512<TI> index) { + static_assert(sizeof(T) == sizeof(TI), "Must match for portability"); + return detail::ScatterIndex(hwy::SizeTag<sizeof(T)>(), v, base, index); +} + +template <class D, HWY_IF_V_SIZE_D(D, 64)> +HWY_API void ScatterOffset(Vec512<float> v, D /* tag */, + float* HWY_RESTRICT base, Vec512<int32_t> offset) { + _mm512_i32scatter_ps(base, offset.raw, v.raw, 1); +} +template <class D, HWY_IF_V_SIZE_D(D, 64)> +HWY_API void ScatterIndex(Vec512<float> v, D /* tag */, + float* HWY_RESTRICT base, Vec512<int32_t> index) { + 
_mm512_i32scatter_ps(base, index.raw, v.raw, 4); +} +template <class D, HWY_IF_V_SIZE_D(D, 64)> +HWY_API void ScatterOffset(Vec512<double> v, D /* tag */, + double* HWY_RESTRICT base, Vec512<int64_t> offset) { + _mm512_i64scatter_pd(base, offset.raw, v.raw, 1); +} +template <class D, HWY_IF_V_SIZE_D(D, 64)> +HWY_API void ScatterIndex(Vec512<double> v, D /* tag */, + double* HWY_RESTRICT base, Vec512<int64_t> index) { + _mm512_i64scatter_pd(base, index.raw, v.raw, 8); +} + +// ------------------------------ Gather + +namespace detail { + +template <int kScale, typename T, HWY_IF_UI32(T), HWY_IF_NOT_FLOAT(T)> +HWY_INLINE Vec512<T> NativeGather(const T* HWY_RESTRICT base, + Vec512<int32_t> index) { + return Vec512<T>{_mm512_i32gather_epi32(index.raw, base, kScale)}; +} + +template <int kScale, typename T, HWY_IF_UI64(T), HWY_IF_NOT_FLOAT(T)> +HWY_INLINE Vec512<T> NativeGather(const T* HWY_RESTRICT base, + Vec512<int64_t> index) { + return Vec512<T>{_mm512_i64gather_epi64(index.raw, base, kScale)}; +} + +template <int kScale> +HWY_INLINE Vec512<float> NativeGather(const float* HWY_RESTRICT base, + Vec512<int32_t> index) { + return Vec512<float>{_mm512_i32gather_ps(index.raw, base, kScale)}; +} + +template <int kScale> +HWY_INLINE Vec512<double> NativeGather(const double* HWY_RESTRICT base, + Vec512<int64_t> index) { + return Vec512<double>{_mm512_i64gather_pd(index.raw, base, kScale)}; +} + +} // namespace detail + +template <class D, HWY_IF_V_SIZE_D(D, 64), typename T = TFromD<D>, typename TI> +HWY_API Vec512<T> GatherOffset(D /* tag */, const T* HWY_RESTRICT base, + Vec512<TI> offset) { + static_assert(sizeof(T) == sizeof(TI), "Must match for portability"); + return detail::NativeGather<1>(base, offset); +} +template <class D, HWY_IF_V_SIZE_D(D, 64), typename T = TFromD<D>, typename TI> +HWY_API Vec512<T> GatherIndex(D /* tag */, const T* HWY_RESTRICT base, + Vec512<TI> index) { + static_assert(sizeof(T) == sizeof(TI), "Must match for portability"); + return 
detail::NativeGather<sizeof(T)>(base, index); +} + +HWY_DIAGNOSTICS(pop) + +// ================================================== SWIZZLE + +// ------------------------------ LowerHalf + +template <class D, typename T = TFromD<D>, HWY_IF_NOT_FLOAT(T)> +HWY_API Vec256<T> LowerHalf(D /* tag */, Vec512<T> v) { + return Vec256<T>{_mm512_castsi512_si256(v.raw)}; +} +template <class D> +HWY_API Vec256<float> LowerHalf(D /* tag */, Vec512<float> v) { + return Vec256<float>{_mm512_castps512_ps256(v.raw)}; +} +template <class D> +HWY_API Vec256<double> LowerHalf(D /* tag */, Vec512<double> v) { + return Vec256<double>{_mm512_castpd512_pd256(v.raw)}; +} + +template <typename T> +HWY_API Vec256<T> LowerHalf(Vec512<T> v) { + const Half<DFromV<decltype(v)>> dh; + return LowerHalf(dh, v); +} + +// ------------------------------ UpperHalf + +template <class D, typename T = TFromD<D>> +HWY_API Vec256<T> UpperHalf(D /* tag */, Vec512<T> v) { + return Vec256<T>{_mm512_extracti32x8_epi32(v.raw, 1)}; +} +template <class D> +HWY_API Vec256<float> UpperHalf(D /* tag */, Vec512<float> v) { + return Vec256<float>{_mm512_extractf32x8_ps(v.raw, 1)}; +} +template <class D> +HWY_API Vec256<double> UpperHalf(D /* tag */, Vec512<double> v) { + return Vec256<double>{_mm512_extractf64x4_pd(v.raw, 1)}; +} + +// ------------------------------ ExtractLane (Store) +template <typename T> +HWY_API T ExtractLane(const Vec512<T> v, size_t i) { + const DFromV<decltype(v)> d; + HWY_DASSERT(i < Lanes(d)); + alignas(64) T lanes[64 / sizeof(T)]; + Store(v, d, lanes); + return lanes[i]; +} + +// ------------------------------ InsertLane (Store) +template <typename T> +HWY_API Vec512<T> InsertLane(const Vec512<T> v, size_t i, T t) { + return detail::InsertLaneUsingBroadcastAndBlend(v, i, t); +} + +// ------------------------------ GetLane (LowerHalf) +template <typename T> +HWY_API T GetLane(const Vec512<T> v) { + return GetLane(LowerHalf(v)); +} + +// ------------------------------ ZeroExtendVector + +template 
<class D, HWY_IF_V_SIZE_D(D, 64), typename T = TFromD<D>> +HWY_API Vec512<T> ZeroExtendVector(D d, Vec256<T> lo) { +#if HWY_HAVE_ZEXT // See definition/comment in x86_256-inl.h. + (void)d; + return Vec512<T>{_mm512_zextsi256_si512(lo.raw)}; +#else + return Vec512<T>{_mm512_inserti32x8(Zero(d).raw, lo.raw, 0)}; +#endif +} +template <class D, HWY_IF_V_SIZE_D(D, 64)> +HWY_API Vec512<float> ZeroExtendVector(D d, Vec256<float> lo) { +#if HWY_HAVE_ZEXT + (void)d; + return Vec512<float>{_mm512_zextps256_ps512(lo.raw)}; +#else + return Vec512<float>{_mm512_insertf32x8(Zero(d).raw, lo.raw, 0)}; +#endif +} +template <class D, HWY_IF_V_SIZE_D(D, 64)> +HWY_API Vec512<double> ZeroExtendVector(D d, Vec256<double> lo) { +#if HWY_HAVE_ZEXT + (void)d; + return Vec512<double>{_mm512_zextpd256_pd512(lo.raw)}; +#else + return Vec512<double>{_mm512_insertf64x4(Zero(d).raw, lo.raw, 0)}; +#endif +} + +// ------------------------------ ZeroExtendResizeBitCast + +namespace detail { + +template <class DTo, class DFrom, HWY_IF_NOT_FLOAT_D(DTo)> +HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast( + hwy::SizeTag<16> /* from_size_tag */, hwy::SizeTag<64> /* to_size_tag */, + DTo d_to, DFrom d_from, VFromD<DFrom> v) { + const Repartition<uint8_t, decltype(d_from)> du8_from; + const auto vu8 = BitCast(du8_from, v); +#if HWY_HAVE_ZEXT + (void)d_to; + return VFromD<DTo>{_mm512_zextsi128_si512(vu8.raw)}; +#else + return VFromD<DTo>{_mm512_inserti32x4(Zero(d_to).raw, vu8.raw, 0)}; +#endif +} + +template <class DTo, class DFrom, HWY_IF_F32_D(DTo)> +HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast( + hwy::SizeTag<16> /* from_size_tag */, hwy::SizeTag<64> /* to_size_tag */, + DTo d_to, DFrom d_from, VFromD<DFrom> v) { + const Repartition<float, decltype(d_from)> df32_from; + const auto vf32 = BitCast(df32_from, v); +#if HWY_HAVE_ZEXT + (void)d_to; + return Vec512<float>{_mm512_zextps128_ps512(vf32.raw)}; +#else + return Vec512<float>{_mm512_insertf32x4(Zero(d_to).raw, vf32.raw, 0)}; +#endif +} + +template 
<class DTo, class DFrom, HWY_IF_F64_D(DTo)> +HWY_INLINE Vec512<double> ZeroExtendResizeBitCast( + hwy::SizeTag<16> /* from_size_tag */, hwy::SizeTag<64> /* to_size_tag */, + DTo d_to, DFrom d_from, VFromD<DFrom> v) { + const Repartition<double, decltype(d_from)> df64_from; + const auto vf64 = BitCast(df64_from, v); +#if HWY_HAVE_ZEXT + (void)d_to; + return Vec512<double>{_mm512_zextpd128_pd512(vf64.raw)}; +#else + return Vec512<double>{_mm512_insertf64x2(Zero(d_to).raw, vf64.raw, 0)}; +#endif +} + +template <class DTo, class DFrom> +HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast( + hwy::SizeTag<8> /* from_size_tag */, hwy::SizeTag<64> /* to_size_tag */, + DTo d_to, DFrom d_from, VFromD<DFrom> v) { + const Twice<decltype(d_from)> dt_from; + return ZeroExtendResizeBitCast(hwy::SizeTag<16>(), hwy::SizeTag<64>(), d_to, + dt_from, ZeroExtendVector(dt_from, v)); +} + +} // namespace detail + +// ------------------------------ Combine + +template <class D, typename T = TFromD<D>> +HWY_API Vec512<T> Combine(D d, Vec256<T> hi, Vec256<T> lo) { + const auto lo512 = ZeroExtendVector(d, lo); + return Vec512<T>{_mm512_inserti32x8(lo512.raw, hi.raw, 1)}; +} +template <class D> +HWY_API Vec512<float> Combine(D d, Vec256<float> hi, Vec256<float> lo) { + const auto lo512 = ZeroExtendVector(d, lo); + return Vec512<float>{_mm512_insertf32x8(lo512.raw, hi.raw, 1)}; +} +template <class D> +HWY_API Vec512<double> Combine(D d, Vec256<double> hi, Vec256<double> lo) { + const auto lo512 = ZeroExtendVector(d, lo); + return Vec512<double>{_mm512_insertf64x4(lo512.raw, hi.raw, 1)}; +} + +// ------------------------------ ShiftLeftBytes + +template <int kBytes, class D, typename T = TFromD<D>> +HWY_API Vec512<T> ShiftLeftBytes(D /* tag */, const Vec512<T> v) { + static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); + return Vec512<T>{_mm512_bslli_epi128(v.raw, kBytes)}; +} + +template <int kBytes, typename T> +HWY_API Vec512<T> ShiftLeftBytes(const Vec512<T> v) { + const 
DFromV<decltype(v)> d; + return ShiftLeftBytes<kBytes>(d, v); +} + +// ------------------------------ ShiftLeftLanes + +template <int kLanes, class D, typename T = TFromD<D>> +HWY_API Vec512<T> ShiftLeftLanes(D d, const Vec512<T> v) { + const Repartition<uint8_t, decltype(d)> d8; + return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v))); +} + +template <int kLanes, typename T> +HWY_API Vec512<T> ShiftLeftLanes(const Vec512<T> v) { + const DFromV<decltype(v)> d; + return ShiftLeftLanes<kLanes>(d, v); +} + +// ------------------------------ ShiftRightBytes +template <int kBytes, class D, typename T = TFromD<D>> +HWY_API Vec512<T> ShiftRightBytes(D /* tag */, const Vec512<T> v) { + static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); + return Vec512<T>{_mm512_bsrli_epi128(v.raw, kBytes)}; +} + +// ------------------------------ ShiftRightLanes +template <int kLanes, class D, typename T = TFromD<D>> +HWY_API Vec512<T> ShiftRightLanes(D d, const Vec512<T> v) { + const Repartition<uint8_t, decltype(d)> d8; + return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(d8, BitCast(d8, v))); +} + +// ------------------------------ CombineShiftRightBytes + +template <int kBytes, class D, typename T = TFromD<D>> +HWY_API Vec512<T> CombineShiftRightBytes(D d, Vec512<T> hi, Vec512<T> lo) { + const Repartition<uint8_t, decltype(d)> d8; + return BitCast(d, Vec512<uint8_t>{_mm512_alignr_epi8( + BitCast(d8, hi).raw, BitCast(d8, lo).raw, kBytes)}); +} + +// ------------------------------ Broadcast/splat any lane + +// Unsigned +template <int kLane> +HWY_API Vec512<uint16_t> Broadcast(const Vec512<uint16_t> v) { + static_assert(0 <= kLane && kLane < 8, "Invalid lane"); + if (kLane < 4) { + const __m512i lo = _mm512_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF); + return Vec512<uint16_t>{_mm512_unpacklo_epi64(lo, lo)}; + } else { + const __m512i hi = + _mm512_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF); + return Vec512<uint16_t>{_mm512_unpackhi_epi64(hi, hi)}; 
+ } +} +template <int kLane> +HWY_API Vec512<uint32_t> Broadcast(const Vec512<uint32_t> v) { + static_assert(0 <= kLane && kLane < 4, "Invalid lane"); + constexpr _MM_PERM_ENUM perm = static_cast<_MM_PERM_ENUM>(0x55 * kLane); + return Vec512<uint32_t>{_mm512_shuffle_epi32(v.raw, perm)}; +} +template <int kLane> +HWY_API Vec512<uint64_t> Broadcast(const Vec512<uint64_t> v) { + static_assert(0 <= kLane && kLane < 2, "Invalid lane"); + constexpr _MM_PERM_ENUM perm = kLane ? _MM_PERM_DCDC : _MM_PERM_BABA; + return Vec512<uint64_t>{_mm512_shuffle_epi32(v.raw, perm)}; +} + +// Signed +template <int kLane> +HWY_API Vec512<int16_t> Broadcast(const Vec512<int16_t> v) { + static_assert(0 <= kLane && kLane < 8, "Invalid lane"); + if (kLane < 4) { + const __m512i lo = _mm512_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF); + return Vec512<int16_t>{_mm512_unpacklo_epi64(lo, lo)}; + } else { + const __m512i hi = + _mm512_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF); + return Vec512<int16_t>{_mm512_unpackhi_epi64(hi, hi)}; + } +} +template <int kLane> +HWY_API Vec512<int32_t> Broadcast(const Vec512<int32_t> v) { + static_assert(0 <= kLane && kLane < 4, "Invalid lane"); + constexpr _MM_PERM_ENUM perm = static_cast<_MM_PERM_ENUM>(0x55 * kLane); + return Vec512<int32_t>{_mm512_shuffle_epi32(v.raw, perm)}; +} +template <int kLane> +HWY_API Vec512<int64_t> Broadcast(const Vec512<int64_t> v) { + static_assert(0 <= kLane && kLane < 2, "Invalid lane"); + constexpr _MM_PERM_ENUM perm = kLane ? 
_MM_PERM_DCDC : _MM_PERM_BABA; + return Vec512<int64_t>{_mm512_shuffle_epi32(v.raw, perm)}; +} + +// Float +template <int kLane> +HWY_API Vec512<float> Broadcast(const Vec512<float> v) { + static_assert(0 <= kLane && kLane < 4, "Invalid lane"); + constexpr _MM_PERM_ENUM perm = static_cast<_MM_PERM_ENUM>(0x55 * kLane); + return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, perm)}; +} +template <int kLane> +HWY_API Vec512<double> Broadcast(const Vec512<double> v) { + static_assert(0 <= kLane && kLane < 2, "Invalid lane"); + constexpr _MM_PERM_ENUM perm = static_cast<_MM_PERM_ENUM>(0xFF * kLane); + return Vec512<double>{_mm512_shuffle_pd(v.raw, v.raw, perm)}; +} + +// ------------------------------ Hard-coded shuffles + +// Notation: let Vec512<int32_t> have lanes 7,6,5,4,3,2,1,0 (0 is +// least-significant). Shuffle0321 rotates four-lane blocks one lane to the +// right (the previous least-significant lane is now most-significant => +// 47650321). These could also be implemented via CombineShiftRightBytes but +// the shuffle_abcd notation is more convenient. + +// Swap 32-bit halves in 64-bit halves. 
+template <typename T, HWY_IF_UI32(T)> +HWY_API Vec512<T> Shuffle2301(const Vec512<T> v) { + return Vec512<T>{_mm512_shuffle_epi32(v.raw, _MM_PERM_CDAB)}; +} +HWY_API Vec512<float> Shuffle2301(const Vec512<float> v) { + return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_CDAB)}; +} + +namespace detail { + +template <typename T, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec512<T> ShuffleTwo2301(const Vec512<T> a, const Vec512<T> b) { + const DFromV<decltype(a)> d; + const RebindToFloat<decltype(d)> df; + return BitCast( + d, Vec512<float>{_mm512_shuffle_ps(BitCast(df, a).raw, BitCast(df, b).raw, + _MM_PERM_CDAB)}); +} +template <typename T, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec512<T> ShuffleTwo1230(const Vec512<T> a, const Vec512<T> b) { + const DFromV<decltype(a)> d; + const RebindToFloat<decltype(d)> df; + return BitCast( + d, Vec512<float>{_mm512_shuffle_ps(BitCast(df, a).raw, BitCast(df, b).raw, + _MM_PERM_BCDA)}); +} +template <typename T, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec512<T> ShuffleTwo3012(const Vec512<T> a, const Vec512<T> b) { + const DFromV<decltype(a)> d; + const RebindToFloat<decltype(d)> df; + return BitCast( + d, Vec512<float>{_mm512_shuffle_ps(BitCast(df, a).raw, BitCast(df, b).raw, + _MM_PERM_DABC)}); +} + +} // namespace detail + +// Swap 64-bit halves +HWY_API Vec512<uint32_t> Shuffle1032(const Vec512<uint32_t> v) { + return Vec512<uint32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_BADC)}; +} +HWY_API Vec512<int32_t> Shuffle1032(const Vec512<int32_t> v) { + return Vec512<int32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_BADC)}; +} +HWY_API Vec512<float> Shuffle1032(const Vec512<float> v) { + // Shorter encoding than _mm512_permute_ps. 
+ return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_BADC)}; +} +HWY_API Vec512<uint64_t> Shuffle01(const Vec512<uint64_t> v) { + return Vec512<uint64_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_BADC)}; +} +HWY_API Vec512<int64_t> Shuffle01(const Vec512<int64_t> v) { + return Vec512<int64_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_BADC)}; +} +HWY_API Vec512<double> Shuffle01(const Vec512<double> v) { + // Shorter encoding than _mm512_permute_pd. + return Vec512<double>{_mm512_shuffle_pd(v.raw, v.raw, _MM_PERM_BBBB)}; +} + +// Rotate right 32 bits +HWY_API Vec512<uint32_t> Shuffle0321(const Vec512<uint32_t> v) { + return Vec512<uint32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_ADCB)}; +} +HWY_API Vec512<int32_t> Shuffle0321(const Vec512<int32_t> v) { + return Vec512<int32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_ADCB)}; +} +HWY_API Vec512<float> Shuffle0321(const Vec512<float> v) { + return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_ADCB)}; +} +// Rotate left 32 bits +HWY_API Vec512<uint32_t> Shuffle2103(const Vec512<uint32_t> v) { + return Vec512<uint32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_CBAD)}; +} +HWY_API Vec512<int32_t> Shuffle2103(const Vec512<int32_t> v) { + return Vec512<int32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_CBAD)}; +} +HWY_API Vec512<float> Shuffle2103(const Vec512<float> v) { + return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_CBAD)}; +} + +// Reverse +HWY_API Vec512<uint32_t> Shuffle0123(const Vec512<uint32_t> v) { + return Vec512<uint32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_ABCD)}; +} +HWY_API Vec512<int32_t> Shuffle0123(const Vec512<int32_t> v) { + return Vec512<int32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_ABCD)}; +} +HWY_API Vec512<float> Shuffle0123(const Vec512<float> v) { + return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_ABCD)}; +} + +// ------------------------------ TableLookupLanes + +// Returned by SetTableIndices/IndicesFromVec for use by TableLookupLanes. 
+template <typename T> +struct Indices512 { + __m512i raw; +}; + +template <class D, typename T = TFromD<D>, typename TI> +HWY_API Indices512<T> IndicesFromVec(D /* tag */, Vec512<TI> vec) { + static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); +#if HWY_IS_DEBUG_BUILD + const DFromV<decltype(vec)> di; + const RebindToUnsigned<decltype(di)> du; + using TU = MakeUnsigned<T>; + const auto vec_u = BitCast(du, vec); + HWY_DASSERT( + AllTrue(du, Lt(vec_u, Set(du, static_cast<TU>(128 / sizeof(T)))))); +#endif + return Indices512<T>{vec.raw}; +} + +template <class D, HWY_IF_V_SIZE_D(D, 64), typename T = TFromD<D>, typename TI> +HWY_API Indices512<T> SetTableIndices(D d, const TI* idx) { + const Rebind<TI, decltype(d)> di; + return IndicesFromVec(d, LoadU(di, idx)); +} + +template <typename T, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec512<T> TableLookupLanes(Vec512<T> v, Indices512<T> idx) { +#if HWY_TARGET <= HWY_AVX3_DL + return Vec512<T>{_mm512_permutexvar_epi8(idx.raw, v.raw)}; +#else + const DFromV<decltype(v)> d; + const Repartition<uint16_t, decltype(d)> du16; + const Vec512<T> idx_vec{idx.raw}; + + const auto bd_sel_mask = + MaskFromVec(BitCast(d, ShiftLeft<3>(BitCast(du16, idx_vec)))); + const auto cd_sel_mask = + MaskFromVec(BitCast(d, ShiftLeft<2>(BitCast(du16, idx_vec)))); + + const Vec512<T> v_a{_mm512_shuffle_i32x4(v.raw, v.raw, 0x00)}; + const Vec512<T> v_b{_mm512_shuffle_i32x4(v.raw, v.raw, 0x55)}; + const Vec512<T> v_c{_mm512_shuffle_i32x4(v.raw, v.raw, 0xAA)}; + const Vec512<T> v_d{_mm512_shuffle_i32x4(v.raw, v.raw, 0xFF)}; + + const auto shuf_a = TableLookupBytes(v_a, idx_vec); + const auto shuf_c = TableLookupBytes(v_c, idx_vec); + const Vec512<T> shuf_ab{_mm512_mask_shuffle_epi8(shuf_a.raw, bd_sel_mask.raw, + v_b.raw, idx_vec.raw)}; + const Vec512<T> shuf_cd{_mm512_mask_shuffle_epi8(shuf_c.raw, bd_sel_mask.raw, + v_d.raw, idx_vec.raw)}; + return IfThenElse(cd_sel_mask, shuf_cd, shuf_ab); +#endif +} + +template <typename T, HWY_IF_T_SIZE(T, 2)> 
+HWY_API Vec512<T> TableLookupLanes(Vec512<T> v, Indices512<T> idx) { + return Vec512<T>{_mm512_permutexvar_epi16(idx.raw, v.raw)}; +} + +template <typename T, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec512<T> TableLookupLanes(Vec512<T> v, Indices512<T> idx) { + return Vec512<T>{_mm512_permutexvar_epi32(idx.raw, v.raw)}; +} + +template <typename T, HWY_IF_T_SIZE(T, 8)> +HWY_API Vec512<T> TableLookupLanes(Vec512<T> v, Indices512<T> idx) { + return Vec512<T>{_mm512_permutexvar_epi64(idx.raw, v.raw)}; +} + +HWY_API Vec512<float> TableLookupLanes(Vec512<float> v, Indices512<float> idx) { + return Vec512<float>{_mm512_permutexvar_ps(idx.raw, v.raw)}; +} + +HWY_API Vec512<double> TableLookupLanes(Vec512<double> v, + Indices512<double> idx) { + return Vec512<double>{_mm512_permutexvar_pd(idx.raw, v.raw)}; +} + +template <typename T, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec512<T> TwoTablesLookupLanes(Vec512<T> a, Vec512<T> b, + Indices512<T> idx) { +#if HWY_TARGET <= HWY_AVX3_DL + return Vec512<T>{_mm512_permutex2var_epi8(a.raw, idx.raw, b.raw)}; +#else + const DFromV<decltype(a)> d; + const auto b_sel_mask = + MaskFromVec(BitCast(d, ShiftLeft<1>(Vec512<uint16_t>{idx.raw}))); + return IfThenElse(b_sel_mask, TableLookupLanes(b, idx), + TableLookupLanes(a, idx)); +#endif +} + +template <typename T, HWY_IF_T_SIZE(T, 2)> +HWY_API Vec512<T> TwoTablesLookupLanes(Vec512<T> a, Vec512<T> b, + Indices512<T> idx) { + return Vec512<T>{_mm512_permutex2var_epi16(a.raw, idx.raw, b.raw)}; +} + +template <typename T, HWY_IF_UI32(T)> +HWY_API Vec512<T> TwoTablesLookupLanes(Vec512<T> a, Vec512<T> b, + Indices512<T> idx) { + return Vec512<T>{_mm512_permutex2var_epi32(a.raw, idx.raw, b.raw)}; +} + +HWY_API Vec512<float> TwoTablesLookupLanes(Vec512<float> a, Vec512<float> b, + Indices512<float> idx) { + return Vec512<float>{_mm512_permutex2var_ps(a.raw, idx.raw, b.raw)}; +} + +template <typename T, HWY_IF_UI64(T)> +HWY_API Vec512<T> TwoTablesLookupLanes(Vec512<T> a, Vec512<T> b, + Indices512<T> idx) { + return 
Vec512<T>{_mm512_permutex2var_epi64(a.raw, idx.raw, b.raw)}; +} + +HWY_API Vec512<double> TwoTablesLookupLanes(Vec512<double> a, Vec512<double> b, + Indices512<double> idx) { + return Vec512<double>{_mm512_permutex2var_pd(a.raw, idx.raw, b.raw)}; +} + +// ------------------------------ Reverse + +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec512<T> Reverse(D d, const Vec512<T> v) { +#if HWY_TARGET <= HWY_AVX3_DL + const RebindToSigned<decltype(d)> di; + alignas(64) static constexpr int8_t kReverse[64] = { + 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, + 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, + 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; + const Vec512<int8_t> idx = Load(di, kReverse); + return BitCast( + d, Vec512<int8_t>{_mm512_permutexvar_epi8(idx.raw, BitCast(di, v).raw)}); +#else + const RepartitionToWide<decltype(d)> d16; + return BitCast(d, Reverse(d16, RotateRight<8>(BitCast(d16, v)))); +#endif +} + +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)> +HWY_API Vec512<T> Reverse(D d, const Vec512<T> v) { + const RebindToSigned<decltype(d)> di; + alignas(64) static constexpr int16_t kReverse[32] = { + 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; + const Vec512<int16_t> idx = Load(di, kReverse); + return BitCast(d, Vec512<int16_t>{ + _mm512_permutexvar_epi16(idx.raw, BitCast(di, v).raw)}); +} + +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec512<T> Reverse(D d, const Vec512<T> v) { + alignas(64) static constexpr int32_t kReverse[16] = { + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; + return TableLookupLanes(v, SetTableIndices(d, kReverse)); +} + +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)> +HWY_API Vec512<T> Reverse(D d, const Vec512<T> v) { + alignas(64) static 
constexpr int64_t kReverse[8] = {7, 6, 5, 4, 3, 2, 1, 0}; + return TableLookupLanes(v, SetTableIndices(d, kReverse)); +} + +// ------------------------------ Reverse2 (in x86_128) + +// ------------------------------ Reverse4 + +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)> +HWY_API Vec512<T> Reverse4(D d, const Vec512<T> v) { + const RebindToSigned<decltype(d)> di; + alignas(64) static constexpr int16_t kReverse4[32] = { + 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, + 19, 18, 17, 16, 23, 22, 21, 20, 27, 26, 25, 24, 31, 30, 29, 28}; + const Vec512<int16_t> idx = Load(di, kReverse4); + return BitCast(d, Vec512<int16_t>{ + _mm512_permutexvar_epi16(idx.raw, BitCast(di, v).raw)}); +} + +// 32 bit Reverse4 defined in x86_128. + +template <class D, typename T = TFromD<D>, HWY_IF_UI64(T)> +HWY_API Vec512<T> Reverse4(D /* tag */, const Vec512<T> v) { + return Vec512<T>{_mm512_permutex_epi64(v.raw, _MM_SHUFFLE(0, 1, 2, 3))}; +} +template <class D> +HWY_API Vec512<double> Reverse4(D /* tag */, Vec512<double> v) { + return Vec512<double>{_mm512_permutex_pd(v.raw, _MM_SHUFFLE(0, 1, 2, 3))}; +} + +// ------------------------------ Reverse8 + +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)> +HWY_API Vec512<T> Reverse8(D d, const Vec512<T> v) { + const RebindToSigned<decltype(d)> di; + alignas(64) static constexpr int16_t kReverse8[32] = { + 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, + 23, 22, 21, 20, 19, 18, 17, 16, 31, 30, 29, 28, 27, 26, 25, 24}; + const Vec512<int16_t> idx = Load(di, kReverse8); + return BitCast(d, Vec512<int16_t>{ + _mm512_permutexvar_epi16(idx.raw, BitCast(di, v).raw)}); +} + +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec512<T> Reverse8(D d, const Vec512<T> v) { + const RebindToSigned<decltype(d)> di; + alignas(64) static constexpr int32_t kReverse8[16] = { + 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8}; + const Vec512<int32_t> idx = Load(di, kReverse8); + return 
BitCast(d, Vec512<int32_t>{ + _mm512_permutexvar_epi32(idx.raw, BitCast(di, v).raw)}); +} + +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)> +HWY_API Vec512<T> Reverse8(D d, const Vec512<T> v) { + return Reverse(d, v); +} + +// ------------------------------ ReverseBits + +#if HWY_TARGET <= HWY_AVX3_DL +template <class V, HWY_IF_T_SIZE_V(V, 1), HWY_IF_V_SIZE_D(DFromV<V>, 64)> +HWY_API V ReverseBits(V v) { + const Full512<uint64_t> du64; + const auto affine_matrix = Set(du64, 0x8040201008040201u); + return V{_mm512_gf2p8affine_epi64_epi8(v.raw, affine_matrix.raw, 0)}; +} +#endif // HWY_TARGET <= HWY_AVX3_DL + +// ------------------------------ InterleaveLower + +// Interleaves lanes from halves of the 128-bit blocks of "a" (which provides +// the least-significant lane) and "b". To concatenate two half-width integers +// into one, use ZipLower/Upper instead (also works with scalar). + +HWY_API Vec512<uint8_t> InterleaveLower(Vec512<uint8_t> a, Vec512<uint8_t> b) { + return Vec512<uint8_t>{_mm512_unpacklo_epi8(a.raw, b.raw)}; +} +HWY_API Vec512<uint16_t> InterleaveLower(Vec512<uint16_t> a, + Vec512<uint16_t> b) { + return Vec512<uint16_t>{_mm512_unpacklo_epi16(a.raw, b.raw)}; +} +HWY_API Vec512<uint32_t> InterleaveLower(Vec512<uint32_t> a, + Vec512<uint32_t> b) { + return Vec512<uint32_t>{_mm512_unpacklo_epi32(a.raw, b.raw)}; +} +HWY_API Vec512<uint64_t> InterleaveLower(Vec512<uint64_t> a, + Vec512<uint64_t> b) { + return Vec512<uint64_t>{_mm512_unpacklo_epi64(a.raw, b.raw)}; +} + +HWY_API Vec512<int8_t> InterleaveLower(Vec512<int8_t> a, Vec512<int8_t> b) { + return Vec512<int8_t>{_mm512_unpacklo_epi8(a.raw, b.raw)}; +} +HWY_API Vec512<int16_t> InterleaveLower(Vec512<int16_t> a, Vec512<int16_t> b) { + return Vec512<int16_t>{_mm512_unpacklo_epi16(a.raw, b.raw)}; +} +HWY_API Vec512<int32_t> InterleaveLower(Vec512<int32_t> a, Vec512<int32_t> b) { + return Vec512<int32_t>{_mm512_unpacklo_epi32(a.raw, b.raw)}; +} +HWY_API Vec512<int64_t> 
InterleaveLower(Vec512<int64_t> a, Vec512<int64_t> b) { + return Vec512<int64_t>{_mm512_unpacklo_epi64(a.raw, b.raw)}; +} + +HWY_API Vec512<float> InterleaveLower(Vec512<float> a, Vec512<float> b) { + return Vec512<float>{_mm512_unpacklo_ps(a.raw, b.raw)}; +} +HWY_API Vec512<double> InterleaveLower(Vec512<double> a, Vec512<double> b) { + return Vec512<double>{_mm512_unpacklo_pd(a.raw, b.raw)}; +} + +// ------------------------------ InterleaveUpper + +// All functions inside detail lack the required D parameter. +namespace detail { + +HWY_API Vec512<uint8_t> InterleaveUpper(Vec512<uint8_t> a, Vec512<uint8_t> b) { + return Vec512<uint8_t>{_mm512_unpackhi_epi8(a.raw, b.raw)}; +} +HWY_API Vec512<uint16_t> InterleaveUpper(Vec512<uint16_t> a, + Vec512<uint16_t> b) { + return Vec512<uint16_t>{_mm512_unpackhi_epi16(a.raw, b.raw)}; +} +HWY_API Vec512<uint32_t> InterleaveUpper(Vec512<uint32_t> a, + Vec512<uint32_t> b) { + return Vec512<uint32_t>{_mm512_unpackhi_epi32(a.raw, b.raw)}; +} +HWY_API Vec512<uint64_t> InterleaveUpper(Vec512<uint64_t> a, + Vec512<uint64_t> b) { + return Vec512<uint64_t>{_mm512_unpackhi_epi64(a.raw, b.raw)}; +} + +HWY_API Vec512<int8_t> InterleaveUpper(Vec512<int8_t> a, Vec512<int8_t> b) { + return Vec512<int8_t>{_mm512_unpackhi_epi8(a.raw, b.raw)}; +} +HWY_API Vec512<int16_t> InterleaveUpper(Vec512<int16_t> a, Vec512<int16_t> b) { + return Vec512<int16_t>{_mm512_unpackhi_epi16(a.raw, b.raw)}; +} +HWY_API Vec512<int32_t> InterleaveUpper(Vec512<int32_t> a, Vec512<int32_t> b) { + return Vec512<int32_t>{_mm512_unpackhi_epi32(a.raw, b.raw)}; +} +HWY_API Vec512<int64_t> InterleaveUpper(Vec512<int64_t> a, Vec512<int64_t> b) { + return Vec512<int64_t>{_mm512_unpackhi_epi64(a.raw, b.raw)}; +} + +HWY_API Vec512<float> InterleaveUpper(Vec512<float> a, Vec512<float> b) { + return Vec512<float>{_mm512_unpackhi_ps(a.raw, b.raw)}; +} +HWY_API Vec512<double> InterleaveUpper(Vec512<double> a, Vec512<double> b) { + return Vec512<double>{_mm512_unpackhi_pd(a.raw, 
b.raw)}; +} + +} // namespace detail + +template <class D, typename T = TFromD<D>> +HWY_API Vec512<T> InterleaveUpper(D /* tag */, Vec512<T> a, Vec512<T> b) { + return detail::InterleaveUpper(a, b); +} + +// ------------------------------ ZipLower/ZipUpper (InterleaveLower) + +// Same as Interleave*, except that the return lanes are double-width integers; +// this is necessary because the single-lane scalar cannot return two values. +template <typename T, typename TW = MakeWide<T>> +HWY_API Vec512<TW> ZipLower(Vec512<T> a, Vec512<T> b) { + const RepartitionToWide<DFromV<decltype(a)>> dw; + return BitCast(dw, InterleaveLower(a, b)); +} +template <class DW, typename T> +HWY_API VFromD<DW> ZipLower(DW dw, Vec512<T> a, Vec512<T> b) { + return BitCast(dw, InterleaveLower(a, b)); +} + +template <class DW, typename T> +HWY_API VFromD<DW> ZipUpper(DW dw, Vec512<T> a, Vec512<T> b) { + const DFromV<decltype(a)> d; + return BitCast(dw, InterleaveUpper(d, a, b)); +} + +// ------------------------------ Concat* halves + +// hiH,hiL loH,loL |-> hiL,loL (= lower halves) +template <class D, typename T = TFromD<D>> +HWY_API Vec512<T> ConcatLowerLower(D /* tag */, Vec512<T> hi, Vec512<T> lo) { + return Vec512<T>{_mm512_shuffle_i32x4(lo.raw, hi.raw, _MM_PERM_BABA)}; +} +template <class D> +HWY_API Vec512<float> ConcatLowerLower(D /* tag */, Vec512<float> hi, + Vec512<float> lo) { + return Vec512<float>{_mm512_shuffle_f32x4(lo.raw, hi.raw, _MM_PERM_BABA)}; +} +template <class D> +HWY_API Vec512<double> ConcatLowerLower(D /* tag */, Vec512<double> hi, + Vec512<double> lo) { + return Vec512<double>{_mm512_shuffle_f64x2(lo.raw, hi.raw, _MM_PERM_BABA)}; +} + +// hiH,hiL loH,loL |-> hiH,loH (= upper halves) +template <class D, typename T = TFromD<D>> +HWY_API Vec512<T> ConcatUpperUpper(D /* tag */, Vec512<T> hi, Vec512<T> lo) { + return Vec512<T>{_mm512_shuffle_i32x4(lo.raw, hi.raw, _MM_PERM_DCDC)}; +} +template <class D> +HWY_API Vec512<float> ConcatUpperUpper(D /* tag */, Vec512<float> 
hi, + Vec512<float> lo) { + return Vec512<float>{_mm512_shuffle_f32x4(lo.raw, hi.raw, _MM_PERM_DCDC)}; +} +template <class D> +HWY_API Vec512<double> ConcatUpperUpper(D /* tag */, Vec512<double> hi, + Vec512<double> lo) { + return Vec512<double>{_mm512_shuffle_f64x2(lo.raw, hi.raw, _MM_PERM_DCDC)}; +} + +// hiH,hiL loH,loL |-> hiL,loH (= inner halves / swap blocks) +template <class D, typename T = TFromD<D>> +HWY_API Vec512<T> ConcatLowerUpper(D /* tag */, Vec512<T> hi, Vec512<T> lo) { + return Vec512<T>{_mm512_shuffle_i32x4(lo.raw, hi.raw, _MM_PERM_BADC)}; +} +template <class D> +HWY_API Vec512<float> ConcatLowerUpper(D /* tag */, Vec512<float> hi, + Vec512<float> lo) { + return Vec512<float>{_mm512_shuffle_f32x4(lo.raw, hi.raw, _MM_PERM_BADC)}; +} +template <class D> +HWY_API Vec512<double> ConcatLowerUpper(D /* tag */, Vec512<double> hi, + Vec512<double> lo) { + return Vec512<double>{_mm512_shuffle_f64x2(lo.raw, hi.raw, _MM_PERM_BADC)}; +} + +// hiH,hiL loH,loL |-> hiH,loL (= outer halves) +template <class D, typename T = TFromD<D>> +HWY_API Vec512<T> ConcatUpperLower(D /* tag */, Vec512<T> hi, Vec512<T> lo) { + // There are no imm8 blend in AVX512. Use blend16 because 32-bit masks + // are efficiently loaded from 32-bit regs. 
+ const __mmask32 mask = /*_cvtu32_mask32 */ (0x0000FFFF); + return Vec512<T>{_mm512_mask_blend_epi16(mask, hi.raw, lo.raw)}; +} +template <class D> +HWY_API Vec512<float> ConcatUpperLower(D /* tag */, Vec512<float> hi, + Vec512<float> lo) { + const __mmask16 mask = /*_cvtu32_mask16 */ (0x00FF); + return Vec512<float>{_mm512_mask_blend_ps(mask, hi.raw, lo.raw)}; +} +template <class D> +HWY_API Vec512<double> ConcatUpperLower(D /* tag */, Vec512<double> hi, + Vec512<double> lo) { + const __mmask8 mask = /*_cvtu32_mask8 */ (0x0F); + return Vec512<double>{_mm512_mask_blend_pd(mask, hi.raw, lo.raw)}; +} + +// ------------------------------ ConcatOdd + +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec512<T> ConcatOdd(D d, Vec512<T> hi, Vec512<T> lo) { + const RebindToUnsigned<decltype(d)> du; +#if HWY_TARGET <= HWY_AVX3_DL + alignas(64) static constexpr uint8_t kIdx[64] = { + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, + 27, 29, 31, 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, + 53, 55, 57, 59, 61, 63, 65, 67, 69, 71, 73, 75, 77, + 79, 81, 83, 85, 87, 89, 91, 93, 95, 97, 99, 101, 103, + 105, 107, 109, 111, 113, 115, 117, 119, 121, 123, 125, 127}; + return BitCast( + d, Vec512<uint8_t>{_mm512_permutex2var_epi8( + BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)}); +#else + const RepartitionToWide<decltype(du)> dw; + // Right-shift 8 bits per u16 so we can pack. + const Vec512<uint16_t> uH = ShiftRight<8>(BitCast(dw, hi)); + const Vec512<uint16_t> uL = ShiftRight<8>(BitCast(dw, lo)); + const Vec512<uint64_t> u8{_mm512_packus_epi16(uL.raw, uH.raw)}; + // Undo block interleave: lower half = even u64 lanes, upper = odd u64 lanes. 
+ const Full512<uint64_t> du64; + alignas(64) static constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; + return BitCast(d, TableLookupLanes(u8, SetTableIndices(du64, kIdx))); +#endif +} + +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)> +HWY_API Vec512<T> ConcatOdd(D d, Vec512<T> hi, Vec512<T> lo) { + const RebindToUnsigned<decltype(d)> du; + alignas(64) static constexpr uint16_t kIdx[32] = { + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, + 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63}; + return BitCast( + d, Vec512<uint16_t>{_mm512_permutex2var_epi16( + BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)}); +} + +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec512<T> ConcatOdd(D d, Vec512<T> hi, Vec512<T> lo) { + const RebindToUnsigned<decltype(d)> du; + alignas(64) static constexpr uint32_t kIdx[16] = { + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}; + return BitCast( + d, Vec512<uint32_t>{_mm512_permutex2var_epi32( + BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)}); +} + +template <class D> +HWY_API Vec512<float> ConcatOdd(D d, Vec512<float> hi, Vec512<float> lo) { + const RebindToUnsigned<decltype(d)> du; + alignas(64) static constexpr uint32_t kIdx[16] = { + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}; + return Vec512<float>{ + _mm512_permutex2var_ps(lo.raw, Load(du, kIdx).raw, hi.raw)}; +} + +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)> +HWY_API Vec512<T> ConcatOdd(D d, Vec512<T> hi, Vec512<T> lo) { + const RebindToUnsigned<decltype(d)> du; + alignas(64) static constexpr uint64_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15}; + return BitCast( + d, Vec512<uint64_t>{_mm512_permutex2var_epi64( + BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)}); +} + +template <class D> +HWY_API Vec512<double> ConcatOdd(D d, Vec512<double> hi, Vec512<double> lo) { + const RebindToUnsigned<decltype(d)> du; + alignas(64) 
static constexpr uint64_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15}; + return Vec512<double>{ + _mm512_permutex2var_pd(lo.raw, Load(du, kIdx).raw, hi.raw)}; +} + +// ------------------------------ ConcatEven + +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec512<T> ConcatEven(D d, Vec512<T> hi, Vec512<T> lo) { + const RebindToUnsigned<decltype(d)> du; +#if HWY_TARGET <= HWY_AVX3_DL + alignas(64) static constexpr uint8_t kIdx[64] = { + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, + 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, + 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, + 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, + 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126}; + return BitCast( + d, Vec512<uint32_t>{_mm512_permutex2var_epi8( + BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)}); +#else + const RepartitionToWide<decltype(du)> dw; + // Isolate lower 8 bits per u16 so we can pack. + const Vec512<uint16_t> mask = Set(dw, 0x00FF); + const Vec512<uint16_t> uH = And(BitCast(dw, hi), mask); + const Vec512<uint16_t> uL = And(BitCast(dw, lo), mask); + const Vec512<uint64_t> u8{_mm512_packus_epi16(uL.raw, uH.raw)}; + // Undo block interleave: lower half = even u64 lanes, upper = odd u64 lanes. 
+ const Full512<uint64_t> du64; + alignas(64) static constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; + return BitCast(d, TableLookupLanes(u8, SetTableIndices(du64, kIdx))); +#endif +} + +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)> +HWY_API Vec512<T> ConcatEven(D d, Vec512<T> hi, Vec512<T> lo) { + const RebindToUnsigned<decltype(d)> du; + alignas(64) static constexpr uint16_t kIdx[32] = { + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62}; + return BitCast( + d, Vec512<uint32_t>{_mm512_permutex2var_epi16( + BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)}); +} + +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec512<T> ConcatEven(D d, Vec512<T> hi, Vec512<T> lo) { + const RebindToUnsigned<decltype(d)> du; + alignas(64) static constexpr uint32_t kIdx[16] = { + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30}; + return BitCast( + d, Vec512<uint32_t>{_mm512_permutex2var_epi32( + BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)}); +} + +template <class D> +HWY_API Vec512<float> ConcatEven(D d, Vec512<float> hi, Vec512<float> lo) { + const RebindToUnsigned<decltype(d)> du; + alignas(64) static constexpr uint32_t kIdx[16] = { + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30}; + return Vec512<float>{ + _mm512_permutex2var_ps(lo.raw, Load(du, kIdx).raw, hi.raw)}; +} + +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)> +HWY_API Vec512<T> ConcatEven(D d, Vec512<T> hi, Vec512<T> lo) { + const RebindToUnsigned<decltype(d)> du; + alignas(64) static constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14}; + return BitCast( + d, Vec512<uint64_t>{_mm512_permutex2var_epi64( + BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)}); +} + +template <class D> +HWY_API Vec512<double> ConcatEven(D d, Vec512<double> hi, Vec512<double> lo) { + const RebindToUnsigned<decltype(d)> du; + 
alignas(64) static constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14}; + return Vec512<double>{ + _mm512_permutex2var_pd(lo.raw, Load(du, kIdx).raw, hi.raw)}; +} + +// ------------------------------ DupEven (InterleaveLower) + +template <typename T, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec512<T> DupEven(Vec512<T> v) { + return Vec512<T>{_mm512_shuffle_epi32(v.raw, _MM_PERM_CCAA)}; +} +HWY_API Vec512<float> DupEven(Vec512<float> v) { + return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_CCAA)}; +} + +template <typename T, HWY_IF_T_SIZE(T, 8)> +HWY_API Vec512<T> DupEven(const Vec512<T> v) { + const DFromV<decltype(v)> d; + return InterleaveLower(d, v, v); +} + +// ------------------------------ DupOdd (InterleaveUpper) + +template <typename T, HWY_IF_T_SIZE(T, 4)> +HWY_API Vec512<T> DupOdd(Vec512<T> v) { + return Vec512<T>{_mm512_shuffle_epi32(v.raw, _MM_PERM_DDBB)}; +} +HWY_API Vec512<float> DupOdd(Vec512<float> v) { + return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_DDBB)}; +} + +template <typename T, HWY_IF_T_SIZE(T, 8)> +HWY_API Vec512<T> DupOdd(const Vec512<T> v) { + const DFromV<decltype(v)> d; + return InterleaveUpper(d, v, v); +} + +// ------------------------------ OddEven (IfThenElse) + +template <typename T> +HWY_API Vec512<T> OddEven(const Vec512<T> a, const Vec512<T> b) { + constexpr size_t s = sizeof(T); + constexpr int shift = s == 1 ? 0 : s == 2 ? 32 : s == 4 ? 
48 : 56; + return IfThenElse(Mask512<T>{0x5555555555555555ull >> shift}, b, a); +} + +// ------------------------------ OddEvenBlocks + +template <typename T> +HWY_API Vec512<T> OddEvenBlocks(Vec512<T> odd, Vec512<T> even) { + return Vec512<T>{_mm512_mask_blend_epi64(__mmask8{0x33u}, odd.raw, even.raw)}; +} + +HWY_API Vec512<float> OddEvenBlocks(Vec512<float> odd, Vec512<float> even) { + return Vec512<float>{ + _mm512_mask_blend_ps(__mmask16{0x0F0Fu}, odd.raw, even.raw)}; +} + +HWY_API Vec512<double> OddEvenBlocks(Vec512<double> odd, Vec512<double> even) { + return Vec512<double>{ + _mm512_mask_blend_pd(__mmask8{0x33u}, odd.raw, even.raw)}; +} + +// ------------------------------ SwapAdjacentBlocks + +template <typename T> +HWY_API Vec512<T> SwapAdjacentBlocks(Vec512<T> v) { + return Vec512<T>{_mm512_shuffle_i32x4(v.raw, v.raw, _MM_PERM_CDAB)}; +} + +HWY_API Vec512<float> SwapAdjacentBlocks(Vec512<float> v) { + return Vec512<float>{_mm512_shuffle_f32x4(v.raw, v.raw, _MM_PERM_CDAB)}; +} + +HWY_API Vec512<double> SwapAdjacentBlocks(Vec512<double> v) { + return Vec512<double>{_mm512_shuffle_f64x2(v.raw, v.raw, _MM_PERM_CDAB)}; +} + +// ------------------------------ ReverseBlocks + +template <class D, typename T = TFromD<D>> +HWY_API Vec512<T> ReverseBlocks(D /* tag */, Vec512<T> v) { + return Vec512<T>{_mm512_shuffle_i32x4(v.raw, v.raw, _MM_PERM_ABCD)}; +} +template <class D> +HWY_API Vec512<float> ReverseBlocks(D /* tag */, Vec512<float> v) { + return Vec512<float>{_mm512_shuffle_f32x4(v.raw, v.raw, _MM_PERM_ABCD)}; +} +template <class D> +HWY_API Vec512<double> ReverseBlocks(D /* tag */, Vec512<double> v) { + return Vec512<double>{_mm512_shuffle_f64x2(v.raw, v.raw, _MM_PERM_ABCD)}; +} + +// ------------------------------ TableLookupBytes (ZeroExtendVector) + +// Both full +template <typename T, typename TI> +HWY_API Vec512<TI> TableLookupBytes(Vec512<T> bytes, Vec512<TI> indices) { + return Vec512<TI>{_mm512_shuffle_epi8(bytes.raw, indices.raw)}; +} + +// Partial 
index vector +template <typename T, typename TI, size_t NI> +HWY_API Vec128<TI, NI> TableLookupBytes(Vec512<T> bytes, Vec128<TI, NI> from) { + const Full512<TI> d512; + const Half<decltype(d512)> d256; + const Half<decltype(d256)> d128; + // First expand to full 128, then 256, then 512. + const Vec128<TI> from_full{from.raw}; + const auto from_512 = + ZeroExtendVector(d512, ZeroExtendVector(d256, from_full)); + const auto tbl_full = TableLookupBytes(bytes, from_512); + // Shrink to 256, then 128, then partial. + return Vec128<TI, NI>{LowerHalf(d128, LowerHalf(d256, tbl_full)).raw}; +} +template <typename T, typename TI> +HWY_API Vec256<TI> TableLookupBytes(Vec512<T> bytes, Vec256<TI> from) { + const DFromV<decltype(from)> dih; + const Twice<decltype(dih)> di; + const auto from_512 = ZeroExtendVector(di, from); + return LowerHalf(dih, TableLookupBytes(bytes, from_512)); +} + +// Partial table vector +template <typename T, size_t N, typename TI> +HWY_API Vec512<TI> TableLookupBytes(Vec128<T, N> bytes, Vec512<TI> from) { + const DFromV<decltype(from)> d512; + const Half<decltype(d512)> d256; + const Half<decltype(d256)> d128; + // First expand to full 128, then 256, then 512. + const Vec128<T> bytes_full{bytes.raw}; + const auto bytes_512 = + ZeroExtendVector(d512, ZeroExtendVector(d256, bytes_full)); + return TableLookupBytes(bytes_512, from); +} +template <typename T, typename TI> +HWY_API Vec512<TI> TableLookupBytes(Vec256<T> bytes, Vec512<TI> from) { + const Full512<T> d; + return TableLookupBytes(ZeroExtendVector(d, bytes), from); +} + +// Partial both are handled by x86_128/256. + +// ================================================== CONVERT + +// ------------------------------ Promotions (part w/ narrow lanes -> full) + +// Unsigned: zero-extend. +// Note: these have 3 cycle latency; if inputs are already split across the +// 128 bit blocks (in their upper/lower halves), then Zip* would be faster. 
+template <class D, HWY_IF_U16_D(D)> +HWY_API Vec512<uint16_t> PromoteTo(D /* tag */, Vec256<uint8_t> v) { + return Vec512<uint16_t>{_mm512_cvtepu8_epi16(v.raw)}; +} +template <class D, HWY_IF_U32_D(D)> +HWY_API Vec512<uint32_t> PromoteTo(D /* tag */, Vec128<uint8_t> v) { + return Vec512<uint32_t>{_mm512_cvtepu8_epi32(v.raw)}; +} +template <class D, HWY_IF_U32_D(D)> +HWY_API Vec512<uint32_t> PromoteTo(D /* tag */, Vec256<uint16_t> v) { + return Vec512<uint32_t>{_mm512_cvtepu16_epi32(v.raw)}; +} +template <class D, HWY_IF_U64_D(D)> +HWY_API Vec512<uint64_t> PromoteTo(D /* tag */, Vec256<uint32_t> v) { + return Vec512<uint64_t>{_mm512_cvtepu32_epi64(v.raw)}; +} +template <class D, HWY_IF_U64_D(D)> +HWY_API Vec512<uint64_t> PromoteTo(D /* tag */, Vec128<uint16_t> v) { + return Vec512<uint64_t>{_mm512_cvtepu16_epi64(v.raw)}; +} +template <class D, HWY_IF_U64_D(D)> +HWY_API Vec512<uint64_t> PromoteTo(D /* tag */, Vec64<uint8_t> v) { + return Vec512<uint64_t>{_mm512_cvtepu8_epi64(v.raw)}; +} + +// Signed: replicate sign bit. +// Note: these have 3 cycle latency; if inputs are already split across the +// 128 bit blocks (in their upper/lower halves), then ZipUpper/lo followed by +// signed shift would be faster. 
+template <class D, HWY_IF_I16_D(D)> +HWY_API Vec512<int16_t> PromoteTo(D /* tag */, Vec256<int8_t> v) { + return Vec512<int16_t>{_mm512_cvtepi8_epi16(v.raw)}; +} +template <class D, HWY_IF_I32_D(D)> +HWY_API Vec512<int32_t> PromoteTo(D /* tag */, Vec128<int8_t> v) { + return Vec512<int32_t>{_mm512_cvtepi8_epi32(v.raw)}; +} +template <class D, HWY_IF_I32_D(D)> +HWY_API Vec512<int32_t> PromoteTo(D /* tag */, Vec256<int16_t> v) { + return Vec512<int32_t>{_mm512_cvtepi16_epi32(v.raw)}; +} +template <class D, HWY_IF_I64_D(D)> +HWY_API Vec512<int64_t> PromoteTo(D /* tag */, Vec256<int32_t> v) { + return Vec512<int64_t>{_mm512_cvtepi32_epi64(v.raw)}; +} +template <class D, HWY_IF_I64_D(D)> +HWY_API Vec512<int64_t> PromoteTo(D /* tag */, Vec128<int16_t> v) { + return Vec512<int64_t>{_mm512_cvtepi16_epi64(v.raw)}; +} +template <class D, HWY_IF_I64_D(D)> +HWY_API Vec512<int64_t> PromoteTo(D /* tag */, Vec64<int8_t> v) { + return Vec512<int64_t>{_mm512_cvtepi8_epi64(v.raw)}; +} + +// Float +template <class D, HWY_IF_F32_D(D)> +HWY_API Vec512<float> PromoteTo(D /* tag */, Vec256<float16_t> v) { + return Vec512<float>{_mm512_cvtph_ps(v.raw)}; +} + +template <class D, HWY_IF_F32_D(D)> +HWY_API Vec512<float> PromoteTo(D df32, Vec256<bfloat16_t> v) { + const Rebind<uint16_t, decltype(df32)> du16; + const RebindToSigned<decltype(df32)> di32; + return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v)))); +} + +template <class D, HWY_IF_F64_D(D)> +HWY_API Vec512<double> PromoteTo(D /* tag */, Vec256<float> v) { + return Vec512<double>{_mm512_cvtps_pd(v.raw)}; +} + +template <class D, HWY_IF_F64_D(D)> +HWY_API Vec512<double> PromoteTo(D /* tag */, Vec256<int32_t> v) { + return Vec512<double>{_mm512_cvtepi32_pd(v.raw)}; +} + +// ------------------------------ Demotions (full -> part w/ narrow lanes) + +template <class D, HWY_IF_U16_D(D)> +HWY_API Vec256<uint16_t> DemoteTo(D /* tag */, Vec512<int32_t> v) { + const Full512<uint64_t> du64; + const Vec512<uint16_t> 
u16{_mm512_packus_epi32(v.raw, v.raw)}; + + // Compress even u64 lanes into 256 bit. + alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6}; + const auto idx64 = Load(du64, kLanes); + const Vec512<uint16_t> even{_mm512_permutexvar_epi64(idx64.raw, u16.raw)}; + return LowerHalf(even); +} + +template <class D, HWY_IF_U16_D(D)> +HWY_API Vec256<uint16_t> DemoteTo(D dn, Vec512<uint32_t> v) { + const DFromV<decltype(v)> d; + const RebindToSigned<decltype(d)> di; + return DemoteTo(dn, BitCast(di, Min(v, Set(d, 0x7FFFFFFFu)))); +} + +template <class D, HWY_IF_I16_D(D)> +HWY_API Vec256<int16_t> DemoteTo(D /* tag */, Vec512<int32_t> v) { + const Full512<uint64_t> du64; + const Vec512<int16_t> i16{_mm512_packs_epi32(v.raw, v.raw)}; + + // Compress even u64 lanes into 256 bit. + alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6}; + const auto idx64 = Load(du64, kLanes); + const Vec512<int16_t> even{_mm512_permutexvar_epi64(idx64.raw, i16.raw)}; + return LowerHalf(even); +} + +template <class D, HWY_IF_U8_D(D)> +HWY_API Vec128<uint8_t> DemoteTo(D /* tag */, Vec512<int32_t> v) { + const Full512<uint32_t> du32; + const Vec512<int16_t> i16{_mm512_packs_epi32(v.raw, v.raw)}; + const Vec512<uint8_t> u8{_mm512_packus_epi16(i16.raw, i16.raw)}; + + alignas(16) static constexpr uint32_t kLanes[4] = {0, 4, 8, 12}; + const auto idx32 = LoadDup128(du32, kLanes); + const Vec512<uint8_t> fixed{_mm512_permutexvar_epi32(idx32.raw, u8.raw)}; + return LowerHalf(LowerHalf(fixed)); +} + +template <class D, HWY_IF_U8_D(D)> +HWY_API Vec128<uint8_t> DemoteTo(D /* tag */, Vec512<uint32_t> v) { + return Vec128<uint8_t>{_mm512_cvtusepi32_epi8(v.raw)}; +} + +template <class D, HWY_IF_U8_D(D)> +HWY_API Vec256<uint8_t> DemoteTo(D /* tag */, Vec512<int16_t> v) { + const Full512<uint64_t> du64; + const Vec512<uint8_t> u8{_mm512_packus_epi16(v.raw, v.raw)}; + + // Compress even u64 lanes into 256 bit. 
+ alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6}; + const auto idx64 = Load(du64, kLanes); + const Vec512<uint8_t> even{_mm512_permutexvar_epi64(idx64.raw, u8.raw)}; + return LowerHalf(even); +} + +template <class D, HWY_IF_U8_D(D)> +HWY_API Vec256<uint8_t> DemoteTo(D dn, Vec512<uint16_t> v) { + const DFromV<decltype(v)> d; + const RebindToSigned<decltype(d)> di; + return DemoteTo(dn, BitCast(di, Min(v, Set(d, 0x7FFFu)))); +} + +template <class D, HWY_IF_I8_D(D)> +HWY_API Vec128<int8_t> DemoteTo(D /* tag */, Vec512<int32_t> v) { + const Full512<uint32_t> du32; + const Vec512<int16_t> i16{_mm512_packs_epi32(v.raw, v.raw)}; + const Vec512<int8_t> i8{_mm512_packs_epi16(i16.raw, i16.raw)}; + + alignas(16) static constexpr uint32_t kLanes[16] = {0, 4, 8, 12, 0, 4, 8, 12, + 0, 4, 8, 12, 0, 4, 8, 12}; + const auto idx32 = LoadDup128(du32, kLanes); + const Vec512<int8_t> fixed{_mm512_permutexvar_epi32(idx32.raw, i8.raw)}; + return LowerHalf(LowerHalf(fixed)); +} + +template <class D, HWY_IF_I8_D(D)> +HWY_API Vec256<int8_t> DemoteTo(D /* tag */, Vec512<int16_t> v) { + const Full512<uint64_t> du64; + const Vec512<int8_t> u8{_mm512_packs_epi16(v.raw, v.raw)}; + + // Compress even u64 lanes into 256 bit. 
+ alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6}; + const auto idx64 = Load(du64, kLanes); + const Vec512<int8_t> even{_mm512_permutexvar_epi64(idx64.raw, u8.raw)}; + return LowerHalf(even); +} + +template <class D, HWY_IF_I32_D(D)> +HWY_API Vec256<int32_t> DemoteTo(D /* tag */, Vec512<int64_t> v) { + return Vec256<int32_t>{_mm512_cvtsepi64_epi32(v.raw)}; +} +template <class D, HWY_IF_I16_D(D)> +HWY_API Vec128<int16_t> DemoteTo(D /* tag */, Vec512<int64_t> v) { + return Vec128<int16_t>{_mm512_cvtsepi64_epi16(v.raw)}; +} +template <class D, HWY_IF_I8_D(D)> +HWY_API Vec64<int8_t> DemoteTo(D /* tag */, Vec512<int64_t> v) { + return Vec64<int8_t>{_mm512_cvtsepi64_epi8(v.raw)}; +} + +template <class D, HWY_IF_U32_D(D)> +HWY_API Vec256<uint32_t> DemoteTo(D /* tag */, Vec512<int64_t> v) { + const auto neg_mask = MaskFromVec(v); +#if HWY_COMPILER_HAS_MASK_INTRINSICS + const __mmask8 non_neg_mask = _knot_mask8(neg_mask.raw); +#else + const __mmask8 non_neg_mask = static_cast<__mmask8>(~neg_mask.raw); +#endif + return Vec256<uint32_t>{_mm512_maskz_cvtusepi64_epi32(non_neg_mask, v.raw)}; +} +template <class D, HWY_IF_U16_D(D)> +HWY_API Vec128<uint16_t> DemoteTo(D /* tag */, Vec512<int64_t> v) { + const auto neg_mask = MaskFromVec(v); +#if HWY_COMPILER_HAS_MASK_INTRINSICS + const __mmask8 non_neg_mask = _knot_mask8(neg_mask.raw); +#else + const __mmask8 non_neg_mask = static_cast<__mmask8>(~neg_mask.raw); +#endif + return Vec128<uint16_t>{_mm512_maskz_cvtusepi64_epi16(non_neg_mask, v.raw)}; +} +template <class D, HWY_IF_U8_D(D)> +HWY_API Vec64<uint8_t> DemoteTo(D /* tag */, Vec512<int64_t> v) { + const auto neg_mask = MaskFromVec(v); +#if HWY_COMPILER_HAS_MASK_INTRINSICS + const __mmask8 non_neg_mask = _knot_mask8(neg_mask.raw); +#else + const __mmask8 non_neg_mask = static_cast<__mmask8>(~neg_mask.raw); +#endif + return Vec64<uint8_t>{_mm512_maskz_cvtusepi64_epi8(non_neg_mask, v.raw)}; +} + +template <class D, HWY_IF_U32_D(D)> +HWY_API 
Vec256<uint32_t> DemoteTo(D /* tag */, Vec512<uint64_t> v) { + return Vec256<uint32_t>{_mm512_cvtusepi64_epi32(v.raw)}; +} +template <class D, HWY_IF_U16_D(D)> +HWY_API Vec128<uint16_t> DemoteTo(D /* tag */, Vec512<uint64_t> v) { + return Vec128<uint16_t>{_mm512_cvtusepi64_epi16(v.raw)}; +} +template <class D, HWY_IF_U8_D(D)> +HWY_API Vec64<uint8_t> DemoteTo(D /* tag */, Vec512<uint64_t> v) { + return Vec64<uint8_t>{_mm512_cvtusepi64_epi8(v.raw)}; +} + +template <class D, HWY_IF_F16_D(D)> +HWY_API Vec256<float16_t> DemoteTo(D /* tag */, Vec512<float> v) { + // Work around warnings in the intrinsic definitions (passing -1 as a mask). + HWY_DIAGNOSTICS(push) + HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") + return Vec256<float16_t>{_mm512_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)}; + HWY_DIAGNOSTICS(pop) +} + +template <class D, HWY_IF_BF16_D(D)> +HWY_API Vec256<bfloat16_t> DemoteTo(D dbf16, Vec512<float> v) { + // TODO(janwas): _mm512_cvtneps_pbh once we have avx512bf16. + const Rebind<int32_t, decltype(dbf16)> di32; + const Rebind<uint32_t, decltype(dbf16)> du32; // for logical shift right + const Rebind<uint16_t, decltype(dbf16)> du16; + const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v))); + return BitCast(dbf16, DemoteTo(du16, bits_in_32)); +} + +template <class D, HWY_IF_BF16_D(D)> +HWY_API Vec512<bfloat16_t> ReorderDemote2To(D dbf16, Vec512<float> a, + Vec512<float> b) { + // TODO(janwas): _mm512_cvtne2ps_pbh once we have avx512bf16. 
+ const RebindToUnsigned<decltype(dbf16)> du16; + const Repartition<uint32_t, decltype(dbf16)> du32; + const Vec512<uint32_t> b_in_even = ShiftRight<16>(BitCast(du32, b)); + return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even))); +} + +template <class D, HWY_IF_I16_D(D)> +HWY_API Vec512<int16_t> ReorderDemote2To(D /* tag */, Vec512<int32_t> a, + Vec512<int32_t> b) { + return Vec512<int16_t>{_mm512_packs_epi32(a.raw, b.raw)}; +} + +template <class D, HWY_IF_U16_D(D)> +HWY_API Vec512<uint16_t> ReorderDemote2To(D /* tag */, Vec512<int32_t> a, + Vec512<int32_t> b) { + return Vec512<uint16_t>{_mm512_packus_epi32(a.raw, b.raw)}; +} + +template <class D, HWY_IF_U16_D(D)> +HWY_API Vec512<uint16_t> ReorderDemote2To(D dn, Vec512<uint32_t> a, + Vec512<uint32_t> b) { + const DFromV<decltype(a)> du32; + const RebindToSigned<decltype(du32)> di32; + const auto max_i32 = Set(du32, 0x7FFFFFFFu); + + return ReorderDemote2To(dn, BitCast(di32, Min(a, max_i32)), + BitCast(di32, Min(b, max_i32))); +} + +template <class D, HWY_IF_I8_D(D)> +HWY_API Vec512<int8_t> ReorderDemote2To(D /* tag */, Vec512<int16_t> a, + Vec512<int16_t> b) { + return Vec512<int8_t>{_mm512_packs_epi16(a.raw, b.raw)}; +} + +template <class D, HWY_IF_U8_D(D)> +HWY_API Vec512<uint8_t> ReorderDemote2To(D /* tag */, Vec512<int16_t> a, + Vec512<int16_t> b) { + return Vec512<uint8_t>{_mm512_packus_epi16(a.raw, b.raw)}; +} + +template <class D, HWY_IF_U8_D(D)> +HWY_API Vec512<uint8_t> ReorderDemote2To(D dn, Vec512<uint16_t> a, + Vec512<uint16_t> b) { + const DFromV<decltype(a)> du16; + const RebindToSigned<decltype(du16)> di16; + const auto max_i16 = Set(du16, 0x7FFFu); + + return ReorderDemote2To(dn, BitCast(di16, Min(a, max_i16)), + BitCast(di16, Min(b, max_i16))); +} + +template <class D, HWY_IF_T_SIZE_D(D, 4), + HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>)> +HWY_API VFromD<D> ReorderDemote2To(D dn, Vec512<int64_t> a, Vec512<int64_t> b) { + const Half<decltype(dn)> dnh; + return Combine(dn, DemoteTo(dnh, 
b), DemoteTo(dnh, a)); +} + +template <class D, HWY_IF_U32_D(D)> +HWY_API Vec512<uint32_t> ReorderDemote2To(D dn, Vec512<uint64_t> a, + Vec512<uint64_t> b) { + const Half<decltype(dn)> dnh; + return Combine(dn, DemoteTo(dnh, b), DemoteTo(dnh, a)); +} + +template <class D, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>), + HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), + HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2), + HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2), + HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) | (1 << 4))> +HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) { + const Full512<uint64_t> du64; + alignas(64) static constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; + return BitCast(d, TableLookupLanes(BitCast(du64, ReorderDemote2To(d, a, b)), + SetTableIndices(du64, kIdx))); +} + +template <class D, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>), + HWY_IF_V_SIZE_GT_D(D, 16), class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), + HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2), + HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2), + HWY_IF_T_SIZE_V(V, 8)> +HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) { + return ReorderDemote2To(d, a, b); +} + +template <class D, HWY_IF_F32_D(D)> +HWY_API Vec256<float> DemoteTo(D /* tag */, Vec512<double> v) { + return Vec256<float>{_mm512_cvtpd_ps(v.raw)}; +} + +template <class D, HWY_IF_I32_D(D)> +HWY_API Vec256<int32_t> DemoteTo(D /* tag */, Vec512<double> v) { + const DFromV<decltype(v)> d; + const Vec512<double> clamped = detail::ClampF64ToI32Max(d, v); + return Vec256<int32_t>{_mm512_cvttpd_epi32(clamped.raw)}; +} + +// For already range-limited input [0, 255]. +HWY_API Vec128<uint8_t> U8FromU32(const Vec512<uint32_t> v) { + const DFromV<decltype(v)> d32; + // In each 128 bit block, gather the lower byte of 4 uint32_t lanes into the + // lowest 4 bytes. 
+ alignas(16) static constexpr uint32_t k8From32[4] = {0x0C080400u, ~0u, ~0u, + ~0u}; + const auto quads = TableLookupBytes(v, LoadDup128(d32, k8From32)); + // Gather the lowest 4 bytes of 4 128-bit blocks. + alignas(16) static constexpr uint32_t kIndex32[4] = {0, 4, 8, 12}; + const Vec512<uint8_t> bytes{ + _mm512_permutexvar_epi32(LoadDup128(d32, kIndex32).raw, quads.raw)}; + return LowerHalf(LowerHalf(bytes)); +} + +// ------------------------------ Truncations + +template <class D, HWY_IF_U8_D(D)> +HWY_API Vec64<uint8_t> TruncateTo(D d, const Vec512<uint64_t> v) { +#if HWY_TARGET <= HWY_AVX3_DL + (void)d; + const Full512<uint8_t> d8; + alignas(16) static constexpr uint8_t k8From64[16] = { + 0, 8, 16, 24, 32, 40, 48, 56, 0, 8, 16, 24, 32, 40, 48, 56}; + const Vec512<uint8_t> bytes{ + _mm512_permutexvar_epi8(LoadDup128(d8, k8From64).raw, v.raw)}; + return LowerHalf(LowerHalf(LowerHalf(bytes))); +#else + const Full512<uint32_t> d32; + alignas(64) static constexpr uint32_t kEven[16] = {0, 2, 4, 6, 8, 10, 12, 14, + 0, 2, 4, 6, 8, 10, 12, 14}; + const Vec512<uint32_t> even{ + _mm512_permutexvar_epi32(Load(d32, kEven).raw, v.raw)}; + return TruncateTo(d, LowerHalf(even)); +#endif +} + +template <class D, HWY_IF_U16_D(D)> +HWY_API Vec128<uint16_t> TruncateTo(D /* tag */, const Vec512<uint64_t> v) { + const Full512<uint16_t> d16; + alignas(16) static constexpr uint16_t k16From64[8] = {0, 4, 8, 12, + 16, 20, 24, 28}; + const Vec512<uint16_t> bytes{ + _mm512_permutexvar_epi16(LoadDup128(d16, k16From64).raw, v.raw)}; + return LowerHalf(LowerHalf(bytes)); +} + +template <class D, HWY_IF_U32_D(D)> +HWY_API Vec256<uint32_t> TruncateTo(D /* tag */, const Vec512<uint64_t> v) { + const Full512<uint32_t> d32; + alignas(64) static constexpr uint32_t kEven[16] = {0, 2, 4, 6, 8, 10, 12, 14, + 0, 2, 4, 6, 8, 10, 12, 14}; + const Vec512<uint32_t> even{ + _mm512_permutexvar_epi32(Load(d32, kEven).raw, v.raw)}; + return LowerHalf(even); +} + +template <class D, HWY_IF_U8_D(D)> +HWY_API 
Vec128<uint8_t> TruncateTo(D /* tag */, const Vec512<uint32_t> v) { +#if HWY_TARGET <= HWY_AVX3_DL + const Full512<uint8_t> d8; + alignas(16) static constexpr uint8_t k8From32[16] = { + 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60}; + const Vec512<uint8_t> bytes{ + _mm512_permutexvar_epi8(LoadDup128(d8, k8From32).raw, v.raw)}; +#else + const Full512<uint32_t> d32; + // In each 128 bit block, gather the lower byte of 4 uint32_t lanes into the + // lowest 4 bytes. + alignas(16) static constexpr uint32_t k8From32[4] = {0x0C080400u, ~0u, ~0u, + ~0u}; + const auto quads = TableLookupBytes(v, LoadDup128(d32, k8From32)); + // Gather the lowest 4 bytes of 4 128-bit blocks. + alignas(16) static constexpr uint32_t kIndex32[4] = {0, 4, 8, 12}; + const Vec512<uint8_t> bytes{ + _mm512_permutexvar_epi32(LoadDup128(d32, kIndex32).raw, quads.raw)}; +#endif + return LowerHalf(LowerHalf(bytes)); +} + +template <class D, HWY_IF_U16_D(D)> +HWY_API Vec256<uint16_t> TruncateTo(D /* tag */, const Vec512<uint32_t> v) { + const Full512<uint16_t> d16; + alignas(64) static constexpr uint16_t k16From32[32] = { + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30}; + const Vec512<uint16_t> bytes{ + _mm512_permutexvar_epi16(Load(d16, k16From32).raw, v.raw)}; + return LowerHalf(bytes); +} + +template <class D, HWY_IF_U8_D(D)> +HWY_API Vec256<uint8_t> TruncateTo(D /* tag */, const Vec512<uint16_t> v) { +#if HWY_TARGET <= HWY_AVX3_DL + const Full512<uint8_t> d8; + alignas(64) static constexpr uint8_t k8From16[64] = { + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62}; + const Vec512<uint8_t> bytes{ + _mm512_permutexvar_epi8(Load(d8, k8From16).raw, v.raw)}; +#else + const Full512<uint32_t> d32; + alignas(16) static 
constexpr uint32_t k16From32[4] = { + 0x06040200u, 0x0E0C0A08u, 0x06040200u, 0x0E0C0A08u}; + const auto quads = TableLookupBytes(v, LoadDup128(d32, k16From32)); + alignas(64) static constexpr uint32_t kIndex32[16] = { + 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13}; + const Vec512<uint8_t> bytes{ + _mm512_permutexvar_epi32(Load(d32, kIndex32).raw, quads.raw)}; +#endif + return LowerHalf(bytes); +} + +// ------------------------------ Convert integer <=> floating point + +template <class D, HWY_IF_F32_D(D)> +HWY_API Vec512<float> ConvertTo(D /* tag */, Vec512<int32_t> v) { + return Vec512<float>{_mm512_cvtepi32_ps(v.raw)}; +} + +template <class D, HWY_IF_F64_D(D)> +HWY_API Vec512<double> ConvertTo(D /* tag */, Vec512<int64_t> v) { + return Vec512<double>{_mm512_cvtepi64_pd(v.raw)}; +} + +template <class D, HWY_IF_F32_D(D)> +HWY_API Vec512<float> ConvertTo(D /* tag*/, Vec512<uint32_t> v) { + return Vec512<float>{_mm512_cvtepu32_ps(v.raw)}; +} + +template <class D, HWY_IF_F64_D(D)> +HWY_API Vec512<double> ConvertTo(D /* tag*/, Vec512<uint64_t> v) { + return Vec512<double>{_mm512_cvtepu64_pd(v.raw)}; +} + +// Truncates (rounds toward zero). 
+template <class D, HWY_IF_I32_D(D)> +HWY_API Vec512<int32_t> ConvertTo(D d, Vec512<float> v) { + return detail::FixConversionOverflow(d, v, _mm512_cvttps_epi32(v.raw)); +} +template <class D, HWY_IF_I64_D(D)> +HWY_API Vec512<int64_t> ConvertTo(D di, Vec512<double> v) { + return detail::FixConversionOverflow(di, v, _mm512_cvttpd_epi64(v.raw)); +} + +HWY_API Vec512<int32_t> NearestInt(const Vec512<float> v) { + const RebindToSigned<DFromV<decltype(v)>> di; + return detail::FixConversionOverflow(di, v, _mm512_cvtps_epi32(v.raw)); +} + +// ================================================== CRYPTO + +#if !defined(HWY_DISABLE_PCLMUL_AES) + +HWY_API Vec512<uint8_t> AESRound(Vec512<uint8_t> state, + Vec512<uint8_t> round_key) { +#if HWY_TARGET <= HWY_AVX3_DL + return Vec512<uint8_t>{_mm512_aesenc_epi128(state.raw, round_key.raw)}; +#else + const DFromV<decltype(state)> d; + const Half<decltype(d)> d2; + return Combine(d, AESRound(UpperHalf(d2, state), UpperHalf(d2, round_key)), + AESRound(LowerHalf(state), LowerHalf(round_key))); +#endif +} + +HWY_API Vec512<uint8_t> AESLastRound(Vec512<uint8_t> state, + Vec512<uint8_t> round_key) { +#if HWY_TARGET <= HWY_AVX3_DL + return Vec512<uint8_t>{_mm512_aesenclast_epi128(state.raw, round_key.raw)}; +#else + const DFromV<decltype(state)> d; + const Half<decltype(d)> d2; + return Combine(d, + AESLastRound(UpperHalf(d2, state), UpperHalf(d2, round_key)), + AESLastRound(LowerHalf(state), LowerHalf(round_key))); +#endif +} + +HWY_API Vec512<uint8_t> AESRoundInv(Vec512<uint8_t> state, + Vec512<uint8_t> round_key) { +#if HWY_TARGET <= HWY_AVX3_DL + return Vec512<uint8_t>{_mm512_aesdec_epi128(state.raw, round_key.raw)}; +#else + const Full512<uint8_t> d; + const Half<decltype(d)> d2; + return Combine(d, AESRoundInv(UpperHalf(d2, state), UpperHalf(d2, round_key)), + AESRoundInv(LowerHalf(state), LowerHalf(round_key))); +#endif +} + +HWY_API Vec512<uint8_t> AESLastRoundInv(Vec512<uint8_t> state, + Vec512<uint8_t> round_key) { +#if 
HWY_TARGET <= HWY_AVX3_DL + return Vec512<uint8_t>{_mm512_aesdeclast_epi128(state.raw, round_key.raw)}; +#else + const Full512<uint8_t> d; + const Half<decltype(d)> d2; + return Combine( + d, AESLastRoundInv(UpperHalf(d2, state), UpperHalf(d2, round_key)), + AESLastRoundInv(LowerHalf(state), LowerHalf(round_key))); +#endif +} + +template <uint8_t kRcon> +HWY_API Vec512<uint8_t> AESKeyGenAssist(Vec512<uint8_t> v) { + const Full512<uint8_t> d; +#if HWY_TARGET <= HWY_AVX3_DL + alignas(16) static constexpr uint8_t kRconXorMask[16] = { + 0, kRcon, 0, 0, 0, 0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0, 0}; + alignas(16) static constexpr uint8_t kRotWordShuffle[16] = { + 0, 13, 10, 7, 1, 14, 11, 4, 8, 5, 2, 15, 9, 6, 3, 12}; + const Repartition<uint32_t, decltype(d)> du32; + const auto w13 = BitCast(d, DupOdd(BitCast(du32, v))); + const auto sub_word_result = AESLastRound(w13, Load(d, kRconXorMask)); + return TableLookupBytes(sub_word_result, Load(d, kRotWordShuffle)); +#else + const Half<decltype(d)> d2; + return Combine(d, AESKeyGenAssist<kRcon>(UpperHalf(d2, v)), + AESKeyGenAssist<kRcon>(LowerHalf(v))); +#endif +} + +HWY_API Vec512<uint64_t> CLMulLower(Vec512<uint64_t> va, Vec512<uint64_t> vb) { +#if HWY_TARGET <= HWY_AVX3_DL + return Vec512<uint64_t>{_mm512_clmulepi64_epi128(va.raw, vb.raw, 0x00)}; +#else + alignas(64) uint64_t a[8]; + alignas(64) uint64_t b[8]; + const DFromV<decltype(va)> d; + const Half<Half<decltype(d)>> d128; + Store(va, d, a); + Store(vb, d, b); + for (size_t i = 0; i < 8; i += 2) { + const auto mul = CLMulLower(Load(d128, a + i), Load(d128, b + i)); + Store(mul, d128, a + i); + } + return Load(d, a); +#endif +} + +HWY_API Vec512<uint64_t> CLMulUpper(Vec512<uint64_t> va, Vec512<uint64_t> vb) { +#if HWY_TARGET <= HWY_AVX3_DL + return Vec512<uint64_t>{_mm512_clmulepi64_epi128(va.raw, vb.raw, 0x11)}; +#else + alignas(64) uint64_t a[8]; + alignas(64) uint64_t b[8]; + const DFromV<decltype(va)> d; + const Half<Half<decltype(d)>> d128; + Store(va, d, a); + 
Store(vb, d, b); + for (size_t i = 0; i < 8; i += 2) { + const auto mul = CLMulUpper(Load(d128, a + i), Load(d128, b + i)); + Store(mul, d128, a + i); + } + return Load(d, a); +#endif +} + +#endif // HWY_DISABLE_PCLMUL_AES + +// ================================================== MISC + +// ------------------------------ I32/I64 SaturatedAdd (MaskFromVec) + +HWY_API Vec512<int32_t> SaturatedAdd(Vec512<int32_t> a, Vec512<int32_t> b) { + const DFromV<decltype(a)> d; + const auto sum = a + b; + const auto overflow_mask = MaskFromVec( + Vec512<int32_t>{_mm512_ternarylogic_epi32(a.raw, b.raw, sum.raw, 0x42)}); + const auto i32_max = Set(d, LimitsMax<int32_t>()); + const Vec512<int32_t> overflow_result{_mm512_mask_ternarylogic_epi32( + i32_max.raw, MaskFromVec(a).raw, i32_max.raw, i32_max.raw, 0x55)}; + return IfThenElse(overflow_mask, overflow_result, sum); +} + +HWY_API Vec512<int64_t> SaturatedAdd(Vec512<int64_t> a, Vec512<int64_t> b) { + const DFromV<decltype(a)> d; + const auto sum = a + b; + const auto overflow_mask = MaskFromVec( + Vec512<int64_t>{_mm512_ternarylogic_epi64(a.raw, b.raw, sum.raw, 0x42)}); + const auto i64_max = Set(d, LimitsMax<int64_t>()); + const Vec512<int64_t> overflow_result{_mm512_mask_ternarylogic_epi64( + i64_max.raw, MaskFromVec(a).raw, i64_max.raw, i64_max.raw, 0x55)}; + return IfThenElse(overflow_mask, overflow_result, sum); +} + +// ------------------------------ I32/I64 SaturatedSub (MaskFromVec) + +HWY_API Vec512<int32_t> SaturatedSub(Vec512<int32_t> a, Vec512<int32_t> b) { + const DFromV<decltype(a)> d; + const auto diff = a - b; + const auto overflow_mask = MaskFromVec( + Vec512<int32_t>{_mm512_ternarylogic_epi32(a.raw, b.raw, diff.raw, 0x18)}); + const auto i32_max = Set(d, LimitsMax<int32_t>()); + const Vec512<int32_t> overflow_result{_mm512_mask_ternarylogic_epi32( + i32_max.raw, MaskFromVec(a).raw, i32_max.raw, i32_max.raw, 0x55)}; + return IfThenElse(overflow_mask, overflow_result, diff); +} + +HWY_API Vec512<int64_t> 
SaturatedSub(Vec512<int64_t> a, Vec512<int64_t> b) { + const DFromV<decltype(a)> d; + const auto diff = a - b; + const auto overflow_mask = MaskFromVec( + Vec512<int64_t>{_mm512_ternarylogic_epi64(a.raw, b.raw, diff.raw, 0x18)}); + const auto i64_max = Set(d, LimitsMax<int64_t>()); + const Vec512<int64_t> overflow_result{_mm512_mask_ternarylogic_epi64( + i64_max.raw, MaskFromVec(a).raw, i64_max.raw, i64_max.raw, 0x55)}; + return IfThenElse(overflow_mask, overflow_result, diff); +} + +// ------------------------------ Mask testing + +// Beware: the suffix indicates the number of mask bits, not lane size! + +namespace detail { + +template <typename T> +HWY_INLINE bool AllFalse(hwy::SizeTag<1> /*tag*/, const Mask512<T> mask) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return _kortestz_mask64_u8(mask.raw, mask.raw); +#else + return mask.raw == 0; +#endif +} +template <typename T> +HWY_INLINE bool AllFalse(hwy::SizeTag<2> /*tag*/, const Mask512<T> mask) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return _kortestz_mask32_u8(mask.raw, mask.raw); +#else + return mask.raw == 0; +#endif +} +template <typename T> +HWY_INLINE bool AllFalse(hwy::SizeTag<4> /*tag*/, const Mask512<T> mask) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return _kortestz_mask16_u8(mask.raw, mask.raw); +#else + return mask.raw == 0; +#endif +} +template <typename T> +HWY_INLINE bool AllFalse(hwy::SizeTag<8> /*tag*/, const Mask512<T> mask) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return _kortestz_mask8_u8(mask.raw, mask.raw); +#else + return mask.raw == 0; +#endif +} + +} // namespace detail + +template <class D, typename T = TFromD<D>> +HWY_API bool AllFalse(D /* tag */, const Mask512<T> mask) { + return detail::AllFalse(hwy::SizeTag<sizeof(T)>(), mask); +} + +namespace detail { + +template <typename T> +HWY_INLINE bool AllTrue(hwy::SizeTag<1> /*tag*/, const Mask512<T> mask) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return _kortestc_mask64_u8(mask.raw, mask.raw); +#else + return mask.raw == 
0xFFFFFFFFFFFFFFFFull; +#endif +} +template <typename T> +HWY_INLINE bool AllTrue(hwy::SizeTag<2> /*tag*/, const Mask512<T> mask) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return _kortestc_mask32_u8(mask.raw, mask.raw); +#else + return mask.raw == 0xFFFFFFFFull; +#endif +} +template <typename T> +HWY_INLINE bool AllTrue(hwy::SizeTag<4> /*tag*/, const Mask512<T> mask) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return _kortestc_mask16_u8(mask.raw, mask.raw); +#else + return mask.raw == 0xFFFFull; +#endif +} +template <typename T> +HWY_INLINE bool AllTrue(hwy::SizeTag<8> /*tag*/, const Mask512<T> mask) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return _kortestc_mask8_u8(mask.raw, mask.raw); +#else + return mask.raw == 0xFFull; +#endif +} + +} // namespace detail + +template <class D, typename T = TFromD<D>> +HWY_API bool AllTrue(D /* tag */, const Mask512<T> mask) { + return detail::AllTrue(hwy::SizeTag<sizeof(T)>(), mask); +} + +// `p` points to at least 8 readable bytes, not all of which need be valid. +template <class D, HWY_IF_V_SIZE_D(D, 64), typename T = TFromD<D>> +HWY_API Mask512<T> LoadMaskBits(D /* tag */, const uint8_t* HWY_RESTRICT bits) { + Mask512<T> mask; + CopyBytes<8 / sizeof(T)>(bits, &mask.raw); + // N >= 8 (= 512 / 64), so no need to mask invalid bits. + return mask; +} + +// `p` points to at least 8 writable bytes. +template <class D, typename T = TFromD<D>> +HWY_API size_t StoreMaskBits(D /* tag */, Mask512<T> mask, uint8_t* bits) { + const size_t kNumBytes = 8 / sizeof(T); + CopyBytes<kNumBytes>(&mask.raw, bits); + // N >= 8 (= 512 / 64), so no need to mask invalid bits. 
+ return kNumBytes; +} + +template <class D, typename T = TFromD<D>> +HWY_API size_t CountTrue(D /* tag */, const Mask512<T> mask) { + return PopCount(static_cast<uint64_t>(mask.raw)); +} + +template <class D, typename T = TFromD<D>, HWY_IF_NOT_T_SIZE(T, 1)> +HWY_API size_t FindKnownFirstTrue(D /* tag */, Mask512<T> mask) { + return Num0BitsBelowLS1Bit_Nonzero32(mask.raw); +} + +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)> +HWY_API size_t FindKnownFirstTrue(D /* tag */, Mask512<T> mask) { + return Num0BitsBelowLS1Bit_Nonzero64(mask.raw); +} + +template <class D, typename T = TFromD<D>> +HWY_API intptr_t FindFirstTrue(D d, Mask512<T> mask) { + return mask.raw ? static_cast<intptr_t>(FindKnownFirstTrue(d, mask)) + : intptr_t{-1}; +} + +template <class D, typename T = TFromD<D>, HWY_IF_NOT_T_SIZE(T, 1)> +HWY_API size_t FindKnownLastTrue(D /* tag */, Mask512<T> mask) { + return 31 - Num0BitsAboveMS1Bit_Nonzero32(mask.raw); +} + +template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)> +HWY_API size_t FindKnownLastTrue(D /* tag */, Mask512<T> mask) { + return 63 - Num0BitsAboveMS1Bit_Nonzero64(mask.raw); +} + +template <class D, typename T = TFromD<D>> +HWY_API intptr_t FindLastTrue(D d, Mask512<T> mask) { + return mask.raw ? static_cast<intptr_t>(FindKnownLastTrue(d, mask)) + : intptr_t{-1}; +} + +// ------------------------------ Compress + +// Always implement 8-bit here even if we lack VBMI2 because we can do better +// than generic_ops (8 at a time) via the native 32-bit compress (16 at a time). 
+#ifdef HWY_NATIVE_COMPRESS8 +#undef HWY_NATIVE_COMPRESS8 +#else +#define HWY_NATIVE_COMPRESS8 +#endif + +namespace detail { + +#if HWY_TARGET <= HWY_AVX3_DL // VBMI2 +template <size_t N> +HWY_INLINE Vec128<uint8_t, N> NativeCompress(const Vec128<uint8_t, N> v, + const Mask128<uint8_t, N> mask) { + return Vec128<uint8_t, N>{_mm_maskz_compress_epi8(mask.raw, v.raw)}; +} +HWY_INLINE Vec256<uint8_t> NativeCompress(const Vec256<uint8_t> v, + const Mask256<uint8_t> mask) { + return Vec256<uint8_t>{_mm256_maskz_compress_epi8(mask.raw, v.raw)}; +} +HWY_INLINE Vec512<uint8_t> NativeCompress(const Vec512<uint8_t> v, + const Mask512<uint8_t> mask) { + return Vec512<uint8_t>{_mm512_maskz_compress_epi8(mask.raw, v.raw)}; +} + +template <size_t N> +HWY_INLINE Vec128<uint16_t, N> NativeCompress(const Vec128<uint16_t, N> v, + const Mask128<uint16_t, N> mask) { + return Vec128<uint16_t, N>{_mm_maskz_compress_epi16(mask.raw, v.raw)}; +} +HWY_INLINE Vec256<uint16_t> NativeCompress(const Vec256<uint16_t> v, + const Mask256<uint16_t> mask) { + return Vec256<uint16_t>{_mm256_maskz_compress_epi16(mask.raw, v.raw)}; +} +HWY_INLINE Vec512<uint16_t> NativeCompress(const Vec512<uint16_t> v, + const Mask512<uint16_t> mask) { + return Vec512<uint16_t>{_mm512_maskz_compress_epi16(mask.raw, v.raw)}; +} + +// Slow on Zen4, do not even define these to prevent accidental usage. 
+#if HWY_TARGET != HWY_AVX3_ZEN4 + +template <size_t N> +HWY_INLINE void NativeCompressStore(Vec128<uint8_t, N> v, + Mask128<uint8_t, N> mask, + uint8_t* HWY_RESTRICT unaligned) { + _mm_mask_compressstoreu_epi8(unaligned, mask.raw, v.raw); +} +HWY_INLINE void NativeCompressStore(Vec256<uint8_t> v, Mask256<uint8_t> mask, + uint8_t* HWY_RESTRICT unaligned) { + _mm256_mask_compressstoreu_epi8(unaligned, mask.raw, v.raw); +} +HWY_INLINE void NativeCompressStore(Vec512<uint8_t> v, Mask512<uint8_t> mask, + uint8_t* HWY_RESTRICT unaligned) { + _mm512_mask_compressstoreu_epi8(unaligned, mask.raw, v.raw); +} + +template <size_t N> +HWY_INLINE void NativeCompressStore(Vec128<uint16_t, N> v, + Mask128<uint16_t, N> mask, + uint16_t* HWY_RESTRICT unaligned) { + _mm_mask_compressstoreu_epi16(unaligned, mask.raw, v.raw); +} +HWY_INLINE void NativeCompressStore(Vec256<uint16_t> v, Mask256<uint16_t> mask, + uint16_t* HWY_RESTRICT unaligned) { + _mm256_mask_compressstoreu_epi16(unaligned, mask.raw, v.raw); +} +HWY_INLINE void NativeCompressStore(Vec512<uint16_t> v, Mask512<uint16_t> mask, + uint16_t* HWY_RESTRICT unaligned) { + _mm512_mask_compressstoreu_epi16(unaligned, mask.raw, v.raw); +} + +#endif // HWY_TARGET != HWY_AVX3_ZEN4 + +HWY_INLINE Vec512<uint8_t> NativeExpand(Vec512<uint8_t> v, + Mask512<uint8_t> mask) { + return Vec512<uint8_t>{_mm512_maskz_expand_epi8(mask.raw, v.raw)}; +} + +HWY_INLINE Vec512<uint16_t> NativeExpand(Vec512<uint16_t> v, + Mask512<uint16_t> mask) { + return Vec512<uint16_t>{_mm512_maskz_expand_epi16(mask.raw, v.raw)}; +} + +template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U8_D(D)> +HWY_INLINE Vec512<uint8_t> NativeLoadExpand( + Mask512<uint8_t> mask, D /* d */, const uint8_t* HWY_RESTRICT unaligned) { + return Vec512<uint8_t>{_mm512_maskz_expandloadu_epi8(mask.raw, unaligned)}; +} + +template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U16_D(D)> +HWY_INLINE Vec512<uint16_t> NativeLoadExpand( + Mask512<uint16_t> mask, D /* d */, const uint16_t* 
HWY_RESTRICT unaligned) { + return Vec512<uint16_t>{_mm512_maskz_expandloadu_epi16(mask.raw, unaligned)}; +} + +#endif // HWY_TARGET <= HWY_AVX3_DL + +template <size_t N> +HWY_INLINE Vec128<uint32_t, N> NativeCompress(Vec128<uint32_t, N> v, + Mask128<uint32_t, N> mask) { + return Vec128<uint32_t, N>{_mm_maskz_compress_epi32(mask.raw, v.raw)}; +} +HWY_INLINE Vec256<uint32_t> NativeCompress(Vec256<uint32_t> v, + Mask256<uint32_t> mask) { + return Vec256<uint32_t>{_mm256_maskz_compress_epi32(mask.raw, v.raw)}; +} +HWY_INLINE Vec512<uint32_t> NativeCompress(Vec512<uint32_t> v, + Mask512<uint32_t> mask) { + return Vec512<uint32_t>{_mm512_maskz_compress_epi32(mask.raw, v.raw)}; +} +// We use table-based compress for 64-bit lanes, see CompressIsPartition. + +// Slow on Zen4, do not even define these to prevent accidental usage. +#if HWY_TARGET != HWY_AVX3_ZEN4 + +template <size_t N> +HWY_INLINE void NativeCompressStore(Vec128<uint32_t, N> v, + Mask128<uint32_t, N> mask, + uint32_t* HWY_RESTRICT unaligned) { + _mm_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw); +} +HWY_INLINE void NativeCompressStore(Vec256<uint32_t> v, Mask256<uint32_t> mask, + uint32_t* HWY_RESTRICT unaligned) { + _mm256_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw); +} +HWY_INLINE void NativeCompressStore(Vec512<uint32_t> v, Mask512<uint32_t> mask, + uint32_t* HWY_RESTRICT unaligned) { + _mm512_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw); +} + +template <size_t N> +HWY_INLINE void NativeCompressStore(Vec128<uint64_t, N> v, + Mask128<uint64_t, N> mask, + uint64_t* HWY_RESTRICT unaligned) { + _mm_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw); +} +HWY_INLINE void NativeCompressStore(Vec256<uint64_t> v, Mask256<uint64_t> mask, + uint64_t* HWY_RESTRICT unaligned) { + _mm256_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw); +} +HWY_INLINE void NativeCompressStore(Vec512<uint64_t> v, Mask512<uint64_t> mask, + uint64_t* HWY_RESTRICT unaligned) { + 
_mm512_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw); +} + +template <size_t N> +HWY_INLINE void NativeCompressStore(Vec128<float, N> v, Mask128<float, N> mask, + float* HWY_RESTRICT unaligned) { + _mm_mask_compressstoreu_ps(unaligned, mask.raw, v.raw); +} +HWY_INLINE void NativeCompressStore(Vec256<float> v, Mask256<float> mask, + float* HWY_RESTRICT unaligned) { + _mm256_mask_compressstoreu_ps(unaligned, mask.raw, v.raw); +} +HWY_INLINE void NativeCompressStore(Vec512<float> v, Mask512<float> mask, + float* HWY_RESTRICT unaligned) { + _mm512_mask_compressstoreu_ps(unaligned, mask.raw, v.raw); +} + +template <size_t N> +HWY_INLINE void NativeCompressStore(Vec128<double, N> v, + Mask128<double, N> mask, + double* HWY_RESTRICT unaligned) { + _mm_mask_compressstoreu_pd(unaligned, mask.raw, v.raw); +} +HWY_INLINE void NativeCompressStore(Vec256<double> v, Mask256<double> mask, + double* HWY_RESTRICT unaligned) { + _mm256_mask_compressstoreu_pd(unaligned, mask.raw, v.raw); +} +HWY_INLINE void NativeCompressStore(Vec512<double> v, Mask512<double> mask, + double* HWY_RESTRICT unaligned) { + _mm512_mask_compressstoreu_pd(unaligned, mask.raw, v.raw); +} + +#endif // HWY_TARGET != HWY_AVX3_ZEN4 + +HWY_INLINE Vec512<uint32_t> NativeExpand(Vec512<uint32_t> v, + Mask512<uint32_t> mask) { + return Vec512<uint32_t>{_mm512_maskz_expand_epi32(mask.raw, v.raw)}; +} + +HWY_INLINE Vec512<uint64_t> NativeExpand(Vec512<uint64_t> v, + Mask512<uint64_t> mask) { + return Vec512<uint64_t>{_mm512_maskz_expand_epi64(mask.raw, v.raw)}; +} + +template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U32_D(D)> +HWY_INLINE Vec512<uint32_t> NativeLoadExpand( + Mask512<uint32_t> mask, D /* d */, const uint32_t* HWY_RESTRICT unaligned) { + return Vec512<uint32_t>{_mm512_maskz_expandloadu_epi32(mask.raw, unaligned)}; +} + +template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U64_D(D)> +HWY_INLINE Vec512<uint64_t> NativeLoadExpand( + Mask512<uint64_t> mask, D /* d */, const uint64_t* HWY_RESTRICT 
unaligned) { + return Vec512<uint64_t>{_mm512_maskz_expandloadu_epi64(mask.raw, unaligned)}; +} + +// For u8x16 and <= u16x16 we can avoid store+load for Compress because there is +// only a single compressed vector (u32x16). Other EmuCompress are implemented +// after the EmuCompressStore they build upon. +template <size_t N> +HWY_INLINE Vec128<uint8_t, N> EmuCompress(Vec128<uint8_t, N> v, + Mask128<uint8_t, N> mask) { + const DFromV<decltype(v)> d; + const Rebind<uint32_t, decltype(d)> d32; + const VFromD<decltype(d32)> v0 = PromoteTo(d32, v); + + const uint64_t mask_bits{mask.raw}; + // Mask type is __mmask16 if v is full 128, else __mmask8. + using M32 = MFromD<decltype(d32)>; + const M32 m0{static_cast<typename M32::Raw>(mask_bits)}; + return TruncateTo(d, Compress(v0, m0)); +} + +template <size_t N> +HWY_INLINE Vec128<uint16_t, N> EmuCompress(Vec128<uint16_t, N> v, + Mask128<uint16_t, N> mask) { + const DFromV<decltype(v)> d; + const Rebind<int32_t, decltype(d)> di32; + const RebindToUnsigned<decltype(di32)> du32; + const MFromD<decltype(du32)> mask32{static_cast<__mmask8>(mask.raw)}; + // DemoteTo is 2 ops, but likely lower latency than TruncateTo on SKX. + // Only i32 -> u16 is supported, whereas NativeCompress expects u32. + const VFromD<decltype(du32)> v32 = BitCast(du32, PromoteTo(di32, v)); + return DemoteTo(d, BitCast(di32, NativeCompress(v32, mask32))); +} + +HWY_INLINE Vec256<uint16_t> EmuCompress(Vec256<uint16_t> v, + Mask256<uint16_t> mask) { + const DFromV<decltype(v)> d; + const Rebind<int32_t, decltype(d)> di32; + const RebindToUnsigned<decltype(di32)> du32; + const Mask512<uint32_t> mask32{static_cast<__mmask16>(mask.raw)}; + const Vec512<uint32_t> v32 = BitCast(du32, PromoteTo(di32, v)); + return DemoteTo(d, BitCast(di32, NativeCompress(v32, mask32))); +} + +// See above - small-vector EmuCompressStore are implemented via EmuCompress. 
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> +HWY_INLINE void EmuCompressStore(VFromD<D> v, MFromD<D> mask, D d, + TFromD<D>* HWY_RESTRICT unaligned) { + StoreU(EmuCompress(v, mask), d, unaligned); +} + +template <class D, HWY_IF_U16_D(D), HWY_IF_V_SIZE_D(D, 32)> +HWY_INLINE void EmuCompressStore(VFromD<D> v, MFromD<D> mask, D d, + TFromD<D>* HWY_RESTRICT unaligned) { + StoreU(EmuCompress(v, mask), d, unaligned); +} + +// Main emulation logic for wider vector, starting with EmuCompressStore because +// it is most convenient to merge pieces using memory (concatenating vectors at +// byte offsets is difficult). +template <class D> +HWY_INLINE void EmuCompressStore(Vec256<uint8_t> v, Mask256<uint8_t> mask, D d, + uint8_t* HWY_RESTRICT unaligned) { + const uint64_t mask_bits{mask.raw}; + const Half<decltype(d)> dh; + const Rebind<uint32_t, decltype(dh)> d32; + const Vec512<uint32_t> v0 = PromoteTo(d32, LowerHalf(v)); + const Vec512<uint32_t> v1 = PromoteTo(d32, UpperHalf(dh, v)); + const Mask512<uint32_t> m0{static_cast<__mmask16>(mask_bits & 0xFFFFu)}; + const Mask512<uint32_t> m1{static_cast<__mmask16>(mask_bits >> 16)}; + const Vec128<uint8_t> c0 = TruncateTo(dh, NativeCompress(v0, m0)); + const Vec128<uint8_t> c1 = TruncateTo(dh, NativeCompress(v1, m1)); + uint8_t* HWY_RESTRICT pos = unaligned; + StoreU(c0, dh, pos); + StoreU(c1, dh, pos + CountTrue(d32, m0)); +} + +template <class D> +HWY_INLINE void EmuCompressStore(Vec512<uint8_t> v, Mask512<uint8_t> mask, D d, + uint8_t* HWY_RESTRICT unaligned) { + const uint64_t mask_bits{mask.raw}; + const Half<Half<decltype(d)>> dq; + const Rebind<uint32_t, decltype(dq)> d32; + alignas(64) uint8_t lanes[64]; + Store(v, d, lanes); + const Vec512<uint32_t> v0 = PromoteTo(d32, LowerHalf(LowerHalf(v))); + const Vec512<uint32_t> v1 = PromoteTo(d32, Load(dq, lanes + 16)); + const Vec512<uint32_t> v2 = PromoteTo(d32, Load(dq, lanes + 32)); + const Vec512<uint32_t> v3 = PromoteTo(d32, Load(dq, lanes + 48)); + const 
Mask512<uint32_t> m0{static_cast<__mmask16>(mask_bits & 0xFFFFu)}; + const Mask512<uint32_t> m1{ + static_cast<uint16_t>((mask_bits >> 16) & 0xFFFFu)}; + const Mask512<uint32_t> m2{ + static_cast<uint16_t>((mask_bits >> 32) & 0xFFFFu)}; + const Mask512<uint32_t> m3{static_cast<__mmask16>(mask_bits >> 48)}; + const Vec128<uint8_t> c0 = TruncateTo(dq, NativeCompress(v0, m0)); + const Vec128<uint8_t> c1 = TruncateTo(dq, NativeCompress(v1, m1)); + const Vec128<uint8_t> c2 = TruncateTo(dq, NativeCompress(v2, m2)); + const Vec128<uint8_t> c3 = TruncateTo(dq, NativeCompress(v3, m3)); + uint8_t* HWY_RESTRICT pos = unaligned; + StoreU(c0, dq, pos); + pos += CountTrue(d32, m0); + StoreU(c1, dq, pos); + pos += CountTrue(d32, m1); + StoreU(c2, dq, pos); + pos += CountTrue(d32, m2); + StoreU(c3, dq, pos); +} + +template <class D> +HWY_INLINE void EmuCompressStore(Vec512<uint16_t> v, Mask512<uint16_t> mask, + D d, uint16_t* HWY_RESTRICT unaligned) { + const Repartition<int32_t, decltype(d)> di32; + const RebindToUnsigned<decltype(di32)> du32; + const Half<decltype(d)> dh; + const Vec512<uint32_t> promoted0 = + BitCast(du32, PromoteTo(di32, LowerHalf(dh, v))); + const Vec512<uint32_t> promoted1 = + BitCast(du32, PromoteTo(di32, UpperHalf(dh, v))); + + const uint64_t mask_bits{mask.raw}; + const uint64_t maskL = mask_bits & 0xFFFF; + const uint64_t maskH = mask_bits >> 16; + const Mask512<uint32_t> mask0{static_cast<__mmask16>(maskL)}; + const Mask512<uint32_t> mask1{static_cast<__mmask16>(maskH)}; + const Vec512<uint32_t> compressed0 = NativeCompress(promoted0, mask0); + const Vec512<uint32_t> compressed1 = NativeCompress(promoted1, mask1); + + const Vec256<uint16_t> demoted0 = DemoteTo(dh, BitCast(di32, compressed0)); + const Vec256<uint16_t> demoted1 = DemoteTo(dh, BitCast(di32, compressed1)); + + // Store 256-bit halves + StoreU(demoted0, dh, unaligned); + StoreU(demoted1, dh, unaligned + PopCount(maskL)); +} + +// Finally, the remaining EmuCompress for wide vectors, using 
EmuCompressStore. +template <typename T> // 1 or 2 bytes +HWY_INLINE Vec512<T> EmuCompress(Vec512<T> v, Mask512<T> mask) { + const DFromV<decltype(v)> d; + alignas(64) T buf[2 * 64 / sizeof(T)]; + EmuCompressStore(v, mask, d, buf); + return Load(d, buf); +} + +HWY_INLINE Vec256<uint8_t> EmuCompress(Vec256<uint8_t> v, + const Mask256<uint8_t> mask) { + const DFromV<decltype(v)> d; + alignas(32) uint8_t buf[2 * 32 / sizeof(uint8_t)]; + EmuCompressStore(v, mask, d, buf); + return Load(d, buf); +} + +} // namespace detail + +template <class V, class M, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))> +HWY_API V Compress(V v, const M mask) { + const DFromV<decltype(v)> d; + const RebindToUnsigned<decltype(d)> du; + const auto mu = RebindMask(du, mask); +#if HWY_TARGET <= HWY_AVX3_DL // VBMI2 + return BitCast(d, detail::NativeCompress(BitCast(du, v), mu)); +#else + return BitCast(d, detail::EmuCompress(BitCast(du, v), mu)); +#endif +} + +template <class V, class M, HWY_IF_T_SIZE_V(V, 4)> +HWY_API V Compress(V v, const M mask) { + const DFromV<decltype(v)> d; + const RebindToUnsigned<decltype(d)> du; + const auto mu = RebindMask(du, mask); + return BitCast(d, detail::NativeCompress(BitCast(du, v), mu)); +} + +template <typename T, HWY_IF_T_SIZE(T, 8)> +HWY_API Vec512<T> Compress(Vec512<T> v, Mask512<T> mask) { + // See CompressIsPartition. u64 is faster than u32. + alignas(16) static constexpr uint64_t packed_array[256] = { + // From PrintCompress32x8Tables, without the FirstN extension (there is + // no benefit to including them because 64-bit CompressStore is anyway + // masked, but also no harm because TableLookupLanes ignores the MSB). 
+ 0x76543210, 0x76543210, 0x76543201, 0x76543210, 0x76543102, 0x76543120, + 0x76543021, 0x76543210, 0x76542103, 0x76542130, 0x76542031, 0x76542310, + 0x76541032, 0x76541320, 0x76540321, 0x76543210, 0x76532104, 0x76532140, + 0x76532041, 0x76532410, 0x76531042, 0x76531420, 0x76530421, 0x76534210, + 0x76521043, 0x76521430, 0x76520431, 0x76524310, 0x76510432, 0x76514320, + 0x76504321, 0x76543210, 0x76432105, 0x76432150, 0x76432051, 0x76432510, + 0x76431052, 0x76431520, 0x76430521, 0x76435210, 0x76421053, 0x76421530, + 0x76420531, 0x76425310, 0x76410532, 0x76415320, 0x76405321, 0x76453210, + 0x76321054, 0x76321540, 0x76320541, 0x76325410, 0x76310542, 0x76315420, + 0x76305421, 0x76354210, 0x76210543, 0x76215430, 0x76205431, 0x76254310, + 0x76105432, 0x76154320, 0x76054321, 0x76543210, 0x75432106, 0x75432160, + 0x75432061, 0x75432610, 0x75431062, 0x75431620, 0x75430621, 0x75436210, + 0x75421063, 0x75421630, 0x75420631, 0x75426310, 0x75410632, 0x75416320, + 0x75406321, 0x75463210, 0x75321064, 0x75321640, 0x75320641, 0x75326410, + 0x75310642, 0x75316420, 0x75306421, 0x75364210, 0x75210643, 0x75216430, + 0x75206431, 0x75264310, 0x75106432, 0x75164320, 0x75064321, 0x75643210, + 0x74321065, 0x74321650, 0x74320651, 0x74326510, 0x74310652, 0x74316520, + 0x74306521, 0x74365210, 0x74210653, 0x74216530, 0x74206531, 0x74265310, + 0x74106532, 0x74165320, 0x74065321, 0x74653210, 0x73210654, 0x73216540, + 0x73206541, 0x73265410, 0x73106542, 0x73165420, 0x73065421, 0x73654210, + 0x72106543, 0x72165430, 0x72065431, 0x72654310, 0x71065432, 0x71654320, + 0x70654321, 0x76543210, 0x65432107, 0x65432170, 0x65432071, 0x65432710, + 0x65431072, 0x65431720, 0x65430721, 0x65437210, 0x65421073, 0x65421730, + 0x65420731, 0x65427310, 0x65410732, 0x65417320, 0x65407321, 0x65473210, + 0x65321074, 0x65321740, 0x65320741, 0x65327410, 0x65310742, 0x65317420, + 0x65307421, 0x65374210, 0x65210743, 0x65217430, 0x65207431, 0x65274310, + 0x65107432, 0x65174320, 0x65074321, 0x65743210, 0x64321075, 0x64321750, + 
0x64320751, 0x64327510, 0x64310752, 0x64317520, 0x64307521, 0x64375210, + 0x64210753, 0x64217530, 0x64207531, 0x64275310, 0x64107532, 0x64175320, + 0x64075321, 0x64753210, 0x63210754, 0x63217540, 0x63207541, 0x63275410, + 0x63107542, 0x63175420, 0x63075421, 0x63754210, 0x62107543, 0x62175430, + 0x62075431, 0x62754310, 0x61075432, 0x61754320, 0x60754321, 0x67543210, + 0x54321076, 0x54321760, 0x54320761, 0x54327610, 0x54310762, 0x54317620, + 0x54307621, 0x54376210, 0x54210763, 0x54217630, 0x54207631, 0x54276310, + 0x54107632, 0x54176320, 0x54076321, 0x54763210, 0x53210764, 0x53217640, + 0x53207641, 0x53276410, 0x53107642, 0x53176420, 0x53076421, 0x53764210, + 0x52107643, 0x52176430, 0x52076431, 0x52764310, 0x51076432, 0x51764320, + 0x50764321, 0x57643210, 0x43210765, 0x43217650, 0x43207651, 0x43276510, + 0x43107652, 0x43176520, 0x43076521, 0x43765210, 0x42107653, 0x42176530, + 0x42076531, 0x42765310, 0x41076532, 0x41765320, 0x40765321, 0x47653210, + 0x32107654, 0x32176540, 0x32076541, 0x32765410, 0x31076542, 0x31765420, + 0x30765421, 0x37654210, 0x21076543, 0x21765430, 0x20765431, 0x27654310, + 0x10765432, 0x17654320, 0x07654321, 0x76543210}; + + // For lane i, shift the i-th 4-bit index down to bits [0, 3) - + // _mm512_permutexvar_epi64 will ignore the upper bits. 
+ const DFromV<decltype(v)> d; + const RebindToUnsigned<decltype(d)> du64; + const auto packed = Set(du64, packed_array[mask.raw]); + alignas(64) static constexpr uint64_t shifts[8] = {0, 4, 8, 12, + 16, 20, 24, 28}; + const auto indices = Indices512<T>{(packed >> Load(du64, shifts)).raw}; + return TableLookupLanes(v, indices); +} + +// ------------------------------ Expand + +template <typename T, HWY_IF_T_SIZE(T, 1)> +HWY_API Vec512<T> Expand(Vec512<T> v, const Mask512<T> mask) { + const Full512<T> d; +#if HWY_TARGET <= HWY_AVX3_DL // VBMI2 + const RebindToUnsigned<decltype(d)> du; + const auto mu = RebindMask(du, mask); + return BitCast(d, detail::NativeExpand(BitCast(du, v), mu)); +#else + // LUTs are infeasible for 2^64 possible masks, so splice together two + // half-vector Expand. + const Full256<T> dh; + constexpr size_t N = 64 / sizeof(T); + // We have to shift the input by a variable number of u8. Shuffling requires + // VBMI2, in which case we would already have NativeExpand. We instead + // load at an offset, which may incur a store to load forwarding stall. + alignas(64) T lanes[N]; + Store(v, d, lanes); + using Bits = typename Mask256<T>::Raw; + const Mask256<T> maskL{ + static_cast<Bits>(mask.raw & Bits{(1ULL << (N / 2)) - 1})}; + const Mask256<T> maskH{static_cast<Bits>(mask.raw >> (N / 2))}; + const size_t countL = CountTrue(dh, maskL); + const Vec256<T> expandL = Expand(LowerHalf(v), maskL); + const Vec256<T> expandH = Expand(LoadU(dh, lanes + countL), maskH); + return Combine(d, expandH, expandL); +#endif +} + +template <typename T, HWY_IF_T_SIZE(T, 2)> +HWY_API Vec512<T> Expand(Vec512<T> v, const Mask512<T> mask) { + const Full512<T> d; + const RebindToUnsigned<decltype(d)> du; + const Vec512<uint16_t> vu = BitCast(du, v); +#if HWY_TARGET <= HWY_AVX3_DL // VBMI2 + return BitCast(d, detail::NativeExpand(vu, RebindMask(du, mask))); +#else // AVX3 + // LUTs are infeasible for 2^32 possible masks, so splice together two + // half-vector Expand. 
+ const Full256<T> dh; + constexpr size_t N = 64 / sizeof(T); + using Bits = typename Mask256<T>::Raw; + const Mask256<T> maskL{ + static_cast<Bits>(mask.raw & Bits{(1ULL << (N / 2)) - 1})}; + const Mask256<T> maskH{static_cast<Bits>(mask.raw >> (N / 2))}; + // In AVX3 we can permutevar, which avoids a potential store to load + // forwarding stall vs. reloading the input. + alignas(64) uint16_t iota[64] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}; + const Vec512<uint16_t> indices = LoadU(du, iota + CountTrue(dh, maskL)); + const Vec512<uint16_t> shifted{_mm512_permutexvar_epi16(indices.raw, vu.raw)}; + const Vec256<T> expandL = Expand(LowerHalf(v), maskL); + const Vec256<T> expandH = Expand(LowerHalf(BitCast(d, shifted)), maskH); + return Combine(d, expandH, expandL); +#endif // AVX3 +} + +template <class V, class M, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 4) | (1 << 8))> +HWY_API V Expand(V v, const M mask) { + const DFromV<decltype(v)> d; + const RebindToUnsigned<decltype(d)> du; + const auto mu = RebindMask(du, mask); + return BitCast(d, detail::NativeExpand(BitCast(du, v), mu)); +} + +// For smaller vectors, it is likely more efficient to promote to 32-bit. +// This works for u8x16, u16x8, u16x16 (can be promoted to u32x16), but is +// unnecessary if HWY_AVX3_DL, which provides native instructions. 
#if HWY_TARGET > HWY_AVX3_DL  // no VBMI2

// Expand for 1- or 2-byte lanes in vectors of at most 16 lanes: promotes the
// lanes to u32, expands there using the same raw mask bits, then truncates
// back to the original lane width.
template <class V, class M, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2)),
          HWY_IF_LANES_LE_D(DFromV<V>, 16)>
HWY_API V Expand(V v, M mask) {
  const DFromV<V> d;
  const RebindToUnsigned<decltype(d)> du;
  const Rebind<uint32_t, decltype(d)> du32;
  const VFromD<decltype(du)> vu = BitCast(du, v);
  using M32 = MFromD<decltype(du32)>;
  // Same lane count, so the mask bits carry over; only the raw type changes.
  const M32 m32{static_cast<typename M32::Raw>(mask.raw)};
  return BitCast(d, TruncateTo(du, Expand(PromoteTo(du32, vu), m32)));
}

#endif  // HWY_TARGET > HWY_AVX3_DL

// ------------------------------ LoadExpand

// LoadExpand for 64-byte vectors of 1- or 2-byte lanes.
// With VBMI2 this forwards to detail::NativeLoadExpand on the unsigned type;
// otherwise it performs a full unaligned vector load followed by Expand.
template <class D, HWY_IF_V_SIZE_D(D, 64),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d,
                             const TFromD<D>* HWY_RESTRICT unaligned) {
#if HWY_TARGET <= HWY_AVX3_DL  // VBMI2
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;
  const TU* HWY_RESTRICT pu = reinterpret_cast<const TU*>(unaligned);
  const MFromD<decltype(du)> mu = RebindMask(du, mask);
  return BitCast(d, detail::NativeLoadExpand(mu, du, pu));
#else
  return Expand(LoadU(d, unaligned), mask);
#endif
}

// LoadExpand for 64-byte vectors of 4- or 8-byte lanes: always forwards to
// detail::NativeLoadExpand on the unsigned reinterpretation.
template <class D, HWY_IF_V_SIZE_D(D, 64),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))>
HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d,
                             const TFromD<D>* HWY_RESTRICT unaligned) {
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;
  const TU* HWY_RESTRICT pu = reinterpret_cast<const TU*>(unaligned);
  const MFromD<decltype(du)> mu = RebindMask(du, mask);
  return BitCast(d, detail::NativeLoadExpand(mu, du, pu));
}

// ------------------------------ CompressNot

// CompressNot for all lane sizes except 8 bytes: Compress with the negated
// mask.
template <class V, class M, HWY_IF_NOT_T_SIZE_V(V, 8)>
HWY_API V CompressNot(V v, const M mask) {
  return Compress(v, Not(mask));
}

// CompressNot for 512-bit vectors of 8-byte lanes, implemented as a lane
// permutation whose indices come from a 256-entry table indexed by the raw
// 8-bit mask. Each table entry packs eight 4-bit lane indices.
template <typename T, HWY_IF_T_SIZE(T, 8)>
HWY_API Vec512<T> CompressNot(Vec512<T> v, Mask512<T> mask) {
  // See CompressIsPartition. u64 is faster than u32.
  alignas(16) static constexpr uint64_t packed_array[256] = {
      // From PrintCompressNot32x8Tables, without the FirstN extension (there is
      // no benefit to including them because 64-bit CompressStore is anyway
      // masked, but also no harm because TableLookupLanes ignores the MSB).
      0x76543210, 0x07654321, 0x17654320, 0x10765432, 0x27654310, 0x20765431,
      0x21765430, 0x21076543, 0x37654210, 0x30765421, 0x31765420, 0x31076542,
      0x32765410, 0x32076541, 0x32176540, 0x32107654, 0x47653210, 0x40765321,
      0x41765320, 0x41076532, 0x42765310, 0x42076531, 0x42176530, 0x42107653,
      0x43765210, 0x43076521, 0x43176520, 0x43107652, 0x43276510, 0x43207651,
      0x43217650, 0x43210765, 0x57643210, 0x50764321, 0x51764320, 0x51076432,
      0x52764310, 0x52076431, 0x52176430, 0x52107643, 0x53764210, 0x53076421,
      0x53176420, 0x53107642, 0x53276410, 0x53207641, 0x53217640, 0x53210764,
      0x54763210, 0x54076321, 0x54176320, 0x54107632, 0x54276310, 0x54207631,
      0x54217630, 0x54210763, 0x54376210, 0x54307621, 0x54317620, 0x54310762,
      0x54327610, 0x54320761, 0x54321760, 0x54321076, 0x67543210, 0x60754321,
      0x61754320, 0x61075432, 0x62754310, 0x62075431, 0x62175430, 0x62107543,
      0x63754210, 0x63075421, 0x63175420, 0x63107542, 0x63275410, 0x63207541,
      0x63217540, 0x63210754, 0x64753210, 0x64075321, 0x64175320, 0x64107532,
      0x64275310, 0x64207531, 0x64217530, 0x64210753, 0x64375210, 0x64307521,
      0x64317520, 0x64310752, 0x64327510, 0x64320751, 0x64321750, 0x64321075,
      0x65743210, 0x65074321, 0x65174320, 0x65107432, 0x65274310, 0x65207431,
      0x65217430, 0x65210743, 0x65374210, 0x65307421, 0x65317420, 0x65310742,
      0x65327410, 0x65320741, 0x65321740, 0x65321074, 0x65473210, 0x65407321,
      0x65417320, 0x65410732, 0x65427310, 0x65420731, 0x65421730, 0x65421073,
      0x65437210, 0x65430721, 0x65431720, 0x65431072, 0x65432710, 0x65432071,
      0x65432170, 0x65432107, 0x76543210, 0x70654321, 0x71654320, 0x71065432,
      0x72654310, 0x72065431, 0x72165430, 0x72106543, 0x73654210, 0x73065421,
      0x73165420, 0x73106542, 0x73265410, 0x73206541, 0x73216540, 0x73210654,
      0x74653210, 0x74065321, 0x74165320, 0x74106532, 0x74265310, 0x74206531,
      0x74216530, 0x74210653, 0x74365210, 0x74306521, 0x74316520, 0x74310652,
      0x74326510, 0x74320651, 0x74321650, 0x74321065, 0x75643210, 0x75064321,
      0x75164320, 0x75106432, 0x75264310, 0x75206431, 0x75216430, 0x75210643,
      0x75364210, 0x75306421, 0x75316420, 0x75310642, 0x75326410, 0x75320641,
      0x75321640, 0x75321064, 0x75463210, 0x75406321, 0x75416320, 0x75410632,
      0x75426310, 0x75420631, 0x75421630, 0x75421063, 0x75436210, 0x75430621,
      0x75431620, 0x75431062, 0x75432610, 0x75432061, 0x75432160, 0x75432106,
      0x76543210, 0x76054321, 0x76154320, 0x76105432, 0x76254310, 0x76205431,
      0x76215430, 0x76210543, 0x76354210, 0x76305421, 0x76315420, 0x76310542,
      0x76325410, 0x76320541, 0x76321540, 0x76321054, 0x76453210, 0x76405321,
      0x76415320, 0x76410532, 0x76425310, 0x76420531, 0x76421530, 0x76421053,
      0x76435210, 0x76430521, 0x76431520, 0x76431052, 0x76432510, 0x76432051,
      0x76432150, 0x76432105, 0x76543210, 0x76504321, 0x76514320, 0x76510432,
      0x76524310, 0x76520431, 0x76521430, 0x76521043, 0x76534210, 0x76530421,
      0x76531420, 0x76531042, 0x76532410, 0x76532041, 0x76532140, 0x76532104,
      0x76543210, 0x76540321, 0x76541320, 0x76541032, 0x76542310, 0x76542031,
      0x76542130, 0x76542103, 0x76543210, 0x76543021, 0x76543120, 0x76543102,
      0x76543210, 0x76543201, 0x76543210, 0x76543210};

  // For lane i, shift the i-th 4-bit index down to bits [0, 3) -
  // _mm512_permutexvar_epi64 will ignore the upper bits.
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du64;
  // Broadcast the table entry for this mask to every lane.
  const auto packed = Set(du64, packed_array[mask.raw]);
  alignas(64) static constexpr uint64_t shifts[8] = {0, 4, 8, 12,
                                                     16, 20, 24, 28};
  const auto indices = Indices512<T>{(packed >> Load(du64, shifts)).raw};
  return TableLookupLanes(v, indices);
}

// uint64_t lanes. Only implement for 256 and 512-bit vectors because this is a
// no-op for 128-bit.
// Forwards directly to CompressNot with the same vector and mask.
template <class V, class M, HWY_IF_V_SIZE_GT_D(DFromV<V>, 16)>
HWY_API V CompressBlocksNot(V v, M mask) {
  return CompressNot(v, mask);
}

// ------------------------------ CompressBits
// Compress with a mask loaded from the packed bit array `bits` via
// LoadMaskBits.
template <class V>
HWY_API V CompressBits(V v, const uint8_t* HWY_RESTRICT bits) {
  return Compress(v, LoadMaskBits(DFromV<V>(), bits));
}

// ------------------------------ CompressStore

// CompressStore for 1- or 2-byte lanes. Returns CountTrue(d, mask).
// On HWY_AVX3_ZEN4 this is a Compress followed by a full-vector StoreU; on
// other targets it uses detail::NativeCompressStore when VBMI2 is available
// and detail::EmuCompressStore otherwise.
template <class V, class D, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))>
HWY_API size_t CompressStore(V v, MFromD<D> mask, D d,
                             TFromD<D>* HWY_RESTRICT unaligned) {
#if HWY_TARGET == HWY_AVX3_ZEN4
  StoreU(Compress(v, mask), d, unaligned);
#else
  const RebindToUnsigned<decltype(d)> du;
  const auto mu = RebindMask(du, mask);
  auto pu = reinterpret_cast<TFromD<decltype(du)> * HWY_RESTRICT>(unaligned);

#if HWY_TARGET <= HWY_AVX3_DL  // VBMI2
  detail::NativeCompressStore(BitCast(du, v), mu, pu);
#else
  detail::EmuCompressStore(BitCast(du, v), mu, du, pu);
#endif
#endif  // HWY_TARGET != HWY_AVX3_ZEN4
  const size_t count = CountTrue(d, mask);
  // Passes the destination and number of stored lanes to MaybeUnpoison.
  detail::MaybeUnpoison(unaligned, count);
  return count;
}

// CompressStore for non-float 4- or 8-byte lanes. Returns CountTrue(d, mask).
// On HWY_AVX3_ZEN4 this is a Compress followed by a full-vector StoreU;
// otherwise it forwards to detail::NativeCompressStore on the unsigned type.
template <class V, class D, HWY_IF_NOT_FLOAT_D(D),
          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 4) | (1 << 8))>
HWY_API size_t CompressStore(V v, MFromD<D> mask, D d,
                             TFromD<D>* HWY_RESTRICT unaligned) {
#if HWY_TARGET == HWY_AVX3_ZEN4
  StoreU(Compress(v, mask), d, unaligned);
#else
  const RebindToUnsigned<decltype(d)> du;
  const auto mu = RebindMask(du, mask);
  using TU = TFromD<decltype(du)>;
  TU* HWY_RESTRICT pu = reinterpret_cast<TU*>(unaligned);
  detail::NativeCompressStore(BitCast(du, v), mu, pu);
#endif  // HWY_TARGET != HWY_AVX3_ZEN4
  const size_t count = CountTrue(d, mask);
  detail::MaybeUnpoison(unaligned, count);
  return count;
}

// Additional overloads to avoid casting to uint32_t (delay?).
+template <class D, HWY_IF_FLOAT_D(D)> // for 128..512 +HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> mask, D d, + TFromD<D>* HWY_RESTRICT unaligned) { +#if HWY_TARGET == HWY_AVX3_ZEN4 + StoreU(Compress(v, mask), d, unaligned); +#else + (void)d; + detail::NativeCompressStore(v, mask, unaligned); +#endif // HWY_TARGET != HWY_AVX3_ZEN4 + const size_t count = PopCount(uint64_t{mask.raw}); + detail::MaybeUnpoison(unaligned, count); + return count; +} + +// ------------------------------ CompressBlendedStore +template <class D, HWY_IF_V_SIZE_GT_D(D, 8)> // for full 128..512 +HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d, + TFromD<D>* HWY_RESTRICT unaligned) { + // Native CompressStore already does the blending at no extra cost (latency + // 11, rthroughput 2 - same as compress plus store). + if (HWY_TARGET == HWY_AVX3_DL || + (HWY_TARGET != HWY_AVX3_ZEN4 && sizeof(TFromD<D>) > 2)) { + return CompressStore(v, m, d, unaligned); + } else { + const size_t count = CountTrue(d, m); + BlendedStore(Compress(v, m), FirstN(d, count), d, unaligned); + detail::MaybeUnpoison(unaligned, count); + return count; + } +} + +// ------------------------------ CompressBitsStore +template <class D> // also for shorter vectors +HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits, + D d, TFromD<D>* HWY_RESTRICT unaligned) { + return CompressStore(v, LoadMaskBits(d, bits), d, unaligned); +} + +// ------------------------------ LoadInterleaved4 + +// Actually implemented in generic_ops, we just overload LoadTransposedBlocks4. +namespace detail { + +// Type-safe wrapper. 
+template <_MM_PERM_ENUM kPerm, typename T> +Vec512<T> Shuffle128(const Vec512<T> lo, const Vec512<T> hi) { + return Vec512<T>{_mm512_shuffle_i64x2(lo.raw, hi.raw, kPerm)}; +} +template <_MM_PERM_ENUM kPerm> +Vec512<float> Shuffle128(const Vec512<float> lo, const Vec512<float> hi) { + return Vec512<float>{_mm512_shuffle_f32x4(lo.raw, hi.raw, kPerm)}; +} +template <_MM_PERM_ENUM kPerm> +Vec512<double> Shuffle128(const Vec512<double> lo, const Vec512<double> hi) { + return Vec512<double>{_mm512_shuffle_f64x2(lo.raw, hi.raw, kPerm)}; +} + +// Input (128-bit blocks): +// 3 2 1 0 (<- first block in unaligned) +// 7 6 5 4 +// b a 9 8 +// Output: +// 9 6 3 0 (LSB of A) +// a 7 4 1 +// b 8 5 2 +template <class D, typename T = TFromD<D>> +HWY_API void LoadTransposedBlocks3(D d, const T* HWY_RESTRICT unaligned, + Vec512<T>& A, Vec512<T>& B, Vec512<T>& C) { + constexpr size_t N = 64 / sizeof(T); + const Vec512<T> v3210 = LoadU(d, unaligned + 0 * N); + const Vec512<T> v7654 = LoadU(d, unaligned + 1 * N); + const Vec512<T> vba98 = LoadU(d, unaligned + 2 * N); + + const Vec512<T> v5421 = detail::Shuffle128<_MM_PERM_BACB>(v3210, v7654); + const Vec512<T> va976 = detail::Shuffle128<_MM_PERM_CBDC>(v7654, vba98); + + A = detail::Shuffle128<_MM_PERM_CADA>(v3210, va976); + B = detail::Shuffle128<_MM_PERM_DBCA>(v5421, va976); + C = detail::Shuffle128<_MM_PERM_DADB>(v5421, vba98); +} + +// Input (128-bit blocks): +// 3 2 1 0 (<- first block in unaligned) +// 7 6 5 4 +// b a 9 8 +// f e d c +// Output: +// c 8 4 0 (LSB of A) +// d 9 5 1 +// e a 6 2 +// f b 7 3 +template <class D, typename T = TFromD<D>> +HWY_API void LoadTransposedBlocks4(D d, const T* HWY_RESTRICT unaligned, + Vec512<T>& vA, Vec512<T>& vB, Vec512<T>& vC, + Vec512<T>& vD) { + constexpr size_t N = 64 / sizeof(T); + const Vec512<T> v3210 = LoadU(d, unaligned + 0 * N); + const Vec512<T> v7654 = LoadU(d, unaligned + 1 * N); + const Vec512<T> vba98 = LoadU(d, unaligned + 2 * N); + const Vec512<T> vfedc = LoadU(d, unaligned + 
3 * N); + + const Vec512<T> v5410 = detail::Shuffle128<_MM_PERM_BABA>(v3210, v7654); + const Vec512<T> vdc98 = detail::Shuffle128<_MM_PERM_BABA>(vba98, vfedc); + const Vec512<T> v7632 = detail::Shuffle128<_MM_PERM_DCDC>(v3210, v7654); + const Vec512<T> vfeba = detail::Shuffle128<_MM_PERM_DCDC>(vba98, vfedc); + vA = detail::Shuffle128<_MM_PERM_CACA>(v5410, vdc98); + vB = detail::Shuffle128<_MM_PERM_DBDB>(v5410, vdc98); + vC = detail::Shuffle128<_MM_PERM_CACA>(v7632, vfeba); + vD = detail::Shuffle128<_MM_PERM_DBDB>(v7632, vfeba); +} + +} // namespace detail + +// ------------------------------ StoreInterleaved2 + +// Implemented in generic_ops, we just overload StoreTransposedBlocks2/3/4. + +namespace detail { + +// Input (128-bit blocks): +// 6 4 2 0 (LSB of i) +// 7 5 3 1 +// Output: +// 3 2 1 0 +// 7 6 5 4 +template <class D, typename T = TFromD<D>> +HWY_API void StoreTransposedBlocks2(const Vec512<T> i, const Vec512<T> j, D d, + T* HWY_RESTRICT unaligned) { + constexpr size_t N = 64 / sizeof(T); + const auto j1_j0_i1_i0 = detail::Shuffle128<_MM_PERM_BABA>(i, j); + const auto j3_j2_i3_i2 = detail::Shuffle128<_MM_PERM_DCDC>(i, j); + const auto j1_i1_j0_i0 = + detail::Shuffle128<_MM_PERM_DBCA>(j1_j0_i1_i0, j1_j0_i1_i0); + const auto j3_i3_j2_i2 = + detail::Shuffle128<_MM_PERM_DBCA>(j3_j2_i3_i2, j3_j2_i3_i2); + StoreU(j1_i1_j0_i0, d, unaligned + 0 * N); + StoreU(j3_i3_j2_i2, d, unaligned + 1 * N); +} + +// Input (128-bit blocks): +// 9 6 3 0 (LSB of i) +// a 7 4 1 +// b 8 5 2 +// Output: +// 3 2 1 0 +// 7 6 5 4 +// b a 9 8 +template <class D, typename T = TFromD<D>> +HWY_API void StoreTransposedBlocks3(const Vec512<T> i, const Vec512<T> j, + const Vec512<T> k, D d, + T* HWY_RESTRICT unaligned) { + constexpr size_t N = 64 / sizeof(T); + const Vec512<T> j2_j0_i2_i0 = detail::Shuffle128<_MM_PERM_CACA>(i, j); + const Vec512<T> i3_i1_k2_k0 = detail::Shuffle128<_MM_PERM_DBCA>(k, i); + const Vec512<T> j3_j1_k3_k1 = detail::Shuffle128<_MM_PERM_DBDB>(k, j); + + const 
Vec512<T> out0 = // i1 k0 j0 i0 + detail::Shuffle128<_MM_PERM_CACA>(j2_j0_i2_i0, i3_i1_k2_k0); + const Vec512<T> out1 = // j2 i2 k1 j1 + detail::Shuffle128<_MM_PERM_DBAC>(j3_j1_k3_k1, j2_j0_i2_i0); + const Vec512<T> out2 = // k3 j3 i3 k2 + detail::Shuffle128<_MM_PERM_BDDB>(i3_i1_k2_k0, j3_j1_k3_k1); + + StoreU(out0, d, unaligned + 0 * N); + StoreU(out1, d, unaligned + 1 * N); + StoreU(out2, d, unaligned + 2 * N); +} + +// Input (128-bit blocks): +// c 8 4 0 (LSB of i) +// d 9 5 1 +// e a 6 2 +// f b 7 3 +// Output: +// 3 2 1 0 +// 7 6 5 4 +// b a 9 8 +// f e d c +template <class D, typename T = TFromD<D>> +HWY_API void StoreTransposedBlocks4(const Vec512<T> i, const Vec512<T> j, + const Vec512<T> k, const Vec512<T> l, D d, + T* HWY_RESTRICT unaligned) { + constexpr size_t N = 64 / sizeof(T); + const Vec512<T> j1_j0_i1_i0 = detail::Shuffle128<_MM_PERM_BABA>(i, j); + const Vec512<T> l1_l0_k1_k0 = detail::Shuffle128<_MM_PERM_BABA>(k, l); + const Vec512<T> j3_j2_i3_i2 = detail::Shuffle128<_MM_PERM_DCDC>(i, j); + const Vec512<T> l3_l2_k3_k2 = detail::Shuffle128<_MM_PERM_DCDC>(k, l); + const Vec512<T> out0 = + detail::Shuffle128<_MM_PERM_CACA>(j1_j0_i1_i0, l1_l0_k1_k0); + const Vec512<T> out1 = + detail::Shuffle128<_MM_PERM_DBDB>(j1_j0_i1_i0, l1_l0_k1_k0); + const Vec512<T> out2 = + detail::Shuffle128<_MM_PERM_CACA>(j3_j2_i3_i2, l3_l2_k3_k2); + const Vec512<T> out3 = + detail::Shuffle128<_MM_PERM_DBDB>(j3_j2_i3_i2, l3_l2_k3_k2); + StoreU(out0, d, unaligned + 0 * N); + StoreU(out1, d, unaligned + 1 * N); + StoreU(out2, d, unaligned + 2 * N); + StoreU(out3, d, unaligned + 3 * N); +} + +} // namespace detail + +// ------------------------------ Shl (LoadDup128) + +HWY_API Vec512<uint16_t> operator<<(Vec512<uint16_t> v, Vec512<uint16_t> bits) { + return Vec512<uint16_t>{_mm512_sllv_epi16(v.raw, bits.raw)}; +} + +// 8-bit: may use the << overload for uint16_t. 
+HWY_API Vec512<uint8_t> operator<<(Vec512<uint8_t> v, Vec512<uint8_t> bits) { + const DFromV<decltype(v)> d; +#if HWY_TARGET <= HWY_AVX3_DL + // kMask[i] = 0xFF >> i + alignas(16) static constexpr uint8_t kMasks[16] = { + 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01, 0x00}; + // kShl[i] = 1 << i + alignas(16) static constexpr uint8_t kShl[16] = {0x01, 0x02, 0x04, 0x08, + 0x10, 0x20, 0x40, 0x80}; + v = And(v, TableLookupBytes(LoadDup128(d, kMasks), bits)); + const VFromD<decltype(d)> mul = TableLookupBytes(LoadDup128(d, kShl), bits); + return VFromD<decltype(d)>{_mm512_gf2p8mul_epi8(v.raw, mul.raw)}; +#else + const Repartition<uint16_t, decltype(d)> dw; + using VW = VFromD<decltype(dw)>; + const VW mask = Set(dw, 0x00FF); + const VW vw = BitCast(dw, v); + const VW bits16 = BitCast(dw, bits); + const VW evens = And(vw, mask) << And(bits16, mask); + // Shift odd lanes in-place + const VW odds = vw << ShiftRight<8>(bits16); + return BitCast(d, IfVecThenElse(Set(dw, 0xFF00), odds, evens)); +#endif +} + +HWY_API Vec512<uint32_t> operator<<(const Vec512<uint32_t> v, + const Vec512<uint32_t> bits) { + return Vec512<uint32_t>{_mm512_sllv_epi32(v.raw, bits.raw)}; +} + +HWY_API Vec512<uint64_t> operator<<(const Vec512<uint64_t> v, + const Vec512<uint64_t> bits) { + return Vec512<uint64_t>{_mm512_sllv_epi64(v.raw, bits.raw)}; +} + +// Signed left shift is the same as unsigned. +template <typename T, HWY_IF_SIGNED(T)> +HWY_API Vec512<T> operator<<(const Vec512<T> v, const Vec512<T> bits) { + const DFromV<decltype(v)> di; + const RebindToUnsigned<decltype(di)> du; + return BitCast(di, BitCast(du, v) << BitCast(du, bits)); +} + +// ------------------------------ Shr (IfVecThenElse) + +HWY_API Vec512<uint16_t> operator>>(const Vec512<uint16_t> v, + const Vec512<uint16_t> bits) { + return Vec512<uint16_t>{_mm512_srlv_epi16(v.raw, bits.raw)}; +} + +// 8-bit uses 16-bit shifts. 
+template <size_t N> +HWY_API Vec512<uint8_t> operator>>(Vec512<uint8_t> v, Vec512<uint8_t> bits) { + const DFromV<decltype(v)> d; + const RepartitionToWide<decltype(d)> dw; + using VW = VFromD<decltype(dw)>; + const VW mask = Set(dw, 0x00FF); + const VW vw = BitCast(dw, v); + const VW bits16 = BitCast(dw, bits); + const VW evens = And(vw, mask) >> And(bits16, mask); + // Shift odd lanes in-place + const VW odds = vw >> ShiftRight<8>(bits16); + return BitCast(d, IfVecThenElse(Set(dw, 0xFF00), odds, evens)); +} + +HWY_API Vec512<uint32_t> operator>>(const Vec512<uint32_t> v, + const Vec512<uint32_t> bits) { + return Vec512<uint32_t>{_mm512_srlv_epi32(v.raw, bits.raw)}; +} + +HWY_API Vec512<uint64_t> operator>>(const Vec512<uint64_t> v, + const Vec512<uint64_t> bits) { + return Vec512<uint64_t>{_mm512_srlv_epi64(v.raw, bits.raw)}; +} + +HWY_API Vec512<int16_t> operator>>(const Vec512<int16_t> v, + const Vec512<int16_t> bits) { + return Vec512<int16_t>{_mm512_srav_epi16(v.raw, bits.raw)}; +} + +HWY_API Vec512<int32_t> operator>>(const Vec512<int32_t> v, + const Vec512<int32_t> bits) { + return Vec512<int32_t>{_mm512_srav_epi32(v.raw, bits.raw)}; +} + +HWY_API Vec512<int64_t> operator>>(const Vec512<int64_t> v, + const Vec512<int64_t> bits) { + return Vec512<int64_t>{_mm512_srav_epi64(v.raw, bits.raw)}; +} + +// ------------------------------ MulEven/Odd (Shuffle2301, InterleaveLower) + +HWY_INLINE Vec512<uint64_t> MulEven(const Vec512<uint64_t> a, + const Vec512<uint64_t> b) { + const DFromV<decltype(a)> du64; + const RepartitionToNarrow<decltype(du64)> du32; + const auto maskL = Set(du64, 0xFFFFFFFFULL); + const auto a32 = BitCast(du32, a); + const auto b32 = BitCast(du32, b); + // Inputs for MulEven: we only need the lower 32 bits + const auto aH = Shuffle2301(a32); + const auto bH = Shuffle2301(b32); + + // Knuth double-word multiplication. We use 32x32 = 64 MulEven and only need + // the even (lower 64 bits of every 128-bit block) results. 
See + // https://github.com/hcs0/Hackers-Delight/blob/master/muldwu.c.tat + const auto aLbL = MulEven(a32, b32); + const auto w3 = aLbL & maskL; + + const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL); + const auto w2 = t2 & maskL; + const auto w1 = ShiftRight<32>(t2); + + const auto t = MulEven(a32, bH) + w2; + const auto k = ShiftRight<32>(t); + + const auto mulH = MulEven(aH, bH) + w1 + k; + const auto mulL = ShiftLeft<32>(t) + w3; + return InterleaveLower(mulL, mulH); +} + +HWY_INLINE Vec512<uint64_t> MulOdd(const Vec512<uint64_t> a, + const Vec512<uint64_t> b) { + const DFromV<decltype(a)> du64; + const RepartitionToNarrow<decltype(du64)> du32; + const auto maskL = Set(du64, 0xFFFFFFFFULL); + const auto a32 = BitCast(du32, a); + const auto b32 = BitCast(du32, b); + // Inputs for MulEven: we only need bits [95:64] (= upper half of input) + const auto aH = Shuffle2301(a32); + const auto bH = Shuffle2301(b32); + + // Same as above, but we're using the odd results (upper 64 bits per block). 
+ const auto aLbL = MulEven(a32, b32); + const auto w3 = aLbL & maskL; + + const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL); + const auto w2 = t2 & maskL; + const auto w1 = ShiftRight<32>(t2); + + const auto t = MulEven(a32, bH) + w2; + const auto k = ShiftRight<32>(t); + + const auto mulH = MulEven(aH, bH) + w1 + k; + const auto mulL = ShiftLeft<32>(t) + w3; + return InterleaveUpper(du64, mulL, mulH); +} + +// ------------------------------ WidenMulPairwiseAdd +template <class D, HWY_IF_I32_D(D)> +HWY_API Vec512<int32_t> WidenMulPairwiseAdd(D /*d32*/, Vec512<int16_t> a, + Vec512<int16_t> b) { + return Vec512<int32_t>{_mm512_madd_epi16(a.raw, b.raw)}; +} + +// ------------------------------ ReorderWidenMulAccumulate +template <class D, HWY_IF_I32_D(D)> +HWY_API Vec512<int32_t> ReorderWidenMulAccumulate(D d, Vec512<int16_t> a, + Vec512<int16_t> b, + const Vec512<int32_t> sum0, + Vec512<int32_t>& /*sum1*/) { + (void)d; +#if HWY_TARGET <= HWY_AVX3_DL + return Vec512<int32_t>{_mm512_dpwssd_epi32(sum0.raw, a.raw, b.raw)}; +#else + return sum0 + WidenMulPairwiseAdd(d, a, b); +#endif +} + +HWY_API Vec512<int32_t> RearrangeToOddPlusEven(const Vec512<int32_t> sum0, + Vec512<int32_t> /*sum1*/) { + return sum0; // invariant already holds +} + +// ------------------------------ Reductions + +template <class D> +HWY_API int32_t ReduceSum(D, Vec512<int32_t> v) { + return _mm512_reduce_add_epi32(v.raw); +} +template <class D> +HWY_API int64_t ReduceSum(D, Vec512<int64_t> v) { + return _mm512_reduce_add_epi64(v.raw); +} +template <class D> +HWY_API uint32_t ReduceSum(D, Vec512<uint32_t> v) { + return static_cast<uint32_t>(_mm512_reduce_add_epi32(v.raw)); +} +template <class D> +HWY_API uint64_t ReduceSum(D, Vec512<uint64_t> v) { + return static_cast<uint64_t>(_mm512_reduce_add_epi64(v.raw)); +} +template <class D> +HWY_API float ReduceSum(D, Vec512<float> v) { + return _mm512_reduce_add_ps(v.raw); +} +template <class D> +HWY_API double ReduceSum(D, Vec512<double> v) { + 
return _mm512_reduce_add_pd(v.raw); +} +template <class D> +HWY_API uint16_t ReduceSum(D d, Vec512<uint16_t> v) { + const RepartitionToWide<decltype(d)> d32; + const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF)); + const auto odd = ShiftRight<16>(BitCast(d32, v)); + const auto sum = ReduceSum(d32, even + odd); + return static_cast<uint16_t>(sum); +} +template <class D> +HWY_API int16_t ReduceSum(D d, Vec512<int16_t> v) { + const RepartitionToWide<decltype(d)> d32; + // Sign-extend + const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v))); + const auto odd = ShiftRight<16>(BitCast(d32, v)); + const auto sum = ReduceSum(d32, even + odd); + return static_cast<int16_t>(sum); +} + +// Returns the sum in each lane. +template <class D, typename T> +HWY_API Vec512<T> SumOfLanes(D d, Vec512<T> v) { + return Set(d, ReduceSum(d, v)); +} + +// Returns the minimum in each lane. +template <class D> +HWY_API Vec512<int32_t> MinOfLanes(D d, Vec512<int32_t> v) { + return Set(d, _mm512_reduce_min_epi32(v.raw)); +} +template <class D> +HWY_API Vec512<int64_t> MinOfLanes(D d, Vec512<int64_t> v) { + return Set(d, _mm512_reduce_min_epi64(v.raw)); +} +template <class D> +HWY_API Vec512<uint32_t> MinOfLanes(D d, Vec512<uint32_t> v) { + return Set(d, _mm512_reduce_min_epu32(v.raw)); +} +template <class D> +HWY_API Vec512<uint64_t> MinOfLanes(D d, Vec512<uint64_t> v) { + return Set(d, _mm512_reduce_min_epu64(v.raw)); +} +template <class D> +HWY_API Vec512<float> MinOfLanes(D d, Vec512<float> v) { + return Set(d, _mm512_reduce_min_ps(v.raw)); +} +template <class D> +HWY_API Vec512<double> MinOfLanes(D d, Vec512<double> v) { + return Set(d, _mm512_reduce_min_pd(v.raw)); +} +template <class D> +HWY_API Vec512<uint16_t> MinOfLanes(D d, Vec512<uint16_t> v) { + const RepartitionToWide<decltype(d)> d32; + const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF)); + const auto odd = ShiftRight<16>(BitCast(d32, v)); + const auto min = MinOfLanes(d32, Min(even, odd)); + // Also broadcast 
into odd lanes. + return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min)); +} +template <class D> +HWY_API Vec512<int16_t> MinOfLanes(D d, Vec512<int16_t> v) { + const RepartitionToWide<decltype(d)> d32; + // Sign-extend + const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v))); + const auto odd = ShiftRight<16>(BitCast(d32, v)); + const auto min = MinOfLanes(d32, Min(even, odd)); + // Also broadcast into odd lanes. + return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min)); +} + +// Returns the maximum in each lane. +template <class D> +HWY_API Vec512<int32_t> MaxOfLanes(D d, Vec512<int32_t> v) { + return Set(d, _mm512_reduce_max_epi32(v.raw)); +} +template <class D> +HWY_API Vec512<int64_t> MaxOfLanes(D d, Vec512<int64_t> v) { + return Set(d, _mm512_reduce_max_epi64(v.raw)); +} +template <class D> +HWY_API Vec512<uint32_t> MaxOfLanes(D d, Vec512<uint32_t> v) { + return Set(d, _mm512_reduce_max_epu32(v.raw)); +} +template <class D> +HWY_API Vec512<uint64_t> MaxOfLanes(D d, Vec512<uint64_t> v) { + return Set(d, _mm512_reduce_max_epu64(v.raw)); +} +template <class D> +HWY_API Vec512<float> MaxOfLanes(D d, Vec512<float> v) { + return Set(d, _mm512_reduce_max_ps(v.raw)); +} +template <class D> +HWY_API Vec512<double> MaxOfLanes(D d, Vec512<double> v) { + return Set(d, _mm512_reduce_max_pd(v.raw)); +} +template <class D> +HWY_API Vec512<uint16_t> MaxOfLanes(D d, Vec512<uint16_t> v) { + const RepartitionToWide<decltype(d)> d32; + const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF)); + const auto odd = ShiftRight<16>(BitCast(d32, v)); + const auto min = MaxOfLanes(d32, Max(even, odd)); + // Also broadcast into odd lanes. 
+ return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min)); +} +template <class D> +HWY_API Vec512<int16_t> MaxOfLanes(D d, Vec512<int16_t> v) { + const RepartitionToWide<decltype(d)> d32; + // Sign-extend + const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v))); + const auto odd = ShiftRight<16>(BitCast(d32, v)); + const auto min = MaxOfLanes(d32, Max(even, odd)); + // Also broadcast into odd lanes. + return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min)); +} + +// -------------------- LeadingZeroCount, TrailingZeroCount, HighestSetBitIndex + +template <class V, HWY_IF_UI32(TFromV<V>), HWY_IF_V_SIZE_D(DFromV<V>, 64)> +HWY_API V LeadingZeroCount(V v) { + return V{_mm512_lzcnt_epi32(v.raw)}; +} + +template <class V, HWY_IF_UI64(TFromV<V>), HWY_IF_V_SIZE_D(DFromV<V>, 64)> +HWY_API V LeadingZeroCount(V v) { + return V{_mm512_lzcnt_epi64(v.raw)}; +} + +namespace detail { + +template <class V, HWY_IF_UNSIGNED_V(V), + HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2)), + HWY_IF_LANES_LE_D(DFromV<V>, 16)> +HWY_INLINE V Lzcnt32ForU8OrU16(V v) { + const DFromV<decltype(v)> d; + const Rebind<int32_t, decltype(d)> di32; + const Rebind<uint32_t, decltype(d)> du32; + + const auto v_lz_count = LeadingZeroCount(PromoteTo(du32, v)); + return DemoteTo(d, BitCast(di32, v_lz_count)); +} + +template <class V, HWY_IF_UNSIGNED_V(V), + HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2)), + HWY_IF_LANES_D(DFromV<V>, 32)> +HWY_INLINE VFromD<Rebind<uint16_t, DFromV<V>>> Lzcnt32ForU8OrU16AsU16(V v) { + const DFromV<decltype(v)> d; + const Half<decltype(d)> dh; + const Rebind<int32_t, decltype(dh)> di32; + const Rebind<uint32_t, decltype(dh)> du32; + const Rebind<uint16_t, decltype(d)> du16; + + const auto lo_v_lz_count = + LeadingZeroCount(PromoteTo(du32, LowerHalf(dh, v))); + const auto hi_v_lz_count = + LeadingZeroCount(PromoteTo(du32, UpperHalf(dh, v))); + return OrderedDemote2To(du16, BitCast(di32, lo_v_lz_count), + BitCast(di32, hi_v_lz_count)); +} + +HWY_INLINE 
Vec256<uint8_t> Lzcnt32ForU8OrU16(Vec256<uint8_t> v) { + const DFromV<decltype(v)> d; + const Rebind<int16_t, decltype(d)> di16; + return DemoteTo(d, BitCast(di16, Lzcnt32ForU8OrU16AsU16(v))); +} + +HWY_INLINE Vec512<uint8_t> Lzcnt32ForU8OrU16(Vec512<uint8_t> v) { + const DFromV<decltype(v)> d; + const Half<decltype(d)> dh; + const Rebind<int16_t, decltype(dh)> di16; + + const auto lo_half = LowerHalf(dh, v); + const auto hi_half = UpperHalf(dh, v); + + const auto lo_v_lz_count = BitCast(di16, Lzcnt32ForU8OrU16AsU16(lo_half)); + const auto hi_v_lz_count = BitCast(di16, Lzcnt32ForU8OrU16AsU16(hi_half)); + return OrderedDemote2To(d, lo_v_lz_count, hi_v_lz_count); +} + +HWY_INLINE Vec512<uint16_t> Lzcnt32ForU8OrU16(Vec512<uint16_t> v) { + return Lzcnt32ForU8OrU16AsU16(v); +} + +} // namespace detail + +template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), + HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))> +HWY_API V LeadingZeroCount(V v) { + const DFromV<decltype(v)> d; + const RebindToUnsigned<decltype(d)> du; + using TU = TFromD<decltype(du)>; + + constexpr TU kNumOfBitsInT{sizeof(TU) * 8}; + const auto v_lzcnt32 = detail::Lzcnt32ForU8OrU16(BitCast(du, v)); + return BitCast(d, Min(v_lzcnt32 - Set(du, TU{32 - kNumOfBitsInT}), + Set(du, TU{kNumOfBitsInT}))); +} + +template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), + HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))> +HWY_API V HighestSetBitIndex(V v) { + const DFromV<decltype(v)> d; + const RebindToUnsigned<decltype(d)> du; + using TU = TFromD<decltype(du)>; + return BitCast(d, + Set(du, TU{31}) - detail::Lzcnt32ForU8OrU16(BitCast(du, v))); +} + +template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), + HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 4) | (1 << 8))> +HWY_API V HighestSetBitIndex(V v) { + const DFromV<decltype(v)> d; + using T = TFromD<decltype(d)>; + return BitCast(d, Set(d, T{sizeof(T) * 8 - 1}) - LeadingZeroCount(v)); +} + +template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)> +HWY_API V TrailingZeroCount(V v) { + 
const DFromV<decltype(v)> d; + const RebindToSigned<decltype(d)> di; + using T = TFromD<decltype(d)>; + + const auto vi = BitCast(di, v); + const auto lowest_bit = BitCast(d, And(vi, Neg(vi))); + constexpr T kNumOfBitsInT{sizeof(T) * 8}; + const auto bit_idx = HighestSetBitIndex(lowest_bit); + return IfThenElse(MaskFromVec(bit_idx), Set(d, kNumOfBitsInT), bit_idx); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +// Note that the GCC warnings are not suppressed if we only wrap the *intrin.h - +// the warning seems to be issued at the call site of intrinsics, i.e. our code. +HWY_DIAGNOSTICS(pop) |