// Copyright 2019 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// 128-bit Arm NEON vectors and operations.
// External include guard in highway.h - see comment there.

// Arm NEON intrinsics are documented at:
// https://developer.arm.com/architectures/instruction-sets/intrinsics/#f:@navigationhierarchiessimdisa=[Neon]

#include "hwy/ops/shared-inl.h"

HWY_BEFORE_NAMESPACE();

// Must come after HWY_BEFORE_NAMESPACE so that the intrinsics are compiled with
// the same target attribute as our code, see #834.
HWY_DIAGNOSTICS(push)
HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
#include <arm_neon.h>  // NOLINT(build/include_order)
HWY_DIAGNOSTICS(pop)

// Must come after arm_neon.h.
namespace hwy {
namespace HWY_NAMESPACE {

namespace detail {  // for code folding and Raw128

// Macros used to define single and double function calls for multiple types
// for full and half vectors. These macros are undefined at the end of the file.

// HWY_NEON_BUILD_TPL_* is the template<...> prefix to the function.
#define HWY_NEON_BUILD_TPL_1
#define HWY_NEON_BUILD_TPL_2
#define HWY_NEON_BUILD_TPL_3

// HWY_NEON_BUILD_RET_* is return type; type arg is without _t suffix so we can
// extend it to int32x4x2_t packs.
#define HWY_NEON_BUILD_RET_1(type, size) Vec128<type##_t, size>
#define HWY_NEON_BUILD_RET_2(type, size) Vec128<type##_t, size>
#define HWY_NEON_BUILD_RET_3(type, size) Vec128<type##_t, size>

// HWY_NEON_BUILD_PARAM_* is the list of parameters the function receives.
#define HWY_NEON_BUILD_PARAM_1(type, size) const Vec128<type##_t, size> a
#define HWY_NEON_BUILD_PARAM_2(type, size) \
  const Vec128<type##_t, size> a, const Vec128<type##_t, size> b
#define HWY_NEON_BUILD_PARAM_3(type, size)                        \
  const Vec128<type##_t, size> a, const Vec128<type##_t, size> b, \
      const Vec128<type##_t, size> c

// HWY_NEON_BUILD_ARG_* is the list of arguments passed to the underlying
// function.
#define HWY_NEON_BUILD_ARG_1 a.raw
#define HWY_NEON_BUILD_ARG_2 a.raw, b.raw
#define HWY_NEON_BUILD_ARG_3 a.raw, b.raw, c.raw

// We use HWY_NEON_EVAL(func, ...) to delay the evaluation of func until after
// the __VA_ARGS__ have been expanded. This allows "func" to be a macro on
// itself like with some of the library "functions" such as vshlq_u8. For
// example, HWY_NEON_EVAL(vshlq_u8, MY_PARAMS) where MY_PARAMS is defined as
// "a, b" (without the quotes) will end up expanding "vshlq_u8(a, b)" if needed.
// Directly writing vshlq_u8(MY_PARAMS) would fail since vshlq_u8() macro
// expects two arguments.
#define HWY_NEON_EVAL(func, ...) func(__VA_ARGS__)

// Main macro definition that defines a single function for the given type and
// size of vector, using the underlying (prefix##infix##suffix) function and
// the template, return type, parameters and arguments defined by the "args"
// parameters passed here (see HWY_NEON_BUILD_* macros defined before).
#define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args) \
  HWY_CONCAT(HWY_NEON_BUILD_TPL_, args)                                      \
  HWY_API HWY_CONCAT(HWY_NEON_BUILD_RET_, args)(type, size)                  \
      name(HWY_CONCAT(HWY_NEON_BUILD_PARAM_, args)(type, size)) {            \
    return HWY_CONCAT(HWY_NEON_BUILD_RET_, args)(type, size)(                \
        HWY_NEON_EVAL(prefix##infix##suffix, HWY_NEON_BUILD_ARG_##args));    \
  }

// The HWY_NEON_DEF_FUNCTION_* macros define all the variants of a function
// called "name" using the set of neon functions starting with the given
// "prefix" for all the variants of certain types, as specified next to each
// macro. For example, the prefix "vsub" can be used to define the operator-
// using args=2.

// uint8_t
#define HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(uint8, 16, name, prefix##q, infix, u8, args) \
  HWY_NEON_DEF_FUNCTION(uint8, 8, name, prefix, infix, u8, args)     \
  HWY_NEON_DEF_FUNCTION(uint8, 4, name, prefix, infix, u8, args)     \
  HWY_NEON_DEF_FUNCTION(uint8, 2, name, prefix, infix, u8, args)     \
  HWY_NEON_DEF_FUNCTION(uint8, 1, name, prefix, infix, u8, args)

// int8_t
#define HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(int8, 16, name, prefix##q, infix, s8, args) \
  HWY_NEON_DEF_FUNCTION(int8, 8, name, prefix, infix, s8, args)     \
  HWY_NEON_DEF_FUNCTION(int8, 4, name, prefix, infix, s8, args)     \
  HWY_NEON_DEF_FUNCTION(int8, 2, name, prefix, infix, s8, args)     \
  HWY_NEON_DEF_FUNCTION(int8, 1, name, prefix, infix, s8, args)

// uint16_t
#define HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(uint16, 8, name, prefix##q, infix, u16, args) \
  HWY_NEON_DEF_FUNCTION(uint16, 4, name, prefix, infix, u16, args)    \
  HWY_NEON_DEF_FUNCTION(uint16, 2, name, prefix, infix, u16, args)    \
  HWY_NEON_DEF_FUNCTION(uint16, 1, name, prefix, infix, u16, args)

// int16_t
#define HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(int16, 8, name, prefix##q, infix, s16, args) \
  HWY_NEON_DEF_FUNCTION(int16, 4, name, prefix, infix, s16, args)    \
  HWY_NEON_DEF_FUNCTION(int16, 2, name, prefix, infix, s16, args)    \
  HWY_NEON_DEF_FUNCTION(int16, 1, name, prefix, infix, s16, args)

// uint32_t
#define HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(uint32, 4, name, prefix##q, infix, u32, args) \
  HWY_NEON_DEF_FUNCTION(uint32, 2, name, prefix, infix, u32, args)    \
  HWY_NEON_DEF_FUNCTION(uint32, 1, name, prefix, infix, u32, args)

// int32_t
#define HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(int32, 4, name, prefix##q, infix, s32, args) \
  HWY_NEON_DEF_FUNCTION(int32, 2, name, prefix, infix, s32, args)    \
  HWY_NEON_DEF_FUNCTION(int32, 1, name, prefix, infix, s32, args)

// uint64_t
#define HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(uint64, 2, name, prefix##q, infix, u64, args) \
  HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args)

// int64_t
#define HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(int64, 2, name, prefix##q, infix, s64, args) \
  HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args)

// float
#define HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(float32, 4, name, prefix##q, infix, f32, args) \
  HWY_NEON_DEF_FUNCTION(float32, 2, name, prefix, infix, f32, args)    \
  HWY_NEON_DEF_FUNCTION(float32, 1, name, prefix, infix, f32, args)

// double
#if HWY_ARCH_ARM_A64
#define HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(float64, 2, name, prefix##q, infix, f64, args) \
  HWY_NEON_DEF_FUNCTION(float64, 1, name, prefix, infix, f64, args)
#else
#define HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args)
#endif

// float and double

#define HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args)         \
  HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args)

// Helper macros to define for more than one type.
// uint8_t, uint16_t and uint32_t
#define HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args)             \
  HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args)            \
  HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args)

// int8_t, int16_t and int32_t
#define HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args)             \
  HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args)            \
  HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args)

// uint8_t, uint16_t, uint32_t and uint64_t
#define HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args)  \
  HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args)

// int8_t, int16_t, int32_t and int64_t
#define HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args)  \
  HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args)

// All int*_t and uint*_t up to 64
#define HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args)             \
  HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args)

// All previous types.
#define HWY_NEON_DEF_FUNCTION_ALL_TYPES(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args)

#define HWY_NEON_DEF_FUNCTION_UIF81632(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args)   \
  HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args)    \
  HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args)

// For eor3q, which is only defined for full vectors.
#define HWY_NEON_DEF_FUNCTION_FULL_UI(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(uint8, 16, name, prefix##q, infix, u8, args)  \
  HWY_NEON_DEF_FUNCTION(uint16, 8, name, prefix##q, infix, u16, args) \
  HWY_NEON_DEF_FUNCTION(uint32, 4, name, prefix##q, infix, u32, args) \
  HWY_NEON_DEF_FUNCTION(uint64, 2, name, prefix##q, infix, u64, args) \
  HWY_NEON_DEF_FUNCTION(int8, 16, name, prefix##q, infix, s8, args)   \
  HWY_NEON_DEF_FUNCTION(int16, 8, name, prefix##q, infix, s16, args)  \
  HWY_NEON_DEF_FUNCTION(int32, 4, name, prefix##q, infix, s32, args)  \
  HWY_NEON_DEF_FUNCTION(int64, 2, name, prefix##q, infix, s64, args)

// Emulation of some intrinsics on armv7.
#if HWY_ARCH_ARM_V7
#define vuzp1_s8(x, y) vuzp_s8(x, y).val[0]
#define vuzp1_u8(x, y) vuzp_u8(x, y).val[0]
#define vuzp1_s16(x, y) vuzp_s16(x, y).val[0]
#define vuzp1_u16(x, y) vuzp_u16(x, y).val[0]
#define vuzp1_s32(x, y) vuzp_s32(x, y).val[0]
#define vuzp1_u32(x, y) vuzp_u32(x, y).val[0]
#define vuzp1_f32(x, y) vuzp_f32(x, y).val[0]
#define vuzp1q_s8(x, y) vuzpq_s8(x, y).val[0]
#define vuzp1q_u8(x, y) vuzpq_u8(x, y).val[0]
#define vuzp1q_s16(x, y) vuzpq_s16(x, y).val[0]
#define vuzp1q_u16(x, y) vuzpq_u16(x, y).val[0]
#define vuzp1q_s32(x, y) vuzpq_s32(x, y).val[0]
#define vuzp1q_u32(x, y) vuzpq_u32(x, y).val[0]
#define vuzp1q_f32(x, y) vuzpq_f32(x, y).val[0]
#define vuzp2_s8(x, y) vuzp_s8(x, y).val[1]
#define vuzp2_u8(x, y) vuzp_u8(x, y).val[1]
#define vuzp2_s16(x, y) vuzp_s16(x, y).val[1]
#define vuzp2_u16(x, y) vuzp_u16(x, y).val[1]
#define vuzp2_s32(x, y) vuzp_s32(x, y).val[1]
#define vuzp2_u32(x, y) vuzp_u32(x, y).val[1]
#define vuzp2_f32(x, y) vuzp_f32(x, y).val[1]
#define vuzp2q_s8(x, y) vuzpq_s8(x, y).val[1]
#define vuzp2q_u8(x, y) vuzpq_u8(x, y).val[1]
#define vuzp2q_s16(x, y) vuzpq_s16(x, y).val[1]
#define vuzp2q_u16(x, y) vuzpq_u16(x, y).val[1]
#define vuzp2q_s32(x, y) vuzpq_s32(x, y).val[1]
#define vuzp2q_u32(x, y) vuzpq_u32(x, y).val[1]
#define vuzp2q_f32(x, y) vuzpq_f32(x, y).val[1]
#define vzip1_s8(x, y) vzip_s8(x, y).val[0]
#define vzip1_u8(x, y) vzip_u8(x, y).val[0]
#define vzip1_s16(x, y) vzip_s16(x, y).val[0]
#define vzip1_u16(x, y) vzip_u16(x, y).val[0]
#define vzip1_f32(x, y) vzip_f32(x, y).val[0]
#define vzip1_u32(x, y) vzip_u32(x, y).val[0]
#define vzip1_s32(x, y) vzip_s32(x, y).val[0]
#define vzip1q_s8(x, y) vzipq_s8(x, y).val[0]
#define vzip1q_u8(x, y) vzipq_u8(x, y).val[0]
#define vzip1q_s16(x, y) vzipq_s16(x, y).val[0]
#define vzip1q_u16(x, y) vzipq_u16(x, y).val[0]
#define vzip1q_s32(x, y) vzipq_s32(x, y).val[0]
#define vzip1q_u32(x, y) vzipq_u32(x, y).val[0]
#define vzip1q_f32(x, y) vzipq_f32(x, y).val[0]
#define vzip2_s8(x, y) vzip_s8(x, y).val[1]
#define vzip2_u8(x, y) vzip_u8(x, y).val[1]
#define vzip2_s16(x, y) vzip_s16(x, y).val[1]
#define vzip2_u16(x, y) vzip_u16(x, y).val[1]
#define vzip2_s32(x, y) vzip_s32(x, y).val[1]
#define vzip2_u32(x, y) vzip_u32(x, y).val[1]
#define vzip2_f32(x, y) vzip_f32(x, y).val[1]
#define vzip2q_s8(x, y) vzipq_s8(x, y).val[1]
#define vzip2q_u8(x, y) vzipq_u8(x, y).val[1]
#define vzip2q_s16(x, y) vzipq_s16(x, y).val[1]
#define vzip2q_u16(x, y) vzipq_u16(x, y).val[1]
#define vzip2q_s32(x, y) vzipq_s32(x, y).val[1]
#define vzip2q_u32(x, y) vzipq_u32(x, y).val[1]
#define vzip2q_f32(x, y) vzipq_f32(x, y).val[1]
#endif

// Wrappers over uint8x16x2_t etc. so we can define StoreInterleaved2 overloads
// for all vector types, even those (bfloat16_t) where the underlying vector is
// the same as others (uint16_t).
template <typename T, size_t N>
struct Tuple2;
template <typename T, size_t N>
struct Tuple3;
template <typename T, size_t N>
struct Tuple4;

template <>
struct Tuple2<uint8_t, 16> {
  uint8x16x2_t raw;
};
template <size_t N>
struct Tuple2<uint8_t, N> {
  uint8x8x2_t raw;
};
template <>
struct Tuple2<int8_t, 16> {
  int8x16x2_t raw;
};
template <size_t N>
struct Tuple2<int8_t, N> {
  int8x8x2_t raw;
};
template <>
struct Tuple2<uint16_t, 8> {
  uint16x8x2_t raw;
};
template <size_t N>
struct Tuple2<uint16_t, N> {
  uint16x4x2_t raw;
};
template <>
struct Tuple2<int16_t, 8> {
  int16x8x2_t raw;
};
template <size_t N>
struct Tuple2<int16_t, N> {
  int16x4x2_t raw;
};
template <>
struct Tuple2<uint32_t, 4> {
  uint32x4x2_t raw;
};
template <size_t N>
struct Tuple2<uint32_t, N> {
  uint32x2x2_t raw;
};
template <>
struct Tuple2<int32_t, 4> {
  int32x4x2_t raw;
};
template <size_t N>
struct Tuple2<int32_t, N> {
  int32x2x2_t raw;
};
template <>
struct Tuple2<uint64_t, 2> {
  uint64x2x2_t raw;
};
template <size_t N>
struct Tuple2<uint64_t, N> {
  uint64x1x2_t raw;
};
template <>
struct Tuple2<int64_t, 2> {
  int64x2x2_t raw;
};
template <size_t N>
struct Tuple2<int64_t, N> {
  int64x1x2_t raw;
};

template <>
struct Tuple2<float16_t, 8> {
  uint16x8x2_t raw;
};
template <size_t N>
struct Tuple2<float16_t, N> {
  uint16x4x2_t raw;
};
template <>
struct Tuple2<bfloat16_t, 8> {
  uint16x8x2_t raw;
};
template <size_t N>
struct Tuple2<bfloat16_t, N> {
  uint16x4x2_t raw;
};

template <>
struct Tuple2<float32_t, 4> {
  float32x4x2_t raw;
};
template <size_t N>
struct Tuple2<float32_t, N> {
  float32x2x2_t raw;
};
#if HWY_ARCH_ARM_A64
template <>
struct Tuple2<float64_t, 2> {
  float64x2x2_t raw;
};
template <size_t N>
struct Tuple2<float64_t, N> {
  float64x1x2_t raw;
};
#endif  // HWY_ARCH_ARM_A64

template <>
struct Tuple3<uint8_t, 16> {
  uint8x16x3_t raw;
};
template <size_t N>
struct Tuple3<uint8_t, N> {
  uint8x8x3_t raw;
};
template <>
struct Tuple3<int8_t, 16> {
  int8x16x3_t raw;
};
template <size_t N>
struct Tuple3<int8_t, N> {
  int8x8x3_t raw;
};
template <>
struct Tuple3<uint16_t, 8> {
  uint16x8x3_t raw;
};
template <size_t N>
struct Tuple3<uint16_t, N> {
  uint16x4x3_t raw;
};
template <>
struct Tuple3<int16_t, 8> {
  int16x8x3_t raw;
};
template <size_t N>
struct Tuple3<int16_t, N> {
  int16x4x3_t raw;
};
template <>
struct Tuple3<uint32_t, 4> {
  uint32x4x3_t raw;
};
template <size_t N>
struct Tuple3<uint32_t, N> {
  uint32x2x3_t raw;
};
template <>
struct Tuple3<int32_t, 4> {
  int32x4x3_t raw;
};
template <size_t N>
struct Tuple3<int32_t, N> {
  int32x2x3_t raw;
};
template <>
struct Tuple3<uint64_t, 2> {
  uint64x2x3_t raw;
};
template <size_t N>
struct Tuple3<uint64_t, N> {
  uint64x1x3_t raw;
};
template <>
struct Tuple3<int64_t, 2> {
  int64x2x3_t raw;
};
template <size_t N>
struct Tuple3<int64_t, N> {
  int64x1x3_t raw;
};

template <>
struct Tuple3<float16_t, 8> {
  uint16x8x3_t raw;
};
template <size_t N>
struct Tuple3<float16_t, N> {
  uint16x4x3_t raw;
};
template <>
struct Tuple3<bfloat16_t, 8> {
  uint16x8x3_t raw;
};
template <size_t N>
struct Tuple3<bfloat16_t, N> {
  uint16x4x3_t raw;
};

template <>
struct Tuple3<float32_t, 4> {
  float32x4x3_t raw;
};
template <size_t N>
struct Tuple3<float32_t, N> {
  float32x2x3_t raw;
};
#if HWY_ARCH_ARM_A64
template <>
struct Tuple3<float64_t, 2> {
  float64x2x3_t raw;
};
template <size_t N>
struct Tuple3<float64_t, N> {
  float64x1x3_t raw;
};
#endif  // HWY_ARCH_ARM_A64

template <>
struct Tuple4<uint8_t, 16> {
  uint8x16x4_t raw;
};
template <size_t N>
struct Tuple4<uint8_t, N> {
  uint8x8x4_t raw;
};
template <>
struct Tuple4<int8_t, 16> {
  int8x16x4_t raw;
};
template <size_t N>
struct Tuple4<int8_t, N> {
  int8x8x4_t raw;
};
template <>
struct Tuple4<uint16_t, 8> {
  uint16x8x4_t raw;
};
template <size_t N>
struct Tuple4<uint16_t, N> {
  uint16x4x4_t raw;
};
template <>
struct Tuple4<int16_t, 8> {
  int16x8x4_t raw;
};
template <size_t N>
struct Tuple4<int16_t, N> {
  int16x4x4_t raw;
};
template <>
struct Tuple4<uint32_t, 4> {
  uint32x4x4_t raw;
};
template <size_t N>
struct Tuple4<uint32_t, N> {
  uint32x2x4_t raw;
};
template <>
struct Tuple4<int32_t, 4> {
  int32x4x4_t raw;
};
template <size_t N>
struct Tuple4<int32_t, N> {
  int32x2x4_t raw;
};
template <>
struct Tuple4<uint64_t, 2> {
  uint64x2x4_t raw;
};
template <size_t N>
struct Tuple4<uint64_t, N> {
  uint64x1x4_t raw;
};
template <>
struct Tuple4<int64_t, 2> {
  int64x2x4_t raw;
};
template <size_t N>
struct Tuple4<int64_t, N> {
  int64x1x4_t raw;
};

template <>
struct Tuple4<float16_t, 8> {
  uint16x8x4_t raw;
};
template <size_t N>
struct Tuple4<float16_t, N> {
  uint16x4x4_t raw;
};
template <>
struct Tuple4<bfloat16_t, 8> {
  uint16x8x4_t raw;
};
template <size_t N>
struct Tuple4<bfloat16_t, N> {
  uint16x4x4_t raw;
};

template <>
struct Tuple4<float32_t, 4> {
  float32x4x4_t raw;
};
template <size_t N>
struct Tuple4<float32_t, N> {
  float32x2x4_t raw;
};
#if HWY_ARCH_ARM_A64
template <>
struct Tuple4<float64_t, 2> {
  float64x2x4_t raw;
};
template <size_t N>
struct Tuple4<float64_t, N> {
  float64x1x4_t raw;
};
#endif  // HWY_ARCH_ARM_A64

template <typename T, size_t N>
struct Raw128;

// 128
template <>
struct Raw128<uint8_t, 16> {
  using type = uint8x16_t;
};

template <>
struct Raw128<uint16_t, 8> {
  using type = uint16x8_t;
};

template <>
struct Raw128<uint32_t, 4> {
  using type = uint32x4_t;
};

template <>
struct Raw128<uint64_t, 2> {
  using type = uint64x2_t;
};

template <>
struct Raw128<int8_t, 16> {
  using type = int8x16_t;
};

template <>
struct Raw128<int16_t, 8> {
  using type = int16x8_t;
};

template <>
struct Raw128<int32_t, 4> {
  using type = int32x4_t;
};

template <>
struct Raw128<int64_t, 2> {
  using type = int64x2_t;
};

template <>
struct Raw128<float16_t, 8> {
  using type = uint16x8_t;
};

template <>
struct Raw128<bfloat16_t, 8> {
  using type = uint16x8_t;
};

template <>
struct Raw128<float, 4> {
  using type = float32x4_t;
};

#if HWY_ARCH_ARM_A64
template <>
struct Raw128<double, 2> {
  using type = float64x2_t;
};
#endif

// 64
template <>
struct Raw128<uint8_t, 8> {
  using type = uint8x8_t;
};

template <>
struct Raw128<uint16_t, 4> {
  using type = uint16x4_t;
};

template <>
struct Raw128<uint32_t, 2> {
  using type = uint32x2_t;
};

template <>
struct Raw128<uint64_t, 1> {
  using type = uint64x1_t;
};

template <>
struct Raw128<int8_t, 8> {
  using type = int8x8_t;
};

template <>
struct Raw128<int16_t, 4> {
  using type = int16x4_t;
};

template <>
struct Raw128<int32_t, 2> {
  using type = int32x2_t;
};

template <>
struct Raw128<int64_t, 1> {
  using type = int64x1_t;
};

template <>
struct Raw128<float16_t, 4> {
  using type = uint16x4_t;
};

template <>
struct Raw128<bfloat16_t, 4> {
  using type = uint16x4_t;
};

template <>
struct Raw128<float, 2> {
  using type = float32x2_t;
};

#if HWY_ARCH_ARM_A64
template <>
struct Raw128<double, 1> {
  using type = float64x1_t;
};
#endif

// 32 (same as 64)
template <>
struct Raw128<uint8_t, 4> : public Raw128<uint8_t, 8> {};

template <>
struct Raw128<uint16_t, 2> : public Raw128<uint16_t, 4> {};

template <>
struct Raw128<uint32_t, 1> : public Raw128<uint32_t, 2> {};

template <>
struct Raw128<int8_t, 4> : public Raw128<int8_t, 8> {};

template <>
struct Raw128<int16_t, 2> : public Raw128<int16_t, 4> {};

template <>
struct Raw128<int32_t, 1> : public Raw128<int32_t, 2> {};

template <>
struct Raw128<float16_t, 2> : public Raw128<float16_t, 4> {};

template <>
struct Raw128<bfloat16_t, 2> : public Raw128<bfloat16_t, 4> {};

template <>
struct Raw128<float, 1> : public Raw128<float, 2> {};

// 16 (same as 64)
template <>
struct Raw128<uint8_t, 2> : public Raw128<uint8_t, 8> {};

template <>
struct Raw128<uint16_t, 1> : public Raw128<uint16_t, 4> {};

template <>
struct Raw128<int8_t, 2> : public Raw128<int8_t, 8> {};

template <>
struct Raw128<int16_t, 1> : public Raw128<int16_t, 4> {};

template <>
struct Raw128<float16_t, 1> : public Raw128<float16_t, 4> {};

template <>
struct Raw128<bfloat16_t, 1> : public Raw128<bfloat16_t, 4> {};

// 8 (same as 64)
template <>
struct Raw128<uint8_t, 1> : public Raw128<uint8_t, 8> {};

template <>
struct Raw128<int8_t, 1> : public Raw128<int8_t, 8> {};

}  // namespace detail

template <typename T, size_t N = 16 / sizeof(T)>
class Vec128 {
  using Raw = typename detail::Raw128<T, N>::type;

 public:
  using PrivateT = T;                     // only for DFromV
  static constexpr size_t kPrivateN = N;  // only for DFromV

  HWY_INLINE Vec128() {}
  Vec128(const Vec128&) = default;
  Vec128& operator=(const Vec128&) = default;
  HWY_INLINE explicit Vec128(const Raw raw) : raw(raw) {}

  // Compound assignment. Only usable if there is a corresponding non-member
  // binary operator overload. For example, only f32 and f64 support division.
  HWY_INLINE Vec128& operator*=(const Vec128 other) {
    return *this = (*this * other);
  }
  HWY_INLINE Vec128& operator/=(const Vec128 other) {
    return *this = (*this / other);
  }
  HWY_INLINE Vec128& operator+=(const Vec128 other) {
    return *this = (*this + other);
  }
  HWY_INLINE Vec128& operator-=(const Vec128 other) {
    return *this = (*this - other);
  }
  HWY_INLINE Vec128& operator&=(const Vec128 other) {
    return *this = (*this & other);
  }
  HWY_INLINE Vec128& operator|=(const Vec128 other) {
    return *this = (*this | other);
  }
  HWY_INLINE Vec128& operator^=(const Vec128 other) {
    return *this = (*this ^ other);
  }

  Raw raw;
};

template <typename T>
using Vec64 = Vec128<T, 8 / sizeof(T)>;

template <typename T>
using Vec32 = Vec128<T, 4 / sizeof(T)>;

template <typename T>
using Vec16 = Vec128<T, 2 / sizeof(T)>;

// FF..FF or 0.
template <typename T, size_t N = 16 / sizeof(T)>
class Mask128 {
  // Arm C Language Extensions return and expect unsigned type.
  using Raw = typename detail::Raw128<MakeUnsigned<T>, N>::type;

 public:
  using PrivateT = T;                     // only for DFromM
  static constexpr size_t kPrivateN = N;  // only for DFromM

  HWY_INLINE Mask128() {}
  Mask128(const Mask128&) = default;
  Mask128& operator=(const Mask128&) = default;
  HWY_INLINE explicit Mask128(const Raw raw) : raw(raw) {}

  Raw raw;
};

template <typename T>
using Mask64 = Mask128<T, 8 / sizeof(T)>;

template <class V>
using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;

template <class M>
using DFromM = Simd<typename M::PrivateT, M::kPrivateN, 0>;

template <class V>
using TFromV = typename V::PrivateT;

// ------------------------------ Set

namespace detail {
// We want to route any combination of N/kPow2 to the intrinsics depending on
// whether the requested size is <= 64 bits or 128. HWY_NEON_BUILD_TPL is
// unconditional and currently does not accept inputs (such as whether the
// vector is 64 or 128-bit). Thus we are not able to use HWY_IF_V_SIZE_D for
// SFINAE. We instead define a private NativeSet which receives a Simd<> whose
// kPow2 has already been folded into its N.
#define HWY_NEON_BUILD_TPL_HWY_SET
#define HWY_NEON_BUILD_RET_HWY_SET(type, size) Vec128<type##_t, size>
#define HWY_NEON_BUILD_PARAM_HWY_SET(type, size) \
  Simd<type##_t, size, 0> /* tag */, type##_t t
#define HWY_NEON_BUILD_ARG_HWY_SET t

HWY_NEON_DEF_FUNCTION_ALL_TYPES(NativeSet, vdup, _n_, HWY_SET)

#undef HWY_NEON_BUILD_TPL_HWY_SET
#undef HWY_NEON_BUILD_RET_HWY_SET
#undef HWY_NEON_BUILD_PARAM_HWY_SET
#undef HWY_NEON_BUILD_ARG_HWY_SET

}  // namespace detail

// Full vector. Cannot yet use VFromD because that is defined in terms of Set.
// Do not use a typename T = TFromD<D> argument because T will be deduced from
// the actual argument type, which can differ from TFromD<D>.
template <class D, HWY_IF_V_SIZE_D(D, 16), typename T>
HWY_INLINE Vec128<TFromD<D>> Set(D /* tag */, T t) {
  return detail::NativeSet(Full128<TFromD<D>>(), static_cast<TFromD<D>>(t));
}

// Partial vector: create 64-bit and return wrapper.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T>
HWY_API Vec128<TFromD<D>, MaxLanes(D())> Set(D /* tag */, T t) {
  const Full64<TFromD<D>> dfull;
  return Vec128<TFromD<D>, MaxLanes(D())>(
      detail::NativeSet(dfull, static_cast<TFromD<D>>(t)).raw);
}

// BF16: return u16.
template <class D, HWY_IF_BF16_D(D)>
HWY_API Vec128<bfloat16_t, MaxLanes(D())> Set(D d, bfloat16_t t) {
  uint16_t tu;
  CopyBytes<sizeof(tu)>(&t, &tu);
  return Vec128<bfloat16_t, d.MaxLanes()>(Set(RebindToUnsigned<D>(), tu).raw);
}

// F16: return u16.
template <class D, HWY_IF_F16_D(D)>
HWY_API Vec128<float16_t, MaxLanes(D())> Set(D d, float16_t t) {
  uint16_t tu;
  CopyBytes<sizeof(tu)>(&t, &tu);
  return Vec128<float16_t, d.MaxLanes()>(Set(RebindToUnsigned<D>(), tu).raw);
}

template <class D>
using VFromD = decltype(Set(D(), TFromD<D>()));

template <class D>
HWY_API VFromD<D> Zero(D d) {
  // Default ctor also works for bfloat16_t and float16_t.
  return Set(d, TFromD<D>{});
}

HWY_DIAGNOSTICS(push)
HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
#if HWY_COMPILER_GCC_ACTUAL
HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wmaybe-uninitialized")
#endif

template <class D>
HWY_API VFromD<D> Undefined(D /*tag*/) {
  VFromD<D> v;
  return v;
}

HWY_DIAGNOSTICS(pop)

namespace detail {

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_T_SIZE_D(D, 1)>
HWY_INLINE VFromD<D> Iota0(D d) {
  const RebindToUnsigned<decltype(d)> du;
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
  typedef uint8_t GccU8RawVectType __attribute__((__vector_size__(8)));
  constexpr GccU8RawVectType kU8Iota0 = {0, 1, 2, 3, 4, 5, 6, 7};
  const VFromD<decltype(du)> vu8_iota0(reinterpret_cast<uint8x8_t>(kU8Iota0));
#else
  alignas(8) static constexpr uint8_t kU8Iota0[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  const VFromD<decltype(du)> vu8_iota0(
      Load(Full64<TFromD<decltype(du)>>(), kU8Iota0).raw);
#endif
  return BitCast(d, vu8_iota0);
}

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
HWY_INLINE VFromD<D> Iota0(D d) {
  const RebindToUnsigned<decltype(d)> du;
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
  typedef uint8_t GccU8RawVectType __attribute__((__vector_size__(16)));
  constexpr GccU8RawVectType kU8Iota0 = {0, 1, 2,  3,  4,  5,  6,  7,
                                         8, 9, 10, 11, 12, 13, 14, 15};
  const VFromD<decltype(du)> vu8_iota0(reinterpret_cast<uint8x16_t>(kU8Iota0));
#else
  alignas(16) static constexpr uint8_t kU8Iota0[16] = {
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
  const auto vu8_iota0 = Load(du, kU8Iota0);
#endif
  return BitCast(d, vu8_iota0);
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_T_SIZE_D(D, 2),
          HWY_IF_NOT_SPECIAL_FLOAT_D(D)>
HWY_INLINE VFromD<D> Iota0(D d) {
  const RebindToUnsigned<decltype(d)> du;
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
  typedef uint16_t GccU16RawVectType __attribute__((__vector_size__(8)));
  constexpr GccU16RawVectType kU16Iota0 = {0, 1, 2, 3};
  const VFromD<decltype(du)> vu16_iota0(
      reinterpret_cast<uint16x4_t>(kU16Iota0));
#else
  alignas(8) static constexpr uint16_t kU16Iota0[4] = {0, 1, 2, 3};
  const VFromD<decltype(du)> vu16_iota0{
      Load(Full64<TFromD<decltype(du)>>(), kU16Iota0).raw};
#endif
  return BitCast(d, vu16_iota0);
}

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 2),
          HWY_IF_NOT_SPECIAL_FLOAT_D(D)>
HWY_INLINE VFromD<D> Iota0(D d) {
  const RebindToUnsigned<decltype(d)> du;
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
  typedef uint16_t GccU16RawVectType __attribute__((__vector_size__(16)));
  constexpr GccU16RawVectType kU16Iota0 = {0, 1, 2, 3, 4, 5, 6, 7};
  const VFromD<decltype(du)> vu16_iota0(
      reinterpret_cast<uint16x8_t>(kU16Iota0));
#else
  alignas(16) static constexpr uint16_t kU16Iota0[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  const auto vu16_iota0 = Load(du, kU16Iota0);
#endif
  return BitCast(d, vu16_iota0);
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_UI32_D(D)>
HWY_INLINE VFromD<D> Iota0(D d) {
  const RebindToUnsigned<decltype(d)> du;
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
  typedef uint32_t GccU32RawVectType __attribute__((__vector_size__(8)));
  constexpr GccU32RawVectType kU32Iota0 = {0, 1};
  const VFromD<decltype(du)> vu32_iota0(
      reinterpret_cast<uint32x2_t>(kU32Iota0));
#else
  alignas(8) static constexpr uint32_t kU32Iota0[2] = {0, 1};
  const VFromD<decltype(du)> vu32_iota0{
      Load(Full64<TFromD<decltype(du)>>(), kU32Iota0).raw};
#endif
  return BitCast(d, vu32_iota0);
}

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI32_D(D)>
HWY_INLINE VFromD<D> Iota0(D d) {
  const RebindToUnsigned<decltype(d)> du;
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
  typedef uint32_t GccU32RawVectType __attribute__((__vector_size__(16)));
  constexpr GccU32RawVectType kU32Iota0 = {0, 1, 2, 3};
  const VFromD<decltype(du)> vu32_iota0(
      reinterpret_cast<uint32x4_t>(kU32Iota0));
#else
  alignas(16) static constexpr uint32_t kU32Iota0[4] = {0, 1, 2, 3};
  const auto vu32_iota0 = Load(du, kU32Iota0);
#endif
  return BitCast(d, vu32_iota0);
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)>
HWY_INLINE VFromD<D> Iota0(D d) {
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
  typedef float GccF32RawVectType __attribute__((__vector_size__(8)));
  constexpr GccF32RawVectType kF32Iota0 = {0.0f, 1.0f};
  return VFromD<decltype(d)>(reinterpret_cast<float32x2_t>(kF32Iota0));
#else
  alignas(8) static constexpr float kF32Iota0[2] = {0.0f, 1.0f};
  return VFromD<decltype(d)>{
      Load(Full64<TFromD<decltype(d)>>(), kF32Iota0).raw};
#endif
}

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
HWY_INLINE VFromD<D> Iota0(D d) {
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
  typedef float GccF32RawVectType __attribute__((__vector_size__(16)));
  constexpr GccF32RawVectType kF32Iota0 = {0.0f, 1.0f, 2.0f, 3.0f};
  return VFromD<decltype(d)>(reinterpret_cast<float32x4_t>(kF32Iota0));
#else
  alignas(16) static constexpr float kF32Iota0[4] = {0.0f, 1.0f, 2.0f, 3.0f};
  return Load(d, kF32Iota0);
#endif
}

template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_T_SIZE_D(D, 8)>
HWY_INLINE VFromD<D> Iota0(D d) {
  return Zero(d);
}

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI64_D(D)>
HWY_INLINE VFromD<D> Iota0(D d) {
  const RebindToUnsigned<decltype(d)> du;
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
  typedef uint64_t GccU64RawVectType __attribute__((__vector_size__(16)));
  constexpr GccU64RawVectType kU64Iota0 = {0, 1};
  const VFromD<decltype(du)> vu64_iota0(
      reinterpret_cast<uint64x2_t>(kU64Iota0));
#else
  alignas(16) static constexpr uint64_t kU64Iota0[4] = {0, 1};
  const auto vu64_iota0 = Load(du, kU64Iota0);
#endif
  return BitCast(d, vu64_iota0);
}

#if HWY_ARCH_ARM_A64
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
HWY_INLINE VFromD<D> Iota0(D d) {
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
  typedef double GccF64RawVectType __attribute__((__vector_size__(16)));
  constexpr GccF64RawVectType kF64Iota0 = {0.0, 1.0};
  return VFromD<decltype(d)>(reinterpret_cast<float64x2_t>(kF64Iota0));
#else
  alignas(16) static constexpr double kF64Iota0[4] = {0.0, 1.0};
  return Load(d, kF64Iota0);
#endif
}
#endif  // HWY_ARCH_ARM_A64

#if HWY_COMPILER_MSVC
template <class V, HWY_IF_V_SIZE_LE_V(V, 4)>
static HWY_INLINE V MaskOutIota(V v) {
  constexpr size_t kVecSizeInBytes = HWY_MAX_LANES_V(V) * sizeof(TFromV<V>);
  constexpr uint64_t kU64MaskOutMask =
      hwy::LimitsMax<hwy::UnsignedFromSize<kVecSizeInBytes>>();

  const DFromV<decltype(v)> d;
  const Repartition<uint8_t, decltype(d)> du8;
  using VU8 = VFromD<decltype(du8)>;
  const auto mask_out_mask =
      BitCast(d, VU8(vreinterpret_u8_u64(vdup_n_u64(kU64MaskOutMask))));
  return v & mask_out_mask;
}
template <class V, HWY_IF_V_SIZE_GT_V(V, 4)>
static HWY_INLINE V MaskOutIota(V v) {
  return v;
}
#endif

}  // namespace detail

template <class D, typename T2>
HWY_API VFromD<D> Iota(D d, const T2 first) {
  const auto result_iota =
      detail::Iota0(d) + Set(d, static_cast<TFromD<D>>(first));
#if HWY_COMPILER_MSVC
  return detail::MaskOutIota(result_iota);
#else
  return result_iota;
#endif
}

// ------------------------------ Tuple (VFromD)
#include "hwy/ops/tuple-inl.h"

// ------------------------------ Combine

// Full result
template <class D, HWY_IF_U8_D(D)>
HWY_API Vec128<uint8_t> Combine(D /* tag */, Vec64<uint8_t> hi,
                                Vec64<uint8_t> lo) {
  return Vec128<uint8_t>(vcombine_u8(lo.raw, hi.raw));
}
template <class D, HWY_IF_U16_D(D)>
HWY_API Vec128<uint16_t> Combine(D /* tag */, Vec64<uint16_t> hi,
                                 Vec64<uint16_t> lo) {
  return Vec128<uint16_t>(vcombine_u16(lo.raw, hi.raw));
}
template <class D, HWY_IF_U32_D(D)>
HWY_API Vec128<uint32_t> Combine(D /* tag */, Vec64<uint32_t> hi,
                                 Vec64<uint32_t> lo) {
  return Vec128<uint32_t>(vcombine_u32(lo.raw, hi.raw));
}
template <class D, HWY_IF_U64_D(D)>
HWY_API Vec128<uint64_t> Combine(D /* tag */, Vec64<uint64_t> hi,
                                 Vec64<uint64_t> lo) {
  return Vec128<uint64_t>(vcombine_u64(lo.raw, hi.raw));
}

template <class D, HWY_IF_I8_D(D)>
HWY_API Vec128<int8_t> Combine(D /* tag */, Vec64<int8_t> hi,
                               Vec64<int8_t> lo) {
  return Vec128<int8_t>(vcombine_s8(lo.raw, hi.raw));
}
template <class D, HWY_IF_I16_D(D)>
HWY_API Vec128<int16_t> Combine(D /* tag */, Vec64<int16_t> hi,
                                Vec64<int16_t> lo) {
  return Vec128<int16_t>(vcombine_s16(lo.raw, hi.raw));
}
template <class D, HWY_IF_I32_D(D)>
HWY_API Vec128<int32_t> Combine(D /* tag */, Vec64<int32_t> hi,
                                Vec64<int32_t> lo) {
  return Vec128<int32_t>(vcombine_s32(lo.raw, hi.raw));
}
template <class D, HWY_IF_I64_D(D)>
HWY_API Vec128<int64_t> Combine(D /* tag */, Vec64<int64_t> hi,
                                Vec64<int64_t> lo) {
  return Vec128<int64_t>(vcombine_s64(lo.raw, hi.raw));
}

template <class D, HWY_IF_F32_D(D)>
HWY_API Vec128<float> Combine(D /* tag */, Vec64<float> hi, Vec64<float> lo) {
  return Vec128<float>(vcombine_f32(lo.raw, hi.raw));
}
#if HWY_ARCH_ARM_A64
template <class D, HWY_IF_F64_D(D)>
HWY_API Vec128<double> Combine(D /* tag */, Vec64<double> hi,
                               Vec64<double> lo) {
  return Vec128<double>(vcombine_f64(lo.raw, hi.raw));
}
#endif

// ------------------------------ BitCast

namespace detail {

// Converts from Vec128<T, N> to Vec128<uint8_t, N * sizeof(T)> using the
// vreinterpret*_u8_*() set of functions.
#define HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8
#define HWY_NEON_BUILD_RET_HWY_CAST_TO_U8(type, size) \
  Vec128<uint8_t, size * sizeof(type##_t)>
#define HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8(type, size) Vec128<type##_t, size> v
#define HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8 v.raw

// Special case of u8 to u8 since vreinterpret*_u8_u8 is obviously not defined.
template <size_t N>
HWY_INLINE Vec128<uint8_t, N> BitCastToByte(Vec128<uint8_t, N> v) {
  return v;
}

HWY_NEON_DEF_FUNCTION_ALL_FLOATS(BitCastToByte, vreinterpret, _u8_,
                                 HWY_CAST_TO_U8)
HWY_NEON_DEF_FUNCTION_INTS(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
HWY_NEON_DEF_FUNCTION_UINT_16(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
HWY_NEON_DEF_FUNCTION_UINT_32(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
HWY_NEON_DEF_FUNCTION_UINT_64(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)

// Special cases for [b]float16_t, which have the same Raw as uint16_t.
template <size_t N>
HWY_INLINE Vec128<uint8_t, N * 2> BitCastToByte(Vec128<float16_t, N> v) {
  return BitCastToByte(Vec128<uint16_t, N>(v.raw));
}
template <size_t N>
HWY_INLINE Vec128<uint8_t, N * 2> BitCastToByte(Vec128<bfloat16_t, N> v) {
  return BitCastToByte(Vec128<uint16_t, N>(v.raw));
}

#undef HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8
#undef HWY_NEON_BUILD_RET_HWY_CAST_TO_U8
#undef HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8
#undef HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8

template <class D, HWY_IF_U8_D(D)>
HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */, VFromD<D> v) {
  return v;
}

// 64-bit or less:

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I8_D(D)>
HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
                                     VFromD<RebindToUnsigned<D>> v) {
  return VFromD<D>(vreinterpret_s8_u8(v.raw));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)>
HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
                                     VFromD<Repartition<uint8_t, D>> v) {
  return VFromD<D>(vreinterpret_u16_u8(v.raw));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I16_D(D)>
HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
                                     VFromD<Repartition<uint8_t, D>> v) {
  return VFromD<D>(vreinterpret_s16_u8(v.raw));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
                                     VFromD<Repartition<uint8_t, D>> v) {
  return VFromD<D>(vreinterpret_u32_u8(v.raw));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
                                     VFromD<Repartition<uint8_t, D>> v) {
  return VFromD<D>(vreinterpret_s32_u8(v.raw));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)>
HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
                                     VFromD<Repartition<uint8_t, D>> v) {
  return VFromD<D>(vreinterpret_f32_u8(v.raw));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U64_D(D)>
HWY_INLINE Vec64<uint64_t> BitCastFromByte(D /* tag */, Vec64<uint8_t> v) {
  return Vec64<uint64_t>(vreinterpret_u64_u8(v.raw));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I64_D(D)>
HWY_INLINE Vec64<int64_t> BitCastFromByte(D /* tag */, Vec64<uint8_t> v) {
  return Vec64<int64_t>(vreinterpret_s64_u8(v.raw));
}
#if HWY_ARCH_ARM_A64
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F64_D(D)>
HWY_INLINE Vec64<double> BitCastFromByte(D /* tag */, Vec64<uint8_t> v) {
  return Vec64<double>(vreinterpret_f64_u8(v.raw));
}
#endif

// 128-bit full:

template <class D, HWY_IF_I8_D(D)>
HWY_INLINE Vec128<int8_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
  return Vec128<int8_t>(vreinterpretq_s8_u8(v.raw));
}
template <class D, HWY_IF_U16_D(D)>
HWY_INLINE Vec128<uint16_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
  return Vec128<uint16_t>(vreinterpretq_u16_u8(v.raw));
}
template <class D, HWY_IF_I16_D(D)>
HWY_INLINE Vec128<int16_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
  return Vec128<int16_t>(vreinterpretq_s16_u8(v.raw));
}
template <class D, HWY_IF_U32_D(D)>
HWY_INLINE Vec128<uint32_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
  return Vec128<uint32_t>(vreinterpretq_u32_u8(v.raw));
}
template <class D, HWY_IF_I32_D(D)>
HWY_INLINE Vec128<int32_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
  return Vec128<int32_t>(vreinterpretq_s32_u8(v.raw));
}
template <class D, HWY_IF_F32_D(D)>
HWY_INLINE Vec128<float> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
  return Vec128<float>(vreinterpretq_f32_u8(v.raw));
}
template <class D, HWY_IF_U64_D(D)>
HWY_INLINE Vec128<uint64_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
  return Vec128<uint64_t>(vreinterpretq_u64_u8(v.raw));
}
template <class D, HWY_IF_I64_D(D)>
HWY_INLINE Vec128<int64_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
  return Vec128<int64_t>(vreinterpretq_s64_u8(v.raw));
}

#if HWY_ARCH_ARM_A64
template <class D, HWY_IF_F64_D(D)>
HWY_INLINE Vec128<double> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
  return Vec128<double>(vreinterpretq_f64_u8(v.raw));
}
#endif

// Special cases for [b]float16_t, which have the same Raw as uint16_t.
template <class D, HWY_IF_F16_D(D)>
HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
                                     VFromD<Repartition<uint8_t, D>> v) {
  return VFromD<D>(BitCastFromByte(RebindToUnsigned<D>(), v).raw);
}
template <class D, HWY_IF_BF16_D(D)>
HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
                                     VFromD<Repartition<uint8_t, D>> v) {
  return VFromD<D>(BitCastFromByte(RebindToUnsigned<D>(), v).raw);
}

}  // namespace detail

template <class D, class FromT>
HWY_API VFromD<D> BitCast(D d,
                          Vec128<FromT, Repartition<FromT, D>().MaxLanes()> v) {
  return detail::BitCastFromByte(d, detail::BitCastToByte(v));
}

// ------------------------------ ResizeBitCast

// <= 8 byte vector to <= 8 byte vector
template <class D, class FromV, HWY_IF_V_SIZE_LE_V(FromV, 8),
          HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
  const Repartition<uint8_t, decltype(d)> du8;
  return BitCast(d, VFromD<decltype(du8)>{detail::BitCastToByte(v).raw});
}

// 16-byte vector to 16-byte vector: same as BitCast
template <class D, class FromV, HWY_IF_V_SIZE_V(FromV, 16),
          HWY_IF_V_SIZE_D(D, 16)>
HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
  return BitCast(d, v);
}

// 16-byte vector to <= 8-byte vector
template <class D, class FromV, HWY_IF_V_SIZE_V(FromV, 16),
          HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
  const DFromV<decltype(v)> d_from;
  const Half<decltype(d_from)> dh_from;
  return ResizeBitCast(d, LowerHalf(dh_from, v));
}

// <= 8-bit vector to 16-byte vector
template <class D, class FromV, HWY_IF_V_SIZE_LE_V(FromV, 8),
          HWY_IF_V_SIZE_D(D, 16)>
HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
  const Full64<TFromV<FromV>> d_full64_from;
  const Full128<TFromV<FromV>> d_full128_from;
  return BitCast(d, Combine(d_full128_from, Zero(d_full64_from),
                            ResizeBitCast(d_full64_from, v)));
}

// ------------------------------ GetLane

namespace detail {
#define HWY_NEON_BUILD_TPL_HWY_GET template <size_t kLane>
#define HWY_NEON_BUILD_RET_HWY_GET(type, size) type##_t
#define HWY_NEON_BUILD_PARAM_HWY_GET(type, size) Vec128<type##_t, size> v
#define HWY_NEON_BUILD_ARG_HWY_GET v.raw, kLane

HWY_NEON_DEF_FUNCTION_ALL_TYPES(GetLane, vget, _lane_, HWY_GET)

#undef HWY_NEON_BUILD_TPL_HWY_GET
#undef HWY_NEON_BUILD_RET_HWY_GET
#undef HWY_NEON_BUILD_PARAM_HWY_GET
#undef HWY_NEON_BUILD_ARG_HWY_GET

}  // namespace detail

template <class V>
HWY_API TFromV<V> GetLane(const V v) {
  return detail::GetLane<0>(v);
}

// ------------------------------ ExtractLane

// Requires one overload per vector length because GetLane<3> is a compile error
// if v is a uint32x2_t.
template <typename T>
HWY_API T ExtractLane(const Vec128<T, 1> v, size_t i) {
  HWY_DASSERT(i == 0);
  (void)i;
  return detail::GetLane<0>(v);
}

template <typename T>
HWY_API T ExtractLane(const Vec128<T, 2> v, size_t i) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::GetLane<0>(v);
      case 1:
        return detail::GetLane<1>(v);
    }
  }
#endif
  alignas(16) T lanes[2];
  Store(v, DFromV<decltype(v)>(), lanes);
  return lanes[i];
}

template <typename T>
HWY_API T ExtractLane(const Vec128<T, 4> v, size_t i) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::GetLane<0>(v);
      case 1:
        return detail::GetLane<1>(v);
      case 2:
        return detail::GetLane<2>(v);
      case 3:
        return detail::GetLane<3>(v);
    }
  }
#endif
  alignas(16) T lanes[4];
  Store(v, DFromV<decltype(v)>(), lanes);
  return lanes[i];
}

template <typename T>
HWY_API T ExtractLane(const Vec128<T, 8> v, size_t i) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::GetLane<0>(v);
      case 1:
        return detail::GetLane<1>(v);
      case 2:
        return detail::GetLane<2>(v);
      case 3:
        return detail::GetLane<3>(v);
      case 4:
        return detail::GetLane<4>(v);
      case 5:
        return detail::GetLane<5>(v);
      case 6:
        return detail::GetLane<6>(v);
      case 7:
        return detail::GetLane<7>(v);
    }
  }
#endif
  alignas(16) T lanes[8];
  Store(v, DFromV<decltype(v)>(), lanes);
  return lanes[i];
}

template <typename T>
HWY_API T ExtractLane(const Vec128<T, 16> v, size_t i) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::GetLane<0>(v);
      case 1:
        return detail::GetLane<1>(v);
      case 2:
        return detail::GetLane<2>(v);
      case 3:
        return detail::GetLane<3>(v);
      case 4:
        return detail::GetLane<4>(v);
      case 5:
        return detail::GetLane<5>(v);
      case 6:
        return detail::GetLane<6>(v);
      case 7:
        return detail::GetLane<7>(v);
      case 8:
        return detail::GetLane<8>(v);
      case 9:
        return detail::GetLane<9>(v);
      case 10:
        return detail::GetLane<10>(v);
      case 11:
        return detail::GetLane<11>(v);
      case 12:
        return detail::GetLane<12>(v);
      case 13:
        return detail::GetLane<13>(v);
      case 14:
        return detail::GetLane<14>(v);
      case 15:
        return detail::GetLane<15>(v);
    }
  }
#endif
  alignas(16) T lanes[16];
  Store(v, DFromV<decltype(v)>(), lanes);
  return lanes[i];
}

// ------------------------------ InsertLane

namespace detail {
#define HWY_NEON_BUILD_TPL_HWY_INSERT template <size_t kLane>
#define HWY_NEON_BUILD_RET_HWY_INSERT(type, size) Vec128<type##_t, size>
#define HWY_NEON_BUILD_PARAM_HWY_INSERT(type, size) \
  Vec128<type##_t, size> v, type##_t t
#define HWY_NEON_BUILD_ARG_HWY_INSERT t, v.raw, kLane

HWY_NEON_DEF_FUNCTION_ALL_TYPES(InsertLane, vset, _lane_, HWY_INSERT)

#undef HWY_NEON_BUILD_TPL_HWY_INSERT
#undef HWY_NEON_BUILD_RET_HWY_INSERT
#undef HWY_NEON_BUILD_PARAM_HWY_INSERT
#undef HWY_NEON_BUILD_ARG_HWY_INSERT

}  // namespace detail

// Requires one overload per vector length because InsertLane<3> may be a
// compile error.

template <typename T>
HWY_API Vec128<T, 1> InsertLane(const Vec128<T, 1> v, size_t i, T t) {
  HWY_DASSERT(i == 0);
  (void)i;
  return Set(DFromV<decltype(v)>(), t);
}

template <typename T>
HWY_API Vec128<T, 2> InsertLane(const Vec128<T, 2> v, size_t i, T t) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::InsertLane<0>(v, t);
      case 1:
        return detail::InsertLane<1>(v, t);
    }
  }
#endif
  const DFromV<decltype(v)> d;
  alignas(16) T lanes[2];
  Store(v, d, lanes);
  lanes[i] = t;
  return Load(d, lanes);
}

template <typename T>
HWY_API Vec128<T, 4> InsertLane(const Vec128<T, 4> v, size_t i, T t) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::InsertLane<0>(v, t);
      case 1:
        return detail::InsertLane<1>(v, t);
      case 2:
        return detail::InsertLane<2>(v, t);
      case 3:
        return detail::InsertLane<3>(v, t);
    }
  }
#endif
  const DFromV<decltype(v)> d;
  alignas(16) T lanes[4];
  Store(v, d, lanes);
  lanes[i] = t;
  return Load(d, lanes);
}

template <typename T>
HWY_API Vec128<T, 8> InsertLane(const Vec128<T, 8> v, size_t i, T t) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::InsertLane<0>(v, t);
      case 1:
        return detail::InsertLane<1>(v, t);
      case 2:
        return detail::InsertLane<2>(v, t);
      case 3:
        return detail::InsertLane<3>(v, t);
      case 4:
        return detail::InsertLane<4>(v, t);
      case 5:
        return detail::InsertLane<5>(v, t);
      case 6:
        return detail::InsertLane<6>(v, t);
      case 7:
        return detail::InsertLane<7>(v, t);
    }
  }
#endif
  const DFromV<decltype(v)> d;
  alignas(16) T lanes[8];
  Store(v, d, lanes);
  lanes[i] = t;
  return Load(d, lanes);
}

template <typename T>
HWY_API Vec128<T, 16> InsertLane(const Vec128<T, 16> v, size_t i, T t) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::InsertLane<0>(v, t);
      case 1:
        return detail::InsertLane<1>(v, t);
      case 2:
        return detail::InsertLane<2>(v, t);
      case 3:
        return detail::InsertLane<3>(v, t);
      case 4:
        return detail::InsertLane<4>(v, t);
      case 5:
        return detail::InsertLane<5>(v, t);
      case 6:
        return detail::InsertLane<6>(v, t);
      case 7:
        return detail::InsertLane<7>(v, t);
      case 8:
        return detail::InsertLane<8>(v, t);
      case 9:
        return detail::InsertLane<9>(v, t);
      case 10:
        return detail::InsertLane<10>(v, t);
      case 11:
        return detail::InsertLane<11>(v, t);
      case 12:
        return detail::InsertLane<12>(v, t);
      case 13:
        return detail::InsertLane<13>(v, t);
      case 14:
        return detail::InsertLane<14>(v, t);
      case 15:
        return detail::InsertLane<15>(v, t);
    }
  }
#endif
  const DFromV<decltype(v)> d;
  alignas(16) T lanes[16];
  Store(v, d, lanes);
  lanes[i] = t;
  return Load(d, lanes);
}

// ================================================== ARITHMETIC

// ------------------------------ Addition
HWY_NEON_DEF_FUNCTION_ALL_TYPES(operator+, vadd, _, 2)

// ------------------------------ Subtraction
HWY_NEON_DEF_FUNCTION_ALL_TYPES(operator-, vsub, _, 2)

// ------------------------------ SumsOf8

HWY_API Vec128<uint64_t> SumsOf8(const Vec128<uint8_t> v) {
  return Vec128<uint64_t>(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v.raw))));
}
HWY_API Vec64<uint64_t> SumsOf8(const Vec64<uint8_t> v) {
  return Vec64<uint64_t>(vpaddl_u32(vpaddl_u16(vpaddl_u8(v.raw))));
}

// ------------------------------ SaturatedAdd

#ifdef HWY_NATIVE_I32_SATURATED_ADDSUB
#undef HWY_NATIVE_I32_SATURATED_ADDSUB
#else
#define HWY_NATIVE_I32_SATURATED_ADDSUB
#endif

#ifdef HWY_NATIVE_U32_SATURATED_ADDSUB
#undef HWY_NATIVE_U32_SATURATED_ADDSUB
#else
#define HWY_NATIVE_U32_SATURATED_ADDSUB
#endif

#ifdef HWY_NATIVE_I64_SATURATED_ADDSUB
#undef HWY_NATIVE_I64_SATURATED_ADDSUB
#else
#define HWY_NATIVE_I64_SATURATED_ADDSUB
#endif

#ifdef HWY_NATIVE_U64_SATURATED_ADDSUB
#undef HWY_NATIVE_U64_SATURATED_ADDSUB
#else
#define HWY_NATIVE_U64_SATURATED_ADDSUB
#endif

// Returns a + b clamped to the destination range.
HWY_NEON_DEF_FUNCTION_INTS_UINTS(SaturatedAdd, vqadd, _, 2)

// ------------------------------ SaturatedSub

// Returns a - b clamped to the destination range.
HWY_NEON_DEF_FUNCTION_INTS_UINTS(SaturatedSub, vqsub, _, 2)

// ------------------------------ Average

// Returns (a + b + 1) / 2
HWY_NEON_DEF_FUNCTION_UINT_8(AverageRound, vrhadd, _, 2)
HWY_NEON_DEF_FUNCTION_UINT_16(AverageRound, vrhadd, _, 2)

// ------------------------------ Neg

HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Neg, vneg, _, 1)
HWY_NEON_DEF_FUNCTION_INT_8_16_32(Neg, vneg, _, 1)  // i64 implemented below

HWY_API Vec64<int64_t> Neg(const Vec64<int64_t> v) {
#if HWY_ARCH_ARM_A64
  return Vec64<int64_t>(vneg_s64(v.raw));
#else
  return Zero(DFromV<decltype(v)>()) - v;
#endif
}

HWY_API Vec128<int64_t> Neg(const Vec128<int64_t> v) {
#if HWY_ARCH_ARM_A64
  return Vec128<int64_t>(vnegq_s64(v.raw));
#else
  return Zero(DFromV<decltype(v)>()) - v;
#endif
}

// ------------------------------ ShiftLeft

// Customize HWY_NEON_DEF_FUNCTION to special-case count=0 (not supported).
#pragma push_macro("HWY_NEON_DEF_FUNCTION")
#undef HWY_NEON_DEF_FUNCTION
#define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args)   \
  template <int kBits>                                                         \
  HWY_API Vec128<type##_t, size> name(const Vec128<type##_t, size> v) {        \
    return kBits == 0 ? v                                                      \
                      : Vec128<type##_t, size>(HWY_NEON_EVAL(                  \
                            prefix##infix##suffix, v.raw, HWY_MAX(1, kBits))); \
  }

HWY_NEON_DEF_FUNCTION_INTS_UINTS(ShiftLeft, vshl, _n_, ignored)

HWY_NEON_DEF_FUNCTION_UINTS(ShiftRight, vshr, _n_, ignored)
HWY_NEON_DEF_FUNCTION_INTS(ShiftRight, vshr, _n_, ignored)

#pragma pop_macro("HWY_NEON_DEF_FUNCTION")

// ------------------------------ RotateRight (ShiftRight, Or)
template <int kBits, typename T, size_t N>
HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
  constexpr size_t kSizeInBits = sizeof(T) * 8;
  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
  if (kBits == 0) return v;
  return Or(ShiftRight<kBits>(v),
            ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
}

// NOTE: vxarq_u64 can be applied to uint64_t, but we do not yet have a
// mechanism for checking for extensions to Armv8.

// ------------------------------ Shl

HWY_API Vec128<uint8_t> operator<<(Vec128<uint8_t> v, Vec128<uint8_t> bits) {
  return Vec128<uint8_t>(vshlq_u8(v.raw, vreinterpretq_s8_u8(bits.raw)));
}
template <size_t N, HWY_IF_V_SIZE_LE(uint8_t, N, 8)>
HWY_API Vec128<uint8_t, N> operator<<(Vec128<uint8_t, N> v,
                                      Vec128<uint8_t, N> bits) {
  return Vec128<uint8_t, N>(vshl_u8(v.raw, vreinterpret_s8_u8(bits.raw)));
}

HWY_API Vec128<uint16_t> operator<<(Vec128<uint16_t> v, Vec128<uint16_t> bits) {
  return Vec128<uint16_t>(vshlq_u16(v.raw, vreinterpretq_s16_u16(bits.raw)));
}
template <size_t N, HWY_IF_V_SIZE_LE(uint16_t, N, 8)>
HWY_API Vec128<uint16_t, N> operator<<(Vec128<uint16_t, N> v,
                                       Vec128<uint16_t, N> bits) {
  return Vec128<uint16_t, N>(vshl_u16(v.raw, vreinterpret_s16_u16(bits.raw)));
}

HWY_API Vec128<uint32_t> operator<<(Vec128<uint32_t> v, Vec128<uint32_t> bits) {
  return Vec128<uint32_t>(vshlq_u32(v.raw, vreinterpretq_s32_u32(bits.raw)));
}
template <size_t N, HWY_IF_V_SIZE_LE(uint32_t, N, 8)>
HWY_API Vec128<uint32_t, N> operator<<(Vec128<uint32_t, N> v,
                                       Vec128<uint32_t, N> bits) {
  return Vec128<uint32_t, N>(vshl_u32(v.raw, vreinterpret_s32_u32(bits.raw)));
}

HWY_API Vec128<uint64_t> operator<<(Vec128<uint64_t> v, Vec128<uint64_t> bits) {
  return Vec128<uint64_t>(vshlq_u64(v.raw, vreinterpretq_s64_u64(bits.raw)));
}
HWY_API Vec64<uint64_t> operator<<(Vec64<uint64_t> v, Vec64<uint64_t> bits) {
  return Vec64<uint64_t>(vshl_u64(v.raw, vreinterpret_s64_u64(bits.raw)));
}

HWY_API Vec128<int8_t> operator<<(Vec128<int8_t> v, Vec128<int8_t> bits) {
  return Vec128<int8_t>(vshlq_s8(v.raw, bits.raw));
}
template <size_t N, HWY_IF_V_SIZE_LE(int8_t, N, 8)>
HWY_API Vec128<int8_t, N> operator<<(Vec128<int8_t, N> v,
                                     Vec128<int8_t, N> bits) {
  return Vec128<int8_t, N>(vshl_s8(v.raw, bits.raw));
}

HWY_API Vec128<int16_t> operator<<(Vec128<int16_t> v, Vec128<int16_t> bits) {
  return Vec128<int16_t>(vshlq_s16(v.raw, bits.raw));
}
template <size_t N, HWY_IF_V_SIZE_LE(int16_t, N, 8)>
HWY_API Vec128<int16_t, N> operator<<(Vec128<int16_t, N> v,
                                      Vec128<int16_t, N> bits) {
  return Vec128<int16_t, N>(vshl_s16(v.raw, bits.raw));
}

HWY_API Vec128<int32_t> operator<<(Vec128<int32_t> v, Vec128<int32_t> bits) {
  return Vec128<int32_t>(vshlq_s32(v.raw, bits.raw));
}
template <size_t N, HWY_IF_V_SIZE_LE(int32_t, N, 8)>
HWY_API Vec128<int32_t, N> operator<<(Vec128<int32_t, N> v,
                                      Vec128<int32_t, N> bits) {
  return Vec128<int32_t, N>(vshl_s32(v.raw, bits.raw));
}

HWY_API Vec128<int64_t> operator<<(Vec128<int64_t> v, Vec128<int64_t> bits) {
  return Vec128<int64_t>(vshlq_s64(v.raw, bits.raw));
}
HWY_API Vec64<int64_t> operator<<(Vec64<int64_t> v, Vec64<int64_t> bits) {
  return Vec64<int64_t>(vshl_s64(v.raw, bits.raw));
}

// ------------------------------ Shr (Neg)

HWY_API Vec128<uint8_t> operator>>(Vec128<uint8_t> v, Vec128<uint8_t> bits) {
  const RebindToSigned<DFromV<decltype(v)>> di;
  const int8x16_t neg_bits = Neg(BitCast(di, bits)).raw;
  return Vec128<uint8_t>(vshlq_u8(v.raw, neg_bits));
}
template <size_t N, HWY_IF_V_SIZE_LE(uint8_t, N, 8)>
HWY_API Vec128<uint8_t, N> operator>>(Vec128<uint8_t, N> v,
                                      Vec128<uint8_t, N> bits) {
  const RebindToSigned<DFromV<decltype(v)>> di;
  const int8x8_t neg_bits = Neg(BitCast(di, bits)).raw;
  return Vec128<uint8_t, N>(vshl_u8(v.raw, neg_bits));
}

HWY_API Vec128<uint16_t> operator>>(Vec128<uint16_t> v, Vec128<uint16_t> bits) {
  const RebindToSigned<DFromV<decltype(v)>> di;
  const int16x8_t neg_bits = Neg(BitCast(di, bits)).raw;
  return Vec128<uint16_t>(vshlq_u16(v.raw, neg_bits));
}
template <size_t N, HWY_IF_V_SIZE_LE(uint16_t, N, 8)>
HWY_API Vec128<uint16_t, N> operator>>(Vec128<uint16_t, N> v,
                                       Vec128<uint16_t, N> bits) {
  const RebindToSigned<DFromV<decltype(v)>> di;
  const int16x4_t neg_bits = Neg(BitCast(di, bits)).raw;
  return Vec128<uint16_t, N>(vshl_u16(v.raw, neg_bits));
}

HWY_API Vec128<uint32_t> operator>>(Vec128<uint32_t> v, Vec128<uint32_t> bits) {
  const RebindToSigned<DFromV<decltype(v)>> di;
  const int32x4_t neg_bits = Neg(BitCast(di, bits)).raw;
  return Vec128<uint32_t>(vshlq_u32(v.raw, neg_bits));
}
template <size_t N, HWY_IF_V_SIZE_LE(uint32_t, N, 8)>
HWY_API Vec128<uint32_t, N> operator>>(Vec128<uint32_t, N> v,
                                       Vec128<uint32_t, N> bits) {
  const RebindToSigned<DFromV<decltype(v)>> di;
  const int32x2_t neg_bits = Neg(BitCast(di, bits)).raw;
  return Vec128<uint32_t, N>(vshl_u32(v.raw, neg_bits));
}

HWY_API Vec128<uint64_t> operator>>(Vec128<uint64_t> v, Vec128<uint64_t> bits) {
  const RebindToSigned<DFromV<decltype(v)>> di;
  const int64x2_t neg_bits = Neg(BitCast(di, bits)).raw;
  return Vec128<uint64_t>(vshlq_u64(v.raw, neg_bits));
}
HWY_API Vec64<uint64_t> operator>>(Vec64<uint64_t> v, Vec64<uint64_t> bits) {
  const RebindToSigned<DFromV<decltype(v)>> di;
  const int64x1_t neg_bits = Neg(BitCast(di, bits)).raw;
  return Vec64<uint64_t>(vshl_u64(v.raw, neg_bits));
}

HWY_API Vec128<int8_t> operator>>(Vec128<int8_t> v, Vec128<int8_t> bits) {
  return Vec128<int8_t>(vshlq_s8(v.raw, Neg(bits).raw));
}
template <size_t N, HWY_IF_V_SIZE_LE(int8_t, N, 8)>
HWY_API Vec128<int8_t, N> operator>>(Vec128<int8_t, N> v,
                                     Vec128<int8_t, N> bits) {
  return Vec128<int8_t, N>(vshl_s8(v.raw, Neg(bits).raw));
}

HWY_API Vec128<int16_t> operator>>(Vec128<int16_t> v, Vec128<int16_t> bits) {
  return Vec128<int16_t>(vshlq_s16(v.raw, Neg(bits).raw));
}
template <size_t N, HWY_IF_V_SIZE_LE(int16_t, N, 8)>
HWY_API Vec128<int16_t, N> operator>>(Vec128<int16_t, N> v,
                                      Vec128<int16_t, N> bits) {
  return Vec128<int16_t, N>(vshl_s16(v.raw, Neg(bits).raw));
}

HWY_API Vec128<int32_t> operator>>(Vec128<int32_t> v, Vec128<int32_t> bits) {
  return Vec128<int32_t>(vshlq_s32(v.raw, Neg(bits).raw));
}
template <size_t N, HWY_IF_V_SIZE_LE(int32_t, N, 8)>
HWY_API Vec128<int32_t, N> operator>>(Vec128<int32_t, N> v,
                                      Vec128<int32_t, N> bits) {
  return Vec128<int32_t, N>(vshl_s32(v.raw, Neg(bits).raw));
}

HWY_API Vec128<int64_t> operator>>(Vec128<int64_t> v, Vec128<int64_t> bits) {
  return Vec128<int64_t>(vshlq_s64(v.raw, Neg(bits).raw));
}
HWY_API Vec64<int64_t> operator>>(Vec64<int64_t> v, Vec64<int64_t> bits) {
  return Vec64<int64_t>(vshl_s64(v.raw, Neg(bits).raw));
}

// ------------------------------ ShiftLeftSame (Shl)

template <typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftSame(const Vec128<T, N> v, int bits) {
  return v << Set(DFromV<decltype(v)>(), static_cast<T>(bits));
}
template <typename T, size_t N>
HWY_API Vec128<T, N> ShiftRightSame(const Vec128<T, N> v, int bits) {
  return v >> Set(DFromV<decltype(v)>(), static_cast<T>(bits));
}

// ------------------------------ Int/float multiplication

// Per-target flag to prevent generic_ops-inl.h from defining 8-bit operator*.
#ifdef HWY_NATIVE_MUL_8
#undef HWY_NATIVE_MUL_8
#else
#define HWY_NATIVE_MUL_8
#endif

// All except ui64
HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator*, vmul, _, 2)
HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator*, vmul, _, 2)
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator*, vmul, _, 2)

// ------------------------------ Integer multiplication

// Returns the upper 16 bits of a * b in each lane.
HWY_API Vec128<int16_t> MulHigh(Vec128<int16_t> a, Vec128<int16_t> b) {
  int32x4_t rlo = vmull_s16(vget_low_s16(a.raw), vget_low_s16(b.raw));
#if HWY_ARCH_ARM_A64
  int32x4_t rhi = vmull_high_s16(a.raw, b.raw);
#else
  int32x4_t rhi = vmull_s16(vget_high_s16(a.raw), vget_high_s16(b.raw));
#endif
  return Vec128<int16_t>(
      vuzp2q_s16(vreinterpretq_s16_s32(rlo), vreinterpretq_s16_s32(rhi)));
}
HWY_API Vec128<uint16_t> MulHigh(Vec128<uint16_t> a, Vec128<uint16_t> b) {
  uint32x4_t rlo = vmull_u16(vget_low_u16(a.raw), vget_low_u16(b.raw));
#if HWY_ARCH_ARM_A64
  uint32x4_t rhi = vmull_high_u16(a.raw, b.raw);
#else
  uint32x4_t rhi = vmull_u16(vget_high_u16(a.raw), vget_high_u16(b.raw));
#endif
  return Vec128<uint16_t>(
      vuzp2q_u16(vreinterpretq_u16_u32(rlo), vreinterpretq_u16_u32(rhi)));
}

template <size_t N, HWY_IF_V_SIZE_LE(int16_t, N, 8)>
HWY_API Vec128<int16_t, N> MulHigh(Vec128<int16_t, N> a, Vec128<int16_t, N> b) {
  int16x8_t hi_lo = vreinterpretq_s16_s32(vmull_s16(a.raw, b.raw));
  return Vec128<int16_t, N>(vget_low_s16(vuzp2q_s16(hi_lo, hi_lo)));
}
template <size_t N, HWY_IF_V_SIZE_LE(uint16_t, N, 8)>
HWY_API Vec128<uint16_t, N> MulHigh(Vec128<uint16_t, N> a,
                                    Vec128<uint16_t, N> b) {
  uint16x8_t hi_lo = vreinterpretq_u16_u32(vmull_u16(a.raw, b.raw));
  return Vec128<uint16_t, N>(vget_low_u16(vuzp2q_u16(hi_lo, hi_lo)));
}

HWY_API Vec128<int16_t> MulFixedPoint15(Vec128<int16_t> a, Vec128<int16_t> b) {
  return Vec128<int16_t>(vqrdmulhq_s16(a.raw, b.raw));
}
template <size_t N, HWY_IF_V_SIZE_LE(int16_t, N, 8)>
HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a,
                                           Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>(vqrdmulh_s16(a.raw, b.raw));
}

// ------------------------------ Floating-point division

// Approximate reciprocal
HWY_API Vec128<float> ApproximateReciprocal(const Vec128<float> v) {
  return Vec128<float>(vrecpeq_f32(v.raw));
}
template <size_t N>
HWY_API Vec128<float, N> ApproximateReciprocal(const Vec128<float, N> v) {
  return Vec128<float, N>(vrecpe_f32(v.raw));
}

#if HWY_ARCH_ARM_A64
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator/, vdiv, _, 2)
#else
// Not defined on armv7: approximate
namespace detail {

HWY_INLINE Vec128<float> ReciprocalNewtonRaphsonStep(
    const Vec128<float> recip, const Vec128<float> divisor) {
  return Vec128<float>(vrecpsq_f32(recip.raw, divisor.raw));
}
template <size_t N>
HWY_INLINE Vec128<float, N> ReciprocalNewtonRaphsonStep(
    const Vec128<float, N> recip, Vec128<float, N> divisor) {
  return Vec128<float, N>(vrecps_f32(recip.raw, divisor.raw));
}

}  // namespace detail

template <size_t N>
HWY_API Vec128<float, N> operator/(Vec128<float, N> a, Vec128<float, N> b) {
  auto x = ApproximateReciprocal(b);
  x *= detail::ReciprocalNewtonRaphsonStep(x, b);
  x *= detail::ReciprocalNewtonRaphsonStep(x, b);
  x *= detail::ReciprocalNewtonRaphsonStep(x, b);
  return a * x;
}
#endif

// ------------------------------ Absolute value of difference.

HWY_API Vec128<float> AbsDiff(const Vec128<float> a, const Vec128<float> b) {
  return Vec128<float>(vabdq_f32(a.raw, b.raw));
}
template <size_t N, HWY_IF_V_SIZE_LE(float, N, 8)>
HWY_API Vec128<float, N> AbsDiff(const Vec128<float, N> a,
                                 const Vec128<float, N> b) {
  return Vec128<float, N>(vabd_f32(a.raw, b.raw));
}

#ifdef HWY_NATIVE_INTEGER_ABS_DIFF
#undef HWY_NATIVE_INTEGER_ABS_DIFF
#else
#define HWY_NATIVE_INTEGER_ABS_DIFF
#endif

HWY_API Vec128<int8_t> AbsDiff(const Vec128<int8_t> a, const Vec128<int8_t> b) {
  return Vec128<int8_t>(vabdq_s8(a.raw, b.raw));
}

HWY_API Vec128<uint8_t> AbsDiff(const Vec128<uint8_t> a,
                                const Vec128<uint8_t> b) {
  return Vec128<uint8_t>(vabdq_u8(a.raw, b.raw));
}

template <size_t N, HWY_IF_V_SIZE_LE(uint8_t, N, 8)>
HWY_API Vec128<int8_t, N> AbsDiff(const Vec128<int8_t, N> a,
                                  const Vec128<int8_t, N> b) {
  return Vec128<int8_t, N>(vabd_s8(a.raw, b.raw));
}

template <size_t N, HWY_IF_V_SIZE_LE(uint8_t, N, 8)>
HWY_API Vec128<uint8_t, N> AbsDiff(const Vec128<uint8_t, N> a,
                                   const Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>(vabd_u8(a.raw, b.raw));
}

HWY_API Vec128<int16_t> AbsDiff(const Vec128<int16_t> a,
                                const Vec128<int16_t> b) {
  return Vec128<int16_t>(vabdq_s16(a.raw, b.raw));
}

HWY_API Vec128<uint16_t> AbsDiff(const Vec128<uint16_t> a,
                                 const Vec128<uint16_t> b) {
  return Vec128<uint16_t>(vabdq_u16(a.raw, b.raw));
}

template <size_t N, HWY_IF_V_SIZE_LE(uint16_t, N, 8)>
HWY_API Vec128<int16_t, N> AbsDiff(const Vec128<int16_t, N> a,
                                   const Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>(vabd_s16(a.raw, b.raw));
}

template <size_t N, HWY_IF_V_SIZE_LE(uint16_t, N, 8)>
HWY_API Vec128<uint16_t, N> AbsDiff(const Vec128<uint16_t, N> a,
                                    const Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>(vabd_u16(a.raw, b.raw));
}

HWY_API Vec128<int32_t> AbsDiff(const Vec128<int32_t> a,
                                const Vec128<int32_t> b) {
  return Vec128<int32_t>(vabdq_s32(a.raw, b.raw));
}

HWY_API Vec128<uint32_t> AbsDiff(const Vec128<uint32_t> a,
                                 const Vec128<uint32_t> b) {
  return Vec128<uint32_t>(vabdq_u32(a.raw, b.raw));
}

template <size_t N, HWY_IF_V_SIZE_LE(uint32_t, N, 8)>
HWY_API Vec128<int32_t, N> AbsDiff(const Vec128<int32_t, N> a,
                                   const Vec128<int32_t, N> b) {
  return Vec128<int32_t, N>(vabd_s32(a.raw, b.raw));
}

template <size_t N, HWY_IF_V_SIZE_LE(uint32_t, N, 8)>
HWY_API Vec128<uint32_t, N> AbsDiff(const Vec128<uint32_t, N> a,
                                    const Vec128<uint32_t, N> b) {
  return Vec128<uint32_t, N>(vabd_u32(a.raw, b.raw));
}

// ------------------------------ Integer multiply-add

// Per-target flag to prevent generic_ops-inl.h from defining int MulAdd.
#ifdef HWY_NATIVE_INT_FMA
#undef HWY_NATIVE_INT_FMA
#else
#define HWY_NATIVE_INT_FMA
#endif

// Wrappers for changing argument order to what intrinsics expect.
namespace detail {
// All except ui64
HWY_NEON_DEF_FUNCTION_UINT_8_16_32(MulAdd, vmla, _, 3)
HWY_NEON_DEF_FUNCTION_INT_8_16_32(MulAdd, vmla, _, 3)
HWY_NEON_DEF_FUNCTION_UINT_8_16_32(NegMulAdd, vmls, _, 3)
HWY_NEON_DEF_FUNCTION_INT_8_16_32(NegMulAdd, vmls, _, 3)
}  // namespace detail

template <typename T, size_t N, HWY_IF_NOT_FLOAT(T), HWY_IF_NOT_T_SIZE(T, 8)>
HWY_API Vec128<T, N> MulAdd(Vec128<T, N> mul, Vec128<T, N> x,
                            Vec128<T, N> add) {
  return detail::MulAdd(add, mul, x);
}

template <typename T, size_t N, HWY_IF_NOT_FLOAT(T), HWY_IF_NOT_T_SIZE(T, 8)>
HWY_API Vec128<T, N> NegMulAdd(Vec128<T, N> mul, Vec128<T, N> x,
                               Vec128<T, N> add) {
  return detail::NegMulAdd(add, mul, x);
}

// 64-bit integer
template <typename T, size_t N, HWY_IF_NOT_FLOAT(T), HWY_IF_T_SIZE(T, 8)>
HWY_API Vec128<T, N> MulAdd(Vec128<T, N> mul, Vec128<T, N> x,
                            Vec128<T, N> add) {
  return Add(Mul(mul, x), add);
}

template <typename T, size_t N, HWY_IF_NOT_FLOAT(T), HWY_IF_T_SIZE(T, 8)>
HWY_API Vec128<T, N> NegMulAdd(Vec128<T, N> mul, Vec128<T, N> x,
                               Vec128<T, N> add) {
  return Sub(add, Mul(mul, x));
}

// ------------------------------ Floating-point multiply-add variants

#if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64

template <size_t N, HWY_IF_V_SIZE_LE(float, N, 8)>
HWY_API Vec128<float, N> MulAdd(Vec128<float, N> mul, Vec128<float, N> x,
                                Vec128<float, N> add) {
  return Vec128<float, N>(vfma_f32(add.raw, mul.raw, x.raw));
}
HWY_API Vec128<float> MulAdd(Vec128<float> mul, Vec128<float> x,
                             Vec128<float> add) {
  return Vec128<float>(vfmaq_f32(add.raw, mul.raw, x.raw));
}

template <size_t N, HWY_IF_V_SIZE_LE(float, N, 8)>
HWY_API Vec128<float, N> NegMulAdd(Vec128<float, N> mul, Vec128<float, N> x,
                                   Vec128<float, N> add) {
  return Vec128<float, N>(vfms_f32(add.raw, mul.raw, x.raw));
}
HWY_API Vec128<float> NegMulAdd(Vec128<float> mul, Vec128<float> x,
                                Vec128<float> add) {
  return Vec128<float>(vfmsq_f32(add.raw, mul.raw, x.raw));
}

#else  // emulate

template <size_t N>
HWY_API Vec128<float, N> MulAdd(Vec128<float, N> mul, Vec128<float, N> x,
                                Vec128<float, N> add) {
  return mul * x + add;
}

template <size_t N>
HWY_API Vec128<float, N> NegMulAdd(Vec128<float, N> mul, Vec128<float, N> x,
                                   Vec128<float, N> add) {
  return add - mul * x;
}

#endif  // defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64

#if HWY_ARCH_ARM_A64

HWY_API Vec64<double> MulAdd(Vec64<double> mul, Vec64<double> x,
                             Vec64<double> add) {
  return Vec64<double>(vfma_f64(add.raw, mul.raw, x.raw));
}
HWY_API Vec128<double> MulAdd(Vec128<double> mul, Vec128<double> x,
                              Vec128<double> add) {
  return Vec128<double>(vfmaq_f64(add.raw, mul.raw, x.raw));
}

HWY_API Vec64<double> NegMulAdd(Vec64<double> mul, Vec64<double> x,
                                Vec64<double> add) {
  return Vec64<double>(vfms_f64(add.raw, mul.raw, x.raw));
}
HWY_API Vec128<double> NegMulAdd(Vec128<double> mul, Vec128<double> x,
                                 Vec128<double> add) {
  return Vec128<double>(vfmsq_f64(add.raw, mul.raw, x.raw));
}

#endif  // HWY_ARCH_ARM_A64

template <typename T, size_t N>
HWY_API Vec128<T, N> MulSub(Vec128<T, N> mul, Vec128<T, N> x,
                            Vec128<T, N> sub) {
  return MulAdd(mul, x, Neg(sub));
}

template <typename T, size_t N>
HWY_API Vec128<T, N> NegMulSub(Vec128<T, N> mul, Vec128<T, N> x,
                               Vec128<T, N> sub) {
  return Neg(MulAdd(mul, x, sub));
}

// ------------------------------ Floating-point square root (IfThenZeroElse)

// Approximate reciprocal square root
HWY_API Vec128<float> ApproximateReciprocalSqrt(const Vec128<float> v) {
  return Vec128<float>(vrsqrteq_f32(v.raw));
}
template <size_t N>
HWY_API Vec128<float, N> ApproximateReciprocalSqrt(const Vec128<float, N> v) {
  return Vec128<float, N>(vrsqrte_f32(v.raw));
}

// Full precision square root
#if HWY_ARCH_ARM_A64
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Sqrt, vsqrt, _, 1)
#else
namespace detail {

HWY_INLINE Vec128<float> ReciprocalSqrtStep(Vec128<float> root,
                                            Vec128<float> recip) {
  return Vec128<float>(vrsqrtsq_f32(root.raw, recip.raw));
}
template <size_t N>
HWY_INLINE Vec128<float, N> ReciprocalSqrtStep(Vec128<float, N> root,
                                               Vec128<float, N> recip) {
  return Vec128<float, N>(vrsqrts_f32(root.raw, recip.raw));
}

}  // namespace detail

// Not defined on armv7: approximate
template <size_t N>
HWY_API Vec128<float, N> Sqrt(const Vec128<float, N> v) {
  auto recip = ApproximateReciprocalSqrt(v);

  recip *= detail::ReciprocalSqrtStep(v * recip, recip);
  recip *= detail::ReciprocalSqrtStep(v * recip, recip);
  recip *= detail::ReciprocalSqrtStep(v * recip, recip);

  const auto root = v * recip;
  return IfThenZeroElse(v == Zero(Simd<float, N, 0>()), root);
}
#endif

// ================================================== LOGICAL

// ------------------------------ Not

// There is no 64-bit vmvn, so cast instead of using HWY_NEON_DEF_FUNCTION.
template <typename T>
HWY_API Vec128<T> Not(const Vec128<T> v) {
  const DFromV<decltype(v)> d;
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, Vec128<uint8_t>(vmvnq_u8(BitCast(d8, v).raw)));
}
template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
HWY_API Vec128<T, N> Not(const Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  const Repartition<uint8_t, decltype(d)> d8;
  using V8 = decltype(Zero(d8));
  return BitCast(d, V8(vmvn_u8(BitCast(d8, v).raw)));
}

// ------------------------------ And
HWY_NEON_DEF_FUNCTION_INTS_UINTS(And, vand, _, 2)

// Uses the u32/64 defined above.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> And(const Vec128<T, N> a, const Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, BitCast(du, a) & BitCast(du, b));
}

// ------------------------------ AndNot

namespace detail {
// reversed_andnot returns a & ~b.
HWY_NEON_DEF_FUNCTION_INTS_UINTS(reversed_andnot, vbic, _, 2)
}  // namespace detail

// Returns ~not_mask & mask.
template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
HWY_API Vec128<T, N> AndNot(const Vec128<T, N> not_mask,
                            const Vec128<T, N> mask) {
  return detail::reversed_andnot(mask, not_mask);
}

// Uses the u32/64 defined above.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> AndNot(const Vec128<T, N> not_mask,
                            const Vec128<T, N> mask) {
  const DFromV<decltype(mask)> d;
  const RebindToUnsigned<decltype(d)> du;
  VFromD<decltype(du)> ret =
      detail::reversed_andnot(BitCast(du, mask), BitCast(du, not_mask));
  return BitCast(d, ret);
}

// ------------------------------ Or

HWY_NEON_DEF_FUNCTION_INTS_UINTS(Or, vorr, _, 2)

// Uses the u32/64 defined above.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> Or(const Vec128<T, N> a, const Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, BitCast(du, a) | BitCast(du, b));
}

// ------------------------------ Xor

HWY_NEON_DEF_FUNCTION_INTS_UINTS(Xor, veor, _, 2)

// Uses the u32/64 defined above.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> Xor(const Vec128<T, N> a, const Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, BitCast(du, a) ^ BitCast(du, b));
}

// ------------------------------ Xor3
#if HWY_ARCH_ARM_A64 && defined(__ARM_FEATURE_SHA3)
HWY_NEON_DEF_FUNCTION_FULL_UI(Xor3, veor3, _, 3)

// Half vectors are not natively supported. Two Xor are likely more efficient
// than Combine to 128-bit.
template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8), HWY_IF_NOT_FLOAT(T)>
HWY_API Vec128<T, N> Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) {
  return Xor(x1, Xor(x2, x3));
}

template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> Xor3(const Vec128<T, N> x1, const Vec128<T, N> x2,
                          const Vec128<T, N> x3) {
  const DFromV<decltype(x1)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, Xor3(BitCast(du, x1), BitCast(du, x2), BitCast(du, x3)));
}

#else
template <typename T, size_t N>
HWY_API Vec128<T, N> Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) {
  return Xor(x1, Xor(x2, x3));
}
#endif

// ------------------------------ Or3
template <typename T, size_t N>
HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
  return Or(o1, Or(o2, o3));
}

// ------------------------------ OrAnd
template <typename T, size_t N>
HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
  return Or(o, And(a1, a2));
}

// ------------------------------ IfVecThenElse
template <typename T, size_t N>
HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
                                   Vec128<T, N> no) {
  return IfThenElse(MaskFromVec(mask), yes, no);
}

// ------------------------------ BitwiseIfThenElse

#ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE
#undef HWY_NATIVE_BITWISE_IF_THEN_ELSE
#else
#define HWY_NATIVE_BITWISE_IF_THEN_ELSE
#endif

template <class V>
HWY_API V BitwiseIfThenElse(V mask, V yes, V no) {
  return IfVecThenElse(mask, yes, no);
}

// ------------------------------ Operator overloads (internal-only if float)

template <typename T, size_t N>
HWY_API Vec128<T, N> operator&(const Vec128<T, N> a, const Vec128<T, N> b) {
  return And(a, b);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> operator|(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Or(a, b);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> operator^(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Xor(a, b);
}

// ------------------------------ I64/U64 AbsDiff

template <size_t N>
HWY_API Vec128<int64_t, N> AbsDiff(const Vec128<int64_t, N> a,
                                   const Vec128<int64_t, N> b) {
  return Max(a, b) - Min(a, b);
}

template <size_t N>
HWY_API Vec128<uint64_t, N> AbsDiff(const Vec128<uint64_t, N> a,
                                    const Vec128<uint64_t, N> b) {
  return Or(SaturatedSub(a, b), SaturatedSub(b, a));
}

// ------------------------------ PopulationCount

#ifdef HWY_NATIVE_POPCNT
#undef HWY_NATIVE_POPCNT
#else
#define HWY_NATIVE_POPCNT
#endif

namespace detail {

template <typename T>
HWY_INLINE Vec128<T> PopulationCount(hwy::SizeTag<1> /* tag */, Vec128<T> v) {
  const Full128<uint8_t> d8;
  return Vec128<T>(vcntq_u8(BitCast(d8, v).raw));
}
template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<1> /* tag */,
                                        Vec128<T, N> v) {
  const Simd<uint8_t, N, 0> d8;
  return Vec128<T, N>(vcnt_u8(BitCast(d8, v).raw));
}

// NEON lacks popcount for lane sizes > 1, so take pairwise sums of the bytes.
template <typename T>
HWY_INLINE Vec128<T> PopulationCount(hwy::SizeTag<2> /* tag */, Vec128<T> v) {
  const Full128<uint8_t> d8;
  const uint8x16_t bytes = vcntq_u8(BitCast(d8, v).raw);
  return Vec128<T>(vpaddlq_u8(bytes));
}
template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<2> /* tag */,
                                        Vec128<T, N> v) {
  const Repartition<uint8_t, DFromV<decltype(v)>> d8;
  const uint8x8_t bytes = vcnt_u8(BitCast(d8, v).raw);
  return Vec128<T, N>(vpaddl_u8(bytes));
}

template <typename T>
HWY_INLINE Vec128<T> PopulationCount(hwy::SizeTag<4> /* tag */, Vec128<T> v) {
  const Full128<uint8_t> d8;
  const uint8x16_t bytes = vcntq_u8(BitCast(d8, v).raw);
  return Vec128<T>(vpaddlq_u16(vpaddlq_u8(bytes)));
}
template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<4> /* tag */,
                                        Vec128<T, N> v) {
  const Repartition<uint8_t, DFromV<decltype(v)>> d8;
  const uint8x8_t bytes = vcnt_u8(BitCast(d8, v).raw);
  return Vec128<T, N>(vpaddl_u16(vpaddl_u8(bytes)));
}

template <typename T>
HWY_INLINE Vec128<T> PopulationCount(hwy::SizeTag<8> /* tag */, Vec128<T> v) {
  const Full128<uint8_t> d8;
  const uint8x16_t bytes = vcntq_u8(BitCast(d8, v).raw);
  return Vec128<T>(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(bytes))));
}
template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<8> /* tag */,
                                        Vec128<T, N> v) {
  const Repartition<uint8_t, DFromV<decltype(v)>> d8;
  const uint8x8_t bytes = vcnt_u8(BitCast(d8, v).raw);
  return Vec128<T, N>(vpaddl_u32(vpaddl_u16(vpaddl_u8(bytes))));
}

}  // namespace detail

template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
HWY_API Vec128<T, N> PopulationCount(Vec128<T, N> v) {
  return detail::PopulationCount(hwy::SizeTag<sizeof(T)>(), v);
}

// ================================================== SIGN

// ------------------------------ Abs

// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
HWY_API Vec128<int8_t> Abs(const Vec128<int8_t> v) {
  return Vec128<int8_t>(vabsq_s8(v.raw));
}
HWY_API Vec128<int16_t> Abs(const Vec128<int16_t> v) {
  return Vec128<int16_t>(vabsq_s16(v.raw));
}
HWY_API Vec128<int32_t> Abs(const Vec128<int32_t> v) {
  return Vec128<int32_t>(vabsq_s32(v.raw));
}
// i64 is implemented after BroadcastSignBit.
HWY_API Vec128<float> Abs(const Vec128<float> v) {
  return Vec128<float>(vabsq_f32(v.raw));
}

template <size_t N, HWY_IF_V_SIZE_LE(int8_t, N, 8)>
HWY_API Vec128<int8_t, N> Abs(const Vec128<int8_t, N> v) {
  return Vec128<int8_t, N>(vabs_s8(v.raw));
}
template <size_t N, HWY_IF_V_SIZE_LE(int16_t, N, 8)>
HWY_API Vec128<int16_t, N> Abs(const Vec128<int16_t, N> v) {
  return Vec128<int16_t, N>(vabs_s16(v.raw));
}
template <size_t N, HWY_IF_V_SIZE_LE(int32_t, N, 8)>
HWY_API Vec128<int32_t, N> Abs(const Vec128<int32_t, N> v) {
  return Vec128<int32_t, N>(vabs_s32(v.raw));
}
template <size_t N, HWY_IF_V_SIZE_LE(float, N, 8)>
HWY_API Vec128<float, N> Abs(const Vec128<float, N> v) {
  return Vec128<float, N>(vabs_f32(v.raw));
}

#if HWY_ARCH_ARM_A64
HWY_API Vec128<double> Abs(const Vec128<double> v) {
  return Vec128<double>(vabsq_f64(v.raw));
}

HWY_API Vec64<double> Abs(const Vec64<double> v) {
  return Vec64<double>(vabs_f64(v.raw));
}
#endif

// ------------------------------ CopySign

template <typename T, size_t N>
HWY_API Vec128<T, N> CopySign(Vec128<T, N> magn, Vec128<T, N> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  const auto msb = SignBit(DFromV<decltype(magn)>());
  return Or(AndNot(msb, magn), And(msb, sign));
}

template <typename T, size_t N>
HWY_API Vec128<T, N> CopySignToAbs(Vec128<T, N> abs, Vec128<T, N> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  return Or(abs, And(SignBit(DFromV<decltype(abs)>()), sign));
}

// ------------------------------ BroadcastSignBit

template <typename T, size_t N, HWY_IF_SIGNED(T)>
HWY_API Vec128<T, N> BroadcastSignBit(const Vec128<T, N> v) {
  return ShiftRight<sizeof(T) * 8 - 1>(v);
}

// ================================================== MASK

// ------------------------------ To/from vector

// Mask and Vec have the same representation (true = FF..FF).
template <typename T, size_t N>
HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
  const Simd<MakeUnsigned<T>, N, 0> du;
  return Mask128<T, N>(BitCast(du, v).raw);
}

template <class D>
using MFromD = decltype(MaskFromVec(VFromD<D>()));

template <class D>
HWY_API VFromD<D> VecFromMask(D d, const MFromD<D> m) {
  // Raw type of masks is unsigned.
  const RebindToUnsigned<D> du;
  return BitCast(d, VFromD<decltype(du)>(m.raw));
}

// ------------------------------ RebindMask (MaskFromVec)

template <typename TFrom, size_t NFrom, class DTo>
HWY_API MFromD<DTo> RebindMask(DTo /* tag */, Mask128<TFrom, NFrom> m) {
  static_assert(sizeof(TFrom) == sizeof(TFromD<DTo>), "Must have same size");
  return MFromD<DTo>(m.raw);
}

// ------------------------------ IfThenElse

#define HWY_NEON_BUILD_TPL_HWY_IF
#define HWY_NEON_BUILD_RET_HWY_IF(type, size) Vec128<type##_t, size>
#define HWY_NEON_BUILD_PARAM_HWY_IF(type, size)                         \
  const Mask128<type##_t, size> mask, const Vec128<type##_t, size> yes, \
      const Vec128<type##_t, size> no
#define HWY_NEON_BUILD_ARG_HWY_IF mask.raw, yes.raw, no.raw

HWY_NEON_DEF_FUNCTION_ALL_TYPES(IfThenElse, vbsl, _, HWY_IF)

#undef HWY_NEON_BUILD_TPL_HWY_IF
#undef HWY_NEON_BUILD_RET_HWY_IF
#undef HWY_NEON_BUILD_PARAM_HWY_IF
#undef HWY_NEON_BUILD_ARG_HWY_IF

// mask ? yes : 0
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
  return yes & VecFromMask(DFromV<decltype(yes)>(), mask);
}

// mask ? 0 : no
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
  return AndNot(VecFromMask(DFromV<decltype(no)>(), mask), no);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
                                        Vec128<T, N> no) {
  static_assert(IsSigned<T>(), "Only works for signed/float");
  const DFromV<decltype(no)> d;
  const RebindToSigned<decltype(d)> di;

  Mask128<T, N> m = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v))));
  return IfThenElse(m, yes, no);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) {
  const auto zero = Zero(DFromV<decltype(v)>());
  return Max(zero, v);
}

// ------------------------------ Mask logical

template <typename T, size_t N>
HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
  return MaskFromVec(Not(VecFromMask(DFromM<decltype(m)>(), m)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
  const DFromM<decltype(a)> d;
  return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
  const DFromM<decltype(a)> d;
  return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
  const DFromM<decltype(a)> d;
  return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
  const DFromM<decltype(a)> d;
  return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> ExclusiveNeither(const Mask128<T, N> a, Mask128<T, N> b) {
  const DFromM<decltype(a)> d;
  return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
}

// ================================================== COMPARE

// Comparisons fill a lane with 1-bits if the condition is true, else 0.

// ------------------------------ Shuffle2301 (for i64 compares)

// Swap 32-bit halves in 64-bits
HWY_API Vec64<uint32_t> Shuffle2301(const Vec64<uint32_t> v) {
  return Vec64<uint32_t>(vrev64_u32(v.raw));
}
HWY_API Vec64<int32_t> Shuffle2301(const Vec64<int32_t> v) {
  return Vec64<int32_t>(vrev64_s32(v.raw));
}
HWY_API Vec64<float> Shuffle2301(const Vec64<float> v) {
  return Vec64<float>(vrev64_f32(v.raw));
}
HWY_API Vec128<uint32_t> Shuffle2301(const Vec128<uint32_t> v) {
  return Vec128<uint32_t>(vrev64q_u32(v.raw));
}
HWY_API Vec128<int32_t> Shuffle2301(const Vec128<int32_t> v) {
  return Vec128<int32_t>(vrev64q_s32(v.raw));
}
HWY_API Vec128<float> Shuffle2301(const Vec128<float> v) {
  return Vec128<float>(vrev64q_f32(v.raw));
}

#define HWY_NEON_BUILD_TPL_HWY_COMPARE
#define HWY_NEON_BUILD_RET_HWY_COMPARE(type, size) Mask128<type##_t, size>
#define HWY_NEON_BUILD_PARAM_HWY_COMPARE(type, size) \
  const Vec128<type##_t, size> a, const Vec128<type##_t, size> b
#define HWY_NEON_BUILD_ARG_HWY_COMPARE a.raw, b.raw

// ------------------------------ Equality
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator==, vceq, _, HWY_COMPARE)
#if HWY_ARCH_ARM_A64
HWY_NEON_DEF_FUNCTION_INTS_UINTS(operator==, vceq, _, HWY_COMPARE)
#else
// No 64-bit comparisons on armv7: emulate them below, after Shuffle2301.
HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator==, vceq, _, HWY_COMPARE)
HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator==, vceq, _, HWY_COMPARE)
#endif

// ------------------------------ Strict inequality (signed, float)
#if HWY_ARCH_ARM_A64
HWY_NEON_DEF_FUNCTION_INTS_UINTS(operator<, vclt, _, HWY_COMPARE)
#else
HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator<, vclt, _, HWY_COMPARE)
HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator<, vclt, _, HWY_COMPARE)
#endif
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<, vclt, _, HWY_COMPARE)

// ------------------------------ Weak inequality (float)
#if HWY_ARCH_ARM_A64
HWY_NEON_DEF_FUNCTION_INTS_UINTS(operator<=, vcle, _, HWY_COMPARE)
#else
HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator<=, vcle, _, HWY_COMPARE)
HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator<=, vcle, _, HWY_COMPARE)
#endif
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<=, vcle, _, HWY_COMPARE)

#undef HWY_NEON_BUILD_TPL_HWY_COMPARE
#undef HWY_NEON_BUILD_RET_HWY_COMPARE
#undef HWY_NEON_BUILD_PARAM_HWY_COMPARE
#undef HWY_NEON_BUILD_ARG_HWY_COMPARE

// ------------------------------ Armv7 i64 compare (Shuffle2301, Eq)

#if HWY_ARCH_ARM_V7

template <size_t N>
HWY_API Mask128<int64_t, N> operator==(const Vec128<int64_t, N> a,
                                       const Vec128<int64_t, N> b) {
  const Simd<int32_t, N * 2, 0> d32;
  const Simd<int64_t, N, 0> d64;
  const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b)));
  const auto cmp64 = cmp32 & Shuffle2301(cmp32);
  return MaskFromVec(BitCast(d64, cmp64));
}

template <size_t N>
HWY_API Mask128<uint64_t, N> operator==(const Vec128<uint64_t, N> a,
                                        const Vec128<uint64_t, N> b) {
  const Simd<uint32_t, N * 2, 0> d32;
  const Simd<uint64_t, N, 0> d64;
  const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b)));
  const auto cmp64 = cmp32 & Shuffle2301(cmp32);
  return MaskFromVec(BitCast(d64, cmp64));
}

HWY_API Mask128<int64_t> operator<(const Vec128<int64_t> a,
                                   const Vec128<int64_t> b) {
  const int64x2_t sub = vqsubq_s64(a.raw, b.raw);
  return MaskFromVec(BroadcastSignBit(Vec128<int64_t>(sub)));
}
HWY_API Mask128<int64_t, 1> operator<(const Vec64<int64_t> a,
                                      const Vec64<int64_t> b) {
  const int64x1_t sub = vqsub_s64(a.raw, b.raw);
  return MaskFromVec(BroadcastSignBit(Vec64<int64_t>(sub)));
}

template <size_t N>
HWY_API Mask128<uint64_t, N> operator<(const Vec128<uint64_t, N> a,
                                       const Vec128<uint64_t, N> b) {
  const DFromV<decltype(a)> du;
  const RebindToSigned<decltype(du)> di;
  const Vec128<uint64_t, N> msb = AndNot(a, b) | AndNot(a ^ b, a - b);
  return MaskFromVec(BitCast(du, BroadcastSignBit(BitCast(di, msb))));
}

template <size_t N>
HWY_API Mask128<int64_t, N> operator<=(const Vec128<int64_t, N> a,
                                       const Vec128<int64_t, N> b) {
  return Not(b < a);
}

template <size_t N>
HWY_API Mask128<uint64_t, N> operator<=(const Vec128<uint64_t, N> a,
                                        const Vec128<uint64_t, N> b) {
  return Not(b < a);
}

#endif

// ------------------------------ operator!= (operator==)

// Customize HWY_NEON_DEF_FUNCTION to call 2 functions.
#pragma push_macro("HWY_NEON_DEF_FUNCTION")
#undef HWY_NEON_DEF_FUNCTION
// This cannot have _any_ template argument (in x86_128 we can at least have N
// as an argument), otherwise it is not more specialized than rewritten
// operator== in C++20, leading to compile errors.
#define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args) \
  HWY_API Mask128<type##_t, size> name(Vec128<type##_t, size> a,             \
                                       Vec128<type##_t, size> b) {           \
    return Not(a == b);                                                      \
  }

HWY_NEON_DEF_FUNCTION_ALL_TYPES(operator!=, ignored, ignored, ignored)

#pragma pop_macro("HWY_NEON_DEF_FUNCTION")

// ------------------------------ Reversed comparisons

template <typename T, size_t N>
HWY_API Mask128<T, N> operator>(Vec128<T, N> a, Vec128<T, N> b) {
  return operator<(b, a);
}
template <typename T, size_t N>
HWY_API Mask128<T, N> operator>=(Vec128<T, N> a, Vec128<T, N> b) {
  return operator<=(b, a);
}

// ------------------------------ FirstN (Iota, Lt)

template <class D>
HWY_API MFromD<D> FirstN(D d, size_t num) {
  const RebindToSigned<decltype(d)> di;  // Signed comparisons are cheaper.
  using TI = TFromD<decltype(di)>;
  return RebindMask(d, detail::Iota0(di) < Set(di, static_cast<TI>(num)));
}

// ------------------------------ TestBit (Eq)

#define HWY_NEON_BUILD_TPL_HWY_TESTBIT
#define HWY_NEON_BUILD_RET_HWY_TESTBIT(type, size) Mask128<type##_t, size>
#define HWY_NEON_BUILD_PARAM_HWY_TESTBIT(type, size) \
  Vec128<type##_t, size> v, Vec128<type##_t, size> bit
#define HWY_NEON_BUILD_ARG_HWY_TESTBIT v.raw, bit.raw

#if HWY_ARCH_ARM_A64
HWY_NEON_DEF_FUNCTION_INTS_UINTS(TestBit, vtst, _, HWY_TESTBIT)
#else
// No 64-bit versions on armv7
HWY_NEON_DEF_FUNCTION_UINT_8_16_32(TestBit, vtst, _, HWY_TESTBIT)
HWY_NEON_DEF_FUNCTION_INT_8_16_32(TestBit, vtst, _, HWY_TESTBIT)

template <size_t N>
HWY_API Mask128<uint64_t, N> TestBit(Vec128<uint64_t, N> v,
                                     Vec128<uint64_t, N> bit) {
  return (v & bit) == bit;
}
template <size_t N>
HWY_API Mask128<int64_t, N> TestBit(Vec128<int64_t, N> v,
                                    Vec128<int64_t, N> bit) {
  return (v & bit) == bit;
}

#endif
#undef HWY_NEON_BUILD_TPL_HWY_TESTBIT
#undef HWY_NEON_BUILD_RET_HWY_TESTBIT
#undef HWY_NEON_BUILD_PARAM_HWY_TESTBIT
#undef HWY_NEON_BUILD_ARG_HWY_TESTBIT

// ------------------------------ Abs i64 (IfThenElse, BroadcastSignBit)
HWY_API Vec128<int64_t> Abs(const Vec128<int64_t> v) {
#if HWY_ARCH_ARM_A64
  return Vec128<int64_t>(vabsq_s64(v.raw));
#else
  const auto zero = Zero(DFromV<decltype(v)>());
  return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v);
#endif
}
HWY_API Vec64<int64_t> Abs(const Vec64<int64_t> v) {
#if HWY_ARCH_ARM_A64
  return Vec64<int64_t>(vabs_s64(v.raw));
#else
  const auto zero = Zero(DFromV<decltype(v)>());
  return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v);
#endif
}

// ------------------------------ Min (IfThenElse, BroadcastSignBit)

// Unsigned
HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Min, vmin, _, 2)

template <size_t N>
HWY_API Vec128<uint64_t, N> Min(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) {
#if HWY_ARCH_ARM_A64
  return IfThenElse(b < a, b, a);
#else
  const DFromV<decltype(a)> du;
  const RebindToSigned<decltype(du)> di;
  return BitCast(du, BitCast(di, a) - BitCast(di, SaturatedSub(a, b)));
#endif
}

// Signed
HWY_NEON_DEF_FUNCTION_INT_8_16_32(Min, vmin, _, 2)

template <size_t N>
HWY_API Vec128<int64_t, N> Min(Vec128<int64_t, N> a, Vec128<int64_t, N> b) {
#if HWY_ARCH_ARM_A64
  return IfThenElse(b < a, b, a);
#else
  const Vec128<int64_t, N> sign = SaturatedSub(a, b);
  return IfThenElse(MaskFromVec(BroadcastSignBit(sign)), a, b);
#endif
}

// Float: IEEE minimumNumber on v8
#if HWY_ARCH_ARM_A64

HWY_NEON_DEF_FUNCTION_FLOAT_32(Min, vminnm, _, 2)

// GCC 6.5 and earlier are missing the 64-bit (non-q) intrinsic, so define
// in terms of the 128-bit intrinsic.
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
namespace detail {

template <class V, HWY_IF_V_SIZE_V(V, 8), HWY_IF_T_SIZE_V(V, 8)>
HWY_INLINE V F64Vec64Min(V a, V b) {
  const DFromV<decltype(a)> d;
  const Twice<decltype(d)> dt;
  return LowerHalf(d, Min(ZeroExtendVector(dt, a), ZeroExtendVector(dt, b)));
}

}  // namespace detail
#endif  // HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700

HWY_API Vec64<double> Min(Vec64<double> a, Vec64<double> b) {
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
  return detail::F64Vec64Min(a, b);
#else
  return Vec64<double>(vminnm_f64(a.raw, b.raw));
#endif
}

HWY_API Vec128<double> Min(Vec128<double> a, Vec128<double> b) {
  return Vec128<double>(vminnmq_f64(a.raw, b.raw));
}

#else
// Armv7: NaN if any is NaN.
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Min, vmin, _, 2)
#endif  // HWY_ARCH_ARM_A64

// ------------------------------ Max (IfThenElse, BroadcastSignBit)

// Unsigned (no u64)
HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Max, vmax, _, 2)

template <size_t N>
HWY_API Vec128<uint64_t, N> Max(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) {
#if HWY_ARCH_ARM_A64
  return IfThenElse(b < a, a, b);
#else
  const DFromV<decltype(a)> du;
  const RebindToSigned<decltype(du)> di;
  return BitCast(du, BitCast(di, b) + BitCast(di, SaturatedSub(a, b)));
#endif
}

// Signed (no i64)
HWY_NEON_DEF_FUNCTION_INT_8_16_32(Max, vmax, _, 2)

template <size_t N>
HWY_API Vec128<int64_t, N> Max(Vec128<int64_t, N> a, Vec128<int64_t, N> b) {
#if HWY_ARCH_ARM_A64
  return IfThenElse(b < a, a, b);
#else
  const Vec128<int64_t, N> sign = SaturatedSub(a, b);
  return IfThenElse(MaskFromVec(BroadcastSignBit(sign)), b, a);
#endif
}

// Float: IEEE minimumNumber on v8
#if HWY_ARCH_ARM_A64

HWY_NEON_DEF_FUNCTION_FLOAT_32(Max, vmaxnm, _, 2)

// GCC 6.5 and earlier are missing the 64-bit (non-q) intrinsic, so define
// in terms of the 128-bit intrinsic.
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
namespace detail {

template <class V, HWY_IF_V_SIZE_V(V, 8), HWY_IF_T_SIZE_V(V, 8)>
HWY_INLINE V F64Vec64Max(V a, V b) {
  const DFromV<decltype(a)> d;
  const Twice<decltype(d)> dt;
  return LowerHalf(d, Max(ZeroExtendVector(dt, a), ZeroExtendVector(dt, b)));
}

}  // namespace detail
#endif  // HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700

HWY_API Vec64<double> Max(Vec64<double> a, Vec64<double> b) {
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
  return detail::F64Vec64Max(a, b);
#else
  return Vec64<double>(vmaxnm_f64(a.raw, b.raw));
#endif
}

HWY_API Vec128<double> Max(Vec128<double> a, Vec128<double> b) {
  return Vec128<double>(vmaxnmq_f64(a.raw, b.raw));
}

#else
// Armv7: NaN if any is NaN.
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Max, vmax, _, 2)
#endif  // HWY_ARCH_ARM_A64

// ================================================== MEMORY

// ------------------------------ Load 128

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
HWY_API Vec128<uint8_t> LoadU(D /* tag */,
                              const uint8_t* HWY_RESTRICT unaligned) {
  return Vec128<uint8_t>(vld1q_u8(unaligned));
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
HWY_API Vec128<uint16_t> LoadU(D /* tag */,
                               const uint16_t* HWY_RESTRICT unaligned) {
  return Vec128<uint16_t>(vld1q_u16(unaligned));
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
HWY_API Vec128<uint32_t> LoadU(D /* tag */,
                               const uint32_t* HWY_RESTRICT unaligned) {
  return Vec128<uint32_t>(vld1q_u32(unaligned));
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U64_D(D)>
HWY_API Vec128<uint64_t> LoadU(D /* tag */,
                               const uint64_t* HWY_RESTRICT unaligned) {
  return Vec128<uint64_t>(vld1q_u64(unaligned));
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I8_D(D)>
HWY_API Vec128<int8_t> LoadU(D /* tag */,
                             const int8_t* HWY_RESTRICT unaligned) {
  return Vec128<int8_t>(vld1q_s8(unaligned));
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I16_D(D)>
HWY_API Vec128<int16_t> LoadU(D /* tag */,
                              const int16_t* HWY_RESTRICT unaligned) {
  return Vec128<int16_t>(vld1q_s16(unaligned));
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)>
HWY_API Vec128<int32_t> LoadU(D /* tag */,
                              const int32_t* HWY_RESTRICT unaligned) {
  return Vec128<int32_t>(vld1q_s32(unaligned));
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I64_D(D)>
HWY_API Vec128<int64_t> LoadU(D /* tag */,
                              const int64_t* HWY_RESTRICT unaligned) {
  return Vec128<int64_t>(vld1q_s64(unaligned));
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
HWY_API Vec128<float> LoadU(D /* tag */, const float* HWY_RESTRICT unaligned) {
  return Vec128<float>(vld1q_f32(unaligned));
}
#if HWY_ARCH_ARM_A64
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
HWY_API Vec128<double> LoadU(D /* tag */,
                             const double* HWY_RESTRICT unaligned) {
  return Vec128<double>(vld1q_f64(unaligned));
}
#endif

// ------------------------------ Load 64

template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)>
HWY_API Vec64<uint8_t> LoadU(D /* tag */, const uint8_t* HWY_RESTRICT p) {
  return Vec64<uint8_t>(vld1_u8(p));
}
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U16_D(D)>
HWY_API Vec64<uint16_t> LoadU(D /* tag */, const uint16_t* HWY_RESTRICT p) {
  return Vec64<uint16_t>(vld1_u16(p));
}
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U32_D(D)>
HWY_API Vec64<uint32_t> LoadU(D /* tag */, const uint32_t* HWY_RESTRICT p) {
  return Vec64<uint32_t>(vld1_u32(p));
}
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U64_D(D)>
HWY_API Vec64<uint64_t> LoadU(D /* tag */, const uint64_t* HWY_RESTRICT p) {
  return Vec64<uint64_t>(vld1_u64(p));
}
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I8_D(D)>
HWY_API Vec64<int8_t> LoadU(D /* tag */, const int8_t* HWY_RESTRICT p) {
  return Vec64<int8_t>(vld1_s8(p));
}
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I16_D(D)>
HWY_API Vec64<int16_t> LoadU(D /* tag */, const int16_t* HWY_RESTRICT p) {
  return Vec64<int16_t>(vld1_s16(p));
}
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I32_D(D)>
HWY_API Vec64<int32_t> LoadU(D /* tag */, const int32_t* HWY_RESTRICT p) {
  return Vec64<int32_t>(vld1_s32(p));
}
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I64_D(D)>
HWY_API Vec64<int64_t> LoadU(D /* tag */, const int64_t* HWY_RESTRICT p) {
  return Vec64<int64_t>(vld1_s64(p));
}
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
HWY_API Vec64<float> LoadU(D /* tag */, const float* HWY_RESTRICT p) {
  return Vec64<float>(vld1_f32(p));
}
#if HWY_ARCH_ARM_A64
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F64_D(D)>
HWY_API Vec64<double> LoadU(D /* tag */, const double* HWY_RESTRICT p) {
  return Vec64<double>(vld1_f64(p));
}
#endif
// ------------------------------ Load 32

// Actual 32-bit broadcast load - used to implement the other lane types
// because reinterpret_cast of the pointer leads to incorrect codegen on GCC.
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U32_D(D)>
HWY_API Vec32<uint32_t> LoadU(D /*tag*/, const uint32_t* HWY_RESTRICT p) {
  return Vec32<uint32_t>(vld1_dup_u32(p));
}
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I32_D(D)>
HWY_API Vec32<int32_t> LoadU(D /*tag*/, const int32_t* HWY_RESTRICT p) {
  return Vec32<int32_t>(vld1_dup_s32(p));
}
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)>
HWY_API Vec32<float> LoadU(D /*tag*/, const float* HWY_RESTRICT p) {
  return Vec32<float>(vld1_dup_f32(p));
}

template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_NOT_SPECIAL_FLOAT_D(D),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
  const Repartition<uint32_t, decltype(d)> d32;
  uint32_t buf;
  CopyBytes<4>(p, &buf);
  return BitCast(d, LoadU(d32, &buf));
}

// ------------------------------ Load 16

// Actual 16-bit broadcast load - used to implement the other lane types
// because reinterpret_cast of the pointer leads to incorrect codegen on GCC.
template <class D, HWY_IF_LANES_D(D, 1), HWY_IF_U16_D(D)>
HWY_API VFromD<D> LoadU(D /* tag */, const uint16_t* HWY_RESTRICT p) {
  return VFromD<D>(vld1_dup_u16(p));
}
template <class D, HWY_IF_LANES_D(D, 1), HWY_IF_I16_D(D)>
HWY_API VFromD<D> LoadU(D /* tag */, const int16_t* HWY_RESTRICT p) {
  return VFromD<D>(vld1_dup_s16(p));
}

// 8-bit x2
template <class D, HWY_IF_LANES_D(D, 2), HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
  const Repartition<uint16_t, decltype(d)> d16;
  uint16_t buf;
  CopyBytes<2>(p, &buf);
  return BitCast(d, LoadU(d16, &buf));
}

// ------------------------------ Load 8
template <class D, HWY_IF_LANES_D(D, 1), HWY_IF_U8_D(D)>
HWY_API VFromD<D> LoadU(D /* tag */, const uint8_t* HWY_RESTRICT p) {
  return VFromD<D>(vld1_dup_u8(p));
}
template <class D, HWY_IF_LANES_D(D, 1), HWY_IF_I8_D(D)>
HWY_API VFromD<D> LoadU(D /* tag */, const int8_t* HWY_RESTRICT p) {
  return VFromD<D>(vld1_dup_s8(p));
}

// ------------------------------ Load misc

// [b]float16_t use the same Raw as uint16_t, so forward to that.
template <class D, HWY_IF_F16_D(D)>
HWY_API VFromD<D> LoadU(D d, const float16_t* HWY_RESTRICT p) {
  const RebindToUnsigned<decltype(d)> du16;
  const auto pu16 = reinterpret_cast<const uint16_t*>(p);
  return VFromD<D>(LoadU(du16, pu16).raw);
}
template <class D, HWY_IF_BF16_D(D)>
HWY_API VFromD<D> LoadU(D d, const bfloat16_t* HWY_RESTRICT p) {
  const RebindToUnsigned<decltype(d)> du16;
  const auto pu16 = reinterpret_cast<const uint16_t*>(p);
  return VFromD<D>(LoadU(du16, pu16).raw);
}

// On Arm, Load is the same as LoadU.
template <class D>
HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT p) {
  return LoadU(d, p);
}

template <class D>
HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d,
                             const TFromD<D>* HWY_RESTRICT aligned) {
  return IfThenElseZero(m, Load(d, aligned));
}

template <class D>
HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d,
                               const TFromD<D>* HWY_RESTRICT aligned) {
  return IfThenElse(m, Load(d, aligned), v);
}

// 128-bit SIMD => nothing to duplicate, same as an unaligned load.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* HWY_RESTRICT p) {
  return LoadU(d, p);
}

// ------------------------------ Store 128

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
HWY_API void StoreU(Vec128<uint8_t> v, D /* tag */,
                    uint8_t* HWY_RESTRICT unaligned) {
  vst1q_u8(unaligned, v.raw);
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
HWY_API void StoreU(Vec128<uint16_t> v, D /* tag */,
                    uint16_t* HWY_RESTRICT unaligned) {
  vst1q_u16(unaligned, v.raw);
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
HWY_API void StoreU(Vec128<uint32_t> v, D /* tag */,
                    uint32_t* HWY_RESTRICT unaligned) {
  vst1q_u32(unaligned, v.raw);
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U64_D(D)>
HWY_API void StoreU(Vec128<uint64_t> v, D /* tag */,
                    uint64_t* HWY_RESTRICT unaligned) {
  vst1q_u64(unaligned, v.raw);
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I8_D(D)>
HWY_API void StoreU(Vec128<int8_t> v, D /* tag */,
                    int8_t* HWY_RESTRICT unaligned) {
  vst1q_s8(unaligned, v.raw);
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I16_D(D)>
HWY_API void StoreU(Vec128<int16_t> v, D /* tag */,
                    int16_t* HWY_RESTRICT unaligned) {
  vst1q_s16(unaligned, v.raw);
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)>
HWY_API void StoreU(Vec128<int32_t> v, D /* tag */,
                    int32_t* HWY_RESTRICT unaligned) {
  vst1q_s32(unaligned, v.raw);
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I64_D(D)>
HWY_API void StoreU(Vec128<int64_t> v, D /* tag */,
                    int64_t* HWY_RESTRICT unaligned) {
  vst1q_s64(unaligned, v.raw);
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
HWY_API void StoreU(Vec128<float> v, D /* tag */,
                    float* HWY_RESTRICT unaligned) {
  vst1q_f32(unaligned, v.raw);
}
#if HWY_ARCH_ARM_A64
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
HWY_API void StoreU(Vec128<double> v, D /* tag */,
                    double* HWY_RESTRICT unaligned) {
  vst1q_f64(unaligned, v.raw);
}
#endif

// ------------------------------ Store 64

template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)>
HWY_API void StoreU(Vec64<uint8_t> v, D /* tag */, uint8_t* HWY_RESTRICT p) {
  vst1_u8(p, v.raw);
}
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U16_D(D)>
HWY_API void StoreU(Vec64<uint16_t> v, D /* tag */, uint16_t* HWY_RESTRICT p) {
  vst1_u16(p, v.raw);
}
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U32_D(D)>
HWY_API void StoreU(Vec64<uint32_t> v, D /* tag */, uint32_t* HWY_RESTRICT p) {
  vst1_u32(p, v.raw);
}
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U64_D(D)>
HWY_API void StoreU(Vec64<uint64_t> v, D /* tag */, uint64_t* HWY_RESTRICT p) {
  vst1_u64(p, v.raw);
}
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I8_D(D)>
HWY_API void StoreU(Vec64<int8_t> v, D /* tag */, int8_t* HWY_RESTRICT p) {
  vst1_s8(p, v.raw);
}
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I16_D(D)>
HWY_API void StoreU(Vec64<int16_t> v, D /* tag */, int16_t* HWY_RESTRICT p) {
  vst1_s16(p, v.raw);
}
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I32_D(D)>
HWY_API void StoreU(Vec64<int32_t> v, D /* tag */, int32_t* HWY_RESTRICT p) {
  vst1_s32(p, v.raw);
}
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I64_D(D)>
HWY_API void StoreU(Vec64<int64_t> v, D /* tag */, int64_t* HWY_RESTRICT p) {
  vst1_s64(p, v.raw);
}
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
HWY_API void StoreU(Vec64<float> v, D /* tag */, float* HWY_RESTRICT p) {
  vst1_f32(p, v.raw);
}
#if HWY_ARCH_ARM_A64
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F64_D(D)>
HWY_API void StoreU(Vec64<double> v, D /* tag */, double* HWY_RESTRICT p) {
  vst1_f64(p, v.raw);
}
#endif

// ------------------------------ Store 32

template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U32_D(D)>
HWY_API void StoreU(Vec32<uint32_t> v, D, uint32_t* HWY_RESTRICT p) {
  vst1_lane_u32(p, v.raw, 0);
}
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I32_D(D)>
HWY_API void StoreU(Vec32<int32_t> v, D, int32_t* HWY_RESTRICT p) {
  vst1_lane_s32(p, v.raw, 0);
}
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)>
HWY_API void StoreU(Vec32<float> v, D, float* HWY_RESTRICT p) {
  vst1_lane_f32(p, v.raw, 0);
}

// Overload 16-bit types directly to avoid ambiguity with [b]float16_t.
template <class D, HWY_IF_V_SIZE_D(D, 4), typename T = TFromD<D>,
          HWY_IF_T_SIZE(T, 1)>
HWY_API void StoreU(Vec32<T> v, D d, T* HWY_RESTRICT p) {
  Repartition<uint32_t, decltype(d)> d32;
  uint32_t buf = GetLane(BitCast(d32, v));
  CopyBytes<4>(&buf, p);
}

template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U16_D(D)>
HWY_API void StoreU(Vec32<uint16_t> v, D d, uint16_t* HWY_RESTRICT p) {
  Repartition<uint32_t, decltype(d)> d32;
  uint32_t buf = GetLane(BitCast(d32, v));
  CopyBytes<4>(&buf, p);
}

template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I16_D(D)>
HWY_API void StoreU(Vec32<int16_t> v, D d, int16_t* HWY_RESTRICT p) {
  Repartition<uint32_t, decltype(d)> d32;
  uint32_t buf = GetLane(BitCast(d32, v));
  CopyBytes<4>(&buf, p);
}

// ------------------------------ Store 16

template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_U16_D(D)>
HWY_API void StoreU(Vec16<uint16_t> v, D, uint16_t* HWY_RESTRICT p) {
  vst1_lane_u16(p, v.raw, 0);
}
template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_I16_D(D)>
HWY_API void StoreU(Vec16<int16_t> v, D, int16_t* HWY_RESTRICT p) {
  vst1_lane_s16(p, v.raw, 0);
}

template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_T_SIZE_D(D, 1)>
HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
  const Repartition<uint16_t, decltype(d)> d16;
  const uint16_t buf = GetLane(BitCast(d16, v));
  CopyBytes<2>(&buf, p);
}

// ------------------------------ Store 8

template <class D, HWY_IF_V_SIZE_D(D, 1), HWY_IF_U8_D(D)>
HWY_API void StoreU(Vec128<uint8_t, 1> v, D, uint8_t* HWY_RESTRICT p) {
  vst1_lane_u8(p, v.raw, 0);
}
template <class D, HWY_IF_V_SIZE_D(D, 1), HWY_IF_I8_D(D)>
HWY_API void StoreU(Vec128<int8_t, 1> v, D, int8_t* HWY_RESTRICT p) {
  vst1_lane_s8(p, v.raw, 0);
}

// [b]float16_t use the same Raw as uint16_t, so forward to that.
template <class D, HWY_IF_F16_D(D)>
HWY_API void StoreU(VFromD<D> v, D d, float16_t* HWY_RESTRICT p) {
  const RebindToUnsigned<decltype(d)> du16;
  const auto pu16 = reinterpret_cast<uint16_t*>(p);
  return StoreU(BitCast(du16, v), du16, pu16);
}
template <class D, HWY_IF_BF16_D(D)>
HWY_API void StoreU(VFromD<D> v, D d, bfloat16_t* HWY_RESTRICT p) {
  const RebindToUnsigned<decltype(d)> du16;
  const auto pu16 = reinterpret_cast<uint16_t*>(p);
  return StoreU(BitCast(du16, v), du16, pu16);
}

HWY_DIAGNOSTICS(push)
#if HWY_COMPILER_GCC_ACTUAL
HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wmaybe-uninitialized")
#endif

// On Arm, Store is the same as StoreU.
template <class D>
HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) {
  StoreU(v, d, aligned);
}

HWY_DIAGNOSTICS(pop)

template <class D>
HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d,
                          TFromD<D>* HWY_RESTRICT p) {
  // Treat as unsigned so that we correctly support float16.
  const RebindToUnsigned<decltype(d)> du;
  const auto blended =
      IfThenElse(RebindMask(du, m), BitCast(du, v), BitCast(du, LoadU(d, p)));
  StoreU(BitCast(d, blended), d, p);
}

// ------------------------------ Non-temporal stores

// Same as aligned stores on non-x86.

template <class D>
HWY_API void Stream(const VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) {
#if HWY_ARCH_ARM_A64
#if HWY_COMPILER_GCC
  __builtin_prefetch(aligned, 1, 0);
#elif HWY_COMPILER_MSVC
  __prefetch2(aligned, 0x11);
#endif
#endif
  Store(v, d, aligned);
}

// ================================================== CONVERT

// ------------------------------ ConvertTo

template <class D, HWY_IF_F32_D(D)>
HWY_API Vec128<float> ConvertTo(D /* tag */, Vec128<int32_t> v) {
  return Vec128<float>(vcvtq_f32_s32(v.raw));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)>
HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<RebindToSigned<D>> v) {
  return VFromD<D>(vcvt_f32_s32(v.raw));
}

template <class D, HWY_IF_F32_D(D)>
HWY_API Vec128<float> ConvertTo(D /* tag */, Vec128<uint32_t> v) {
  return Vec128<float>(vcvtq_f32_u32(v.raw));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)>
HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<RebindToUnsigned<D>> v) {
  return VFromD<D>(vcvt_f32_u32(v.raw));
}

// Truncates (rounds toward zero).
template <class D, HWY_IF_I32_D(D)>
HWY_API Vec128<int32_t> ConvertTo(D /* tag */, Vec128<float> v) {
  return Vec128<int32_t>(vcvtq_s32_f32(v.raw));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<RebindToFloat<D>> v) {
  return VFromD<D>(vcvt_s32_f32(v.raw));
}

#if HWY_ARCH_ARM_A64

template <class D, HWY_IF_F64_D(D)>
HWY_API Vec128<double> ConvertTo(D /* tag */, Vec128<int64_t> v) {
  return Vec128<double>(vcvtq_f64_s64(v.raw));
}
template <class D, HWY_IF_F64_D(D)>
HWY_API Vec64<double> ConvertTo(D /* tag */, Vec64<int64_t> v) {
// GCC 6.5 and earlier are missing the 64-bit (non-q) intrinsic.
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
  return Set(Full64<double>(), static_cast<double>(GetLane(v)));
#else
  return Vec64<double>(vcvt_f64_s64(v.raw));
#endif  // HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
}

template <class D, HWY_IF_F64_D(D)>
HWY_API Vec128<double> ConvertTo(D /* tag */, Vec128<uint64_t> v) {
  return Vec128<double>(vcvtq_f64_u64(v.raw));
}
template <class D, HWY_IF_F64_D(D)>
HWY_API Vec64<double> ConvertTo(D /* tag */, Vec64<uint64_t> v) {
  return Vec64<double>(vcvt_f64_u64(v.raw));
}

// Truncates (rounds toward zero).
template <class D, HWY_IF_I64_D(D)>
HWY_API Vec128<int64_t> ConvertTo(D /* tag */, Vec128<double> v) {
  return Vec128<int64_t>(vcvtq_s64_f64(v.raw));
}
template <class D, HWY_IF_I64_D(D)>
HWY_API Vec64<int64_t> ConvertTo(D di, Vec64<double> v) {
  // GCC 6.5 and earlier are missing the 64-bit (non-q) intrinsic. Use the
  // 128-bit version to avoid UB from casting double -> int64_t.
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
  const Full128<double> ddt;
  const Twice<decltype(di)> dit;
  return LowerHalf(di, ConvertTo(dit, Combine(ddt, v, v)));
#else
  (void)di;
  return Vec64<int64_t>(vcvt_s64_f64(v.raw));
#endif
}

#endif

// ------------------------------ PromoteTo (ConvertTo)

// Unsigned: zero-extend to full vector.
template <class D, HWY_IF_U16_D(D)>
HWY_API Vec128<uint16_t> PromoteTo(D /* tag */, Vec64<uint8_t> v) {
  return Vec128<uint16_t>(vmovl_u8(v.raw));
}
template <class D, HWY_IF_U32_D(D)>
HWY_API Vec128<uint32_t> PromoteTo(D /* tag */, Vec32<uint8_t> v) {
  uint16x8_t a = vmovl_u8(v.raw);
  return Vec128<uint32_t>(vmovl_u16(vget_low_u16(a)));
}
template <class D, HWY_IF_U32_D(D)>
HWY_API Vec128<uint32_t> PromoteTo(D /* tag */, Vec64<uint16_t> v) {
  return Vec128<uint32_t>(vmovl_u16(v.raw));
}
template <class D, HWY_IF_U64_D(D)>
HWY_API Vec128<uint64_t> PromoteTo(D /* tag */, Vec64<uint32_t> v) {
  return Vec128<uint64_t>(vmovl_u32(v.raw));
}
template <class D, HWY_IF_I16_D(D)>
HWY_API Vec128<int16_t> PromoteTo(D d, Vec64<uint8_t> v) {
  return BitCast(d, Vec128<uint16_t>(vmovl_u8(v.raw)));
}
template <class D, HWY_IF_I32_D(D)>
HWY_API Vec128<int32_t> PromoteTo(D d, Vec32<uint8_t> v) {
  uint16x8_t a = vmovl_u8(v.raw);
  return BitCast(d, Vec128<uint32_t>(vmovl_u16(vget_low_u16(a))));
}
template <class D, HWY_IF_I32_D(D)>
HWY_API Vec128<int32_t> PromoteTo(D d, Vec64<uint16_t> v) {
  return BitCast(d, Vec128<uint32_t>(vmovl_u16(v.raw)));
}
template <class D, HWY_IF_I64_D(D)>
HWY_API Vec128<int64_t> PromoteTo(D d, Vec64<uint32_t> v) {
  return BitCast(d, Vec128<uint64_t>(vmovl_u32(v.raw)));
}

// Unsigned: zero-extend to half vector.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint8_t, D>> v) {
  return VFromD<D>(vget_low_u16(vmovl_u8(v.raw)));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint8_t, D>> v) {
  return VFromD<D>(vget_low_u32(vmovl_u16(vget_low_u16(vmovl_u8(v.raw)))));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) {
  return VFromD<D>(vget_low_u32(vmovl_u16(v.raw)));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U64_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) {
  return VFromD<D>(vget_low_u64(vmovl_u32(v.raw)));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I16_D(D)>
HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint8_t, D>> v) {
  using VU16 = VFromD<RebindToUnsigned<D>>;
  return BitCast(d, VU16(vget_low_u16(vmovl_u8(v.raw))));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint8_t, D>> v) {
  const uint32x4_t u32 = vmovl_u16(vget_low_u16(vmovl_u8(v.raw)));
  return VFromD<D>(vget_low_s32(vreinterpretq_s32_u32(u32)));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) {
  return VFromD<D>(vget_low_s32(vreinterpretq_s32_u32(vmovl_u16(v.raw))));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I64_D(D)>
HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint32_t, D>> v) {
  using DU = RebindToUnsigned<D>;
  return BitCast(d, VFromD<DU>(vget_low_u64(vmovl_u32(v.raw))));
}

// U8/U16 to U64/I64: First, zero-extend to U32, and then zero-extend to
// TFromD<D>
template <class D, class V, HWY_IF_UI64_D(D),
          HWY_IF_LANES_D(D, HWY_MAX_LANES_V(V)), HWY_IF_UNSIGNED_V(V),
          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))>
HWY_API VFromD<D> PromoteTo(D d, V v) {
  const Rebind<uint32_t, decltype(d)> du32;
  return PromoteTo(d, PromoteTo(du32, v));
}

// Signed: replicate sign bit to full vector.
template <class D, HWY_IF_I16_D(D)>
HWY_API Vec128<int16_t> PromoteTo(D /* tag */, Vec64<int8_t> v) {
  return Vec128<int16_t>(vmovl_s8(v.raw));
}
template <class D, HWY_IF_I32_D(D)>
HWY_API Vec128<int32_t> PromoteTo(D /* tag */, Vec32<int8_t> v) {
  int16x8_t a = vmovl_s8(v.raw);
  return Vec128<int32_t>(vmovl_s16(vget_low_s16(a)));
}
template <class D, HWY_IF_I32_D(D)>
HWY_API Vec128<int32_t> PromoteTo(D /* tag */, Vec64<int16_t> v) {
  return Vec128<int32_t>(vmovl_s16(v.raw));
}
template <class D, HWY_IF_I64_D(D)>
HWY_API Vec128<int64_t> PromoteTo(D /* tag */, Vec64<int32_t> v) {
  return Vec128<int64_t>(vmovl_s32(v.raw));
}

// Signed: replicate sign bit to half vector.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I16_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int8_t, D>> v) {
  return VFromD<D>(vget_low_s16(vmovl_s8(v.raw)));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int8_t, D>> v) {
  return VFromD<D>(vget_low_s32(vmovl_s16(vget_low_s16(vmovl_s8(v.raw)))));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) {
  return VFromD<D>(vget_low_s32(vmovl_s16(v.raw)));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I64_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
  return VFromD<D>(vget_low_s64(vmovl_s32(v.raw)));
}

// I8/I16 to I64: First, promote to I32, and then promote to I64
template <class D, class V, HWY_IF_I64_D(D),
          HWY_IF_LANES_D(D, HWY_MAX_LANES_V(V)), HWY_IF_SIGNED_V(V),
          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))>
HWY_API VFromD<D> PromoteTo(D d, V v) {
  const Rebind<int32_t, decltype(d)> di32;
  return PromoteTo(d, PromoteTo(di32, v));
}

#if __ARM_FP & 2

template <class D, HWY_IF_F32_D(D)>
HWY_API Vec128<float> PromoteTo(D /* tag */, Vec64<float16_t> v) {
  return Vec128<float>(vcvt_f32_f16(vreinterpret_f16_u16(v.raw)));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<float16_t, D>> v) {
  return VFromD<D>(vget_low_f32(vcvt_f32_f16(vreinterpret_f16_u16(v.raw))));
}

#else

template <class D, HWY_IF_F32_D(D)>
HWY_API VFromD<D> PromoteTo(D df32, VFromD<Rebind<float16_t, D>> v) {
  const Rebind<uint16_t, decltype(df32)> du16;
  const RebindToSigned<decltype(df32)> di32;
  const RebindToUnsigned<decltype(df32)> du32;
  using VU32 = VFromD<decltype(du32)>;
  // Expand to u32 so we can shift.
  const VU32 bits16 = PromoteTo(du32, BitCast(du16, v));
  const VU32 sign = ShiftRight<15>(bits16);
  const VU32 biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F);
  const VU32 mantissa = bits16 & Set(du32, 0x3FF);
  const VU32 subnormal =
      BitCast(du32, ConvertTo(df32, BitCast(di32, mantissa)) *
                        Set(df32, 1.0f / 16384 / 1024));

  const VU32 biased_exp32 = biased_exp + Set(du32, 127 - 15);
  const VU32 mantissa32 = ShiftLeft<23 - 10>(mantissa);
  const VU32 normal = ShiftLeft<23>(biased_exp32) | mantissa32;
  const VU32 bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal);
  return BitCast(df32, ShiftLeft<31>(sign) | bits32);
}

#endif

#if HWY_ARCH_ARM_A64

template <class D, HWY_IF_F64_D(D)>
HWY_API Vec128<double> PromoteTo(D /* tag */, Vec64<float> v) {
  return Vec128<double>(vcvt_f64_f32(v.raw));
}

template <class D, HWY_IF_F64_D(D)>
HWY_API Vec64<double> PromoteTo(D /* tag */, Vec32<float> v) {
  return Vec64<double>(vget_low_f64(vcvt_f64_f32(v.raw)));
}

template <class D, HWY_IF_F64_D(D)>
HWY_API Vec128<double> PromoteTo(D /* tag */, Vec64<int32_t> v) {
  const int64x2_t i64 = vmovl_s32(v.raw);
  return Vec128<double>(vcvtq_f64_s64(i64));
}

template <class D, HWY_IF_F64_D(D)>
HWY_API Vec64<double> PromoteTo(D d, Vec32<int32_t> v) {
  return ConvertTo(d, Vec64<int64_t>(vget_low_s64(vmovl_s32(v.raw))));
}

#endif

// ------------------------------ DemoteTo (ConvertTo)

// From full vector to half or quarter
template <class D, HWY_IF_U16_D(D)>
HWY_API Vec64<uint16_t> DemoteTo(D /* tag */, Vec128<int32_t> v) {
  return Vec64<uint16_t>(vqmovun_s32(v.raw));
}
template <class D, HWY_IF_I16_D(D)>
HWY_API Vec64<int16_t> DemoteTo(D /* tag */, Vec128<int32_t> v) {
  return Vec64<int16_t>(vqmovn_s32(v.raw));
}
template <class D, HWY_IF_U8_D(D)>
HWY_API Vec32<uint8_t> DemoteTo(D /* tag */, Vec128<int32_t> v) {
  const uint16x4_t a = vqmovun_s32(v.raw);
  return Vec32<uint8_t>(vqmovn_u16(vcombine_u16(a, a)));
}
template <class D, HWY_IF_U8_D(D)>
HWY_API Vec64<uint8_t> DemoteTo(D /* tag */, Vec128<int16_t> v) {
  return Vec64<uint8_t>(vqmovun_s16(v.raw));
}
template <class D, HWY_IF_I8_D(D)>
HWY_API Vec32<int8_t> DemoteTo(D /* tag */, Vec128<int32_t> v) {
  const int16x4_t a = vqmovn_s32(v.raw);
  return Vec32<int8_t>(vqmovn_s16(vcombine_s16(a, a)));
}
template <class D, HWY_IF_I8_D(D)>
HWY_API Vec64<int8_t> DemoteTo(D /* tag */, Vec128<int16_t> v) {
  return Vec64<int8_t>(vqmovn_s16(v.raw));
}
template <class D, HWY_IF_U16_D(D)>
HWY_API Vec64<uint16_t> DemoteTo(D /* tag */, Vec128<uint32_t> v) {
  return Vec64<uint16_t>(vqmovn_u32(v.raw));
}
template <class D, HWY_IF_U8_D(D)>
HWY_API Vec32<uint8_t> DemoteTo(D /* tag */, Vec128<uint32_t> v) {
  const uint16x4_t a = vqmovn_u32(v.raw);
  return Vec32<uint8_t>(vqmovn_u16(vcombine_u16(a, a)));
}
template <class D, HWY_IF_U8_D(D)>
HWY_API Vec64<uint8_t> DemoteTo(D /* tag */, Vec128<uint16_t> v) {
  return Vec64<uint8_t>(vqmovn_u16(v.raw));
}

// From half vector to partial half
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U16_D(D)>
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
  return VFromD<D>(vqmovun_s32(vcombine_s32(v.raw, v.raw)));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_I16_D(D)>
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
  return VFromD<D>(vqmovn_s32(vcombine_s32(v.raw, v.raw)));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 2), HWY_IF_U8_D(D)>
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
  const uint16x4_t a = vqmovun_s32(vcombine_s32(v.raw, v.raw));
  return VFromD<D>(vqmovn_u16(vcombine_u16(a, a)));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U8_D(D)>
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) {
  return VFromD<D>(vqmovun_s16(vcombine_s16(v.raw, v.raw)));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 2), HWY_IF_I8_D(D)>
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
  const int16x4_t a = vqmovn_s32(vcombine_s32(v.raw, v.raw));
  return VFromD<D>(vqmovn_s16(vcombine_s16(a, a)));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_I8_D(D)>
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) {
  return VFromD<D>(vqmovn_s16(vcombine_s16(v.raw, v.raw)));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U16_D(D)>
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) {
  return VFromD<D>(vqmovn_u32(vcombine_u32(v.raw, v.raw)));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 2), HWY_IF_U8_D(D)>
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) {
  const uint16x4_t a = vqmovn_u32(vcombine_u32(v.raw, v.raw));
  return VFromD<D>(vqmovn_u16(vcombine_u16(a, a)));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U8_D(D)>
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) {
  return VFromD<D>(vqmovn_u16(vcombine_u16(v.raw, v.raw)));
}

template <class D, HWY_IF_I32_D(D)>
HWY_API Vec64<int32_t> DemoteTo(D /* tag */, Vec128<int64_t> v) {
  return Vec64<int32_t>(vqmovn_s64(v.raw));
}
template <class D, HWY_IF_U32_D(D)>
HWY_API Vec64<uint32_t> DemoteTo(D /* tag */, Vec128<int64_t> v) {
  return Vec64<uint32_t>(vqmovun_s64(v.raw));
}
template <class D, HWY_IF_U32_D(D)>
HWY_API Vec64<uint32_t> DemoteTo(D /* tag */, Vec128<uint64_t> v) {
  return Vec64<uint32_t>(vqmovn_u64(v.raw));
}
template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
          HWY_IF_SIGNED_D(D)>
HWY_API VFromD<D> DemoteTo(D d, Vec128<uint64_t> v) {
  const Rebind<int32_t, D> di32;
  return DemoteTo(d, DemoteTo(di32, v));
}
template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
          HWY_IF_UNSIGNED_D(D)>
HWY_API VFromD<D> DemoteTo(D d, Vec128<int64_t> v) {
  const Rebind<uint32_t, D> du32;
  return DemoteTo(d, DemoteTo(du32, v));
}
template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
          HWY_IF_UNSIGNED_D(D)>
HWY_API VFromD<D> DemoteTo(D d, Vec128<uint64_t> v) {
  const Rebind<uint32_t, D> du32;
  return DemoteTo(d, DemoteTo(du32, v));
}

template <class D, HWY_IF_I32_D(D)>
HWY_API Vec32<int32_t> DemoteTo(D /* tag */, Vec64<int64_t> v) {
  return Vec32<int32_t>(vqmovn_s64(vcombine_s64(v.raw, v.raw)));
}
template <class D, HWY_IF_U32_D(D)>
HWY_API Vec32<uint32_t> DemoteTo(D /* tag */, Vec64<int64_t> v) {
  return Vec32<uint32_t>(vqmovun_s64(vcombine_s64(v.raw, v.raw)));
}
template <class D, HWY_IF_U32_D(D)>
HWY_API Vec32<uint32_t> DemoteTo(D /* tag */, Vec64<uint64_t> v) {
  return Vec32<uint32_t>(vqmovn_u64(vcombine_u64(v.raw, v.raw)));
}
template <class D, HWY_IF_SIGNED_D(D),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
HWY_API VFromD<D> DemoteTo(D d, Vec64<int64_t> v) {
  const Rebind<int32_t, D> di32;
  return DemoteTo(d, DemoteTo(di32, v));
}
template <class D, HWY_IF_UNSIGNED_D(D),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
HWY_API VFromD<D> DemoteTo(D d, Vec64<int64_t> v) {
  const Rebind<uint32_t, D> du32;
  return DemoteTo(d, DemoteTo(du32, v));
}
template <class D, HWY_IF_LANES_D(D, 1), HWY_IF_UNSIGNED_D(D),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
HWY_API VFromD<D> DemoteTo(D d, Vec64<uint64_t> v) {
  const Rebind<uint32_t, D> du32;
  return DemoteTo(d, DemoteTo(du32, v));
}

#if __ARM_FP & 2

template <class D, HWY_IF_F16_D(D)>
HWY_API Vec64<float16_t> DemoteTo(D /* tag */, Vec128<float> v) {
  return Vec64<float16_t>{vreinterpret_u16_f16(vcvt_f16_f32(v.raw))};
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_F16_D(D)>
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<float, D>> v) {
  const float16x4_t f16 = vcvt_f16_f32(vcombine_f32(v.raw, v.raw));
  return VFromD<D>(vreinterpret_u16_f16(f16));
}

#else

template <class D, HWY_IF_F16_D(D)>
HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<float, D>> v) {
  const RebindToUnsigned<decltype(df16)> du16;
  const Rebind<uint32_t, decltype(du16)> du;
  const RebindToSigned<decltype(du)> di;
  const auto bits32 = BitCast(du, v);
  const auto sign = ShiftRight<31>(bits32);
  const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF);
  const auto mantissa32 = bits32 & Set(du, 0x7FFFFF);

  const auto k15 = Set(di, 15);
  const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15);
  const auto is_tiny = exp < Set(di, -24);

  const auto is_subnormal = exp < Set(di, -14);
  const auto biased_exp16 =
      BitCast(du, IfThenZeroElse(is_subnormal, exp + k15));
  const auto sub_exp = BitCast(du, Set(di, -14) - exp);  // [1, 11)
  const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) +
                     (mantissa32 >> (Set(du, 13) + sub_exp));
  const auto mantissa16 = IfThenElse(RebindMask(du, is_subnormal), sub_m,
                                     ShiftRight<13>(mantissa32));  // <1024

  const auto sign16 = ShiftLeft<15>(sign);
  const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
  const auto bits16 = IfThenZeroElse(is_tiny, BitCast(di, normal16));
  return VFromD<D>(DemoteTo(du16, bits16).raw);
}

#endif

template <class D, HWY_IF_BF16_D(D)>
HWY_API VFromD<D> DemoteTo(D dbf16, VFromD<Rebind<float, D>> v) {
  const Rebind<int32_t, decltype(dbf16)> di32;
  const Rebind<uint32_t, decltype(dbf16)> du32;  // for logical shift right
  const Rebind<uint16_t, decltype(dbf16)> du16;
  const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
  return BitCast(dbf16, DemoteTo(du16, bits_in_32));
}

#if HWY_ARCH_ARM_A64

template <class D, HWY_IF_F32_D(D)>
HWY_API Vec64<float> DemoteTo(D /* tag */, Vec128<double> v) {
  return Vec64<float>(vcvt_f32_f64(v.raw));
}
template <class D, HWY_IF_F32_D(D)>
HWY_API Vec32<float> DemoteTo(D /* tag */, Vec64<double> v) {
  return Vec32<float>(vcvt_f32_f64(vcombine_f64(v.raw, v.raw)));
}

template <class D, HWY_IF_I32_D(D)>
HWY_API Vec64<int32_t> DemoteTo(D /* tag */, Vec128<double> v) {
  const int64x2_t i64 = vcvtq_s64_f64(v.raw);
  return Vec64<int32_t>(vqmovn_s64(i64));
}
template <class D, HWY_IF_I32_D(D)>
HWY_API Vec32<int32_t> DemoteTo(D /* tag */, Vec64<double> v) {
  // There is no i64x1 -> i32x1 narrow, so Combine to 128-bit. Do so with the
  // f64 input already to also avoid the missing vcvt_s64_f64 in GCC 6.4.
  const Full128<double> ddt;
  const Full128<int64_t> dit;
  return Vec32<int32_t>(vqmovn_s64(ConvertTo(dit, Combine(ddt, v, v)).raw));
}

#endif

HWY_API Vec32<uint8_t> U8FromU32(Vec128<uint32_t> v) {
  const uint8x16_t org_v = detail::BitCastToByte(v).raw;
  const uint8x16_t w = vuzp1q_u8(org_v, org_v);
  return Vec32<uint8_t>(vget_low_u8(vuzp1q_u8(w, w)));
}
template <size_t N, HWY_IF_V_SIZE_LE(uint32_t, N, 8)>
HWY_API Vec128<uint8_t, N> U8FromU32(Vec128<uint32_t, N> v) {
  const uint8x8_t org_v = detail::BitCastToByte(v).raw;
  const uint8x8_t w = vuzp1_u8(org_v, org_v);
  return Vec128<uint8_t, N>(vuzp1_u8(w, w));
}

// ------------------------------ Round (IfThenElse, mask, logical)

#if HWY_ARCH_ARM_A64
// Toward nearest integer
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Round, vrndn, _, 1)

// Toward zero, aka truncate
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Trunc, vrnd, _, 1)

// Toward +infinity, aka ceiling
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Ceil, vrndp, _, 1)

// Toward -infinity, aka floor
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Floor, vrndm, _, 1)
#else

// ------------------------------ Trunc

// Armv7 only supports truncation to integer. We can either convert back to
// float (3 floating-point and 2 logic operations) or manipulate the binary32
// representation, clearing the lowest 23-exp mantissa bits. This requires 9
// integer operations and 3 constants, which is likely more expensive.

namespace detail {

// The original value is already the desired result if NaN or the magnitude is
// large (i.e. the value is already an integer).
template <size_t N>
HWY_INLINE Mask128<float, N> UseInt(const Vec128<float, N> v) {
  return Abs(v) < Set(Simd<float, N, 0>(), MantissaEnd<float>());
}

}  // namespace detail

template <size_t N>
HWY_API Vec128<float, N> Trunc(const Vec128<float, N> v) {
  const DFromV<decltype(v)> df;
  const RebindToSigned<decltype(df)> di;

  const auto integer = ConvertTo(di, v);  // round toward 0
  const auto int_f = ConvertTo(df, integer);

  return IfThenElse(detail::UseInt(v), int_f, v);
}

template <size_t N>
HWY_API Vec128<float, N> Round(const Vec128<float, N> v) {
  const DFromV<decltype(v)> df;

  // Armv7 also lacks a native NearestInt, but we can instead rely on rounding
  // (we assume the current mode is nearest-even) after addition with a large
  // value such that no mantissa bits remain. We may need a compiler flag for
  // precise floating-point to prevent this from being "optimized" out.
  const auto max = Set(df, MantissaEnd<float>());
  const auto large = CopySignToAbs(max, v);
  const auto added = large + v;
  const auto rounded = added - large;

  // Keep original if NaN or the magnitude is large (already an int).
  return IfThenElse(Abs(v) < max, rounded, v);
}

template <size_t N>
HWY_API Vec128<float, N> Ceil(const Vec128<float, N> v) {
  const DFromV<decltype(v)> df;
  const RebindToSigned<decltype(df)> di;

  const auto integer = ConvertTo(di, v);  // round toward 0
  const auto int_f = ConvertTo(df, integer);

  // Truncating a positive non-integer ends up smaller; if so, add 1.
  const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f < v)));

  return IfThenElse(detail::UseInt(v), int_f - neg1, v);
}

template <size_t N>
HWY_API Vec128<float, N> Floor(const Vec128<float, N> v) {
  const DFromV<decltype(v)> df;
  const RebindToSigned<decltype(df)> di;

  const auto integer = ConvertTo(di, v);  // round toward 0
  const auto int_f = ConvertTo(df, integer);

  // Truncating a negative non-integer ends up larger; if so, subtract 1.
  const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f > v)));

  return IfThenElse(detail::UseInt(v), int_f + neg1, v);
}

#endif

// ------------------------------ NearestInt (Round)

#if HWY_ARCH_ARM_A64

HWY_API Vec128<int32_t> NearestInt(const Vec128<float> v) {
  return Vec128<int32_t>(vcvtnq_s32_f32(v.raw));
}
template <size_t N, HWY_IF_V_SIZE_LE(float, N, 8)>
HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
  return Vec128<int32_t, N>(vcvtn_s32_f32(v.raw));
}

#else

template <size_t N>
HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
  const RebindToSigned<DFromV<decltype(v)>> di;
  return ConvertTo(di, Round(v));
}

#endif

// ------------------------------ Floating-point classification
template <typename T, size_t N>
HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) {
  return v != v;
}

template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Mask128<T, N> IsInf(const Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;
  const VFromD<decltype(di)> vi = BitCast(di, v);
  // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
  return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2<T>())));
}

// Returns whether normal/subnormal/zero.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Mask128<T, N> IsFinite(const Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const RebindToSigned<decltype(d)> di;  // cheaper than unsigned comparison
  const VFromD<decltype(du)> vu = BitCast(du, v);
  // 'Shift left' to clear the sign bit, then right so we can compare with the
  // max exponent (cannot compare with MaxExponentTimes2 directly because it is
  // negative and non-negative floats would be greater).
  const VFromD<decltype(di)> exp =
      BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu)));
  return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
}

// ================================================== SWIZZLE

// ------------------------------ LowerHalf

// <= 64 bit: just return different type
template <typename T, size_t N, HWY_IF_V_SIZE_LE(uint8_t, N, 8)>
HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
  return Vec128<T, N / 2>(v.raw);
}

HWY_API Vec64<uint8_t> LowerHalf(Vec128<uint8_t> v) {
  return Vec64<uint8_t>(vget_low_u8(v.raw));
}
HWY_API Vec64<uint16_t> LowerHalf(Vec128<uint16_t> v) {
  return Vec64<uint16_t>(vget_low_u16(v.raw));
}
HWY_API Vec64<uint32_t> LowerHalf(Vec128<uint32_t> v) {
  return Vec64<uint32_t>(vget_low_u32(v.raw));
}
HWY_API Vec64<uint64_t> LowerHalf(Vec128<uint64_t> v) {
  return Vec64<uint64_t>(vget_low_u64(v.raw));
}
HWY_API Vec64<int8_t> LowerHalf(Vec128<int8_t> v) {
  return Vec64<int8_t>(vget_low_s8(v.raw));
}
HWY_API Vec64<int16_t> LowerHalf(Vec128<int16_t> v) {
  return Vec64<int16_t>(vget_low_s16(v.raw));
}
HWY_API Vec64<int32_t> LowerHalf(Vec128<int32_t> v) {
  return Vec64<int32_t>(vget_low_s32(v.raw));
}
HWY_API Vec64<int64_t> LowerHalf(Vec128<int64_t> v) {
  return Vec64<int64_t>(vget_low_s64(v.raw));
}
HWY_API Vec64<float> LowerHalf(Vec128<float> v) {
  return Vec64<float>(vget_low_f32(v.raw));
}
#if HWY_ARCH_ARM_A64
HWY_API Vec64<double> LowerHalf(Vec128<double> v) {
  return Vec64<double>(vget_low_f64(v.raw));
}
#endif
HWY_API Vec64<bfloat16_t> LowerHalf(Vec128<bfloat16_t> v) {
  const Full128<uint16_t> du;
  const Full64<bfloat16_t> dbh;
  return BitCast(dbh, LowerHalf(BitCast(du, v)));
}

template <class DH>
HWY_API VFromD<DH> LowerHalf(DH /* tag */, VFromD<Twice<DH>> v) {
  return LowerHalf(v);
}

// ------------------------------ CombineShiftRightBytes

// 128-bit
template <int kBytes, class D, typename T = TFromD<D>>
HWY_API Vec128<T> CombineShiftRightBytes(D d, Vec128<T> hi, Vec128<T> lo) {
  static_assert(0 < kBytes && kBytes < 16, "kBytes must be in [1, 15]");
  const Repartition<uint8_t, decltype(d)> d8;
  uint8x16_t v8 = vextq_u8(BitCast(d8, lo).raw, BitCast(d8, hi).raw, kBytes);
  return BitCast(d, Vec128<uint8_t>(v8));
}

// 64-bit
template <int kBytes, class D, typename T = TFromD<D>>
HWY_API Vec64<T> CombineShiftRightBytes(D d, Vec64<T> hi, Vec64<T> lo) {
  static_assert(0 < kBytes && kBytes < 8, "kBytes must be in [1, 7]");
  const Repartition<uint8_t, decltype(d)> d8;
  uint8x8_t v8 = vext_u8(BitCast(d8, lo).raw, BitCast(d8, hi).raw, kBytes);
  return BitCast(d, VFromD<decltype(d8)>(v8));
}

// <= 32-bit defined after ShiftLeftBytes.

// ------------------------------ Shift vector by constant #bytes

namespace detail {

// Partially specialize because kBytes = 0 and >= size are compile errors;
// callers replace the latter with 0xFF for easier specialization.
template <int kBytes>
struct ShiftLeftBytesT {
  // Full
  template <class T>
  HWY_INLINE Vec128<T> operator()(const Vec128<T> v) {
    const Full128<T> d;
    return CombineShiftRightBytes<16 - kBytes>(d, v, Zero(d));
  }

  // Partial
  template <class T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
  HWY_INLINE Vec128<T, N> operator()(const Vec128<T, N> v) {
    // Expand to 64-bit so we only use the native EXT instruction.
    const Full64<T> d64;
    const auto zero64 = Zero(d64);
    const decltype(zero64) v64(v.raw);
    return Vec128<T, N>(
        CombineShiftRightBytes<8 - kBytes>(d64, v64, zero64).raw);
  }
};
template <>
struct ShiftLeftBytesT<0> {
  template <class T, size_t N>
  HWY_INLINE Vec128<T, N> operator()(const Vec128<T, N> v) {
    return v;
  }
};
template <>
struct ShiftLeftBytesT<0xFF> {
  template <class T, size_t N>
  HWY_INLINE Vec128<T, N> operator()(const Vec128<T, N> v) {
    return Xor(v, v);
  }
};

template <int kBytes>
struct ShiftRightBytesT {
  template <class T, size_t N>
  HWY_INLINE Vec128<T, N> operator()(Vec128<T, N> v) {
    const DFromV<decltype(v)> d;
    // For < 64-bit vectors, zero undefined lanes so we shift in zeros.
    if (d.MaxBytes() < 8) {
      constexpr size_t kReg = d.MaxBytes() == 16 ? 16 : 8;
      const Simd<T, kReg / sizeof(T), 0> dreg;
      v = Vec128<T, N>(
          IfThenElseZero(FirstN(dreg, N), VFromD<decltype(dreg)>(v.raw)).raw);
    }
    return CombineShiftRightBytes<kBytes>(d, Zero(d), v);
  }
};
template <>
struct ShiftRightBytesT<0> {
  template <class T, size_t N>
  HWY_INLINE Vec128<T, N> operator()(const Vec128<T, N> v) {
    return v;
  }
};
template <>
struct ShiftRightBytesT<0xFF> {
  template <class T, size_t N>
  HWY_INLINE Vec128<T, N> operator()(const Vec128<T, N> v) {
    return Xor(v, v);
  }
};

}  // namespace detail

template <int kBytes, class D>
HWY_API VFromD<D> ShiftLeftBytes(D d, VFromD<D> v) {
  return detail::ShiftLeftBytesT<(kBytes >= d.MaxBytes() ? 0xFF : kBytes)>()(v);
}

template <int kBytes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftBytes(Vec128<T, N> v) {
  return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v);
}

template <int kLanes, class D>
HWY_API VFromD<D> ShiftLeftLanes(D d, VFromD<D> v) {
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, ShiftLeftBytes<kLanes * sizeof(TFromD<D>)>(BitCast(d8, v)));
}

template <int kLanes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftLanes(Vec128<T, N> v) {
  return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
}

// 0x01..0F, kBytes = 1 => 0x0001..0E
template <int kBytes, class D>
HWY_API VFromD<D> ShiftRightBytes(D d, VFromD<D> v) {
  return detail::ShiftRightBytesT<(kBytes >= d.MaxBytes() ? 0xFF : kBytes)>()(
      v);
}

template <int kLanes, class D>
HWY_API VFromD<D> ShiftRightLanes(D d, VFromD<D> v) {
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(
      d, ShiftRightBytes<kLanes * sizeof(TFromD<D>)>(d8, BitCast(d8, v)));
}

// Calls ShiftLeftBytes
template <int kBytes, class D, HWY_IF_V_SIZE_LE_D(D, 4)>
HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) {
  constexpr size_t kSize = d.MaxBytes();
  static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
  const Repartition<uint8_t, decltype(d)> d8;
  const Full64<uint8_t> d_full8;
  const Repartition<TFromD<D>, decltype(d_full8)> d_full;
  using V64 = VFromD<decltype(d_full8)>;
  const V64 hi64(BitCast(d8, hi).raw);
  // Move into most-significant bytes
  const V64 lo64 = ShiftLeftBytes<8 - kSize>(V64(BitCast(d8, lo).raw));
  const V64 r = CombineShiftRightBytes<8 - kSize + kBytes>(d_full8, hi64, lo64);
  // After casting to full 64-bit vector of correct type, shrink to 32-bit
  return VFromD<D>(BitCast(d_full, r).raw);
}

// ------------------------------ UpperHalf (ShiftRightBytes)

// Full input
template <class D, HWY_IF_U8_D(D)>
HWY_API Vec64<uint8_t> UpperHalf(D /* tag */, Vec128<uint8_t> v) {
  return Vec64<uint8_t>(vget_high_u8(v.raw));
}
template <class D, HWY_IF_U16_D(D)>
HWY_API Vec64<uint16_t> UpperHalf(D /* tag */, Vec128<uint16_t> v) {
  return Vec64<uint16_t>(vget_high_u16(v.raw));
}
template <class D, HWY_IF_U32_D(D)>
HWY_API Vec64<uint32_t> UpperHalf(D /* tag */, Vec128<uint32_t> v) {
  return Vec64<uint32_t>(vget_high_u32(v.raw));
}
template <class D, HWY_IF_U64_D(D)>
HWY_API Vec64<uint64_t> UpperHalf(D /* tag */, Vec128<uint64_t> v) {
  return Vec64<uint64_t>(vget_high_u64(v.raw));
}
template <class D, HWY_IF_I8_D(D)>
HWY_API Vec64<int8_t> UpperHalf(D /* tag */, Vec128<int8_t> v) {
  return Vec64<int8_t>(vget_high_s8(v.raw));
}
template <class D, HWY_IF_I16_D(D)>
HWY_API Vec64<int16_t> UpperHalf(D /* tag */, Vec128<int16_t> v) {
  return Vec64<int16_t>(vget_high_s16(v.raw));
}
template <class D, HWY_IF_I32_D(D)>
HWY_API Vec64<int32_t> UpperHalf(D /* tag */, Vec128<int32_t> v) {
  return Vec64<int32_t>(vget_high_s32(v.raw));
}
template <class D, HWY_IF_I64_D(D)>
HWY_API Vec64<int64_t> UpperHalf(D /* tag */, Vec128<int64_t> v) {
  return Vec64<int64_t>(vget_high_s64(v.raw));
}
template <class D, HWY_IF_F32_D(D)>
HWY_API Vec64<float> UpperHalf(D /* tag */, Vec128<float> v) {
  return Vec64<float>(vget_high_f32(v.raw));
}
#if HWY_ARCH_ARM_A64
template <class D, HWY_IF_F64_D(D)>
HWY_API Vec64<double> UpperHalf(D /* tag */, Vec128<double> v) {
  return Vec64<double>(vget_high_f64(v.raw));
}
#endif

template <class D, HWY_IF_BF16_D(D)>
HWY_API Vec64<bfloat16_t> UpperHalf(D dbh, Vec128<bfloat16_t> v) {
  const RebindToUnsigned<decltype(dbh)> duh;
  const Twice<decltype(duh)> du;
  return BitCast(dbh, UpperHalf(duh, BitCast(du, v)));
}

// Partial
template <class DH, HWY_IF_V_SIZE_LE_D(DH, 4)>
HWY_API VFromD<DH> UpperHalf(DH dh, VFromD<Twice<DH>> v) {
  const Twice<DH> d;
  const RebindToUnsigned<decltype(d)> du;
  const VFromD<decltype(du)> upper =
      ShiftRightBytes<dh.MaxBytes()>(du, BitCast(du, v));
  return VFromD<DH>(BitCast(d, upper).raw);
}

// ------------------------------ Broadcast/splat any lane

#if HWY_ARCH_ARM_A64
// Unsigned
template <int kLane>
HWY_API Vec128<uint16_t> Broadcast(Vec128<uint16_t> v) {
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
  return Vec128<uint16_t>(vdupq_laneq_u16(v.raw, kLane));
}
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(uint16_t, N, 8)>
HWY_API Vec128<uint16_t, N> Broadcast(Vec128<uint16_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<uint16_t, N>(vdup_lane_u16(v.raw, kLane));
}
template <int kLane>
HWY_API Vec128<uint32_t> Broadcast(Vec128<uint32_t> v) {
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  return Vec128<uint32_t>(vdupq_laneq_u32(v.raw, kLane));
}
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(uint32_t, N, 8)>
HWY_API Vec128<uint32_t, N> Broadcast(Vec128<uint32_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<uint32_t, N>(vdup_lane_u32(v.raw, kLane));
}
template <int kLane>
HWY_API Vec128<uint64_t> Broadcast(Vec128<uint64_t> v) {
  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
  return Vec128<uint64_t>(vdupq_laneq_u64(v.raw, kLane));
}
// Vec64<uint64_t> is defined below.

// Signed
template <int kLane>
HWY_API Vec128<int16_t> Broadcast(Vec128<int16_t> v) {
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
  return Vec128<int16_t>(vdupq_laneq_s16(v.raw, kLane));
}
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(int16_t, N, 8)>
HWY_API Vec128<int16_t, N> Broadcast(Vec128<int16_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<int16_t, N>(vdup_lane_s16(v.raw, kLane));
}
template <int kLane>
HWY_API Vec128<int32_t> Broadcast(Vec128<int32_t> v) {
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  return Vec128<int32_t>(vdupq_laneq_s32(v.raw, kLane));
}
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(int32_t, N, 8)>
HWY_API Vec128<int32_t, N> Broadcast(Vec128<int32_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<int32_t, N>(vdup_lane_s32(v.raw, kLane));
}
template <int kLane>
HWY_API Vec128<int64_t> Broadcast(Vec128<int64_t> v) {
  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
  return Vec128<int64_t>(vdupq_laneq_s64(v.raw, kLane));
}
// Vec64<int64_t> is defined below.

// Float
template <int kLane>
HWY_API Vec128<float> Broadcast(Vec128<float> v) {
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  return Vec128<float>(vdupq_laneq_f32(v.raw, kLane));
}
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(float, N, 8)>
HWY_API Vec128<float, N> Broadcast(Vec128<float, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<float, N>(vdup_lane_f32(v.raw, kLane));
}
template <int kLane>
HWY_API Vec128<double> Broadcast(Vec128<double> v) {
  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
  return Vec128<double>(vdupq_laneq_f64(v.raw, kLane));
}
template <int kLane>
HWY_API Vec64<double> Broadcast(Vec64<double> v) {
  static_assert(0 <= kLane && kLane < 1, "Invalid lane");
  return v;
}

#else
// No vdupq_laneq_* on armv7: use vgetq_lane_* + vdupq_n_*.

// Unsigned
template <int kLane>
HWY_API Vec128<uint16_t> Broadcast(Vec128<uint16_t> v) {
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
  return Vec128<uint16_t>(vdupq_n_u16(vgetq_lane_u16(v.raw, kLane)));
}
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(uint16_t, N, 8)>
HWY_API Vec128<uint16_t, N> Broadcast(Vec128<uint16_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<uint16_t, N>(vdup_lane_u16(v.raw, kLane));
}
template <int kLane>
HWY_API Vec128<uint32_t> Broadcast(Vec128<uint32_t> v) {
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  return Vec128<uint32_t>(vdupq_n_u32(vgetq_lane_u32(v.raw, kLane)));
}
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(uint32_t, N, 8)>
HWY_API Vec128<uint32_t, N> Broadcast(Vec128<uint32_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<uint32_t, N>(vdup_lane_u32(v.raw, kLane));
}
template <int kLane>
HWY_API Vec128<uint64_t> Broadcast(Vec128<uint64_t> v) {
  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
  return Vec128<uint64_t>(vdupq_n_u64(vgetq_lane_u64(v.raw, kLane)));
}
// Vec64<uint64_t> is defined below.

// Signed
template <int kLane>
HWY_API Vec128<int16_t> Broadcast(Vec128<int16_t> v) {
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
  return Vec128<int16_t>(vdupq_n_s16(vgetq_lane_s16(v.raw, kLane)));
}
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(int16_t, N, 8)>
HWY_API Vec128<int16_t, N> Broadcast(Vec128<int16_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<int16_t, N>(vdup_lane_s16(v.raw, kLane));
}
template <int kLane>
HWY_API Vec128<int32_t> Broadcast(Vec128<int32_t> v) {
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  return Vec128<int32_t>(vdupq_n_s32(vgetq_lane_s32(v.raw, kLane)));
}
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(int32_t, N, 8)>
HWY_API Vec128<int32_t, N> Broadcast(Vec128<int32_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<int32_t, N>(vdup_lane_s32(v.raw, kLane));
}
template <int kLane>
HWY_API Vec128<int64_t> Broadcast(Vec128<int64_t> v) {
  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
  return Vec128<int64_t>(vdupq_n_s64(vgetq_lane_s64(v.raw, kLane)));
}
// Vec64<int64_t> is defined below.

// Float
template <int kLane>
HWY_API Vec128<float> Broadcast(Vec128<float> v) {
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  return Vec128<float>(vdupq_n_f32(vgetq_lane_f32(v.raw, kLane)));
}
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(float, N, 8)>
HWY_API Vec128<float, N> Broadcast(Vec128<float, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<float, N>(vdup_lane_f32(v.raw, kLane));
}

#endif

template <int kLane>
HWY_API Vec64<uint64_t> Broadcast(Vec64<uint64_t> v) {
  static_assert(0 <= kLane && kLane < 1, "Invalid lane");
  return v;
}
template <int kLane>
HWY_API Vec64<int64_t> Broadcast(Vec64<int64_t> v) {
  static_assert(0 <= kLane && kLane < 1, "Invalid lane");
  return v;
}

// ------------------------------ TableLookupLanes

// Returned by SetTableIndices for use by TableLookupLanes.
template <typename T, size_t N>
struct Indices128 {
  typename detail::Raw128<T, N>::type raw;
};

namespace detail {

template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes(
    D d) {
  const Repartition<uint8_t, decltype(d)> d8;
  return Iota(d8, 0);
}

template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes(
    D d) {
  const Repartition<uint8_t, decltype(d)> d8;
  alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = {
      0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14};
  return Load(d8, kBroadcastLaneBytes);
}

template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes(
    D d) {
  const Repartition<uint8_t, decltype(d)> d8;
  alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = {
      0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
  return Load(d8, kBroadcastLaneBytes);
}

template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes(
    D d) {
  const Repartition<uint8_t, decltype(d)> d8;
  alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = {
      0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
  return Load(d8, kBroadcastLaneBytes);
}

template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) {
  const Repartition<uint8_t, decltype(d)> d8;
  return Zero(d8);
}

template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) {
  const Repartition<uint8_t, decltype(d)> d8;
  alignas(16) static constexpr uint8_t kByteOffsets[16] = {
      0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1};
  return Load(d8, kByteOffsets);
}

template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) {
  const Repartition<uint8_t, decltype(d)> d8;
  alignas(16) static constexpr uint8_t kByteOffsets[16] = {
      0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3};
  return Load(d8, kByteOffsets);
}

template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) {
  const Repartition<uint8_t, decltype(d)> d8;
  alignas(16) static constexpr uint8_t kByteOffsets[16] = {
      0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7};
  return Load(d8, kByteOffsets);
}

}  // namespace detail

template <class D, typename TI, HWY_IF_T_SIZE_D(D, 1)>
HWY_API Indices128<TFromD<D>, MaxLanes(D())> IndicesFromVec(
    D d, Vec128<TI, MaxLanes(D())> vec) {
  using T = TFromD<D>;
  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
#if HWY_IS_DEBUG_BUILD
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;
  HWY_DASSERT(AllTrue(
      du, Lt(BitCast(du, vec), Set(du, static_cast<TU>(MaxLanes(d) * 2)))));
#endif

  (void)d;
  return Indices128<TFromD<D>, MaxLanes(D())>{BitCast(d, vec).raw};
}

template <class D, typename TI,
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 2) | (1 << 4) | (1 << 8))>
HWY_API Indices128<TFromD<D>, MaxLanes(D())> IndicesFromVec(
    D d, Vec128<TI, MaxLanes(D())> vec) {
  using T = TFromD<D>;
  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
#if HWY_IS_DEBUG_BUILD
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;
  HWY_DASSERT(AllTrue(
      du, Lt(BitCast(du, vec), Set(du, static_cast<TU>(MaxLanes(d) * 2)))));
#endif

  const Repartition<uint8_t, decltype(d)> d8;
  using V8 = VFromD<decltype(d8)>;

  // Broadcast each lane index to all bytes of T and shift to bytes
  const V8 lane_indices = TableLookupBytes(
      BitCast(d8, vec), detail::IndicesFromVecBroadcastLaneBytes(d));
  constexpr int kIndexShiftAmt = static_cast<int>(FloorLog2(sizeof(T)));
  const V8 byte_indices = ShiftLeft<kIndexShiftAmt>(lane_indices);
  const V8 sum = Add(byte_indices, detail::IndicesFromVecByteOffsets(d));
  return Indices128<TFromD<D>, MaxLanes(D())>{BitCast(d, sum).raw};
}

template <class D, typename TI>
HWY_API Indices128<TFromD<D>, MaxLanes(D())> SetTableIndices(D d,
                                                             const TI* idx) {
  const Rebind<TI, decltype(d)> di;
  return IndicesFromVec(d, LoadU(di, idx));
}

template <typename T, size_t N>
HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) {
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;
  return BitCast(
      d, TableLookupBytes(BitCast(di, v), BitCast(di, Vec128<T, N>{idx.raw})));
}

template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 4)>
HWY_API Vec128<T, N> TwoTablesLookupLanes(Vec128<T, N> a, Vec128<T, N> b,
                                          Indices128<T, N> idx) {
  const DFromV<decltype(a)> d;
  const Twice<decltype(d)> dt;
// TableLookupLanes currently requires table and index vectors to be the same
// size, though a half-length index vector would be sufficient here.
#if HWY_IS_MSAN
  const Vec128<T, N> idx_vec{idx.raw};
  const Indices128<T, N * 2> idx2{Combine(dt, idx_vec, idx_vec).raw};
#else
  // We only keep LowerHalf of the result, which is valid in idx.
  const Indices128<T, N * 2> idx2{idx.raw};
#endif
  return LowerHalf(d, TableLookupLanes(Combine(dt, b, a), idx2));
}

template <typename T>
HWY_API Vec64<T> TwoTablesLookupLanes(Vec64<T> a, Vec64<T> b,
                                      Indices128<T, 8 / sizeof(T)> idx) {
  const DFromV<decltype(a)> d;
  const Repartition<uint8_t, decltype(d)> du8;
  const auto a_u8 = BitCast(du8, a);
  const auto b_u8 = BitCast(du8, b);
  const auto idx_u8 = BitCast(du8, Vec64<T>{idx.raw});

#if HWY_ARCH_ARM_A64
  const Twice<decltype(du8)> dt_u8;
  return BitCast(
      d, Vec64<uint8_t>{vqtbl1_u8(Combine(dt_u8, b_u8, a_u8).raw, idx_u8.raw)});
#else
  detail::Tuple2<uint8_t, du8.MaxLanes()> tup = {{{a_u8.raw, b_u8.raw}}};
  return BitCast(d, Vec64<uint8_t>{vtbl2_u8(tup.raw, idx_u8.raw)});
#endif
}

template <typename T>
HWY_API Vec128<T> TwoTablesLookupLanes(Vec128<T> a, Vec128<T> b,
                                       Indices128<T, 16 / sizeof(T)> idx) {
  const DFromV<decltype(a)> d;
  const Repartition<uint8_t, decltype(d)> du8;
  const auto a_u8 = BitCast(du8, a);
  const auto b_u8 = BitCast(du8, b);
  const auto idx_u8 = BitCast(du8, Vec128<T>{idx.raw});

#if HWY_ARCH_ARM_A64
  detail::Tuple2<uint8_t, du8.MaxLanes()> tup = {{{a_u8.raw, b_u8.raw}}};
  return BitCast(d, Vec128<uint8_t>{vqtbl2q_u8(tup.raw, idx_u8.raw)});
#else
  const Half<decltype(d)> dh;
  const Repartition<uint8_t, decltype(dh)> dh_u8;
  const auto a_lo_u8 = LowerHalf(dh_u8, a_u8);
  const auto a_hi_u8 = UpperHalf(dh_u8, a_u8);
  const auto b_lo_u8 = LowerHalf(dh_u8, b_u8);
  const auto b_hi_u8 = UpperHalf(dh_u8, b_u8);
  const auto idx_lo_u8 = LowerHalf(dh_u8, idx_u8);
  const auto idx_hi_u8 = UpperHalf(dh_u8, idx_u8);

  detail::Tuple4<uint8_t, dh_u8.MaxLanes()> tup = {
      {{a_lo_u8.raw, a_hi_u8.raw, b_lo_u8.raw, b_hi_u8.raw}}};
  const auto lo_result =
      BitCast(dh, Vec64<uint8_t>{vtbl4_u8(tup.raw, idx_lo_u8.raw)});
  const auto hi_result =
      BitCast(dh, Vec64<uint8_t>{vtbl4_u8(tup.raw, idx_hi_u8.raw)});
  return Combine(d, hi_result, lo_result);
#endif
}

// ------------------------------ Reverse2 (CombineShiftRightBytes)

// Per-target flag to prevent generic_ops-inl.h defining 8-bit Reverse2/4/8.
#ifdef HWY_NATIVE_REVERSE2_8
#undef HWY_NATIVE_REVERSE2_8
#else
#define HWY_NATIVE_REVERSE2_8
#endif

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>(vrev16_u8(BitCast(du, v).raw)));
}
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec128<T> Reverse2(D d, Vec128<T> v) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, Vec128<uint8_t>(vrev16q_u8(BitCast(du, v).raw)));
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_T_SIZE_D(D, 2)>
HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>(vrev32_u16(BitCast(du, v).raw)));
}
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec128<T> Reverse2(D d, Vec128<T> v) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, Vec128<uint16_t>(vrev32q_u16(BitCast(du, v).raw)));
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_T_SIZE_D(D, 4)>
HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>(vrev64_u32(BitCast(du, v).raw)));
}
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> Reverse2(D d, Vec128<T> v) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, Vec128<uint32_t>(vrev64q_u32(BitCast(du, v).raw)));
}

template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) {
  return CombineShiftRightBytes<8>(d, v, v);
}

// ------------------------------ Reverse4 (Reverse2)

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> Reverse4(D d, VFromD<D> v) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>(vrev32_u8(BitCast(du, v).raw)));
}
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec128<T> Reverse4(D d, Vec128<T> v) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, Vec128<uint8_t>(vrev32q_u8(BitCast(du, v).raw)));
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_T_SIZE_D(D, 2)>
HWY_API VFromD<D> Reverse4(D d, VFromD<D> v) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>(vrev64_u16(BitCast(du, v).raw)));
}
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec128<T> Reverse4(D d, Vec128<T> v) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, Vec128<uint16_t>(vrev64q_u16(BitCast(du, v).raw)));
}

template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_API VFromD<D> Reverse4(D d, VFromD<D> v) {
  const RepartitionToWide<RebindToUnsigned<decltype(d)>> duw;
  return BitCast(d, Reverse2(duw, BitCast(duw, Reverse2(d, v))));
}

template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_API VFromD<D> Reverse4(D /* tag */, VFromD<D>) {
  HWY_ASSERT(0);  // don't have 8 u64 lanes
}

// ------------------------------ Reverse8 (Reverse2, Reverse4)

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> Reverse8(D d, VFromD<D> v) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>(vrev64_u8(BitCast(du, v).raw)));
}
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec128<T> Reverse8(D d, Vec128<T> v) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, Vec128<uint8_t>(vrev64q_u8(BitCast(du, v).raw)));
}

template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_API VFromD<D> Reverse8(D d, VFromD<D> v) {
  const Repartition<uint64_t, decltype(d)> du64;
  return BitCast(d, Reverse2(du64, BitCast(du64, Reverse4(d, v))));
}

template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))>
HWY_API VFromD<D> Reverse8(D, VFromD<D>) {
  HWY_ASSERT(0);  // don't have 8 lanes if larger than 16-bit
}

// ------------------------------ Reverse (Reverse2, Reverse4, Reverse8)

template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 1)>
HWY_API Vec128<T, 1> Reverse(D /* tag */, Vec128<T, 1> v) {
  return v;
}

template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 2)>
HWY_API Vec128<T, 2> Reverse(D d, Vec128<T, 2> v) {
  return Reverse2(d, v);
}

template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 4)>
HWY_API Vec128<T, 4> Reverse(D d, Vec128<T, 4> v) {
  return Reverse4(d, v);
}

template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 8)>
HWY_API Vec128<T, 8> Reverse(D d, Vec128<T, 8> v) {
  return Reverse8(d, v);
}

template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 16)>
HWY_API Vec128<T> Reverse(D d, Vec128<T> v) {
  const Repartition<uint64_t, decltype(d)> du64;
  return BitCast(d, Reverse2(du64, BitCast(du64, Reverse8(d, v))));
}

// ------------------------------ ReverseBits

#if HWY_ARCH_ARM_A64

#ifdef HWY_NATIVE_REVERSE_BITS_UI8
#undef HWY_NATIVE_REVERSE_BITS_UI8
#else
#define HWY_NATIVE_REVERSE_BITS_UI8
#endif

HWY_NEON_DEF_FUNCTION_INT_8(ReverseBits, vrbit, _, 1)
HWY_NEON_DEF_FUNCTION_UINT_8(ReverseBits, vrbit, _, 1)

#endif  // HWY_ARCH_ARM_A64

// ------------------------------ Other shuffles (TableLookupBytes)

// Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant).
// Shuffle0321 rotates one lane to the right (the previous least-significant
// lane is now most-significant). These could also be implemented via
// CombineShiftRightBytes but the shuffle_abcd notation is more convenient.

// Swap 64-bit halves
template <typename T>
HWY_API Vec128<T> Shuffle1032(Vec128<T> v) {
  return CombineShiftRightBytes<8>(DFromV<decltype(v)>(), v, v);
}
template <typename T>
HWY_API Vec128<T> Shuffle01(Vec128<T> v) {
  return CombineShiftRightBytes<8>(DFromV<decltype(v)>(), v, v);
}

// Rotate right 32 bits
template <typename T>
HWY_API Vec128<T> Shuffle0321(Vec128<T> v) {
  return CombineShiftRightBytes<4>(DFromV<decltype(v)>(), v, v);
}

// Rotate left 32 bits
template <typename T>
HWY_API Vec128<T> Shuffle2103(Vec128<T> v) {
  return CombineShiftRightBytes<12>(DFromV<decltype(v)>(), v, v);
}

// Reverse
template <typename T>
HWY_API Vec128<T> Shuffle0123(Vec128<T> v) {
  return Reverse4(DFromV<decltype(v)>(), v);
}

// ------------------------------ InterleaveLower

// Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
// the least-significant lane) and "b". To concatenate two half-width integers
// into one, use ZipLower/Upper instead (also works with scalar).
HWY_NEON_DEF_FUNCTION_INT_8_16_32(InterleaveLower, vzip1, _, 2)
HWY_NEON_DEF_FUNCTION_UINT_8_16_32(InterleaveLower, vzip1, _, 2)

#if HWY_ARCH_ARM_A64
// N=1 makes no sense (in that case, there would be no upper/lower).
HWY_API Vec128<uint64_t> InterleaveLower(Vec128<uint64_t> a,
                                         Vec128<uint64_t> b) {
  return Vec128<uint64_t>(vzip1q_u64(a.raw, b.raw));
}
HWY_API Vec128<int64_t> InterleaveLower(Vec128<int64_t> a, Vec128<int64_t> b) {
  return Vec128<int64_t>(vzip1q_s64(a.raw, b.raw));
}
HWY_API Vec128<double> InterleaveLower(Vec128<double> a, Vec128<double> b) {
  return Vec128<double>(vzip1q_f64(a.raw, b.raw));
}
#else
// Emulated version for Armv7.
template <typename T, HWY_IF_T_SIZE(T, 8)>
HWY_API Vec128<T> InterleaveLower(Vec128<T> a, Vec128<T> b) {
  const DFromV<decltype(a)> d;
  return CombineShiftRightBytes<8>(d, b, Shuffle01(a));
}
#endif

// Floats
HWY_API Vec128<float> InterleaveLower(Vec128<float> a, Vec128<float> b) {
  return Vec128<float>(vzip1q_f32(a.raw, b.raw));
}
template <size_t N, HWY_IF_V_SIZE_LE(float, N, 8)>
HWY_API Vec128<float, N> InterleaveLower(Vec128<float, N> a,
                                         Vec128<float, N> b) {
  return Vec128<float, N>(vzip1_f32(a.raw, b.raw));
}

// < 64 bit parts
template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 4)>
HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>(InterleaveLower(Vec64<T>(a.raw), Vec64<T>(b.raw)).raw);
}

// Additional overload for the optional Simd<> tag.
template <class D>
HWY_API VFromD<D> InterleaveLower(D /* tag */, VFromD<D> a, VFromD<D> b) {
  return InterleaveLower(a, b);
}

// ------------------------------ InterleaveUpper (UpperHalf)

// All functions inside detail lack the required D parameter.
namespace detail {
HWY_NEON_DEF_FUNCTION_INT_8_16_32(InterleaveUpper, vzip2, _, 2)
HWY_NEON_DEF_FUNCTION_UINT_8_16_32(InterleaveUpper, vzip2, _, 2)

#if HWY_ARCH_ARM_A64
// N=1 makes no sense (in that case, there would be no upper/lower).
HWY_API Vec128<uint64_t> InterleaveUpper(Vec128<uint64_t> a,
                                         Vec128<uint64_t> b) {
  return Vec128<uint64_t>(vzip2q_u64(a.raw, b.raw));
}
HWY_API Vec128<int64_t> InterleaveUpper(Vec128<int64_t> a, Vec128<int64_t> b) {
  return Vec128<int64_t>(vzip2q_s64(a.raw, b.raw));
}
HWY_API Vec128<double> InterleaveUpper(Vec128<double> a, Vec128<double> b) {
  return Vec128<double>(vzip2q_f64(a.raw, b.raw));
}
#else
// Emulated version for Armv7.
template <typename T, HWY_IF_T_SIZE(T, 8)>
HWY_API Vec128<T> InterleaveUpper(Vec128<T> a, Vec128<T> b) {
  const DFromV<decltype(a)> d;
  return CombineShiftRightBytes<8>(d, Shuffle01(b), a);
}
#endif

HWY_API Vec128<float> InterleaveUpper(Vec128<float> a, Vec128<float> b) {
  return Vec128<float>(vzip2q_f32(a.raw, b.raw));
}
HWY_API Vec64<float> InterleaveUpper(Vec64<float> a, Vec64<float> b) {
  return Vec64<float>(vzip2_f32(a.raw, b.raw));
}

}  // namespace detail

// Full register
template <class D, HWY_IF_V_SIZE_GT_D(D, 4)>
HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
  return detail::InterleaveUpper(a, b);
}

// Partial
template <class D, HWY_IF_V_SIZE_LE_D(D, 4)>
HWY_API VFromD<D> InterleaveUpper(D d, VFromD<D> a, VFromD<D> b) {
  const Half<decltype(d)> d2;
  const VFromD<D> a2(UpperHalf(d2, a).raw);
  const VFromD<D> b2(UpperHalf(d2, b).raw);
  return InterleaveLower(d, a2, b2);
}

// ------------------------------ ZipLower/ZipUpper (InterleaveLower)

// Same as Interleave*, except that the return lanes are double-width integers;
// this is necessary because the single-lane scalar cannot return two values.
template <class V, class DW = RepartitionToWide<DFromV<V>>>
HWY_API VFromD<DW> ZipLower(V a, V b) {
  return BitCast(DW(), InterleaveLower(a, b));
}
template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
  return BitCast(dw, InterleaveLower(D(), a, b));
}

template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
  return BitCast(dw, InterleaveUpper(D(), a, b));
}

// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)

template <class D32, HWY_IF_F32_D(D32),
          class V16 = VFromD<Repartition<bfloat16_t, D32>>>
HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 df32, V16 a, V16 b,
                                              const VFromD<D32> sum0,
                                              VFromD<D32>& sum1) {
  const RebindToUnsigned<decltype(df32)> du32;
  using VU32 = VFromD<decltype(du32)>;
  const VU32 odd = Set(du32, 0xFFFF0000u);
  const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
  const VU32 ao = And(BitCast(du32, a), odd);
  const VU32 be = ShiftLeft<16>(BitCast(du32, b));
  const VU32 bo = And(BitCast(du32, b), odd);
  sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1);
  return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0);
}

template <class D, HWY_IF_I32_D(D)>
HWY_API Vec128<int32_t> ReorderWidenMulAccumulate(D /*d32*/, Vec128<int16_t> a,
                                                  Vec128<int16_t> b,
                                                  const Vec128<int32_t> sum0,
                                                  Vec128<int32_t>& sum1) {
#if HWY_ARCH_ARM_A64
  sum1 = Vec128<int32_t>(vmlal_high_s16(sum1.raw, a.raw, b.raw));
#else
  const Full64<int16_t> dh;
  sum1 = Vec128<int32_t>(
      vmlal_s16(sum1.raw, UpperHalf(dh, a).raw, UpperHalf(dh, b).raw));
#endif
  return Vec128<int32_t>(
      vmlal_s16(sum0.raw, LowerHalf(a).raw, LowerHalf(b).raw));
}

template <class D, HWY_IF_I32_D(D)>
HWY_API Vec64<int32_t> ReorderWidenMulAccumulate(D d32, Vec64<int16_t> a,
                                                 Vec64<int16_t> b,
                                                 const Vec64<int32_t> sum0,
                                                 Vec64<int32_t>& sum1) {
  // vmlal writes into the upper half, which the caller cannot use, so
  // split into two halves.
  const Vec128<int32_t> mul_3210(vmull_s16(a.raw, b.raw));
  const Vec64<int32_t> mul_32 = UpperHalf(d32, mul_3210);
  sum1 += mul_32;
  return sum0 + LowerHalf(mul_3210);
}

template <class D, HWY_IF_I32_D(D)>
HWY_API Vec32<int32_t> ReorderWidenMulAccumulate(D d32, Vec32<int16_t> a,
                                                 Vec32<int16_t> b,
                                                 const Vec32<int32_t> sum0,
                                                 Vec32<int32_t>& sum1) {
  const Vec128<int32_t> mul_xx10(vmull_s16(a.raw, b.raw));
  const Vec64<int32_t> mul_10(LowerHalf(mul_xx10));
  const Vec32<int32_t> mul0 = LowerHalf(d32, mul_10);
  const Vec32<int32_t> mul1 = UpperHalf(d32, mul_10);
  sum1 += mul1;
  return sum0 + mul0;
}

// ------------------------------ Combine partial (InterleaveLower)
// < 64bit input, <= 64 bit result
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> Combine(D d, VFromD<Half<D>> hi, VFromD<Half<D>> lo) {
  // First double N (only lower halves will be used).
  const VFromD<D> hi2(hi.raw);
  const VFromD<D> lo2(lo.raw);
  // Repartition to two unsigned lanes (each the size of the valid input).
  const Simd<UnsignedFromSize<d.MaxBytes() / 2>, 2, 0> du;
  return BitCast(d, InterleaveLower(BitCast(du, lo2), BitCast(du, hi2)));
}

// ------------------------------ RearrangeToOddPlusEven (Combine)

template <size_t N>
HWY_API Vec128<float, N> RearrangeToOddPlusEven(Vec128<float, N> sum0,
                                                Vec128<float, N> sum1) {
  return Add(sum0, sum1);
}

HWY_API Vec128<int32_t> RearrangeToOddPlusEven(Vec128<int32_t> sum0,
                                               Vec128<int32_t> sum1) {
// vmlal_s16 multiplied the lower half into sum0 and upper into sum1.
#if HWY_ARCH_ARM_A64  // pairwise sum is available and what we want
  return Vec128<int32_t>(vpaddq_s32(sum0.raw, sum1.raw));
#else
  const Full128<int32_t> d;
  const Half<decltype(d)> d64;
  const Vec64<int32_t> hi(
      vpadd_s32(LowerHalf(d64, sum1).raw, UpperHalf(d64, sum1).raw));
  const Vec64<int32_t> lo(
      vpadd_s32(LowerHalf(d64, sum0).raw, UpperHalf(d64, sum0).raw));
  return Combine(Full128<int32_t>(), hi, lo);
#endif
}

HWY_API Vec64<int32_t> RearrangeToOddPlusEven(Vec64<int32_t> sum0,
                                              Vec64<int32_t> sum1) {
  // vmlal_s16 multiplied the lower half into sum0 and upper into sum1.
  return Vec64<int32_t>(vpadd_s32(sum0.raw, sum1.raw));
}

HWY_API Vec32<int32_t> RearrangeToOddPlusEven(Vec32<int32_t> sum0,
                                              Vec32<int32_t> sum1) {
  // Only one widened sum per register, so add them for sum of odd and even.
  return sum0 + sum1;
}

// ------------------------------ WidenMulPairwiseAdd

template <class D32, HWY_IF_F32_D(D32),
          class V16 = VFromD<Repartition<bfloat16_t, D32>>>
HWY_API VFromD<D32> WidenMulPairwiseAdd(D32 df32, V16 a, V16 b) {
  const RebindToUnsigned<decltype(df32)> du32;
  using VU32 = VFromD<decltype(du32)>;
  const VU32 odd = Set(du32, 0xFFFF0000u);
  const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
  const VU32 ao = And(BitCast(du32, a), odd);
  const VU32 be = ShiftLeft<16>(BitCast(du32, b));
  const VU32 bo = And(BitCast(du32, b), odd);
  return MulAdd(BitCast(df32, ae), BitCast(df32, be),
            Mul(BitCast(df32, ao), BitCast(df32, bo)));
}

template <class D, HWY_IF_I32_D(D)>
HWY_API Vec128<int32_t> WidenMulPairwiseAdd(D /*d32*/, Vec128<int16_t> a,
                                                  Vec128<int16_t> b) {
  Vec128<int32_t> sum1;
#if HWY_ARCH_ARM_A64
  sum1 = Vec128<int32_t>(vmull_high_s16(a.raw, b.raw));
#else
  const Full64<int16_t> dh;
  sum1 = Vec128<int32_t>(vmull_s16(UpperHalf(dh, a).raw, UpperHalf(dh, b).raw));
#endif
  Vec128<int32_t> sum0 = Vec128<int32_t>(vmull_s16(LowerHalf(a).raw, LowerHalf(b).raw));
  return RearrangeToOddPlusEven(sum0, sum1);
}

template <class D, HWY_IF_I32_D(D)>
HWY_API Vec64<int32_t> WidenMulPairwiseAdd(D d32, Vec64<int16_t> a,
                                                 Vec64<int16_t> b) {
  // vmlal writes into the upper half, which the caller cannot use, so
  // split into two halves.
  const Vec128<int32_t> mul_3210(vmull_s16(a.raw, b.raw));
  const Vec64<int32_t> mul0 = LowerHalf(mul_3210);
  const Vec64<int32_t> mul1 = UpperHalf(d32, mul_3210);
  return RearrangeToOddPlusEven(mul0, mul1);
}

template <class D, HWY_IF_I32_D(D)>
HWY_API Vec32<int32_t> WidenMulPairwiseAdd(D d32, Vec32<int16_t> a,
                                                 Vec32<int16_t> b) {
  const Vec128<int32_t> mul_xx10(vmull_s16(a.raw, b.raw));
  const Vec64<int32_t> mul_10(LowerHalf(mul_xx10));
  const Vec32<int32_t> mul0 = LowerHalf(d32, mul_10);
  const Vec32<int32_t> mul1 = UpperHalf(d32, mul_10);
  return RearrangeToOddPlusEven(mul0, mul1);
}


// ------------------------------ ZeroExtendVector (Combine)

template <class D>
HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> lo) {
  return Combine(d, Zero(Half<decltype(d)>()), lo);
}

// ------------------------------ ConcatLowerLower

// 64 or 128-bit input: just interleave
template <class D, HWY_IF_V_SIZE_GT_D(D, 4)>
HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) {
  // Treat half-width input as a single lane and interleave them.
  const Repartition<UnsignedFromSize<d.MaxBytes() / 2>, decltype(d)> du;
  return BitCast(d, InterleaveLower(BitCast(du, lo), BitCast(du, hi)));
}

namespace detail {
#if HWY_ARCH_ARM_A64
HWY_NEON_DEF_FUNCTION_UIF81632(InterleaveEven, vtrn1, _, 2)
HWY_NEON_DEF_FUNCTION_UIF81632(InterleaveOdd, vtrn2, _, 2)
#else

// vtrn returns a struct with even and odd result.
#define HWY_NEON_BUILD_TPL_HWY_TRN
#define HWY_NEON_BUILD_RET_HWY_TRN(type, size) type##x##size##x2_t
// Pass raw args so we can accept uint16x2 args, for which there is no
// corresponding uint16x2x2 return type.
#define HWY_NEON_BUILD_PARAM_HWY_TRN(TYPE, size) \
  Raw128<TYPE##_t, size>::type a, Raw128<TYPE##_t, size>::type b
#define HWY_NEON_BUILD_ARG_HWY_TRN a, b

// Cannot use UINT8 etc. type macros because the x2_t tuples are only defined
// for full and half vectors.
HWY_NEON_DEF_FUNCTION(uint8, 16, InterleaveEvenOdd, vtrnq, _, u8, HWY_TRN)
HWY_NEON_DEF_FUNCTION(uint8, 8, InterleaveEvenOdd, vtrn, _, u8, HWY_TRN)
HWY_NEON_DEF_FUNCTION(uint16, 8, InterleaveEvenOdd, vtrnq, _, u16, HWY_TRN)
HWY_NEON_DEF_FUNCTION(uint16, 4, InterleaveEvenOdd, vtrn, _, u16, HWY_TRN)
HWY_NEON_DEF_FUNCTION(uint32, 4, InterleaveEvenOdd, vtrnq, _, u32, HWY_TRN)
HWY_NEON_DEF_FUNCTION(uint32, 2, InterleaveEvenOdd, vtrn, _, u32, HWY_TRN)
HWY_NEON_DEF_FUNCTION(int8, 16, InterleaveEvenOdd, vtrnq, _, s8, HWY_TRN)
HWY_NEON_DEF_FUNCTION(int8, 8, InterleaveEvenOdd, vtrn, _, s8, HWY_TRN)
HWY_NEON_DEF_FUNCTION(int16, 8, InterleaveEvenOdd, vtrnq, _, s16, HWY_TRN)
HWY_NEON_DEF_FUNCTION(int16, 4, InterleaveEvenOdd, vtrn, _, s16, HWY_TRN)
HWY_NEON_DEF_FUNCTION(int32, 4, InterleaveEvenOdd, vtrnq, _, s32, HWY_TRN)
HWY_NEON_DEF_FUNCTION(int32, 2, InterleaveEvenOdd, vtrn, _, s32, HWY_TRN)
HWY_NEON_DEF_FUNCTION(float32, 4, InterleaveEvenOdd, vtrnq, _, f32, HWY_TRN)
HWY_NEON_DEF_FUNCTION(float32, 2, InterleaveEvenOdd, vtrn, _, f32, HWY_TRN)
#endif
}  // namespace detail

// <= 32-bit input/output
template <class D, HWY_IF_V_SIZE_LE_D(D, 4)>
HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) {
  // Treat half-width input as two lanes and take every second one.
  const Repartition<UnsignedFromSize<d.MaxBytes() / 2>, decltype(d)> du;
#if HWY_ARCH_ARM_A64
  return BitCast(d, detail::InterleaveEven(BitCast(du, lo), BitCast(du, hi)));
#else
  using VU = VFromD<decltype(du)>;
  return BitCast(
      d, VU(detail::InterleaveEvenOdd(BitCast(du, lo).raw, BitCast(du, hi).raw)
                .val[0]));
#endif
}

// ------------------------------ ConcatUpperUpper

// 64 or 128-bit input: just interleave
template <class D, HWY_IF_V_SIZE_GT_D(D, 4)>
HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) {
  // Treat half-width input as a single lane and interleave them.
  const Repartition<UnsignedFromSize<d.MaxBytes() / 2>, decltype(d)> du;
  return BitCast(d, InterleaveUpper(du, BitCast(du, lo), BitCast(du, hi)));
}

// <= 32-bit input/output
template <class D, HWY_IF_V_SIZE_LE_D(D, 4)>
HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) {
  // Treat half-width input as two lanes and take every second one.
  const Repartition<UnsignedFromSize<d.MaxBytes() / 2>, decltype(d)> du;
#if HWY_ARCH_ARM_A64
  return BitCast(d, detail::InterleaveOdd(BitCast(du, lo), BitCast(du, hi)));
#else
  using VU = VFromD<decltype(du)>;
  return BitCast(
      d, VU(detail::InterleaveEvenOdd(BitCast(du, lo).raw, BitCast(du, hi).raw)
                .val[1]));
#endif
}

// ------------------------------ ConcatLowerUpper (ShiftLeftBytes)

// 64 or 128-bit input: extract from concatenated
template <class D, HWY_IF_V_SIZE_GT_D(D, 4)>
HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) {
  return CombineShiftRightBytes<d.MaxBytes() / 2>(d, hi, lo);
}

// <= 32-bit input/output
template <class D, HWY_IF_V_SIZE_LE_D(D, 4)>
HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) {
  constexpr size_t kSize = d.MaxBytes();
  const Repartition<uint8_t, decltype(d)> d8;
  const Full64<uint8_t> d8x8;
  const Full64<TFromD<D>> d64;
  using V8x8 = VFromD<decltype(d8x8)>;
  const V8x8 hi8x8(BitCast(d8, hi).raw);
  // Move into most-significant bytes
  const V8x8 lo8x8 = ShiftLeftBytes<8 - kSize>(V8x8(BitCast(d8, lo).raw));
  const V8x8 r = CombineShiftRightBytes<8 - kSize / 2>(d8x8, hi8x8, lo8x8);
  // Back to original lane type, then shrink N.
  return VFromD<D>(BitCast(d64, r).raw);
}

// ------------------------------ ConcatUpperLower

// Works for all N.
template <class D>
HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) {
  return IfThenElse(FirstN(d, Lanes(d) / 2), lo, hi);
}

// ------------------------------ ConcatOdd (InterleaveUpper)

namespace detail {
// There is no vuzpq_u64.
HWY_NEON_DEF_FUNCTION_UIF81632(ConcatEven, vuzp1, _, 2)
HWY_NEON_DEF_FUNCTION_UIF81632(ConcatOdd, vuzp2, _, 2)
}  // namespace detail

// Full/half vector
template <class D, HWY_IF_V_SIZE_GT_D(D, 4)>
HWY_API VFromD<D> ConcatOdd(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
  return detail::ConcatOdd(lo, hi);
}

// 8-bit x4
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec32<T> ConcatOdd(D d, Vec32<T> hi, Vec32<T> lo) {
  const Twice<decltype(d)> d2;
  const Repartition<uint16_t, decltype(d2)> dw2;
  const VFromD<decltype(d2)> hi2(hi.raw);
  const VFromD<decltype(d2)> lo2(lo.raw);
  const VFromD<decltype(dw2)> Hx1Lx1 = BitCast(dw2, ConcatOdd(d2, hi2, lo2));
  // Compact into two pairs of u8, skipping the invalid x lanes. Could also use
  // vcopy_lane_u16, but that's A64-only.
  return Vec32<T>(BitCast(d2, ConcatEven(dw2, Hx1Lx1, Hx1Lx1)).raw);
}

// Any type x2
template <class D, HWY_IF_LANES_D(D, 2), typename T = TFromD<D>>
HWY_API Vec128<T, 2> ConcatOdd(D d, Vec128<T, 2> hi, Vec128<T, 2> lo) {
  return InterleaveUpper(d, lo, hi);
}

// ------------------------------ ConcatEven (InterleaveLower)

// Full/half vector
template <class D, HWY_IF_V_SIZE_GT_D(D, 4)>
HWY_API VFromD<D> ConcatEven(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
  return detail::ConcatEven(lo, hi);
}

// 8-bit x4
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec32<T> ConcatEven(D d, Vec32<T> hi, Vec32<T> lo) {
  const Twice<decltype(d)> d2;
  const Repartition<uint16_t, decltype(d2)> dw2;
  const VFromD<decltype(d2)> hi2(hi.raw);
  const VFromD<decltype(d2)> lo2(lo.raw);
  const VFromD<decltype(dw2)> Hx0Lx0 = BitCast(dw2, ConcatEven(d2, hi2, lo2));
  // Compact into two pairs of u8, skipping the invalid x lanes. Could also use
  // vcopy_lane_u16, but that's A64-only.
  return Vec32<T>(BitCast(d2, ConcatEven(dw2, Hx0Lx0, Hx0Lx0)).raw);
}

// Any type x2
template <class D, HWY_IF_LANES_D(D, 2), typename T = TFromD<D>>
HWY_API Vec128<T, 2> ConcatEven(D d, Vec128<T, 2> hi, Vec128<T, 2> lo) {
  return InterleaveLower(d, lo, hi);
}

// ------------------------------ DupEven (InterleaveLower)

template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) {
#if HWY_ARCH_ARM_A64
  return detail::InterleaveEven(v, v);
#else
  return Vec128<T, N>(detail::InterleaveEvenOdd(v.raw, v.raw).val[0]);
#endif
}

template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) {
  return InterleaveLower(DFromV<decltype(v)>(), v, v);
}

// ------------------------------ DupOdd (InterleaveUpper)

template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
#if HWY_ARCH_ARM_A64
  return detail::InterleaveOdd(v, v);
#else
  return Vec128<T, N>(detail::InterleaveEvenOdd(v.raw, v.raw).val[1]);
#endif
}

template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
  return InterleaveUpper(DFromV<decltype(v)>(), v, v);
}

// ------------------------------ OddEven (IfThenElse)

template <typename T, size_t N>
HWY_API Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const Repartition<uint8_t, decltype(d)> d8;
  alignas(16) static constexpr uint8_t kBytes[16] = {
      ((0 / sizeof(T)) & 1) ? 0 : 0xFF,  ((1 / sizeof(T)) & 1) ? 0 : 0xFF,
      ((2 / sizeof(T)) & 1) ? 0 : 0xFF,  ((3 / sizeof(T)) & 1) ? 0 : 0xFF,
      ((4 / sizeof(T)) & 1) ? 0 : 0xFF,  ((5 / sizeof(T)) & 1) ? 0 : 0xFF,
      ((6 / sizeof(T)) & 1) ? 0 : 0xFF,  ((7 / sizeof(T)) & 1) ? 0 : 0xFF,
      ((8 / sizeof(T)) & 1) ? 0 : 0xFF,  ((9 / sizeof(T)) & 1) ? 0 : 0xFF,
      ((10 / sizeof(T)) & 1) ? 0 : 0xFF, ((11 / sizeof(T)) & 1) ? 0 : 0xFF,
      ((12 / sizeof(T)) & 1) ? 0 : 0xFF, ((13 / sizeof(T)) & 1) ? 0 : 0xFF,
      ((14 / sizeof(T)) & 1) ? 0 : 0xFF, ((15 / sizeof(T)) & 1) ? 0 : 0xFF,
  };
  const auto vec = BitCast(d, Load(d8, kBytes));
  return IfThenElse(MaskFromVec(vec), b, a);
}

// ------------------------------ OddEvenBlocks
template <typename T, size_t N>
HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
  return even;
}

// ------------------------------ SwapAdjacentBlocks
template <typename T, size_t N>
HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) {
  return v;
}

// ------------------------------ ReverseBlocks
// Single block: no change
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> ReverseBlocks(D /* tag */, VFromD<D> v) {
  return v;
}

// ------------------------------ ReorderDemote2To (OddEven)

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_BF16_D(D),
          class V32 = VFromD<Repartition<float, D>>>
HWY_API VFromD<D> ReorderDemote2To(D dbf16, V32 a, V32 b) {
  const RebindToUnsigned<decltype(dbf16)> du16;
  return BitCast(dbf16, ConcatOdd(du16, BitCast(du16, b), BitCast(du16, a)));
}

template <class D, HWY_IF_I32_D(D)>
HWY_API Vec128<int32_t> ReorderDemote2To(D d32, Vec128<int64_t> a,
                                         Vec128<int64_t> b) {
  const Vec64<int32_t> a32(vqmovn_s64(a.raw));
#if HWY_ARCH_ARM_A64
  (void)d32;
  return Vec128<int32_t>(vqmovn_high_s64(a32.raw, b.raw));
#else
  const Vec64<int32_t> b32(vqmovn_s64(b.raw));
  return Combine(d32, b32, a32);
#endif
}

template <class D, HWY_IF_I32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> ReorderDemote2To(D d32, VFromD<Repartition<int64_t, D>> a,
                                   VFromD<Repartition<int64_t, D>> b) {
  const Rebind<int64_t, decltype(d32)> dt;
  return DemoteTo(d32, Combine(dt, b, a));
}

template <class D, HWY_IF_U32_D(D)>
HWY_API Vec128<uint32_t> ReorderDemote2To(D d32, Vec128<int64_t> a,
                                          Vec128<int64_t> b) {
  const Vec64<uint32_t> a32(vqmovun_s64(a.raw));
#if HWY_ARCH_ARM_A64
  (void)d32;
  return Vec128<uint32_t>(vqmovun_high_s64(a32.raw, b.raw));
#else
  const Vec64<uint32_t> b32(vqmovun_s64(b.raw));
  return Combine(d32, b32, a32);
#endif
}

template <class D, HWY_IF_U32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> ReorderDemote2To(D d32, VFromD<Repartition<int64_t, D>> a,
                                   VFromD<Repartition<int64_t, D>> b) {
  const Rebind<int64_t, decltype(d32)> dt;
  return DemoteTo(d32, Combine(dt, b, a));
}

template <class D, HWY_IF_U32_D(D)>
HWY_API Vec128<uint32_t> ReorderDemote2To(D d32, Vec128<uint64_t> a,
                                          Vec128<uint64_t> b) {
  const Vec64<uint32_t> a32(vqmovn_u64(a.raw));
#if HWY_ARCH_ARM_A64
  (void)d32;
  return Vec128<uint32_t>(vqmovn_high_u64(a32.raw, b.raw));
#else
  const Vec64<uint32_t> b32(vqmovn_u64(b.raw));
  return Combine(d32, b32, a32);
#endif
}

template <class D, HWY_IF_U32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> ReorderDemote2To(D d32, VFromD<Repartition<uint64_t, D>> a,
                                   VFromD<Repartition<uint64_t, D>> b) {
  const Rebind<uint64_t, decltype(d32)> dt;
  return DemoteTo(d32, Combine(dt, b, a));
}

template <class D, HWY_IF_I16_D(D)>
HWY_API Vec128<int16_t> ReorderDemote2To(D d16, Vec128<int32_t> a,
                                         Vec128<int32_t> b) {
  const Vec64<int16_t> a16(vqmovn_s32(a.raw));
#if HWY_ARCH_ARM_A64
  (void)d16;
  return Vec128<int16_t>(vqmovn_high_s32(a16.raw, b.raw));
#else
  const Vec64<int16_t> b16(vqmovn_s32(b.raw));
  return Combine(d16, b16, a16);
#endif
}

template <class D, HWY_IF_I16_D(D)>
HWY_API Vec64<int16_t> ReorderDemote2To(D /*d16*/, Vec64<int32_t> a,
                                        Vec64<int32_t> b) {
  const Full128<int32_t> d32;
  const Vec128<int32_t> ab = Combine(d32, b, a);
  return Vec64<int16_t>(vqmovn_s32(ab.raw));
}

template <class D, HWY_IF_I16_D(D)>
HWY_API Vec32<int16_t> ReorderDemote2To(D /*d16*/, Vec32<int32_t> a,
                                        Vec32<int32_t> b) {
  const Full128<int32_t> d32;
  const Vec64<int32_t> ab(vzip1_s32(a.raw, b.raw));
  return Vec32<int16_t>(vqmovn_s32(Combine(d32, ab, ab).raw));
}

template <class D, HWY_IF_U16_D(D)>
HWY_API Vec128<uint16_t> ReorderDemote2To(D d16, Vec128<int32_t> a,
                                          Vec128<int32_t> b) {
  const Vec64<uint16_t> a16(vqmovun_s32(a.raw));
#if HWY_ARCH_ARM_A64
  (void)d16;
  return Vec128<uint16_t>(vqmovun_high_s32(a16.raw, b.raw));
#else
  const Vec64<uint16_t> b16(vqmovun_s32(b.raw));
  return Combine(d16, b16, a16);
#endif
}

template <class D, HWY_IF_U16_D(D)>
HWY_API Vec64<uint16_t> ReorderDemote2To(D /*d16*/, Vec64<int32_t> a,
                                         Vec64<int32_t> b) {
  const Full128<int32_t> d32;
  const Vec128<int32_t> ab = Combine(d32, b, a);
  return Vec64<uint16_t>(vqmovun_s32(ab.raw));
}

template <class D, HWY_IF_U16_D(D)>
HWY_API Vec32<uint16_t> ReorderDemote2To(D /*d16*/, Vec32<int32_t> a,
                                         Vec32<int32_t> b) {
  const Full128<int32_t> d32;
  const Vec64<int32_t> ab(vzip1_s32(a.raw, b.raw));
  return Vec32<uint16_t>(vqmovun_s32(Combine(d32, ab, ab).raw));
}

template <class D, HWY_IF_U16_D(D)>
HWY_API Vec128<uint16_t> ReorderDemote2To(D d16, Vec128<uint32_t> a,
                                          Vec128<uint32_t> b) {
  const Vec64<uint16_t> a16(vqmovn_u32(a.raw));
#if HWY_ARCH_ARM_A64
  (void)d16;
  return Vec128<uint16_t>(vqmovn_high_u32(a16.raw, b.raw));
#else
  const Vec64<uint16_t> b16(vqmovn_u32(b.raw));
  return Combine(d16, b16, a16);
#endif
}

template <class D, HWY_IF_U16_D(D)>
HWY_API Vec64<uint16_t> ReorderDemote2To(D /*d16*/, Vec64<uint32_t> a,
                                         Vec64<uint32_t> b) {
  const Full128<uint32_t> d32;
  const Vec128<uint32_t> ab = Combine(d32, b, a);
  return Vec64<uint16_t>(vqmovn_u32(ab.raw));
}

template <class D, HWY_IF_U16_D(D)>
HWY_API Vec32<uint16_t> ReorderDemote2To(D /*d16*/, Vec32<uint32_t> a,
                                         Vec32<uint32_t> b) {
  const Full128<uint32_t> d32;
  const Vec64<uint32_t> ab(vzip1_u32(a.raw, b.raw));
  return Vec32<uint16_t>(vqmovn_u32(Combine(d32, ab, ab).raw));
}

template <class D, HWY_IF_I8_D(D)>
HWY_API Vec128<int8_t> ReorderDemote2To(D d8, Vec128<int16_t> a,
                                        Vec128<int16_t> b) {
  const Vec64<int8_t> a8(vqmovn_s16(a.raw));
#if HWY_ARCH_ARM_A64
  (void)d8;
  return Vec128<int8_t>(vqmovn_high_s16(a8.raw, b.raw));
#else
  const Vec64<int8_t> b8(vqmovn_s16(b.raw));
  return Combine(d8, b8, a8);
#endif
}

template <class D, HWY_IF_I8_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> ReorderDemote2To(D d8, VFromD<Repartition<int16_t, D>> a,
                                   VFromD<Repartition<int16_t, D>> b) {
  const Rebind<int16_t, decltype(d8)> dt;
  return DemoteTo(d8, Combine(dt, b, a));
}

template <class D, HWY_IF_U8_D(D)>
HWY_API Vec128<uint8_t> ReorderDemote2To(D d8, Vec128<int16_t> a,
                                         Vec128<int16_t> b) {
  const Vec64<uint8_t> a8(vqmovun_s16(a.raw));
#if HWY_ARCH_ARM_A64
  (void)d8;
  return Vec128<uint8_t>(vqmovun_high_s16(a8.raw, b.raw));
#else
  const Vec64<uint8_t> b8(vqmovun_s16(b.raw));
  return Combine(d8, b8, a8);
#endif
}

template <class D, HWY_IF_U8_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> ReorderDemote2To(D d8, VFromD<Repartition<int16_t, D>> a,
                                   VFromD<Repartition<int16_t, D>> b) {
  const Rebind<int16_t, decltype(d8)> dt;
  return DemoteTo(d8, Combine(dt, b, a));
}

template <class D, HWY_IF_U8_D(D)>
HWY_API Vec128<uint8_t> ReorderDemote2To(D d8, Vec128<uint16_t> a,
                                         Vec128<uint16_t> b) {
  const Vec64<uint8_t> a8(vqmovn_u16(a.raw));
#if HWY_ARCH_ARM_A64
  (void)d8;
  return Vec128<uint8_t>(vqmovn_high_u16(a8.raw, b.raw));
#else
  const Vec64<uint8_t> b8(vqmovn_u16(b.raw));
  return Combine(d8, b8, a8);
#endif
}

template <class D, HWY_IF_U8_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> ReorderDemote2To(D d8, VFromD<Repartition<uint16_t, D>> a,
                                   VFromD<Repartition<uint16_t, D>> b) {
  const Rebind<uint16_t, decltype(d8)> dt;
  return DemoteTo(d8, Combine(dt, b, a));
}

template <class D, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>),
          HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2)>
HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) {
  return ReorderDemote2To(d, a, b);
}

template <class D, HWY_IF_BF16_D(D), class V32 = VFromD<Repartition<float, D>>>
HWY_API VFromD<D> OrderedDemote2To(D dbf16, V32 a, V32 b) {
  return ReorderDemote2To(dbf16, a, b);
}

// ================================================== CRYPTO

// (aarch64 or Arm7) and (__ARM_FEATURE_AES or HWY_HAVE_RUNTIME_DISPATCH).
// Otherwise, rely on generic_ops-inl.h to emulate AESRound / CLMul*.
#if HWY_TARGET == HWY_NEON

#ifdef HWY_NATIVE_AES
#undef HWY_NATIVE_AES
#else
#define HWY_NATIVE_AES
#endif

HWY_API Vec128<uint8_t> AESRound(Vec128<uint8_t> state,
                                 Vec128<uint8_t> round_key) {
  // NOTE: it is important that AESE and AESMC be consecutive instructions so
  // they can be fused. AESE includes AddRoundKey, which is a different ordering
  // than the AES-NI semantics we adopted, so XOR by 0 and later with the actual
  // round key (the compiler will hopefully optimize this for multiple rounds).
  return Vec128<uint8_t>(vaesmcq_u8(vaeseq_u8(state.raw, vdupq_n_u8(0)))) ^
         round_key;
}

HWY_API Vec128<uint8_t> AESLastRound(Vec128<uint8_t> state,
                                     Vec128<uint8_t> round_key) {
  return Vec128<uint8_t>(vaeseq_u8(state.raw, vdupq_n_u8(0))) ^ round_key;
}

HWY_API Vec128<uint8_t> AESInvMixColumns(Vec128<uint8_t> state) {
  return Vec128<uint8_t>{vaesimcq_u8(state.raw)};
}

HWY_API Vec128<uint8_t> AESRoundInv(Vec128<uint8_t> state,
                                    Vec128<uint8_t> round_key) {
  // NOTE: it is important that AESD and AESIMC be consecutive instructions so
  // they can be fused. AESD includes AddRoundKey, which is a different ordering
  // than the AES-NI semantics we adopted, so XOR by 0 and later with the actual
  // round key (the compiler will hopefully optimize this for multiple rounds).
  return Vec128<uint8_t>(vaesimcq_u8(vaesdq_u8(state.raw, vdupq_n_u8(0)))) ^
         round_key;
}

HWY_API Vec128<uint8_t> AESLastRoundInv(Vec128<uint8_t> state,
                                        Vec128<uint8_t> round_key) {
  return Vec128<uint8_t>(vaesdq_u8(state.raw, vdupq_n_u8(0))) ^ round_key;
}

HWY_API Vec128<uint64_t> CLMulLower(Vec128<uint64_t> a, Vec128<uint64_t> b) {
  return Vec128<uint64_t>((uint64x2_t)vmull_p64(GetLane(a), GetLane(b)));
}

HWY_API Vec128<uint64_t> CLMulUpper(Vec128<uint64_t> a, Vec128<uint64_t> b) {
  return Vec128<uint64_t>(
      (uint64x2_t)vmull_high_p64((poly64x2_t)a.raw, (poly64x2_t)b.raw));
}

#endif  // HWY_TARGET == HWY_NEON

// ================================================== MISC

template <class D, HWY_IF_F32_D(D)>
HWY_API VFromD<D> PromoteTo(D df32, VFromD<Rebind<bfloat16_t, D>> v) {
  const Rebind<uint16_t, decltype(df32)> du16;
  const RebindToSigned<decltype(df32)> di32;
  return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
}

// ------------------------------ Truncations

template <class DTo, typename TTo = TFromD<DTo>, typename TFrom,
          HWY_IF_UNSIGNED(TFrom), HWY_IF_UNSIGNED(TTo),
          hwy::EnableIf<(sizeof(TTo) < sizeof(TFrom))>* = nullptr>
HWY_API Vec128<TTo, 1> TruncateTo(DTo /* tag */, Vec128<TFrom, 1> v) {
  const Repartition<TTo, DFromV<decltype(v)>> d;
  return Vec128<TTo, 1>{BitCast(d, v).raw};
}

template <class D, HWY_IF_U8_D(D)>
HWY_API Vec16<uint8_t> TruncateTo(D /* tag */, Vec128<uint64_t> v) {
  const Repartition<uint8_t, DFromV<decltype(v)>> d;
  const auto v1 = BitCast(d, v);
  const auto v2 = detail::ConcatEven(v1, v1);
  const auto v3 = detail::ConcatEven(v2, v2);
  const auto v4 = detail::ConcatEven(v3, v3);
  return LowerHalf(LowerHalf(LowerHalf(v4)));
}

template <class D, HWY_IF_U16_D(D)>
HWY_API Vec32<uint16_t> TruncateTo(D /* tag */, Vec128<uint64_t> v) {
  const Repartition<uint16_t, DFromV<decltype(v)>> d;
  const auto v1 = BitCast(d, v);
  const auto v2 = detail::ConcatEven(v1, v1);
  const auto v3 = detail::ConcatEven(v2, v2);
  return LowerHalf(LowerHalf(v3));
}

template <class D, HWY_IF_U32_D(D)>
HWY_API Vec64<uint32_t> TruncateTo(D /* tag */, Vec128<uint64_t> v) {
  const Repartition<uint32_t, DFromV<decltype(v)>> d;
  const auto v1 = BitCast(d, v);
  const auto v2 = detail::ConcatEven(v1, v1);
  return LowerHalf(v2);
}

template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_GT_D(D, 1)>
HWY_API VFromD<D> TruncateTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) {
  const Repartition<uint8_t, DFromV<decltype(v)>> d;
  const auto v1 = BitCast(d, v);
  const auto v2 = detail::ConcatEven(v1, v1);
  const auto v3 = detail::ConcatEven(v2, v2);
  return LowerHalf(LowerHalf(v3));
}

template <class D, HWY_IF_U16_D(D), HWY_IF_LANES_GT_D(D, 1)>
HWY_API VFromD<D> TruncateTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) {
  const Repartition<uint16_t, DFromV<decltype(v)>> d;
  const auto v1 = BitCast(d, v);
  const auto v2 = detail::ConcatEven(v1, v1);
  return LowerHalf(v2);
}

template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_GT_D(D, 1)>
HWY_API VFromD<D> TruncateTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) {
  const Repartition<uint8_t, DFromV<decltype(v)>> d;
  const auto v1 = BitCast(d, v);
  const auto v2 = detail::ConcatEven(v1, v1);
  return LowerHalf(v2);
}

// ------------------------------ MulEven (ConcatEven)

// Multiplies even lanes (0, 2 ..) and places the double-wide result into
// even and the upper half into its odd neighbor lane.
HWY_API Vec128<int64_t> MulEven(Vec128<int32_t> a, Vec128<int32_t> b) {
  const DFromV<decltype(a)> d;
  int32x4_t a_packed = ConcatEven(d, a, a).raw;
  int32x4_t b_packed = ConcatEven(d, b, b).raw;
  return Vec128<int64_t>(
      vmull_s32(vget_low_s32(a_packed), vget_low_s32(b_packed)));
}
HWY_API Vec128<uint64_t> MulEven(Vec128<uint32_t> a, Vec128<uint32_t> b) {
  const DFromV<decltype(a)> d;
  uint32x4_t a_packed = ConcatEven(d, a, a).raw;
  uint32x4_t b_packed = ConcatEven(d, b, b).raw;
  return Vec128<uint64_t>(
      vmull_u32(vget_low_u32(a_packed), vget_low_u32(b_packed)));
}

template <size_t N>
HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(Vec128<int32_t, N> a,
                                             Vec128<int32_t, N> b) {
  const DFromV<decltype(a)> d;
  int32x2_t a_packed = ConcatEven(d, a, a).raw;
  int32x2_t b_packed = ConcatEven(d, b, b).raw;
  return Vec128<int64_t, (N + 1) / 2>(
      vget_low_s64(vmull_s32(a_packed, b_packed)));
}
template <size_t N>
HWY_API Vec128<uint64_t, (N + 1) / 2> MulEven(Vec128<uint32_t, N> a,
                                              Vec128<uint32_t, N> b) {
  const DFromV<decltype(a)> d;
  uint32x2_t a_packed = ConcatEven(d, a, a).raw;
  uint32x2_t b_packed = ConcatEven(d, b, b).raw;
  return Vec128<uint64_t, (N + 1) / 2>(
      vget_low_u64(vmull_u32(a_packed, b_packed)));
}

HWY_INLINE Vec128<uint64_t> MulEven(Vec128<uint64_t> a, Vec128<uint64_t> b) {
  uint64_t hi;
  uint64_t lo = Mul128(vgetq_lane_u64(a.raw, 0), vgetq_lane_u64(b.raw, 0), &hi);
  return Vec128<uint64_t>(vsetq_lane_u64(hi, vdupq_n_u64(lo), 1));
}

HWY_INLINE Vec128<uint64_t> MulOdd(Vec128<uint64_t> a, Vec128<uint64_t> b) {
  uint64_t hi;
  uint64_t lo = Mul128(vgetq_lane_u64(a.raw, 1), vgetq_lane_u64(b.raw, 1), &hi);
  return Vec128<uint64_t>(vsetq_lane_u64(hi, vdupq_n_u64(lo), 1));
}

// ------------------------------ TableLookupBytes (Combine, LowerHalf)

// Both full
template <typename T, typename TI>
HWY_API Vec128<TI> TableLookupBytes(Vec128<T> bytes, Vec128<TI> from) {
  const DFromV<decltype(from)> d;
  const Repartition<uint8_t, decltype(d)> d8;
#if HWY_ARCH_ARM_A64
  return BitCast(d, Vec128<uint8_t>(vqtbl1q_u8(BitCast(d8, bytes).raw,
                                               BitCast(d8, from).raw)));
#else
  uint8x16_t table0 = BitCast(d8, bytes).raw;
  uint8x8x2_t table;
  table.val[0] = vget_low_u8(table0);
  table.val[1] = vget_high_u8(table0);
  uint8x16_t idx = BitCast(d8, from).raw;
  uint8x8_t low = vtbl2_u8(table, vget_low_u8(idx));
  uint8x8_t hi = vtbl2_u8(table, vget_high_u8(idx));
  return BitCast(d, Vec128<uint8_t>(vcombine_u8(low, hi)));
#endif
}

// Partial index vector
template <typename T, typename TI, size_t NI, HWY_IF_V_SIZE_LE(TI, NI, 8)>
HWY_API Vec128<TI, NI> TableLookupBytes(Vec128<T> bytes, Vec128<TI, NI> from) {
  const Full128<TI> d_full;
  const Vec64<TI> from64(from.raw);
  const auto idx_full = Combine(d_full, from64, from64);
  const auto out_full = TableLookupBytes(bytes, idx_full);
  return Vec128<TI, NI>(LowerHalf(Half<decltype(d_full)>(), out_full).raw);
}

// Partial table vector
template <typename T, size_t N, typename TI, HWY_IF_V_SIZE_LE(T, N, 8)>
HWY_API Vec128<TI> TableLookupBytes(Vec128<T, N> bytes, Vec128<TI> from) {
  const Full128<T> d_full;
  return TableLookupBytes(Combine(d_full, bytes, bytes), from);
}

// Partial both
template <typename T, size_t N, typename TI, size_t NI,
          HWY_IF_V_SIZE_LE(T, N, 8), HWY_IF_V_SIZE_LE(TI, NI, 8)>
HWY_API Vec128<TI, NI> TableLookupBytes(Vec128<T, N> bytes,
                                        Vec128<TI, NI> from) {
  const DFromV<decltype(bytes)> d;
  const Simd<TI, NI, 0> d_idx;
  const Repartition<uint8_t, decltype(d_idx)> d_idx8;
  // uint8x8
  const auto bytes8 = BitCast(Repartition<uint8_t, decltype(d)>(), bytes);
  const auto from8 = BitCast(d_idx8, from);
  const VFromD<decltype(d_idx8)> v8(vtbl1_u8(bytes8.raw, from8.raw));
  return BitCast(d_idx, v8);
}

// For all vector widths; Arm anyway zeroes if >= 0x10.
template <class V, class VI>
HWY_API VI TableLookupBytesOr0(V bytes, VI from) {
  return TableLookupBytes(bytes, from);
}

// ---------------------------- AESKeyGenAssist (AESLastRound, TableLookupBytes)

#if HWY_TARGET == HWY_NEON
template <uint8_t kRcon>
HWY_API Vec128<uint8_t> AESKeyGenAssist(Vec128<uint8_t> v) {
  alignas(16) static constexpr uint8_t kRconXorMask[16] = {
      0, kRcon, 0, 0, 0, 0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0, 0};
  alignas(16) static constexpr uint8_t kRotWordShuffle[16] = {
      0, 13, 10, 7, 1, 14, 11, 4, 8, 5, 2, 15, 9, 6, 3, 12};
  const DFromV<decltype(v)> d;
  const Repartition<uint32_t, decltype(d)> du32;
  const auto w13 = BitCast(d, DupOdd(BitCast(du32, v)));
  const auto sub_word_result = AESLastRound(w13, Load(d, kRconXorMask));
  return TableLookupBytes(sub_word_result, Load(d, kRotWordShuffle));
}
#endif  // HWY_TARGET == HWY_NEON

// ------------------------------ Scatter (Store)

template <class D, typename T = TFromD<D>, class VI>
HWY_API void ScatterOffset(VFromD<D> v, D d, T* HWY_RESTRICT base, VI offset) {
  using TI = TFromV<VI>;
  static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");

  HWY_ALIGN T lanes[MaxLanes(d)];
  Store(v, d, lanes);

  HWY_ALIGN TI offset_lanes[MaxLanes(d)];
  Store(offset, Rebind<TI, decltype(d)>(), offset_lanes);

  uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
  }
}

template <class D, typename T = TFromD<D>, class VI>
HWY_API void ScatterIndex(VFromD<D> v, D d, T* HWY_RESTRICT base, VI index) {
  using TI = TFromV<VI>;
  static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");

  HWY_ALIGN T lanes[MaxLanes(d)];
  Store(v, d, lanes);

  HWY_ALIGN TI index_lanes[MaxLanes(d)];
  Store(index, Rebind<TI, decltype(d)>(), index_lanes);

  for (size_t i = 0; i < MaxLanes(d); ++i) {
    base[index_lanes[i]] = lanes[i];
  }
}

// ------------------------------ Gather (Load/Store)

template <class D, typename T = TFromD<D>, class VI>
HWY_API VFromD<D> GatherOffset(D d, const T* HWY_RESTRICT base, VI offset) {
  using TI = TFromV<VI>;
  static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");

  HWY_ALIGN TI offset_lanes[MaxLanes(d)];
  Store(offset, Rebind<TI, decltype(d)>(), offset_lanes);

  HWY_ALIGN T lanes[MaxLanes(d)];
  const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base);
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
  }
  return Load(d, lanes);
}

template <class D, typename T = TFromD<D>, class VI>
HWY_API VFromD<D> GatherIndex(D d, const T* HWY_RESTRICT base, VI index) {
  using TI = TFromV<VI>;
  static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");

  HWY_ALIGN TI index_lanes[MaxLanes(d)];
  Store(index, Rebind<TI, decltype(d)>(), index_lanes);

  HWY_ALIGN T lanes[MaxLanes(d)];
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    lanes[i] = base[index_lanes[i]];
  }
  return Load(d, lanes);
}

// ------------------------------ Reductions

namespace detail {

// N=1 for any T: no-op
template <typename T>
HWY_INLINE T ReduceMin(hwy::SizeTag<sizeof(T)> /* tag */, Vec128<T, 1> v) {
  return GetLane(v);
}
template <typename T>
HWY_INLINE T ReduceMax(hwy::SizeTag<sizeof(T)> /* tag */, Vec128<T, 1> v) {
  return GetLane(v);
}
template <typename T>
HWY_INLINE T ReduceSum(hwy::SizeTag<sizeof(T)> /* tag */, Vec128<T, 1> v) {
  return GetLane(v);
}
template <typename T>
HWY_INLINE Vec128<T, 1> SumOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
                                   Vec128<T, 1> v) {
  return v;
}
template <typename T>
HWY_INLINE Vec128<T, 1> MinOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
                                   Vec128<T, 1> v) {
  return v;
}
template <typename T>
HWY_INLINE Vec128<T, 1> MaxOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
                                   Vec128<T, 1> v) {
  return v;
}

// full vectors
#if HWY_ARCH_ARM_A64

#define HWY_NEON_DEF_REDUCTION(type, size, name, prefix, infix, suffix) \
  HWY_API type##_t name(hwy::SizeTag<sizeof(type##_t)>,                 \
                        Vec128<type##_t, size> v) {                     \
    return HWY_NEON_EVAL(prefix##infix##suffix, v.raw);                 \
  }

// Note that u64/s64 don't have horizontal min/max for some reason.
#define HWY_NEON_DEF_REDUCTION_CORE_TYPES(name, prefix)       \
  HWY_NEON_DEF_REDUCTION(uint8, 8, name, prefix, _, u8)       \
  HWY_NEON_DEF_REDUCTION(uint8, 16, name, prefix##q, _, u8)   \
  HWY_NEON_DEF_REDUCTION(uint16, 4, name, prefix, _, u16)     \
  HWY_NEON_DEF_REDUCTION(uint16, 8, name, prefix##q, _, u16)  \
  HWY_NEON_DEF_REDUCTION(uint32, 2, name, prefix, _, u32)     \
  HWY_NEON_DEF_REDUCTION(uint32, 4, name, prefix##q, _, u32)  \
  HWY_NEON_DEF_REDUCTION(int8, 8, name, prefix, _, s8)        \
  HWY_NEON_DEF_REDUCTION(int8, 16, name, prefix##q, _, s8)    \
  HWY_NEON_DEF_REDUCTION(int16, 4, name, prefix, _, s16)      \
  HWY_NEON_DEF_REDUCTION(int16, 8, name, prefix##q, _, s16)   \
  HWY_NEON_DEF_REDUCTION(int32, 2, name, prefix, _, s32)      \
  HWY_NEON_DEF_REDUCTION(int32, 4, name, prefix##q, _, s32)   \
  HWY_NEON_DEF_REDUCTION(float32, 2, name, prefix, _, f32)    \
  HWY_NEON_DEF_REDUCTION(float32, 4, name, prefix##q, _, f32) \
  HWY_NEON_DEF_REDUCTION(float64, 2, name, prefix##q, _, f64)

HWY_NEON_DEF_REDUCTION_CORE_TYPES(ReduceMin, vminv)
HWY_NEON_DEF_REDUCTION_CORE_TYPES(ReduceMax, vmaxv)

// u64/s64 don't have horizontal min/max for some reason, but do have add.
#define HWY_NEON_DEF_REDUCTION_ALL_TYPES(name, prefix)       \
  HWY_NEON_DEF_REDUCTION_CORE_TYPES(name, prefix)            \
  HWY_NEON_DEF_REDUCTION(uint64, 2, name, prefix##q, _, u64) \
  HWY_NEON_DEF_REDUCTION(int64, 2, name, prefix##q, _, s64)

HWY_NEON_DEF_REDUCTION_ALL_TYPES(ReduceSum, vaddv)

#undef HWY_NEON_DEF_REDUCTION_ALL_TYPES
#undef HWY_NEON_DEF_REDUCTION_CORE_TYPES
#undef HWY_NEON_DEF_REDUCTION

// Need some fallback implementations for [ui]64x2 and [ui]16x2.
#define HWY_IF_SUM_REDUCTION(T) HWY_IF_T_SIZE_ONE_OF(T, 1 << 2)
#define HWY_IF_MINMAX_REDUCTION(T) HWY_IF_T_SIZE_ONE_OF(T, (1 << 8) | (1 << 2))

// Implement Min/Max/SumOfLanes in terms of the corresponding reduction.
template <size_t N, typename V>
HWY_API V MinOfLanes(hwy::SizeTag<N> tag, V v) {
  return Set(DFromV<decltype(v)>(), ReduceMin(tag, v));
}
template <size_t N, typename V>
HWY_API V MaxOfLanes(hwy::SizeTag<N> tag, V v) {
  return Set(DFromV<decltype(v)>(), ReduceMax(tag, v));
}
template <size_t N, typename V>
HWY_API V SumOfLanes(hwy::SizeTag<N> tag, V v) {
  return Set(DFromV<decltype(v)>(), ReduceSum(tag, v));
}

#else

// For arm7, we implement reductions using a series of pairwise operations. This
// produces the full vector result, so we express Reduce* in terms of *OfLanes.
#define HWY_NEON_BUILD_TYPE_T(type, size) type##x##size##_t
#define HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION(type, size) Vec128<type##_t, size>
#define HWY_NEON_DEF_PAIRWISE_REDUCTION(type, size, name, prefix, suffix)    \
  HWY_API HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION(type, size) name##OfLanes(   \
      hwy::SizeTag<sizeof(type##_t)>, Vec128<type##_t, size> v) {            \
    HWY_NEON_BUILD_TYPE_T(type, size) tmp = prefix##_##suffix(v.raw, v.raw); \
    if ((size / 2) > 1) tmp = prefix##_##suffix(tmp, tmp);                   \
    if ((size / 4) > 1) tmp = prefix##_##suffix(tmp, tmp);                   \
    return HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION(type, size)(tmp);           \
  }                                                                          \
  HWY_API type##_t Reduce##name(hwy::SizeTag<sizeof(type##_t)> tag,          \
                                Vec128<type##_t, size> v) {                  \
    return GetLane(name##OfLanes(tag, v));                                   \
  }

// For the wide versions, the pairwise operations produce a half-length vector.
// We produce that value with a Reduce*Vector helper method, and express Reduce*
// and *OfLanes in terms of the helper.
#define HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(type, size, half, name, prefix, \
                                             suffix)                         \
  HWY_API HWY_NEON_BUILD_TYPE_T(type, half)                                  \
      Reduce##name##Vector(Vec128<type##_t, size> v) {                       \
    HWY_NEON_BUILD_TYPE_T(type, half) tmp;                                   \
    tmp = prefix##_##suffix(vget_high_##suffix(v.raw),                       \
                            vget_low_##suffix(v.raw));                       \
    if ((size / 2) > 1) tmp = prefix##_##suffix(tmp, tmp);                   \
    if ((size / 4) > 1) tmp = prefix##_##suffix(tmp, tmp);                   \
    if ((size / 8) > 1) tmp = prefix##_##suffix(tmp, tmp);                   \
    return tmp;                                                              \
  }                                                                          \
  HWY_API type##_t Reduce##name(hwy::SizeTag<sizeof(type##_t)>,              \
                                Vec128<type##_t, size> v) {                  \
    const HWY_NEON_BUILD_TYPE_T(type, half) tmp = Reduce##name##Vector(v);   \
    return HWY_NEON_EVAL(vget_lane_##suffix, tmp, 0);                        \
  }                                                                          \
  HWY_API HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION(type, size) name##OfLanes(   \
      hwy::SizeTag<sizeof(type##_t)>, Vec128<type##_t, size> v) {            \
    const HWY_NEON_BUILD_TYPE_T(type, half) tmp = Reduce##name##Vector(v);   \
    return HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION(                            \
        type, size)(vcombine_##suffix(tmp, tmp));                            \
  }

#define HWY_NEON_DEF_PAIRWISE_REDUCTIONS(name, prefix)                  \
  HWY_NEON_DEF_PAIRWISE_REDUCTION(uint32, 2, name, prefix, u32)         \
  HWY_NEON_DEF_PAIRWISE_REDUCTION(uint16, 4, name, prefix, u16)         \
  HWY_NEON_DEF_PAIRWISE_REDUCTION(uint8, 8, name, prefix, u8)           \
  HWY_NEON_DEF_PAIRWISE_REDUCTION(int32, 2, name, prefix, s32)          \
  HWY_NEON_DEF_PAIRWISE_REDUCTION(int16, 4, name, prefix, s16)          \
  HWY_NEON_DEF_PAIRWISE_REDUCTION(int8, 8, name, prefix, s8)            \
  HWY_NEON_DEF_PAIRWISE_REDUCTION(float32, 2, name, prefix, f32)        \
  HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(uint32, 4, 2, name, prefix, u32) \
  HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(uint16, 8, 4, name, prefix, u16) \
  HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(uint8, 16, 8, name, prefix, u8)  \
  HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(int32, 4, 2, name, prefix, s32)  \
  HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(int16, 8, 4, name, prefix, s16)  \
  HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(int8, 16, 8, name, prefix, s8)   \
  HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(float32, 4, 2, name, prefix, f32)

HWY_NEON_DEF_PAIRWISE_REDUCTIONS(Sum, vpadd)
HWY_NEON_DEF_PAIRWISE_REDUCTIONS(Min, vpmin)
HWY_NEON_DEF_PAIRWISE_REDUCTIONS(Max, vpmax)

#undef HWY_NEON_DEF_PAIRWISE_REDUCTIONS
#undef HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION
#undef HWY_NEON_DEF_PAIRWISE_REDUCTION
#undef HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION
#undef HWY_NEON_BUILD_TYPE_T

// Need fallback min/max implementations for [ui]64x2 and [ui]16x2.
#define HWY_IF_SUM_REDUCTION(T) HWY_IF_T_SIZE_ONE_OF(T, 1 << 2 | 1 << 8)
#define HWY_IF_MINMAX_REDUCTION(T) HWY_IF_T_SIZE_ONE_OF(T, 1 << 2 | 1 << 8)

#endif

}  // namespace detail

// [ui]16/[ui]64: N=2 -- special case for pairs of very small or large lanes
template <class D, typename T, HWY_IF_SUM_REDUCTION(T)>
HWY_API Vec128<T, 2> SumOfLanes(D /* tag */, Vec128<T, 2> v10) {
  return v10 + Reverse2(Simd<T, 2, 0>(), v10);
}

template <class D, typename T, HWY_IF_SUM_REDUCTION(T)>
HWY_API T ReduceSum(D d, Vec128<T, 2> v10) {
  return GetLane(SumOfLanes(d, v10));
}

template <class D, typename T, HWY_IF_MINMAX_REDUCTION(T)>
HWY_API Vec128<T, 2> MinOfLanes(D /* tag */, Vec128<T, 2> v10) {
  return Min(v10, Reverse2(Simd<T, 2, 0>(), v10));
}
template <class D, typename T, HWY_IF_MINMAX_REDUCTION(T)>
HWY_API Vec128<T, 2> MaxOfLanes(D /* tag */, Vec128<T, 2> v10) {
  return Max(v10, Reverse2(Simd<T, 2, 0>(), v10));
}

#undef HWY_IF_SUM_REDUCTION
#undef HWY_IF_MINMAX_REDUCTION

template <class D>
HWY_API VFromD<D> SumOfLanes(D /* tag */, VFromD<D> v) {
  return detail::SumOfLanes(hwy::SizeTag<sizeof(TFromD<D>)>(), v);
}
template <class D>
HWY_API TFromD<D> ReduceSum(D /* tag */, VFromD<D> v) {
  return detail::ReduceSum(hwy::SizeTag<sizeof(TFromD<D>)>(), v);
}
template <class D>
HWY_API VFromD<D> MinOfLanes(D /* tag */, VFromD<D> v) {
  return detail::MinOfLanes(hwy::SizeTag<sizeof(TFromD<D>)>(), v);
}
template <class D>
HWY_API VFromD<D> MaxOfLanes(D /* tag */, VFromD<D> v) {
  return detail::MaxOfLanes(hwy::SizeTag<sizeof(TFromD<D>)>(), v);
}

// ------------------------------ LoadMaskBits (TestBit)

namespace detail {

// Helper function to set 64 bits and potentially return a smaller vector. The
// overload is required to call the q vs non-q intrinsics. Note that 8-bit
// LoadMaskBits only requires 16 bits, but 64 avoids casting.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_INLINE VFromD<D> Set64(D /* tag */, uint64_t mask_bits) {
  const auto v64 = Vec64<uint64_t>(vdup_n_u64(mask_bits));
  return VFromD<D>(BitCast(Full64<TFromD<D>>(), v64).raw);
}
template <typename T>
HWY_INLINE Vec128<T> Set64(Full128<T> d, uint64_t mask_bits) {
  return BitCast(d, Vec128<uint64_t>(vdupq_n_u64(mask_bits)));
}

template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_INLINE MFromD<D> LoadMaskBits(D d, uint64_t mask_bits) {
  const RebindToUnsigned<decltype(d)> du;
  // Easier than Set(), which would require an >8-bit type, which would not
  // compile for T=uint8_t, N=1.
  const auto vmask_bits = Set64(du, mask_bits);

  // Replicate bytes 8x such that each byte contains the bit that governs it.
  alignas(16) static constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
                                                    1, 1, 1, 1, 1, 1, 1, 1};
  const auto rep8 = TableLookupBytes(vmask_bits, Load(du, kRep8));

  alignas(16) static constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
                                                   1, 2, 4, 8, 16, 32, 64, 128};
  return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit)));
}

template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_INLINE MFromD<D> LoadMaskBits(D d, uint64_t mask_bits) {
  const RebindToUnsigned<decltype(d)> du;
  alignas(16) static constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
  const auto vmask_bits = Set(du, static_cast<uint16_t>(mask_bits));
  return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
}

template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_INLINE MFromD<D> LoadMaskBits(D d, uint64_t mask_bits) {
  const RebindToUnsigned<decltype(d)> du;
  alignas(16) static constexpr uint32_t kBit[8] = {1, 2, 4, 8};
  const auto vmask_bits = Set(du, static_cast<uint32_t>(mask_bits));
  return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
}

template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_INLINE MFromD<D> LoadMaskBits(D d, uint64_t mask_bits) {
  const RebindToUnsigned<decltype(d)> du;
  alignas(16) static constexpr uint64_t kBit[8] = {1, 2};
  return RebindMask(d, TestBit(Set(du, mask_bits), Load(du, kBit)));
}

}  // namespace detail

// `p` points to at least 8 readable bytes, not all of which need be valid.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
  uint64_t mask_bits = 0;
  CopyBytes<(d.MaxLanes() + 7) / 8>(bits, &mask_bits);
  return detail::LoadMaskBits(d, mask_bits);
}

// ------------------------------ Mask

namespace detail {

// Returns mask[i]? 0xF : 0 in each nibble. This is more efficient than
// BitsFromMask for use in (partial) CountTrue, FindFirstTrue and AllFalse.
template <class D, HWY_IF_V_SIZE_D(D, 16)>
HWY_INLINE uint64_t NibblesFromMask(D d, MFromD<D> mask) {
  const Full128<uint16_t> du16;
  const Vec128<uint16_t> vu16 = BitCast(du16, VecFromMask(d, mask));
  const Vec64<uint8_t> nib(vshrn_n_u16(vu16.raw, 4));
  return GetLane(BitCast(Full64<uint64_t>(), nib));
}

template <class D, HWY_IF_V_SIZE_D(D, 8)>
HWY_INLINE uint64_t NibblesFromMask(D d, MFromD<D> mask) {
  // There is no vshrn_n_u16 for uint16x4, so zero-extend.
  const Twice<decltype(d)> d2;
  const VFromD<decltype(d2)> v128 = ZeroExtendVector(d2, VecFromMask(d, mask));
  // No need to mask, upper half is zero thanks to ZeroExtendVector.
  return NibblesFromMask(d2, MaskFromVec(v128));
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 4)>
HWY_INLINE uint64_t NibblesFromMask(D d, MFromD<D> mask) {
  const Mask64<TFromD<D>> mask64(mask.raw);
  const uint64_t nib = NibblesFromMask(Full64<TFromD<D>>(), mask64);
  // Clear nibbles from upper half of 64-bits
  return nib & ((1ull << (d.MaxBytes() * 4)) - 1);
}

template <typename T>
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, Mask128<T> mask) {
  alignas(16) static constexpr uint8_t kSliceLanes[16] = {
      1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80, 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80,
  };
  const Full128<uint8_t> du;
  const Vec128<uint8_t> values =
      BitCast(du, VecFromMask(Full128<T>(), mask)) & Load(du, kSliceLanes);

#if HWY_ARCH_ARM_A64
  // Can't vaddv - we need two separate bytes (16 bits).
  const uint8x8_t x2 = vget_low_u8(vpaddq_u8(values.raw, values.raw));
  const uint8x8_t x4 = vpadd_u8(x2, x2);
  const uint8x8_t x8 = vpadd_u8(x4, x4);
  return vget_lane_u64(vreinterpret_u64_u8(x8), 0) & 0xFFFF;
#else
  // Don't have vpaddq, so keep doubling lane size.
  const uint16x8_t x2 = vpaddlq_u8(values.raw);
  const uint32x4_t x4 = vpaddlq_u16(x2);
  const uint64x2_t x8 = vpaddlq_u32(x4);
  return (vgetq_lane_u64(x8, 1) << 8) | vgetq_lane_u64(x8, 0);
#endif
}

template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, Mask128<T, N> mask) {
  // Upper lanes of partial loads are undefined. OnlyActive will fix this if
  // we load all kSliceLanes so the upper lanes do not pollute the valid bits.
  alignas(8) static constexpr uint8_t kSliceLanes[8] = {1,    2,    4,    8,
                                                        0x10, 0x20, 0x40, 0x80};
  const DFromM<decltype(mask)> d;
  const RebindToUnsigned<decltype(d)> du;
  const Vec128<uint8_t, N> slice(Load(Full64<uint8_t>(), kSliceLanes).raw);
  const Vec128<uint8_t, N> values = BitCast(du, VecFromMask(d, mask)) & slice;

#if HWY_ARCH_ARM_A64
  return vaddv_u8(values.raw);
#else
  const uint16x4_t x2 = vpaddl_u8(values.raw);
  const uint32x2_t x4 = vpaddl_u16(x2);
  const uint64x1_t x8 = vpaddl_u32(x4);
  return vget_lane_u64(x8, 0);
#endif
}

template <typename T>
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, Mask128<T> mask) {
  alignas(16) static constexpr uint16_t kSliceLanes[8] = {
      1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80};
  const Full128<T> d;
  const Full128<uint16_t> du;
  const Vec128<uint16_t> values =
      BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes);
#if HWY_ARCH_ARM_A64
  return vaddvq_u16(values.raw);
#else
  const uint32x4_t x2 = vpaddlq_u16(values.raw);
  const uint64x2_t x4 = vpaddlq_u32(x2);
  return vgetq_lane_u64(x4, 0) + vgetq_lane_u64(x4, 1);
#endif
}

template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, Mask128<T, N> mask) {
  // Upper lanes of partial loads are undefined. OnlyActive will fix this if
  // we load all kSliceLanes so the upper lanes do not pollute the valid bits.
  alignas(8) static constexpr uint16_t kSliceLanes[4] = {1, 2, 4, 8};
  const DFromM<decltype(mask)> d;
  const RebindToUnsigned<decltype(d)> du;
  const Vec128<uint16_t, N> slice(Load(Full64<uint16_t>(), kSliceLanes).raw);
  const Vec128<uint16_t, N> values = BitCast(du, VecFromMask(d, mask)) & slice;
#if HWY_ARCH_ARM_A64
  return vaddv_u16(values.raw);
#else
  const uint32x2_t x2 = vpaddl_u16(values.raw);
  const uint64x1_t x4 = vpaddl_u32(x2);
  return vget_lane_u64(x4, 0);
#endif
}

template <typename T>
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, Mask128<T> mask) {
  alignas(16) static constexpr uint32_t kSliceLanes[4] = {1, 2, 4, 8};
  const Full128<T> d;
  const Full128<uint32_t> du;
  const Vec128<uint32_t> values =
      BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes);
#if HWY_ARCH_ARM_A64
  return vaddvq_u32(values.raw);
#else
  const uint64x2_t x2 = vpaddlq_u32(values.raw);
  return vgetq_lane_u64(x2, 0) + vgetq_lane_u64(x2, 1);
#endif
}

template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, Mask128<T, N> mask) {
  // Upper lanes of partial loads are undefined. OnlyActive will fix this if
  // we load all kSliceLanes so the upper lanes do not pollute the valid bits.
  alignas(8) static constexpr uint32_t kSliceLanes[2] = {1, 2};
  const DFromM<decltype(mask)> d;
  const RebindToUnsigned<decltype(d)> du;
  const Vec128<uint32_t, N> slice(Load(Full64<uint32_t>(), kSliceLanes).raw);
  const Vec128<uint32_t, N> values = BitCast(du, VecFromMask(d, mask)) & slice;
#if HWY_ARCH_ARM_A64
  return vaddv_u32(values.raw);
#else
  const uint64x1_t x2 = vpaddl_u32(values.raw);
  return vget_lane_u64(x2, 0);
#endif
}

template <typename T>
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, Mask128<T> m) {
  alignas(16) static constexpr uint64_t kSliceLanes[2] = {1, 2};
  const Full128<T> d;
  const Full128<uint64_t> du;
  const Vec128<uint64_t> values =
      BitCast(du, VecFromMask(d, m)) & Load(du, kSliceLanes);
#if HWY_ARCH_ARM_A64
  return vaddvq_u64(values.raw);
#else
  return vgetq_lane_u64(values.raw, 0) + vgetq_lane_u64(values.raw, 1);
#endif
}

template <typename T>
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, Mask128<T, 1> m) {
  const Full64<T> d;
  const Full64<uint64_t> du;
  const Vec64<uint64_t> values = BitCast(du, VecFromMask(d, m)) & Set(du, 1);
  return vget_lane_u64(values.raw, 0);
}

// Returns the lowest N for the BitsFromMask result.
template <typename T, size_t N>
constexpr uint64_t OnlyActive(uint64_t bits) {
  return ((N * sizeof(T)) >= 8) ? bits : (bits & ((1ull << N) - 1));
}

template <typename T, size_t N>
HWY_INLINE uint64_t BitsFromMask(Mask128<T, N> mask) {
  return OnlyActive<T, N>(BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask));
}

// Returns number of lanes whose mask is set.
//
// Masks are either FF..FF or 0. Unfortunately there is no reduce-sub op
// ("vsubv"). ANDing with 1 would work but requires a constant. Negating also
// changes each lane to 1 (if mask set) or 0.
// NOTE: PopCount also operates on vectors, so we still have to do horizontal
// sums separately. We specialize CountTrue for full vectors (negating instead
// of PopCount because it avoids an extra shift), and use PopCount of
// NibblesFromMask for partial vectors.

template <typename T>
HWY_INLINE size_t CountTrue(hwy::SizeTag<1> /*tag*/, Mask128<T> mask) {
  const Full128<int8_t> di;
  const int8x16_t ones =
      vnegq_s8(BitCast(di, VecFromMask(Full128<T>(), mask)).raw);

#if HWY_ARCH_ARM_A64
  return static_cast<size_t>(vaddvq_s8(ones));
#else
  const int16x8_t x2 = vpaddlq_s8(ones);
  const int32x4_t x4 = vpaddlq_s16(x2);
  const int64x2_t x8 = vpaddlq_s32(x4);
  return static_cast<size_t>(vgetq_lane_s64(x8, 0) + vgetq_lane_s64(x8, 1));
#endif
}
template <typename T>
HWY_INLINE size_t CountTrue(hwy::SizeTag<2> /*tag*/, Mask128<T> mask) {
  const Full128<int16_t> di;
  const int16x8_t ones =
      vnegq_s16(BitCast(di, VecFromMask(Full128<T>(), mask)).raw);

#if HWY_ARCH_ARM_A64
  return static_cast<size_t>(vaddvq_s16(ones));
#else
  const int32x4_t x2 = vpaddlq_s16(ones);
  const int64x2_t x4 = vpaddlq_s32(x2);
  return static_cast<size_t>(vgetq_lane_s64(x4, 0) + vgetq_lane_s64(x4, 1));
#endif
}

template <typename T>
HWY_INLINE size_t CountTrue(hwy::SizeTag<4> /*tag*/, Mask128<T> mask) {
  const Full128<int32_t> di;
  const int32x4_t ones =
      vnegq_s32(BitCast(di, VecFromMask(Full128<T>(), mask)).raw);

#if HWY_ARCH_ARM_A64
  return static_cast<size_t>(vaddvq_s32(ones));
#else
  const int64x2_t x2 = vpaddlq_s32(ones);
  return static_cast<size_t>(vgetq_lane_s64(x2, 0) + vgetq_lane_s64(x2, 1));
#endif
}

template <typename T>
HWY_INLINE size_t CountTrue(hwy::SizeTag<8> /*tag*/, Mask128<T> mask) {
#if HWY_ARCH_ARM_A64
  const Full128<int64_t> di;
  const int64x2_t ones =
      vnegq_s64(BitCast(di, VecFromMask(Full128<T>(), mask)).raw);
  return static_cast<size_t>(vaddvq_s64(ones));
#else
  const Full128<uint64_t> du;
  const auto mask_u = VecFromMask(du, RebindMask(du, mask));
  const uint64x2_t ones = vshrq_n_u64(mask_u.raw, 63);
  return static_cast<size_t>(vgetq_lane_u64(ones, 0) + vgetq_lane_u64(ones, 1));
#endif
}

}  // namespace detail

// Full
template <class D, typename T = TFromD<D>>
HWY_API size_t CountTrue(D /* tag */, Mask128<T> mask) {
  return detail::CountTrue(hwy::SizeTag<sizeof(T)>(), mask);
}

// Partial
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API size_t CountTrue(D d, MFromD<D> mask) {
  constexpr int kDiv = 4 * sizeof(TFromD<D>);
  return PopCount(detail::NibblesFromMask(d, mask)) / kDiv;
}

template <class D>
HWY_API size_t FindKnownFirstTrue(D d, MFromD<D> mask) {
  const uint64_t nib = detail::NibblesFromMask(d, mask);
  constexpr size_t kDiv = 4 * sizeof(TFromD<D>);
  return Num0BitsBelowLS1Bit_Nonzero64(nib) / kDiv;
}

template <class D>
HWY_API intptr_t FindFirstTrue(D d, MFromD<D> mask) {
  const uint64_t nib = detail::NibblesFromMask(d, mask);
  if (nib == 0) return -1;
  constexpr size_t kDiv = 4 * sizeof(TFromD<D>);
  return static_cast<intptr_t>(Num0BitsBelowLS1Bit_Nonzero64(nib) / kDiv);
}

template <class D>
HWY_API size_t FindKnownLastTrue(D d, MFromD<D> mask) {
  const uint64_t nib = detail::NibblesFromMask(d, mask);
  constexpr size_t kDiv = 4 * sizeof(TFromD<D>);
  return (63 - Num0BitsAboveMS1Bit_Nonzero64(nib)) / kDiv;
}

template <class D>
HWY_API intptr_t FindLastTrue(D d, MFromD<D> mask) {
  const uint64_t nib = detail::NibblesFromMask(d, mask);
  if (nib == 0) return -1;
  constexpr size_t kDiv = 4 * sizeof(TFromD<D>);
  return static_cast<intptr_t>((63 - Num0BitsAboveMS1Bit_Nonzero64(nib)) /
                               kDiv);
}

// `p` points to at least 8 writable bytes.
template <class D>
HWY_API size_t StoreMaskBits(D d, MFromD<D> mask, uint8_t* bits) {
  const uint64_t mask_bits = detail::BitsFromMask(mask);
  const size_t kNumBytes = (d.MaxLanes() + 7) / 8;
  CopyBytes<kNumBytes>(&mask_bits, bits);
  return kNumBytes;
}

template <class D>
HWY_API bool AllFalse(D d, MFromD<D> m) {
  return detail::NibblesFromMask(d, m) == 0;
}

// Full
template <class D, typename T = TFromD<D>>
HWY_API bool AllTrue(D d, Mask128<T> m) {
  return detail::NibblesFromMask(d, m) == ~0ull;
}
// Partial
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API bool AllTrue(D d, MFromD<D> m) {
  return detail::NibblesFromMask(d, m) == (1ull << (d.MaxBytes() * 4)) - 1;
}

// ------------------------------ Compress

template <typename T>
struct CompressIsPartition {
  enum { value = (sizeof(T) != 1) };
};

namespace detail {

// Load 8 bytes, replicate into upper half so ZipLower can use the lower half.
template <class D, HWY_IF_V_SIZE_D(D, 16)>
HWY_INLINE Vec128<uint8_t> Load8Bytes(D /*tag*/, const uint8_t* bytes) {
  return Vec128<uint8_t>(vreinterpretq_u8_u64(
      vld1q_dup_u64(reinterpret_cast<const uint64_t*>(bytes))));
}

// Load 8 bytes and return half-reg with N <= 8 bytes.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_INLINE VFromD<D> Load8Bytes(D d, const uint8_t* bytes) {
  return Load(d, bytes);
}

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> IdxFromBits(hwy::SizeTag<2> /*tag*/,
                                    uint64_t mask_bits) {
  HWY_DASSERT(mask_bits < 256);
  const Simd<T, N, 0> d;
  const Repartition<uint8_t, decltype(d)> d8;
  const Simd<uint16_t, N, 0> du;

  // NEON does not provide an equivalent of AVX2 permutevar, so we need byte
  // indices for VTBL (one vector's worth for each of 256 combinations of
  // 8 mask bits). Loading them directly would require 4 KiB. We can instead
  // store lane indices and convert to byte indices (2*lane + 0..1), with the
  // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane
  // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts.
  // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles
  // is likely more costly than the higher cache footprint from storing bytes.
  alignas(16) static constexpr uint8_t table[256 * 8] = {
      // PrintCompress16x8Tables
      0,  2,  4,  6,  8,  10, 12, 14, /**/ 0, 2,  4,  6,  8,  10, 12, 14,  //
      2,  0,  4,  6,  8,  10, 12, 14, /**/ 0, 2,  4,  6,  8,  10, 12, 14,  //
      4,  0,  2,  6,  8,  10, 12, 14, /**/ 0, 4,  2,  6,  8,  10, 12, 14,  //
      2,  4,  0,  6,  8,  10, 12, 14, /**/ 0, 2,  4,  6,  8,  10, 12, 14,  //
      6,  0,  2,  4,  8,  10, 12, 14, /**/ 0, 6,  2,  4,  8,  10, 12, 14,  //
      2,  6,  0,  4,  8,  10, 12, 14, /**/ 0, 2,  6,  4,  8,  10, 12, 14,  //
      4,  6,  0,  2,  8,  10, 12, 14, /**/ 0, 4,  6,  2,  8,  10, 12, 14,  //
      2,  4,  6,  0,  8,  10, 12, 14, /**/ 0, 2,  4,  6,  8,  10, 12, 14,  //
      8,  0,  2,  4,  6,  10, 12, 14, /**/ 0, 8,  2,  4,  6,  10, 12, 14,  //
      2,  8,  0,  4,  6,  10, 12, 14, /**/ 0, 2,  8,  4,  6,  10, 12, 14,  //
      4,  8,  0,  2,  6,  10, 12, 14, /**/ 0, 4,  8,  2,  6,  10, 12, 14,  //
      2,  4,  8,  0,  6,  10, 12, 14, /**/ 0, 2,  4,  8,  6,  10, 12, 14,  //
      6,  8,  0,  2,  4,  10, 12, 14, /**/ 0, 6,  8,  2,  4,  10, 12, 14,  //
      2,  6,  8,  0,  4,  10, 12, 14, /**/ 0, 2,  6,  8,  4,  10, 12, 14,  //
      4,  6,  8,  0,  2,  10, 12, 14, /**/ 0, 4,  6,  8,  2,  10, 12, 14,  //
      2,  4,  6,  8,  0,  10, 12, 14, /**/ 0, 2,  4,  6,  8,  10, 12, 14,  //
      10, 0,  2,  4,  6,  8,  12, 14, /**/ 0, 10, 2,  4,  6,  8,  12, 14,  //
      2,  10, 0,  4,  6,  8,  12, 14, /**/ 0, 2,  10, 4,  6,  8,  12, 14,  //
      4,  10, 0,  2,  6,  8,  12, 14, /**/ 0, 4,  10, 2,  6,  8,  12, 14,  //
      2,  4,  10, 0,  6,  8,  12, 14, /**/ 0, 2,  4,  10, 6,  8,  12, 14,  //
      6,  10, 0,  2,  4,  8,  12, 14, /**/ 0, 6,  10, 2,  4,  8,  12, 14,  //
      2,  6,  10, 0,  4,  8,  12, 14, /**/ 0, 2,  6,  10, 4,  8,  12, 14,  //
      4,  6,  10, 0,  2,  8,  12, 14, /**/ 0, 4,  6,  10, 2,  8,  12, 14,  //
      2,  4,  6,  10, 0,  8,  12, 14, /**/ 0, 2,  4,  6,  10, 8,  12, 14,  //
      8,  10, 0,  2,  4,  6,  12, 14, /**/ 0, 8,  10, 2,  4,  6,  12, 14,  //
      2,  8,  10, 0,  4,  6,  12, 14, /**/ 0, 2,  8,  10, 4,  6,  12, 14,  //
      4,  8,  10, 0,  2,  6,  12, 14, /**/ 0, 4,  8,  10, 2,  6,  12, 14,  //
      2,  4,  8,  10, 0,  6,  12, 14, /**/ 0, 2,  4,  8,  10, 6,  12, 14,  //
      6,  8,  10, 0,  2,  4,  12, 14, /**/ 0, 6,  8,  10, 2,  4,  12, 14,  //
      2,  6,  8,  10, 0,  4,  12, 14, /**/ 0, 2,  6,  8,  10, 4,  12, 14,  //
      4,  6,  8,  10, 0,  2,  12, 14, /**/ 0, 4,  6,  8,  10, 2,  12, 14,  //
      2,  4,  6,  8,  10, 0,  12, 14, /**/ 0, 2,  4,  6,  8,  10, 12, 14,  //
      12, 0,  2,  4,  6,  8,  10, 14, /**/ 0, 12, 2,  4,  6,  8,  10, 14,  //
      2,  12, 0,  4,  6,  8,  10, 14, /**/ 0, 2,  12, 4,  6,  8,  10, 14,  //
      4,  12, 0,  2,  6,  8,  10, 14, /**/ 0, 4,  12, 2,  6,  8,  10, 14,  //
      2,  4,  12, 0,  6,  8,  10, 14, /**/ 0, 2,  4,  12, 6,  8,  10, 14,  //
      6,  12, 0,  2,  4,  8,  10, 14, /**/ 0, 6,  12, 2,  4,  8,  10, 14,  //
      2,  6,  12, 0,  4,  8,  10, 14, /**/ 0, 2,  6,  12, 4,  8,  10, 14,  //
      4,  6,  12, 0,  2,  8,  10, 14, /**/ 0, 4,  6,  12, 2,  8,  10, 14,  //
      2,  4,  6,  12, 0,  8,  10, 14, /**/ 0, 2,  4,  6,  12, 8,  10, 14,  //
      8,  12, 0,  2,  4,  6,  10, 14, /**/ 0, 8,  12, 2,  4,  6,  10, 14,  //
      2,  8,  12, 0,  4,  6,  10, 14, /**/ 0, 2,  8,  12, 4,  6,  10, 14,  //
      4,  8,  12, 0,  2,  6,  10, 14, /**/ 0, 4,  8,  12, 2,  6,  10, 14,  //
      2,  4,  8,  12, 0,  6,  10, 14, /**/ 0, 2,  4,  8,  12, 6,  10, 14,  //
      6,  8,  12, 0,  2,  4,  10, 14, /**/ 0, 6,  8,  12, 2,  4,  10, 14,  //
      2,  6,  8,  12, 0,  4,  10, 14, /**/ 0, 2,  6,  8,  12, 4,  10, 14,  //
      4,  6,  8,  12, 0,  2,  10, 14, /**/ 0, 4,  6,  8,  12, 2,  10, 14,  //
      2,  4,  6,  8,  12, 0,  10, 14, /**/ 0, 2,  4,  6,  8,  12, 10, 14,  //
      10, 12, 0,  2,  4,  6,  8,  14, /**/ 0, 10, 12, 2,  4,  6,  8,  14,  //
      2,  10, 12, 0,  4,  6,  8,  14, /**/ 0, 2,  10, 12, 4,  6,  8,  14,  //
      4,  10, 12, 0,  2,  6,  8,  14, /**/ 0, 4,  10, 12, 2,  6,  8,  14,  //
      2,  4,  10, 12, 0,  6,  8,  14, /**/ 0, 2,  4,  10, 12, 6,  8,  14,  //
      6,  10, 12, 0,  2,  4,  8,  14, /**/ 0, 6,  10, 12, 2,  4,  8,  14,  //
      2,  6,  10, 12, 0,  4,  8,  14, /**/ 0, 2,  6,  10, 12, 4,  8,  14,  //
      4,  6,  10, 12, 0,  2,  8,  14, /**/ 0, 4,  6,  10, 12, 2,  8,  14,  //
      2,  4,  6,  10, 12, 0,  8,  14, /**/ 0, 2,  4,  6,  10, 12, 8,  14,  //
      8,  10, 12, 0,  2,  4,  6,  14, /**/ 0, 8,  10, 12, 2,  4,  6,  14,  //
      2,  8,  10, 12, 0,  4,  6,  14, /**/ 0, 2,  8,  10, 12, 4,  6,  14,  //
      4,  8,  10, 12, 0,  2,  6,  14, /**/ 0, 4,  8,  10, 12, 2,  6,  14,  //
      2,  4,  8,  10, 12, 0,  6,  14, /**/ 0, 2,  4,  8,  10, 12, 6,  14,  //
      6,  8,  10, 12, 0,  2,  4,  14, /**/ 0, 6,  8,  10, 12, 2,  4,  14,  //
      2,  6,  8,  10, 12, 0,  4,  14, /**/ 0, 2,  6,  8,  10, 12, 4,  14,  //
      4,  6,  8,  10, 12, 0,  2,  14, /**/ 0, 4,  6,  8,  10, 12, 2,  14,  //
      2,  4,  6,  8,  10, 12, 0,  14, /**/ 0, 2,  4,  6,  8,  10, 12, 14,  //
      14, 0,  2,  4,  6,  8,  10, 12, /**/ 0, 14, 2,  4,  6,  8,  10, 12,  //
      2,  14, 0,  4,  6,  8,  10, 12, /**/ 0, 2,  14, 4,  6,  8,  10, 12,  //
      4,  14, 0,  2,  6,  8,  10, 12, /**/ 0, 4,  14, 2,  6,  8,  10, 12,  //
      2,  4,  14, 0,  6,  8,  10, 12, /**/ 0, 2,  4,  14, 6,  8,  10, 12,  //
      6,  14, 0,  2,  4,  8,  10, 12, /**/ 0, 6,  14, 2,  4,  8,  10, 12,  //
      2,  6,  14, 0,  4,  8,  10, 12, /**/ 0, 2,  6,  14, 4,  8,  10, 12,  //
      4,  6,  14, 0,  2,  8,  10, 12, /**/ 0, 4,  6,  14, 2,  8,  10, 12,  //
      2,  4,  6,  14, 0,  8,  10, 12, /**/ 0, 2,  4,  6,  14, 8,  10, 12,  //
      8,  14, 0,  2,  4,  6,  10, 12, /**/ 0, 8,  14, 2,  4,  6,  10, 12,  //
      2,  8,  14, 0,  4,  6,  10, 12, /**/ 0, 2,  8,  14, 4,  6,  10, 12,  //
      4,  8,  14, 0,  2,  6,  10, 12, /**/ 0, 4,  8,  14, 2,  6,  10, 12,  //
      2,  4,  8,  14, 0,  6,  10, 12, /**/ 0, 2,  4,  8,  14, 6,  10, 12,  //
      6,  8,  14, 0,  2,  4,  10, 12, /**/ 0, 6,  8,  14, 2,  4,  10, 12,  //
      2,  6,  8,  14, 0,  4,  10, 12, /**/ 0, 2,  6,  8,  14, 4,  10, 12,  //
      4,  6,  8,  14, 0,  2,  10, 12, /**/ 0, 4,  6,  8,  14, 2,  10, 12,  //
      2,  4,  6,  8,  14, 0,  10, 12, /**/ 0, 2,  4,  6,  8,  14, 10, 12,  //
      10, 14, 0,  2,  4,  6,  8,  12, /**/ 0, 10, 14, 2,  4,  6,  8,  12,  //
      2,  10, 14, 0,  4,  6,  8,  12, /**/ 0, 2,  10, 14, 4,  6,  8,  12,  //
      4,  10, 14, 0,  2,  6,  8,  12, /**/ 0, 4,  10, 14, 2,  6,  8,  12,  //
      2,  4,  10, 14, 0,  6,  8,  12, /**/ 0, 2,  4,  10, 14, 6,  8,  12,  //
      6,  10, 14, 0,  2,  4,  8,  12, /**/ 0, 6,  10, 14, 2,  4,  8,  12,  //
      2,  6,  10, 14, 0,  4,  8,  12, /**/ 0, 2,  6,  10, 14, 4,  8,  12,  //
      4,  6,  10, 14, 0,  2,  8,  12, /**/ 0, 4,  6,  10, 14, 2,  8,  12,  //
      2,  4,  6,  10, 14, 0,  8,  12, /**/ 0, 2,  4,  6,  10, 14, 8,  12,  //
      8,  10, 14, 0,  2,  4,  6,  12, /**/ 0, 8,  10, 14, 2,  4,  6,  12,  //
      2,  8,  10, 14, 0,  4,  6,  12, /**/ 0, 2,  8,  10, 14, 4,  6,  12,  //
      4,  8,  10, 14, 0,  2,  6,  12, /**/ 0, 4,  8,  10, 14, 2,  6,  12,  //
      2,  4,  8,  10, 14, 0,  6,  12, /**/ 0, 2,  4,  8,  10, 14, 6,  12,  //
      6,  8,  10, 14, 0,  2,  4,  12, /**/ 0, 6,  8,  10, 14, 2,  4,  12,  //
      2,  6,  8,  10, 14, 0,  4,  12, /**/ 0, 2,  6,  8,  10, 14, 4,  12,  //
      4,  6,  8,  10, 14, 0,  2,  12, /**/ 0, 4,  6,  8,  10, 14, 2,  12,  //
      2,  4,  6,  8,  10, 14, 0,  12, /**/ 0, 2,  4,  6,  8,  10, 14, 12,  //
      12, 14, 0,  2,  4,  6,  8,  10, /**/ 0, 12, 14, 2,  4,  6,  8,  10,  //
      2,  12, 14, 0,  4,  6,  8,  10, /**/ 0, 2,  12, 14, 4,  6,  8,  10,  //
      4,  12, 14, 0,  2,  6,  8,  10, /**/ 0, 4,  12, 14, 2,  6,  8,  10,  //
      2,  4,  12, 14, 0,  6,  8,  10, /**/ 0, 2,  4,  12, 14, 6,  8,  10,  //
      6,  12, 14, 0,  2,  4,  8,  10, /**/ 0, 6,  12, 14, 2,  4,  8,  10,  //
      2,  6,  12, 14, 0,  4,  8,  10, /**/ 0, 2,  6,  12, 14, 4,  8,  10,  //
      4,  6,  12, 14, 0,  2,  8,  10, /**/ 0, 4,  6,  12, 14, 2,  8,  10,  //
      2,  4,  6,  12, 14, 0,  8,  10, /**/ 0, 2,  4,  6,  12, 14, 8,  10,  //
      8,  12, 14, 0,  2,  4,  6,  10, /**/ 0, 8,  12, 14, 2,  4,  6,  10,  //
      2,  8,  12, 14, 0,  4,  6,  10, /**/ 0, 2,  8,  12, 14, 4,  6,  10,  //
      4,  8,  12, 14, 0,  2,  6,  10, /**/ 0, 4,  8,  12, 14, 2,  6,  10,  //
      2,  4,  8,  12, 14, 0,  6,  10, /**/ 0, 2,  4,  8,  12, 14, 6,  10,  //
      6,  8,  12, 14, 0,  2,  4,  10, /**/ 0, 6,  8,  12, 14, 2,  4,  10,  //
      2,  6,  8,  12, 14, 0,  4,  10, /**/ 0, 2,  6,  8,  12, 14, 4,  10,  //
      4,  6,  8,  12, 14, 0,  2,  10, /**/ 0, 4,  6,  8,  12, 14, 2,  10,  //
      2,  4,  6,  8,  12, 14, 0,  10, /**/ 0, 2,  4,  6,  8,  12, 14, 10,  //
      10, 12, 14, 0,  2,  4,  6,  8,  /**/ 0, 10, 12, 14, 2,  4,  6,  8,   //
      2,  10, 12, 14, 0,  4,  6,  8,  /**/ 0, 2,  10, 12, 14, 4,  6,  8,   //
      4,  10, 12, 14, 0,  2,  6,  8,  /**/ 0, 4,  10, 12, 14, 2,  6,  8,   //
      2,  4,  10, 12, 14, 0,  6,  8,  /**/ 0, 2,  4,  10, 12, 14, 6,  8,   //
      6,  10, 12, 14, 0,  2,  4,  8,  /**/ 0, 6,  10, 12, 14, 2,  4,  8,   //
      2,  6,  10, 12, 14, 0,  4,  8,  /**/ 0, 2,  6,  10, 12, 14, 4,  8,   //
      4,  6,  10, 12, 14, 0,  2,  8,  /**/ 0, 4,  6,  10, 12, 14, 2,  8,   //
      2,  4,  6,  10, 12, 14, 0,  8,  /**/ 0, 2,  4,  6,  10, 12, 14, 8,   //
      8,  10, 12, 14, 0,  2,  4,  6,  /**/ 0, 8,  10, 12, 14, 2,  4,  6,   //
      2,  8,  10, 12, 14, 0,  4,  6,  /**/ 0, 2,  8,  10, 12, 14, 4,  6,   //
      4,  8,  10, 12, 14, 0,  2,  6,  /**/ 0, 4,  8,  10, 12, 14, 2,  6,   //
      2,  4,  8,  10, 12, 14, 0,  6,  /**/ 0, 2,  4,  8,  10, 12, 14, 6,   //
      6,  8,  10, 12, 14, 0,  2,  4,  /**/ 0, 6,  8,  10, 12, 14, 2,  4,   //
      2,  6,  8,  10, 12, 14, 0,  4,  /**/ 0, 2,  6,  8,  10, 12, 14, 4,   //
      4,  6,  8,  10, 12, 14, 0,  2,  /**/ 0, 4,  6,  8,  10, 12, 14, 2,   //
      2,  4,  6,  8,  10, 12, 14, 0,  /**/ 0, 2,  4,  6,  8,  10, 12, 14};

  const Vec128<uint8_t, 2 * N> byte_idx = Load8Bytes(d8, table + mask_bits * 8);
  const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
  return BitCast(d, pairs + Set(du, 0x0100));
}

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> IdxFromNotBits(hwy::SizeTag<2> /*tag*/,
                                       uint64_t mask_bits) {
  HWY_DASSERT(mask_bits < 256);
  const Simd<T, N, 0> d;
  const Repartition<uint8_t, decltype(d)> d8;
  const Simd<uint16_t, N, 0> du;

  // NEON does not provide an equivalent of AVX2 permutevar, so we need byte
  // indices for VTBL (one vector's worth for each of 256 combinations of
  // 8 mask bits). Loading them directly would require 4 KiB. We can instead
  // store lane indices and convert to byte indices (2*lane + 0..1), with the
  // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane
  // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts.
  // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles
  // is likely more costly than the higher cache footprint from storing bytes.
  alignas(16) static constexpr uint8_t table[256 * 8] = {
      // PrintCompressNot16x8Tables
      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  6,  8,  10, 12, 14, 0,   //
      0, 4,  6,  8,  10, 12, 14, 2,  /**/ 4,  6,  8,  10, 12, 14, 0,  2,   //
      0, 2,  6,  8,  10, 12, 14, 4,  /**/ 2,  6,  8,  10, 12, 14, 0,  4,   //
      0, 6,  8,  10, 12, 14, 2,  4,  /**/ 6,  8,  10, 12, 14, 0,  2,  4,   //
      0, 2,  4,  8,  10, 12, 14, 6,  /**/ 2,  4,  8,  10, 12, 14, 0,  6,   //
      0, 4,  8,  10, 12, 14, 2,  6,  /**/ 4,  8,  10, 12, 14, 0,  2,  6,   //
      0, 2,  8,  10, 12, 14, 4,  6,  /**/ 2,  8,  10, 12, 14, 0,  4,  6,   //
      0, 8,  10, 12, 14, 2,  4,  6,  /**/ 8,  10, 12, 14, 0,  2,  4,  6,   //
      0, 2,  4,  6,  10, 12, 14, 8,  /**/ 2,  4,  6,  10, 12, 14, 0,  8,   //
      0, 4,  6,  10, 12, 14, 2,  8,  /**/ 4,  6,  10, 12, 14, 0,  2,  8,   //
      0, 2,  6,  10, 12, 14, 4,  8,  /**/ 2,  6,  10, 12, 14, 0,  4,  8,   //
      0, 6,  10, 12, 14, 2,  4,  8,  /**/ 6,  10, 12, 14, 0,  2,  4,  8,   //
      0, 2,  4,  10, 12, 14, 6,  8,  /**/ 2,  4,  10, 12, 14, 0,  6,  8,   //
      0, 4,  10, 12, 14, 2,  6,  8,  /**/ 4,  10, 12, 14, 0,  2,  6,  8,   //
      0, 2,  10, 12, 14, 4,  6,  8,  /**/ 2,  10, 12, 14, 0,  4,  6,  8,   //
      0, 10, 12, 14, 2,  4,  6,  8,  /**/ 10, 12, 14, 0,  2,  4,  6,  8,   //
      0, 2,  4,  6,  8,  12, 14, 10, /**/ 2,  4,  6,  8,  12, 14, 0,  10,  //
      0, 4,  6,  8,  12, 14, 2,  10, /**/ 4,  6,  8,  12, 14, 0,  2,  10,  //
      0, 2,  6,  8,  12, 14, 4,  10, /**/ 2,  6,  8,  12, 14, 0,  4,  10,  //
      0, 6,  8,  12, 14, 2,  4,  10, /**/ 6,  8,  12, 14, 0,  2,  4,  10,  //
      0, 2,  4,  8,  12, 14, 6,  10, /**/ 2,  4,  8,  12, 14, 0,  6,  10,  //
      0, 4,  8,  12, 14, 2,  6,  10, /**/ 4,  8,  12, 14, 0,  2,  6,  10,  //
      0, 2,  8,  12, 14, 4,  6,  10, /**/ 2,  8,  12, 14, 0,  4,  6,  10,  //
      0, 8,  12, 14, 2,  4,  6,  10, /**/ 8,  12, 14, 0,  2,  4,  6,  10,  //
      0, 2,  4,  6,  12, 14, 8,  10, /**/ 2,  4,  6,  12, 14, 0,  8,  10,  //
      0, 4,  6,  12, 14, 2,  8,  10, /**/ 4,  6,  12, 14, 0,  2,  8,  10,  //
      0, 2,  6,  12, 14, 4,  8,  10, /**/ 2,  6,  12, 14, 0,  4,  8,  10,  //
      0, 6,  12, 14, 2,  4,  8,  10, /**/ 6,  12, 14, 0,  2,  4,  8,  10,  //
      0, 2,  4,  12, 14, 6,  8,  10, /**/ 2,  4,  12, 14, 0,  6,  8,  10,  //
      0, 4,  12, 14, 2,  6,  8,  10, /**/ 4,  12, 14, 0,  2,  6,  8,  10,  //
      0, 2,  12, 14, 4,  6,  8,  10, /**/ 2,  12, 14, 0,  4,  6,  8,  10,  //
      0, 12, 14, 2,  4,  6,  8,  10, /**/ 12, 14, 0,  2,  4,  6,  8,  10,  //
      0, 2,  4,  6,  8,  10, 14, 12, /**/ 2,  4,  6,  8,  10, 14, 0,  12,  //
      0, 4,  6,  8,  10, 14, 2,  12, /**/ 4,  6,  8,  10, 14, 0,  2,  12,  //
      0, 2,  6,  8,  10, 14, 4,  12, /**/ 2,  6,  8,  10, 14, 0,  4,  12,  //
      0, 6,  8,  10, 14, 2,  4,  12, /**/ 6,  8,  10, 14, 0,  2,  4,  12,  //
      0, 2,  4,  8,  10, 14, 6,  12, /**/ 2,  4,  8,  10, 14, 0,  6,  12,  //
      0, 4,  8,  10, 14, 2,  6,  12, /**/ 4,  8,  10, 14, 0,  2,  6,  12,  //
      0, 2,  8,  10, 14, 4,  6,  12, /**/ 2,  8,  10, 14, 0,  4,  6,  12,  //
      0, 8,  10, 14, 2,  4,  6,  12, /**/ 8,  10, 14, 0,  2,  4,  6,  12,  //
      0, 2,  4,  6,  10, 14, 8,  12, /**/ 2,  4,  6,  10, 14, 0,  8,  12,  //
      0, 4,  6,  10, 14, 2,  8,  12, /**/ 4,  6,  10, 14, 0,  2,  8,  12,  //
      0, 2,  6,  10, 14, 4,  8,  12, /**/ 2,  6,  10, 14, 0,  4,  8,  12,  //
      0, 6,  10, 14, 2,  4,  8,  12, /**/ 6,  10, 14, 0,  2,  4,  8,  12,  //
      0, 2,  4,  10, 14, 6,  8,  12, /**/ 2,  4,  10, 14, 0,  6,  8,  12,  //
      0, 4,  10, 14, 2,  6,  8,  12, /**/ 4,  10, 14, 0,  2,  6,  8,  12,  //
      0, 2,  10, 14, 4,  6,  8,  12, /**/ 2,  10, 14, 0,  4,  6,  8,  12,  //
      0, 10, 14, 2,  4,  6,  8,  12, /**/ 10, 14, 0,  2,  4,  6,  8,  12,  //
      0, 2,  4,  6,  8,  14, 10, 12, /**/ 2,  4,  6,  8,  14, 0,  10, 12,  //
      0, 4,  6,  8,  14, 2,  10, 12, /**/ 4,  6,  8,  14, 0,  2,  10, 12,  //
      0, 2,  6,  8,  14, 4,  10, 12, /**/ 2,  6,  8,  14, 0,  4,  10, 12,  //
      0, 6,  8,  14, 2,  4,  10, 12, /**/ 6,  8,  14, 0,  2,  4,  10, 12,  //
      0, 2,  4,  8,  14, 6,  10, 12, /**/ 2,  4,  8,  14, 0,  6,  10, 12,  //
      0, 4,  8,  14, 2,  6,  10, 12, /**/ 4,  8,  14, 0,  2,  6,  10, 12,  //
      0, 2,  8,  14, 4,  6,  10, 12, /**/ 2,  8,  14, 0,  4,  6,  10, 12,  //
      0, 8,  14, 2,  4,  6,  10, 12, /**/ 8,  14, 0,  2,  4,  6,  10, 12,  //
      0, 2,  4,  6,  14, 8,  10, 12, /**/ 2,  4,  6,  14, 0,  8,  10, 12,  //
      0, 4,  6,  14, 2,  8,  10, 12, /**/ 4,  6,  14, 0,  2,  8,  10, 12,  //
      0, 2,  6,  14, 4,  8,  10, 12, /**/ 2,  6,  14, 0,  4,  8,  10, 12,  //
      0, 6,  14, 2,  4,  8,  10, 12, /**/ 6,  14, 0,  2,  4,  8,  10, 12,  //
      0, 2,  4,  14, 6,  8,  10, 12, /**/ 2,  4,  14, 0,  6,  8,  10, 12,  //
      0, 4,  14, 2,  6,  8,  10, 12, /**/ 4,  14, 0,  2,  6,  8,  10, 12,  //
      0, 2,  14, 4,  6,  8,  10, 12, /**/ 2,  14, 0,  4,  6,  8,  10, 12,  //
      0, 14, 2,  4,  6,  8,  10, 12, /**/ 14, 0,  2,  4,  6,  8,  10, 12,  //
      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  6,  8,  10, 12, 0,  14,  //
      0, 4,  6,  8,  10, 12, 2,  14, /**/ 4,  6,  8,  10, 12, 0,  2,  14,  //
      0, 2,  6,  8,  10, 12, 4,  14, /**/ 2,  6,  8,  10, 12, 0,  4,  14,  //
      0, 6,  8,  10, 12, 2,  4,  14, /**/ 6,  8,  10, 12, 0,  2,  4,  14,  //
      0, 2,  4,  8,  10, 12, 6,  14, /**/ 2,  4,  8,  10, 12, 0,  6,  14,  //
      0, 4,  8,  10, 12, 2,  6,  14, /**/ 4,  8,  10, 12, 0,  2,  6,  14,  //
      0, 2,  8,  10, 12, 4,  6,  14, /**/ 2,  8,  10, 12, 0,  4,  6,  14,  //
      0, 8,  10, 12, 2,  4,  6,  14, /**/ 8,  10, 12, 0,  2,  4,  6,  14,  //
      0, 2,  4,  6,  10, 12, 8,  14, /**/ 2,  4,  6,  10, 12, 0,  8,  14,  //
      0, 4,  6,  10, 12, 2,  8,  14, /**/ 4,  6,  10, 12, 0,  2,  8,  14,  //
      0, 2,  6,  10, 12, 4,  8,  14, /**/ 2,  6,  10, 12, 0,  4,  8,  14,  //
      0, 6,  10, 12, 2,  4,  8,  14, /**/ 6,  10, 12, 0,  2,  4,  8,  14,  //
      0, 2,  4,  10, 12, 6,  8,  14, /**/ 2,  4,  10, 12, 0,  6,  8,  14,  //
      0, 4,  10, 12, 2,  6,  8,  14, /**/ 4,  10, 12, 0,  2,  6,  8,  14,  //
      0, 2,  10, 12, 4,  6,  8,  14, /**/ 2,  10, 12, 0,  4,  6,  8,  14,  //
      0, 10, 12, 2,  4,  6,  8,  14, /**/ 10, 12, 0,  2,  4,  6,  8,  14,  //
      0, 2,  4,  6,  8,  12, 10, 14, /**/ 2,  4,  6,  8,  12, 0,  10, 14,  //
      0, 4,  6,  8,  12, 2,  10, 14, /**/ 4,  6,  8,  12, 0,  2,  10, 14,  //
      0, 2,  6,  8,  12, 4,  10, 14, /**/ 2,  6,  8,  12, 0,  4,  10, 14,  //
      0, 6,  8,  12, 2,  4,  10, 14, /**/ 6,  8,  12, 0,  2,  4,  10, 14,  //
      0, 2,  4,  8,  12, 6,  10, 14, /**/ 2,  4,  8,  12, 0,  6,  10, 14,  //
      0, 4,  8,  12, 2,  6,  10, 14, /**/ 4,  8,  12, 0,  2,  6,  10, 14,  //
      0, 2,  8,  12, 4,  6,  10, 14, /**/ 2,  8,  12, 0,  4,  6,  10, 14,  //
      0, 8,  12, 2,  4,  6,  10, 14, /**/ 8,  12, 0,  2,  4,  6,  10, 14,  //
      0, 2,  4,  6,  12, 8,  10, 14, /**/ 2,  4,  6,  12, 0,  8,  10, 14,  //
      0, 4,  6,  12, 2,  8,  10, 14, /**/ 4,  6,  12, 0,  2,  8,  10, 14,  //
      0, 2,  6,  12, 4,  8,  10, 14, /**/ 2,  6,  12, 0,  4,  8,  10, 14,  //
      0, 6,  12, 2,  4,  8,  10, 14, /**/ 6,  12, 0,  2,  4,  8,  10, 14,  //
      0, 2,  4,  12, 6,  8,  10, 14, /**/ 2,  4,  12, 0,  6,  8,  10, 14,  //
      0, 4,  12, 2,  6,  8,  10, 14, /**/ 4,  12, 0,  2,  6,  8,  10, 14,  //
      0, 2,  12, 4,  6,  8,  10, 14, /**/ 2,  12, 0,  4,  6,  8,  10, 14,  //
      0, 12, 2,  4,  6,  8,  10, 14, /**/ 12, 0,  2,  4,  6,  8,  10, 14,  //
      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  6,  8,  10, 0,  12, 14,  //
      0, 4,  6,  8,  10, 2,  12, 14, /**/ 4,  6,  8,  10, 0,  2,  12, 14,  //
      0, 2,  6,  8,  10, 4,  12, 14, /**/ 2,  6,  8,  10, 0,  4,  12, 14,  //
      0, 6,  8,  10, 2,  4,  12, 14, /**/ 6,  8,  10, 0,  2,  4,  12, 14,  //
      0, 2,  4,  8,  10, 6,  12, 14, /**/ 2,  4,  8,  10, 0,  6,  12, 14,  //
      0, 4,  8,  10, 2,  6,  12, 14, /**/ 4,  8,  10, 0,  2,  6,  12, 14,  //
      0, 2,  8,  10, 4,  6,  12, 14, /**/ 2,  8,  10, 0,  4,  6,  12, 14,  //
      0, 8,  10, 2,  4,  6,  12, 14, /**/ 8,  10, 0,  2,  4,  6,  12, 14,  //
      0, 2,  4,  6,  10, 8,  12, 14, /**/ 2,  4,  6,  10, 0,  8,  12, 14,  //
      0, 4,  6,  10, 2,  8,  12, 14, /**/ 4,  6,  10, 0,  2,  8,  12, 14,  //
      0, 2,  6,  10, 4,  8,  12, 14, /**/ 2,  6,  10, 0,  4,  8,  12, 14,  //
      0, 6,  10, 2,  4,  8,  12, 14, /**/ 6,  10, 0,  2,  4,  8,  12, 14,  //
      0, 2,  4,  10, 6,  8,  12, 14, /**/ 2,  4,  10, 0,  6,  8,  12, 14,  //
      0, 4,  10, 2,  6,  8,  12, 14, /**/ 4,  10, 0,  2,  6,  8,  12, 14,  //
      0, 2,  10, 4,  6,  8,  12, 14, /**/ 2,  10, 0,  4,  6,  8,  12, 14,  //
      0, 10, 2,  4,  6,  8,  12, 14, /**/ 10, 0,  2,  4,  6,  8,  12, 14,  //
      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  6,  8,  0,  10, 12, 14,  //
      0, 4,  6,  8,  2,  10, 12, 14, /**/ 4,  6,  8,  0,  2,  10, 12, 14,  //
      0, 2,  6,  8,  4,  10, 12, 14, /**/ 2,  6,  8,  0,  4,  10, 12, 14,  //
      0, 6,  8,  2,  4,  10, 12, 14, /**/ 6,  8,  0,  2,  4,  10, 12, 14,  //
      0, 2,  4,  8,  6,  10, 12, 14, /**/ 2,  4,  8,  0,  6,  10, 12, 14,  //
      0, 4,  8,  2,  6,  10, 12, 14, /**/ 4,  8,  0,  2,  6,  10, 12, 14,  //
      0, 2,  8,  4,  6,  10, 12, 14, /**/ 2,  8,  0,  4,  6,  10, 12, 14,  //
      0, 8,  2,  4,  6,  10, 12, 14, /**/ 8,  0,  2,  4,  6,  10, 12, 14,  //
      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  6,  0,  8,  10, 12, 14,  //
      0, 4,  6,  2,  8,  10, 12, 14, /**/ 4,  6,  0,  2,  8,  10, 12, 14,  //
      0, 2,  6,  4,  8,  10, 12, 14, /**/ 2,  6,  0,  4,  8,  10, 12, 14,  //
      0, 6,  2,  4,  8,  10, 12, 14, /**/ 6,  0,  2,  4,  8,  10, 12, 14,  //
      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  0,  6,  8,  10, 12, 14,  //
      0, 4,  2,  6,  8,  10, 12, 14, /**/ 4,  0,  2,  6,  8,  10, 12, 14,  //
      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  0,  4,  6,  8,  10, 12, 14,  //
      0, 2,  4,  6,  8,  10, 12, 14, /**/ 0,  2,  4,  6,  8,  10, 12, 14};

  const Vec128<uint8_t, 2 * N> byte_idx = Load8Bytes(d8, table + mask_bits * 8);
  const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
  return BitCast(d, pairs + Set(du, 0x0100));
}

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> IdxFromBits(hwy::SizeTag<4> /*tag*/,
                                    uint64_t mask_bits) {
  HWY_DASSERT(mask_bits < 16);

  // There are only 4 lanes, so we can afford to load the index vector directly.
  alignas(16) static constexpr uint8_t u8_indices[16 * 16] = {
      // PrintCompress32x4Tables
      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,  //
      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,  //
      4,  5,  6,  7,  0,  1,  2,  3,  8,  9,  10, 11, 12, 13, 14, 15,  //
      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,  //
      8,  9,  10, 11, 0,  1,  2,  3,  4,  5,  6,  7,  12, 13, 14, 15,  //
      0,  1,  2,  3,  8,  9,  10, 11, 4,  5,  6,  7,  12, 13, 14, 15,  //
      4,  5,  6,  7,  8,  9,  10, 11, 0,  1,  2,  3,  12, 13, 14, 15,  //
      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,  //
      12, 13, 14, 15, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11,  //
      0,  1,  2,  3,  12, 13, 14, 15, 4,  5,  6,  7,  8,  9,  10, 11,  //
      4,  5,  6,  7,  12, 13, 14, 15, 0,  1,  2,  3,  8,  9,  10, 11,  //
      0,  1,  2,  3,  4,  5,  6,  7,  12, 13, 14, 15, 8,  9,  10, 11,  //
      8,  9,  10, 11, 12, 13, 14, 15, 0,  1,  2,  3,  4,  5,  6,  7,   //
      0,  1,  2,  3,  8,  9,  10, 11, 12, 13, 14, 15, 4,  5,  6,  7,   //
      4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 0,  1,  2,  3,   //
      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15};
  const Simd<T, N, 0> d;
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
}

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> IdxFromNotBits(hwy::SizeTag<4> /*tag*/,
                                       uint64_t mask_bits) {
  HWY_DASSERT(mask_bits < 16);

  // There are only 4 lanes, so we can afford to load the index vector directly.
  alignas(16) static constexpr uint8_t u8_indices[16 * 16] = {
      // PrintCompressNot32x4Tables
      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 4,  5,
      6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 0,  1,  2,  3,  0,  1,  2,  3,
      8,  9,  10, 11, 12, 13, 14, 15, 4,  5,  6,  7,  8,  9,  10, 11, 12, 13,
      14, 15, 0,  1,  2,  3,  4,  5,  6,  7,  0,  1,  2,  3,  4,  5,  6,  7,
      12, 13, 14, 15, 8,  9,  10, 11, 4,  5,  6,  7,  12, 13, 14, 15, 0,  1,
      2,  3,  8,  9,  10, 11, 0,  1,  2,  3,  12, 13, 14, 15, 4,  5,  6,  7,
      8,  9,  10, 11, 12, 13, 14, 15, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
      10, 11, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
      4,  5,  6,  7,  8,  9,  10, 11, 0,  1,  2,  3,  12, 13, 14, 15, 0,  1,
      2,  3,  8,  9,  10, 11, 4,  5,  6,  7,  12, 13, 14, 15, 8,  9,  10, 11,
      0,  1,  2,  3,  4,  5,  6,  7,  12, 13, 14, 15, 0,  1,  2,  3,  4,  5,
      6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 4,  5,  6,  7,  0,  1,  2,  3,
      8,  9,  10, 11, 12, 13, 14, 15, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
      10, 11, 12, 13, 14, 15, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11,
      12, 13, 14, 15};
  const Simd<T, N, 0> d;
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
}

#if HWY_HAVE_INTEGER64 || HWY_HAVE_FLOAT64

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> IdxFromBits(hwy::SizeTag<8> /*tag*/,
                                    uint64_t mask_bits) {
  HWY_DASSERT(mask_bits < 4);

  // There are only 2 lanes, so we can afford to load the index vector directly.
  alignas(16) static constexpr uint8_t u8_indices[64] = {
      // PrintCompress64x2Tables
      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15,
      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15,
      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2,  3,  4,  5,  6,  7,
      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15};

  const Simd<T, N, 0> d;
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
}

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> IdxFromNotBits(hwy::SizeTag<8> /*tag*/,
                                       uint64_t mask_bits) {
  HWY_DASSERT(mask_bits < 4);

  // There are only 2 lanes, so we can afford to load the index vector directly.
  alignas(16) static constexpr uint8_t u8_indices[4 * 16] = {
      // PrintCompressNot64x2Tables
      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15,
      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2,  3,  4,  5,  6,  7,
      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15,
      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15};

  const Simd<T, N, 0> d;
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
}

#endif

// Helper function called by both Compress and CompressStore - avoids a
// redundant BitsFromMask in the latter.
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Compress(Vec128<T, N> v, uint64_t mask_bits) {
  const auto idx =
      detail::IdxFromBits<T, N>(hwy::SizeTag<sizeof(T)>(), mask_bits);
  using D = DFromV<decltype(v)>;
  const RebindToSigned<D> di;
  return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
}

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> CompressNot(Vec128<T, N> v, uint64_t mask_bits) {
  const auto idx =
      detail::IdxFromNotBits<T, N>(hwy::SizeTag<sizeof(T)>(), mask_bits);
  using D = DFromV<decltype(v)>;
  const RebindToSigned<D> di;
  return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
}

}  // namespace detail

// Single lane: no-op
template <typename T>
HWY_API Vec128<T, 1> Compress(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
  return v;
}

// Two lanes: conditional swap
template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
  // If mask[1] = 1 and mask[0] = 0, then swap both halves, else keep.
  const DFromV<decltype(v)> d;
  const Vec128<T, N> m = VecFromMask(d, mask);
  const Vec128<T, N> maskL = DupEven(m);
  const Vec128<T, N> maskH = DupOdd(m);
  const Vec128<T, N> swap = AndNot(maskL, maskH);
  return IfVecThenElse(swap, Shuffle01(v), v);
}

// General case, 2 or 4 byte lanes
template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
  return detail::Compress(v, detail::BitsFromMask(mask));
}

// Single lane: no-op
template <typename T>
HWY_API Vec128<T, 1> CompressNot(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
  return v;
}

// Two lanes: conditional swap
template <typename T, HWY_IF_T_SIZE(T, 8)>
HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) {
  // If mask[1] = 0 and mask[0] = 1, then swap both halves, else keep.
  const DFromV<decltype(v)> d;
  const Vec128<T> m = VecFromMask(d, mask);
  const Vec128<T> maskL = DupEven(m);
  const Vec128<T> maskH = DupOdd(m);
  const Vec128<T> swap = AndNot(maskH, maskL);
  return IfVecThenElse(swap, Shuffle01(v), v);
}

// General case, 2 or 4 byte lanes
template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
  // For partial vectors, we cannot pull the Not() into the table because
  // BitsFromMask clears the upper bits.
  if (N < 16 / sizeof(T)) {
    return detail::Compress(v, detail::BitsFromMask(Not(mask)));
  }
  return detail::CompressNot(v, detail::BitsFromMask(mask));
}

// ------------------------------ CompressBlocksNot
HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
                                           Mask128<uint64_t> /* m */) {
  return v;
}

// ------------------------------ CompressBits

template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
HWY_INLINE Vec128<T, N> CompressBits(Vec128<T, N> v,
                                     const uint8_t* HWY_RESTRICT bits) {
  uint64_t mask_bits = 0;
  constexpr size_t kNumBytes = (N + 7) / 8;
  CopyBytes<kNumBytes>(bits, &mask_bits);
  if (N < 8) {
    mask_bits &= (1ull << N) - 1;
  }

  return detail::Compress(v, mask_bits);
}

// ------------------------------ CompressStore
template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> mask, D d,
                             TFromD<D>* HWY_RESTRICT unaligned) {
  const uint64_t mask_bits = detail::BitsFromMask(mask);
  StoreU(detail::Compress(v, mask_bits), d, unaligned);
  return PopCount(mask_bits);
}

// ------------------------------ CompressBlendedStore
template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
                                    TFromD<D>* HWY_RESTRICT unaligned) {
  const RebindToUnsigned<decltype(d)> du;  // so we can support fp16/bf16
  const uint64_t mask_bits = detail::BitsFromMask(m);
  const size_t count = PopCount(mask_bits);
  const MFromD<D> store_mask = RebindMask(d, FirstN(du, count));
  const VFromD<decltype(du)> compressed =
      detail::Compress(BitCast(du, v), mask_bits);
  BlendedStore(BitCast(d, compressed), store_mask, d, unaligned);
  return count;
}

// ------------------------------ CompressBitsStore

template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,
                                 D d, TFromD<D>* HWY_RESTRICT unaligned) {
  uint64_t mask_bits = 0;
  constexpr size_t kNumBytes = (d.MaxLanes() + 7) / 8;
  CopyBytes<kNumBytes>(bits, &mask_bits);
  if (d.MaxLanes() < 8) {
    mask_bits &= (1ull << d.MaxLanes()) - 1;
  }

  StoreU(detail::Compress(v, mask_bits), d, unaligned);
  return PopCount(mask_bits);
}

// ------------------------------ LoadInterleaved2

// Per-target flag to prevent generic_ops-inl.h from defining LoadInterleaved2.
#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#else
#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
#endif

namespace detail {
#define HWY_NEON_BUILD_TPL_HWY_LOAD_INT
#define HWY_NEON_BUILD_ARG_HWY_LOAD_INT from

#if HWY_ARCH_ARM_A64
#define HWY_IF_LOAD_INT(D) HWY_IF_V_SIZE_GT_D(D, 4)
#define HWY_NEON_DEF_FUNCTION_LOAD_INT HWY_NEON_DEF_FUNCTION_ALL_TYPES
#else
// Exclude 64x2 and f64x1, which are only supported on aarch64
#define HWY_IF_LOAD_INT(D)                                                 \
  HWY_IF_V_SIZE_GT_D(D, 4),                                                \
      hwy::EnableIf<(HWY_MAX_LANES_D(D) == 1 || sizeof(TFromD<D>) < 8)>* = \
          nullptr
#define HWY_NEON_DEF_FUNCTION_LOAD_INT(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args)    \
  HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args)   \
  HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args)       \
  HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args) \
  HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args)
#endif  // HWY_ARCH_ARM_A64

// Must return raw tuple because Tuple2 lack a ctor, and we cannot use
// brace-initialization in HWY_NEON_DEF_FUNCTION because some functions return
// void.
#define HWY_NEON_BUILD_RET_HWY_LOAD_INT(type, size) \
  decltype(Tuple2<type##_t, size>().raw)
// Tuple tag arg allows overloading (cannot just overload on return type)
#define HWY_NEON_BUILD_PARAM_HWY_LOAD_INT(type, size) \
  const type##_t *from, Tuple2<type##_t, size>
HWY_NEON_DEF_FUNCTION_LOAD_INT(LoadInterleaved2, vld2, _, HWY_LOAD_INT)
#undef HWY_NEON_BUILD_RET_HWY_LOAD_INT
#undef HWY_NEON_BUILD_PARAM_HWY_LOAD_INT

#define HWY_NEON_BUILD_RET_HWY_LOAD_INT(type, size) \
  decltype(Tuple3<type##_t, size>().raw)
#define HWY_NEON_BUILD_PARAM_HWY_LOAD_INT(type, size) \
  const type##_t *from, Tuple3<type##_t, size>
HWY_NEON_DEF_FUNCTION_LOAD_INT(LoadInterleaved3, vld3, _, HWY_LOAD_INT)
#undef HWY_NEON_BUILD_PARAM_HWY_LOAD_INT
#undef HWY_NEON_BUILD_RET_HWY_LOAD_INT

#define HWY_NEON_BUILD_RET_HWY_LOAD_INT(type, size) \
  decltype(Tuple4<type##_t, size>().raw)
#define HWY_NEON_BUILD_PARAM_HWY_LOAD_INT(type, size) \
  const type##_t *from, Tuple4<type##_t, size>
HWY_NEON_DEF_FUNCTION_LOAD_INT(LoadInterleaved4, vld4, _, HWY_LOAD_INT)
#undef HWY_NEON_BUILD_PARAM_HWY_LOAD_INT
#undef HWY_NEON_BUILD_RET_HWY_LOAD_INT

#undef HWY_NEON_DEF_FUNCTION_LOAD_INT
#undef HWY_NEON_BUILD_TPL_HWY_LOAD_INT
#undef HWY_NEON_BUILD_ARG_HWY_LOAD_INT
}  // namespace detail

template <class D, HWY_IF_LOAD_INT(D), typename T = TFromD<D>>
HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1) {
  auto raw =
      detail::LoadInterleaved2(unaligned, detail::Tuple2<T, d.MaxLanes()>());
  v0 = VFromD<D>(raw.val[0]);
  v1 = VFromD<D>(raw.val[1]);
}

// <= 32 bits: avoid loading more than N bytes by copying to buffer
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), typename T = TFromD<D>>
HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1) {
  // The smallest vector registers are 64-bits and we want space for two.
  alignas(16) T buf[2 * 8 / sizeof(T)] = {};
  CopyBytes<d.MaxBytes() * 2>(unaligned, buf);
  auto raw = detail::LoadInterleaved2(buf, detail::Tuple2<T, d.MaxLanes()>());
  v0 = VFromD<D>(raw.val[0]);
  v1 = VFromD<D>(raw.val[1]);
}

#if HWY_ARCH_ARM_V7
// 64x2: split into two 64x1
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)>
HWY_API void LoadInterleaved2(D d, T* HWY_RESTRICT unaligned, Vec128<T>& v0,
                              Vec128<T>& v1) {
  const Half<decltype(d)> dh;
  VFromD<decltype(dh)> v00, v10, v01, v11;
  LoadInterleaved2(dh, unaligned, v00, v10);
  LoadInterleaved2(dh, unaligned + 2, v01, v11);
  v0 = Combine(d, v01, v00);
  v1 = Combine(d, v11, v10);
}
#endif  // HWY_ARCH_ARM_V7

// ------------------------------ LoadInterleaved3

template <class D, HWY_IF_LOAD_INT(D), typename T = TFromD<D>>
HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
  auto raw =
      detail::LoadInterleaved3(unaligned, detail::Tuple3<T, d.MaxLanes()>());
  v0 = VFromD<D>(raw.val[0]);
  v1 = VFromD<D>(raw.val[1]);
  v2 = VFromD<D>(raw.val[2]);
}

// <= 32 bits: avoid writing more than N bytes by copying to buffer
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), typename T = TFromD<D>>
HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
  // The smallest vector registers are 64-bits and we want space for three.
  alignas(16) T buf[3 * 8 / sizeof(T)] = {};
  CopyBytes<d.MaxBytes() * 3>(unaligned, buf);
  auto raw = detail::LoadInterleaved3(buf, detail::Tuple3<T, d.MaxLanes()>());
  v0 = VFromD<D>(raw.val[0]);
  v1 = VFromD<D>(raw.val[1]);
  v2 = VFromD<D>(raw.val[2]);
}

#if HWY_ARCH_ARM_V7
// 64x2: split into two 64x1
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)>
HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              Vec128<T>& v0, Vec128<T>& v1, Vec128<T>& v2) {
  const Half<decltype(d)> dh;
  VFromD<decltype(dh)> v00, v10, v20, v01, v11, v21;
  LoadInterleaved3(dh, unaligned, v00, v10, v20);
  LoadInterleaved3(dh, unaligned + 3, v01, v11, v21);
  v0 = Combine(d, v01, v00);
  v1 = Combine(d, v11, v10);
  v2 = Combine(d, v21, v20);
}
#endif  // HWY_ARCH_ARM_V7

// ------------------------------ LoadInterleaved4

template <class D, HWY_IF_LOAD_INT(D), typename T = TFromD<D>>
HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
                              VFromD<D>& v3) {
  auto raw =
      detail::LoadInterleaved4(unaligned, detail::Tuple4<T, d.MaxLanes()>());
  v0 = VFromD<D>(raw.val[0]);
  v1 = VFromD<D>(raw.val[1]);
  v2 = VFromD<D>(raw.val[2]);
  v3 = VFromD<D>(raw.val[3]);
}

// <= 32 bits: avoid writing more than N bytes by copying to buffer
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), typename T = TFromD<D>>
HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
                              VFromD<D>& v3) {
  alignas(16) T buf[4 * 8 / sizeof(T)] = {};
  CopyBytes<d.MaxBytes() * 4>(unaligned, buf);
  auto raw = detail::LoadInterleaved4(buf, detail::Tuple4<T, d.MaxLanes()>());
  v0 = VFromD<D>(raw.val[0]);
  v1 = VFromD<D>(raw.val[1]);
  v2 = VFromD<D>(raw.val[2]);
  v3 = VFromD<D>(raw.val[3]);
}

#if HWY_ARCH_ARM_V7
// 64x2: split into two 64x1
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)>
HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned,
                              Vec128<T>& v0, Vec128<T>& v1, Vec128<T>& v2,
                              Vec128<T>& v3) {
  const Half<decltype(d)> dh;
  VFromD<decltype(dh)> v00, v10, v20, v30, v01, v11, v21, v31;
  LoadInterleaved4(dh, unaligned, v00, v10, v20, v30);
  LoadInterleaved4(dh, unaligned + 4, v01, v11, v21, v31);
  v0 = Combine(d, v01, v00);
  v1 = Combine(d, v11, v10);
  v2 = Combine(d, v21, v20);
  v3 = Combine(d, v31, v30);
}
#endif  // HWY_ARCH_ARM_V7

#undef HWY_IF_LOAD_INT

// ------------------------------ StoreInterleaved2

namespace detail {
#define HWY_NEON_BUILD_TPL_HWY_STORE_INT
#define HWY_NEON_BUILD_RET_HWY_STORE_INT(type, size) void
#define HWY_NEON_BUILD_ARG_HWY_STORE_INT to, tup.raw

#if HWY_ARCH_ARM_A64
#define HWY_IF_STORE_INT(D) HWY_IF_V_SIZE_GT_D(D, 4)
#define HWY_NEON_DEF_FUNCTION_STORE_INT HWY_NEON_DEF_FUNCTION_ALL_TYPES
#else
// Exclude 64x2 and f64x1, which are only supported on aarch64
#define HWY_IF_STORE_INT(D)                                                \
  HWY_IF_V_SIZE_GT_D(D, 4),                                                \
      hwy::EnableIf<(HWY_MAX_LANES_D(D) == 1 || sizeof(TFromD<D>) < 8)>* = \
          nullptr
#define HWY_NEON_DEF_FUNCTION_STORE_INT(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args)     \
  HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args)    \
  HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args)        \
  HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args)  \
  HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args)
#endif  // HWY_ARCH_ARM_A64

#define HWY_NEON_BUILD_PARAM_HWY_STORE_INT(type, size) \
  Tuple2<type##_t, size> tup, type##_t *to
HWY_NEON_DEF_FUNCTION_STORE_INT(StoreInterleaved2, vst2, _, HWY_STORE_INT)
#undef HWY_NEON_BUILD_PARAM_HWY_STORE_INT

#define HWY_NEON_BUILD_PARAM_HWY_STORE_INT(type, size) \
  Tuple3<type##_t, size> tup, type##_t *to
HWY_NEON_DEF_FUNCTION_STORE_INT(StoreInterleaved3, vst3, _, HWY_STORE_INT)
#undef HWY_NEON_BUILD_PARAM_HWY_STORE_INT

#define HWY_NEON_BUILD_PARAM_HWY_STORE_INT(type, size) \
  Tuple4<type##_t, size> tup, type##_t *to
HWY_NEON_DEF_FUNCTION_STORE_INT(StoreInterleaved4, vst4, _, HWY_STORE_INT)
#undef HWY_NEON_BUILD_PARAM_HWY_STORE_INT

#undef HWY_NEON_DEF_FUNCTION_STORE_INT
#undef HWY_NEON_BUILD_TPL_HWY_STORE_INT
#undef HWY_NEON_BUILD_RET_HWY_STORE_INT
#undef HWY_NEON_BUILD_ARG_HWY_STORE_INT
}  // namespace detail

template <class D, HWY_IF_STORE_INT(D), typename T = TFromD<D>>
HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,
                               T* HWY_RESTRICT unaligned) {
  detail::Tuple2<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw}}};
  detail::StoreInterleaved2(tup, unaligned);
}

// <= 32 bits: avoid writing more than N bytes by copying to buffer
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), typename T = TFromD<D>>
HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,
                               T* HWY_RESTRICT unaligned) {
  alignas(16) T buf[2 * 8 / sizeof(T)];
  detail::Tuple2<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw}}};
  detail::StoreInterleaved2(tup, buf);
  CopyBytes<d.MaxBytes() * 2>(buf, unaligned);
}

#if HWY_ARCH_ARM_V7
// 64x2: split into two 64x1
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)>
HWY_API void StoreInterleaved2(Vec128<T> v0, Vec128<T> v1, D d,
                               T* HWY_RESTRICT unaligned) {
  const Half<decltype(d)> dh;
  StoreInterleaved2(LowerHalf(dh, v0), LowerHalf(dh, v1), dh, unaligned);
  StoreInterleaved2(UpperHalf(dh, v0), UpperHalf(dh, v1), dh, unaligned + 2);
}
#endif  // HWY_ARCH_ARM_V7

// ------------------------------ StoreInterleaved3

template <class D, HWY_IF_STORE_INT(D), typename T = TFromD<D>>
HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
                               T* HWY_RESTRICT unaligned) {
  detail::Tuple3<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw, v2.raw}}};
  detail::StoreInterleaved3(tup, unaligned);
}

// <= 32 bits: avoid writing more than N bytes by copying to buffer
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), typename T = TFromD<D>>
HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
                               T* HWY_RESTRICT unaligned) {
  alignas(16) T buf[3 * 8 / sizeof(T)];
  detail::Tuple3<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw, v2.raw}}};
  detail::StoreInterleaved3(tup, buf);
  CopyBytes<d.MaxBytes() * 3>(buf, unaligned);
}

#if HWY_ARCH_ARM_V7
// 64x2: split into two 64x1
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)>
HWY_API void StoreInterleaved3(Vec128<T> v0, Vec128<T> v1, Vec128<T> v2, D d,
                               T* HWY_RESTRICT unaligned) {
  const Half<decltype(d)> dh;
  StoreInterleaved3(LowerHalf(dh, v0), LowerHalf(dh, v1), LowerHalf(dh, v2), dh,
                    unaligned);
  StoreInterleaved3(UpperHalf(dh, v0), UpperHalf(dh, v1), UpperHalf(dh, v2), dh,
                    unaligned + 3);
}
#endif  // HWY_ARCH_ARM_V7

// ------------------------------ StoreInterleaved4

template <class D, HWY_IF_STORE_INT(D), typename T = TFromD<D>>
HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
                               VFromD<D> v3, D d, T* HWY_RESTRICT unaligned) {
  detail::Tuple4<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw, v2.raw, v3.raw}}};
  detail::StoreInterleaved4(tup, unaligned);
}

// <= 32 bits: avoid writing more than N bytes by copying to buffer
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), typename T = TFromD<D>>
HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
                               VFromD<D> v3, D d, T* HWY_RESTRICT unaligned) {
  alignas(16) T buf[4 * 8 / sizeof(T)];
  detail::Tuple4<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw, v2.raw, v3.raw}}};
  detail::StoreInterleaved4(tup, buf);
  CopyBytes<d.MaxBytes() * 4>(buf, unaligned);
}

#if HWY_ARCH_ARM_V7
// 64x2: split into two 64x1
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)>
HWY_API void StoreInterleaved4(Vec128<T> v0, Vec128<T> v1, Vec128<T> v2,
                               Vec128<T> v3, D d, T* HWY_RESTRICT unaligned) {
  const Half<decltype(d)> dh;
  StoreInterleaved4(LowerHalf(dh, v0), LowerHalf(dh, v1), LowerHalf(dh, v2),
                    LowerHalf(dh, v3), dh, unaligned);
  StoreInterleaved4(UpperHalf(dh, v0), UpperHalf(dh, v1), UpperHalf(dh, v2),
                    UpperHalf(dh, v3), dh, unaligned + 4);
}
#endif  // HWY_ARCH_ARM_V7

#undef HWY_IF_STORE_INT

// ------------------------------ Lt128

template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_INLINE MFromD<D> Lt128(D d, VFromD<D> a, VFromD<D> b) {
  static_assert(IsSame<TFromD<D>, uint64_t>(), "T must be u64");
  // Truth table of Eq and Lt for Hi and Lo u64.
  // (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
  // =H =L cH cL  | out = cH | (=H & cL)
  //  0  0  0  0  |  0
  //  0  0  0  1  |  0
  //  0  0  1  0  |  1
  //  0  0  1  1  |  1
  //  0  1  0  0  |  0
  //  0  1  0  1  |  0
  //  0  1  1  0  |  1
  //  1  0  0  0  |  0
  //  1  0  0  1  |  1
  //  1  1  0  0  |  0
  const MFromD<D> eqHL = Eq(a, b);
  const VFromD<D> ltHL = VecFromMask(d, Lt(a, b));
  // We need to bring cL to the upper lane/bit corresponding to cH. Comparing
  // the result of InterleaveUpper/Lower requires 9 ops, whereas shifting the
  // comparison result leftwards requires only 4. IfThenElse compiles to the
  // same code as OrAnd().
  const VFromD<D> ltLx = DupEven(ltHL);
  const VFromD<D> outHx = IfThenElse(eqHL, ltLx, ltHL);
  return MaskFromVec(DupOdd(outHx));
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_INLINE MFromD<D> Lt128Upper(D d, VFromD<D> a, VFromD<D> b) {
  const VFromD<D> ltHL = VecFromMask(d, Lt(a, b));
  return MaskFromVec(InterleaveUpper(d, ltHL, ltHL));
}

// ------------------------------ Eq128

template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_INLINE MFromD<D> Eq128(D d, VFromD<D> a, VFromD<D> b) {
  static_assert(IsSame<TFromD<D>, uint64_t>(), "T must be u64");
  const VFromD<D> eqHL = VecFromMask(d, Eq(a, b));
  return MaskFromVec(And(Reverse2(d, eqHL), eqHL));
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_INLINE MFromD<D> Eq128Upper(D d, VFromD<D> a, VFromD<D> b) {
  const VFromD<D> eqHL = VecFromMask(d, Eq(a, b));
  return MaskFromVec(InterleaveUpper(d, eqHL, eqHL));
}

// ------------------------------ Ne128

template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_INLINE MFromD<D> Ne128(D d, VFromD<D> a, VFromD<D> b) {
  static_assert(IsSame<TFromD<D>, uint64_t>(), "T must be u64");
  const VFromD<D> neHL = VecFromMask(d, Ne(a, b));
  return MaskFromVec(Or(Reverse2(d, neHL), neHL));
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_INLINE MFromD<D> Ne128Upper(D d, VFromD<D> a, VFromD<D> b) {
  const VFromD<D> neHL = VecFromMask(d, Ne(a, b));
  return MaskFromVec(InterleaveUpper(d, neHL, neHL));
}

// ------------------------------ Min128, Max128 (Lt128)

// Without a native OddEven, it seems infeasible to go faster than Lt128.
template <class D>
HWY_INLINE VFromD<D> Min128(D d, VFromD<D> a, VFromD<D> b) {
  return IfThenElse(Lt128(d, a, b), a, b);
}

template <class D>
HWY_INLINE VFromD<D> Max128(D d, VFromD<D> a, VFromD<D> b) {
  return IfThenElse(Lt128(d, b, a), a, b);
}

template <class D>
HWY_INLINE VFromD<D> Min128Upper(D d, VFromD<D> a, VFromD<D> b) {
  return IfThenElse(Lt128Upper(d, a, b), a, b);
}

template <class D>
HWY_INLINE VFromD<D> Max128Upper(D d, VFromD<D> a, VFromD<D> b) {
  return IfThenElse(Lt128Upper(d, b, a), a, b);
}

// -------------------- LeadingZeroCount, TrailingZeroCount, HighestSetBitIndex

#ifdef HWY_NATIVE_LEADING_ZERO_COUNT
#undef HWY_NATIVE_LEADING_ZERO_COUNT
#else
#define HWY_NATIVE_LEADING_ZERO_COUNT
#endif

HWY_NEON_DEF_FUNCTION_INT_8_16_32(LeadingZeroCount, vclz, _, 1)
HWY_NEON_DEF_FUNCTION_UINT_8_16_32(LeadingZeroCount, vclz, _, 1)

template <class V, HWY_IF_UI64_D(DFromV<V>)>
HWY_API V LeadingZeroCount(V v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const Repartition<uint32_t, decltype(d)> du32;

  const auto v_k32 = BitCast(du32, Set(du, 32));
  const auto v_u32_lzcnt = LeadingZeroCount(BitCast(du32, v)) + v_k32;
  const auto v_u32_lo_lzcnt =
      And(v_u32_lzcnt, BitCast(du32, Set(du, 0xFFFFFFFFu)));
  const auto v_u32_hi_lzcnt =
      BitCast(du32, ShiftRight<32>(BitCast(du, v_u32_lzcnt)));

  return BitCast(
      d, IfThenElse(v_u32_hi_lzcnt == v_k32, v_u32_lo_lzcnt, v_u32_hi_lzcnt));
}

template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V HighestSetBitIndex(V v) {
  const DFromV<decltype(v)> d;
  using T = TFromD<decltype(d)>;
  return BitCast(d, Set(d, T{sizeof(T) * 8 - 1}) - LeadingZeroCount(v));
}

template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), HWY_IF_T_SIZE_V(V, 1)>
HWY_API V TrailingZeroCount(V v) {
  return LeadingZeroCount(ReverseBits(v));
}

template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 2) | (1 << 4) | (1 << 8))>
HWY_API V TrailingZeroCount(V v) {
  const DFromV<decltype(v)> d;
  const Repartition<uint8_t, decltype(d)> du8;
  return LeadingZeroCount(
      ReverseLaneBytes(BitCast(d, ReverseBits(BitCast(du8, v)))));
}

namespace detail {  // for code folding
#if HWY_ARCH_ARM_V7
#undef vuzp1_s8
#undef vuzp1_u8
#undef vuzp1_s16
#undef vuzp1_u16
#undef vuzp1_s32
#undef vuzp1_u32
#undef vuzp1_f32
#undef vuzp1q_s8
#undef vuzp1q_u8
#undef vuzp1q_s16
#undef vuzp1q_u16
#undef vuzp1q_s32
#undef vuzp1q_u32
#undef vuzp1q_f32
#undef vuzp2_s8
#undef vuzp2_u8
#undef vuzp2_s16
#undef vuzp2_u16
#undef vuzp2_s32
#undef vuzp2_u32
#undef vuzp2_f32
#undef vuzp2q_s8
#undef vuzp2q_u8
#undef vuzp2q_s16
#undef vuzp2q_u16
#undef vuzp2q_s32
#undef vuzp2q_u32
#undef vuzp2q_f32
#undef vzip1_s8
#undef vzip1_u8
#undef vzip1_s16
#undef vzip1_u16
#undef vzip1_s32
#undef vzip1_u32
#undef vzip1_f32
#undef vzip1q_s8
#undef vzip1q_u8
#undef vzip1q_s16
#undef vzip1q_u16
#undef vzip1q_s32
#undef vzip1q_u32
#undef vzip1q_f32
#undef vzip2_s8
#undef vzip2_u8
#undef vzip2_s16
#undef vzip2_u16
#undef vzip2_s32
#undef vzip2_u32
#undef vzip2_f32
#undef vzip2q_s8
#undef vzip2q_u8
#undef vzip2q_s16
#undef vzip2q_u16
#undef vzip2q_s32
#undef vzip2q_u32
#undef vzip2q_f32
#endif

#undef HWY_NEON_BUILD_ARG_1
#undef HWY_NEON_BUILD_ARG_2
#undef HWY_NEON_BUILD_ARG_3
#undef HWY_NEON_BUILD_PARAM_1
#undef HWY_NEON_BUILD_PARAM_2
#undef HWY_NEON_BUILD_PARAM_3
#undef HWY_NEON_BUILD_RET_1
#undef HWY_NEON_BUILD_RET_2
#undef HWY_NEON_BUILD_RET_3
#undef HWY_NEON_BUILD_TPL_1
#undef HWY_NEON_BUILD_TPL_2
#undef HWY_NEON_BUILD_TPL_3
#undef HWY_NEON_DEF_FUNCTION
#undef HWY_NEON_DEF_FUNCTION_ALL_FLOATS
#undef HWY_NEON_DEF_FUNCTION_ALL_TYPES
#undef HWY_NEON_DEF_FUNCTION_FLOAT_64
#undef HWY_NEON_DEF_FUNCTION_FULL_UI
#undef HWY_NEON_DEF_FUNCTION_INT_16
#undef HWY_NEON_DEF_FUNCTION_INT_32
#undef HWY_NEON_DEF_FUNCTION_INT_8
#undef HWY_NEON_DEF_FUNCTION_INT_8_16_32
#undef HWY_NEON_DEF_FUNCTION_INTS
#undef HWY_NEON_DEF_FUNCTION_INTS_UINTS
#undef HWY_NEON_DEF_FUNCTION_TPL
#undef HWY_NEON_DEF_FUNCTION_UIF81632
#undef HWY_NEON_DEF_FUNCTION_UINT_16
#undef HWY_NEON_DEF_FUNCTION_UINT_32
#undef HWY_NEON_DEF_FUNCTION_UINT_8
#undef HWY_NEON_DEF_FUNCTION_UINT_8_16_32
#undef HWY_NEON_DEF_FUNCTION_UINTS
#undef HWY_NEON_EVAL
}  // namespace detail

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();