/***************************************************************************
 * Copyright (c) Rivos Inc.                                                 *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_RVV_HPP
#define XSIMD_RVV_HPP

#include <complex>
#include <limits>
#include <type_traits>

#include "../types/xsimd_rvv_register.hpp"
#include "xsimd_constants.hpp"

// This set of macros allows the synthesis of identifiers using a template and
// variable macro arguments. A single template can then be used by multiple
// macros, or by multiple instances of a macro, to define the same logic for
// different data types.
//
// First, some logic to paste text together...
//
#define XSIMD_RVV_JOIN_(x, y) x##y
#define XSIMD_RVV_JOIN(x, y) XSIMD_RVV_JOIN_(x, y)
#define XSIMD_RVV_PREFIX_T(T, S, then) XSIMD_RVV_JOIN(T, then)
#define XSIMD_RVV_PREFIX_S(T, S, then) XSIMD_RVV_JOIN(S, then)
#define XSIMD_RVV_PREFIX_M(T, S, then) XSIMD_RVV_JOIN(m1, then)
#define XSIMD_RVV_PREFIX(T, S, then) then
//
// XSIMD_RVV_IDENTIFIER accepts type and size parameters, and a template for
// the identifier. The template is a comma-separated list of alternating
// literal and parameter segments. Each parameter segment is appended to
// XSIMD_RVV_PREFIX to form a new macro name which decides which value (the
// type letter T, the element width S, or the fixed m1 suffix) should be
// inserted, and then the next literal segment is inserted after that. Empty
// literals are used to join two or more variables together.
//
#define XSIMD_RVV_IDENTIFIER9(T, S, t, ...) t
#define XSIMD_RVV_IDENTIFIER8(T, S, t, p, ...) XSIMD_RVV_JOIN(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER9(T, S, __VA_ARGS__)))
#define XSIMD_RVV_IDENTIFIER7(T, S, t, p, ...) XSIMD_RVV_JOIN(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER8(T, S, __VA_ARGS__)))
#define XSIMD_RVV_IDENTIFIER6(T, S, t, p, ...) XSIMD_RVV_JOIN(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER7(T, S, __VA_ARGS__)))
#define XSIMD_RVV_IDENTIFIER5(T, S, t, p, ...) XSIMD_RVV_JOIN(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER6(T, S, __VA_ARGS__)))
#define XSIMD_RVV_IDENTIFIER4(T, S, t, p, ...) XSIMD_RVV_JOIN(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER5(T, S, __VA_ARGS__)))
#define XSIMD_RVV_IDENTIFIER3(T, S, t, p, ...) XSIMD_RVV_JOIN(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER4(T, S, __VA_ARGS__)))
#define XSIMD_RVV_IDENTIFIER2(T, S, t, p, ...) XSIMD_RVV_JOIN(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER3(T, S, __VA_ARGS__)))
#define XSIMD_RVV_IDENTIFIER1(T, S, t, p, ...) XSIMD_RVV_JOIN(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER2(T, S, __VA_ARGS__)))
#define XSIMD_RVV_IDENTIFIER0(T, S, t, p, ...) XSIMD_RVV_JOIN(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER1(T, S, __VA_ARGS__)))
//
// UNBRACKET and REPARSE force the preprocessor to handle expansion in a
// specific order. XSIMD_RVV_UNBRACKET strips the parentheses from the
// template (which were necessary to keep the template as a single, named
// macro parameter up to this point). XSIMD_RVV_ARG_LIST then forms the new
// parameter list to pass to XSIMD_RVV_IDENTIFIER0, with trailing commas to
// ensure the unrolled XSIMD_RVV_IDENTIFIER loop runs to completion, appending
// empty strings.
//
// However, XSIMD_RVV_IDENTIFIER0 is not expanded immediately because it does
// not match a function-like macro in this pass. XSIMD_RVV_REPARSE forces
// another evaluation after the expansion of XSIMD_RVV_ARG_LIST, where
// XSIMD_RVV_IDENTIFIER0 will now match as a function-like macro, and the
// cycle of substitutions and insertions can begin.
//
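// A worked example, for illustration only (the segment macros XSIMD_RVV_S and
// XSIMD_RVV_TSM referred to here are defined just below): the load intrinsic
// used later in this file is named by the template
//     (__riscv_vle XSIMD_RVV_S _v_ XSIMD_RVV_TSM)
// and XSIMD_RVV_IDENTIFIER(f, 32, ...) applied to it pastes together
//     __riscv_vle ## 32 ## _v_ ## f ## 32 ## m1  =>  __riscv_vle32_v_f32m1
// i.e. _S segments insert the element width, _T segments insert the type
// letter, and _M segments insert the fixed LMUL suffix m1.
//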
#define XSIMD_RVV_REPARSE(v) (v)
#define XSIMD_RVV_UNBRACKET(...) __VA_ARGS__
#define XSIMD_RVV_ARG_LIST(T, S, name) (T, S, XSIMD_RVV_UNBRACKET name, , , , , , , , , , , , , , , , , , , , , )
#define XSIMD_RVV_IDENTIFIER(T, S, name) XSIMD_RVV_REPARSE(XSIMD_RVV_IDENTIFIER0 XSIMD_RVV_ARG_LIST(T, S, name))
//
// To avoid comma-counting bugs, replace the variable references with macros
// which include enough commas to keep proper phase, and then use no commas at
// all in the templates.
//
#define XSIMD_RVV_T , _T,
#define XSIMD_RVV_S , _S,
#define XSIMD_RVV_M , _M,
#define XSIMD_RVV_TSM XSIMD_RVV_T XSIMD_RVV_S XSIMD_RVV_M

// XSIMD_RVV_OVERLOAD, below, expands to a head section, a number of body
// sections (depending on which types are supported), and a tail section.
// Different variants of these sections are implemented with different
// suffixes on the three macro names XSIMD_RVV_WRAPPER_HEAD, XSIMD_RVV_WRAPPER,
// and XSIMD_RVV_WRAPPER_TAIL, and the suffix is specified as an argument to
// XSIMD_RVV_OVERLOAD (the empty string is the default, but it still needs an
// extra comma to hold its place).
//
// The default XSIMD_RVV_WRAPPER_HEAD provides a class containing convenient
// names for the function signature argument(s) to XSIMD_RVV_OVERLOAD. That
// signature can also reference the template argument T, because it's a text
// substitution into the template.
#define XSIMD_RVV_WRAPPER_HEAD(NAME, SIGNATURE, ...)                      \
    namespace NAME##_cruft                                                \
    {                                                                     \
        template <class T>                                                \
        struct ctx                                                        \
        {                                                                 \
            static constexpr size_t width = XSIMD_RVV_BITS;               \
            static constexpr size_t vl = width / (sizeof(T) * 8);         \
            using vec = rvv_reg_t<T, width>;                              \
            using uvec = rvv_reg_t<as_unsigned_relaxed_t<T>, width>;      \
            using svec = rvv_reg_t<as_signed_relaxed_t<T>, width>;        \
            using fvec = rvv_reg_t<as_float_relaxed_t<T>, width>;         \
            using bvec = rvv_bool_t<T, width>;                            \
            using scalar_vec = rvv_reg_t<T, types::detail::rvv_width_m1>; \
            using wide_vec = rvv_reg_t<T, width * 2>;                     \
            using narrow_vec = rvv_reg_t<T, width / 2>;                   \
            using type = SIGNATURE;                                       \
        };                                                                \
        template <class T>                                                \
        using sig_t = typename ctx<T>::type;                              \
        template <class K, class T>                                       \
        struct impl                                                       \
        {                                                                 \
            void operator()() const noexcept {};                          \
        };                                                                \
        template <class T>                                                \
        using impl_t = impl<T, sig_t<T>>;
#define XSIMD_RVV_WRAPPER_HEAD_NOVL(...) XSIMD_RVV_WRAPPER_HEAD(__VA_ARGS__)
#define XSIMD_RVV_WRAPPER_HEAD_DROP_1ST(...) XSIMD_RVV_WRAPPER_HEAD(__VA_ARGS__)
#define XSIMD_RVV_WRAPPER_HEAD_DROP_1ST_CUSTOM_ARGS(...) XSIMD_RVV_WRAPPER_HEAD(__VA_ARGS__)
#define XSIMD_RVV_WRAPPER_HEAD_DROP_1ST_CUSTOM_ARGS_NOVL(...) XSIMD_RVV_WRAPPER_HEAD(__VA_ARGS__)
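// For illustration (hypothetical numbers, not a definition used below): if
// XSIMD_RVV_BITS were 128 and T were float, then ctx<float> would have
// vl == 4, with vec/svec/uvec wrapping vfloat32m1_t/vint32m1_t/vuint32m1_t
// registers, bvec wrapping the matching vbool32_t mask, and
// wide_vec/narrow_vec wrapping the double-width (m2) and half-width (mf2)
// forms used by a few of the kernels further down.
//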
// The body of the wrapper defines a functor (because partial specialisation
// of functions is not legal) which forwards its arguments to the named
// intrinsic with a few manipulations. In general, vector types are handled as
// rvv_reg_t<> and rely on the conversion operators in that class for
// compatibility with the intrinsics.
//
// The function signature is not mentioned here. Instead it's provided in the
// tail code as the template argument for which this is a specialisation,
// which overcomes the problem of converting a function signature type to an
// argument list to pass to another function.
//
#define XSIMD_RVV_WRAPPER(KEY, CALLEE, ...)                       \
    template <class Ret, class... Args>                           \
    struct impl<KEY, Ret(Args...)>                                \
    {                                                             \
        using ctx = ctx<KEY>;                                     \
        constexpr Ret operator()(Args... args) const noexcept     \
        {                                                         \
            return CALLEE(args..., ctx::vl);                      \
        };                                                        \
    };
#define XSIMD_RVV_WRAPPER_NOVL(KEY, CALLEE, ...)                  \
    template <class Ret, class... Args>                           \
    struct impl<KEY, Ret(Args...)>                                \
    {                                                             \
        constexpr Ret operator()(Args... args) const noexcept     \
        {                                                         \
            return CALLEE(args...);                               \
        };                                                        \
    };
#define XSIMD_RVV_WRAPPER_DROP_1ST(KEY, CALLEE, ...)                  \
    template <class Ret, class First, class... Args>                  \
    struct impl<KEY, Ret(First, Args...)>                             \
    {                                                                 \
        using ctx = ctx<KEY>;                                         \
        constexpr Ret operator()(First, Args... args) const noexcept  \
        {                                                             \
            return CALLEE(args..., ctx::vl);                          \
        };                                                            \
    };
#define XSIMD_RVV_WRAPPER_DROP_1ST_CUSTOM_ARGS(KEY, CALLEE, SIGNATURE, ...) \
    template <class Ret, class First, class... Args>                        \
    struct impl<KEY, Ret(First, Args...)>                                   \
    {                                                                       \
        using ctx = ctx<KEY>;                                               \
        constexpr Ret operator()(First, Args... args) const noexcept        \
        {                                                                   \
            return CALLEE(__VA_ARGS__, ctx::vl);                            \
        };                                                                  \
    };
#define XSIMD_RVV_WRAPPER_DROP_1ST_CUSTOM_ARGS_NOVL(KEY, CALLEE, SIGNATURE, ...) \
    template <class Ret, class First, class... Args>                             \
    struct impl<KEY, Ret(First, Args...)>                                        \
    {                                                                            \
        constexpr Ret operator()(First, Args... args) const noexcept             \
        {                                                                        \
            return CALLEE(__VA_ARGS__);                                          \
        };                                                                       \
    };

// This part folds all the above templates down into a single functor instance
// with all the different function signatures available under the one name.
// Not all of the base classes necessarily contain useful code, but there's a
// default implementation so that filtering them out isn't really necessary.
#define XSIMD_RVV_WRAPPER_TAIL(NAME, ...)                      \
    } /* namespace NAME##_cruft */                             \
    static constexpr struct : NAME##_cruft::impl_t<int8_t>,    \
                              NAME##_cruft::impl_t<uint8_t>,   \
                              NAME##_cruft::impl_t<int16_t>,   \
                              NAME##_cruft::impl_t<uint16_t>,  \
                              NAME##_cruft::impl_t<int32_t>,   \
                              NAME##_cruft::impl_t<uint32_t>,  \
                              NAME##_cruft::impl_t<int64_t>,   \
                              NAME##_cruft::impl_t<uint64_t>,  \
                              NAME##_cruft::impl_t<float>,     \
                              NAME##_cruft::impl_t<double>     \
    {                                                          \
        using NAME##_cruft::impl_t<int8_t>::operator();        \
        using NAME##_cruft::impl_t<uint8_t>::operator();       \
        using NAME##_cruft::impl_t<int16_t>::operator();       \
        using NAME##_cruft::impl_t<uint16_t>::operator();      \
        using NAME##_cruft::impl_t<int32_t>::operator();       \
        using NAME##_cruft::impl_t<uint32_t>::operator();      \
        using NAME##_cruft::impl_t<int64_t>::operator();       \
        using NAME##_cruft::impl_t<uint64_t>::operator();      \
        using NAME##_cruft::impl_t<float>::operator();         \
        using NAME##_cruft::impl_t<double>::operator();        \
    } NAME {};
#define XSIMD_RVV_WRAPPER_TAIL_NOVL(...) XSIMD_RVV_WRAPPER_TAIL(__VA_ARGS__)
#define XSIMD_RVV_WRAPPER_TAIL_DROP_1ST(...) XSIMD_RVV_WRAPPER_TAIL(__VA_ARGS__)
#define XSIMD_RVV_WRAPPER_TAIL_DROP_1ST_CUSTOM_ARGS(...) XSIMD_RVV_WRAPPER_TAIL(__VA_ARGS__)
#define XSIMD_RVV_WRAPPER_TAIL_DROP_1ST_CUSTOM_ARGS_NOVL(...) XSIMD_RVV_WRAPPER_TAIL(__VA_ARGS__)

// clang-format off
#define XSIMD_RVV_OVERLOAD_head(my_name, variant, ...) \
    XSIMD_RVV_WRAPPER_HEAD##variant(my_name, __VA_ARGS__)
#define XSIMD_RVV_OVERLOAD_i(name, variant, ...) \
    XSIMD_RVV_WRAPPER##variant(int8_t, XSIMD_RVV_IDENTIFIER(i, 8, name), __VA_ARGS__) \
    XSIMD_RVV_WRAPPER##variant(int16_t, XSIMD_RVV_IDENTIFIER(i, 16, name), __VA_ARGS__) \
    XSIMD_RVV_WRAPPER##variant(int32_t, XSIMD_RVV_IDENTIFIER(i, 32, name), __VA_ARGS__) \
    XSIMD_RVV_WRAPPER##variant(int64_t, XSIMD_RVV_IDENTIFIER(i, 64, name), __VA_ARGS__)
#define XSIMD_RVV_OVERLOAD_u(name, variant, ...) \
    XSIMD_RVV_WRAPPER##variant(uint8_t, XSIMD_RVV_IDENTIFIER(u, 8, name), __VA_ARGS__) \
    XSIMD_RVV_WRAPPER##variant(uint16_t, XSIMD_RVV_IDENTIFIER(u, 16, name), __VA_ARGS__) \
    XSIMD_RVV_WRAPPER##variant(uint32_t, XSIMD_RVV_IDENTIFIER(u, 32, name), __VA_ARGS__) \
    XSIMD_RVV_WRAPPER##variant(uint64_t, XSIMD_RVV_IDENTIFIER(u, 64, name), __VA_ARGS__)
#define XSIMD_RVV_OVERLOAD_f(name, variant, ...) \
    XSIMD_RVV_WRAPPER##variant(float, XSIMD_RVV_IDENTIFIER(f, 32, name), __VA_ARGS__) \
    XSIMD_RVV_WRAPPER##variant(double, XSIMD_RVV_IDENTIFIER(f, 64, name), __VA_ARGS__)
#define XSIMD_RVV_OVERLOAD_tail(my_name, variant, ...) \
    XSIMD_RVV_WRAPPER_TAIL##variant(my_name, __VA_ARGS__)
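// A sketch of what the pieces above assemble into (illustrative, not a
// definition): an invocation such as
//     XSIMD_RVV_OVERLOAD3(rvvadd, (__riscv_vadd), (__riscv_vadd),
//                         (__riscv_vfadd), , vec(vec, vec))
// (used in the arithmetic section below) emits a head, one wrapper body per
// supported element type, and a tail, producing a single constexpr functor
// named rvvadd whose operator() overloads forward to the overloaded
// __riscv_vadd / __riscv_vfadd intrinsics with ctx<T>::vl appended as the
// trailing vl argument.
//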
// Use these to create function (actually functor, sorry) wrappers overloaded
// for whichever types are supported. Being functors means they can't take a
// template argument (until C++14), so if a type can't be deduced then a junk
// value can be passed as the first argument and discarded by using the
// _DROP_1ST variant, instead.
//
// The wrappers use the rvv_reg_t<> types for template accessibility, and
// because some types (e.g., vfloat64mf2_t) don't exist and need extra
// abstraction to emulate.
//
// In many cases the intrinsic names differ for the signed, unsigned, or float
// variants, so the macros OVERLOAD2 and OVERLOAD3 (depending on whether or
// not a float variant exists) take multiple intrinsic names and bring them
// together under a single overloaded identifier that can be used within
// templates.
//
#define XSIMD_RVV_OVERLOAD2(my_name, name_i, name_u, variant, ...) \
    XSIMD_RVV_OVERLOAD_head(my_name, variant, __VA_ARGS__) \
    XSIMD_RVV_OVERLOAD_i(name_i, variant, __VA_ARGS__) \
    XSIMD_RVV_OVERLOAD_u(name_u, variant, __VA_ARGS__) \
    XSIMD_RVV_OVERLOAD_tail(my_name, variant, __VA_ARGS__)
#define XSIMD_RVV_OVERLOAD3(my_name, name_i, name_u, name_f, variant, ...) \
    XSIMD_RVV_OVERLOAD_head(my_name, variant, __VA_ARGS__) \
    XSIMD_RVV_OVERLOAD_i(name_i, variant, __VA_ARGS__) \
    XSIMD_RVV_OVERLOAD_u(name_u, variant, __VA_ARGS__) \
    XSIMD_RVV_OVERLOAD_f(name_f, variant, __VA_ARGS__) \
    XSIMD_RVV_OVERLOAD_tail(my_name, variant, __VA_ARGS__)
#define XSIMD_RVV_OVERLOAD(my_name, name, ...) XSIMD_RVV_OVERLOAD3(my_name, name, name, name, __VA_ARGS__)
#define XSIMD_RVV_OVERLOAD_INTS(my_name, name, ...) XSIMD_RVV_OVERLOAD2(my_name, name, name, __VA_ARGS__)
#define XSIMD_RVV_OVERLOAD_SINTS(my_name, name, variant, ...) \
    XSIMD_RVV_OVERLOAD_head(my_name, variant, __VA_ARGS__) \
    XSIMD_RVV_OVERLOAD_i(name, variant, __VA_ARGS__) \
    XSIMD_RVV_OVERLOAD_tail(my_name, variant, __VA_ARGS__)
#define XSIMD_RVV_OVERLOAD_UINTS(my_name, name, variant, ...) \
    XSIMD_RVV_OVERLOAD_head(my_name, variant, __VA_ARGS__) \
    XSIMD_RVV_OVERLOAD_u(name, variant, __VA_ARGS__) \
    XSIMD_RVV_OVERLOAD_tail(my_name, variant, __VA_ARGS__)
#define XSIMD_RVV_OVERLOAD_FLOATS(my_name, name, variant, ...)
\ XSIMD_RVV_OVERLOAD_head(my_name, variant, __VA_ARGS__) \ XSIMD_RVV_OVERLOAD_f(name, variant, __VA_ARGS__) \ XSIMD_RVV_OVERLOAD_tail(my_name, variant, __VA_ARGS__) // clang-format on namespace xsimd { template struct batch_constant; namespace kernel { namespace detail { template using rvv_fix_char_t = types::detail::rvv_fix_char_t; template using rvv_reg_t = types::detail::rvv_reg_t; template using rvv_bool_t = types::detail::rvv_bool_t; template struct as_signed_relaxed; template <> struct as_signed_relaxed<1> { using type = int8_t; }; template <> struct as_signed_relaxed<2> { using type = int16_t; }; template <> struct as_signed_relaxed<4> { using type = int32_t; }; template <> struct as_signed_relaxed<8> { using type = int64_t; }; template using as_signed_relaxed_t = typename as_signed_relaxed::type; template struct as_unsigned_relaxed; template <> struct as_unsigned_relaxed<1> { using type = uint8_t; }; template <> struct as_unsigned_relaxed<2> { using type = uint16_t; }; template <> struct as_unsigned_relaxed<4> { using type = uint32_t; }; template <> struct as_unsigned_relaxed<8> { using type = uint64_t; }; template using as_unsigned_relaxed_t = typename as_unsigned_relaxed::type; template struct as_float_relaxed; template <> struct as_float_relaxed<1> { using type = int8_t; }; template <> struct as_float_relaxed<2> { using type = int16_t; }; template <> struct as_float_relaxed<4> { using type = float; }; template <> struct as_float_relaxed<8> { using type = double; }; template using as_float_relaxed_t = typename as_float_relaxed::type; template rvv_reg_t rvvreinterpret(U const& arg) noexcept { return rvv_reg_t(arg, types::detail::XSIMD_RVV_BITCAST); } template rvv_reg_t rvvreinterpret(batch const& arg) noexcept { typename batch::register_type r = arg; return rvvreinterpret(r); } template > inline batch rvv_to_unsigned_batch(batch const& arg) noexcept { return rvvreinterpret(arg.data); } XSIMD_RVV_OVERLOAD(rvvid, (__riscv_vid_v_u XSIMD_RVV_S XSIMD_RVV_M), _DROP_1ST, uvec(T)) XSIMD_RVV_OVERLOAD3(rvvmv_splat, (__riscv_vmv_v_x_ XSIMD_RVV_TSM), (__riscv_vmv_v_x_ XSIMD_RVV_TSM), (__riscv_vfmv_v_f_ XSIMD_RVV_TSM), , vec(T)) XSIMD_RVV_OVERLOAD3(rvvmv_lane0, (__riscv_vmv_x), (__riscv_vmv_x), (__riscv_vfmv_f), _NOVL, T(vec)) XSIMD_RVV_OVERLOAD(rvvmerge, (__riscv_vmerge), , vec(vec, vec, bvec)) XSIMD_RVV_OVERLOAD3(rvvmerge_splat, (__riscv_vmerge), (__riscv_vmerge), (__riscv_vfmerge), , vec(vec, T, bvec)) // count active lanes in a predicate XSIMD_RVV_OVERLOAD(rvvcpop, (__riscv_vcpop), , size_t(bvec)); template inline rvv_bool_t pmask8(uint8_t mask) noexcept { return rvv_bool_t(mask); } template inline rvv_bool_t pmask(uint64_t mask) noexcept { return rvv_bool_t(mask); } template inline rvv_reg_t vindex() noexcept { auto index = rvvid(T {}); if (shift < 0) index = __riscv_vsrl(index, -shift, batch::size); else index = __riscv_vsll(index, shift, batch::size); return __riscv_vadd(index, T(offset), batch::size); } // enable for signed integers template using rvv_enable_signed_int_t = typename std::enable_if::value && std::is_signed::value, int>::type; // enable for unsigned integers template using rvv_enable_unsigned_int_t = typename std::enable_if::value && std::is_unsigned::value, int>::type; // enable for floating points template using rvv_enable_floating_point_t = typename std::enable_if::value, int>::type; // enable for signed integers or floating points template using rvv_enable_signed_int_or_floating_point_t = typename std::enable_if::value, int>::type; // enable for all RVE supported 
types template using rvv_enable_all_t = typename std::enable_if::value, int>::type; } // namespace detail /******************** * Scalar to vector * ********************/ namespace detail { template inline detail::rvv_reg_t broadcast(T arg) noexcept { // A bit of a dance, here, because rvvmv_splat has no other // argument from which to deduce type, and T=char is not // supported. detail::rvv_fix_char_t arg_not_char(arg); const auto splat = detail::rvvmv_splat(arg_not_char); return detail::rvv_reg_t(splat.get_bytes(), types::detail::XSIMD_RVV_BITCAST); } } // broadcast template inline batch broadcast(T arg, requires_arch) noexcept { return detail::broadcast(arg); } /********* * Load * *********/ namespace detail { XSIMD_RVV_OVERLOAD(rvvle, (__riscv_vle XSIMD_RVV_S _v_ XSIMD_RVV_TSM), , vec(T const*)) XSIMD_RVV_OVERLOAD(rvvse, (__riscv_vse XSIMD_RVV_S _v_ XSIMD_RVV_TSM), , void(T*, vec)) } template = 0> inline batch load_aligned(T const* src, convert, requires_arch) noexcept { return detail::rvvle(reinterpret_cast const*>(src)); } template = 0> inline batch load_unaligned(T const* src, convert, requires_arch) noexcept { return load_aligned(src, convert(), rvv {}); } // load_complex namespace detail { template = types::detail::rvv_width_m1, int>::type = 0> inline rvv_reg_t rvvabut(rvv_reg_t const& lo, rvv_reg_t const& hi) noexcept { typename rvv_reg_t::register_type tmp; tmp = __riscv_vset(tmp, 0, lo); return __riscv_vset(tmp, 1, hi); } template ::type = 0> inline rvv_reg_t rvvabut(rvv_reg_t const& lo, rvv_reg_t const& hi) noexcept { return __riscv_vslideup(lo, hi, lo.vl, lo.vl * 2); } XSIMD_RVV_OVERLOAD(rvvget_lo_, (__riscv_vget_ XSIMD_RVV_TSM), _DROP_1ST_CUSTOM_ARGS_NOVL, vec(T, wide_vec), args..., 0) XSIMD_RVV_OVERLOAD(rvvget_hi_, (__riscv_vget_ XSIMD_RVV_TSM), _DROP_1ST_CUSTOM_ARGS_NOVL, vec(T, wide_vec), args..., 1) template = types::detail::rvv_width_m1, int>::type = 0> rvv_reg_t rvvget_lo(rvv_reg_t const& vv) noexcept { typename rvv_reg_t::register_type tmp = rvvget_lo_(T {}, vv); return tmp; } template = types::detail::rvv_width_m1, int>::type = 0> rvv_reg_t rvvget_hi(rvv_reg_t const& vv) noexcept { typename rvv_reg_t::register_type tmp = rvvget_hi_(T {}, vv); return tmp; } template ::type = 0> rvv_reg_t rvvget_lo(rvv_reg_t const& vv) noexcept { typename rvv_reg_t::register_type tmp = vv; return tmp; } template ::type = 0> rvv_reg_t rvvget_hi(rvv_reg_t const& vv) noexcept { return __riscv_vslidedown(vv, vv.vl / 2, vv.vl); } template = 0> inline batch, A> load_complex(batch const& lo, batch const& hi, requires_arch) noexcept { const auto real_index = vindex, 0, 1>(); const auto imag_index = vindex, 1, 1>(); const auto index = rvvabut, A::width>(real_index, imag_index); const auto input = rvvabut(lo.data, hi.data); const rvv_reg_t result = __riscv_vrgather(input, index, index.vl); return { rvvget_lo(result), rvvget_hi(result) }; } } /********* * Store * *********/ template = 0> inline void store_aligned(T* dst, batch const& src, requires_arch) noexcept { detail::rvvse(reinterpret_cast*>(dst), src); } template = 0> inline void store_unaligned(T* dst, batch const& src, requires_arch) noexcept { store_aligned(dst, src, rvv {}); } /****************** * scatter/gather * ******************/ namespace detail { template using rvv_enable_sg_t = typename std::enable_if<(sizeof(T) == sizeof(U) && (sizeof(T) == 4 || sizeof(T) == 8)), int>::type; XSIMD_RVV_OVERLOAD(rvvloxei, (__riscv_vloxei XSIMD_RVV_S), , vec(T const*, uvec)) XSIMD_RVV_OVERLOAD(rvvsoxei, (__riscv_vsoxei XSIMD_RVV_S), , void(T*, 
uvec, vec)) XSIMD_RVV_OVERLOAD3(rvvmul_splat, (__riscv_vmul), (__riscv_vmul), (__riscv_vfmul), , vec(vec, T)) } // scatter template = 0> inline void scatter(batch const& vals, T* dst, batch const& index, kernel::requires_arch) noexcept { using UU = as_unsigned_integer_t; const auto uindex = detail::rvv_to_unsigned_batch(index); auto* base = reinterpret_cast*>(dst); // or rvvsuxei const auto bi = detail::rvvmul_splat(uindex, sizeof(T)); detail::rvvsoxei(base, bi, vals); } // gather template = 0> inline batch gather(batch const&, T const* src, batch const& index, kernel::requires_arch) noexcept { using UU = as_unsigned_integer_t; const auto uindex = detail::rvv_to_unsigned_batch(index); auto const* base = reinterpret_cast const*>(src); // or rvvluxei const auto bi = detail::rvvmul_splat(uindex, sizeof(T)); return detail::rvvloxei(base, bi); } /************** * Arithmetic * **************/ namespace detail { XSIMD_RVV_OVERLOAD3(rvvadd, (__riscv_vadd), (__riscv_vadd), (__riscv_vfadd), , vec(vec, vec)) XSIMD_RVV_OVERLOAD2(rvvsadd, (__riscv_vsadd), (__riscv_vsaddu), , vec(vec, vec)) XSIMD_RVV_OVERLOAD3(rvvsub, (__riscv_vsub), (__riscv_vsub), (__riscv_vfsub), , vec(vec, vec)) XSIMD_RVV_OVERLOAD2(rvvssub, (__riscv_vssub), (__riscv_vssubu), , vec(vec, vec)) XSIMD_RVV_OVERLOAD2(rvvaadd, (__riscv_vaadd), (__riscv_vaaddu), , vec(vec, vec)) XSIMD_RVV_OVERLOAD3(rvvmul, (__riscv_vmul), (__riscv_vmul), (__riscv_vfmul), , vec(vec, vec)) XSIMD_RVV_OVERLOAD3(rvvdiv, (__riscv_vdiv), (__riscv_vdivu), (__riscv_vfdiv), , vec(vec, vec)) XSIMD_RVV_OVERLOAD3(rvvmax, (__riscv_vmax), (__riscv_vmaxu), (__riscv_vfmax), , vec(vec, vec)) XSIMD_RVV_OVERLOAD3(rvvmin, (__riscv_vmin), (__riscv_vminu), (__riscv_vfmin), , vec(vec, vec)) XSIMD_RVV_OVERLOAD3(rvvneg, (__riscv_vneg), (abort), (__riscv_vfneg), , vec(vec)) XSIMD_RVV_OVERLOAD_FLOATS(rvvabs, (__riscv_vfabs), , vec(vec)) XSIMD_RVV_OVERLOAD3(rvvmacc, (__riscv_vmacc), (__riscv_vmacc), (__riscv_vfmacc), , vec(vec, vec, vec)) XSIMD_RVV_OVERLOAD3(rvvnmsac, (__riscv_vnmsac), (__riscv_vnmsac), (__riscv_vfnmsac), , vec(vec, vec, vec)) XSIMD_RVV_OVERLOAD3(rvvmadd, (__riscv_vmadd), (__riscv_vmadd), (__riscv_vfmadd), , vec(vec, vec, vec)) XSIMD_RVV_OVERLOAD3(rvvnmsub, (__riscv_vnmsub), (__riscv_vnmsub), (__riscv_vfnmsub), , vec(vec, vec, vec)) #define RISCV_VMSXX(XX) \ XSIMD_RVV_OVERLOAD3(rvvms##XX, \ (__riscv_vms##XX), \ (__riscv_vms##XX##u), \ (__riscv_vmf##XX), , bvec(vec, vec)) \ XSIMD_RVV_OVERLOAD3(rvvms##XX##_splat, \ (__riscv_vms##XX), \ (__riscv_vms##XX##u), \ (__riscv_vmf##XX), , bvec(vec, T)) #define __riscv_vmsequ __riscv_vmseq #define __riscv_vmsneu __riscv_vmsne RISCV_VMSXX(eq) RISCV_VMSXX(ne) RISCV_VMSXX(lt) RISCV_VMSXX(le) RISCV_VMSXX(gt) RISCV_VMSXX(ge) #undef __riscv_vmsequ #undef __riscv_vmsneu #undef RISCV_VMSXX } // namespace detail // add template = 0> inline batch add(batch const& lhs, batch const& rhs, requires_arch) noexcept { return detail::rvvadd(lhs, rhs); } // sadd template = 0> inline batch sadd(batch const& lhs, batch const& rhs, requires_arch) noexcept { return detail::rvvsadd(lhs, rhs); } // sub template = 0> inline batch sub(batch const& lhs, batch const& rhs, requires_arch) noexcept { return detail::rvvsub(lhs, rhs); } // ssub template = 0> inline batch ssub(batch const& lhs, batch const& rhs, requires_arch) noexcept { return detail::rvvssub(lhs, rhs); } // mul template = 0> inline batch mul(batch const& lhs, batch const& rhs, requires_arch) noexcept { return detail::rvvmul(lhs, rhs); } // div template = 0> inline batch div(batch const& lhs, 
batch const& rhs, requires_arch) noexcept { return detail::rvvdiv(lhs, rhs); } // max template = 0> inline batch max(batch const& lhs, batch const& rhs, requires_arch) noexcept { return detail::rvvmax(lhs, rhs); } // min template = 0> inline batch min(batch const& lhs, batch const& rhs, requires_arch) noexcept { return detail::rvvmin(lhs, rhs); } // neg template = 0> inline batch neg(batch const& arg, requires_arch) noexcept { using S = as_signed_integer_t; const auto as_signed = detail::rvvreinterpret(arg); const auto result = detail::rvvneg(as_signed); return detail::rvvreinterpret(result); } template = 0> inline batch neg(batch const& arg, requires_arch) noexcept { return detail::rvvneg(arg); } // abs template = 0> inline batch abs(batch const& arg, requires_arch) noexcept { return arg; } template = 0> inline batch abs(batch const& arg, requires_arch) noexcept { return detail::rvvabs(arg); } // fma: x * y + z template = 0> inline batch fma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { // also detail::rvvmadd(x, y, z); return detail::rvvmacc(z, x, y); } // fnma: z - x * y template = 0> inline batch fnma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { // also detail::rvvnmsub(x, y, z); return detail::rvvnmsac(z, x, y); } // fms: x * y - z template = 0> inline batch fms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { // also vfmsac(z, x, y), but lacking integer version // also vfmsub(x, y, z), but lacking integer version return -fnma(x, y, z); } // fnms: - x * y - z template = 0> inline batch fnms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { // also vfnmacc(z, x, y), but lacking integer version // also vfnmadd(x, y, z), but lacking integer version return -fma(z, x, y); } /********************** * Logical operations * **********************/ namespace detail { XSIMD_RVV_OVERLOAD_INTS(rvvand, (__riscv_vand), , vec(vec, vec)) XSIMD_RVV_OVERLOAD_INTS(rvvor, (__riscv_vor), , vec(vec, vec)) XSIMD_RVV_OVERLOAD_INTS(rvvor_splat, (__riscv_vor), , vec(vec, T)) XSIMD_RVV_OVERLOAD_INTS(rvvxor, (__riscv_vxor), , vec(vec, vec)) XSIMD_RVV_OVERLOAD_INTS(rvvnot, (__riscv_vnot), , vec(vec)) XSIMD_RVV_OVERLOAD(rvvmand, (__riscv_vmand_mm_b XSIMD_RVV_S), , bvec(bvec, bvec)) XSIMD_RVV_OVERLOAD(rvvmor, (__riscv_vmor_mm_b XSIMD_RVV_S), , bvec(bvec, bvec)) XSIMD_RVV_OVERLOAD(rvvmxor, (__riscv_vmxor_mm_b XSIMD_RVV_S), , bvec(bvec, bvec)) XSIMD_RVV_OVERLOAD(rvvmandn, (__riscv_vmandn_mm_b XSIMD_RVV_S), , bvec(bvec, bvec)) XSIMD_RVV_OVERLOAD(rvvmnot, (__riscv_vmnot), , bvec(bvec)) } // bitwise_and template = 0> inline batch bitwise_and(batch const& lhs, batch const& rhs, requires_arch) noexcept { return detail::rvvand(lhs, rhs); } template = 0> inline batch bitwise_and(batch const& lhs, batch const& rhs, requires_arch) noexcept { const auto lhs_bits = detail::rvv_to_unsigned_batch(lhs); const auto rhs_bits = detail::rvv_to_unsigned_batch(rhs); const auto result_bits = detail::rvvand(lhs_bits, rhs_bits); return detail::rvvreinterpret(result_bits); } template = 0> inline batch_bool bitwise_and(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { return detail::rvvmand(lhs, rhs); } // bitwise_andnot template = 0> inline batch bitwise_andnot(batch const& lhs, batch const& rhs, requires_arch) noexcept { const auto not_rhs = detail::rvvnot(rhs); return detail::rvvand(lhs, not_rhs); } template = 0> inline batch bitwise_andnot(batch const& lhs, batch const& rhs, requires_arch) noexcept { const auto 
lhs_bits = detail::rvv_to_unsigned_batch(lhs); const auto rhs_bits = detail::rvv_to_unsigned_batch(rhs); const auto not_rhs = detail::rvvnot(rhs_bits); const auto result_bits = detail::rvvand(lhs_bits, not_rhs); return detail::rvvreinterpret(result_bits); } template = 0> inline batch_bool bitwise_andnot(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { return detail::rvvmandn(lhs, rhs); } // bitwise_or template = 0> inline batch bitwise_or(batch const& lhs, batch const& rhs, requires_arch) noexcept { return detail::rvvor(lhs, rhs); } template = 0> inline batch bitwise_or(batch const& lhs, batch const& rhs, requires_arch) noexcept { const auto lhs_bits = detail::rvv_to_unsigned_batch(lhs); const auto rhs_bits = detail::rvv_to_unsigned_batch(rhs); const auto result_bits = detail::rvvor(lhs_bits, rhs_bits); return detail::rvvreinterpret(result_bits); } template = 0> inline batch_bool bitwise_or(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { return detail::rvvmor(lhs, rhs); } // bitwise_xor template = 0> inline batch bitwise_xor(batch const& lhs, batch const& rhs, requires_arch) noexcept { return detail::rvvxor(lhs, rhs); } template = 0> inline batch bitwise_xor(batch const& lhs, batch const& rhs, requires_arch) noexcept { const auto lhs_bits = detail::rvv_to_unsigned_batch(lhs); const auto rhs_bits = detail::rvv_to_unsigned_batch(rhs); const auto result_bits = detail::rvvxor(lhs_bits, rhs_bits); return detail::rvvreinterpret(result_bits); } template = 0> inline batch_bool bitwise_xor(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { return detail::rvvmxor(lhs, rhs); } // bitwise_not template = 0> inline batch bitwise_not(batch const& arg, requires_arch) noexcept { return detail::rvvnot(arg); } template = 0> inline batch bitwise_not(batch const& arg, requires_arch) noexcept { const auto arg_bits = detail::rvv_to_unsigned_batch(arg); const auto result_bits = detail::rvvnot(arg_bits); return detail::rvvreinterpret(result_bits); } template = 0> inline batch_bool bitwise_not(batch_bool const& arg, requires_arch) noexcept { return detail::rvvmnot(arg); } /********** * Shifts * **********/ namespace detail { XSIMD_RVV_OVERLOAD_INTS(rvvsll_splat, (__riscv_vsll), , vec(vec, size_t)) XSIMD_RVV_OVERLOAD_INTS(rvvsll, (__riscv_vsll), , vec(vec, uvec)) XSIMD_RVV_OVERLOAD2(rvvsr_splat, (__riscv_vsra), (__riscv_vsrl), , vec(vec, size_t)) XSIMD_RVV_OVERLOAD2(rvvsr, (__riscv_vsra), (__riscv_vsrl), , vec(vec, uvec)) } // namespace detail // bitwise_lshift template = 0> inline batch bitwise_lshift(batch const& arg, int n, requires_arch) noexcept { constexpr size_t size = sizeof(typename batch::value_type) * 8; assert(0 <= n && static_cast(n) < size && "index in bounds"); return detail::rvvsll_splat(arg, n); } template = 0> inline batch bitwise_lshift(batch const& lhs, batch const& rhs, requires_arch) noexcept { return detail::rvvsll(lhs, detail::rvv_to_unsigned_batch(rhs)); } // bitwise_rshift template = 0> inline batch bitwise_rshift(batch const& arg, int n, requires_arch) noexcept { constexpr size_t size = sizeof(typename batch::value_type) * 8; assert(0 <= n && static_cast(n) < size && "index in bounds"); return detail::rvvsr_splat(arg, n); } template = 0> inline batch bitwise_rshift(batch const& lhs, batch const& rhs, requires_arch) noexcept { return detail::rvvsr(lhs, detail::rvv_to_unsigned_batch(rhs)); } /************** * Reductions * **************/ namespace detail { XSIMD_RVV_OVERLOAD3(rvvredsum, (__riscv_vredsum), (__riscv_vredsum), 
(__riscv_vfredosum), // or __riscv_vfredusum , scalar_vec(vec, scalar_vec)) XSIMD_RVV_OVERLOAD3(rvvredmax, (__riscv_vredmax), (__riscv_vredmaxu), (__riscv_vfredmax), , scalar_vec(vec, scalar_vec)) XSIMD_RVV_OVERLOAD3(rvvredmin, (__riscv_vredmin), (__riscv_vredminu), (__riscv_vfredmin), , scalar_vec(vec, scalar_vec)) XSIMD_RVV_OVERLOAD3(rvvslide1up, (__riscv_vslide1up), (__riscv_vslide1up), (__riscv_vfslide1up), , vec(vec, vec)) XSIMD_RVV_OVERLOAD3(rvvslide1down, (__riscv_vslide1down), (__riscv_vslide1down), (__riscv_vfslide1down), , vec(vec, T)) template inline T reduce_scalar(rvv_reg_t const& arg) { return detail::rvvmv_lane0(rvv_reg_t(arg.get_bytes(), types::detail::XSIMD_RVV_BITCAST)); } } // reduce_add template ::value_type, detail::rvv_enable_all_t = 0> inline V reduce_add(batch const& arg, requires_arch) noexcept { const auto zero = detail::broadcast(T(0)); const auto r = detail::rvvredsum(arg, zero); return detail::reduce_scalar(r); } // reduce_max template = 0> inline T reduce_max(batch const& arg, requires_arch) noexcept { const auto lowest = detail::broadcast(std::numeric_limits::lowest()); const auto r = detail::rvvredmax(arg, lowest); return detail::reduce_scalar(r); } // reduce_min template = 0> inline T reduce_min(batch const& arg, requires_arch) noexcept { const auto max = detail::broadcast(std::numeric_limits::max()); const auto r = detail::rvvredmin(arg, max); return detail::reduce_scalar(r); } // haddp template = 0> inline batch haddp(const batch* row, requires_arch) noexcept { constexpr std::size_t size = batch::size; T sums[size]; #pragma unroll size for (std::size_t i = 0; i < size; ++i) { sums[i] = reduce_add(row[i], rvv {}); } return load_aligned(sums, convert(), rvv {}); } /*************** * Comparisons * ***************/ // eq template = 0> inline batch_bool eq(batch const& lhs, batch const& rhs, requires_arch) noexcept { return detail::rvvmseq(lhs, rhs); } template = 0> inline batch_bool eq(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { const auto neq_result = detail::rvvmxor(lhs, rhs); return detail::rvvmnot(neq_result); } // neq template = 0> inline batch_bool neq(batch const& lhs, batch const& rhs, requires_arch) noexcept { return detail::rvvmsne(lhs, rhs); } template = 0> inline batch_bool neq(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { return detail::rvvmxor(lhs, rhs); } // lt template = 0> inline batch_bool lt(batch const& lhs, batch const& rhs, requires_arch) noexcept { return detail::rvvmslt(lhs, rhs); } // le template = 0> inline batch_bool le(batch const& lhs, batch const& rhs, requires_arch) noexcept { return detail::rvvmsle(lhs, rhs); } // gt template = 0> inline batch_bool gt(batch const& lhs, batch const& rhs, requires_arch) noexcept { return detail::rvvmsgt(lhs, rhs); } // ge template = 0> inline batch_bool ge(batch const& lhs, batch const& rhs, requires_arch) noexcept { return detail::rvvmsge(lhs, rhs); } /************* * Selection * *************/ namespace detail { XSIMD_RVV_OVERLOAD(rvvcompress, (__riscv_vcompress), , vec(vec, bvec)) } // compress template inline batch compress(batch const& x, batch_bool const& mask, requires_arch) noexcept { return detail::rvvcompress(x, mask); } /*************** * Permutation * ***************/ namespace detail { XSIMD_RVV_OVERLOAD(rvvrgather, (__riscv_vrgather), , vec(vec, uvec)) XSIMD_RVV_OVERLOAD(rvvslideup, (__riscv_vslideup), , vec(vec, vec, size_t)) XSIMD_RVV_OVERLOAD(rvvslidedown, (__riscv_vslidedown), , vec(vec, size_t)) } // swizzle template inline 
batch swizzle(batch const& arg, batch_constant, requires_arch) noexcept { static_assert(batch::size == sizeof...(idx), "invalid swizzle indices"); const batch indices { idx... }; return detail::rvvrgather(arg, indices); } template inline batch, A> swizzle(batch, A> const& self, batch_constant, requires_arch) noexcept { const auto real = swizzle(self.real(), batch_constant {}, rvv {}); const auto imag = swizzle(self.imag(), batch_constant {}, rvv {}); return batch>(real, imag); } /************* * Selection * *************/ // extract_pair template = 0> inline batch extract_pair(batch const& lhs, batch const& rhs, size_t n, requires_arch) noexcept { const auto tmp = detail::rvvslidedown(rhs, n); return detail::rvvslideup(tmp, lhs, lhs.size - n); } // select template = 0> inline batch select(batch_bool const& cond, batch const& a, batch const& b, requires_arch) noexcept { return detail::rvvmerge(b, a, cond); } template inline batch select(batch_bool_constant const&, batch const& true_br, batch const& false_br, requires_arch) noexcept { return select(batch_bool { b... }, true_br, false_br, rvv {}); } // zip_lo template = 0> inline batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept { const auto index = detail::vindex, 0, -1>(); const auto mask = detail::pmask8(0xaa); return detail::rvvmerge(detail::rvvrgather(lhs, index), detail::rvvrgather(rhs, index), mask); } // zip_hi template = 0> inline batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept { const auto index = detail::vindex, batch::size / 2, -1>(); const auto mask = detail::pmask8(0xaa); return detail::rvvmerge(detail::rvvrgather(lhs, index), detail::rvvrgather(rhs, index), mask); } // store_complex template = 0> inline void store_complex_aligned(std::complex* dst, batch, A> const& src, requires_arch) noexcept { const auto lo = zip_lo(src.real(), src.imag()); const auto hi = zip_hi(src.real(), src.imag()); T* buf = reinterpret_cast(dst); store_aligned(buf, lo, rvv {}); store_aligned(buf + lo.size, hi, rvv {}); } template = 0> inline void store_complex_unaligned(std::complex* dst, batch, A> const& src, requires_arch) noexcept { store_complex_aligned(dst, src, rvv {}); } /***************************** * Floating-point arithmetic * *****************************/ namespace detail { XSIMD_RVV_OVERLOAD_FLOATS(rvvfsqrt, (__riscv_vfsqrt), , vec(vec)) XSIMD_RVV_OVERLOAD_FLOATS(rvvfrec7, (__riscv_vfrec7), , vec(vec)) XSIMD_RVV_OVERLOAD_FLOATS(rvvfrsqrt7, (__riscv_vfrsqrt7), , vec(vec)) } // rsqrt template = 0> inline batch rsqrt(batch const& arg, requires_arch) noexcept { auto approx = detail::rvvfrsqrt7(arg); approx = approx * (1.5 - (0.5 * arg * approx * approx)); return approx; } // sqrt template = 0> inline batch sqrt(batch const& arg, requires_arch) noexcept { return detail::rvvfsqrt(arg); } // reciprocal template = 0> inline batch reciprocal(const batch& arg, requires_arch) noexcept { return detail::rvvfrec7(arg); } /****************************** * Floating-point conversions * ******************************/ // fast_cast namespace detail { XSIMD_RVV_OVERLOAD2(rvvfcvt_rtz, // truncating conversion, like C. 
(__riscv_vfcvt_rtz_x), (__riscv_vfcvt_rtz_xu), _DROP_1ST, vec(T, fvec)) XSIMD_RVV_OVERLOAD2(rvvfcvt_rne, // round to nearest, ties to even (__riscv_vfcvt_x), (__riscv_vfcvt_xu), _DROP_1ST_CUSTOM_ARGS, vec(T, fvec), args..., __RISCV_FRM_RNE) XSIMD_RVV_OVERLOAD2(rvvfcvt_rmm, // round to nearest, ties to max magnitude (__riscv_vfcvt_x), (__riscv_vfcvt_xu), _DROP_1ST_CUSTOM_ARGS, vec(T, fvec), args..., __RISCV_FRM_RMM) XSIMD_RVV_OVERLOAD2(rvvfcvt, // round to current rounding mode. (__riscv_vfcvt_x), (__riscv_vfcvt_xu), _DROP_1ST, vec(T, fvec)) XSIMD_RVV_OVERLOAD_INTS(rvvfcvt_f, (__riscv_vfcvt_f), , fvec(vec)) template using rvv_enable_ftoi_t = typename std::enable_if<(sizeof(T) == sizeof(U) && std::is_floating_point::value && !std::is_floating_point::value), int>::type; template using rvv_enable_itof_t = typename std::enable_if<(sizeof(T) == sizeof(U) && !std::is_floating_point::value && std::is_floating_point::value), int>::type; template = 0> inline batch fast_cast(batch const& arg, batch const&, requires_arch) noexcept { return rvvfcvt_rtz(U {}, arg); } template = 0> inline batch fast_cast(batch const& arg, batch const&, requires_arch) noexcept { return rvvfcvt_f(arg); } } /********* * Miscs * *********/ // set template inline batch set(batch const&, requires_arch, Args... args) noexcept { const std::array::size> tmp { args... }; return load_unaligned(tmp.data(), convert(), rvv {}); } template inline batch, A> set(batch, A> const&, requires_arch, Args... args_complex) noexcept { return batch>(set(batch {}, rvv {}, args_complex.real()...), set(batch {}, rvv {}, args_complex.imag()...)); } template inline batch_bool set(batch_bool const&, requires_arch, Args... args) noexcept { using U = as_unsigned_integer_t; const auto values = set(batch {}, rvv {}, static_cast(args)...); const auto zero = broadcast(U(0), rvv {}); detail::rvv_bool_t result = detail::rvvmsne(values, zero); return result; } // insert template = 0> inline batch insert(batch const& arg, T val, index, requires_arch) noexcept { const auto mask = detail::pmask(uint64_t(1) << I); return detail::rvvmerge_splat(arg, val, mask); } // get template = 0> inline T get(batch const& arg, size_t i, requires_arch) noexcept { const auto tmp = detail::rvvslidedown(arg, i); return detail::rvvmv_lane0(tmp); } template = 0> inline std::complex get(batch, A> const& arg, size_t i, requires_arch) noexcept { const auto tmpr = detail::rvvslidedown(arg.real(), i); const auto tmpi = detail::rvvslidedown(arg.imag(), i); return std::complex { detail::rvvmv_lane0(tmpr), detail::rvvmv_lane0(tmpi) }; } // all template = 0> inline bool all(batch_bool const& arg, requires_arch) noexcept { return detail::rvvcpop(arg) == batch_bool::size; } // any template = 0> inline bool any(batch_bool const& arg, requires_arch) noexcept { return detail::rvvcpop(arg) > 0; } // bitwise_cast template = 0, detail::rvv_enable_all_t = 0> inline batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept { return detail::rvv_reg_t(arg.data.get_bytes(), types::detail::XSIMD_RVV_BITCAST); } // batch_bool_cast template = 0> inline batch_bool batch_bool_cast(batch_bool const& arg, batch_bool const&, requires_arch) noexcept { using intermediate_t = typename detail::rvv_bool_t; return intermediate_t(arg.data); } // from_bool template = 0> inline batch from_bool(batch_bool const& arg, requires_arch) noexcept { const auto zero = broadcast(T(0), rvv {}); return detail::rvvmerge_splat(zero, T(1), arg); } namespace detail { template inline vuint8m1_t rvvslidedownbytes(vuint8m1_t 
arg, size_t i) { return __riscv_vslidedown(arg, i, types::detail::rvv_width_m1 / 8); } template <> inline vuint8m1_t rvvslidedownbytes(vuint8m1_t arg, size_t i) { const auto bytes = __riscv_vlmul_trunc_u8mf2(arg); const auto result = __riscv_vslidedown(bytes, i, types::detail::rvv_width_mf2 / 8); return __riscv_vlmul_ext_u8m1(result); } template <> inline vuint8m1_t rvvslidedownbytes(vuint8m1_t arg, size_t i) { const auto bytes = __riscv_vlmul_trunc_u8mf4(arg); const auto result = __riscv_vslidedown(bytes, i, types::detail::rvv_width_mf4 / 8); return __riscv_vlmul_ext_u8m1(result); } template <> inline vuint8m1_t rvvslidedownbytes(vuint8m1_t arg, size_t i) { const auto bytes = __riscv_vlmul_trunc_u8mf8(arg); const auto result = __riscv_vslidedown(bytes, i, types::detail::rvv_width_mf8 / 8); return __riscv_vlmul_ext_u8m1(result); } } // slide_left template = 0> inline batch slide_left(batch const& arg, requires_arch) noexcept { const auto zero = broadcast(uint8_t(0), rvv {}); const auto bytes = arg.data.get_bytes(); return detail::rvvreinterpret(detail::rvvslideup(zero, bytes, N)); } // slide_right template = 0> inline batch slide_right(batch const& arg, requires_arch) noexcept { using reg_t = detail::rvv_reg_t; const auto bytes = arg.data.get_bytes(); return reg_t(detail::rvvslidedownbytes(bytes, N), types::detail::XSIMD_RVV_BITCAST); } // isnan template = 0> inline batch_bool isnan(batch const& arg, requires_arch) noexcept { return !(arg == arg); } namespace detail { template using rvv_as_signed_integer_t = as_signed_integer_t>; template > inline batch rvvfcvt_default(batch const& arg) noexcept { return rvvfcvt_rne(U {}, arg); } template > inline batch rvvfcvt_afz(batch const& arg) noexcept { return rvvfcvt_rmm(U {}, arg); } } // nearbyint_as_int template > inline batch nearbyint_as_int(batch const& arg, requires_arch) noexcept { // Reference rounds ties to nearest even return detail::rvvfcvt_default(arg); } // round template = 0> inline batch round(batch const& arg, requires_arch) noexcept { // Round ties away from zero. const auto mask = abs(arg) < constants::maxflint>(); return select(mask, to_float(detail::rvvfcvt_afz(arg)), arg, rvv {}); } // nearbyint template = 0> inline batch nearbyint(batch const& arg, requires_arch) noexcept { // Round according to current rounding mode. const auto mask = abs(arg) < constants::maxflint>(); return select(mask, to_float(detail::rvvfcvt_default(arg)), arg, rvv {}); } } // namespace kernel } // namespace xsimd #endif