From fbaf0bb26397aa498eb9156f06d5a6fe34dd7dd8 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Fri, 19 Apr 2024 03:14:29 +0200 Subject: Merging upstream version 125.0.1. Signed-off-by: Daniel Baumann --- third_party/gemmology/gemmology.h | 95 ++++++++++++++-------- .../kernels/GemmologyEngineNeon64I8mm.cpp | 19 +++++ third_party/gemmology/moz.yaml | 4 +- 3 files changed, 84 insertions(+), 34 deletions(-) create mode 100644 third_party/gemmology/kernels/GemmologyEngineNeon64I8mm.cpp (limited to 'third_party/gemmology') diff --git a/third_party/gemmology/gemmology.h b/third_party/gemmology/gemmology.h index d774c53388..eb5ebed3b4 100644 --- a/third_party/gemmology/gemmology.h +++ b/third_party/gemmology/gemmology.h @@ -198,6 +198,17 @@ PermuteSummer(xsimd::batch pack0123, return _mm256_add_epi32(rev, blended); } +template +inline xsimd::batch Pack0123(xsimd::batch sum0, + xsimd::batch sum1, + xsimd::batch sum2, + xsimd::batch sum3, + xsimd::kernel::requires_arch) { + auto pack01 = _mm256_hadd_epi32(sum0, sum1); + auto pack23 = _mm256_hadd_epi32(sum2, sum3); + return _mm256_hadd_epi32(pack01, pack23); +} + #ifdef __AVXVNNI__ template @@ -245,6 +256,17 @@ madd(xsimd::batch x, xsimd::batch y, xsimd::kernel::requires_arch) { return _mm_maddubs_epi16(xsimd::abs(x), _mm_sign_epi8(y, x)); } + +template +inline xsimd::batch Pack0123(xsimd::batch sum0, + xsimd::batch sum1, + xsimd::batch sum2, + xsimd::batch sum3, + xsimd::kernel::requires_arch) { + auto pack01 = _mm_hadd_epi32(sum0, sum1); + auto pack23 = _mm_hadd_epi32(sum2, sum3); + return _mm_hadd_epi32(pack01, pack23); +} #endif #ifdef __SSE2__ @@ -524,7 +546,8 @@ xsimd::batch deinterleave(xsimd::batch first, xsimd::batch second, xsimd::kernel::requires_arch) { - return vcombine_s8(vqmovn_s16(first), vqmovn_s16(second)); + + return vqmovn_high_s16(vqmovn_s16(first), second); } template @@ -532,27 +555,18 @@ xsimd::batch deinterleave(xsimd::batch first, xsimd::batch second, xsimd::kernel::requires_arch) { - return vcombine_s16(vqmovn_s32(first), vqmovn_s32(second)); + return vqmovn_high_s32(vqmovn_s32(first), second); } +#ifdef __ARM_FEATURE_MATMUL_INT8 template inline xsimd::batch -madd(xsimd::batch x, xsimd::batch y, - xsimd::kernel::requires_arch) { - int32x4_t low = vmull_s16(vget_low_s16(x), vget_low_s16(y)); - return vmlal_high_s16(low, x, y); -} - -template -inline xsimd::batch -madd(xsimd::batch x, xsimd::batch y, - xsimd::kernel::requires_arch) { - - int16x8_t tl = vmull_s8(vreinterpret_s8_u8(vget_low_u8(x)), - vget_low_s8(y)); - int16x8_t th = vmull_high_s8(vreinterpretq_s8_u8(x), y); - return vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th)); +maddw(xsimd::batch x, xsimd::batch y, + xsimd::batch z, + xsimd::kernel::requires_arch>) { + return vusdotq_s32(z, x, y); } +#endif template inline xsimd::batch @@ -564,15 +578,17 @@ maddw(xsimd::batch x, xsimd::batch y, int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(x))), vmovl_s8(vget_high_s8(y))); return vpadalq_s16(vpadalq_s16(z, tl), th); - //TODO: investigate using vdotq_s32 } template -inline xsimd::batch -madd(xsimd::batch x, xsimd::batch y, - xsimd::kernel::requires_arch) { - int16x8_t low = vmull_s8(vget_low_s8(x), vget_low_s8(y)); - return vmlal_high_s8(low, x, y); +inline xsimd::batch Pack0123(xsimd::batch sum0, + xsimd::batch sum1, + xsimd::batch sum2, + xsimd::batch sum3, + xsimd::kernel::requires_arch) { + auto pack01 = vpaddq_s32(sum0, sum1); + auto pack23 = vpaddq_s32(sum2, sum3); + return vpaddq_s32(pack01, pack23); } #endif @@ -644,20 +660,35 @@ inline auto PermuteSummer(xsimd::batch pack0123, return kernel::PermuteSummer(pack0123, pack4567, Arch{}); } + +namespace kernel { + + template + inline xsimd::batch Pack0123(xsimd::batch sum0, + xsimd::batch sum1, + xsimd::batch sum2, + xsimd::batch sum3, + xsimd::kernel::requires_arch) { + + std::tie(sum0, sum1) = interleave(sum0, sum1, Arch{}); + auto pack01 = sum0 + sum1; + std::tie(sum2, sum3) = interleave(sum2, sum3, Arch{}); + auto pack23 = sum2 + sum3; + + auto packed = interleave(xsimd::bitwise_cast(pack01), + xsimd::bitwise_cast(pack23), + Arch{}); + return xsimd::bitwise_cast(std::get<0>(packed)) + + xsimd::bitwise_cast(std::get<1>(packed)); + } +} + template inline xsimd::batch Pack0123(xsimd::batch sum0, xsimd::batch sum1, xsimd::batch sum2, xsimd::batch sum3) { - std::tie(sum0, sum1) = interleave(sum0, sum1); - auto pack01 = sum0 + sum1; - std::tie(sum2, sum3) = interleave(sum2, sum3); - auto pack23 = sum2 + sum3; - - auto packed = interleave(xsimd::bitwise_cast(pack01), - xsimd::bitwise_cast(pack23)); - return xsimd::bitwise_cast(std::get<0>(packed)) + - xsimd::bitwise_cast(std::get<1>(packed)); + return kernel::Pack0123(sum0, sum1, sum2, sum3, Arch{}); } template diff --git a/third_party/gemmology/kernels/GemmologyEngineNeon64I8mm.cpp b/third_party/gemmology/kernels/GemmologyEngineNeon64I8mm.cpp new file mode 100644 index 0000000000..d8259e750f --- /dev/null +++ b/third_party/gemmology/kernels/GemmologyEngineNeon64I8mm.cpp @@ -0,0 +1,19 @@ +/* -*- mode: c++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* this source code form is subject to the terms of the mozilla public + * license, v. 2.0. if a copy of the mpl was not distributed with this file, + * You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include + +namespace gemmology { +template struct Engine>; +template void Engine>::SelectColumnsB(int8_t const*, int8_t*, + size_t, uint32_t const*, + uint32_t const*); +template void Engine>::Shift::Multiply( + uint8_t const*, int8_t const*, size_t, size_t, size_t, + gemmology::callbacks::UnquantizeAndAddBiasAndWrite); +template void Engine>::Shift::PrepareBias( + int8_t const*, size_t, size_t, + gemmology::callbacks::UnquantizeAndAddBiasAndWrite); +} // namespace gemmology diff --git a/third_party/gemmology/moz.yaml b/third_party/gemmology/moz.yaml index d9f9472da7..749227e2ee 100644 --- a/third_party/gemmology/moz.yaml +++ b/third_party/gemmology/moz.yaml @@ -10,8 +10,8 @@ origin: url: https://github.com/mozilla/gemmology - release: ec535e87d0ab9d1457ff6d2af247cc8113e74694 (2024-02-05T09:05:20Z). - revision: ec535e87d0ab9d1457ff6d2af247cc8113e74694 + release: dbcd029c3bc6e183355ea597216d379677ff9b19 (2024-02-20T12:36:14Z). + revision: dbcd029c3bc6e183355ea597216d379677ff9b19 license: MIT -- cgit v1.2.3