// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Fast SIMD floating-point (I)DCT, any power of two.

#if defined(LIB_JXL_DCT_INL_H_) == defined(HWY_TARGET_TOGGLE)
#ifdef LIB_JXL_DCT_INL_H_
#undef LIB_JXL_DCT_INL_H_
#else
#define LIB_JXL_DCT_INL_H_
#endif

#include <stddef.h>

#include <type_traits>

#include "lib/jxl/dct_block-inl.h"
#include "lib/jxl/dct_scales.h"
#include "lib/jxl/transpose-inl.h"

HWY_BEFORE_NAMESPACE();
namespace jxl {
namespace HWY_NAMESPACE {
namespace {

// These templates are not found via ADL.
using hwy::HWY_NAMESPACE::Add;
using hwy::HWY_NAMESPACE::Mul;
using hwy::HWY_NAMESPACE::MulAdd;
using hwy::HWY_NAMESPACE::NegMulAdd;
using hwy::HWY_NAMESPACE::Sub;

// FV<SZ> is a float vector with at most SZ lanes; FV<0> is a full vector.
template <size_t SZ>
struct FVImpl {
  using type = HWY_CAPPED(float, SZ);
};

template <>
struct FVImpl<0> {
  using type = HWY_FULL(float);
};

template <size_t SZ>
using FV = typename FVImpl<SZ>::type;

// Implementation of "Lowest Complexity Self-Recursive Radix-2 DCT II/III
// Algorithms", by Sirani M. Perera and Jianhua Liu.

template <size_t N, size_t SZ>
struct CoeffBundle {
  static void AddReverse(const float* JXL_RESTRICT ain1,
                         const float* JXL_RESTRICT ain2,
                         float* JXL_RESTRICT aout) {
    for (size_t i = 0; i < N; i++) {
      auto in1 = Load(FV<SZ>(), ain1 + i * SZ);
      auto in2 = Load(FV<SZ>(), ain2 + (N - i - 1) * SZ);
      Store(Add(in1, in2), FV<SZ>(), aout + i * SZ);
    }
  }
  static void SubReverse(const float* JXL_RESTRICT ain1,
                         const float* JXL_RESTRICT ain2,
                         float* JXL_RESTRICT aout) {
    for (size_t i = 0; i < N; i++) {
      auto in1 = Load(FV<SZ>(), ain1 + i * SZ);
      auto in2 = Load(FV<SZ>(), ain2 + (N - i - 1) * SZ);
      Store(Sub(in1, in2), FV<SZ>(), aout + i * SZ);
    }
  }
  static void B(float* JXL_RESTRICT coeff) {
    auto sqrt2 = Set(FV<SZ>(), kSqrt2);
    auto in1 = Load(FV<SZ>(), coeff);
    auto in2 = Load(FV<SZ>(), coeff + SZ);
    Store(MulAdd(in1, sqrt2, in2), FV<SZ>(), coeff);
    for (size_t i = 1; i + 1 < N; i++) {
      auto in1 = Load(FV<SZ>(), coeff + i * SZ);
      auto in2 = Load(FV<SZ>(), coeff + (i + 1) * SZ);
      Store(Add(in1, in2), FV<SZ>(), coeff + i * SZ);
    }
  }
  static void BTranspose(float* JXL_RESTRICT coeff) {
    for (size_t i = N - 1; i > 0; i--) {
      auto in1 = Load(FV<SZ>(), coeff + i * SZ);
      auto in2 = Load(FV<SZ>(), coeff + (i - 1) * SZ);
      Store(Add(in1, in2), FV<SZ>(), coeff + i * SZ);
    }
    auto sqrt2 = Set(FV<SZ>(), kSqrt2);
    auto in1 = Load(FV<SZ>(), coeff);
    Store(Mul(in1, sqrt2), FV<SZ>(), coeff);
  }
  // Ideally optimized away by compiler (except the multiply).
  static void InverseEvenOdd(const float* JXL_RESTRICT ain,
                             float* JXL_RESTRICT aout) {
    for (size_t i = 0; i < N / 2; i++) {
      auto in1 = Load(FV<SZ>(), ain + i * SZ);
      Store(in1, FV<SZ>(), aout + 2 * i * SZ);
    }
    for (size_t i = N / 2; i < N; i++) {
      auto in1 = Load(FV<SZ>(), ain + i * SZ);
      Store(in1, FV<SZ>(), aout + (2 * (i - N / 2) + 1) * SZ);
    }
  }
  // Ideally optimized away by compiler.
  static void ForwardEvenOdd(const float* JXL_RESTRICT ain, size_t ain_stride,
                             float* JXL_RESTRICT aout) {
    for (size_t i = 0; i < N / 2; i++) {
      auto in1 = LoadU(FV<SZ>(), ain + 2 * i * ain_stride);
      Store(in1, FV<SZ>(), aout + i * SZ);
    }
    for (size_t i = N / 2; i < N; i++) {
      auto in1 = LoadU(FV<SZ>(), ain + (2 * (i - N / 2) + 1) * ain_stride);
      Store(in1, FV<SZ>(), aout + i * SZ);
    }
  }
  // Invoked on full vector.
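  // Scales the second half of the coefficients (the "difference" branch of
  // the radix-2 split produced by SubReverse). The constants come from
  // WcMultipliers<N>::kMultipliers in dct_scales.h; per that header they are
  // the weights 1 / (2 * cos((i + 0.5) * pi / N)) of the Perera-Liu
  // factorization. The formula is quoted here as a reader's note; the
  // normative definition lives in dct_scales.h.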
  static void Multiply(float* JXL_RESTRICT coeff) {
    for (size_t i = 0; i < N / 2; i++) {
      auto in1 = Load(FV<SZ>(), coeff + (N / 2 + i) * SZ);
      auto mul = Set(FV<SZ>(), WcMultipliers<N>::kMultipliers[i]);
      Store(Mul(in1, mul), FV<SZ>(), coeff + (N / 2 + i) * SZ);
    }
  }
  static void MultiplyAndAdd(const float* JXL_RESTRICT coeff,
                             float* JXL_RESTRICT out, size_t out_stride) {
    for (size_t i = 0; i < N / 2; i++) {
      auto mul = Set(FV<SZ>(), WcMultipliers<N>::kMultipliers[i]);
      auto in1 = Load(FV<SZ>(), coeff + i * SZ);
      auto in2 = Load(FV<SZ>(), coeff + (N / 2 + i) * SZ);
      auto out1 = MulAdd(mul, in2, in1);
      auto out2 = NegMulAdd(mul, in2, in1);
      StoreU(out1, FV<SZ>(), out + i * out_stride);
      StoreU(out2, FV<SZ>(), out + (N - i - 1) * out_stride);
    }
  }
  template <typename Block>
  static void LoadFromBlock(const Block& in, size_t off,
                            float* JXL_RESTRICT coeff) {
    for (size_t i = 0; i < N; i++) {
      Store(in.LoadPart(FV<SZ>(), i, off), FV<SZ>(), coeff + i * SZ);
    }
  }
  template <typename Block>
  static void StoreToBlockAndScale(const float* JXL_RESTRICT coeff,
                                   const Block& out, size_t off) {
    auto mul = Set(FV<SZ>(), 1.0f / N);
    for (size_t i = 0; i < N; i++) {
      out.StorePart(FV<SZ>(), Mul(mul, Load(FV<SZ>(), coeff + i * SZ)), i,
                    off);
    }
  }
};

template <size_t N, size_t SZ>
struct DCT1DImpl;

template <size_t SZ>
struct DCT1DImpl<1, SZ> {
  JXL_INLINE void operator()(float* JXL_RESTRICT mem) {}
};

template <size_t SZ>
struct DCT1DImpl<2, SZ> {
  JXL_INLINE void operator()(float* JXL_RESTRICT mem) {
    auto in1 = Load(FV<SZ>(), mem);
    auto in2 = Load(FV<SZ>(), mem + SZ);
    Store(Add(in1, in2), FV<SZ>(), mem);
    Store(Sub(in1, in2), FV<SZ>(), mem + SZ);
  }
};

template <size_t N, size_t SZ>
struct DCT1DImpl {
  void operator()(float* JXL_RESTRICT mem) {
    // This is relatively small (4kB with 64-DCT and AVX-512)
    HWY_ALIGN float tmp[N * SZ];
    CoeffBundle<N / 2, SZ>::AddReverse(mem, mem + N / 2 * SZ, tmp);
    DCT1DImpl<N / 2, SZ>()(tmp);
    CoeffBundle<N / 2, SZ>::SubReverse(mem, mem + N / 2 * SZ,
                                       tmp + N / 2 * SZ);
    CoeffBundle<N, SZ>::Multiply(tmp);
    DCT1DImpl<N / 2, SZ>()(tmp + N / 2 * SZ);
    CoeffBundle<N / 2, SZ>::B(tmp + N / 2 * SZ);
    CoeffBundle<N, SZ>::InverseEvenOdd(tmp, mem);
  }
};

template <size_t N, size_t SZ>
struct IDCT1DImpl;

template <size_t SZ>
struct IDCT1DImpl<1, SZ> {
  JXL_INLINE void operator()(const float* from, size_t from_stride, float* to,
                             size_t to_stride) {
    StoreU(LoadU(FV<SZ>(), from), FV<SZ>(), to);
  }
};

template <size_t SZ>
struct IDCT1DImpl<2, SZ> {
  JXL_INLINE void operator()(const float* from, size_t from_stride, float* to,
                             size_t to_stride) {
    JXL_DASSERT(from_stride >= SZ);
    JXL_DASSERT(to_stride >= SZ);
    auto in1 = LoadU(FV<SZ>(), from);
    auto in2 = LoadU(FV<SZ>(), from + from_stride);
    StoreU(Add(in1, in2), FV<SZ>(), to);
    StoreU(Sub(in1, in2), FV<SZ>(), to + to_stride);
  }
};

template <size_t N, size_t SZ>
struct IDCT1DImpl {
  void operator()(const float* from, size_t from_stride, float* to,
                  size_t to_stride) {
    JXL_DASSERT(from_stride >= SZ);
    JXL_DASSERT(to_stride >= SZ);
    // This is relatively small (4kB with 64-DCT and AVX-512)
    HWY_ALIGN float tmp[N * SZ];
    CoeffBundle<N, SZ>::ForwardEvenOdd(from, from_stride, tmp);
    IDCT1DImpl<N / 2, SZ>()(tmp, SZ, tmp, SZ);
    CoeffBundle<N / 2, SZ>::BTranspose(tmp + N / 2 * SZ);
    IDCT1DImpl<N / 2, SZ>()(tmp + N / 2 * SZ, SZ, tmp + N / 2 * SZ, SZ);
    CoeffBundle<N, SZ>::MultiplyAndAdd(tmp, to, to_stride);
  }
};

template <size_t N, size_t M_or_0, typename FromBlock, typename ToBlock>
void DCT1DWrapper(const FromBlock& from, const ToBlock& to, size_t Mp) {
  size_t M = M_or_0 != 0 ? M_or_0 : Mp;
  constexpr size_t SZ = MaxLanes(FV<M_or_0>());
  HWY_ALIGN float tmp[N * SZ];
  for (size_t i = 0; i < M; i += Lanes(FV<M_or_0>())) {
    // TODO(veluca): consider removing the temporary memory here (as is done in
    // IDCT), if it turns out that some compilers don't optimize away the loads
    // and this is performance-critical.
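    // Each iteration handles SZ adjacent columns at once: tmp holds N vectors
    // of SZ lanes, so DCT1DImpl<N, SZ> computes SZ independent N-point DCTs
    // in parallel, one per lane.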
    CoeffBundle<N, SZ>::LoadFromBlock(from, i, tmp);
    DCT1DImpl<N, SZ>()(tmp);
    CoeffBundle<N, SZ>::StoreToBlockAndScale(tmp, to, i);
  }
}

template <size_t N, size_t M_or_0, typename FromBlock, typename ToBlock>
void IDCT1DWrapper(const FromBlock& from, const ToBlock& to, size_t Mp) {
  size_t M = M_or_0 != 0 ? M_or_0 : Mp;
  constexpr size_t SZ = MaxLanes(FV<M_or_0>());
  for (size_t i = 0; i < M; i += Lanes(FV<M_or_0>())) {
    IDCT1DImpl<N, SZ>()(from.Address(0, i), from.Stride(), to.Address(0, i),
                        to.Stride());
  }
}

// DCT1D<N, M> computes N-point DCTs over M columns. When M exceeds the full
// vector width, the column count is instead passed at run time (M_or_0 == 0)
// and the wrapper is kept out of line.
template <size_t N, size_t M, typename = void>
struct DCT1D {
  template <typename FromBlock, typename ToBlock>
  void operator()(const FromBlock& from, const ToBlock& to) {
    return DCT1DWrapper<N, M>(from, to, M);
  }
};

template <size_t N, size_t M>
struct DCT1D<N, M, typename std::enable_if<(M > MaxLanes(FV<0>()))>::type> {
  template <typename FromBlock, typename ToBlock>
  void operator()(const FromBlock& from, const ToBlock& to) {
    return NoInlineWrapper(DCT1DWrapper<N, 0, FromBlock, ToBlock>, from, to,
                           M);
  }
};

template <size_t N, size_t M, typename = void>
struct IDCT1D {
  template <typename FromBlock, typename ToBlock>
  void operator()(const FromBlock& from, const ToBlock& to) {
    return IDCT1DWrapper<N, M>(from, to, M);
  }
};

template <size_t N, size_t M>
struct IDCT1D<N, M, typename std::enable_if<(M > MaxLanes(FV<0>()))>::type> {
  template <typename FromBlock, typename ToBlock>
  void operator()(const FromBlock& from, const ToBlock& to) {
    return NoInlineWrapper(IDCT1DWrapper<N, 0, FromBlock, ToBlock>, from, to,
                           M);
  }
};

// Computes the maybe-transposed, scaled DCT of a block; the block must be
// HWY_ALIGN'ed.
template <size_t ROWS, size_t COLS>
struct ComputeScaledDCT {
  // scratch_space must be aligned, and should have space for ROWS*COLS
  // floats.
  template <class From>
  HWY_MAYBE_UNUSED void operator()(const From& from, float* to,
                                   float* JXL_RESTRICT scratch_space) {
    float* JXL_RESTRICT block = scratch_space;
    if (ROWS < COLS) {
      DCT1D<ROWS, COLS>()(from, DCTTo(block, COLS));
      Transpose<ROWS, COLS>::Run(DCTFrom(block, COLS), DCTTo(to, ROWS));
      DCT1D<COLS, ROWS>()(DCTFrom(to, ROWS), DCTTo(block, ROWS));
      Transpose<COLS, ROWS>::Run(DCTFrom(block, ROWS), DCTTo(to, COLS));
    } else {
      DCT1D<ROWS, COLS>()(from, DCTTo(to, COLS));
      Transpose<ROWS, COLS>::Run(DCTFrom(to, COLS), DCTTo(block, ROWS));
      DCT1D<COLS, ROWS>()(DCTFrom(block, ROWS), DCTTo(to, ROWS));
    }
  }
};

// Computes the maybe-transposed, scaled IDCT of a block; the block must be
// HWY_ALIGN'ed.
template <size_t ROWS, size_t COLS>
struct ComputeScaledIDCT {
  // scratch_space must be aligned, and should have space for ROWS*COLS
  // floats.
  template <class To>
  HWY_MAYBE_UNUSED void operator()(float* JXL_RESTRICT from, const To& to,
                                   float* JXL_RESTRICT scratch_space) {
    float* JXL_RESTRICT block = scratch_space;
    // Reverse the steps done in ComputeScaledDCT.
    if (ROWS < COLS) {
      Transpose<ROWS, COLS>::Run(DCTFrom(from, COLS), DCTTo(block, ROWS));
      IDCT1D<COLS, ROWS>()(DCTFrom(block, ROWS), DCTTo(from, ROWS));
      Transpose<COLS, ROWS>::Run(DCTFrom(from, ROWS), DCTTo(block, COLS));
      IDCT1D<ROWS, COLS>()(DCTFrom(block, COLS), to);
    } else {
      IDCT1D<COLS, ROWS>()(DCTFrom(from, ROWS), DCTTo(block, ROWS));
      Transpose<COLS, ROWS>::Run(DCTFrom(block, ROWS), DCTTo(from, COLS));
      IDCT1D<ROWS, COLS>()(DCTFrom(from, COLS), to);
    }
  }
};

}  // namespace
// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace jxl
HWY_AFTER_NAMESPACE();
#endif  // LIB_JXL_DCT_INL_H_
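
// Usage sketch (illustrative, not part of this header): an 8x8 forward and
// inverse transform pair, assuming a caller inside HWY_NAMESPACE and the
// DCTFrom/DCTTo adapters from lib/jxl/dct_block-inl.h, which wrap a (pointer,
// stride) pair. The transforms are "scaled" (each 1-D pass divides by N), not
// orthonormal, and the IDCT overwrites its input buffer.
//
//   HWY_ALIGN float pixels[8 * 8] = {/* spatial samples */};
//   HWY_ALIGN float coeffs[8 * 8];
//   HWY_ALIGN float scratch[8 * 8];  // ROWS * COLS floats, aligned
//   ComputeScaledDCT<8, 8>()(DCTFrom(pixels, 8), coeffs, scratch);
//   // ... quantize / dequantize coeffs ...
//   ComputeScaledIDCT<8, 8>()(coeffs, DCTTo(pixels, 8), scratch);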