diff options
Diffstat (limited to 'third_party/jpeg-xl/lib/jpegli/entropy_coding-inl.h')
-rw-r--r-- | third_party/jpeg-xl/lib/jpegli/entropy_coding-inl.h | 213 |
1 files changed, 213 insertions, 0 deletions
diff --git a/third_party/jpeg-xl/lib/jpegli/entropy_coding-inl.h b/third_party/jpeg-xl/lib/jpegli/entropy_coding-inl.h new file mode 100644 index 0000000000..bfb436d795 --- /dev/null +++ b/third_party/jpeg-xl/lib/jpegli/entropy_coding-inl.h @@ -0,0 +1,213 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#if defined(LIB_JPEGLI_ENTROPY_CODING_INL_H_) == defined(HWY_TARGET_TOGGLE) +#ifdef LIB_JPEGLI_ENTROPY_CODING_INL_H_ +#undef LIB_JPEGLI_ENTROPY_CODING_INL_H_ +#else +#define LIB_JPEGLI_ENTROPY_CODING_INL_H_ +#endif + +#include "lib/jxl/base/compiler_specific.h" + +HWY_BEFORE_NAMESPACE(); +namespace jpegli { +namespace HWY_NAMESPACE { +namespace { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Abs; +using hwy::HWY_NAMESPACE::Add; +using hwy::HWY_NAMESPACE::And; +using hwy::HWY_NAMESPACE::AndNot; +using hwy::HWY_NAMESPACE::Compress; +using hwy::HWY_NAMESPACE::CountTrue; +using hwy::HWY_NAMESPACE::Eq; +using hwy::HWY_NAMESPACE::GetLane; +using hwy::HWY_NAMESPACE::MaskFromVec; +using hwy::HWY_NAMESPACE::Max; +using hwy::HWY_NAMESPACE::Not; +using hwy::HWY_NAMESPACE::Or; +using hwy::HWY_NAMESPACE::ShiftRight; +using hwy::HWY_NAMESPACE::Shl; +using hwy::HWY_NAMESPACE::Sub; + +using DI = HWY_FULL(int32_t); +constexpr DI di; + +template <typename DI, class V> +JXL_INLINE V NumBits(DI di, const V x) { + // TODO(szabadka) Add faster implementations for some specific architectures. + const auto b1 = And(x, Set(di, 1)); + const auto b2 = And(x, Set(di, 2)); + const auto b3 = Sub((And(x, Set(di, 4))), Set(di, 1)); + const auto b4 = Sub((And(x, Set(di, 8))), Set(di, 4)); + const auto b5 = Sub((And(x, Set(di, 16))), Set(di, 11)); + const auto b6 = Sub((And(x, Set(di, 32))), Set(di, 26)); + const auto b7 = Sub((And(x, Set(di, 64))), Set(di, 57)); + const auto b8 = Sub((And(x, Set(di, 128))), Set(di, 120)); + const auto b9 = Sub((And(x, Set(di, 256))), Set(di, 247)); + const auto b10 = Sub((And(x, Set(di, 512))), Set(di, 502)); + const auto b11 = Sub((And(x, Set(di, 1024))), Set(di, 1013)); + const auto b12 = Sub((And(x, Set(di, 2048))), Set(di, 2036)); + return Max(Max(Max(Max(b1, b2), Max(b3, b4)), Max(Max(b5, b6), Max(b7, b8))), + Max(Max(b9, b10), Max(b11, b12))); +} + +// Coefficient indexes pre-multiplied by 16 for the symbol calculation. +HWY_ALIGN constexpr int32_t kIndexes[64] = { + 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, + 208, 224, 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, + 416, 432, 448, 464, 480, 496, 512, 528, 544, 560, 576, 592, 608, + 624, 640, 656, 672, 688, 704, 720, 736, 752, 768, 784, 800, 816, + 832, 848, 864, 880, 896, 912, 928, 944, 960, 976, 992, 1008, +}; + +JXL_INLINE int CompactBlock(int32_t* JXL_RESTRICT block, + int32_t* JXL_RESTRICT nonzero_idx) { + const auto zero = Zero(di); + HWY_ALIGN constexpr int32_t dc_mask_lanes[HWY_LANES(DI)] = {-1}; + const auto dc_mask = MaskFromVec(Load(di, dc_mask_lanes)); + int num_nonzeros = 0; + int k = 0; + { + const auto coef = Load(di, block); + const auto idx = Load(di, kIndexes); + const auto nonzero_mask = Or(dc_mask, Not(Eq(coef, zero))); + const auto nzero_coef = Compress(coef, nonzero_mask); + const auto nzero_idx = Compress(idx, nonzero_mask); + StoreU(nzero_coef, di, &block[num_nonzeros]); + StoreU(nzero_idx, di, &nonzero_idx[num_nonzeros]); + num_nonzeros += CountTrue(di, nonzero_mask); + k += Lanes(di); + } + for (; k < DCTSIZE2; k += Lanes(di)) { + const auto coef = Load(di, &block[k]); + const auto idx = Load(di, &kIndexes[k]); + const auto nonzero_mask = Not(Eq(coef, zero)); + const auto nzero_coef = Compress(coef, nonzero_mask); + const auto nzero_idx = Compress(idx, nonzero_mask); + StoreU(nzero_coef, di, &block[num_nonzeros]); + StoreU(nzero_idx, di, &nonzero_idx[num_nonzeros]); + num_nonzeros += CountTrue(di, nonzero_mask); + } + return num_nonzeros; +} + +JXL_INLINE void ComputeSymbols(const int num_nonzeros, + int32_t* JXL_RESTRICT nonzero_idx, + int32_t* JXL_RESTRICT block, + int32_t* JXL_RESTRICT symbols) { + nonzero_idx[-1] = -16; + const auto one = Set(di, 1); + const auto offset = Set(di, 16); + for (int i = 0; i < num_nonzeros; i += Lanes(di)) { + const auto idx = Load(di, &nonzero_idx[i]); + const auto prev_idx = LoadU(di, &nonzero_idx[i - 1]); + const auto coeff = Load(di, &block[i]); + const auto nbits = NumBits(di, Abs(coeff)); + const auto mask = ShiftRight<8 * sizeof(int32_t) - 1>(coeff); + const auto bits = And(Add(coeff, mask), Sub(Shl(one, nbits), one)); + const auto symbol = Sub(Add(nbits, idx), Add(prev_idx, offset)); + Store(symbol, di, symbols + i); + Store(bits, di, block + i); + } +} + +template <typename T> +int NumNonZero8x8ExceptDC(const T* block) { + const HWY_CAPPED(T, 8) di; + + const auto zero = Zero(di); + // Add FFFF for every zero coefficient, negate to get #zeros. + auto neg_sum_zero = zero; + { + // First row has DC, so mask + const size_t y = 0; + HWY_ALIGN const T dc_mask_lanes[8] = {-1}; + + for (size_t x = 0; x < 8; x += Lanes(di)) { + const auto dc_mask = Load(di, dc_mask_lanes + x); + + // DC counts as zero so we don't include it in nzeros. + const auto coef = AndNot(dc_mask, Load(di, &block[y * 8 + x])); + + neg_sum_zero = Add(neg_sum_zero, VecFromMask(di, Eq(coef, zero))); + } + } + // Remaining rows: no mask + for (size_t y = 1; y < 8; y++) { + for (size_t x = 0; x < 8; x += Lanes(di)) { + const auto coef = Load(di, &block[y * 8 + x]); + neg_sum_zero = Add(neg_sum_zero, VecFromMask(di, Eq(coef, zero))); + } + } + + // We want 64 - sum_zero, add because neg_sum_zero is already negated. + return kDCTBlockSize + GetLane(SumOfLanes(di, neg_sum_zero)); +} + +template <typename T, bool zig_zag_order> +void ComputeTokensForBlock(const T* block, int last_dc, int dc_ctx, int ac_ctx, + Token** tokens_ptr) { + Token* next_token = *tokens_ptr; + coeff_t temp2; + coeff_t temp; + temp = block[0] - last_dc; + if (temp == 0) { + *next_token++ = Token(dc_ctx, 0, 0); + } else { + temp2 = temp; + if (temp < 0) { + temp = -temp; + temp2--; + } + int dc_nbits = jxl::FloorLog2Nonzero<uint32_t>(temp) + 1; + int dc_mask = (1 << dc_nbits) - 1; + *next_token++ = Token(dc_ctx, dc_nbits, temp2 & dc_mask); + } + int num_nonzeros = NumNonZero8x8ExceptDC(block); + for (int k = 1; k < 64; ++k) { + if (num_nonzeros == 0) { + *next_token++ = Token(ac_ctx, 0, 0); + break; + } + int r = 0; + if (zig_zag_order) { + while ((temp = block[k]) == 0) { + r++; + k++; + } + } else { + while ((temp = block[kJPEGNaturalOrder[k]]) == 0) { + r++; + k++; + } + } + --num_nonzeros; + if (temp < 0) { + temp = -temp; + temp2 = ~temp; + } else { + temp2 = temp; + } + while (r > 15) { + *next_token++ = Token(ac_ctx, 0xf0, 0); + r -= 16; + } + int ac_nbits = jxl::FloorLog2Nonzero<uint32_t>(temp) + 1; + int ac_mask = (1 << ac_nbits) - 1; + int symbol = (r << 4u) + ac_nbits; + *next_token++ = Token(ac_ctx, symbol, temp2 & ac_mask); + } + *tokens_ptr = next_token; +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace +} // namespace HWY_NAMESPACE +} // namespace jpegli +HWY_AFTER_NAMESPACE(); +#endif // LIB_JPEGLI_ENTROPY_CODING_INL_H_ |