// Copyright (c) the JPEG XL Project Authors. All rights reserved. // // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. #if defined(LIB_JPEGLI_ENTROPY_CODING_INL_H_) == defined(HWY_TARGET_TOGGLE) #ifdef LIB_JPEGLI_ENTROPY_CODING_INL_H_ #undef LIB_JPEGLI_ENTROPY_CODING_INL_H_ #else #define LIB_JPEGLI_ENTROPY_CODING_INL_H_ #endif #include "lib/jxl/base/compiler_specific.h" HWY_BEFORE_NAMESPACE(); namespace jpegli { namespace HWY_NAMESPACE { namespace { // These templates are not found via ADL. using hwy::HWY_NAMESPACE::Abs; using hwy::HWY_NAMESPACE::Add; using hwy::HWY_NAMESPACE::And; using hwy::HWY_NAMESPACE::AndNot; using hwy::HWY_NAMESPACE::Compress; using hwy::HWY_NAMESPACE::CountTrue; using hwy::HWY_NAMESPACE::Eq; using hwy::HWY_NAMESPACE::GetLane; using hwy::HWY_NAMESPACE::MaskFromVec; using hwy::HWY_NAMESPACE::Max; using hwy::HWY_NAMESPACE::Not; using hwy::HWY_NAMESPACE::Or; using hwy::HWY_NAMESPACE::ShiftRight; using hwy::HWY_NAMESPACE::Shl; using hwy::HWY_NAMESPACE::Sub; using DI = HWY_FULL(int32_t); constexpr DI di; template JXL_INLINE V NumBits(DI di, const V x) { // TODO(szabadka) Add faster implementations for some specific architectures. const auto b1 = And(x, Set(di, 1)); const auto b2 = And(x, Set(di, 2)); const auto b3 = Sub((And(x, Set(di, 4))), Set(di, 1)); const auto b4 = Sub((And(x, Set(di, 8))), Set(di, 4)); const auto b5 = Sub((And(x, Set(di, 16))), Set(di, 11)); const auto b6 = Sub((And(x, Set(di, 32))), Set(di, 26)); const auto b7 = Sub((And(x, Set(di, 64))), Set(di, 57)); const auto b8 = Sub((And(x, Set(di, 128))), Set(di, 120)); const auto b9 = Sub((And(x, Set(di, 256))), Set(di, 247)); const auto b10 = Sub((And(x, Set(di, 512))), Set(di, 502)); const auto b11 = Sub((And(x, Set(di, 1024))), Set(di, 1013)); const auto b12 = Sub((And(x, Set(di, 2048))), Set(di, 2036)); return Max(Max(Max(Max(b1, b2), Max(b3, b4)), Max(Max(b5, b6), Max(b7, b8))), Max(Max(b9, b10), Max(b11, b12))); } // Coefficient indexes pre-multiplied by 16 for the symbol calculation. HWY_ALIGN constexpr int32_t kIndexes[64] = { 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512, 528, 544, 560, 576, 592, 608, 624, 640, 656, 672, 688, 704, 720, 736, 752, 768, 784, 800, 816, 832, 848, 864, 880, 896, 912, 928, 944, 960, 976, 992, 1008, }; JXL_INLINE int CompactBlock(int32_t* JXL_RESTRICT block, int32_t* JXL_RESTRICT nonzero_idx) { const auto zero = Zero(di); HWY_ALIGN constexpr int32_t dc_mask_lanes[HWY_LANES(DI)] = {-1}; const auto dc_mask = MaskFromVec(Load(di, dc_mask_lanes)); int num_nonzeros = 0; int k = 0; { const auto coef = Load(di, block); const auto idx = Load(di, kIndexes); const auto nonzero_mask = Or(dc_mask, Not(Eq(coef, zero))); const auto nzero_coef = Compress(coef, nonzero_mask); const auto nzero_idx = Compress(idx, nonzero_mask); StoreU(nzero_coef, di, &block[num_nonzeros]); StoreU(nzero_idx, di, &nonzero_idx[num_nonzeros]); num_nonzeros += CountTrue(di, nonzero_mask); k += Lanes(di); } for (; k < DCTSIZE2; k += Lanes(di)) { const auto coef = Load(di, &block[k]); const auto idx = Load(di, &kIndexes[k]); const auto nonzero_mask = Not(Eq(coef, zero)); const auto nzero_coef = Compress(coef, nonzero_mask); const auto nzero_idx = Compress(idx, nonzero_mask); StoreU(nzero_coef, di, &block[num_nonzeros]); StoreU(nzero_idx, di, &nonzero_idx[num_nonzeros]); num_nonzeros += CountTrue(di, nonzero_mask); } return num_nonzeros; } JXL_INLINE void ComputeSymbols(const int num_nonzeros, int32_t* JXL_RESTRICT nonzero_idx, int32_t* JXL_RESTRICT block, int32_t* JXL_RESTRICT symbols) { nonzero_idx[-1] = -16; const auto one = Set(di, 1); const auto offset = Set(di, 16); for (int i = 0; i < num_nonzeros; i += Lanes(di)) { const auto idx = Load(di, &nonzero_idx[i]); const auto prev_idx = LoadU(di, &nonzero_idx[i - 1]); const auto coeff = Load(di, &block[i]); const auto nbits = NumBits(di, Abs(coeff)); const auto mask = ShiftRight<8 * sizeof(int32_t) - 1>(coeff); const auto bits = And(Add(coeff, mask), Sub(Shl(one, nbits), one)); const auto symbol = Sub(Add(nbits, idx), Add(prev_idx, offset)); Store(symbol, di, symbols + i); Store(bits, di, block + i); } } template int NumNonZero8x8ExceptDC(const T* block) { const HWY_CAPPED(T, 8) di; const auto zero = Zero(di); // Add FFFF for every zero coefficient, negate to get #zeros. auto neg_sum_zero = zero; { // First row has DC, so mask const size_t y = 0; HWY_ALIGN const T dc_mask_lanes[8] = {-1}; for (size_t x = 0; x < 8; x += Lanes(di)) { const auto dc_mask = Load(di, dc_mask_lanes + x); // DC counts as zero so we don't include it in nzeros. const auto coef = AndNot(dc_mask, Load(di, &block[y * 8 + x])); neg_sum_zero = Add(neg_sum_zero, VecFromMask(di, Eq(coef, zero))); } } // Remaining rows: no mask for (size_t y = 1; y < 8; y++) { for (size_t x = 0; x < 8; x += Lanes(di)) { const auto coef = Load(di, &block[y * 8 + x]); neg_sum_zero = Add(neg_sum_zero, VecFromMask(di, Eq(coef, zero))); } } // We want 64 - sum_zero, add because neg_sum_zero is already negated. return kDCTBlockSize + GetLane(SumOfLanes(di, neg_sum_zero)); } template void ComputeTokensForBlock(const T* block, int last_dc, int dc_ctx, int ac_ctx, Token** tokens_ptr) { Token* next_token = *tokens_ptr; coeff_t temp2; coeff_t temp; temp = block[0] - last_dc; if (temp == 0) { *next_token++ = Token(dc_ctx, 0, 0); } else { temp2 = temp; if (temp < 0) { temp = -temp; temp2--; } int dc_nbits = jxl::FloorLog2Nonzero(temp) + 1; int dc_mask = (1 << dc_nbits) - 1; *next_token++ = Token(dc_ctx, dc_nbits, temp2 & dc_mask); } int num_nonzeros = NumNonZero8x8ExceptDC(block); for (int k = 1; k < 64; ++k) { if (num_nonzeros == 0) { *next_token++ = Token(ac_ctx, 0, 0); break; } int r = 0; if (zig_zag_order) { while ((temp = block[k]) == 0) { r++; k++; } } else { while ((temp = block[kJPEGNaturalOrder[k]]) == 0) { r++; k++; } } --num_nonzeros; if (temp < 0) { temp = -temp; temp2 = ~temp; } else { temp2 = temp; } while (r > 15) { *next_token++ = Token(ac_ctx, 0xf0, 0); r -= 16; } int ac_nbits = jxl::FloorLog2Nonzero(temp) + 1; int ac_mask = (1 << ac_nbits) - 1; int symbol = (r << 4u) + ac_nbits; *next_token++ = Token(ac_ctx, symbol, temp2 & ac_mask); } *tokens_ptr = next_token; } // NOLINTNEXTLINE(google-readability-namespace-comments) } // namespace } // namespace HWY_NAMESPACE } // namespace jpegli HWY_AFTER_NAMESPACE(); #endif // LIB_JPEGLI_ENTROPY_CODING_INL_H_