From 36d22d82aa202bb199967e9512281e9a53db42c9 Mon Sep 17 00:00:00 2001
From: Daniel Baumann
Date: Sun, 7 Apr 2024 21:33:14 +0200
Subject: Adding upstream version 115.7.0esr.

Signed-off-by: Daniel Baumann
---
 third_party/jpeg-xl/lib/jxl/enc_ac_strategy.cc | 1168 ++++++++++++++++++++++++
 1 file changed, 1168 insertions(+)
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_ac_strategy.cc

diff --git a/third_party/jpeg-xl/lib/jxl/enc_ac_strategy.cc b/third_party/jpeg-xl/lib/jxl/enc_ac_strategy.cc
new file mode 100644
index 0000000000..2b4d84196f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_ac_strategy.cc
@@ -0,0 +1,1168 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_ac_strategy.h"
+
+#include <stdint.h>
+#include <string.h>
+
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/enc_ac_strategy.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/ans_params.h"
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/coeff_order_fwd.h"
+#include "lib/jxl/convolve.h"
+#include "lib/jxl/dct_scales.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/enc_transforms-inl.h"
+#include "lib/jxl/entropy_coder.h"
+#include "lib/jxl/fast_math-inl.h"
+
+// Some of the floating point constants in this file and in other
+// files in the libjxl project have been obtained using the
+// tools/optimizer/simplex_fork.py tool. It is a variation of
+// Nelder-Mead optimization, and we generally try to minimize the
+// BPP * pnorm aggregate as reported by the benchmark_xl tool,
+// but occasionally the values are optimized by using additional
+// constraints such as maintaining a certain density, or ratio of
+// popularity of integral transforms. Jyrki visually reviews all
+// such changes and often makes manual adjustments to maintain good
+// visual quality where butteraugli was not sufficiently sensitive
+// to some kind of degradation. Unfortunately, image quality is
+// still more of an art than a science.
+
+// This must come before the begin/end_target, but HWY_ONCE is only true
+// after that, so use an "include guard".
+#ifndef LIB_JXL_ENC_AC_STRATEGY_
+#define LIB_JXL_ENC_AC_STRATEGY_
+// Parameters of the heuristic are marked with an OPTIMIZE comment.
+namespace jxl {
+
+// Debugging utilities.
+
+// Returns a linear sRGB color (as bytes) for each AC strategy.
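+// TypeColor and TypeMask are only consumed by DumpAcStrategy below to
+// visualize which strategy was chosen; they have no effect on encoding.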
+const uint8_t* TypeColor(const uint8_t& raw_strategy) { + JXL_ASSERT(AcStrategy::IsRawStrategyValid(raw_strategy)); + static_assert(AcStrategy::kNumValidStrategies == 27, "Change colors"); + static constexpr uint8_t kColors[][3] = { + {0xFF, 0xFF, 0x00}, // DCT8 + {0xFF, 0x80, 0x80}, // HORNUSS + {0xFF, 0x80, 0x80}, // DCT2x2 + {0xFF, 0x80, 0x80}, // DCT4x4 + {0x80, 0xFF, 0x00}, // DCT16x16 + {0x00, 0xC0, 0x00}, // DCT32x32 + {0xC0, 0xFF, 0x00}, // DCT16x8 + {0xC0, 0xFF, 0x00}, // DCT8x16 + {0x00, 0xFF, 0x00}, // DCT32x8 + {0x00, 0xFF, 0x00}, // DCT8x32 + {0x00, 0xFF, 0x00}, // DCT32x16 + {0x00, 0xFF, 0x00}, // DCT16x32 + {0xFF, 0x80, 0x00}, // DCT4x8 + {0xFF, 0x80, 0x00}, // DCT8x4 + {0xFF, 0xFF, 0x80}, // AFV0 + {0xFF, 0xFF, 0x80}, // AFV1 + {0xFF, 0xFF, 0x80}, // AFV2 + {0xFF, 0xFF, 0x80}, // AFV3 + {0x00, 0xC0, 0xFF}, // DCT64x64 + {0x00, 0xFF, 0xFF}, // DCT64x32 + {0x00, 0xFF, 0xFF}, // DCT32x64 + {0x00, 0x40, 0xFF}, // DCT128x128 + {0x00, 0x80, 0xFF}, // DCT128x64 + {0x00, 0x80, 0xFF}, // DCT64x128 + {0x00, 0x00, 0xC0}, // DCT256x256 + {0x00, 0x00, 0xFF}, // DCT256x128 + {0x00, 0x00, 0xFF}, // DCT128x256 + }; + return kColors[raw_strategy]; +} + +const uint8_t* TypeMask(const uint8_t& raw_strategy) { + JXL_ASSERT(AcStrategy::IsRawStrategyValid(raw_strategy)); + static_assert(AcStrategy::kNumValidStrategies == 27, "Add masks"); + // implicitly, first row and column is made dark + static constexpr uint8_t kMask[][64] = { + { + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + }, // DCT8 + { + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 1, 0, 0, 1, 0, 0, // + 0, 0, 1, 0, 0, 1, 0, 0, // + 0, 0, 1, 1, 1, 1, 0, 0, // + 0, 0, 1, 0, 0, 1, 0, 0, // + 0, 0, 1, 0, 0, 1, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + }, // HORNUSS + { + 1, 1, 1, 1, 1, 1, 1, 1, // + 1, 0, 1, 0, 1, 0, 1, 0, // + 1, 1, 1, 1, 1, 1, 1, 1, // + 1, 0, 1, 0, 1, 0, 1, 0, // + 1, 1, 1, 1, 1, 1, 1, 1, // + 1, 0, 1, 0, 1, 0, 1, 0, // + 1, 1, 1, 1, 1, 1, 1, 1, // + 1, 0, 1, 0, 1, 0, 1, 0, // + }, // 2x2 + { + 0, 0, 0, 0, 1, 0, 0, 0, // + 0, 0, 0, 0, 1, 0, 0, 0, // + 0, 0, 0, 0, 1, 0, 0, 0, // + 0, 0, 0, 0, 1, 0, 0, 0, // + 1, 1, 1, 1, 1, 1, 1, 1, // + 0, 0, 0, 0, 1, 0, 0, 0, // + 0, 0, 0, 0, 1, 0, 0, 0, // + 0, 0, 0, 0, 1, 0, 0, 0, // + }, // 4x4 + {}, // DCT16x16 (unused) + {}, // DCT32x32 (unused) + {}, // DCT16x8 (unused) + {}, // DCT8x16 (unused) + {}, // DCT32x8 (unused) + {}, // DCT8x32 (unused) + {}, // DCT32x16 (unused) + {}, // DCT16x32 (unused) + { + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 1, 1, 1, 1, 1, 1, 1, 1, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + }, // DCT4x8 + { + 0, 0, 0, 0, 1, 0, 0, 0, // + 0, 0, 0, 0, 1, 0, 0, 0, // + 0, 0, 0, 0, 1, 0, 0, 0, // + 0, 0, 0, 0, 1, 0, 0, 0, // + 0, 0, 0, 0, 1, 0, 0, 0, // + 0, 0, 0, 0, 1, 0, 0, 0, // + 0, 0, 0, 0, 1, 0, 0, 0, // + 0, 0, 0, 0, 1, 0, 0, 0, // + }, // DCT8x4 + { + 1, 1, 1, 1, 1, 0, 0, 0, // + 1, 1, 1, 1, 0, 0, 0, 0, // + 1, 1, 1, 0, 0, 0, 0, 0, // + 1, 1, 0, 0, 0, 0, 0, 0, // + 1, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + }, // AFV0 + { + 0, 0, 0, 0, 1, 1, 1, 1, // + 0, 0, 0, 0, 0, 1, 1, 1, // + 0, 0, 0, 0, 0, 0, 1, 1, // + 0, 0, 0, 0, 0, 0, 0, 1, // + 0, 0, 0, 0, 0, 0, 0, 
0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+      },  // AFV1
+      {
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          1, 0, 0, 0, 0, 0, 0, 0,  //
+          1, 1, 0, 0, 0, 0, 0, 0,  //
+          1, 1, 1, 0, 0, 0, 0, 0,  //
+          1, 1, 1, 1, 0, 0, 0, 0,  //
+      },  // AFV2
+      {
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 1,  //
+          0, 0, 0, 0, 0, 0, 1, 1,  //
+          0, 0, 0, 0, 0, 1, 1, 1,  //
+      },  // AFV3
+  };
+  return kMask[raw_strategy];
+}
+
+void DumpAcStrategy(const AcStrategyImage& ac_strategy, size_t xsize,
+                    size_t ysize, const char* tag, AuxOut* aux_out) {
+  Image3F color_acs(xsize, ysize);
+  for (size_t y = 0; y < ysize; y++) {
+    float* JXL_RESTRICT rows[3] = {
+        color_acs.PlaneRow(0, y),
+        color_acs.PlaneRow(1, y),
+        color_acs.PlaneRow(2, y),
+    };
+    const AcStrategyRow acs_row = ac_strategy.ConstRow(y / kBlockDim);
+    for (size_t x = 0; x < xsize; x++) {
+      AcStrategy acs = acs_row[x / kBlockDim];
+      const uint8_t* JXL_RESTRICT color = TypeColor(acs.RawStrategy());
+      for (size_t c = 0; c < 3; c++) {
+        rows[c][x] = color[c] / 255.f;
+      }
+    }
+  }
+  size_t stride = color_acs.PixelsPerRow();
+  for (size_t c = 0; c < 3; c++) {
+    for (size_t by = 0; by < DivCeil(ysize, kBlockDim); by++) {
+      float* JXL_RESTRICT row = color_acs.PlaneRow(c, by * kBlockDim);
+      const AcStrategyRow acs_row = ac_strategy.ConstRow(by);
+      for (size_t bx = 0; bx < DivCeil(xsize, kBlockDim); bx++) {
+        AcStrategy acs = acs_row[bx];
+        if (!acs.IsFirstBlock()) continue;
+        const uint8_t* JXL_RESTRICT color = TypeColor(acs.RawStrategy());
+        const uint8_t* JXL_RESTRICT mask = TypeMask(acs.RawStrategy());
+        if (acs.covered_blocks_x() == 1 && acs.covered_blocks_y() == 1) {
+          for (size_t iy = 0; iy < kBlockDim && by * kBlockDim + iy < ysize;
+               iy++) {
+            for (size_t ix = 0; ix < kBlockDim && bx * kBlockDim + ix < xsize;
+                 ix++) {
+              if (mask[iy * kBlockDim + ix]) {
+                row[iy * stride + bx * kBlockDim + ix] = color[c] / 800.f;
+              }
+            }
+          }
+        }
+        // draw block edges
+        for (size_t ix = 0; ix < kBlockDim * acs.covered_blocks_x() &&
+                            bx * kBlockDim + ix < xsize;
+             ix++) {
+          row[0 * stride + bx * kBlockDim + ix] = color[c] / 350.f;
+        }
+        for (size_t iy = 0; iy < kBlockDim * acs.covered_blocks_y() &&
+                            by * kBlockDim + iy < ysize;
+             iy++) {
+          row[iy * stride + bx * kBlockDim + 0] = color[c] / 350.f;
+        }
+      }
+    }
+  }
+  aux_out->DumpImage(tag, color_acs);
+}
+
+}  // namespace jxl
+#endif  // LIB_JXL_ENC_AC_STRATEGY_
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::AbsDiff;
+using hwy::HWY_NAMESPACE::Eq;
+using hwy::HWY_NAMESPACE::IfThenElseZero;
+using hwy::HWY_NAMESPACE::IfThenZeroElse;
+using hwy::HWY_NAMESPACE::Round;
+using hwy::HWY_NAMESPACE::Sqrt;
+
+bool MultiBlockTransformCrossesHorizontalBoundary(
+    const AcStrategyImage& ac_strategy, size_t start_x, size_t y,
+    size_t end_x) {
+  if (start_x >= ac_strategy.xsize() || y >= ac_strategy.ysize()) {
+    return false;
+  }
+  if (y % 8 == 0) {
+    // Nothing crosses 64x64 boundaries, and the memory on the other side
+    // of the 64x64 block may still be uninitialized.
+    return false;
+  }
+  end_x = std::min(end_x, ac_strategy.xsize());
+  // The first multi-block transform may start before start_x; adjust
+  // start_x to point to the first IsFirstBlock() == true block we find
+  // by tracing backwards.
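+  // After this adjustment, the scan below jumps from transform origin to
+  // transform origin within row y; landing on a block that is not a first
+  // block means its origin lies on an earlier row, i.e. some transform
+  // crosses the horizontal boundary.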
+  AcStrategyRow row = ac_strategy.ConstRow(y);
+  const size_t start_x_limit = start_x & ~7;
+  while (start_x != start_x_limit && !row[start_x].IsFirstBlock()) {
+    --start_x;
+  }
+  for (size_t x = start_x; x < end_x;) {
+    if (row[x].IsFirstBlock()) {
+      x += row[x].covered_blocks_x();
+    } else {
+      return true;
+    }
+  }
+  return false;
+}
+
+bool MultiBlockTransformCrossesVerticalBoundary(
+    const AcStrategyImage& ac_strategy, size_t x, size_t start_y,
+    size_t end_y) {
+  if (x >= ac_strategy.xsize() || start_y >= ac_strategy.ysize()) {
+    return false;
+  }
+  if (x % 8 == 0) {
+    // Nothing crosses 64x64 boundaries, and the memory on the other side
+    // of the 64x64 block may still be uninitialized.
+    return false;
+  }
+  end_y = std::min(end_y, ac_strategy.ysize());
+  // The first multi-block transform may start before start_y; adjust
+  // start_y to point to the first IsFirstBlock() == true block we find
+  // by tracing backwards.
+  const size_t start_y_limit = start_y & ~7;
+  while (start_y != start_y_limit &&
+         !ac_strategy.ConstRow(start_y)[x].IsFirstBlock()) {
+    --start_y;
+  }
+
+  for (size_t y = start_y; y < end_y;) {
+    AcStrategyRow row = ac_strategy.ConstRow(y);
+    if (row[x].IsFirstBlock()) {
+      y += row[x].covered_blocks_y();
+    } else {
+      return true;
+    }
+  }
+  return false;
+}
+
+static const float kChromaErrorWeight[AcStrategy::kNumValidStrategies] = {
+    0.95f,  // DCT = 0,
+    1.0f,   // IDENTITY = 1,
+    0.5f,   // DCT2X2 = 2,
+    1.0f,   // DCT4X4 = 3,
+    2.0f,   // DCT16X16 = 4,
+    2.0f,   // DCT32X32 = 5,
+    1.4f,   // DCT16X8 = 6,
+    1.4f,   // DCT8X16 = 7,
+    2.0f,   // DCT32X8 = 8,
+    2.0f,   // DCT8X32 = 9,
+    2.0f,   // DCT32X16 = 10,
+    2.0f,   // DCT16X32 = 11,
+    2.0f,   // DCT4X8 = 12,
+    2.0f,   // DCT8X4 = 13,
+    1.7f,   // AFV0 = 14,
+    1.7f,   // AFV1 = 15,
+    1.7f,   // AFV2 = 16,
+    1.7f,   // AFV3 = 17,
+    2.0f,   // DCT64X64 = 18,
+    2.0f,   // DCT64X32 = 19,
+    2.0f,   // DCT32X64 = 20,
+    2.0f,   // DCT128X128 = 21,
+    2.0f,   // DCT128X64 = 22,
+    2.0f,   // DCT64X128 = 23,
+    2.0f,   // DCT256X256 = 24,
+    2.0f,   // DCT256X128 = 25,
+    2.0f,   // DCT128X256 = 26,
+};
+
+float EstimateEntropy(const AcStrategy& acs, size_t x, size_t y,
+                      const ACSConfig& config,
+                      const float* JXL_RESTRICT cmap_factors, float* block,
+                      float* scratch_space, uint32_t* quantized) {
+  const size_t size = (1 << acs.log2_covered_blocks()) * kDCTBlockSize;
+
+  // Apply transform.
+  for (size_t c = 0; c < 3; c++) {
+    float* JXL_RESTRICT block_c = block + size * c;
+    TransformFromPixels(acs.Strategy(), &config.Pixel(c, x, y),
+                        config.src_stride, block_c, scratch_space);
+  }
+
+  HWY_FULL(float) df;
+
+  const size_t num_blocks = acs.covered_blocks_x() * acs.covered_blocks_y();
+  // Avoid large blocks when there is a lot going on in red-green.
+  float cmul[3] = {kChromaErrorWeight[acs.RawStrategy()], 1.0f, 1.0f};
+  float quant_norm8 = 0;
+  float masking = 0;
+  if (num_blocks == 1) {
+    // When it is only one 8x8, we don't need aggregation of values.
+    quant_norm8 = config.Quant(x / 8, y / 8);
+    masking = 2.0f * config.Masking(x / 8, y / 8);
+  } else if (num_blocks == 2) {
+    // Taking the max instead of the 8th norm seems to work
+    // better for the smallest blocks up to 16x8. Jyrki couldn't get
+    // improvements in trying the same for 16x16 blocks.
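+    // covered_blocks_y() == 2 means the two 8x8 blocks are stacked
+    // vertically; otherwise they sit side by side.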
+ if (acs.covered_blocks_y() == 2) { + quant_norm8 = + std::max(config.Quant(x / 8, y / 8), config.Quant(x / 8, y / 8 + 1)); + masking = 2.0f * std::max(config.Masking(x / 8, y / 8), + config.Masking(x / 8, y / 8 + 1)); + } else { + quant_norm8 = + std::max(config.Quant(x / 8, y / 8), config.Quant(x / 8 + 1, y / 8)); + masking = 2.0f * std::max(config.Masking(x / 8, y / 8), + config.Masking(x / 8 + 1, y / 8)); + } + } else { + float masking_norm2 = 0; + float masking_max = 0; + // Load QF value, calculate empirical heuristic on masking field + // for weighting the information loss. Information loss manifests + // itself as ringing, and masking could hide it. + for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) { + for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) { + float qval = config.Quant(x / 8 + ix, y / 8 + iy); + qval *= qval; + qval *= qval; + quant_norm8 += qval * qval; + float maskval = config.Masking(x / 8 + ix, y / 8 + iy); + masking_max = std::max(masking_max, maskval); + masking_norm2 += maskval * maskval; + } + } + quant_norm8 /= num_blocks; + quant_norm8 = FastPowf(quant_norm8, 1.0f / 8.0f); + masking_norm2 = sqrt(masking_norm2 / num_blocks); + // This is a highly empirical formula. + masking = (masking_norm2 + masking_max); + } + const auto q = Set(df, quant_norm8); + + // Compute entropy. + float entropy = config.base_entropy; + auto info_loss = Zero(df); + auto info_loss2 = Zero(df); + + for (size_t c = 0; c < 3; c++) { + const float* inv_matrix = config.dequant->InvMatrix(acs.RawStrategy(), c); + const auto cmap_factor = Set(df, cmap_factors[c]); + + auto entropy_v = Zero(df); + auto nzeros_v = Zero(df); + auto cost1 = Set(df, config.cost1); + auto cost2 = Set(df, config.cost2); + auto cost_delta = Set(df, config.cost_delta); + for (size_t i = 0; i < num_blocks * kDCTBlockSize; i += Lanes(df)) { + const auto in = Load(df, block + c * size + i); + const auto in_y = Mul(Load(df, block + size + i), cmap_factor); + const auto im = Load(df, inv_matrix + i); + const auto val = Mul(Sub(in, in_y), Mul(im, q)); + const auto rval = Round(val); + const auto diff = AbsDiff(val, rval); + info_loss = Add(info_loss, diff); + info_loss2 = MulAdd(diff, diff, info_loss2); + const auto q = Abs(rval); + const auto q_is_zero = Eq(q, Zero(df)); + entropy_v = Add(entropy_v, IfThenElseZero(Ge(q, Set(df, 1.5f)), cost2)); + // We used to have q * C here, but that cost model seems to + // be punishing large values more than necessary. Sqrt tries + // to avoid large values less aggressively. Having high accuracy + // around zero is most important at low qualities, and there + // we have directly specified costs for 0, 1, and 2. + entropy_v = MulAdd(Sqrt(q), cost_delta, entropy_v); + nzeros_v = Add(nzeros_v, IfThenZeroElse(q_is_zero, Set(df, 1.0f))); + } + entropy_v = MulAdd(nzeros_v, cost1, entropy_v); + + entropy += cmul[c] * GetLane(SumOfLanes(df, entropy_v)); + size_t num_nzeros = GetLane(SumOfLanes(df, nzeros_v)); + // Add #bit of num_nonzeros, as an estimate of the cost for encoding the + // number of non-zeros of the block. + size_t nbits = CeilLog2Nonzero(num_nzeros + 1) + 1; + // Also add #bit of #bit of num_nonzeros, to estimate the ANS cost, with a + // bias. 
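+    // For the nbits values that occur here, CeilLog2Nonzero(nbits + 17)
+    // stays nearly constant, so this term mostly contributes
+    // zeros_mul * nbits plus a fixed offset.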
+    entropy += config.zeros_mul * (CeilLog2Nonzero(nbits + 17) + nbits);
+  }
+  float ret =
+      entropy +
+      masking *
+          ((config.info_loss_multiplier * GetLane(SumOfLanes(df, info_loss))) +
+           (config.info_loss_multiplier2 *
+            sqrt(num_blocks * GetLane(SumOfLanes(df, info_loss2)))));
+  return ret;
+}
+
+uint8_t FindBest8x8Transform(size_t x, size_t y, int encoding_speed_tier,
+                             const ACSConfig& config,
+                             const float* JXL_RESTRICT cmap_factors,
+                             AcStrategyImage* JXL_RESTRICT ac_strategy,
+                             float* block, float* scratch_space,
+                             uint32_t* quantized, float* entropy_out) {
+  struct TransformTry8x8 {
+    AcStrategy::Type type;
+    int encoding_speed_tier_max_limit;
+    float entropy_add;
+    float entropy_mul;
+  };
+  static const TransformTry8x8 kTransforms8x8[] = {
+      {
+          AcStrategy::Type::DCT,
+          9,
+          3.0f,
+          0.745f,
+      },
+      {
+          AcStrategy::Type::DCT4X4,
+          5,
+          4.0f,
+          0.7f,
+      },
+      {
+          AcStrategy::Type::DCT2X2,
+          5,
+          0.0f,
+          0.66f,
+      },
+      {
+          AcStrategy::Type::DCT4X8,
+          4,
+          0.0f,
+          0.700754622182473063f,
+      },
+      {
+          AcStrategy::Type::DCT8X4,
+          4,
+          0.0f,
+          0.700754622182473063f,
+      },
+      {
+          AcStrategy::Type::IDENTITY,
+          5,
+          8.0f,
+          0.81217614513585534f,
+      },
+      {
+          AcStrategy::Type::AFV0,
+          4,
+          3.0f,
+          0.70086131125719425f,
+      },
+      {
+          AcStrategy::Type::AFV1,
+          4,
+          3.0f,
+          0.70086131125719425f,
+      },
+      {
+          AcStrategy::Type::AFV2,
+          4,
+          3.0f,
+          0.70086131125719425f,
+      },
+      {
+          AcStrategy::Type::AFV3,
+          4,
+          3.0f,
+          0.70086131125719425f,
+      },
+  };
+  double best = 1e30;
+  uint8_t best_tx = kTransforms8x8[0].type;
+  for (auto tx : kTransforms8x8) {
+    if (tx.encoding_speed_tier_max_limit < encoding_speed_tier) {
+      continue;
+    }
+    AcStrategy acs = AcStrategy::FromRawStrategy(tx.type);
+    float entropy = EstimateEntropy(acs, x, y, config, cmap_factors, block,
+                                    scratch_space, quantized);
+    entropy = tx.entropy_add + tx.entropy_mul * entropy;
+    if (entropy < best) {
+      best_tx = tx.type;
+      best = entropy;
+    }
+  }
+  *entropy_out = best;
+  return best_tx;
+}
+
+// bx, by address the 64x64 block at 8x8 subresolution.
+// cx, cy address the upper-left 8x8 block position of the candidate
+// transform.
+void TryMergeAcs(AcStrategy::Type acs_raw, size_t bx, size_t by, size_t cx,
+                 size_t cy, const ACSConfig& config,
+                 const float* JXL_RESTRICT cmap_factors,
+                 AcStrategyImage* JXL_RESTRICT ac_strategy,
+                 const float entropy_mul, const uint8_t candidate_priority,
+                 uint8_t* priority, float* JXL_RESTRICT entropy_estimate,
+                 float* block, float* scratch_space, uint32_t* quantized) {
+  AcStrategy acs = AcStrategy::FromRawStrategy(acs_raw);
+  float entropy_current = 0;
+  for (size_t iy = 0; iy < acs.covered_blocks_y(); ++iy) {
+    for (size_t ix = 0; ix < acs.covered_blocks_x(); ++ix) {
+      if (priority[(cy + iy) * 8 + (cx + ix)] >= candidate_priority) {
+        // Transform would reuse already allocated blocks and
+        // lead to invalid overlaps, for example DCT64X32 vs.
+        // DCT32X64.
+        return;
+      }
+      entropy_current += entropy_estimate[(cy + iy) * 8 + (cx + ix)];
+    }
+  }
+  float entropy_candidate =
+      entropy_mul * EstimateEntropy(acs, (bx + cx) * 8, (by + cy) * 8, config,
+                                    cmap_factors, block, scratch_space,
+                                    quantized);
+  if (entropy_candidate >= entropy_current) return;
+  // Accept the candidate.
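+  // Mark every covered block with the candidate's priority so that later
+  // candidates of the same or lower priority cannot overlap it, zero the
+  // per-block estimates, and store the merged cost at the origin block.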
+  for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) {
+    for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) {
+      entropy_estimate[(cy + iy) * 8 + cx + ix] = 0;
+      priority[(cy + iy) * 8 + cx + ix] = candidate_priority;
+    }
+  }
+  ac_strategy->Set(bx + cx, by + cy, acs_raw);
+  entropy_estimate[cy * 8 + cx] = entropy_candidate;
+}
+
+static void SetEntropyForTransform(size_t cx, size_t cy,
+                                   const AcStrategy::Type acs_raw,
+                                   float entropy,
+                                   float* JXL_RESTRICT entropy_estimate) {
+  const AcStrategy acs = AcStrategy::FromRawStrategy(acs_raw);
+  for (size_t dy = 0; dy < acs.covered_blocks_y(); ++dy) {
+    for (size_t dx = 0; dx < acs.covered_blocks_x(); ++dx) {
+      entropy_estimate[(cy + dy) * 8 + cx + dx] = 0.0;
+    }
+  }
+  entropy_estimate[cy * 8 + cx] = entropy;
+}
+
+AcStrategy::Type AcsSquare(size_t blocks) {
+  if (blocks == 2) {
+    return AcStrategy::Type::DCT16X16;
+  } else if (blocks == 4) {
+    return AcStrategy::Type::DCT32X32;
+  } else {
+    return AcStrategy::Type::DCT64X64;
+  }
+}
+
+AcStrategy::Type AcsVerticalSplit(size_t blocks) {
+  if (blocks == 2) {
+    return AcStrategy::Type::DCT16X8;
+  } else if (blocks == 4) {
+    return AcStrategy::Type::DCT32X16;
+  } else {
+    return AcStrategy::Type::DCT64X32;
+  }
+}
+
+AcStrategy::Type AcsHorizontalSplit(size_t blocks) {
+  if (blocks == 2) {
+    return AcStrategy::Type::DCT8X16;
+  } else if (blocks == 4) {
+    return AcStrategy::Type::DCT16X32;
+  } else {
+    return AcStrategy::Type::DCT32X64;
+  }
+}
+
+// The following function tries to merge smaller transforms into squares
+// and into the rectangles originating from a single middle division
+// (horizontal or vertical), comparing the alternatives fairly.
+//
+// This is generalized to squares of blocks x blocks size, where a block
+// is 8x8 pixels.
+void FindBestFirstLevelDivisionForSquare(
+    size_t blocks, bool allow_square_transform, size_t bx, size_t by, size_t cx,
+    size_t cy, const ACSConfig& config, const float* JXL_RESTRICT cmap_factors,
+    AcStrategyImage* JXL_RESTRICT ac_strategy, const float entropy_mul_JXK,
+    const float entropy_mul_JXJ, float* JXL_RESTRICT entropy_estimate,
+    float* block, float* scratch_space, uint32_t* quantized) {
+  // We denote J for the larger dimension here, and K for the smaller.
+  // For example, for 32x32 block splitting, J would be 32, K 16.
+  const size_t blocks_half = blocks / 2;
+  const AcStrategy::Type acs_rawJXK = AcsVerticalSplit(blocks);
+  const AcStrategy::Type acs_rawKXJ = AcsHorizontalSplit(blocks);
+  const AcStrategy::Type acs_rawJXJ = AcsSquare(blocks);
+  const AcStrategy acsJXK = AcStrategy::FromRawStrategy(acs_rawJXK);
+  const AcStrategy acsKXJ = AcStrategy::FromRawStrategy(acs_rawKXJ);
+  const AcStrategy acsJXJ = AcStrategy::FromRawStrategy(acs_rawJXJ);
+  AcStrategyRow row0 = ac_strategy->ConstRow(by + cy + 0);
+  AcStrategyRow row1 = ac_strategy->ConstRow(by + cy + blocks_half);
+  // Let's check if we can consider a JXJ block here at all.
+  // This is not necessary in the basic use of hierarchically merging
+  // blocks in the simplest possible way, but is needed when we try other
+  // 'floating' options of merging, possibly after a simple hierarchical
+  // merge has been explored.
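+  // The four checks below test the top, bottom, left and right edges of
+  // the candidate square; if any already-chosen transform sticks out
+  // across one of them, the JxJ option is not representable.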
+  if (MultiBlockTransformCrossesHorizontalBoundary(*ac_strategy, bx + cx,
+                                                   by + cy, bx + cx + blocks) ||
+      MultiBlockTransformCrossesHorizontalBoundary(
+          *ac_strategy, bx + cx, by + cy + blocks, bx + cx + blocks) ||
+      MultiBlockTransformCrossesVerticalBoundary(*ac_strategy, bx + cx, by + cy,
+                                                 by + cy + blocks) ||
+      MultiBlockTransformCrossesVerticalBoundary(*ac_strategy, bx + cx + blocks,
+                                                 by + cy, by + cy + blocks)) {
+    return;  // not suitable for JxJ analysis, some transforms leak out.
+  }
+  // For floating transforms there may already be blocks selected that
+  // make either or both of JXK and KXJ infeasible for this location.
+  const bool allow_JXK = !MultiBlockTransformCrossesVerticalBoundary(
+      *ac_strategy, bx + cx + blocks_half, by + cy, by + cy + blocks);
+  const bool allow_KXJ = !MultiBlockTransformCrossesHorizontalBoundary(
+      *ac_strategy, bx + cx, by + cy + blocks_half, bx + cx + blocks);
+  // Current entropies aggregated on NxN resolution.
+  float entropy[2][2] = {};
+  for (size_t dy = 0; dy < blocks; ++dy) {
+    for (size_t dx = 0; dx < blocks; ++dx) {
+      entropy[dy / blocks_half][dx / blocks_half] +=
+          entropy_estimate[(cy + dy) * 8 + (cx + dx)];
+    }
+  }
+  float entropy_JXK_left = std::numeric_limits<float>::max();
+  float entropy_JXK_right = std::numeric_limits<float>::max();
+  float entropy_KXJ_top = std::numeric_limits<float>::max();
+  float entropy_KXJ_bottom = std::numeric_limits<float>::max();
+  float entropy_JXJ = std::numeric_limits<float>::max();
+  if (allow_JXK) {
+    if (row0[bx + cx + 0].RawStrategy() != acs_rawJXK) {
+      entropy_JXK_left =
+          entropy_mul_JXK *
+          EstimateEntropy(acsJXK, (bx + cx + 0) * 8, (by + cy + 0) * 8, config,
+                          cmap_factors, block, scratch_space, quantized);
+    }
+    if (row0[bx + cx + blocks_half].RawStrategy() != acs_rawJXK) {
+      entropy_JXK_right =
+          entropy_mul_JXK * EstimateEntropy(acsJXK, (bx + cx + blocks_half) * 8,
+                                            (by + cy + 0) * 8, config,
+                                            cmap_factors, block, scratch_space,
+                                            quantized);
+    }
+  }
+  if (allow_KXJ) {
+    if (row0[bx + cx].RawStrategy() != acs_rawKXJ) {
+      entropy_KXJ_top =
+          entropy_mul_JXK *
+          EstimateEntropy(acsKXJ, (bx + cx + 0) * 8, (by + cy + 0) * 8, config,
+                          cmap_factors, block, scratch_space, quantized);
+    }
+    if (row1[bx + cx].RawStrategy() != acs_rawKXJ) {
+      entropy_KXJ_bottom =
+          entropy_mul_JXK * EstimateEntropy(acsKXJ, (bx + cx + 0) * 8,
+                                            (by + cy + blocks_half) * 8, config,
+                                            cmap_factors, block, scratch_space,
+                                            quantized);
+    }
+  }
+  if (allow_square_transform) {
+    // We control the exploration of the square transform separately so that
+    // we can turn it off at high decoding speeds for 32x32, but still allow
+    // exploring 16x32 and 32x16.
+    entropy_JXJ = entropy_mul_JXJ * EstimateEntropy(acsJXJ, (bx + cx + 0) * 8,
+                                                    (by + cy + 0) * 8, config,
+                                                    cmap_factors, block,
+                                                    scratch_space, quantized);
+  }
+
+  // Test if this block should have JXK or KXJ transforms,
+  // because it can have only one or the other.
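+  // Each half keeps the cheaper of its two options, the proposed split
+  // transform or the blocks it already has; the square then has to beat
+  // the better of the two split directions.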
+  float costJxN = std::min(entropy_JXK_left, entropy[0][0] + entropy[1][0]) +
+                  std::min(entropy_JXK_right, entropy[0][1] + entropy[1][1]);
+  float costNxJ = std::min(entropy_KXJ_top, entropy[0][0] + entropy[0][1]) +
+                  std::min(entropy_KXJ_bottom, entropy[1][0] + entropy[1][1]);
+  if (entropy_JXJ < costJxN && entropy_JXJ < costNxJ) {
+    ac_strategy->Set(bx + cx, by + cy, acs_rawJXJ);
+    SetEntropyForTransform(cx, cy, acs_rawJXJ, entropy_JXJ, entropy_estimate);
+  } else if (costJxN < costNxJ) {
+    if (entropy_JXK_left < entropy[0][0] + entropy[1][0]) {
+      ac_strategy->Set(bx + cx, by + cy, acs_rawJXK);
+      SetEntropyForTransform(cx, cy, acs_rawJXK, entropy_JXK_left,
+                             entropy_estimate);
+    }
+    if (entropy_JXK_right < entropy[0][1] + entropy[1][1]) {
+      ac_strategy->Set(bx + cx + blocks_half, by + cy, acs_rawJXK);
+      SetEntropyForTransform(cx + blocks_half, cy, acs_rawJXK,
+                             entropy_JXK_right, entropy_estimate);
+    }
+  } else {
+    if (entropy_KXJ_top < entropy[0][0] + entropy[0][1]) {
+      ac_strategy->Set(bx + cx, by + cy, acs_rawKXJ);
+      SetEntropyForTransform(cx, cy, acs_rawKXJ, entropy_KXJ_top,
+                             entropy_estimate);
+    }
+    if (entropy_KXJ_bottom < entropy[1][0] + entropy[1][1]) {
+      ac_strategy->Set(bx + cx, by + cy + blocks_half, acs_rawKXJ);
+      SetEntropyForTransform(cx, cy + blocks_half, acs_rawKXJ,
+                             entropy_KXJ_bottom, entropy_estimate);
+    }
+  }
+}
+
+void ProcessRectACS(PassesEncoderState* JXL_RESTRICT enc_state,
+                    const ACSConfig& config, const Rect& rect) {
+  // Main philosophy here:
+  // 1. First find the best 8x8 transform for each area.
+  // 2. Merge them into larger transforms where possible, starting
+  //    from the smallest transforms (16x8 and 8x16).
+  //    Additional complication: 16x8 and 8x16 are considered
+  //    simultaneously and fairly against each other.
+  // We are looking at 64x64 squares since the YtoX and YtoB
+  // maps happen to be at that resolution, and having
+  // integral transforms cross these boundaries leads to
+  // additional complications.
+  const CompressParams& cparams = enc_state->cparams;
+  const float butteraugli_target = cparams.butteraugli_distance;
+  AcStrategyImage* ac_strategy = &enc_state->shared.ac_strategy;
+  // TODO(veluca): reuse allocations
+  auto mem = hwy::AllocateAligned<float>(5 * AcStrategy::kMaxCoeffArea);
+  auto qmem = hwy::AllocateAligned<uint32_t>(AcStrategy::kMaxCoeffArea);
+  uint32_t* JXL_RESTRICT quantized = qmem.get();
+  float* JXL_RESTRICT block = mem.get();
+  float* JXL_RESTRICT scratch_space = mem.get() + 3 * AcStrategy::kMaxCoeffArea;
+  size_t bx = rect.x0();
+  size_t by = rect.y0();
+  JXL_ASSERT(rect.xsize() <= 8);
+  JXL_ASSERT(rect.ysize() <= 8);
+  size_t tx = bx / kColorTileDimInBlocks;
+  size_t ty = by / kColorTileDimInBlocks;
+  const float cmap_factors[3] = {
+      enc_state->shared.cmap.YtoXRatio(
+          enc_state->shared.cmap.ytox_map.ConstRow(ty)[tx]),
+      0.0f,
+      enc_state->shared.cmap.YtoBRatio(
+          enc_state->shared.cmap.ytob_map.ConstRow(ty)[tx]),
+  };
+  if (cparams.speed_tier > SpeedTier::kHare) return;
+  // First compute the best 8x8 transform for each square. Later, we do not
+  // experiment with different combinations, but only use the best of the 8x8s
+  // when DCT8X8 is specified in the tree search.
+  // 8x8 transforms have 10 variants, but every larger transform is just a DCT.
+  float entropy_estimate[64] = {};
+  // Favor all 8x8 transforms (against 16x8 and larger transforms) at
+  // low butteraugli_target distances.
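+  // mul8x8 rises from about 0.6 near distance 0 towards 1.0 at large
+  // distances, so the 8x8 estimates are discounted (harder to beat by
+  // merging) exactly where the comment above wants them favored.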
+  static const float k8x8mul1 = -0.55;
+  static const float k8x8mul2 = 1.0;
+  static const float k8x8base = 1.4;
+  const float mul8x8 = k8x8mul2 + k8x8mul1 / (butteraugli_target + k8x8base);
+  for (size_t iy = 0; iy < rect.ysize(); iy++) {
+    for (size_t ix = 0; ix < rect.xsize(); ix++) {
+      float entropy = 0.0;
+      const uint8_t best_of_8x8s = FindBest8x8Transform(
+          8 * (bx + ix), 8 * (by + iy), static_cast<int>(cparams.speed_tier),
+          config, cmap_factors, ac_strategy, block, scratch_space, quantized,
+          &entropy);
+      ac_strategy->Set(bx + ix, by + iy,
+                       static_cast<AcStrategy::Type>(best_of_8x8s));
+      entropy_estimate[iy * 8 + ix] = entropy * mul8x8;
+    }
+  }
+  // Merge when a larger transform is better than the previously
+  // searched best combination of 8x8 transforms.
+  struct MergeTry {
+    AcStrategy::Type type;
+    uint8_t priority;
+    uint8_t decoding_speed_tier_max_limit;
+    uint8_t encoding_speed_tier_max_limit;
+    float entropy_mul;
+  };
+  static const float k8X16mul1 = -0.55;
+  static const float k8X16mul2 = 0.865;
+  static const float k8X16base = 1.6;
+  const float entropy_mul16X8 =
+      k8X16mul2 + k8X16mul1 / (butteraugli_target + k8X16base);
+  // const float entropy_mul16X8 = mul8X16 * 0.91195782912371126f;
+
+  static const float k16X16mul1 = -0.35;
+  static const float k16X16mul2 = 0.798;
+  static const float k16X16base = 2.0;
+  const float entropy_mul16X16 =
+      k16X16mul2 + k16X16mul1 / (butteraugli_target + k16X16base);
+  // const float entropy_mul16X16 = mul16X16 * 0.83183417727960129f;
+
+  static const float k32X16mul1 = -0.1;
+  static const float k32X16mul2 = 0.854;
+  static const float k32X16base = 2.5;
+  const float entropy_mul16X32 =
+      k32X16mul2 + k32X16mul1 / (butteraugli_target + k32X16base);
+
+  const float entropy_mul32X32 = 0.93;
+  const float entropy_mul64X64 = 1.52f;
+  // TODO(jyrki): Consider this feedback in further changes:
+  // Also, effectively, when the multipliers for smaller blocks are
+  // below 1, this raises the bar for the bigger blocks even higher;
+  // in that sense these constants are not independent (e.g. changing
+  // the constant for DCT16x32 by -5% (making it more likely) also
+  // means that DCT32x32 becomes harder to do when starting from
+  // two DCT16x32s). It might be better to make them more independent,
+  // e.g. by not applying the multiplier when storing the new entropy
+  // estimates in TryMergeToACSCandidate().
+  const MergeTry kTransformsForMerge[9] = {
+      {AcStrategy::Type::DCT16X8, 2, 4, 5, entropy_mul16X8},
+      {AcStrategy::Type::DCT8X16, 2, 4, 5, entropy_mul16X8},
+      // FindBestFirstLevelDivisionForSquare looks for DCT16X16 and its
+      // subdivisions. {AcStrategy::Type::DCT16X16, 3, entropy_mul16X16},
+      {AcStrategy::Type::DCT16X32, 4, 4, 4, entropy_mul16X32},
+      {AcStrategy::Type::DCT32X16, 4, 4, 4, entropy_mul16X32},
+      // FindBestFirstLevelDivisionForSquare looks for DCT32X32 and its
+      // subdivisions. {AcStrategy::Type::DCT32X32, 5, 1, 5,
+      // 0.9822994906548809f},
+      {AcStrategy::Type::DCT64X32, 6, 1, 3, 1.29f},
+      {AcStrategy::Type::DCT32X64, 6, 1, 3, 1.29f},
+      // {AcStrategy::Type::DCT64X64, 8, 1, 3, 2.0846542128012948f},
+  };
+  /*
+  These sizes are not yet included in the merge heuristic:
+  set(AcStrategy::Type::DCT32X8, 0.0f, 2.261390410971102f);
+  set(AcStrategy::Type::DCT8X32, 0.0f, 2.261390410971102f);
+  set(AcStrategy::Type::DCT128X128, 0.0f, 1.0f);
+  set(AcStrategy::Type::DCT128X64, 0.0f, 0.73f);
+  set(AcStrategy::Type::DCT64X128, 0.0f, 0.73f);
+  set(AcStrategy::Type::DCT256X256, 0.0f, 1.0f);
+  set(AcStrategy::Type::DCT256X128, 0.0f, 0.73f);
+  set(AcStrategy::Type::DCT128X256, 0.0f, 0.73f);
+  */
+
+  // Priority is a tricky kludge to avoid collisions so that transforms
+  // don't overlap.
+  uint8_t priority[64] = {};
+  bool enable_32x32 = cparams.decoding_speed_tier < 4;
+  for (auto tx : kTransformsForMerge) {
+    if (tx.decoding_speed_tier_max_limit < cparams.decoding_speed_tier) {
+      continue;
+    }
+    AcStrategy acs = AcStrategy::FromRawStrategy(tx.type);
+
+    for (size_t cy = 0; cy + acs.covered_blocks_y() - 1 < rect.ysize();
+         cy += acs.covered_blocks_y()) {
+      for (size_t cx = 0; cx + acs.covered_blocks_x() - 1 < rect.xsize();
+           cx += acs.covered_blocks_x()) {
+        if (cy + 7 < rect.ysize() && cx + 7 < rect.xsize()) {
+          if (cparams.decoding_speed_tier < 4 &&
+              tx.type == AcStrategy::Type::DCT32X64) {
+            // We handle DCT64X32, DCT32X64 and the DCT64X64 square at the
+            // same time.
+            if ((cy | cx) % 8 == 0) {
+              FindBestFirstLevelDivisionForSquare(
+                  8, true, bx, by, cx, cy, config, cmap_factors, ac_strategy,
+                  tx.entropy_mul, entropy_mul64X64, entropy_estimate, block,
+                  scratch_space, quantized);
+            }
+            continue;
+          } else if (tx.type == AcStrategy::Type::DCT32X16) {
+            // The corresponding split transforms were already handled
+            // above. The last column and last row, when the last column or
+            // last row is odd numbered, are still handled by TryMergeAcs.
+            continue;
+          }
+        }
+        if ((tx.type == AcStrategy::Type::DCT16X32 && cy % 4 != 0) ||
+            (tx.type == AcStrategy::Type::DCT32X16 && cx % 4 != 0)) {
+          // already covered by FindBestFirstLevelDivisionForSquare
+          continue;
+        }
+
+        if (cy + 3 < rect.ysize() && cx + 3 < rect.xsize()) {
+          if (tx.type == AcStrategy::Type::DCT16X32) {
+            // We handle DCT16X32, DCT32X16 and the DCT32X32 square at the
+            // same time.
+            if ((cy | cx) % 4 == 0) {
+              FindBestFirstLevelDivisionForSquare(
+                  4, enable_32x32, bx, by, cx, cy, config, cmap_factors,
+                  ac_strategy, tx.entropy_mul, entropy_mul32X32,
+                  entropy_estimate, block, scratch_space, quantized);
+            }
+            continue;
+          } else if (tx.type == AcStrategy::Type::DCT32X16) {
+            // The corresponding split transforms were already handled
+            // above. The last column and last row, when the last column or
+            // last row is odd numbered, are still handled by TryMergeAcs.
+            continue;
+          }
+        }
+        if ((tx.type == AcStrategy::Type::DCT16X32 && cy % 4 != 0) ||
+            (tx.type == AcStrategy::Type::DCT32X16 && cx % 4 != 0)) {
+          // already covered by FindBestFirstLevelDivisionForSquare
+          continue;
+        }
+        if (cy + 1 < rect.ysize() && cx + 1 < rect.xsize()) {
+          if (tx.type == AcStrategy::Type::DCT8X16) {
+            // We handle both DCT8X16 and DCT16X8 at the same time.
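+            // Only the position with both cy and cx even launches the
+            // joint search; it covers the other three positions of its
+            // 2x2 group.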
+ if ((cy | cx) % 2 == 0) { + FindBestFirstLevelDivisionForSquare( + 2, true, bx, by, cx, cy, config, cmap_factors, ac_strategy, + tx.entropy_mul, entropy_mul16X16, entropy_estimate, block, + scratch_space, quantized); + } + continue; + } else if (tx.type == AcStrategy::Type::DCT16X8) { + // We handled both DCT8X16 and DCT16X8 at the same time, + // and that is above. The last column and last row, + // when the last column or last row is odd numbered, + // are still handled by TryMergeAcs. + continue; + } + } + if ((tx.type == AcStrategy::Type::DCT8X16 && cy % 2 == 1) || + (tx.type == AcStrategy::Type::DCT16X8 && cx % 2 == 1)) { + // already covered by FindBestFirstLevelDivisionForSquare + continue; + } + // All other merge sizes are handled here. + // Some of the DCT16X8s and DCT8X16s will still leak through here + // when there is an odd number of 8x8 blocks, then the last row + // and column will get their DCT16X8s and DCT8X16s through the + // normal integral transform merging process. + TryMergeAcs(tx.type, bx, by, cx, cy, config, cmap_factors, ac_strategy, + tx.entropy_mul, tx.priority, &priority[0], entropy_estimate, + block, scratch_space, quantized); + } + } + } + if (cparams.speed_tier >= SpeedTier::kHare) { + return; + } + // Here we still try to do some non-aligned matching, find a few more + // 16X8, 8X16 and 16X16s between the non-2-aligned blocks. + for (size_t cy = 0; cy + 1 < rect.ysize(); ++cy) { + for (size_t cx = 0; cx + 1 < rect.xsize(); ++cx) { + if ((cy | cx) % 2 != 0) { + FindBestFirstLevelDivisionForSquare( + 2, true, bx, by, cx, cy, config, cmap_factors, ac_strategy, + entropy_mul16X8, entropy_mul16X16, entropy_estimate, block, + scratch_space, quantized); + } + } + } + // Non-aligned matching for 32X32, 16X32 and 32X16. + size_t step = cparams.speed_tier >= SpeedTier::kTortoise ? 2 : 1; + for (size_t cy = 0; cy + 3 < rect.ysize(); cy += step) { + for (size_t cx = 0; cx + 3 < rect.xsize(); cx += step) { + if ((cy | cx) % 4 == 0) { + continue; // Already tried with loop above (DCT16X32 case). + } + FindBestFirstLevelDivisionForSquare( + 4, enable_32x32, bx, by, cx, cy, config, cmap_factors, ac_strategy, + entropy_mul16X32, entropy_mul32X32, entropy_estimate, block, + scratch_space, quantized); + } + } +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { +HWY_EXPORT(ProcessRectACS); + +void AcStrategyHeuristics::Init(const Image3F& src, + PassesEncoderState* enc_state) { + this->enc_state = enc_state; + config.dequant = &enc_state->shared.matrices; + const CompressParams& cparams = enc_state->cparams; + const float butteraugli_target = cparams.butteraugli_distance; + + if (cparams.speed_tier >= SpeedTier::kCheetah) { + JXL_CHECK(enc_state->shared.matrices.EnsureComputed(1)); // DCT8 only + } else { + uint32_t acs_mask = 0; + // All transforms up to 64x64. + for (size_t i = 0; i < AcStrategy::DCT128X128; i++) { + acs_mask |= (1 << i); + } + JXL_CHECK(enc_state->shared.matrices.EnsureComputed(acs_mask)); + } + + // Image row pointers and strides. 
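+  // ACSConfig stores only base pointers and strides; EstimateEntropy reads
+  // them back through the Pixel(), Quant() and Masking() accessors.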
+  config.quant_field_row = enc_state->initial_quant_field.Row(0);
+  config.quant_field_stride = enc_state->initial_quant_field.PixelsPerRow();
+  auto& mask = enc_state->initial_quant_masking;
+  if (mask.xsize() > 0 && mask.ysize() > 0) {
+    config.masking_field_row = mask.Row(0);
+    config.masking_field_stride = mask.PixelsPerRow();
+  }
+
+  config.src_rows[0] = src.ConstPlaneRow(0, 0);
+  config.src_rows[1] = src.ConstPlaneRow(1, 0);
+  config.src_rows[2] = src.ConstPlaneRow(2, 0);
+  config.src_stride = src.PixelsPerRow();
+
+  // The entropy estimate is composed of two factors:
+  //  - an estimate of the number of bits that will be used by the block
+  //  - the information loss due to quantization
+  // The following constants control the relative weights of these components.
+  config.info_loss_multiplier = 138.0f;
+  config.info_loss_multiplier2 = 50.46839691767866;
+  // TODO(jyrki): explore base_entropy setting more.
+  // A small value (0?) works better at high distance, while a larger value
+  // may be more effective at low distance/high bpp.
+  config.base_entropy = 0.0;
+  config.zeros_mul = 7.565053364251793f;
+  // There are lots of +1 and -1 coefficients at high quality; it is
+  // beneficial to favor them. At low qualities zeros matter more
+  // and +1 / -1 coefficients are already quite harmful.
+  float slope = std::min(1.0f, butteraugli_target * (1.0f / 3));
+  config.cost1 = 1 + slope * 8.8703248061477744f;
+  config.cost2 = 4.4628149885273363f;
+  config.cost_delta = 5.3359184934516337f;
+  JXL_ASSERT(enc_state->shared.ac_strategy.xsize() ==
+             enc_state->shared.frame_dim.xsize_blocks);
+  JXL_ASSERT(enc_state->shared.ac_strategy.ysize() ==
+             enc_state->shared.frame_dim.ysize_blocks);
+}
+
+void AcStrategyHeuristics::ProcessRect(const Rect& rect) {
+  PROFILER_FUNC;
+  const CompressParams& cparams = enc_state->cparams;
+  // At Cheetah speed and faster, use DCT8 everywhere and uniform
+  // quantization.
+  if (cparams.speed_tier >= SpeedTier::kCheetah) {
+    enc_state->shared.ac_strategy.FillDCT8(rect);
+    return;
+  }
+  HWY_DYNAMIC_DISPATCH(ProcessRectACS)
+  (enc_state, config, rect);
+}
+
+void AcStrategyHeuristics::Finalize(AuxOut* aux_out) {
+  const auto& ac_strategy = enc_state->shared.ac_strategy;
+  // Accounting and debug output.
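+  // Block counts are aggregated per transform family; mirrored pairs
+  // (e.g. DCT8X16 and DCT16X8) share one counter below.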
+ if (aux_out != nullptr) { + aux_out->num_small_blocks = + ac_strategy.CountBlocks(AcStrategy::Type::IDENTITY) + + ac_strategy.CountBlocks(AcStrategy::Type::DCT2X2) + + ac_strategy.CountBlocks(AcStrategy::Type::DCT4X4); + aux_out->num_dct4x8_blocks = + ac_strategy.CountBlocks(AcStrategy::Type::DCT4X8) + + ac_strategy.CountBlocks(AcStrategy::Type::DCT8X4); + aux_out->num_afv_blocks = ac_strategy.CountBlocks(AcStrategy::Type::AFV0) + + ac_strategy.CountBlocks(AcStrategy::Type::AFV1) + + ac_strategy.CountBlocks(AcStrategy::Type::AFV2) + + ac_strategy.CountBlocks(AcStrategy::Type::AFV3); + aux_out->num_dct8_blocks = ac_strategy.CountBlocks(AcStrategy::Type::DCT); + aux_out->num_dct8x16_blocks = + ac_strategy.CountBlocks(AcStrategy::Type::DCT8X16) + + ac_strategy.CountBlocks(AcStrategy::Type::DCT16X8); + aux_out->num_dct8x32_blocks = + ac_strategy.CountBlocks(AcStrategy::Type::DCT8X32) + + ac_strategy.CountBlocks(AcStrategy::Type::DCT32X8); + aux_out->num_dct16_blocks = + ac_strategy.CountBlocks(AcStrategy::Type::DCT16X16); + aux_out->num_dct16x32_blocks = + ac_strategy.CountBlocks(AcStrategy::Type::DCT16X32) + + ac_strategy.CountBlocks(AcStrategy::Type::DCT32X16); + aux_out->num_dct32_blocks = + ac_strategy.CountBlocks(AcStrategy::Type::DCT32X32); + aux_out->num_dct32x64_blocks = + ac_strategy.CountBlocks(AcStrategy::Type::DCT32X64) + + ac_strategy.CountBlocks(AcStrategy::Type::DCT64X32); + aux_out->num_dct64_blocks = + ac_strategy.CountBlocks(AcStrategy::Type::DCT64X64); + } + + if (WantDebugOutput(aux_out)) { + DumpAcStrategy(ac_strategy, enc_state->shared.frame_dim.xsize, + enc_state->shared.frame_dim.ysize, "ac_strategy", aux_out); + } +} + +} // namespace jxl +#endif // HWY_ONCE -- cgit v1.2.3