diff options
Diffstat (limited to 'third_party/jpeg-xl/lib/jpegli/render.cc')
-rw-r--r-- | third_party/jpeg-xl/lib/jpegli/render.cc | 802 |
1 files changed, 802 insertions, 0 deletions
diff --git a/third_party/jpeg-xl/lib/jpegli/render.cc b/third_party/jpeg-xl/lib/jpegli/render.cc new file mode 100644 index 0000000000..026345552a --- /dev/null +++ b/third_party/jpeg-xl/lib/jpegli/render.cc @@ -0,0 +1,802 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jpegli/render.h" + +#include <string.h> + +#include <array> +#include <atomic> +#include <cmath> +#include <cstddef> +#include <cstdint> +#include <hwy/aligned_allocator.h> + +#include "lib/jpegli/color_quantize.h" +#include "lib/jpegli/color_transform.h" +#include "lib/jpegli/decode_internal.h" +#include "lib/jpegli/error.h" +#include "lib/jpegli/idct.h" +#include "lib/jpegli/upsample.h" +#include "lib/jxl/base/byte_order.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/status.h" + +#ifdef MEMORY_SANITIZER +#define JXL_MEMORY_SANITIZER 1 +#elif defined(__has_feature) +#if __has_feature(memory_sanitizer) +#define JXL_MEMORY_SANITIZER 1 +#else +#define JXL_MEMORY_SANITIZER 0 +#endif +#else +#define JXL_MEMORY_SANITIZER 0 +#endif + +#if JXL_MEMORY_SANITIZER +#include "sanitizer/msan_interface.h" +#endif + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jpegli/render.cc" +#include <hwy/foreach_target.h> +#include <hwy/highway.h> + +HWY_BEFORE_NAMESPACE(); +namespace jpegli { +namespace HWY_NAMESPACE { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Abs; +using hwy::HWY_NAMESPACE::Add; +using hwy::HWY_NAMESPACE::Clamp; +using hwy::HWY_NAMESPACE::Gt; +using hwy::HWY_NAMESPACE::IfThenElseZero; +using hwy::HWY_NAMESPACE::Mul; +using hwy::HWY_NAMESPACE::NearestInt; +using hwy::HWY_NAMESPACE::Or; +using hwy::HWY_NAMESPACE::Rebind; +using hwy::HWY_NAMESPACE::ShiftLeftSame; +using hwy::HWY_NAMESPACE::ShiftRightSame; +using hwy::HWY_NAMESPACE::Vec; +using D = HWY_FULL(float); +using DI = HWY_FULL(int32_t); +constexpr D d; +constexpr DI di; + +void GatherBlockStats(const int16_t* JXL_RESTRICT coeffs, + const size_t coeffs_size, int32_t* JXL_RESTRICT nonzeros, + int32_t* JXL_RESTRICT sumabs) { + for (size_t i = 0; i < coeffs_size; i += Lanes(d)) { + size_t k = i % DCTSIZE2; + const Rebind<int16_t, DI> di16; + const Vec<DI> coeff = PromoteTo(di, Load(di16, coeffs + i)); + const auto abs_coeff = Abs(coeff); + const auto not_0 = Gt(abs_coeff, Zero(di)); + const auto nzero = IfThenElseZero(not_0, Set(di, 1)); + Store(Add(nzero, Load(di, nonzeros + k)), di, nonzeros + k); + Store(Add(abs_coeff, Load(di, sumabs + k)), di, sumabs + k); + } +} + +void DecenterRow(float* row, size_t xsize) { + const HWY_CAPPED(float, 8) df; + const auto c128 = Set(df, 128.0f / 255); + for (size_t x = 0; x < xsize; x += Lanes(df)) { + Store(Add(Load(df, row + x), c128), df, row + x); + } +} + +void DitherRow(j_decompress_ptr cinfo, float* row, int c, size_t y, + size_t xsize) { + jpeg_decomp_master* m = cinfo->master; + if (!m->dither_[c]) return; + const float* dither_row = + &m->dither_[c][(y & m->dither_mask_) * m->dither_size_]; + for (size_t x = 0; x < xsize; ++x) { + row[x] += dither_row[x & m->dither_mask_]; + } +} + +template <typename T> +void StoreUnsignedRow(float* JXL_RESTRICT input[], size_t x0, size_t len, + size_t num_channels, float multiplier, T* output) { + const HWY_CAPPED(float, 8) d; + auto zero = Zero(d); + auto mul = Set(d, multiplier); + const Rebind<T, decltype(d)> du; +#if JXL_MEMORY_SANITIZER + const size_t padding = hwy::RoundUpTo(len, Lanes(d)) - len; + for (size_t c = 0; c < num_channels; ++c) { + __msan_unpoison(input[c] + x0 + len, sizeof(input[c][0]) * padding); + } +#endif + if (num_channels == 1) { + for (size_t i = 0; i < len; i += Lanes(d)) { + auto v0 = Clamp(zero, Mul(LoadU(d, &input[0][x0 + i]), mul), mul); + StoreU(DemoteTo(du, NearestInt(v0)), du, &output[i]); + } + } else if (num_channels == 2) { + for (size_t i = 0; i < len; i += Lanes(d)) { + auto v0 = Clamp(zero, Mul(LoadU(d, &input[0][x0 + i]), mul), mul); + auto v1 = Clamp(zero, Mul(LoadU(d, &input[1][x0 + i]), mul), mul); + StoreInterleaved2(DemoteTo(du, NearestInt(v0)), + DemoteTo(du, NearestInt(v1)), du, &output[2 * i]); + } + } else if (num_channels == 3) { + for (size_t i = 0; i < len; i += Lanes(d)) { + auto v0 = Clamp(zero, Mul(LoadU(d, &input[0][x0 + i]), mul), mul); + auto v1 = Clamp(zero, Mul(LoadU(d, &input[1][x0 + i]), mul), mul); + auto v2 = Clamp(zero, Mul(LoadU(d, &input[2][x0 + i]), mul), mul); + StoreInterleaved3(DemoteTo(du, NearestInt(v0)), + DemoteTo(du, NearestInt(v1)), + DemoteTo(du, NearestInt(v2)), du, &output[3 * i]); + } + } else if (num_channels == 4) { + for (size_t i = 0; i < len; i += Lanes(d)) { + auto v0 = Clamp(zero, Mul(LoadU(d, &input[0][x0 + i]), mul), mul); + auto v1 = Clamp(zero, Mul(LoadU(d, &input[1][x0 + i]), mul), mul); + auto v2 = Clamp(zero, Mul(LoadU(d, &input[2][x0 + i]), mul), mul); + auto v3 = Clamp(zero, Mul(LoadU(d, &input[3][x0 + i]), mul), mul); + StoreInterleaved4(DemoteTo(du, NearestInt(v0)), + DemoteTo(du, NearestInt(v1)), + DemoteTo(du, NearestInt(v2)), + DemoteTo(du, NearestInt(v3)), du, &output[4 * i]); + } + } +#if JXL_MEMORY_SANITIZER + __msan_poison(output + num_channels * len, + sizeof(output[0]) * num_channels * padding); +#endif +} + +void StoreFloatRow(float* JXL_RESTRICT input[3], size_t x0, size_t len, + size_t num_channels, float* output) { + const HWY_CAPPED(float, 8) d; + if (num_channels == 1) { + memcpy(output, input[0] + x0, len * sizeof(output[0])); + } else if (num_channels == 2) { + for (size_t i = 0; i < len; i += Lanes(d)) { + StoreInterleaved2(LoadU(d, &input[0][x0 + i]), + LoadU(d, &input[1][x0 + i]), d, &output[2 * i]); + } + } else if (num_channels == 3) { + for (size_t i = 0; i < len; i += Lanes(d)) { + StoreInterleaved3(LoadU(d, &input[0][x0 + i]), + LoadU(d, &input[1][x0 + i]), + LoadU(d, &input[2][x0 + i]), d, &output[3 * i]); + } + } else if (num_channels == 4) { + for (size_t i = 0; i < len; i += Lanes(d)) { + StoreInterleaved4(LoadU(d, &input[0][x0 + i]), + LoadU(d, &input[1][x0 + i]), + LoadU(d, &input[2][x0 + i]), + LoadU(d, &input[3][x0 + i]), d, &output[4 * i]); + } + } +} + +static constexpr float kFSWeightMR = 7.0f / 16.0f; +static constexpr float kFSWeightBL = 3.0f / 16.0f; +static constexpr float kFSWeightBM = 5.0f / 16.0f; +static constexpr float kFSWeightBR = 1.0f / 16.0f; + +float LimitError(float error) { + float abserror = std::abs(error); + if (abserror > 48.0f) { + abserror = 32.0f; + } else if (abserror > 16.0f) { + abserror = 0.5f * abserror + 8.0f; + } + return error > 0.0f ? abserror : -abserror; +} + +void WriteToOutput(j_decompress_ptr cinfo, float* JXL_RESTRICT rows[], + size_t xoffset, size_t len, size_t num_channels, + uint8_t* JXL_RESTRICT output) { + jpeg_decomp_master* m = cinfo->master; + uint8_t* JXL_RESTRICT scratch_space = m->output_scratch_; + if (cinfo->quantize_colors && m->quant_pass_ == 1) { + float* error_row[kMaxComponents]; + float* next_error_row[kMaxComponents]; + if (cinfo->dither_mode == JDITHER_ORDERED) { + for (size_t c = 0; c < num_channels; ++c) { + DitherRow(cinfo, &rows[c][xoffset], c, cinfo->output_scanline, + cinfo->output_width); + } + } else if (cinfo->dither_mode == JDITHER_FS) { + for (size_t c = 0; c < num_channels; ++c) { + if (cinfo->output_scanline % 2 == 0) { + error_row[c] = m->error_row_[c]; + next_error_row[c] = m->error_row_[c + kMaxComponents]; + } else { + error_row[c] = m->error_row_[c + kMaxComponents]; + next_error_row[c] = m->error_row_[c]; + } + memset(next_error_row[c], 0.0, cinfo->output_width * sizeof(float)); + } + } + const float mul = 255.0f; + if (cinfo->dither_mode != JDITHER_FS) { + StoreUnsignedRow(rows, xoffset, len, num_channels, mul, scratch_space); + } + for (size_t i = 0; i < len; ++i) { + uint8_t* pixel = &scratch_space[num_channels * i]; + if (cinfo->dither_mode == JDITHER_FS) { + for (size_t c = 0; c < num_channels; ++c) { + float val = rows[c][i] * mul + LimitError(error_row[c][i]); + pixel[c] = std::round(std::min(255.0f, std::max(0.0f, val))); + } + } + int index = LookupColorIndex(cinfo, pixel); + output[i] = index; + if (cinfo->dither_mode == JDITHER_FS) { + size_t prev_i = i > 0 ? i - 1 : 0; + size_t next_i = i + 1 < len ? i + 1 : len - 1; + for (size_t c = 0; c < num_channels; ++c) { + float error = pixel[c] - cinfo->colormap[c][index]; + error_row[c][next_i] += kFSWeightMR * error; + next_error_row[c][prev_i] += kFSWeightBL * error; + next_error_row[c][i] += kFSWeightBM * error; + next_error_row[c][next_i] += kFSWeightBR * error; + } + } + } + } else if (m->output_data_type_ == JPEGLI_TYPE_UINT8) { + const float mul = 255.0; + StoreUnsignedRow(rows, xoffset, len, num_channels, mul, scratch_space); + memcpy(output, scratch_space, len * num_channels); + } else if (m->output_data_type_ == JPEGLI_TYPE_UINT16) { + const float mul = 65535.0; + uint16_t* tmp = reinterpret_cast<uint16_t*>(scratch_space); + StoreUnsignedRow(rows, xoffset, len, num_channels, mul, tmp); + if (m->swap_endianness_) { + const HWY_CAPPED(uint16_t, 8) du; + size_t output_len = len * num_channels; + for (size_t j = 0; j < output_len; j += Lanes(du)) { + auto v = LoadU(du, tmp + j); + auto vswap = Or(ShiftRightSame(v, 8), ShiftLeftSame(v, 8)); + StoreU(vswap, du, tmp + j); + } + } + memcpy(output, tmp, len * num_channels * 2); + } else if (m->output_data_type_ == JPEGLI_TYPE_FLOAT) { + float* tmp = reinterpret_cast<float*>(scratch_space); + StoreFloatRow(rows, xoffset, len, num_channels, tmp); + if (m->swap_endianness_) { + size_t output_len = len * num_channels; + for (size_t j = 0; j < output_len; ++j) { + tmp[j] = BSwapFloat(tmp[j]); + } + } + memcpy(output, tmp, len * num_channels * 4); + } +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jpegli +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace jpegli { + +HWY_EXPORT(GatherBlockStats); +HWY_EXPORT(WriteToOutput); +HWY_EXPORT(DecenterRow); + +void GatherBlockStats(const int16_t* JXL_RESTRICT coeffs, + const size_t coeffs_size, int32_t* JXL_RESTRICT nonzeros, + int32_t* JXL_RESTRICT sumabs) { + return HWY_DYNAMIC_DISPATCH(GatherBlockStats)(coeffs, coeffs_size, nonzeros, + sumabs); +} + +void WriteToOutput(j_decompress_ptr cinfo, float* JXL_RESTRICT rows[], + size_t xoffset, size_t len, size_t num_channels, + uint8_t* JXL_RESTRICT output) { + return HWY_DYNAMIC_DISPATCH(WriteToOutput)(cinfo, rows, xoffset, len, + num_channels, output); +} + +void DecenterRow(float* row, size_t xsize) { + return HWY_DYNAMIC_DISPATCH(DecenterRow)(row, xsize); +} + +// Padding for horizontal chroma upsampling. +constexpr size_t kPaddingLeft = 64; +constexpr size_t kPaddingRight = 64; + +bool ShouldApplyDequantBiases(j_decompress_ptr cinfo, int ci) { + const auto& compinfo = cinfo->comp_info[ci]; + return (compinfo.h_samp_factor == cinfo->max_h_samp_factor && + compinfo.v_samp_factor == cinfo->max_v_samp_factor); +} + +// See the following article for the details: +// J. R. Price and M. Rabbani, "Dequantization bias for JPEG decompression" +// Proceedings International Conference on Information Technology: Coding and +// Computing (Cat. No.PR00540), 2000, pp. 30-35, doi: 10.1109/ITCC.2000.844179. +void ComputeOptimalLaplacianBiases(const int num_blocks, const int* nonzeros, + const int* sumabs, float* biases) { + for (size_t k = 1; k < DCTSIZE2; ++k) { + if (nonzeros[k] == 0) { + biases[k] = 0.5f; + continue; + } + // Notation adapted from the article + float N = num_blocks; + float N1 = nonzeros[k]; + float N0 = num_blocks - N1; + float S = sumabs[k]; + // Compute gamma from N0, N1, N, S (eq. 11), with A and B being just + // temporary grouping of terms. + float A = 4.0 * S + 2.0 * N; + float B = 4.0 * S - 2.0 * N1; + float gamma = (-1.0 * N0 + std::sqrt(N0 * N0 * 1.0 + A * B)) / A; + float gamma2 = gamma * gamma; + // The bias is computed from gamma with (eq. 5), where the quantization + // multiplier Q can be factored out and thus the bias can be applied + // directly on the quantized coefficient. + biases[k] = + 0.5 * (((1.0 + gamma2) / (1.0 - gamma2)) + 1.0 / std::log(gamma)); + } +} + +constexpr std::array<int, SAVED_COEFS> Q_POS = {0, 1, 8, 16, 9, + 2, 3, 10, 17, 24}; + +bool is_nonzero_quantizers(const JQUANT_TBL* qtable) { + return std::all_of(Q_POS.begin(), Q_POS.end(), + [&](int pos) { return qtable->quantval[pos] != 0; }); +} + +// Determine whether smoothing should be applied during decompression +bool do_smoothing(j_decompress_ptr cinfo) { + jpeg_decomp_master* m = cinfo->master; + bool smoothing_useful = false; + + if (!cinfo->progressive_mode || cinfo->coef_bits == nullptr) { + return false; + } + auto coef_bits_latch = m->coef_bits_latch; + auto prev_coef_bits_latch = m->prev_coef_bits_latch; + + for (int ci = 0; ci < cinfo->num_components; ci++) { + jpeg_component_info* compptr = &cinfo->comp_info[ci]; + JQUANT_TBL* qtable = compptr->quant_table; + int* coef_bits = cinfo->coef_bits[ci]; + int* prev_coef_bits = cinfo->coef_bits[ci + cinfo->num_components]; + + // Return early if conditions for smoothing are not met + if (qtable == nullptr || !is_nonzero_quantizers(qtable) || + coef_bits[0] < 0) { + return false; + } + + coef_bits_latch[ci][0] = coef_bits[0]; + + for (int coefi = 1; coefi < SAVED_COEFS; coefi++) { + prev_coef_bits_latch[ci][coefi] = + cinfo->input_scan_number > 1 ? prev_coef_bits[coefi] : -1; + if (coef_bits[coefi] != 0) { + smoothing_useful = true; + } + coef_bits_latch[ci][coefi] = coef_bits[coefi]; + } + } + + return smoothing_useful; +} + +void PredictSmooth(j_decompress_ptr cinfo, JBLOCKARRAY blocks, int component, + size_t bx, int iy) { + const size_t imcu_row = cinfo->output_iMCU_row; + int16_t* scratch = cinfo->master->smoothing_scratch_; + std::vector<int> Q_VAL(SAVED_COEFS); + int* coef_bits; + + std::array<std::array<int, 5>, 5> dc_values; + auto& compinfo = cinfo->comp_info[component]; + const size_t by0 = imcu_row * compinfo.v_samp_factor; + const size_t by = by0 + iy; + + int prev_iy = by > 0 ? iy - 1 : 0; + int prev_prev_iy = by > 1 ? iy - 2 : prev_iy; + int next_iy = by + 1 < compinfo.height_in_blocks ? iy + 1 : iy; + int next_next_iy = by + 2 < compinfo.height_in_blocks ? iy + 2 : next_iy; + + const int16_t* cur_row = blocks[iy][bx]; + const int16_t* prev_row = blocks[prev_iy][bx]; + const int16_t* prev_prev_row = blocks[prev_prev_iy][bx]; + const int16_t* next_row = blocks[next_iy][bx]; + const int16_t* next_next_row = blocks[next_next_iy][bx]; + + int prev_block_ind = bx ? -DCTSIZE2 : 0; + int prev_prev_block_ind = bx > 1 ? -2 * DCTSIZE2 : prev_block_ind; + int next_block_ind = bx + 1 < compinfo.width_in_blocks ? DCTSIZE2 : 0; + int next_next_block_ind = + bx + 2 < compinfo.width_in_blocks ? DCTSIZE2 * 2 : next_block_ind; + + std::array<const int16_t*, 5> row_ptrs = {prev_prev_row, prev_row, cur_row, + next_row, next_next_row}; + std::array<int, 5> block_inds = {prev_prev_block_ind, prev_block_ind, 0, + next_block_ind, next_next_block_ind}; + + memcpy(scratch, cur_row, DCTSIZE2 * sizeof(cur_row[0])); + + for (int r = 0; r < 5; ++r) { + for (int c = 0; c < 5; ++c) { + dc_values[r][c] = row_ptrs[r][block_inds[c]]; + } + } + // Get the correct coef_bits: In case of an incomplete scan, we use the + // prev coeficients. + if (cinfo->output_iMCU_row + 1 > cinfo->input_iMCU_row) { + coef_bits = cinfo->master->prev_coef_bits_latch[component]; + } else { + coef_bits = cinfo->master->coef_bits_latch[component]; + } + + bool change_dc = true; + for (int i = 1; i < SAVED_COEFS; i++) { + if (coef_bits[i] != -1) { + change_dc = false; + break; + } + } + + JQUANT_TBL* quanttbl = cinfo->quant_tbl_ptrs[compinfo.quant_tbl_no]; + for (size_t i = 0; i < 6; ++i) { + Q_VAL[i] = quanttbl->quantval[Q_POS[i]]; + } + if (change_dc) { + for (size_t i = 6; i < SAVED_COEFS; ++i) { + Q_VAL[i] = quanttbl->quantval[Q_POS[i]]; + } + } + auto calculate_dct_value = [&](int coef_index) { + int64_t num = 0; + int pred; + int Al; + // we use the symmetry of the smoothing matrices by transposing the 5x5 dc + // matrix in that case. + bool swap_indices = coef_index == 2 || coef_index == 5 || coef_index == 8 || + coef_index == 9; + auto dc = [&](int i, int j) { + return swap_indices ? dc_values[j][i] : dc_values[i][j]; + }; + Al = coef_bits[coef_index]; + switch (coef_index) { + case 0: + // set the DC + num = (-2 * dc(0, 0) - 6 * dc(0, 1) - 8 * dc(0, 2) - 6 * dc(0, 3) - + 2 * dc(0, 4) - 6 * dc(1, 0) + 6 * dc(1, 1) + 42 * dc(1, 2) + + 6 * dc(1, 3) - 6 * dc(1, 4) - 8 * dc(2, 0) + 42 * dc(2, 1) + + 152 * dc(2, 2) + 42 * dc(2, 3) - 8 * dc(2, 4) - 6 * dc(3, 0) + + 6 * dc(3, 1) + 42 * dc(3, 2) + 6 * dc(3, 3) - 6 * dc(3, 4) - + 2 * dc(4, 0) - 6 * dc(4, 1) - 8 * dc(4, 2) - 6 * dc(4, 3) - + 2 * dc(4, 4)); + // special case: for the DC the dequantization is different + Al = 0; + break; + case 1: + case 2: + // set Q01 or Q10 + num = (change_dc ? (-dc(0, 0) - dc(0, 1) + dc(0, 3) + dc(0, 4) - + 3 * dc(1, 0) + 13 * dc(1, 1) - 13 * dc(1, 3) + + 3 * dc(1, 4) - 3 * dc(2, 0) + 38 * dc(2, 1) - + 38 * dc(2, 3) + 3 * dc(2, 4) - 3 * dc(3, 0) + + 13 * dc(3, 1) - 13 * dc(3, 3) + 3 * dc(3, 4) - + dc(4, 0) - dc(4, 1) + dc(4, 3) + dc(4, 4)) + : (-7 * dc(2, 0) + 50 * dc(2, 1) - 50 * dc(2, 3) + + 7 * dc(2, 4))); + break; + case 3: + case 5: + // set Q02 or Q20 + num = (change_dc + ? dc(0, 2) + 2 * dc(1, 1) + 7 * dc(1, 2) + 2 * dc(1, 3) - + 5 * dc(2, 1) - 14 * dc(2, 2) - 5 * dc(2, 3) + + 2 * dc(3, 1) + 7 * dc(3, 2) + 2 * dc(3, 3) + dc(4, 2) + : (-dc(0, 2) + 13 * dc(1, 2) - 24 * dc(2, 2) + + 13 * dc(3, 2) - dc(4, 2))); + break; + case 4: + // set Q11 + num = + (change_dc ? -dc(0, 0) + dc(0, 4) + 9 * dc(1, 1) - 9 * dc(1, 3) - + 9 * dc(3, 1) + 9 * dc(3, 3) + dc(4, 0) - dc(4, 4) + : (dc(1, 4) + dc(3, 0) - 10 * dc(3, 1) + 10 * dc(3, 3) - + dc(0, 1) - dc(3, 4) + dc(4, 1) - dc(4, 3) + dc(0, 3) - + dc(1, 0) + 10 * dc(1, 1) - 10 * dc(1, 3))); + break; + case 6: + case 9: + // set Q03 or Q30 + num = (dc(1, 1) - dc(1, 3) + 2 * dc(2, 1) - 2 * dc(2, 3) + dc(3, 1) - + dc(3, 3)); + break; + case 7: + case 8: + // set Q12 and Q21 + num = (dc(1, 1) - 3 * dc(1, 2) + dc(1, 3) - dc(3, 1) + 3 * dc(3, 2) - + dc(3, 3)); + break; + } + num = Q_VAL[0] * num; + if (num >= 0) { + pred = ((Q_VAL[coef_index] << 7) + num) / (Q_VAL[coef_index] << 8); + if (Al > 0 && pred >= (1 << Al)) pred = (1 << Al) - 1; + } else { + pred = ((Q_VAL[coef_index] << 7) - num) / (Q_VAL[coef_index] << 8); + if (Al > 0 && pred >= (1 << Al)) pred = (1 << Al) - 1; + pred = -pred; + } + return static_cast<int16_t>(pred); + }; + + int loop_end = change_dc ? SAVED_COEFS : 6; + for (int i = 1; i < loop_end; ++i) { + if (coef_bits[i] != 0 && scratch[Q_POS[i]] == 0) { + scratch[Q_POS[i]] = calculate_dct_value(i); + } + } + if (change_dc) { + scratch[0] = calculate_dct_value(0); + } +} + +void PrepareForOutput(j_decompress_ptr cinfo) { + jpeg_decomp_master* m = cinfo->master; + size_t iMCU_width = cinfo->max_h_samp_factor * m->min_scaled_dct_size; + size_t output_stride = m->iMCU_cols_ * iMCU_width; + for (int c = 0; c < cinfo->num_components; ++c) { + const auto& comp = cinfo->comp_info[c]; + size_t cheight = comp.v_samp_factor * m->scaled_dct_size[c]; + m->raw_height_[c] = cinfo->total_iMCU_rows * cheight; + m->raw_output_[c].Allocate(cinfo, 3 * cheight, output_stride); + } + int num_all_components = + std::max(cinfo->out_color_components, cinfo->num_components); + for (int c = 0; c < num_all_components; ++c) { + m->render_output_[c].Allocate(cinfo, cinfo->max_v_samp_factor, + output_stride); + } + m->idct_scratch_ = Allocate<float>(cinfo, 5 * DCTSIZE2, JPOOL_IMAGE_ALIGNED); + m->upsample_scratch_ = Allocate<float>( + cinfo, output_stride + kPaddingLeft + kPaddingRight, JPOOL_IMAGE_ALIGNED); + size_t bytes_per_sample = jpegli_bytes_per_sample(m->output_data_type_); + size_t bytes_per_pixel = cinfo->out_color_components * bytes_per_sample; + size_t scratch_stride = RoundUpTo(output_stride, HWY_ALIGNMENT); + m->output_scratch_ = Allocate<uint8_t>( + cinfo, bytes_per_pixel * scratch_stride, JPOOL_IMAGE_ALIGNED); + m->smoothing_scratch_ = + Allocate<int16_t>(cinfo, DCTSIZE2, JPOOL_IMAGE_ALIGNED); + bool smoothing = do_smoothing(cinfo); + m->apply_smoothing = smoothing && cinfo->do_block_smoothing; + size_t coeffs_per_block = cinfo->num_components * DCTSIZE2; + m->nonzeros_ = Allocate<int>(cinfo, coeffs_per_block, JPOOL_IMAGE_ALIGNED); + m->sumabs_ = Allocate<int>(cinfo, coeffs_per_block, JPOOL_IMAGE_ALIGNED); + memset(m->nonzeros_, 0, coeffs_per_block * sizeof(m->nonzeros_[0])); + memset(m->sumabs_, 0, coeffs_per_block * sizeof(m->sumabs_[0])); + memset(m->num_processed_blocks_, 0, sizeof(m->num_processed_blocks_)); + m->biases_ = Allocate<float>(cinfo, coeffs_per_block, JPOOL_IMAGE_ALIGNED); + memset(m->biases_, 0, coeffs_per_block * sizeof(m->biases_[0])); + cinfo->output_iMCU_row = 0; + cinfo->output_scanline = 0; + const float kDequantScale = 1.0f / (8 * 255); + if (m->dequant_ == nullptr) { + m->dequant_ = Allocate<float>(cinfo, coeffs_per_block, JPOOL_IMAGE_ALIGNED); + memset(m->dequant_, 0, coeffs_per_block * sizeof(float)); + } + for (int c = 0; c < cinfo->num_components; c++) { + const auto& comp = cinfo->comp_info[c]; + JQUANT_TBL* table = comp.quant_table; + if (table == nullptr) continue; + for (size_t k = 0; k < DCTSIZE2; ++k) { + m->dequant_[c * DCTSIZE2 + k] = table->quantval[k] * kDequantScale; + } + } + ChooseInverseTransform(cinfo); + ChooseColorTransform(cinfo); +} + +void DecodeCurrentiMCURow(j_decompress_ptr cinfo) { + jpeg_decomp_master* m = cinfo->master; + const size_t imcu_row = cinfo->output_iMCU_row; + JBLOCKARRAY ba[kMaxComponents]; + for (int c = 0; c < cinfo->num_components; ++c) { + const jpeg_component_info* comp = &cinfo->comp_info[c]; + int by0 = imcu_row * comp->v_samp_factor; + int block_rows_left = comp->height_in_blocks - by0; + int max_block_rows = std::min(comp->v_samp_factor, block_rows_left); + int offset = m->streaming_mode_ ? 0 : by0; + ba[c] = (*cinfo->mem->access_virt_barray)( + reinterpret_cast<j_common_ptr>(cinfo), m->coef_arrays[c], offset, + max_block_rows, false); + } + for (int c = 0; c < cinfo->num_components; ++c) { + size_t k0 = c * DCTSIZE2; + auto& compinfo = cinfo->comp_info[c]; + size_t block_row = imcu_row * compinfo.v_samp_factor; + if (ShouldApplyDequantBiases(cinfo, c)) { + // Update statistics for this iMCU row. + for (int iy = 0; iy < compinfo.v_samp_factor; ++iy) { + size_t by = block_row + iy; + if (by >= compinfo.height_in_blocks) { + continue; + } + int16_t* JXL_RESTRICT coeffs = &ba[c][iy][0][0]; + size_t num = compinfo.width_in_blocks * DCTSIZE2; + GatherBlockStats(coeffs, num, &m->nonzeros_[k0], &m->sumabs_[k0]); + m->num_processed_blocks_[c] += compinfo.width_in_blocks; + } + if (imcu_row % 4 == 3) { + // Re-compute optimal biases every few iMCU-rows. + ComputeOptimalLaplacianBiases(m->num_processed_blocks_[c], + &m->nonzeros_[k0], &m->sumabs_[k0], + &m->biases_[k0]); + } + } + RowBuffer<float>* raw_out = &m->raw_output_[c]; + for (int iy = 0; iy < compinfo.v_samp_factor; ++iy) { + size_t by = block_row + iy; + if (by >= compinfo.height_in_blocks) { + continue; + } + size_t dctsize = m->scaled_dct_size[c]; + int16_t* JXL_RESTRICT row_in = &ba[c][iy][0][0]; + float* JXL_RESTRICT row_out = raw_out->Row(by * dctsize); + for (size_t bx = 0; bx < compinfo.width_in_blocks; ++bx) { + if (m->apply_smoothing) { + PredictSmooth(cinfo, ba[c], c, bx, iy); + (*m->inverse_transform[c])(m->smoothing_scratch_, &m->dequant_[k0], + &m->biases_[k0], m->idct_scratch_, + &row_out[bx * dctsize], raw_out->stride(), + dctsize); + } else { + (*m->inverse_transform[c])(&row_in[bx * DCTSIZE2], &m->dequant_[k0], + &m->biases_[k0], m->idct_scratch_, + &row_out[bx * dctsize], raw_out->stride(), + dctsize); + } + } + if (m->streaming_mode_) { + memset(row_in, 0, compinfo.width_in_blocks * sizeof(JBLOCK)); + } + } + } +} + +void ProcessRawOutput(j_decompress_ptr cinfo, JSAMPIMAGE data) { + jpegli::DecodeCurrentiMCURow(cinfo); + jpeg_decomp_master* m = cinfo->master; + for (int c = 0; c < cinfo->num_components; ++c) { + const auto& compinfo = cinfo->comp_info[c]; + size_t comp_width = compinfo.width_in_blocks * DCTSIZE; + size_t comp_height = compinfo.height_in_blocks * DCTSIZE; + size_t comp_nrows = compinfo.v_samp_factor * DCTSIZE; + size_t y0 = cinfo->output_iMCU_row * compinfo.v_samp_factor * DCTSIZE; + size_t y1 = std::min(y0 + comp_nrows, comp_height); + for (size_t y = y0; y < y1; ++y) { + float* rows[1] = {m->raw_output_[c].Row(y)}; + uint8_t* output = data[c][y - y0]; + DecenterRow(rows[0], comp_width); + WriteToOutput(cinfo, rows, 0, comp_width, 1, output); + } + } + ++cinfo->output_iMCU_row; + cinfo->output_scanline += cinfo->max_v_samp_factor * DCTSIZE; + if (cinfo->output_scanline >= cinfo->output_height) { + ++m->output_passes_done_; + } +} + +void ProcessOutput(j_decompress_ptr cinfo, size_t* num_output_rows, + JSAMPARRAY scanlines, size_t max_output_rows) { + jpeg_decomp_master* m = cinfo->master; + const int vfactor = cinfo->max_v_samp_factor; + const int hfactor = cinfo->max_h_samp_factor; + const size_t imcu_row = cinfo->output_iMCU_row; + const size_t imcu_height = vfactor * m->min_scaled_dct_size; + const size_t imcu_width = hfactor * m->min_scaled_dct_size; + const size_t output_width = m->iMCU_cols_ * imcu_width; + if (imcu_row == cinfo->total_iMCU_rows || + (imcu_row > 1 && cinfo->output_scanline < (imcu_row - 1) * imcu_height)) { + // We are ready to output some scanlines. + size_t ybegin = cinfo->output_scanline; + size_t yend = + (imcu_row == cinfo->total_iMCU_rows ? cinfo->output_height + : (imcu_row - 1) * imcu_height); + yend = std::min<size_t>(yend, ybegin + max_output_rows - *num_output_rows); + size_t yb = (ybegin / vfactor) * vfactor; + size_t ye = DivCeil(yend, vfactor) * vfactor; + for (size_t y = yb; y < ye; y += vfactor) { + for (int c = 0; c < cinfo->num_components; ++c) { + RowBuffer<float>* raw_out = &m->raw_output_[c]; + RowBuffer<float>* render_out = &m->render_output_[c]; + int line_groups = vfactor / m->v_factor[c]; + size_t yc = y / m->v_factor[c]; + for (int dy = 0; dy < line_groups; ++dy) { + if (cinfo->do_fancy_upsampling && m->v_factor[c] == 2) { + size_t ymid = yc + dy; + const float* JXL_RESTRICT row_mid = raw_out->Row(ymid); + const float* JXL_RESTRICT row_top = + ymid == 0 ? row_mid : raw_out->Row(ymid - 1); + const float* JXL_RESTRICT row_bot = ymid + 1 == m->raw_height_[c] + ? row_mid + : raw_out->Row(ymid + 1); + Upsample2Vertical(row_top, row_mid, row_bot, + render_out->Row(2 * dy), + render_out->Row(2 * dy + 1), output_width); + } else { + for (int yix = 0; yix < m->v_factor[c]; ++yix) { + size_t ymid = yc + dy; + memcpy(render_out->Row(m->v_factor[c] * dy + yix), + raw_out->Row(ymid), raw_out->xsize() * sizeof(float)); + } + } + } + } + for (int yix = 0; yix < vfactor; ++yix) { + if (y + yix < ybegin || y + yix >= yend) continue; + float* rows[kMaxComponents]; + int num_all_components = + std::max(cinfo->out_color_components, cinfo->num_components); + for (int c = 0; c < num_all_components; ++c) { + rows[c] = m->render_output_[c].Row(yix); + } + (*m->color_transform)(rows, output_width); + for (int c = 0; c < cinfo->out_color_components; ++c) { + // Undo the centering of the sample values around zero. + DecenterRow(rows[c], output_width); + } + if (scanlines) { + uint8_t* output = scanlines[*num_output_rows]; + WriteToOutput(cinfo, rows, m->xoffset_, cinfo->output_width, + cinfo->out_color_components, output); + } + JXL_ASSERT(cinfo->output_scanline == y + yix); + ++cinfo->output_scanline; + ++(*num_output_rows); + if (cinfo->output_scanline == cinfo->output_height) { + ++m->output_passes_done_; + } + } + } + } else { + DecodeCurrentiMCURow(cinfo); + for (int c = 0; c < cinfo->num_components; ++c) { + if (m->h_factor[c] == 1) continue; + const auto& compinfo = cinfo->comp_info[c]; + RowBuffer<float>* raw_out = &m->raw_output_[c]; + size_t cheight = compinfo.v_samp_factor * m->scaled_dct_size[c]; + size_t y0 = imcu_row * cheight; + if (cinfo->do_fancy_upsampling && m->h_factor[c] == 2) { + for (size_t iy = 0; iy < cheight; ++iy) { + float* JXL_RESTRICT row = raw_out->Row(y0 + iy); + Upsample2Horizontal(row, m->upsample_scratch_, output_width); + } + } else { + for (size_t iy = 0; iy < cheight; ++iy) { + float* JXL_RESTRICT row = raw_out->Row(y0 + iy); + float* JXL_RESTRICT tmp = m->upsample_scratch_; + // TODO(szabadka) SIMDify this. + for (size_t x = 0; x < output_width; ++x) { + tmp[x] = row[x / m->h_factor[c]]; + } + memcpy(row, tmp, output_width * sizeof(tmp[0])); + } + } + } + ++cinfo->output_iMCU_row; + } +} + +} // namespace jpegli +#endif // HWY_ONCE |