From 36d22d82aa202bb199967e9512281e9a53db42c9 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 7 Apr 2024 21:33:14 +0200 Subject: Adding upstream version 115.7.0esr. Signed-off-by: Daniel Baumann --- .../jpeg-xl/lib/jxl/render_pipeline/stage_noise.cc | 311 +++++++++++++++++++++ 1 file changed, 311 insertions(+) create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_noise.cc (limited to 'third_party/jpeg-xl/lib/jxl/render_pipeline/stage_noise.cc') diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_noise.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_noise.cc new file mode 100644 index 0000000000..187095cf61 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_noise.cc @@ -0,0 +1,311 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/render_pipeline/stage_noise.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_noise.cc" +#include +#include + +#include "lib/jxl/sanitizers.h" +#include "lib/jxl/transfer_functions-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Max; +using hwy::HWY_NAMESPACE::ShiftRight; +using hwy::HWY_NAMESPACE::Vec; +using hwy::HWY_NAMESPACE::ZeroIfNegative; + +using D = HWY_CAPPED(float, kBlockDim); +using DI = hwy::HWY_NAMESPACE::Rebind; +using DI8 = hwy::HWY_NAMESPACE::Repartition; + +// [0, max_value] +template +static HWY_INLINE V Clamp0ToMax(D d, const V x, const V max_value) { + const auto clamped = Min(x, max_value); + return ZeroIfNegative(clamped); +} + +// x is in [0+delta, 1+delta], delta ~= 0.06 +template +typename StrengthEval::V NoiseStrength(const StrengthEval& eval, + const typename StrengthEval::V x) { + return Clamp0ToMax(D(), eval(x), Set(D(), 1.0f)); +} + +// TODO(veluca): SIMD-fy. +class StrengthEvalLut { + public: + using V = Vec; + + explicit StrengthEvalLut(const NoiseParams& noise_params) +#if HWY_TARGET == HWY_SCALAR + : noise_params_(noise_params) +#endif + { +#if HWY_TARGET != HWY_SCALAR + uint32_t lut[8]; + memcpy(lut, noise_params.lut, sizeof(lut)); + for (size_t i = 0; i < 8; i++) { + low16_lut[2 * i] = (lut[i] >> 0) & 0xFF; + low16_lut[2 * i + 1] = (lut[i] >> 8) & 0xFF; + high16_lut[2 * i] = (lut[i] >> 16) & 0xFF; + high16_lut[2 * i + 1] = (lut[i] >> 24) & 0xFF; + } +#endif + } + + V operator()(const V vx) const { + constexpr size_t kScale = NoiseParams::kNumNoisePoints - 2; + auto scaled_vx = Max(Zero(D()), Mul(vx, Set(D(), kScale))); + auto floor_x = Floor(scaled_vx); + auto frac_x = Sub(scaled_vx, floor_x); + floor_x = IfThenElse(Ge(scaled_vx, Set(D(), kScale + 1)), Set(D(), kScale), + floor_x); + frac_x = + IfThenElse(Ge(scaled_vx, Set(D(), kScale + 1)), Set(D(), 1), frac_x); + auto floor_x_int = ConvertTo(DI(), floor_x); +#if HWY_TARGET == HWY_SCALAR + auto low = Set(D(), noise_params_.lut[floor_x_int.raw]); + auto hi = Set(D(), noise_params_.lut[floor_x_int.raw + 1]); +#else + // Set each lane's bytes to {0, 0, 2x+1, 2x}. + auto floorx_indices_low = + Add(Mul(floor_x_int, Set(DI(), 0x0202)), Set(DI(), 0x0100)); + // Set each lane's bytes to {2x+1, 2x, 0, 0}. + auto floorx_indices_hi = + Add(Mul(floor_x_int, Set(DI(), 0x02020000)), Set(DI(), 0x01000000)); + // load LUT + auto low16 = BitCast(DI(), LoadDup128(DI8(), low16_lut)); + auto lowm = Set(DI(), 0xFFFF); + auto hi16 = BitCast(DI(), LoadDup128(DI8(), high16_lut)); + auto him = Set(DI(), 0xFFFF0000); + // low = noise_params.lut[floor_x] + auto low = + BitCast(D(), Or(And(TableLookupBytes(low16, floorx_indices_low), lowm), + And(TableLookupBytes(hi16, floorx_indices_hi), him))); + // hi = noise_params.lut[floor_x+1] + floorx_indices_low = Add(floorx_indices_low, Set(DI(), 0x0202)); + floorx_indices_hi = Add(floorx_indices_hi, Set(DI(), 0x02020000)); + auto hi = + BitCast(D(), Or(And(TableLookupBytes(low16, floorx_indices_low), lowm), + And(TableLookupBytes(hi16, floorx_indices_hi), him))); +#endif + return MulAdd(Sub(hi, low), frac_x, low); + } + + private: +#if HWY_TARGET != HWY_SCALAR + // noise_params.lut transformed into two 16-bit lookup tables. + HWY_ALIGN uint8_t high16_lut[16]; + HWY_ALIGN uint8_t low16_lut[16]; +#else + const NoiseParams& noise_params_; +#endif +}; + +template +void AddNoiseToRGB(const D d, const Vec rnd_noise_r, + const Vec rnd_noise_g, const Vec rnd_noise_cor, + const Vec noise_strength_g, const Vec noise_strength_r, + float ytox, float ytob, float* JXL_RESTRICT out_x, + float* JXL_RESTRICT out_y, float* JXL_RESTRICT out_b) { + const auto kRGCorr = Set(d, 0.9921875f); // 127/128 + const auto kRGNCorr = Set(d, 0.0078125f); // 1/128 + + const auto red_noise = + Mul(noise_strength_r, + MulAdd(kRGNCorr, rnd_noise_r, Mul(kRGCorr, rnd_noise_cor))); + const auto green_noise = + Mul(noise_strength_g, + MulAdd(kRGNCorr, rnd_noise_g, Mul(kRGCorr, rnd_noise_cor))); + + auto vx = LoadU(d, out_x); + auto vy = LoadU(d, out_y); + auto vb = LoadU(d, out_b); + + const auto rg_noise = Add(red_noise, green_noise); + vx = Add(MulAdd(Set(d, ytox), rg_noise, Sub(red_noise, green_noise)), vx); + vy = Add(vy, rg_noise); + vb = MulAdd(Set(d, ytob), rg_noise, vb); + + StoreU(vx, d, out_x); + StoreU(vy, d, out_y); + StoreU(vb, d, out_b); +} + +class AddNoiseStage : public RenderPipelineStage { + public: + AddNoiseStage(const NoiseParams& noise_params, + const ColorCorrelationMap& cmap, size_t first_c) + : RenderPipelineStage(RenderPipelineStage::Settings::Symmetric( + /*shift=*/0, /*border=*/0)), + noise_params_(noise_params), + cmap_(cmap), + first_c_(first_c) {} + + void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, + size_t xextra, size_t xsize, size_t xpos, size_t ypos, + size_t thread_id) const final { + PROFILER_ZONE("Noise apply"); + + if (!noise_params_.HasAny()) return; + const StrengthEvalLut noise_model(noise_params_); + D d; + const auto half = Set(d, 0.5f); + + // With the prior subtract-random Laplacian approximation, rnd_* ranges were + // about [-1.5, 1.6]; Laplacian3 about doubles this to [-3.6, 3.6], so the + // normalizer is half of what it was before (0.5). + const auto norm_const = Set(d, 0.22f); + + float ytox = cmap_.YtoXRatio(0); + float ytob = cmap_.YtoBRatio(0); + + const size_t xsize_v = RoundUpTo(xsize, Lanes(d)); + + float* JXL_RESTRICT row_x = GetInputRow(input_rows, 0, 0); + float* JXL_RESTRICT row_y = GetInputRow(input_rows, 1, 0); + float* JXL_RESTRICT row_b = GetInputRow(input_rows, 2, 0); + const float* JXL_RESTRICT row_rnd_r = + GetInputRow(input_rows, first_c_ + 0, 0); + const float* JXL_RESTRICT row_rnd_g = + GetInputRow(input_rows, first_c_ + 1, 0); + const float* JXL_RESTRICT row_rnd_c = + GetInputRow(input_rows, first_c_ + 2, 0); + // Needed by the calls to Floor() in StrengthEvalLut. Only arithmetic and + // shuffles are otherwise done on the data, so this is safe. + msan::UnpoisonMemory(row_x + xsize, (xsize_v - xsize) * sizeof(float)); + msan::UnpoisonMemory(row_y + xsize, (xsize_v - xsize) * sizeof(float)); + for (size_t x = 0; x < xsize_v; x += Lanes(d)) { + const auto vx = LoadU(d, row_x + x); + const auto vy = LoadU(d, row_y + x); + const auto in_g = Sub(vy, vx); + const auto in_r = Add(vy, vx); + const auto noise_strength_g = NoiseStrength(noise_model, Mul(in_g, half)); + const auto noise_strength_r = NoiseStrength(noise_model, Mul(in_r, half)); + const auto addit_rnd_noise_red = Mul(LoadU(d, row_rnd_r + x), norm_const); + const auto addit_rnd_noise_green = + Mul(LoadU(d, row_rnd_g + x), norm_const); + const auto addit_rnd_noise_correlated = + Mul(LoadU(d, row_rnd_c + x), norm_const); + AddNoiseToRGB(D(), addit_rnd_noise_red, addit_rnd_noise_green, + addit_rnd_noise_correlated, noise_strength_g, + noise_strength_r, ytox, ytob, row_x + x, row_y + x, + row_b + x); + } + msan::PoisonMemory(row_x + xsize, (xsize_v - xsize) * sizeof(float)); + msan::PoisonMemory(row_y + xsize, (xsize_v - xsize) * sizeof(float)); + msan::PoisonMemory(row_b + xsize, (xsize_v - xsize) * sizeof(float)); + } + + RenderPipelineChannelMode GetChannelMode(size_t c) const final { + return c >= first_c_ ? RenderPipelineChannelMode::kInput + : c < 3 ? RenderPipelineChannelMode::kInPlace + : RenderPipelineChannelMode::kIgnored; + } + + const char* GetName() const override { return "AddNoise"; } + + private: + const NoiseParams& noise_params_; + const ColorCorrelationMap& cmap_; + size_t first_c_; +}; + +std::unique_ptr GetAddNoiseStage( + const NoiseParams& noise_params, const ColorCorrelationMap& cmap, + size_t noise_c_start) { + return jxl::make_unique(noise_params, cmap, noise_c_start); +} + +class ConvolveNoiseStage : public RenderPipelineStage { + public: + explicit ConvolveNoiseStage(size_t first_c) + : RenderPipelineStage(RenderPipelineStage::Settings::Symmetric( + /*shift=*/0, /*border=*/2)), + first_c_(first_c) {} + + void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, + size_t xextra, size_t xsize, size_t xpos, size_t ypos, + size_t thread_id) const final { + PROFILER_ZONE("Noise convolve"); + + const HWY_FULL(float) d; + for (size_t c = first_c_; c < first_c_ + 3; c++) { + float* JXL_RESTRICT rows[5]; + for (size_t i = 0; i < 5; i++) { + rows[i] = GetInputRow(input_rows, c, i - 2); + } + float* JXL_RESTRICT row_out = GetOutputRow(output_rows, c, 0); + for (ssize_t x = -RoundUpTo(xextra, Lanes(d)); + x < (ssize_t)(xsize + xextra); x += Lanes(d)) { + const auto p00 = LoadU(d, rows[2] + x); + auto others = Zero(d); + // TODO(eustas): sum loaded values to reduce the calculation chain + for (ssize_t i = -2; i <= 2; i++) { + others = Add(others, LoadU(d, rows[0] + x + i)); + others = Add(others, LoadU(d, rows[1] + x + i)); + others = Add(others, LoadU(d, rows[3] + x + i)); + others = Add(others, LoadU(d, rows[4] + x + i)); + } + others = Add(others, LoadU(d, rows[2] + x - 2)); + others = Add(others, LoadU(d, rows[2] + x - 1)); + others = Add(others, LoadU(d, rows[2] + x + 1)); + others = Add(others, LoadU(d, rows[2] + x + 2)); + // 4 * (1 - box kernel) + auto pixels = MulAdd(others, Set(d, 0.16), Mul(p00, Set(d, -3.84))); + StoreU(pixels, d, row_out + x); + } + } + } + + RenderPipelineChannelMode GetChannelMode(size_t c) const final { + return c >= first_c_ ? RenderPipelineChannelMode::kInOut + : RenderPipelineChannelMode::kIgnored; + } + + const char* GetName() const override { return "ConvNoise"; } + + private: + size_t first_c_; +}; + +std::unique_ptr GetConvolveNoiseStage( + size_t noise_c_start) { + return jxl::make_unique(noise_c_start); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +HWY_EXPORT(GetAddNoiseStage); +HWY_EXPORT(GetConvolveNoiseStage); + +std::unique_ptr GetAddNoiseStage( + const NoiseParams& noise_params, const ColorCorrelationMap& cmap, + size_t noise_c_start) { + return HWY_DYNAMIC_DISPATCH(GetAddNoiseStage)(noise_params, cmap, + noise_c_start); +} + +std::unique_ptr GetConvolveNoiseStage( + size_t noise_c_start) { + return HWY_DYNAMIC_DISPATCH(GetConvolveNoiseStage)(noise_c_start); +} + +} // namespace jxl +#endif -- cgit v1.2.3