Diffstat (limited to 'third_party/jpeg-xl/lib/jxl/render_pipeline')
-rw-r--r--  third_party/jpeg-xl/lib/jxl/render_pipeline/low_memory_render_pipeline.cc  864
-rw-r--r--  third_party/jpeg-xl/lib/jxl/render_pipeline/low_memory_render_pipeline.h  111
-rw-r--r--  third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline.cc  132
-rw-r--r--  third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline.h  139
-rw-r--r--  third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline_stage.h  171
-rw-r--r--  third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline_test.cc  579
-rw-r--r--  third_party/jpeg-xl/lib/jxl/render_pipeline/simple_render_pipeline.cc  266
-rw-r--r--  third_party/jpeg-xl/lib/jxl/render_pipeline/simple_render_pipeline.h  37
-rw-r--r--  third_party/jpeg-xl/lib/jxl/render_pipeline/stage_blending.cc  250
-rw-r--r--  third_party/jpeg-xl/lib/jxl/render_pipeline/stage_blending.h  25
-rw-r--r--  third_party/jpeg-xl/lib/jxl/render_pipeline/stage_chroma_upsampling.cc  127
-rw-r--r--  third_party/jpeg-xl/lib/jxl/render_pipeline/stage_chroma_upsampling.h  26
-rw-r--r--  third_party/jpeg-xl/lib/jxl/render_pipeline/stage_cms.cc  134
-rw-r--r--  third_party/jpeg-xl/lib/jxl/render_pipeline/stage_cms.h  21
-rw-r--r--  third_party/jpeg-xl/lib/jxl/render_pipeline/stage_epf.cc  526
-rw-r--r--  third_party/jpeg-xl/lib/jxl/render_pipeline/stage_epf.h  31
-rw-r--r--  third_party/jpeg-xl/lib/jxl/render_pipeline/stage_from_linear.cc  194
-rw-r--r--  third_party/jpeg-xl/lib/jxl/render_pipeline/stage_from_linear.h  20
-rw-r--r--  third_party/jpeg-xl/lib/jxl/render_pipeline/stage_gaborish.cc  120
-rw-r--r--  third_party/jpeg-xl/lib/jxl/render_pipeline/stage_gaborish.h  24
-rw-r--r--  third_party/jpeg-xl/lib/jxl/render_pipeline/stage_noise.cc  316
-rw-r--r--  third_party/jpeg-xl/lib/jxl/render_pipeline/stage_noise.h  32
-rw-r--r--  third_party/jpeg-xl/lib/jxl/render_pipeline/stage_patches.cc  47
-rw-r--r--  third_party/jpeg-xl/lib/jxl/render_pipeline/stage_patches.h  22
-rw-r--r--  third_party/jpeg-xl/lib/jxl/render_pipeline/stage_splines.cc  62
-rw-r--r--  third_party/jpeg-xl/lib/jxl/render_pipeline/stage_splines.h  21
-rw-r--r--  third_party/jpeg-xl/lib/jxl/render_pipeline/stage_spot.cc  51
-rw-r--r--  third_party/jpeg-xl/lib/jxl/render_pipeline/stage_spot.h  21
-rw-r--r--  third_party/jpeg-xl/lib/jxl/render_pipeline/stage_to_linear.cc  203
-rw-r--r--  third_party/jpeg-xl/lib/jxl/render_pipeline/stage_to_linear.h  21
-rw-r--r--  third_party/jpeg-xl/lib/jxl/render_pipeline/stage_tone_mapping.cc  147
-rw-r--r--  third_party/jpeg-xl/lib/jxl/render_pipeline/stage_tone_mapping.h  36
-rw-r--r--  third_party/jpeg-xl/lib/jxl/render_pipeline/stage_upsampling.cc  192
-rw-r--r--  third_party/jpeg-xl/lib/jxl/render_pipeline/stage_upsampling.h  26
-rw-r--r--  third_party/jpeg-xl/lib/jxl/render_pipeline/stage_write.cc  671
-rw-r--r--  third_party/jpeg-xl/lib/jxl/render_pipeline/stage_write.h  31
-rw-r--r--  third_party/jpeg-xl/lib/jxl/render_pipeline/stage_xyb.cc  178
-rw-r--r--  third_party/jpeg-xl/lib/jxl/render_pipeline/stage_xyb.h  26
-rw-r--r--  third_party/jpeg-xl/lib/jxl/render_pipeline/stage_ycbcr.cc  83
-rw-r--r--  third_party/jpeg-xl/lib/jxl/render_pipeline/stage_ycbcr.h  24
-rw-r--r--  third_party/jpeg-xl/lib/jxl/render_pipeline/test_render_pipeline_stages.h  101
41 files changed, 6108 insertions, 0 deletions
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/low_memory_render_pipeline.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/low_memory_render_pipeline.cc
new file mode 100644
index 0000000000..9aefdd007d
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/low_memory_render_pipeline.cc
@@ -0,0 +1,864 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/low_memory_render_pipeline.h"
+
+#include <algorithm>
+#include <queue>
+#include <tuple>
+
+#include "lib/jxl/base/arch_macros.h"
+#include "lib/jxl/image_ops.h"
+
+namespace jxl {
+std::pair<size_t, size_t>
+LowMemoryRenderPipeline::ColorDimensionsToChannelDimensions(
+ std::pair<size_t, size_t> in, size_t c, size_t stage) const {
+ std::pair<size_t, size_t> ret;
+ std::pair<size_t, size_t> shift = channel_shifts_[stage][c];
+ ret.first =
+ ((in.first << base_color_shift_) + (1 << shift.first) - 1) >> shift.first;
+ ret.second = ((in.second << base_color_shift_) + (1 << shift.second) - 1) >>
+ shift.second;
+ return ret;
+}
+
+std::pair<size_t, size_t> LowMemoryRenderPipeline::BorderToStore(
+ size_t c) const {
+ auto ret = ColorDimensionsToChannelDimensions(group_border_, c, 0);
+ ret.first += padding_[0][c].first;
+ ret.second += padding_[0][c].second;
+ return ret;
+}
+
+void LowMemoryRenderPipeline::SaveBorders(size_t group_id, size_t c,
+ const ImageF& in) {
+ size_t gy = group_id / frame_dimensions_.xsize_groups;
+ size_t gx = group_id % frame_dimensions_.xsize_groups;
+ size_t hshift = channel_shifts_[0][c].first;
+ size_t vshift = channel_shifts_[0][c].second;
+ size_t x0 = gx * GroupInputXSize(c);
+ size_t x1 = std::min((gx + 1) * GroupInputXSize(c),
+ DivCeil(frame_dimensions_.xsize_upsampled, 1 << hshift));
+ size_t y0 = gy * GroupInputYSize(c);
+ size_t y1 = std::min((gy + 1) * GroupInputYSize(c),
+ DivCeil(frame_dimensions_.ysize_upsampled, 1 << vshift));
+
+ auto borders = BorderToStore(c);
+ size_t borderx_write = borders.first;
+ size_t bordery_write = borders.second;
+
+ if (gy > 0) {
+ Rect from(group_data_x_border_, group_data_y_border_, x1 - x0,
+ bordery_write);
+ Rect to(x0, (gy * 2 - 1) * bordery_write, x1 - x0, bordery_write);
+ CopyImageTo(from, in, to, &borders_horizontal_[c]);
+ }
+ if (gy + 1 < frame_dimensions_.ysize_groups) {
+ Rect from(group_data_x_border_,
+ group_data_y_border_ + y1 - y0 - bordery_write, x1 - x0,
+ bordery_write);
+ Rect to(x0, (gy * 2) * bordery_write, x1 - x0, bordery_write);
+ CopyImageTo(from, in, to, &borders_horizontal_[c]);
+ }
+ if (gx > 0) {
+ Rect from(group_data_x_border_, group_data_y_border_, borderx_write,
+ y1 - y0);
+ Rect to((gx * 2 - 1) * borderx_write, y0, borderx_write, y1 - y0);
+ CopyImageTo(from, in, to, &borders_vertical_[c]);
+ }
+ if (gx + 1 < frame_dimensions_.xsize_groups) {
+ Rect from(group_data_x_border_ + x1 - x0 - borderx_write,
+ group_data_y_border_, borderx_write, y1 - y0);
+ Rect to((gx * 2) * borderx_write, y0, borderx_write, y1 - y0);
+ CopyImageTo(from, in, to, &borders_vertical_[c]);
+ }
+}
+
+void LowMemoryRenderPipeline::LoadBorders(size_t group_id, size_t c,
+ const Rect& r, ImageF* out) {
+ size_t gy = group_id / frame_dimensions_.xsize_groups;
+ size_t gx = group_id % frame_dimensions_.xsize_groups;
+ size_t hshift = channel_shifts_[0][c].first;
+ size_t vshift = channel_shifts_[0][c].second;
+ // Coordinates of the group in the image.
+ size_t x0 = gx * GroupInputXSize(c);
+ size_t x1 = std::min((gx + 1) * GroupInputXSize(c),
+ DivCeil(frame_dimensions_.xsize_upsampled, 1 << hshift));
+ size_t y0 = gy * GroupInputYSize(c);
+ size_t y1 = std::min((gy + 1) * GroupInputYSize(c),
+ DivCeil(frame_dimensions_.ysize_upsampled, 1 << vshift));
+
+ size_t paddingx = padding_[0][c].first;
+ size_t paddingy = padding_[0][c].second;
+
+ auto borders = BorderToStore(c);
+ size_t borderx_write = borders.first;
+ size_t bordery_write = borders.second;
+
+ // Limits of the area to copy from, in image coordinates.
+ JXL_DASSERT(r.x0() == 0 || (r.x0() << base_color_shift_) >= paddingx);
+ size_t x0src = DivCeil(r.x0() << base_color_shift_, 1 << hshift);
+ if (x0src != 0) {
+ x0src -= paddingx;
+ }
+ // r may be such that r.x1 (namely x0() + xsize()) is within paddingx of the
+ // right side of the image, so we use min() here.
+ size_t x1src =
+ DivCeil((r.x0() + r.xsize()) << base_color_shift_, 1 << hshift);
+ x1src = std::min(x1src + paddingx,
+ DivCeil(frame_dimensions_.xsize_upsampled, 1 << hshift));
+
+ // Similar computation for y.
+ JXL_DASSERT(r.y0() == 0 || (r.y0() << base_color_shift_) >= paddingy);
+ size_t y0src = DivCeil(r.y0() << base_color_shift_, 1 << vshift);
+ if (y0src != 0) {
+ y0src -= paddingy;
+ }
+ size_t y1src =
+ DivCeil((r.y0() + r.ysize()) << base_color_shift_, 1 << vshift);
+ y1src = std::min(y1src + paddingy,
+ DivCeil(frame_dimensions_.ysize_upsampled, 1 << vshift));
+
+ // Copy other groups' borders from the border storage.
+ if (y0src < y0) {
+ JXL_DASSERT(gy > 0);
+ CopyImageTo(
+ Rect(x0src, (gy * 2 - 2) * bordery_write, x1src - x0src, bordery_write),
+ borders_horizontal_[c],
+ Rect(group_data_x_border_ + x0src - x0,
+ group_data_y_border_ - bordery_write, x1src - x0src,
+ bordery_write),
+ out);
+ }
+ if (y1src > y1) {
+ // When copying the bottom border we must not be on the bottom groups.
+ JXL_DASSERT(gy + 1 < frame_dimensions_.ysize_groups);
+ CopyImageTo(
+ Rect(x0src, (gy * 2 + 1) * bordery_write, x1src - x0src, bordery_write),
+ borders_horizontal_[c],
+ Rect(group_data_x_border_ + x0src - x0, group_data_y_border_ + y1 - y0,
+ x1src - x0src, bordery_write),
+ out);
+ }
+ if (x0src < x0) {
+ JXL_DASSERT(gx > 0);
+ CopyImageTo(
+ Rect((gx * 2 - 2) * borderx_write, y0src, borderx_write, y1src - y0src),
+ borders_vertical_[c],
+ Rect(group_data_x_border_ - borderx_write,
+ group_data_y_border_ + y0src - y0, borderx_write, y1src - y0src),
+ out);
+ }
+ if (x1src > x1) {
+ // When copying the right border we must not be on the rightmost groups.
+ JXL_DASSERT(gx + 1 < frame_dimensions_.xsize_groups);
+ CopyImageTo(
+ Rect((gx * 2 + 1) * borderx_write, y0src, borderx_write, y1src - y0src),
+ borders_vertical_[c],
+ Rect(group_data_x_border_ + x1 - x0, group_data_y_border_ + y0src - y0,
+ borderx_write, y1src - y0src),
+ out);
+ }
+}
+
+size_t LowMemoryRenderPipeline::GroupInputXSize(size_t c) const {
+ return (frame_dimensions_.group_dim << base_color_shift_) >>
+ channel_shifts_[0][c].first;
+}
+
+size_t LowMemoryRenderPipeline::GroupInputYSize(size_t c) const {
+ return (frame_dimensions_.group_dim << base_color_shift_) >>
+ channel_shifts_[0][c].second;
+}
+
+void LowMemoryRenderPipeline::EnsureBordersStorage() {
+ const auto& shifts = channel_shifts_[0];
+ if (borders_horizontal_.size() < shifts.size()) {
+ borders_horizontal_.resize(shifts.size());
+ borders_vertical_.resize(shifts.size());
+ }
+ for (size_t c = 0; c < shifts.size(); c++) {
+ auto borders = BorderToStore(c);
+ size_t borderx = borders.first;
+ size_t bordery = borders.second;
+ JXL_DASSERT(frame_dimensions_.xsize_groups > 0);
+ size_t num_xborders = (frame_dimensions_.xsize_groups - 1) * 2;
+ JXL_DASSERT(frame_dimensions_.ysize_groups > 0);
+ size_t num_yborders = (frame_dimensions_.ysize_groups - 1) * 2;
+ size_t downsampled_xsize =
+ DivCeil(frame_dimensions_.xsize_upsampled_padded, 1 << shifts[c].first);
+ size_t downsampled_ysize = DivCeil(frame_dimensions_.ysize_upsampled_padded,
+ 1 << shifts[c].second);
+ Rect horizontal = Rect(0, 0, downsampled_xsize, bordery * num_yborders);
+ if (!SameSize(horizontal, borders_horizontal_[c])) {
+ borders_horizontal_[c] = ImageF(horizontal.xsize(), horizontal.ysize());
+ }
+ Rect vertical = Rect(0, 0, borderx * num_xborders, downsampled_ysize);
+ if (!SameSize(vertical, borders_vertical_[c])) {
+ borders_vertical_[c] = ImageF(vertical.xsize(), vertical.ysize());
+ }
+ }
+}
+
+void LowMemoryRenderPipeline::Init() {
+ group_border_ = {0, 0};
+ base_color_shift_ = CeilLog2Nonzero(frame_dimensions_.xsize_upsampled_padded /
+ frame_dimensions_.xsize_padded);
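+  // For example, with 8x frame upsampling the padded upsampled width is about
+  // 8 times the padded width, so base_color_shift_ == 3: each pixel of decoded
+  // color data corresponds to an 8x8 block of final-image pixels.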
+
+ const auto& shifts = channel_shifts_[0];
+
+  // Ensure that each channel has enough border pixels.
+ for (size_t c = 0; c < shifts.size(); c++) {
+ group_border_.first =
+ std::max(group_border_.first,
+ DivCeil(padding_[0][c].first << channel_shifts_[0][c].first,
+ 1 << base_color_shift_));
+ group_border_.second =
+ std::max(group_border_.second,
+ DivCeil(padding_[0][c].second << channel_shifts_[0][c].second,
+ 1 << base_color_shift_));
+ }
+
+ // Ensure that all channels have an integer number of border pixels in the
+ // input.
+ for (size_t c = 0; c < shifts.size(); c++) {
+ if (channel_shifts_[0][c].first >= base_color_shift_) {
+ group_border_.first =
+ RoundUpTo(group_border_.first,
+ 1 << (channel_shifts_[0][c].first - base_color_shift_));
+ }
+ if (channel_shifts_[0][c].second >= base_color_shift_) {
+ group_border_.second =
+ RoundUpTo(group_border_.second,
+ 1 << (channel_shifts_[0][c].second - base_color_shift_));
+ }
+ }
+ // Ensure that the X border on color channels is a multiple of kBlockDim or
+ // the vector size (required for EPF stages). Vectors on ARM NEON are never
+ // wider than 4 floats, so rounding to multiples of 4 is enough.
+#if JXL_ARCH_ARM
+ constexpr size_t kGroupXAlign = 4;
+#else
+ constexpr size_t kGroupXAlign = 16;
+#endif
+ group_border_.first = RoundUpTo(group_border_.first, kGroupXAlign);
+ // Allocate borders in group images that are just enough for storing the
+ // borders to be copied in, plus any rounding to ensure alignment.
+ std::pair<size_t, size_t> max_border = {0, 0};
+ for (size_t c = 0; c < shifts.size(); c++) {
+ max_border.first = std::max(BorderToStore(c).first, max_border.first);
+ max_border.second = std::max(BorderToStore(c).second, max_border.second);
+ }
+ group_data_x_border_ = RoundUpTo(max_border.first, kGroupXAlign);
+ group_data_y_border_ = max_border.second;
+
+ EnsureBordersStorage();
+ group_border_assigner_.Init(frame_dimensions_);
+
+ for (first_trailing_stage_ = stages_.size(); first_trailing_stage_ > 0;
+ first_trailing_stage_--) {
+ bool has_inout_c = false;
+ for (size_t c = 0; c < shifts.size(); c++) {
+ if (stages_[first_trailing_stage_ - 1]->GetChannelMode(c) ==
+ RenderPipelineChannelMode::kInOut) {
+ has_inout_c = true;
+ }
+ }
+ if (has_inout_c) {
+ break;
+ }
+ }
+
+ first_image_dim_stage_ = stages_.size();
+ for (size_t i = 0; i < stages_.size(); i++) {
+ std::vector<std::pair<size_t, size_t>> input_sizes(shifts.size());
+ for (size_t c = 0; c < shifts.size(); c++) {
+ input_sizes[c] =
+ std::make_pair(DivCeil(frame_dimensions_.xsize_upsampled,
+ 1 << channel_shifts_[i][c].first),
+ DivCeil(frame_dimensions_.ysize_upsampled,
+ 1 << channel_shifts_[i][c].second));
+ }
+ stages_[i]->SetInputSizes(input_sizes);
+ if (stages_[i]->SwitchToImageDimensions()) {
+ // We don't allow kInOut after switching to image dimensions.
+ JXL_ASSERT(i >= first_trailing_stage_);
+ first_image_dim_stage_ = i + 1;
+ stages_[i]->GetImageDimensions(&full_image_xsize_, &full_image_ysize_,
+ &frame_origin_);
+ break;
+ }
+ }
+ for (size_t i = first_image_dim_stage_; i < stages_.size(); i++) {
+ if (stages_[i]->SwitchToImageDimensions()) {
+ JXL_UNREACHABLE("Cannot switch to image dimensions multiple times");
+ }
+ std::vector<std::pair<size_t, size_t>> input_sizes(shifts.size());
+ for (size_t c = 0; c < shifts.size(); c++) {
+ input_sizes[c] = {full_image_xsize_, full_image_ysize_};
+ }
+ stages_[i]->SetInputSizes(input_sizes);
+ }
+
+ anyc_.resize(stages_.size());
+ for (size_t i = 0; i < stages_.size(); i++) {
+ for (size_t c = 0; c < shifts.size(); c++) {
+ if (stages_[i]->GetChannelMode(c) !=
+ RenderPipelineChannelMode::kIgnored) {
+ anyc_[i] = c;
+ }
+ }
+ }
+
+ stage_input_for_channel_ = std::vector<std::vector<int32_t>>(
+ stages_.size(), std::vector<int32_t>(shifts.size()));
+ for (size_t c = 0; c < shifts.size(); c++) {
+ int input = -1;
+ for (size_t i = 0; i < stages_.size(); i++) {
+ stage_input_for_channel_[i][c] = input;
+ if (stages_[i]->GetChannelMode(c) == RenderPipelineChannelMode::kInOut) {
+ input = i;
+ }
+ }
+ }
+
+ image_rect_.resize(stages_.size());
+ for (size_t i = 0; i < stages_.size(); i++) {
+ size_t x1 = DivCeil(frame_dimensions_.xsize_upsampled,
+ 1 << channel_shifts_[i][anyc_[i]].first);
+ size_t y1 = DivCeil(frame_dimensions_.ysize_upsampled,
+ 1 << channel_shifts_[i][anyc_[i]].second);
+ image_rect_[i] = Rect(0, 0, x1, y1);
+ }
+
+ virtual_ypadding_for_output_.resize(stages_.size());
+ xpadding_for_output_.resize(stages_.size());
+ for (size_t c = 0; c < shifts.size(); c++) {
+ int ypad = 0;
+ int xpad = 0;
+ for (size_t i = stages_.size(); i-- > 0;) {
+ if (stages_[i]->GetChannelMode(c) !=
+ RenderPipelineChannelMode::kIgnored) {
+ virtual_ypadding_for_output_[i] =
+ std::max(ypad, virtual_ypadding_for_output_[i]);
+ xpadding_for_output_[i] = std::max(xpad, xpadding_for_output_[i]);
+ }
+ if (stages_[i]->GetChannelMode(c) == RenderPipelineChannelMode::kInOut) {
+ ypad = (DivCeil(ypad, 1 << channel_shifts_[i][c].second) +
+ stages_[i]->settings_.border_y)
+ << channel_shifts_[i][c].second;
+ xpad = DivCeil(xpad, 1 << stages_[i]->settings_.shift_x) +
+ stages_[i]->settings_.border_x;
+ }
+ }
+ }
+}
+
+void LowMemoryRenderPipeline::PrepareForThreadsInternal(size_t num,
+ bool use_group_ids) {
+ const auto& shifts = channel_shifts_[0];
+ use_group_ids_ = use_group_ids;
+ size_t num_buffers = use_group_ids_ ? frame_dimensions_.num_groups : num;
+ for (size_t t = group_data_.size(); t < num_buffers; t++) {
+ group_data_.emplace_back();
+ group_data_[t].resize(shifts.size());
+ for (size_t c = 0; c < shifts.size(); c++) {
+ group_data_[t][c] = ImageF(GroupInputXSize(c) + group_data_x_border_ * 2,
+ GroupInputYSize(c) + group_data_y_border_ * 2);
+ }
+ }
+ // TODO(veluca): avoid reallocating buffers if not needed.
+ stage_data_.resize(num);
+ size_t upsampling = 1u << base_color_shift_;
+ size_t group_dim = frame_dimensions_.group_dim * upsampling;
+ size_t padding =
+ 2 * group_data_x_border_ * upsampling + // maximum size of a rect
+ 2 * kRenderPipelineXOffset; // extra padding for processing
+ size_t stage_buffer_xsize = group_dim + padding;
+ for (size_t t = 0; t < num; t++) {
+ stage_data_[t].resize(shifts.size());
+ for (size_t c = 0; c < shifts.size(); c++) {
+ stage_data_[t][c].resize(stages_.size());
+ size_t next_y_border = 0;
+ for (size_t i = stages_.size(); i-- > 0;) {
+ if (stages_[i]->GetChannelMode(c) ==
+ RenderPipelineChannelMode::kInOut) {
+ size_t stage_buffer_ysize =
+ 2 * next_y_border + (1 << stages_[i]->settings_.shift_y);
+ stage_buffer_ysize = 1 << CeilLog2Nonzero(stage_buffer_ysize);
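+          // (E.g. with shift_y == 1 and a downstream kInOut stage needing
+          // border_y == 2, this is 2 * 2 + 2 == 6 rows, rounded up to 8 so
+          // that row indices can be wrapped with a simple mask.)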
+ next_y_border = stages_[i]->settings_.border_y;
+ stage_data_[t][c][i] = ImageF(stage_buffer_xsize, stage_buffer_ysize);
+ }
+ }
+ }
+ }
+ if (first_image_dim_stage_ != stages_.size()) {
+ RectT<ssize_t> image_rect(0, 0, frame_dimensions_.xsize_upsampled,
+ frame_dimensions_.ysize_upsampled);
+ RectT<ssize_t> full_image_rect(0, 0, full_image_xsize_, full_image_ysize_);
+ image_rect = image_rect.Translate(frame_origin_.x0, frame_origin_.y0);
+ image_rect = image_rect.Intersection(full_image_rect);
+ if (image_rect.xsize() == 0 || image_rect.ysize() == 0) {
+ image_rect = RectT<ssize_t>(0, 0, 0, 0);
+ }
+ size_t left_padding = image_rect.x0();
+ size_t middle_padding = group_dim;
+ size_t right_padding = full_image_xsize_ - image_rect.x1();
+ size_t out_of_frame_xsize =
+ padding +
+ std::max(left_padding, std::max(middle_padding, right_padding));
+ out_of_frame_data_.resize(num);
+ for (size_t t = 0; t < num; t++) {
+ out_of_frame_data_[t] = ImageF(out_of_frame_xsize, shifts.size());
+ }
+ }
+}
+
+std::vector<std::pair<ImageF*, Rect>> LowMemoryRenderPipeline::PrepareBuffers(
+ size_t group_id, size_t thread_id) {
+ std::vector<std::pair<ImageF*, Rect>> ret(channel_shifts_[0].size());
+ const size_t gx = group_id % frame_dimensions_.xsize_groups;
+ const size_t gy = group_id / frame_dimensions_.xsize_groups;
+ for (size_t c = 0; c < channel_shifts_[0].size(); c++) {
+ ret[c].first = &group_data_[use_group_ids_ ? group_id : thread_id][c];
+ ret[c].second = Rect(group_data_x_border_, group_data_y_border_,
+ GroupInputXSize(c), GroupInputYSize(c),
+ DivCeil(frame_dimensions_.xsize_upsampled,
+ 1 << channel_shifts_[0][c].first) -
+ gx * GroupInputXSize(c) + group_data_x_border_,
+ DivCeil(frame_dimensions_.ysize_upsampled,
+ 1 << channel_shifts_[0][c].second) -
+ gy * GroupInputYSize(c) + group_data_y_border_);
+ }
+ return ret;
+}
+
+namespace {
+
+JXL_INLINE int GetMirroredY(int y, ssize_t group_y0, ssize_t image_ysize) {
+ if (group_y0 == 0 && (y < 0 || y + group_y0 >= image_ysize)) {
+ return Mirror(y, image_ysize);
+ }
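+  // Worked example for the branch below: with image_ysize == 10 and
+  // group_y0 == 8, local row y == 3 is absolute row 11, which mirrors to
+  // absolute row 2 * 10 - 11 - 1 == 8, i.e. local row 0.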
+ if (y + group_y0 >= image_ysize) {
+ // Here we know that the one mirroring step is sufficient.
+ return 2 * image_ysize - (y + group_y0) - 1 - group_y0;
+ }
+ return y;
+}
+
+JXL_INLINE void ApplyXMirroring(float* row, ssize_t borderx, ssize_t group_x0,
+ ssize_t group_xsize, ssize_t image_xsize) {
+ if (image_xsize <= borderx) {
+ if (group_x0 == 0) {
+ for (ssize_t ix = 0; ix < borderx; ix++) {
+ row[kRenderPipelineXOffset - ix - 1] =
+ row[kRenderPipelineXOffset + Mirror(-ix - 1, image_xsize)];
+ }
+ }
+ if (group_xsize + borderx + group_x0 >= image_xsize) {
+ for (ssize_t ix = 0; ix < borderx; ix++) {
+ row[kRenderPipelineXOffset + image_xsize + ix - group_x0] =
+ row[kRenderPipelineXOffset + Mirror(image_xsize + ix, image_xsize) -
+ group_x0];
+ }
+ }
+ } else {
+ // Here we know that the one mirroring step is sufficient.
+ if (group_x0 == 0) {
+ for (ssize_t ix = 0; ix < borderx; ix++) {
+ row[kRenderPipelineXOffset - ix - 1] = row[kRenderPipelineXOffset + ix];
+ }
+ }
+ if (group_xsize + borderx + group_x0 >= image_xsize) {
+ for (ssize_t ix = 0; ix < borderx; ix++) {
+ row[kRenderPipelineXOffset + image_xsize - group_x0 + ix] =
+ row[kRenderPipelineXOffset + image_xsize - group_x0 - ix - 1];
+ }
+ }
+ }
+}
+
+// Information about where the *output* of each stage is stored.
+class Rows {
+ public:
+ Rows(const std::vector<std::unique_ptr<RenderPipelineStage>>& stages,
+ const Rect data_max_color_channel_rect, int group_data_x_border,
+ int group_data_y_border,
+ const std::vector<std::pair<size_t, size_t>>& group_data_shift,
+ size_t base_color_shift, std::vector<std::vector<ImageF>>& thread_data,
+ std::vector<ImageF>& input_data) {
+ size_t num_stages = stages.size();
+ size_t num_channels = input_data.size();
+
+ JXL_ASSERT(thread_data.size() == num_channels);
+ JXL_ASSERT(group_data_shift.size() == num_channels);
+
+#if JXL_ENABLE_ASSERT
+ for (const auto& td : thread_data) {
+ JXL_ASSERT(td.size() == num_stages);
+ }
+#endif
+
+ rows_.resize(num_stages + 1, std::vector<RowInfo>(num_channels));
+
+ for (size_t i = 0; i < num_stages; i++) {
+ for (size_t c = 0; c < input_data.size(); c++) {
+ if (stages[i]->GetChannelMode(c) == RenderPipelineChannelMode::kInOut) {
+ rows_[i + 1][c].ymod_minus_1 = thread_data[c][i].ysize() - 1;
+ rows_[i + 1][c].base_ptr = thread_data[c][i].Row(0);
+ rows_[i + 1][c].stride = thread_data[c][i].PixelsPerRow();
+ }
+ }
+ }
+
+ for (size_t c = 0; c < input_data.size(); c++) {
+ auto channel_group_data_rect =
+ data_max_color_channel_rect.As<ssize_t>()
+ .Translate(-group_data_x_border, -group_data_y_border)
+ .ShiftLeft(base_color_shift)
+ .CeilShiftRight(group_data_shift[c])
+ .Translate(group_data_x_border - ssize_t(kRenderPipelineXOffset),
+ group_data_y_border);
+ rows_[0][c].base_ptr = channel_group_data_rect.Row(&input_data[c], 0);
+ rows_[0][c].stride = input_data[c].PixelsPerRow();
+ rows_[0][c].ymod_minus_1 = -1;
+ }
+ }
+
+ // Stage -1 refers to the input data; all other values must be nonnegative and
+ // refer to the data for the output of that stage.
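+  // Stage output buffers act as ring buffers over rows: if a buffer has 8
+  // rows, ymod_minus_1 == 7 and row y lives at index (y & 7), so rows are
+  // reused once later stages no longer need them. The input data (stage -1)
+  // has ymod_minus_1 == -1 (all bits set), so (y & -1) == y and rows are
+  // addressed directly.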
+ JXL_INLINE float* GetBuffer(int stage, int y, size_t c) const {
+ JXL_DASSERT(stage >= -1);
+ const RowInfo& info = rows_[stage + 1][c];
+ return info.base_ptr + ssize_t(info.stride) * (y & info.ymod_minus_1);
+ }
+
+ private:
+ struct RowInfo {
+ // Pointer to beginning of the first row.
+ float* base_ptr;
+ // Modulo value for the y axis minus 1 (ymod is guaranteed to be a power of
+ // 2, which allows efficient mod computation by masking).
+ int ymod_minus_1;
+ // Number of floats per row.
+ size_t stride;
+ };
+ std::vector<std::vector<RowInfo>> rows_;
+};
+
+} // namespace
+
+void LowMemoryRenderPipeline::RenderRect(size_t thread_id,
+ std::vector<ImageF>& input_data,
+ Rect data_max_color_channel_rect,
+ Rect image_max_color_channel_rect) {
+ // For each stage, the rect corresponding to the image area currently being
+ // processed, in the coordinates of that stage (i.e. with the scaling factor
+ // that that stage has).
+ std::vector<Rect> group_rect;
+ group_rect.resize(stages_.size());
+ Rect image_area_rect =
+ image_max_color_channel_rect.ShiftLeft(base_color_shift_)
+ .Crop(frame_dimensions_.xsize_upsampled,
+ frame_dimensions_.ysize_upsampled);
+ for (size_t i = 0; i < stages_.size(); i++) {
+ group_rect[i] =
+ image_area_rect.CeilShiftRight(channel_shifts_[i][anyc_[i]]);
+ }
+
+ ssize_t frame_x0 =
+ first_image_dim_stage_ == stages_.size() ? 0 : frame_origin_.x0;
+ ssize_t frame_y0 =
+ first_image_dim_stage_ == stages_.size() ? 0 : frame_origin_.y0;
+ size_t full_image_xsize = first_image_dim_stage_ == stages_.size()
+ ? frame_dimensions_.xsize_upsampled
+ : full_image_xsize_;
+ size_t full_image_ysize = first_image_dim_stage_ == stages_.size()
+ ? frame_dimensions_.ysize_upsampled
+ : full_image_ysize_;
+
+ // Compute actual x-axis bounds for the current image area in the context of
+ // the full image this frame is part of. As the left boundary may be negative,
+ // we also create the x_pixels_skip value, defined as follows:
+ // - both x_pixels_skip and full_image_x0 are >= 0, and at least one is 0;
+ // - full_image_x0 - x_pixels_skip is the position of the current frame area
+ // in the full image.
+ ssize_t full_image_x0 = frame_x0 + image_area_rect.x0();
+ ssize_t x_pixels_skip = 0;
+ if (full_image_x0 < 0) {
+ x_pixels_skip = -full_image_x0;
+ full_image_x0 = 0;
+ }
+ ssize_t full_image_x1 = frame_x0 + image_area_rect.x1();
+ full_image_x1 = std::min<ssize_t>(full_image_x1, full_image_xsize);
+
+ // If the current image area is entirely outside of the visible image, there
+ // is no point in proceeding. Note: this uses the assumption that if there is
+ // a stage with observable effects (i.e. a kInput stage), it only appears
+ // after the stage that switches to image dimensions.
+ if (full_image_x1 <= full_image_x0) return;
+
+ // Data structures to hold information about input/output rows and their
+ // buffers.
+ Rows rows(stages_, data_max_color_channel_rect, group_data_x_border_,
+ group_data_y_border_, channel_shifts_[0], base_color_shift_,
+ stage_data_[thread_id], input_data);
+
+ std::vector<RenderPipelineStage::RowInfo> input_rows(first_trailing_stage_ +
+ 1);
+ for (size_t i = 0; i < first_trailing_stage_; i++) {
+ input_rows[i].resize(input_data.size());
+ }
+ input_rows[first_trailing_stage_].resize(input_data.size(),
+ std::vector<float*>(1));
+
+ // Maximum possible shift is 3.
+ RenderPipelineStage::RowInfo output_rows(input_data.size(),
+ std::vector<float*>(8));
+
+ // Fills in input_rows and output_rows for a given y value (relative to the
+ // start of the group, measured in actual pixels at the appropriate vertical
+ // scaling factor) and a given stage, applying mirroring if necessary. This
+ // function is somewhat inefficient for trailing kInOut or kInput stages,
+ // where just filling the input row once ought to be sufficient.
+ auto prepare_io_rows = [&](int y, size_t i) {
+ ssize_t bordery = stages_[i]->settings_.border_y;
+ size_t shifty = stages_[i]->settings_.shift_y;
+ auto make_row = [&](size_t c, ssize_t iy) {
+ size_t mirrored_y = GetMirroredY(y + iy - bordery, group_rect[i].y0(),
+ image_rect_[i].ysize());
+ input_rows[i][c][iy] =
+ rows.GetBuffer(stage_input_for_channel_[i][c], mirrored_y, c);
+ ApplyXMirroring(input_rows[i][c][iy], stages_[i]->settings_.border_x,
+ group_rect[i].x0(), group_rect[i].xsize(),
+ image_rect_[i].xsize());
+ };
+ for (size_t c = 0; c < input_data.size(); c++) {
+ RenderPipelineChannelMode mode = stages_[i]->GetChannelMode(c);
+ if (mode == RenderPipelineChannelMode::kIgnored) {
+ continue;
+ }
+ // If we already have rows from a previous iteration, we can just shift
+ // the rows by 1 and insert the new one.
+ if (input_rows[i][c].size() == 2 * size_t(bordery) + 1) {
+ for (ssize_t iy = 0; iy < 2 * bordery; iy++) {
+ input_rows[i][c][iy] = input_rows[i][c][iy + 1];
+ }
+ make_row(c, bordery * 2);
+ } else {
+ input_rows[i][c].resize(2 * bordery + 1);
+ for (ssize_t iy = 0; iy < 2 * bordery + 1; iy++) {
+ make_row(c, iy);
+ }
+ }
+
+ // If necessary, get the output buffers.
+ if (mode == RenderPipelineChannelMode::kInOut) {
+ for (size_t iy = 0; iy < (1u << shifty); iy++) {
+ output_rows[c][iy] = rows.GetBuffer(i, y * (1 << shifty) + iy, c);
+ }
+ }
+ }
+ };
+
+ // We pretend that every stage has a vertical shift of 0, i.e. it is as tall
+ // as the final image.
+ // We call each such row a "virtual" row, because it may or may not correspond
+ // to an actual row of the current processing stage; actual processing happens
+ // when vy % (1<<vshift) == 0.
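+  // For example, a stage whose input is vertically downsampled by 2
+  // (vshift == 1) only does work on even vy, and its own row index is then
+  // y = vy / 2 (after adjusting for padding).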
+
+ int num_extra_rows = *std::max_element(virtual_ypadding_for_output_.begin(),
+ virtual_ypadding_for_output_.end());
+
+ for (int vy = -num_extra_rows;
+ vy < int(image_area_rect.ysize()) + num_extra_rows; vy++) {
+ for (size_t i = 0; i < first_trailing_stage_; i++) {
+ int stage_vy = vy - num_extra_rows + virtual_ypadding_for_output_[i];
+
+ if (stage_vy % (1 << channel_shifts_[i][anyc_[i]].second) != 0) {
+ continue;
+ }
+
+ if (stage_vy < -virtual_ypadding_for_output_[i]) {
+ continue;
+ }
+
+ int y = stage_vy >> channel_shifts_[i][anyc_[i]].second;
+
+ ssize_t image_y = ssize_t(group_rect[i].y0()) + y;
+ // Do not produce rows in out-of-bounds areas.
+ if (image_y < 0 || image_y >= ssize_t(image_rect_[i].ysize())) {
+ continue;
+ }
+
+ // Get the input/output rows and potentially apply mirroring to the input.
+ prepare_io_rows(y, i);
+
+ // Produce output rows.
+ stages_[i]->ProcessRow(input_rows[i], output_rows,
+ xpadding_for_output_[i], group_rect[i].xsize(),
+ group_rect[i].x0(), image_y, thread_id);
+ }
+
+ // Process trailing stages, i.e. the final set of non-kInOut stages; they
+ // all have the same input buffer and no need to use any mirroring.
+
+ int y = vy - num_extra_rows;
+
+ for (size_t c = 0; c < input_data.size(); c++) {
+ // Skip pixels that are not part of the actual final image area.
+ input_rows[first_trailing_stage_][c][0] =
+ rows.GetBuffer(stage_input_for_channel_[first_trailing_stage_][c], y,
+ c) +
+ x_pixels_skip;
+ }
+
+ // Check that we are not outside of the bounds for the current rendering
+ // rect. Not doing so might result in overwriting some rows that have been
+ // written (or will be written) by other threads.
+ if (y < 0 || y >= ssize_t(image_area_rect.ysize())) {
+ continue;
+ }
+
+ // Avoid running pipeline stages on pixels that are outside the full image
+ // area. As trailing stages have no borders, this is a free optimization
+ // (and may be necessary for correctness, as some stages assume coordinates
+ // are within bounds).
+ ssize_t full_image_y = frame_y0 + image_area_rect.y0() + y;
+ if (full_image_y < 0 || full_image_y >= ssize_t(full_image_ysize)) {
+ continue;
+ }
+
+ for (size_t i = first_trailing_stage_; i < stages_.size(); i++) {
+ // Before the first_image_dim_stage_, coordinates are relative to the
+ // current frame.
+ size_t x0 =
+ i < first_image_dim_stage_ ? full_image_x0 - frame_x0 : full_image_x0;
+ size_t y =
+ i < first_image_dim_stage_ ? full_image_y - frame_y0 : full_image_y;
+ stages_[i]->ProcessRow(input_rows[first_trailing_stage_], output_rows,
+ /*xextra=*/0, full_image_x1 - full_image_x0, x0, y,
+ thread_id);
+ }
+ }
+}
+
+void LowMemoryRenderPipeline::RenderPadding(size_t thread_id, Rect rect) {
+ if (rect.xsize() == 0) return;
+ size_t numc = channel_shifts_[0].size();
+ RenderPipelineStage::RowInfo input_rows(numc, std::vector<float*>(1));
+ RenderPipelineStage::RowInfo output_rows;
+
+ for (size_t c = 0; c < numc; c++) {
+ input_rows[c][0] = out_of_frame_data_[thread_id].Row(c);
+ }
+
+ for (size_t y = 0; y < rect.ysize(); y++) {
+ stages_[first_image_dim_stage_ - 1]->ProcessPaddingRow(
+ input_rows, rect.xsize(), rect.x0(), rect.y0() + y);
+ for (size_t i = first_image_dim_stage_; i < stages_.size(); i++) {
+ stages_[i]->ProcessRow(input_rows, output_rows,
+ /*xextra=*/0, rect.xsize(), rect.x0(),
+ rect.y0() + y, thread_id);
+ }
+ }
+}
+
+void LowMemoryRenderPipeline::ProcessBuffers(size_t group_id,
+ size_t thread_id) {
+ std::vector<ImageF>& input_data =
+ group_data_[use_group_ids_ ? group_id : thread_id];
+
+ // Copy the group borders to the border storage.
+ for (size_t c = 0; c < input_data.size(); c++) {
+ SaveBorders(group_id, c, input_data[c]);
+ }
+
+ size_t gy = group_id / frame_dimensions_.xsize_groups;
+ size_t gx = group_id % frame_dimensions_.xsize_groups;
+
+ if (first_image_dim_stage_ != stages_.size()) {
+ size_t group_dim = frame_dimensions_.group_dim << base_color_shift_;
+ RectT<ssize_t> group_rect(gx * group_dim, gy * group_dim, group_dim,
+ group_dim);
+ RectT<ssize_t> image_rect(0, 0, frame_dimensions_.xsize_upsampled,
+ frame_dimensions_.ysize_upsampled);
+ RectT<ssize_t> full_image_rect(0, 0, full_image_xsize_, full_image_ysize_);
+ group_rect = group_rect.Translate(frame_origin_.x0, frame_origin_.y0);
+ image_rect = image_rect.Translate(frame_origin_.x0, frame_origin_.y0);
+ image_rect = image_rect.Intersection(full_image_rect);
+ group_rect = group_rect.Intersection(image_rect);
+ size_t x0 = group_rect.x0();
+ size_t y0 = group_rect.y0();
+ size_t x1 = group_rect.x1();
+ size_t y1 = group_rect.y1();
+ JXL_DEBUG_V(6,
+ "Rendering padding for full image rect %s "
+ "outside group rect %s",
+ Description(full_image_rect).c_str(),
+ Description(group_rect).c_str());
+
+ if (group_id == 0 && (image_rect.xsize() == 0 || image_rect.ysize() == 0)) {
+ // If this frame does not intersect with the full image, we have to
+ // initialize the whole image area with RenderPadding.
+ RenderPadding(thread_id,
+ Rect(0, 0, full_image_xsize_, full_image_ysize_));
+ }
+
+ // Render padding for groups that intersect with the full image. The case
+ // where no groups intersect was handled above.
+ if (group_rect.xsize() > 0 && group_rect.ysize() > 0) {
+ if (gx == 0 && gy == 0) {
+ RenderPadding(thread_id, Rect(0, 0, x0, y0));
+ }
+ if (gy == 0) {
+ RenderPadding(thread_id, Rect(x0, 0, x1 - x0, y0));
+ }
+ if (gx == 0) {
+ RenderPadding(thread_id, Rect(0, y0, x0, y1 - y0));
+ }
+ if (gx == 0 && gy + 1 == frame_dimensions_.ysize_groups) {
+ RenderPadding(thread_id, Rect(0, y1, x0, full_image_ysize_ - y1));
+ }
+ if (gy + 1 == frame_dimensions_.ysize_groups) {
+ RenderPadding(thread_id, Rect(x0, y1, x1 - x0, full_image_ysize_ - y1));
+ }
+ if (gy == 0 && gx + 1 == frame_dimensions_.xsize_groups) {
+ RenderPadding(thread_id, Rect(x1, 0, full_image_xsize_ - x1, y0));
+ }
+ if (gx + 1 == frame_dimensions_.xsize_groups) {
+ RenderPadding(thread_id, Rect(x1, y0, full_image_xsize_ - x1, y1 - y0));
+ }
+ if (gy + 1 == frame_dimensions_.ysize_groups &&
+ gx + 1 == frame_dimensions_.xsize_groups) {
+ RenderPadding(thread_id, Rect(x1, y1, full_image_xsize_ - x1,
+ full_image_ysize_ - y1));
+ }
+ }
+ }
+
+ Rect ready_rects[GroupBorderAssigner::kMaxToFinalize];
+ size_t num_ready_rects = 0;
+ group_border_assigner_.GroupDone(group_id, group_border_.first,
+ group_border_.second, ready_rects,
+ &num_ready_rects);
+ for (size_t i = 0; i < num_ready_rects; i++) {
+ const Rect& image_max_color_channel_rect = ready_rects[i];
+ for (size_t c = 0; c < input_data.size(); c++) {
+ LoadBorders(group_id, c, image_max_color_channel_rect, &input_data[c]);
+ }
+ Rect data_max_color_channel_rect(
+ group_data_x_border_ + image_max_color_channel_rect.x0() -
+ gx * frame_dimensions_.group_dim,
+ group_data_y_border_ + image_max_color_channel_rect.y0() -
+ gy * frame_dimensions_.group_dim,
+ image_max_color_channel_rect.xsize(),
+ image_max_color_channel_rect.ysize());
+ RenderRect(thread_id, input_data, data_max_color_channel_rect,
+ image_max_color_channel_rect);
+ }
+}
+} // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/low_memory_render_pipeline.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/low_memory_render_pipeline.h
new file mode 100644
index 0000000000..b386f7c078
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/low_memory_render_pipeline.h
@@ -0,0 +1,111 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_LOW_MEMORY_RENDER_PIPELINE_H_
+#define LIB_JXL_RENDER_PIPELINE_LOW_MEMORY_RENDER_PIPELINE_H_
+
+#include <stdint.h>
+
+#include "lib/jxl/dec_group_border.h"
+#include "lib/jxl/render_pipeline/render_pipeline.h"
+
+namespace jxl {
+
+// A multithreaded, low-memory rendering pipeline that allocates only a
+// minimal number of buffers.
+class LowMemoryRenderPipeline final : public RenderPipeline {
+ private:
+ std::vector<std::pair<ImageF*, Rect>> PrepareBuffers(
+ size_t group_id, size_t thread_id) override;
+
+ void PrepareForThreadsInternal(size_t num, bool use_group_ids) override;
+
+ void ProcessBuffers(size_t group_id, size_t thread_id) override;
+
+ void ClearDone(size_t i) override { group_border_assigner_.ClearDone(i); }
+
+ void Init() override;
+
+ void EnsureBordersStorage();
+ size_t GroupInputXSize(size_t c) const;
+ size_t GroupInputYSize(size_t c) const;
+ void RenderRect(size_t thread_id, std::vector<ImageF>& input_data,
+ Rect data_max_color_channel_rect,
+ Rect image_max_color_channel_rect);
+ void RenderPadding(size_t thread_id, Rect rect);
+
+ void SaveBorders(size_t group_id, size_t c, const ImageF& in);
+ void LoadBorders(size_t group_id, size_t c, const Rect& r, ImageF* out);
+
+ std::pair<size_t, size_t> ColorDimensionsToChannelDimensions(
+ std::pair<size_t, size_t> in, size_t c, size_t stage) const;
+
+ std::pair<size_t, size_t> BorderToStore(size_t c) const;
+
+ bool use_group_ids_;
+
+  // Storage for borders between groups. Borders of adjacent groups are stacked
+  // together, e.g. the bottom border of the current group is followed by the
+  // top border of the next group.
+ std::vector<ImageF> borders_horizontal_;
+ std::vector<ImageF> borders_vertical_;
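+  // (Concretely, horizontal band 2 * gy stores the bottom border of group row
+  // gy and band 2 * gy + 1 stores the top border of group row gy + 1; vertical
+  // borders are laid out the same way along x.)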
+
+ // Manages the status of borders.
+ GroupBorderAssigner group_border_assigner_;
+
+ // Size (in color-channel-pixels) of the border around each group that might
+ // be assigned to that group.
+ std::pair<size_t, size_t> group_border_;
+ // base_color_shift_ defines the size of groups in terms of final image
+ // pixels.
+ size_t base_color_shift_;
+
+ // Buffer for decoded pixel data for a group, indexed by [thread][channel] or
+ // [group][channel] depending on `use_group_ids_`.
+ std::vector<std::vector<ImageF>> group_data_;
+
+ // Borders for storing group data.
+ size_t group_data_x_border_;
+ size_t group_data_y_border_;
+
+ // Buffers for intermediate rows for the various stages, indexed by
+ // [thread][channel][stage].
+ std::vector<std::vector<std::vector<ImageF>>> stage_data_;
+
+ // Buffers for out-of-frame data, indexed by [thread]; every row is a
+ // different channel.
+ std::vector<ImageF> out_of_frame_data_;
+
+ // For each stage, a non-kIgnored channel.
+ std::vector<int32_t> anyc_;
+
+ // Size of the image at each stage.
+ std::vector<Rect> image_rect_;
+
+ // For each stage, for each channel, keep track of the kInOut stage that
+ // produced the input to that stage (which corresponds to the buffer index
+ // containing the data). -1 if data comes from the original input.
+ std::vector<std::vector<int32_t>> stage_input_for_channel_;
+
+ // Number of (virtual) extra rows that must be processed at each stage
+ // to produce sufficient output for future stages.
+ std::vector<int> virtual_ypadding_for_output_;
+
+ // Same thing for columns, except these are real columns and not virtual ones.
+ std::vector<int> xpadding_for_output_;
+
+  // Index of the first stage in the trailing run of stages that have no
+  // kInOut channels (i.e. the stage right after the last kInOut stage).
+ size_t first_trailing_stage_;
+
+ // Origin and size of the frame after switching to image dimensions.
+ FrameOrigin frame_origin_;
+ size_t full_image_xsize_;
+ size_t full_image_ysize_;
+ size_t first_image_dim_stage_;
+};
+
+} // namespace jxl
+
+#endif // LIB_JXL_RENDER_PIPELINE_LOW_MEMORY_RENDER_PIPELINE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline.cc
new file mode 100644
index 0000000000..68b6ef613f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline.cc
@@ -0,0 +1,132 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/render_pipeline.h"
+
+#include <algorithm>
+
+#include "lib/jxl/render_pipeline/low_memory_render_pipeline.h"
+#include "lib/jxl/render_pipeline/simple_render_pipeline.h"
+#include "lib/jxl/sanitizers.h"
+
+namespace jxl {
+
+void RenderPipeline::Builder::AddStage(
+ std::unique_ptr<RenderPipelineStage> stage) {
+ stages_.push_back(std::move(stage));
+}
+
+std::unique_ptr<RenderPipeline> RenderPipeline::Builder::Finalize(
+ FrameDimensions frame_dimensions) && {
+#if JXL_ENABLE_ASSERT
+ // Check that the last stage is not an kInOut stage for any channel, and that
+ // there is at least one stage.
+ JXL_ASSERT(!stages_.empty());
+ for (size_t c = 0; c < num_c_; c++) {
+ JXL_ASSERT(stages_.back()->GetChannelMode(c) !=
+ RenderPipelineChannelMode::kInOut);
+ }
+#endif
+
+ std::unique_ptr<RenderPipeline> res;
+ if (use_simple_implementation_) {
+ res = jxl::make_unique<SimpleRenderPipeline>();
+ } else {
+ res = jxl::make_unique<LowMemoryRenderPipeline>();
+ }
+
+ res->padding_.resize(stages_.size());
+ for (size_t i = stages_.size(); i-- > 0;) {
+ const auto& stage = stages_[i];
+ res->padding_[i].resize(num_c_);
+ if (i + 1 == stages_.size()) {
+ continue;
+ }
+ for (size_t c = 0; c < num_c_; c++) {
+ if (stage->GetChannelMode(c) == RenderPipelineChannelMode::kInOut) {
+ res->padding_[i][c].first = DivCeil(res->padding_[i + 1][c].first,
+ 1 << stage->settings_.shift_x) +
+ stage->settings_.border_x;
+ res->padding_[i][c].second = DivCeil(res->padding_[i + 1][c].second,
+ 1 << stage->settings_.shift_y) +
+ stage->settings_.border_y;
+ } else {
+ res->padding_[i][c] = res->padding_[i + 1][c];
+ }
+ }
+ }
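+  // Example: if stage i+1 requires 3 pixels of x padding and stage i is a 2x
+  // horizontal upsampler (shift_x == 1) with border_x == 1, then stage i
+  // requires DivCeil(3, 2) + 1 == 3 pixels of padding at its own input.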
+
+ res->frame_dimensions_ = frame_dimensions;
+ res->group_completed_passes_.resize(frame_dimensions.num_groups);
+ res->channel_shifts_.resize(stages_.size());
+ res->channel_shifts_[0].resize(num_c_);
+ for (size_t i = 1; i < stages_.size(); i++) {
+ auto& stage = stages_[i - 1];
+ for (size_t c = 0; c < num_c_; c++) {
+ if (stage->GetChannelMode(c) == RenderPipelineChannelMode::kInOut) {
+ res->channel_shifts_[0][c].first += stage->settings_.shift_x;
+ res->channel_shifts_[0][c].second += stage->settings_.shift_y;
+ }
+ }
+ }
+ for (size_t i = 1; i < stages_.size(); i++) {
+ auto& stage = stages_[i - 1];
+ res->channel_shifts_[i].resize(num_c_);
+ for (size_t c = 0; c < num_c_; c++) {
+ if (stage->GetChannelMode(c) == RenderPipelineChannelMode::kInOut) {
+ res->channel_shifts_[i][c].first =
+ res->channel_shifts_[i - 1][c].first - stage->settings_.shift_x;
+ res->channel_shifts_[i][c].second =
+ res->channel_shifts_[i - 1][c].second - stage->settings_.shift_y;
+ } else {
+ res->channel_shifts_[i][c].first = res->channel_shifts_[i - 1][c].first;
+ res->channel_shifts_[i][c].second =
+ res->channel_shifts_[i - 1][c].second;
+ }
+ }
+ }
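+  // Example: with a single 2x upsampling stage (shift 1) for channel c and no
+  // other kInOut stage for it, channel_shifts_[0][c] == {1, 1} (the channel
+  // enters the pipeline at half resolution) and it is {0, 0} at the input of
+  // every stage after the upsampler.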
+ res->stages_ = std::move(stages_);
+ res->Init();
+ return res;
+}
+
+RenderPipelineInput RenderPipeline::GetInputBuffers(size_t group_id,
+ size_t thread_id) {
+ RenderPipelineInput ret;
+ JXL_DASSERT(group_id < group_completed_passes_.size());
+ ret.group_id_ = group_id;
+ ret.thread_id_ = thread_id;
+ ret.pipeline_ = this;
+ ret.buffers_ = PrepareBuffers(group_id, thread_id);
+ return ret;
+}
+
+void RenderPipeline::InputReady(
+ size_t group_id, size_t thread_id,
+ const std::vector<std::pair<ImageF*, Rect>>& buffers) {
+ JXL_DASSERT(group_id < group_completed_passes_.size());
+ group_completed_passes_[group_id]++;
+ for (size_t i = 0; i < buffers.size(); ++i) {
+ (void)i;
+ JXL_CHECK_PLANE_INITIALIZED(*buffers[i].first, buffers[i].second, i);
+ }
+
+ ProcessBuffers(group_id, thread_id);
+}
+
+Status RenderPipeline::PrepareForThreads(size_t num, bool use_group_ids) {
+ for (const auto& stage : stages_) {
+ JXL_RETURN_IF_ERROR(stage->PrepareForThreads(num));
+ }
+ PrepareForThreadsInternal(num, use_group_ids);
+ return true;
+}
+
+void RenderPipelineInput::Done() {
+ JXL_ASSERT(pipeline_);
+ pipeline_->InputReady(group_id_, thread_id_, buffers_);
+}
+
+} // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline.h
new file mode 100644
index 0000000000..bf3ad4975e
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline.h
@@ -0,0 +1,139 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_RENDER_PIPELINE_H_
+#define LIB_JXL_RENDER_PIPELINE_RENDER_PIPELINE_H_
+
+#include <stdint.h>
+
+#include "lib/jxl/image.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+// Interface to provide input to the rendering pipeline. When this object is
+// destroyed, all the data in the provided ImageF's Rects must have been
+// initialized.
+class RenderPipelineInput {
+ public:
+ RenderPipelineInput(const RenderPipelineInput&) = delete;
+ RenderPipelineInput(RenderPipelineInput&& other) noexcept {
+ *this = std::move(other);
+ }
+ RenderPipelineInput& operator=(RenderPipelineInput&& other) noexcept {
+ pipeline_ = other.pipeline_;
+ group_id_ = other.group_id_;
+ thread_id_ = other.thread_id_;
+ buffers_ = std::move(other.buffers_);
+ other.pipeline_ = nullptr;
+ return *this;
+ }
+
+ RenderPipelineInput() = default;
+ void Done();
+
+ const std::pair<ImageF*, Rect>& GetBuffer(size_t c) const {
+ JXL_ASSERT(c < buffers_.size());
+ return buffers_[c];
+ }
+
+ private:
+ RenderPipeline* pipeline_ = nullptr;
+ size_t group_id_;
+ size_t thread_id_;
+ std::vector<std::pair<ImageF*, Rect>> buffers_;
+ friend class RenderPipeline;
+};
+
+class RenderPipeline {
+ public:
+ class Builder {
+ public:
+ explicit Builder(size_t num_c) : num_c_(num_c) { JXL_ASSERT(num_c > 0); }
+
+ // Adds a stage to the pipeline. Must be called at least once; the last
+ // added stage cannot have kInOut channels.
+ void AddStage(std::unique_ptr<RenderPipelineStage> stage);
+
+ // Enables using the simple (i.e. non-memory-efficient) implementation of
+ // the pipeline.
+ void UseSimpleImplementation() { use_simple_implementation_ = true; }
+
+ // Finalizes setup of the pipeline. Shifts for all channels should be 0 at
+ // this point.
+ std::unique_ptr<RenderPipeline> Finalize(
+ FrameDimensions frame_dimensions) &&;
+
+ private:
+ std::vector<std::unique_ptr<RenderPipelineStage>> stages_;
+ size_t num_c_;
+ bool use_simple_implementation_ = false;
+ };
+
+ friend class Builder;
+
+ virtual ~RenderPipeline() = default;
+
+ Status IsInitialized() const {
+ for (const auto& stage : stages_) {
+ JXL_RETURN_IF_ERROR(stage->IsInitialized());
+ }
+ return true;
+ }
+
+ // Allocates storage to run with `num` threads. If `use_group_ids` is true,
+  // storage is allocated for each group, not each thread. The behaviour is
+  // undefined if this function is called multiple times with different values
+  // for `use_group_ids`.
+ Status PrepareForThreads(size_t num, bool use_group_ids);
+
+  // Retrieves a buffer where input data should be stored by the caller. When
+ // input has been provided for all buffers, the pipeline will complete its
+ // processing. This method may be called multiple times concurrently from
+ // different threads, provided that a different `thread_id` is given.
+ RenderPipelineInput GetInputBuffers(size_t group_id, size_t thread_id);
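+  // Illustrative sketch of the expected call pattern (builder,
+  // frame_dimensions, num_threads, group_id and thread_id are placeholders,
+  // not part of this API):
+  //
+  //   auto pipeline = std::move(builder).Finalize(frame_dimensions);
+  //   JXL_RETURN_IF_ERROR(pipeline->PrepareForThreads(num_threads,
+  //                                                   /*use_group_ids=*/false));
+  //   // For each group, on some thread:
+  //   RenderPipelineInput input =
+  //       pipeline->GetInputBuffers(group_id, thread_id);
+  //   // ... fill the Rect of each buffer returned by input.GetBuffer(c) ...
+  //   input.Done();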
+
+ size_t PassesWithAllInput() const {
+ return *std::min_element(group_completed_passes_.begin(),
+ group_completed_passes_.end());
+ }
+
+ virtual void ClearDone(size_t i) {}
+
+ protected:
+ std::vector<std::unique_ptr<RenderPipelineStage>> stages_;
+ // Shifts for every channel at the input of each stage.
+ std::vector<std::vector<std::pair<size_t, size_t>>> channel_shifts_;
+
+ // Amount of (cumulative) padding required by each stage and channel, in
+ // either direction.
+ std::vector<std::vector<std::pair<size_t, size_t>>> padding_;
+
+ FrameDimensions frame_dimensions_;
+
+ std::vector<uint8_t> group_completed_passes_;
+
+ friend class RenderPipelineInput;
+
+ private:
+ void InputReady(size_t group_id, size_t thread_id,
+ const std::vector<std::pair<ImageF*, Rect>>& buffers);
+
+ virtual std::vector<std::pair<ImageF*, Rect>> PrepareBuffers(
+ size_t group_id, size_t thread_id) = 0;
+
+ virtual void ProcessBuffers(size_t group_id, size_t thread_id) = 0;
+
+ // Note that this method may be called multiple times with different (or
+ // equal) `num`.
+ virtual void PrepareForThreadsInternal(size_t num, bool use_group_ids) = 0;
+
+ // Called once frame dimensions and stages are known.
+ virtual void Init() {}
+};
+
+} // namespace jxl
+
+#endif // LIB_JXL_RENDER_PIPELINE_RENDER_PIPELINE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline_stage.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline_stage.h
new file mode 100644
index 0000000000..d1a0074161
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline_stage.h
@@ -0,0 +1,171 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_RENDER_PIPELINE_STAGE_H_
+#define LIB_JXL_RENDER_PIPELINE_RENDER_PIPELINE_STAGE_H_
+
+#include <stdint.h>
+
+#include "lib/jxl/base/arch_macros.h"
+#include "lib/jxl/frame_header.h"
+
+namespace jxl {
+
+// The first pixel in the input to RenderPipelineStage will be located at
+// this position. Pixels before this position may be accessed as padding.
+// This should be at least RoundUpTo(maximum padding / 2, maximum vector size)
+// times 2; this maximum is reached when using Gaborish + EPF + upsampling +
+// chroma subsampling.
+#if JXL_ARCH_ARM
+constexpr size_t kRenderPipelineXOffset = 16;
+#else
+constexpr size_t kRenderPipelineXOffset = 32;
+#endif
+
+enum class RenderPipelineChannelMode {
+ // This channel is not modified by this stage.
+ kIgnored = 0,
+ // This channel is modified in-place.
+ kInPlace = 1,
+ // This channel is modified and written to a new buffer.
+ kInOut = 2,
+  // This channel is only read. Stages with a kInput channel are the only ones
+  // assumed to have observable effects, i.e. calls to ProcessRow for other
+  // stages may be omitted if it can be shown that they cannot affect any
+  // kInput stage's ProcessRow call that happens inside image boundaries.
+ kInput = 3,
+};
+
+class RenderPipeline;
+
+class RenderPipelineStage {
+ protected:
+ using Row = float*;
+ using ChannelRows = std::vector<Row>;
+
+ public:
+ using RowInfo = std::vector<ChannelRows>;
+ struct Settings {
+ // Amount of padding required in the various directions by all channels
+ // that have kInOut mode.
+ size_t border_x = 0;
+ size_t border_y = 0;
+
+    // Log2 of the number of columns/rows of output that this stage will
+    // produce for every input column/row of its kInOut channels.
+ size_t shift_x = 0;
+ size_t shift_y = 0;
+
+ static Settings ShiftX(size_t shift, size_t border) {
+ Settings settings;
+ settings.border_x = border;
+ settings.shift_x = shift;
+ return settings;
+ }
+
+ static Settings ShiftY(size_t shift, size_t border) {
+ Settings settings;
+ settings.border_y = border;
+ settings.shift_y = shift;
+ return settings;
+ }
+
+ static Settings Symmetric(size_t shift, size_t border) {
+ Settings settings;
+ settings.border_x = settings.border_y = border;
+ settings.shift_x = settings.shift_y = shift;
+ return settings;
+ }
+
+ static Settings SymmetricBorderOnly(size_t border) {
+ return Symmetric(0, border);
+ }
+ };
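+  // For instance, Settings::Symmetric(/*shift=*/1, /*border=*/2) describes a
+  // stage that doubles the resolution of its kInOut channels in both
+  // directions and reads 2 pixels of context around each input pixel, while
+  // Settings::SymmetricBorderOnly(1) describes a same-resolution filter that
+  // needs a 1-pixel border (e.g. a 3x3 convolution).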
+
+ virtual ~RenderPipelineStage() = default;
+
+ // Processes one row of input, producing the appropriate number of rows of
+ // output. Input/output rows can be obtained by calls to
+ // `GetInputRow`/`GetOutputRow`. `xsize+2*xextra` represents the total number
+ // of pixels to be processed in the input row, where the first pixel is at
+ // position `kRenderPipelineXOffset-xextra`. All pixels in the
+ // `[kRenderPipelineXOffset-xextra-border_x,
+ // kRenderPipelineXOffset+xsize+xextra+border_x)` range are initialized and
+ // accessible. `xpos` and `ypos` represent the position of the first
+ // (non-extra, i.e. in position kRenderPipelineXOffset) pixel in the center
+ // row of the input in the full image. `xpos` is a multiple of
+ // `GroupBorderAssigner::kPaddingXRound`. If `settings_.temp_buffer_size` is
+ // nonzero, `temp` will point to an HWY-aligned buffer of at least that number
+ // of floats; concurrent calls will have different buffers.
+ virtual void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+ size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+ size_t thread_id) const = 0;
+
+ // How each channel will be processed. Channels are numbered starting from
+ // color channels (always 3) and followed by all other channels.
+ virtual RenderPipelineChannelMode GetChannelMode(size_t c) const = 0;
+
+ protected:
+ explicit RenderPipelineStage(Settings settings) : settings_(settings) {}
+
+ virtual Status IsInitialized() const { return true; }
+
+ // Informs the stage about the total size of each channel. Few stages will
+ // actually need to use this information.
+ virtual void SetInputSizes(
+ const std::vector<std::pair<size_t, size_t>>& input_sizes) {}
+
+ virtual Status PrepareForThreads(size_t num_threads) { return true; }
+
+  // Returns a pointer to the input row of channel `c` at row `offset`, which
+  // must be in [-settings_.border_y, settings_.border_y]. `c` must be such
+ // that `GetChannelMode(c) != kIgnored`. The returned pointer points to the
+ // offset-ed row (i.e. kRenderPipelineXOffset has been applied).
+ float* GetInputRow(const RowInfo& input_rows, size_t c, int offset) const {
+ JXL_DASSERT(GetChannelMode(c) != RenderPipelineChannelMode::kIgnored);
+ JXL_DASSERT(-offset <= static_cast<int>(settings_.border_y));
+ JXL_DASSERT(offset <= static_cast<int>(settings_.border_y));
+ return input_rows[c][settings_.border_y + offset] + kRenderPipelineXOffset;
+ }
+ // Similar to `GetInputRow`, but can only be used if `GetChannelMode(c) ==
+  // kInOut`. `offset` must be less than `1<<settings_.shift_y`. The returned
+ // pointer points to the offset-ed row (i.e. kRenderPipelineXOffset has been
+ // applied).
+ float* GetOutputRow(const RowInfo& output_rows, size_t c,
+ size_t offset) const {
+ JXL_DASSERT(GetChannelMode(c) == RenderPipelineChannelMode::kInOut);
+ JXL_DASSERT(offset <= 1ul << settings_.shift_y);
+ return output_rows[c][offset] + kRenderPipelineXOffset;
+ }
+
+ // Indicates whether, from this stage on, the pipeline will operate on an
+ // image- rather than frame-sized buffer. Only one stage in the pipeline
+ // should return true, and it should implement ProcessPaddingRow below too.
+ // It is assumed that, if there is a SwitchToImageDimensions() == true stage,
+ // all kInput stages appear after it.
+ virtual bool SwitchToImageDimensions() const { return false; }
+
+ // If SwitchToImageDimensions returns true, then this should set xsize and
+ // ysize to the image size, and frame_origin to the location of the frame
+ // within the image. Otherwise, this is not called at all.
+ virtual void GetImageDimensions(size_t* xsize, size_t* ysize,
+ FrameOrigin* frame_origin) const {}
+
+ // Produces the appropriate output data outside of the frame dimensions. xpos
+ // and ypos are now relative to the full image.
+ virtual void ProcessPaddingRow(const RowInfo& output_rows, size_t xsize,
+ size_t xpos, size_t ypos) const {}
+
+ virtual const char* GetName() const = 0;
+
+ Settings settings_;
+ friend class RenderPipeline;
+ friend class SimpleRenderPipeline;
+ friend class LowMemoryRenderPipeline;
+};
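+
+// A minimal sketch of a custom stage, for illustration only (this class is
+// not part of libjxl; its name and the added constant are arbitrary):
+//
+//   class AddConstantStage : public RenderPipelineStage {
+//    public:
+//     AddConstantStage() : RenderPipelineStage(Settings()) {}
+//     void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+//                     size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+//                     size_t thread_id) const override {
+//       // kInPlace: read and write the same row of channel 0.
+//       float* row = GetInputRow(input_rows, /*c=*/0, /*offset=*/0);
+//       for (size_t x = 0; x < xsize; x++) row[x] += 0.5f;
+//     }
+//     RenderPipelineChannelMode GetChannelMode(size_t c) const override {
+//       return c == 0 ? RenderPipelineChannelMode::kInPlace
+//                     : RenderPipelineChannelMode::kIgnored;
+//     }
+//     const char* GetName() const override { return "AddConstant"; }
+//   };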
+
+} // namespace jxl
+
+#endif // LIB_JXL_RENDER_PIPELINE_RENDER_PIPELINE_STAGE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline_test.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline_test.cc
new file mode 100644
index 0000000000..51b9f273f8
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline_test.cc
@@ -0,0 +1,579 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/render_pipeline.h"
+
+#include <jxl/cms.h>
+
+#include <algorithm>
+#include <cctype>
+#include <cstdint>
+#include <cstdio>
+#include <ostream>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "lib/extras/codec.h"
+#include "lib/jxl/base/common.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/override.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/chroma_from_luma.h"
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/common.h" // JXL_HIGH_PRECISION, JPEGXL_ENABLE_TRANSCODE_JPEG
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/dec_cache.h"
+#include "lib/jxl/dec_frame.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/fake_parallel_runner_testonly.h"
+#include "lib/jxl/fields.h"
+#include "lib/jxl/frame_dimensions.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/headers.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_metadata.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/image_test_utils.h"
+#include "lib/jxl/jpeg/enc_jpeg_data.h"
+#include "lib/jxl/render_pipeline/test_render_pipeline_stages.h"
+#include "lib/jxl/splines.h"
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+Status DecodeFile(const Span<const uint8_t> file, bool use_slow_pipeline,
+ CodecInOut* io, ThreadPool* pool) {
+ Status ret = true;
+ {
+ BitReader reader(file);
+ BitReaderScopedCloser reader_closer(&reader, &ret);
+ JXL_RETURN_IF_ERROR(reader.ReadFixedBits<16>() == 0x0AFF);
+ JXL_RETURN_IF_ERROR(ReadSizeHeader(&reader, &io->metadata.size));
+ JXL_RETURN_IF_ERROR(ReadImageMetadata(&reader, &io->metadata.m));
+ io->metadata.transform_data.nonserialized_xyb_encoded =
+ io->metadata.m.xyb_encoded;
+ JXL_RETURN_IF_ERROR(Bundle::Read(&reader, &io->metadata.transform_data));
+ if (io->metadata.m.color_encoding.WantICC()) {
+ std::vector<uint8_t> icc;
+ JXL_RETURN_IF_ERROR(test::ReadICC(&reader, &icc));
+ JXL_RETURN_IF_ERROR(io->metadata.m.color_encoding.SetICC(
+ std::move(icc), JxlGetDefaultCms()));
+ }
+ PassesDecoderState dec_state;
+ JXL_RETURN_IF_ERROR(
+ dec_state.output_encoding_info.SetFromMetadata(io->metadata));
+ JXL_RETURN_IF_ERROR(reader.JumpToByteBoundary());
+ io->frames.clear();
+ FrameHeader frame_header(&io->metadata);
+ do {
+ io->frames.emplace_back(&io->metadata.m);
+ // Skip frames that are not displayed.
+ do {
+ size_t frame_start = reader.TotalBitsConsumed() / kBitsPerByte;
+ size_t size_left = file.size() - frame_start;
+ JXL_RETURN_IF_ERROR(DecodeFrame(&dec_state, pool,
+ file.data() + frame_start, size_left,
+ &frame_header, &io->frames.back(),
+ io->metadata, use_slow_pipeline));
+ reader.SkipBits(io->frames.back().decoded_bytes() * kBitsPerByte);
+ } while (frame_header.frame_type != FrameType::kRegularFrame &&
+ frame_header.frame_type != FrameType::kSkipProgressive);
+ } while (!frame_header.is_last);
+
+ if (io->frames.empty()) return JXL_FAILURE("Not enough data.");
+
+ if (reader.TotalBitsConsumed() != file.size() * kBitsPerByte) {
+ return JXL_FAILURE("Reader position not at EOF.");
+ }
+ if (!reader.AllReadsWithinBounds()) {
+ return JXL_FAILURE("Reader out of bounds read.");
+ }
+ io->CheckMetadata();
+ // reader is closed here.
+ }
+ return ret;
+}
+
+TEST(RenderPipelineTest, Build) {
+ RenderPipeline::Builder builder(/*num_c=*/1);
+ builder.AddStage(jxl::make_unique<UpsampleXSlowStage>());
+ builder.AddStage(jxl::make_unique<UpsampleYSlowStage>());
+ builder.AddStage(jxl::make_unique<Check0FinalStage>());
+ builder.UseSimpleImplementation();
+ FrameDimensions frame_dimensions;
+ frame_dimensions.Set(/*xsize=*/1024, /*ysize=*/1024, /*group_size_shift=*/0,
+ /*max_hshift=*/0, /*max_vshift=*/0,
+ /*modular_mode=*/false, /*upsampling=*/1);
+ std::move(builder).Finalize(frame_dimensions);
+}
+
+TEST(RenderPipelineTest, CallAllGroups) {
+ RenderPipeline::Builder builder(/*num_c=*/1);
+ builder.AddStage(jxl::make_unique<UpsampleXSlowStage>());
+ builder.AddStage(jxl::make_unique<UpsampleYSlowStage>());
+ builder.AddStage(jxl::make_unique<Check0FinalStage>());
+ builder.UseSimpleImplementation();
+ FrameDimensions frame_dimensions;
+ frame_dimensions.Set(/*xsize=*/1024, /*ysize=*/1024, /*group_size_shift=*/0,
+ /*max_hshift=*/0, /*max_vshift=*/0,
+ /*modular_mode=*/false, /*upsampling=*/1);
+ auto pipeline = std::move(builder).Finalize(frame_dimensions);
+ ASSERT_TRUE(pipeline->PrepareForThreads(1, /*use_group_ids=*/false));
+
+ for (size_t i = 0; i < frame_dimensions.num_groups; i++) {
+ auto input_buffers = pipeline->GetInputBuffers(i, 0);
+ FillPlane(0.0f, input_buffers.GetBuffer(0).first,
+ input_buffers.GetBuffer(0).second);
+ input_buffers.Done();
+ }
+
+ EXPECT_EQ(pipeline->PassesWithAllInput(), 1);
+}
+
+TEST(RenderPipelineTest, BuildFast) {
+ RenderPipeline::Builder builder(/*num_c=*/1);
+ builder.AddStage(jxl::make_unique<UpsampleXSlowStage>());
+ builder.AddStage(jxl::make_unique<UpsampleYSlowStage>());
+ builder.AddStage(jxl::make_unique<Check0FinalStage>());
+ FrameDimensions frame_dimensions;
+ frame_dimensions.Set(/*xsize=*/1024, /*ysize=*/1024, /*group_size_shift=*/0,
+ /*max_hshift=*/0, /*max_vshift=*/0,
+ /*modular_mode=*/false, /*upsampling=*/1);
+ std::move(builder).Finalize(frame_dimensions);
+}
+
+TEST(RenderPipelineTest, CallAllGroupsFast) {
+ RenderPipeline::Builder builder(/*num_c=*/1);
+ builder.AddStage(jxl::make_unique<UpsampleXSlowStage>());
+ builder.AddStage(jxl::make_unique<UpsampleYSlowStage>());
+ builder.AddStage(jxl::make_unique<Check0FinalStage>());
+ FrameDimensions frame_dimensions;
+ frame_dimensions.Set(/*xsize=*/1024, /*ysize=*/1024, /*group_size_shift=*/0,
+ /*max_hshift=*/0, /*max_vshift=*/0,
+ /*modular_mode=*/false, /*upsampling=*/1);
+ auto pipeline = std::move(builder).Finalize(frame_dimensions);
+ ASSERT_TRUE(pipeline->PrepareForThreads(1, /*use_group_ids=*/false));
+
+ for (size_t i = 0; i < frame_dimensions.num_groups; i++) {
+ auto input_buffers = pipeline->GetInputBuffers(i, 0);
+ FillPlane(0.0f, input_buffers.GetBuffer(0).first,
+ input_buffers.GetBuffer(0).second);
+ input_buffers.Done();
+ }
+
+ EXPECT_EQ(pipeline->PassesWithAllInput(), 1);
+}
+
+struct RenderPipelineTestInputSettings {
+ // Input image.
+ std::string input_path;
+ size_t xsize, ysize;
+ bool jpeg_transcode = false;
+ // Encoding settings.
+ CompressParams cparams;
+ // Short name for the encoder settings.
+ std::string cparams_descr;
+
+ bool add_spot_color = false;
+
+ Splines splines;
+};
+
+class RenderPipelineTestParam
+ : public ::testing::TestWithParam<RenderPipelineTestInputSettings> {};
+
+TEST_P(RenderPipelineTestParam, PipelineTest) {
+ RenderPipelineTestInputSettings config = GetParam();
+
+ // Use a parallel runner that randomly shuffles tasks to detect possible
+ // border handling bugs.
+ FakeParallelRunner fake_pool(/*order_seed=*/123, /*num_threads=*/8);
+ ThreadPool pool(&JxlFakeParallelRunner, &fake_pool);
+ const std::vector<uint8_t> orig = jxl::test::ReadTestData(config.input_path);
+
+ CodecInOut io;
+ if (config.jpeg_transcode) {
+ ASSERT_TRUE(jpeg::DecodeImageJPG(Bytes(orig), &io));
+ } else {
+ ASSERT_TRUE(SetFromBytes(Bytes(orig), &io, &pool));
+ }
+ io.ShrinkTo(config.xsize, config.ysize);
+
+ if (config.add_spot_color) {
+ jxl::ImageF spot(config.xsize, config.ysize);
+ jxl::ZeroFillImage(&spot);
+
+ for (size_t y = 0; y < config.ysize; y++) {
+ float* JXL_RESTRICT row = spot.Row(y);
+ for (size_t x = 0; x < config.xsize; x++) {
+ row[x] = ((x ^ y) & 255) * (1.f / 255.f);
+ }
+ }
+ ExtraChannelInfo info;
+ info.bit_depth.bits_per_sample = 8;
+ info.dim_shift = 0;
+ info.type = jxl::ExtraChannel::kSpotColor;
+ info.spot_color[0] = 0.5f;
+ info.spot_color[1] = 0.2f;
+ info.spot_color[2] = 1.f;
+ info.spot_color[3] = 0.5f;
+
+ io.metadata.m.extra_channel_info.push_back(info);
+ std::vector<jxl::ImageF> ec;
+ ec.push_back(std::move(spot));
+ io.frames[0].SetExtraChannels(std::move(ec));
+ }
+
+ std::vector<uint8_t> compressed;
+
+ config.cparams.custom_splines = config.splines;
+ ASSERT_TRUE(test::EncodeFile(config.cparams, &io, &compressed, &pool));
+
+ CodecInOut io_default;
+ ASSERT_TRUE(DecodeFile(Bytes(compressed),
+ /*use_slow_pipeline=*/false, &io_default, &pool));
+ CodecInOut io_slow_pipeline;
+ ASSERT_TRUE(DecodeFile(Bytes(compressed),
+ /*use_slow_pipeline=*/true, &io_slow_pipeline, &pool));
+
+ ASSERT_EQ(io_default.frames.size(), io_slow_pipeline.frames.size());
+ for (size_t i = 0; i < io_default.frames.size(); i++) {
+#if JXL_HIGH_PRECISION
+ constexpr float kMaxError = 5e-5;
+#else
+ constexpr float kMaxError = 5e-4;
+#endif
+ Image3F def = std::move(*io_default.frames[i].color());
+ Image3F pip = std::move(*io_slow_pipeline.frames[i].color());
+ JXL_ASSERT_OK(VerifyRelativeError(pip, def, kMaxError, kMaxError, _));
+ for (size_t ec = 0; ec < io_default.frames[i].extra_channels().size();
+ ec++) {
+ JXL_ASSERT_OK(VerifyRelativeError(
+ io_slow_pipeline.frames[i].extra_channels()[ec],
+ io_default.frames[i].extra_channels()[ec], kMaxError, kMaxError, _));
+ }
+ }
+}
+
+Splines CreateTestSplines() {
+ const ColorCorrelationMap cmap;
+ std::vector<Spline::Point> control_points{{9, 54}, {118, 159}, {97, 3},
+ {10, 40}, {150, 25}, {120, 300}};
+ const Spline spline{
+ control_points,
+ /*color_dct=*/
+ {{0.03125f, 0.00625f, 0.003125f}, {1.f, 0.321875f}, {1.f, 0.24375f}},
+ /*sigma_dct=*/{0.3125f, 0.f, 0.f, 0.0625f}};
+ std::vector<Spline> spline_data = {spline};
+ std::vector<QuantizedSpline> quantized_splines;
+ std::vector<Spline::Point> starting_points;
+ for (const Spline& spline : spline_data) {
+ quantized_splines.emplace_back(spline, /*quantization_adjustment=*/0,
+ cmap.YtoXRatio(0), cmap.YtoBRatio(0));
+ starting_points.push_back(spline.control_points.front());
+ }
+ return Splines(/*quantization_adjustment=*/0, std::move(quantized_splines),
+ std::move(starting_points));
+}
+
+std::vector<RenderPipelineTestInputSettings> GeneratePipelineTests() {
+ std::vector<RenderPipelineTestInputSettings> all_tests;
+
+ std::pair<size_t, size_t> sizes[] = {
+ {3, 8}, {128, 128}, {256, 256}, {258, 258}, {533, 401}, {777, 777},
+ };
+
+ for (auto size : sizes) {
+ RenderPipelineTestInputSettings settings;
+ settings.input_path = "jxl/flower/flower.png";
+ settings.xsize = size.first;
+ settings.ysize = size.second;
+
+ // Base settings.
+ settings.cparams.butteraugli_distance = 1.0;
+ settings.cparams.patches = Override::kOff;
+ settings.cparams.dots = Override::kOff;
+ settings.cparams.gaborish = Override::kOff;
+ settings.cparams.epf = 0;
+ settings.cparams.color_transform = ColorTransform::kXYB;
+
+ {
+ auto s = settings;
+ s.cparams_descr = "NoGabNoEpfNoPatches";
+ all_tests.push_back(s);
+ }
+
+ {
+ auto s = settings;
+ s.cparams.color_transform = ColorTransform::kNone;
+ s.cparams_descr = "NoGabNoEpfNoPatchesNoXYB";
+ all_tests.push_back(s);
+ }
+
+ {
+ auto s = settings;
+ s.cparams.gaborish = Override::kOn;
+ s.cparams_descr = "GabNoEpfNoPatches";
+ all_tests.push_back(s);
+ }
+
+ {
+ auto s = settings;
+ s.cparams.epf = 1;
+ s.cparams_descr = "NoGabEpf1NoPatches";
+ all_tests.push_back(s);
+ }
+
+ {
+ auto s = settings;
+ s.cparams.epf = 2;
+ s.cparams_descr = "NoGabEpf2NoPatches";
+ all_tests.push_back(s);
+ }
+
+ {
+ auto s = settings;
+ s.cparams.epf = 3;
+ s.cparams_descr = "NoGabEpf3NoPatches";
+ all_tests.push_back(s);
+ }
+
+ {
+ auto s = settings;
+ s.cparams.gaborish = Override::kOn;
+ s.cparams.epf = 3;
+ s.cparams_descr = "GabEpf3NoPatches";
+ all_tests.push_back(s);
+ }
+
+ {
+ auto s = settings;
+ s.cparams_descr = "Splines";
+ s.splines = CreateTestSplines();
+ all_tests.push_back(s);
+ }
+
+ for (size_t ups : {2, 4, 8}) {
+ {
+ auto s = settings;
+ s.cparams.resampling = ups;
+ s.cparams_descr = "Ups" + std::to_string(ups);
+ all_tests.push_back(s);
+ }
+ {
+ auto s = settings;
+ s.cparams.resampling = ups;
+ s.cparams.epf = 1;
+ s.cparams_descr = "Ups" + std::to_string(ups) + "EPF1";
+ all_tests.push_back(s);
+ }
+ {
+ auto s = settings;
+ s.cparams.resampling = ups;
+ s.cparams.gaborish = Override::kOn;
+ s.cparams.epf = 1;
+ s.cparams_descr = "Ups" + std::to_string(ups) + "GabEPF1";
+ all_tests.push_back(s);
+ }
+ }
+
+ {
+ auto s = settings;
+ s.cparams_descr = "Noise";
+ s.cparams.photon_noise_iso = 3200;
+ all_tests.push_back(s);
+ }
+
+ {
+ auto s = settings;
+ s.cparams_descr = "NoiseUps";
+ s.cparams.photon_noise_iso = 3200;
+ s.cparams.resampling = 2;
+ all_tests.push_back(s);
+ }
+
+ {
+ auto s = settings;
+ s.cparams_descr = "ModularLossless";
+ s.cparams.modular_mode = true;
+ s.cparams.butteraugli_distance = 0;
+ all_tests.push_back(s);
+ }
+
+ {
+ auto s = settings;
+ s.cparams_descr = "ProgressiveDC";
+ s.cparams.progressive_dc = 1;
+ all_tests.push_back(s);
+ }
+
+ {
+ auto s = settings;
+ s.cparams_descr = "ModularLossy";
+ s.cparams.modular_mode = true;
+ s.cparams.butteraugli_distance = 1.f;
+ all_tests.push_back(s);
+ }
+
+ {
+ auto s = settings;
+ s.input_path = "jxl/flower/flower_alpha.png";
+ s.cparams_descr = "AlphaVarDCT";
+ all_tests.push_back(s);
+ }
+
+ {
+ auto s = settings;
+ s.input_path = "jxl/flower/flower_alpha.png";
+ s.cparams_descr = "AlphaVarDCTUpsamplingEPF";
+ s.cparams.epf = 1;
+ s.cparams.ec_resampling = 2;
+ all_tests.push_back(s);
+ }
+
+ {
+ auto s = settings;
+ s.cparams.modular_mode = true;
+ s.cparams.butteraugli_distance = 0;
+ s.input_path = "jxl/flower/flower_alpha.png";
+ s.cparams_descr = "AlphaLossless";
+ all_tests.push_back(s);
+ }
+
+ {
+ auto s = settings;
+ s.input_path = "jxl/flower/flower_alpha.png";
+ s.cparams_descr = "AlphaDownsample";
+ s.cparams.ec_resampling = 2;
+ all_tests.push_back(s);
+ }
+
+ {
+ auto s = settings;
+ s.cparams_descr = "SpotColor";
+ s.add_spot_color = true;
+ all_tests.push_back(s);
+ }
+ }
+
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+ for (const char* input : {"jxl/flower/flower.png.im_q85_444.jpg",
+ "jxl/flower/flower.png.im_q85_420.jpg",
+ "jxl/flower/flower.png.im_q85_422.jpg",
+ "jxl/flower/flower.png.im_q85_440.jpg"}) {
+ RenderPipelineTestInputSettings settings;
+ settings.input_path = input;
+ settings.jpeg_transcode = true;
+ settings.xsize = 2268;
+ settings.ysize = 1512;
+ settings.cparams_descr = "Default";
+ all_tests.push_back(settings);
+ }
+
+#endif
+
+ {
+ RenderPipelineTestInputSettings settings;
+ settings.input_path = "jxl/grayscale_patches.png";
+ settings.xsize = 1011;
+ settings.ysize = 277;
+ settings.cparams_descr = "Patches";
+ all_tests.push_back(settings);
+ }
+
+ {
+ RenderPipelineTestInputSettings settings;
+ settings.input_path = "jxl/grayscale_patches.png";
+ settings.xsize = 1011;
+ settings.ysize = 277;
+ settings.cparams.photon_noise_iso = 1000;
+ settings.cparams_descr = "PatchesAndNoise";
+ all_tests.push_back(settings);
+ }
+
+ {
+ RenderPipelineTestInputSettings settings;
+ settings.input_path = "jxl/grayscale_patches.png";
+ settings.xsize = 1011;
+ settings.ysize = 277;
+ settings.cparams.resampling = 2;
+ settings.cparams_descr = "PatchesAndUps2";
+ all_tests.push_back(settings);
+ }
+
+ return all_tests;
+}
+
+std::ostream& operator<<(std::ostream& os,
+ const RenderPipelineTestInputSettings& c) {
+ std::string filename;
+ size_t pos = c.input_path.find_last_of('/');
+ if (pos == std::string::npos) {
+ filename = c.input_path;
+ } else {
+ filename = c.input_path.substr(pos + 1);
+ }
+ std::replace_if(
+ filename.begin(), filename.end(), [](char c) { return !isalnum(c); },
+ '_');
+ os << filename << "_" << (c.jpeg_transcode ? "JPEG_" : "") << c.xsize << "x"
+ << c.ysize << "_" << c.cparams_descr;
+ return os;
+}
+
+std::string PipelineTestDescription(
+ const testing::TestParamInfo<RenderPipelineTestParam::ParamType>& info) {
+ std::stringstream name;
+ name << info.param;
+ return name.str();
+}
+
+JXL_GTEST_INSTANTIATE_TEST_SUITE_P(RenderPipelineTest, RenderPipelineTestParam,
+ testing::ValuesIn(GeneratePipelineTests()),
+ PipelineTestDescription);
+
+TEST(RenderPipelineDecodingTest, Animation) {
+ FakeParallelRunner fake_pool(/*order_seed=*/123, /*num_threads=*/8);
+ ThreadPool pool(&JxlFakeParallelRunner, &fake_pool);
+
+ std::vector<uint8_t> compressed =
+ jxl::test::ReadTestData("jxl/blending/cropped_traffic_light.jxl");
+
+ CodecInOut io_default;
+ ASSERT_TRUE(DecodeFile(Bytes(compressed),
+ /*use_slow_pipeline=*/false, &io_default, &pool));
+ CodecInOut io_slow_pipeline;
+ ASSERT_TRUE(DecodeFile(Bytes(compressed),
+ /*use_slow_pipeline=*/true, &io_slow_pipeline, &pool));
+
+ ASSERT_EQ(io_default.frames.size(), io_slow_pipeline.frames.size());
+ for (size_t i = 0; i < io_default.frames.size(); i++) {
+#if JXL_HIGH_PRECISION
+ constexpr float kMaxError = 1e-5;
+#else
+ constexpr float kMaxError = 1e-4;
+#endif
+
+ Image3F fast_pipeline = std::move(*io_default.frames[i].color());
+ Image3F slow_pipeline = std::move(*io_slow_pipeline.frames[i].color());
+ JXL_ASSERT_OK(VerifyRelativeError(slow_pipeline, fast_pipeline, kMaxError,
+                                      kMaxError, _));
+ for (size_t ec = 0; ec < io_default.frames[i].extra_channels().size();
+ ec++) {
+ JXL_ASSERT_OK(VerifyRelativeError(
+ io_slow_pipeline.frames[i].extra_channels()[ec],
+ io_default.frames[i].extra_channels()[ec], kMaxError, kMaxError, _));
+ }
+ }
+}
+
+} // namespace
+} // namespace jxl
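
Each entry returned by GeneratePipelineTests above becomes one parameterized test instance, with its name derived from operator<< and PipelineTestDescription. As a usage sketch (not part of the patch; the descriptor string is made up), an additional configuration would be appended inside the per-size loop like this:

{
  auto s = settings;
  s.cparams.gaborish = Override::kOn;
  s.cparams.epf = 2;
  s.cparams_descr = "GabEpf2NoPatches";  // ends up in the gtest instance name
  all_tests.push_back(s);
}
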
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/simple_render_pipeline.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/simple_render_pipeline.cc
new file mode 100644
index 0000000000..4495288860
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/simple_render_pipeline.cc
@@ -0,0 +1,266 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/simple_render_pipeline.h"
+
+#include <hwy/base.h>
+
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+#include "lib/jxl/sanitizers.h"
+
+namespace jxl {
+
+void SimpleRenderPipeline::PrepareForThreadsInternal(size_t num,
+ bool use_group_ids) {
+ if (!channel_data_.empty()) {
+ return;
+ }
+ auto ch_size = [](size_t frame_size, size_t shift) {
+ return DivCeil(frame_size, 1 << shift) + kRenderPipelineXOffset * 2;
+ };
+ for (size_t c = 0; c < channel_shifts_[0].size(); c++) {
+ channel_data_.push_back(ImageF(
+ ch_size(frame_dimensions_.xsize_upsampled, channel_shifts_[0][c].first),
+ ch_size(frame_dimensions_.ysize_upsampled,
+ channel_shifts_[0][c].second)));
+ msan::PoisonImage(channel_data_.back());
+ }
+}
+
+Rect SimpleRenderPipeline::MakeChannelRect(size_t group_id, size_t channel) {
+ size_t base_color_shift =
+ CeilLog2Nonzero(frame_dimensions_.xsize_upsampled_padded /
+ frame_dimensions_.xsize_padded);
+
+ const size_t gx = group_id % frame_dimensions_.xsize_groups;
+ const size_t gy = group_id / frame_dimensions_.xsize_groups;
+ size_t xgroupdim = (frame_dimensions_.group_dim << base_color_shift) >>
+ channel_shifts_[0][channel].first;
+ size_t ygroupdim = (frame_dimensions_.group_dim << base_color_shift) >>
+ channel_shifts_[0][channel].second;
+ return Rect(
+ kRenderPipelineXOffset + gx * xgroupdim,
+ kRenderPipelineXOffset + gy * ygroupdim, xgroupdim, ygroupdim,
+ kRenderPipelineXOffset + DivCeil(frame_dimensions_.xsize_upsampled,
+ 1 << channel_shifts_[0][channel].first),
+ kRenderPipelineXOffset +
+ DivCeil(frame_dimensions_.ysize_upsampled,
+ 1 << channel_shifts_[0][channel].second));
+}
+
+std::vector<std::pair<ImageF*, Rect>> SimpleRenderPipeline::PrepareBuffers(
+ size_t group_id, size_t thread_id) {
+ std::vector<std::pair<ImageF*, Rect>> ret;
+ for (size_t c = 0; c < channel_data_.size(); c++) {
+ ret.emplace_back(&channel_data_[c], MakeChannelRect(group_id, c));
+ }
+ return ret;
+}
+
+void SimpleRenderPipeline::ProcessBuffers(size_t group_id, size_t thread_id) {
+ for (size_t c = 0; c < channel_data_.size(); c++) {
+ Rect r = MakeChannelRect(group_id, c);
+ (void)r;
+ JXL_CHECK_PLANE_INITIALIZED(channel_data_[c], r, c);
+ }
+
+ if (PassesWithAllInput() <= processed_passes_) return;
+ processed_passes_++;
+
+ for (size_t stage_id = 0; stage_id < stages_.size(); stage_id++) {
+ const auto& stage = stages_[stage_id];
+ // Prepare buffers for kInOut channels.
+ std::vector<ImageF> new_channels(channel_data_.size());
+ std::vector<ImageF*> output_channels(channel_data_.size());
+
+ std::vector<std::pair<size_t, size_t>> input_sizes(channel_data_.size());
+ for (size_t c = 0; c < channel_data_.size(); c++) {
+ input_sizes[c] =
+ std::make_pair(channel_data_[c].xsize() - kRenderPipelineXOffset * 2,
+ channel_data_[c].ysize() - kRenderPipelineXOffset * 2);
+ }
+
+ for (size_t c = 0; c < channel_data_.size(); c++) {
+ if (stage->GetChannelMode(c) != RenderPipelineChannelMode::kInOut) {
+ continue;
+ }
+ // Ensure that the newly allocated channels are large enough to avoid
+ // problems with padding.
+ new_channels[c] =
+ ImageF(frame_dimensions_.xsize_upsampled_padded +
+ kRenderPipelineXOffset * 2 + hwy::kMaxVectorSize * 8,
+ frame_dimensions_.ysize_upsampled_padded +
+ kRenderPipelineXOffset * 2);
+ new_channels[c].ShrinkTo(
+ (input_sizes[c].first << stage->settings_.shift_x) +
+ kRenderPipelineXOffset * 2,
+ (input_sizes[c].second << stage->settings_.shift_y) +
+ kRenderPipelineXOffset * 2);
+ output_channels[c] = &new_channels[c];
+ }
+
+ auto get_row = [&](size_t c, int64_t y) {
+ return channel_data_[c].Row(kRenderPipelineXOffset + y) +
+ kRenderPipelineXOffset;
+ };
+
+    // Add mirrored pixels to all kInOut channels.
+ for (size_t c = 0; c < channel_data_.size(); c++) {
+ if (stage->GetChannelMode(c) != RenderPipelineChannelMode::kInOut) {
+ continue;
+ }
+ // Horizontal mirroring.
+ for (size_t y = 0; y < input_sizes[c].second; y++) {
+ float* row = get_row(c, y);
+ for (size_t ix = 0; ix < stage->settings_.border_x; ix++) {
+ *(row - ix - 1) = row[Mirror(-ssize_t(ix) - 1, input_sizes[c].first)];
+ }
+ for (size_t ix = 0; ix < stage->settings_.border_x; ix++) {
+ *(row + ix + input_sizes[c].first) =
+ row[Mirror(ix + input_sizes[c].first, input_sizes[c].first)];
+ }
+ }
+ // Vertical mirroring.
+ for (int y = 0; y < static_cast<int>(stage->settings_.border_y); y++) {
+ memcpy(get_row(c, -y - 1) - stage->settings_.border_x,
+ get_row(c, Mirror(-ssize_t(y) - 1, input_sizes[c].second)) -
+ stage->settings_.border_x,
+ sizeof(float) *
+ (input_sizes[c].first + 2 * stage->settings_.border_x));
+ }
+ for (int y = 0; y < static_cast<int>(stage->settings_.border_y); y++) {
+ memcpy(
+ get_row(c, input_sizes[c].second + y) - stage->settings_.border_x,
+ get_row(c,
+ Mirror(input_sizes[c].second + y, input_sizes[c].second)) -
+ stage->settings_.border_x,
+ sizeof(float) *
+ (input_sizes[c].first + 2 * stage->settings_.border_x));
+ }
+ }
+
+ size_t ysize = 0;
+ size_t xsize = 0;
+ for (size_t c = 0; c < channel_data_.size(); c++) {
+ if (stage->GetChannelMode(c) == RenderPipelineChannelMode::kIgnored) {
+ continue;
+ }
+ ysize = std::max(input_sizes[c].second, ysize);
+ xsize = std::max(input_sizes[c].first, xsize);
+ }
+
+ JXL_ASSERT(ysize != 0);
+ JXL_ASSERT(xsize != 0);
+
+ RenderPipelineStage::RowInfo input_rows(channel_data_.size());
+ RenderPipelineStage::RowInfo output_rows(channel_data_.size());
+
+ // Run the pipeline.
+ {
+ stage->SetInputSizes(input_sizes);
+ int border_y = stage->settings_.border_y;
+ for (size_t y = 0; y < ysize; y++) {
+ // Prepare input rows.
+ for (size_t c = 0; c < channel_data_.size(); c++) {
+ if (stage->GetChannelMode(c) == RenderPipelineChannelMode::kIgnored) {
+ continue;
+ }
+ input_rows[c].resize(2 * border_y + 1);
+ for (int iy = -border_y; iy <= border_y; iy++) {
+ input_rows[c][iy + border_y] =
+ channel_data_[c].Row(y + kRenderPipelineXOffset + iy);
+ }
+ }
+ // Prepare output rows.
+ for (size_t c = 0; c < channel_data_.size(); c++) {
+ if (!output_channels[c]) continue;
+ output_rows[c].resize(1 << stage->settings_.shift_y);
+ for (size_t iy = 0; iy < output_rows[c].size(); iy++) {
+ output_rows[c][iy] = output_channels[c]->Row(
+ (y << stage->settings_.shift_y) + iy + kRenderPipelineXOffset);
+ }
+ }
+ stage->ProcessRow(input_rows, output_rows, /*xextra=*/0, xsize,
+ /*xpos=*/0, y, thread_id);
+ }
+ }
+
+ // Move new channels to current channels.
+ for (size_t c = 0; c < channel_data_.size(); c++) {
+ if (stage->GetChannelMode(c) != RenderPipelineChannelMode::kInOut) {
+ continue;
+ }
+ channel_data_[c] = std::move(new_channels[c]);
+ }
+ for (size_t c = 0; c < channel_data_.size(); c++) {
+ size_t next_stage = std::min(stage_id + 1, channel_shifts_.size() - 1);
+ size_t xsize = DivCeil(frame_dimensions_.xsize_upsampled,
+ 1 << channel_shifts_[next_stage][c].first);
+ size_t ysize = DivCeil(frame_dimensions_.ysize_upsampled,
+ 1 << channel_shifts_[next_stage][c].second);
+ channel_data_[c].ShrinkTo(xsize + 2 * kRenderPipelineXOffset,
+ ysize + 2 * kRenderPipelineXOffset);
+ JXL_CHECK_PLANE_INITIALIZED(
+ channel_data_[c],
+ Rect(kRenderPipelineXOffset, kRenderPipelineXOffset, xsize, ysize),
+ c);
+ }
+
+ if (stage->SwitchToImageDimensions()) {
+ size_t image_xsize, image_ysize;
+ FrameOrigin frame_origin;
+ stage->GetImageDimensions(&image_xsize, &image_ysize, &frame_origin);
+ frame_dimensions_.Set(image_xsize, image_ysize, 0, 0, 0, false, 1);
+ std::vector<ImageF> old_channels = std::move(channel_data_);
+ channel_data_.clear();
+ channel_data_.reserve(old_channels.size());
+ for (size_t c = 0; c < old_channels.size(); c++) {
+ channel_data_.emplace_back(2 * kRenderPipelineXOffset + image_xsize,
+ 2 * kRenderPipelineXOffset + image_ysize);
+ }
+ for (size_t y = 0; y < image_ysize; ++y) {
+ for (size_t c = 0; c < channel_data_.size(); c++) {
+ output_rows[c].resize(1);
+ output_rows[c][0] = channel_data_[c].Row(kRenderPipelineXOffset + y);
+ }
+ // TODO(sboukortt): consider doing this only on the parts of the
+ // background that won't be occluded.
+ stage->ProcessPaddingRow(output_rows, image_xsize, 0, y);
+ }
+ ssize_t x0 = frame_origin.x0;
+ ssize_t y0 = frame_origin.y0;
+ size_t x0_fg = 0;
+ size_t y0_fg = 0;
+ if (x0 < 0) {
+ xsize += x0;
+ x0_fg -= x0;
+ x0 = 0;
+ }
+ if (x0 + xsize > image_xsize) {
+ xsize = image_xsize - x0;
+ }
+ if (y0 < 0) {
+ ysize += y0;
+      y0_fg -= y0;
+ y0 = 0;
+ }
+ if (y0 + ysize > image_ysize) {
+ ysize = image_ysize - y0;
+ }
+ const Rect rect_fg_relative_to_image =
+ Rect(x0, y0, xsize, ysize)
+ .Translate(kRenderPipelineXOffset, kRenderPipelineXOffset);
+ const Rect rect_fg =
+ Rect(x0_fg, y0_fg, xsize, ysize)
+ .Translate(kRenderPipelineXOffset, kRenderPipelineXOffset);
+ for (size_t c = 0; c < channel_data_.size(); c++) {
+ CopyImageTo(rect_fg, old_channels[c], rect_fg_relative_to_image,
+ &channel_data_[c]);
+ }
+ }
+ }
+}
+} // namespace jxl
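
The border handling in ProcessBuffers relies on mirrored padding around each kInOut channel. Below is a standalone sketch of the horizontal case (illustrative only, not part of the patch); it inlines the reflection under the assumption that Mirror(-1, n) == 0, i.e. the edge sample is repeated into the padding.

#include <cstddef>
#include <vector>

// Fill `border` padding samples on each side of a row of `xsize` samples by
// mirroring, matching the horizontal-mirroring loops above. `padded` must
// hold at least xsize + 2 * border floats.
void MirrorRowBorders(std::vector<float>& padded, size_t border, size_t xsize) {
  float* row = padded.data() + border;  // interior starts after the padding
  auto mirror = [xsize](ptrdiff_t x) -> size_t {
    while (x < 0 || x >= static_cast<ptrdiff_t>(xsize)) {
      x = x < 0 ? -x - 1 : 2 * static_cast<ptrdiff_t>(xsize) - 1 - x;
    }
    return static_cast<size_t>(x);
  };
  for (size_t ix = 0; ix < border; ix++) {
    const ptrdiff_t left = -static_cast<ptrdiff_t>(ix) - 1;
    row[left] = row[mirror(left)];
    row[xsize + ix] = row[mirror(static_cast<ptrdiff_t>(xsize + ix))];
  }
}
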
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/simple_render_pipeline.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/simple_render_pipeline.h
new file mode 100644
index 0000000000..10f4505912
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/simple_render_pipeline.h
@@ -0,0 +1,37 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_SIMPLE_RENDER_PIPELINE_H_
+#define LIB_JXL_RENDER_PIPELINE_SIMPLE_RENDER_PIPELINE_H_
+
+#include <stdint.h>
+
+#include "lib/jxl/render_pipeline/render_pipeline.h"
+
+namespace jxl {
+
+// A RenderPipeline that is "obviously correct"; it may use large amounts of
+// memory and be slow. It is intended mostly for testing purposes.
+class SimpleRenderPipeline : public RenderPipeline {
+ std::vector<std::pair<ImageF*, Rect>> PrepareBuffers(
+ size_t group_id, size_t thread_id) override;
+
+ void ProcessBuffers(size_t group_id, size_t thread_id) override;
+
+ void PrepareForThreadsInternal(size_t num, bool use_group_ids) override;
+
+ // Full frame buffers. Both X and Y dimensions are padded by
+ // kRenderPipelineXOffset.
+ std::vector<ImageF> channel_data_;
+ size_t processed_passes_ = 0;
+
+ private:
+ Rect MakeChannelRect(size_t group_id, size_t channel);
+};
+
+} // namespace jxl
+
+#endif // LIB_JXL_RENDER_PIPELINE_SIMPLE_RENDER_PIPELINE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_blending.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_blending.cc
new file mode 100644
index 0000000000..b68105f4c9
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_blending.cc
@@ -0,0 +1,250 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_blending.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_blending.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/blending.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+class BlendingStage : public RenderPipelineStage {
+ public:
+ explicit BlendingStage(const FrameHeader& frame_header,
+ const PassesDecoderState* dec_state,
+ const ColorEncoding& frame_color_encoding)
+ : RenderPipelineStage(RenderPipelineStage::Settings()),
+ frame_header_(frame_header),
+ state_(*dec_state->shared) {
+ image_xsize_ = frame_header_.nonserialized_metadata->xsize();
+ image_ysize_ = frame_header_.nonserialized_metadata->ysize();
+ extra_channel_info_ =
+ &frame_header_.nonserialized_metadata->m.extra_channel_info;
+ info_ = frame_header_.blending_info;
+ const std::vector<BlendingInfo>& ec_info =
+ frame_header_.extra_channel_blending_info;
+ const ImageBundle& bg = state_.reference_frames[info_.source].frame;
+ bg_ = &bg;
+ if (bg.xsize() == 0 || bg.ysize() == 0) {
+ zeroes_.resize(image_xsize_, 0.f);
+ } else if (state_.reference_frames[info_.source].ib_is_in_xyb) {
+ initialized_ = JXL_FAILURE(
+ "Trying to blend XYB reference frame %i and non-XYB frame",
+ info_.source);
+ return;
+ } else if (std::any_of(ec_info.begin(), ec_info.end(),
+ [this](const BlendingInfo& info) {
+ const ImageBundle& bg =
+ state_.reference_frames[info.source].frame;
+ return bg.xsize() == 0 || bg.ysize() == 0;
+ })) {
+ zeroes_.resize(image_xsize_, 0.f);
+ }
+
+ auto verify_bg_size = [&](const ImageBundle& bg) -> Status {
+ if (bg.xsize() != 0 && bg.ysize() != 0 &&
+ (bg.xsize() < image_xsize_ || bg.ysize() < image_ysize_ ||
+ bg.origin.x0 != 0 || bg.origin.y0 != 0)) {
+ return JXL_FAILURE("Trying to use a %" PRIuS "x%" PRIuS
+ " crop as a background",
+ bg.xsize(), bg.ysize());
+ }
+ return true;
+ };
+
+ Status ok = verify_bg_size(bg);
+ for (const auto& info : ec_info) {
+ const ImageBundle& bg = state_.reference_frames[info.source].frame;
+ if (!!ok) ok = verify_bg_size(bg);
+ }
+ if (!ok) {
+ initialized_ = ok;
+ return;
+ }
+
+ if (state_.metadata->m.xyb_encoded) {
+ if (!dec_state->output_encoding_info.color_encoding_is_original) {
+ initialized_ = JXL_FAILURE("Blending in unsupported color space");
+ return;
+ }
+ }
+
+ blending_info_.resize(ec_info.size() + 1);
+ auto make_blending = [&](const BlendingInfo& info, PatchBlending* pb) {
+ pb->alpha_channel = info.alpha_channel;
+ pb->clamp = info.clamp;
+ switch (info.mode) {
+ case BlendMode::kReplace: {
+ pb->mode = PatchBlendMode::kReplace;
+ break;
+ }
+ case BlendMode::kAdd: {
+ pb->mode = PatchBlendMode::kAdd;
+ break;
+ }
+ case BlendMode::kMul: {
+ pb->mode = PatchBlendMode::kMul;
+ break;
+ }
+ case BlendMode::kBlend: {
+ pb->mode = PatchBlendMode::kBlendAbove;
+ break;
+ }
+ case BlendMode::kAlphaWeightedAdd: {
+ pb->mode = PatchBlendMode::kAlphaWeightedAddAbove;
+ break;
+ }
+ default: {
+ JXL_UNREACHABLE(
+ "Invalid blend mode"); // should have failed to decode
+ }
+ }
+ };
+ make_blending(info_, &blending_info_[0]);
+ for (size_t i = 0; i < ec_info.size(); i++) {
+ make_blending(ec_info[i], &blending_info_[1 + i]);
+ }
+ }
+
+ Status IsInitialized() const override { return initialized_; }
+
+ void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+ size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+ size_t thread_id) const final {
+ JXL_ASSERT(initialized_);
+ const FrameOrigin& frame_origin = frame_header_.frame_origin;
+ ssize_t bg_xpos = frame_origin.x0 + static_cast<ssize_t>(xpos);
+ ssize_t bg_ypos = frame_origin.y0 + static_cast<ssize_t>(ypos);
+ int offset = 0;
+ if (bg_xpos + static_cast<ssize_t>(xsize) <= 0 ||
+ frame_origin.x0 >= static_cast<ssize_t>(image_xsize_) || bg_ypos < 0 ||
+ bg_ypos >= static_cast<ssize_t>(image_ysize_)) {
+ return;
+ }
+ if (bg_xpos < 0) {
+ offset -= bg_xpos;
+ xsize += bg_xpos;
+ bg_xpos = 0;
+ }
+ if (bg_xpos + xsize > image_xsize_) {
+ xsize =
+ std::max<ssize_t>(0, static_cast<ssize_t>(image_xsize_) - bg_xpos);
+ }
+ std::vector<const float*> bg_row_ptrs_(input_rows.size());
+ std::vector<float*> fg_row_ptrs_(input_rows.size());
+ size_t num_c = std::min(input_rows.size(), extra_channel_info_->size() + 3);
+ for (size_t c = 0; c < num_c; ++c) {
+ fg_row_ptrs_[c] = GetInputRow(input_rows, c, 0) + offset;
+ if (c < 3) {
+ bg_row_ptrs_[c] = bg_->xsize() != 0 && bg_->ysize() != 0
+ ? bg_->color().ConstPlaneRow(c, bg_ypos) + bg_xpos
+ : zeroes_.data();
+ } else {
+ const ImageBundle& ec_bg =
+ state_
+ .reference_frames
+ [frame_header_.extra_channel_blending_info[c - 3].source]
+ .frame;
+ bg_row_ptrs_[c] =
+ ec_bg.xsize() != 0 && ec_bg.ysize() != 0
+ ? ec_bg.extra_channels()[c - 3].ConstRow(bg_ypos) + bg_xpos
+ : zeroes_.data();
+ }
+ }
+ PerformBlending(bg_row_ptrs_.data(), fg_row_ptrs_.data(),
+ fg_row_ptrs_.data(), 0, xsize, blending_info_[0],
+ blending_info_.data() + 1, *extra_channel_info_);
+ }
+
+ RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+ return RenderPipelineChannelMode::kInPlace;
+ }
+
+ bool SwitchToImageDimensions() const override { return true; }
+
+ void GetImageDimensions(size_t* xsize, size_t* ysize,
+ FrameOrigin* frame_origin) const override {
+ *xsize = image_xsize_;
+ *ysize = image_ysize_;
+ *frame_origin = frame_header_.frame_origin;
+ }
+
+ void ProcessPaddingRow(const RowInfo& output_rows, size_t xsize, size_t xpos,
+ size_t ypos) const override {
+ if (bg_->xsize() == 0 || bg_->ysize() == 0) {
+ for (size_t c = 0; c < 3; ++c) {
+ memset(GetInputRow(output_rows, c, 0), 0, xsize * sizeof(float));
+ }
+ } else {
+ for (size_t c = 0; c < 3; ++c) {
+ memcpy(GetInputRow(output_rows, c, 0),
+ bg_->color().ConstPlaneRow(c, ypos) + xpos,
+ xsize * sizeof(float));
+ }
+ }
+ for (size_t ec = 0; ec < extra_channel_info_->size(); ++ec) {
+ const ImageBundle& ec_bg =
+ state_
+ .reference_frames[frame_header_.extra_channel_blending_info[ec]
+ .source]
+ .frame;
+ if (ec_bg.xsize() == 0 || ec_bg.ysize() == 0) {
+ memset(GetInputRow(output_rows, 3 + ec, 0), 0, xsize * sizeof(float));
+ } else {
+ memcpy(GetInputRow(output_rows, 3 + ec, 0),
+ ec_bg.extra_channels()[ec].ConstRow(ypos) + xpos,
+ xsize * sizeof(float));
+ }
+ }
+ }
+
+ const char* GetName() const override { return "Blending"; }
+
+ private:
+ const FrameHeader& frame_header_;
+ const PassesSharedState& state_;
+ BlendingInfo info_;
+ const ImageBundle* bg_;
+ Status initialized_ = true;
+ size_t image_xsize_;
+ size_t image_ysize_;
+ std::vector<PatchBlending> blending_info_;
+ const std::vector<ExtraChannelInfo>* extra_channel_info_;
+ std::vector<float> zeroes_;
+};
+
+std::unique_ptr<RenderPipelineStage> GetBlendingStage(
+ const FrameHeader& frame_header, const PassesDecoderState* dec_state,
+ const ColorEncoding& frame_color_encoding) {
+ return jxl::make_unique<BlendingStage>(frame_header, dec_state,
+ frame_color_encoding);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(GetBlendingStage);
+
+std::unique_ptr<RenderPipelineStage> GetBlendingStage(
+ const FrameHeader& frame_header, const PassesDecoderState* dec_state,
+ const ColorEncoding& frame_color_encoding) {
+ return HWY_DYNAMIC_DISPATCH(GetBlendingStage)(frame_header, dec_state,
+ frame_color_encoding);
+}
+
+} // namespace jxl
+#endif
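
To make the clipping arithmetic in BlendingStage::ProcessRow concrete: with frame_origin.x0 = -5, xpos = 0 and xsize = 16, the foreground row pointers are advanced by 5, the row is shortened to 11 samples, and blending starts at background column 0. A simplified standalone rendition of that horizontal clipping follows (a sketch with invented names; it handles only the x direction and folds the degenerate cases into an early return):

#include <cstddef>

// Returns false if the row lies entirely outside the image; otherwise clips
// `xsize` and reports the foreground offset and background start column.
bool ClipRowToImage(ptrdiff_t frame_x0, size_t xpos, size_t image_xsize,
                    size_t* xsize, size_t* fg_offset, size_t* bg_x) {
  ptrdiff_t bg_xpos = frame_x0 + static_cast<ptrdiff_t>(xpos);
  if (bg_xpos + static_cast<ptrdiff_t>(*xsize) <= 0 ||
      bg_xpos >= static_cast<ptrdiff_t>(image_xsize)) {
    return false;
  }
  *fg_offset = 0;
  if (bg_xpos < 0) {
    *fg_offset = static_cast<size_t>(-bg_xpos);
    *xsize -= *fg_offset;
    bg_xpos = 0;
  }
  if (static_cast<size_t>(bg_xpos) + *xsize > image_xsize) {
    *xsize = image_xsize - static_cast<size_t>(bg_xpos);
  }
  *bg_x = static_cast<size_t>(bg_xpos);
  return true;
}
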
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_blending.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_blending.h
new file mode 100644
index 0000000000..aedc8c2e99
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_blending.h
@@ -0,0 +1,25 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_BLENDING_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_BLENDING_H_
+
+#include <memory>
+
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/dec_cache.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+// Applies blending if applicable.
+std::unique_ptr<RenderPipelineStage> GetBlendingStage(
+ const FrameHeader& frame_header, const PassesDecoderState* dec_state,
+ const ColorEncoding& frame_color_encoding);
+
+} // namespace jxl
+
+#endif // LIB_JXL_RENDER_PIPELINE_STAGE_BLENDING_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_chroma_upsampling.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_chroma_upsampling.cc
new file mode 100644
index 0000000000..936fbd3a44
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_chroma_upsampling.cc
@@ -0,0 +1,127 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_chroma_upsampling.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_chroma_upsampling.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/simd_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::MulAdd;
+
+class HorizontalChromaUpsamplingStage : public RenderPipelineStage {
+ public:
+ explicit HorizontalChromaUpsamplingStage(size_t channel)
+ : RenderPipelineStage(RenderPipelineStage::Settings::ShiftX(
+ /*shift=*/1, /*border=*/1)),
+ c_(channel) {}
+
+ void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+ size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+ size_t thread_id) const final {
+ HWY_FULL(float) df;
+ xextra = RoundUpTo(xextra, Lanes(df));
+ auto threefour = Set(df, 0.75f);
+ auto onefour = Set(df, 0.25f);
+ const float* row_in = GetInputRow(input_rows, c_, 0);
+ float* row_out = GetOutputRow(output_rows, c_, 0);
+ for (ssize_t x = -xextra; x < static_cast<ssize_t>(xsize + xextra);
+ x += Lanes(df)) {
+ auto current = Mul(LoadU(df, row_in + x), threefour);
+ auto prev = LoadU(df, row_in + x - 1);
+ auto next = LoadU(df, row_in + x + 1);
+ auto left = MulAdd(onefour, prev, current);
+ auto right = MulAdd(onefour, next, current);
+ StoreInterleaved(df, left, right, row_out + x * 2);
+ }
+ }
+
+ RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+ return c == c_ ? RenderPipelineChannelMode::kInOut
+ : RenderPipelineChannelMode::kIgnored;
+ }
+
+ const char* GetName() const override { return "HChromaUps"; }
+
+ private:
+ size_t c_;
+};
+
+class VerticalChromaUpsamplingStage : public RenderPipelineStage {
+ public:
+ explicit VerticalChromaUpsamplingStage(size_t channel)
+ : RenderPipelineStage(RenderPipelineStage::Settings::ShiftY(
+ /*shift=*/1, /*border=*/1)),
+ c_(channel) {}
+
+ void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+ size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+ size_t thread_id) const final {
+ HWY_FULL(float) df;
+ xextra = RoundUpTo(xextra, Lanes(df));
+ auto threefour = Set(df, 0.75f);
+ auto onefour = Set(df, 0.25f);
+ const float* row_top = GetInputRow(input_rows, c_, -1);
+ const float* row_mid = GetInputRow(input_rows, c_, 0);
+ const float* row_bot = GetInputRow(input_rows, c_, 1);
+ float* row_out0 = GetOutputRow(output_rows, c_, 0);
+ float* row_out1 = GetOutputRow(output_rows, c_, 1);
+ for (ssize_t x = -xextra; x < static_cast<ssize_t>(xsize + xextra);
+ x += Lanes(df)) {
+ auto it = LoadU(df, row_top + x);
+ auto im = LoadU(df, row_mid + x);
+ auto ib = LoadU(df, row_bot + x);
+ auto im_scaled = Mul(im, threefour);
+ Store(MulAdd(it, onefour, im_scaled), df, row_out0 + x);
+ Store(MulAdd(ib, onefour, im_scaled), df, row_out1 + x);
+ }
+ }
+
+ RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+ return c == c_ ? RenderPipelineChannelMode::kInOut
+ : RenderPipelineChannelMode::kIgnored;
+ }
+
+ const char* GetName() const override { return "VChromaUps"; }
+
+ private:
+ size_t c_;
+};
+
+std::unique_ptr<RenderPipelineStage> GetChromaUpsamplingStage(size_t channel,
+ bool horizontal) {
+ if (horizontal) {
+ return jxl::make_unique<HorizontalChromaUpsamplingStage>(channel);
+ } else {
+ return jxl::make_unique<VerticalChromaUpsamplingStage>(channel);
+ }
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(GetChromaUpsamplingStage);
+
+std::unique_ptr<RenderPipelineStage> GetChromaUpsamplingStage(size_t channel,
+ bool horizontal) {
+ return HWY_DYNAMIC_DISPATCH(GetChromaUpsamplingStage)(channel, horizontal);
+}
+
+} // namespace jxl
+#endif
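
Both stages above double the chroma resolution with a two-tap 1/4-3/4 kernel. A scalar rendition of the horizontal case is shown below (illustrative only; it ignores SIMD and substitutes edge repetition for the mirrored borders that the pipeline provides).

#include <cstddef>

// `out` must hold 2 * n samples. Each input sample x[i] produces two outputs:
//   out[2*i]     = 0.25f * x[i-1] + 0.75f * x[i]
//   out[2*i + 1] = 0.75f * x[i]   + 0.25f * x[i+1]
void UpsampleRow2x(const float* in, size_t n, float* out) {
  for (size_t i = 0; i < n; i++) {
    const float prev = in[i > 0 ? i - 1 : 0];
    const float next = in[i + 1 < n ? i + 1 : n - 1];
    out[2 * i] = 0.25f * prev + 0.75f * in[i];
    out[2 * i + 1] = 0.75f * in[i] + 0.25f * next;
  }
}
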
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_chroma_upsampling.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_chroma_upsampling.h
new file mode 100644
index 0000000000..b4d0cbdfd3
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_chroma_upsampling.h
@@ -0,0 +1,26 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_CHROMA_UPSAMPLING_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_CHROMA_UPSAMPLING_H_
+#include <math.h>
+#include <stdint.h>
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/loop_filter.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+// Applies simple upsampling, either horizontal or vertical, to the given
+// channel.
+std::unique_ptr<RenderPipelineStage> GetChromaUpsamplingStage(size_t channel,
+ bool horizontal);
+} // namespace jxl
+
+#endif // LIB_JXL_RENDER_PIPELINE_STAGE_CHROMA_UPSAMPLING_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_cms.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_cms.cc
new file mode 100644
index 0000000000..2465146b47
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_cms.cc
@@ -0,0 +1,134 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_cms.h"
+
+#include <memory>
+
+#include "jxl/cms_interface.h"
+#include "jxl/color_encoding.h"
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_xyb.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_cms.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/dec_xyb-inl.h"
+#include "lib/jxl/sanitizers.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+class CmsStage : public RenderPipelineStage {
+ public:
+ explicit CmsStage(OutputEncodingInfo output_encoding_info)
+ : RenderPipelineStage(RenderPipelineStage::Settings()),
+ output_encoding_info_(std::move(output_encoding_info)) {
+ c_src_ = output_encoding_info_.linear_color_encoding;
+ }
+
+ bool IsNeeded() const {
+ const size_t channels_src = (c_src_.IsCMYK() ? 4 : c_src_.Channels());
+ const size_t channels_dst = output_encoding_info_.color_encoding.Channels();
+ const bool not_mixing_color_and_grey =
+ (channels_src == channels_dst ||
+ (channels_src == 4 && channels_dst == 3));
+ return (output_encoding_info_.cms_set) &&
+ !c_src_.SameColorEncoding(output_encoding_info_.color_encoding) &&
+ not_mixing_color_and_grey;
+ }
+
+ void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+ size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+ size_t thread_id) const final {
+ JXL_ASSERT(xsize == xsize_);
+    // TODO(firsching): handle grey case separately
+ // interleave
+ float* JXL_RESTRICT row0 = GetInputRow(input_rows, 0, 0);
+ float* JXL_RESTRICT row1 = GetInputRow(input_rows, 1, 0);
+ float* JXL_RESTRICT row2 = GetInputRow(input_rows, 2, 0);
+ float* mutable_buf_src = color_space_transform->BufSrc(thread_id);
+
+ for (size_t x = 0; x < xsize; x++) {
+ mutable_buf_src[3 * x + 0] = row0[x];
+ mutable_buf_src[3 * x + 1] = row1[x];
+ mutable_buf_src[3 * x + 2] = row2[x];
+ }
+ const float* buf_src = mutable_buf_src;
+ float* JXL_RESTRICT buf_dst = color_space_transform->BufDst(thread_id);
+ if (!color_space_transform->Run(thread_id, buf_src, buf_dst)) {
+ // TODO(firsching): somehow mark failing here?
+ return;
+ }
+ // de-interleave
+ for (size_t x = 0; x < xsize; x++) {
+ row0[x] = buf_dst[3 * x + 0];
+ row1[x] = buf_dst[3 * x + 1];
+ row2[x] = buf_dst[3 * x + 2];
+ }
+ }
+ RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+ return c < 3 ? RenderPipelineChannelMode::kInPlace
+ : RenderPipelineChannelMode::kIgnored;
+ }
+
+ const char* GetName() const override { return "Cms"; }
+
+ private:
+ OutputEncodingInfo output_encoding_info_;
+ size_t xsize_;
+ std::unique_ptr<jxl::ColorSpaceTransform> color_space_transform;
+ ColorEncoding c_src_;
+
+ void SetInputSizes(
+ const std::vector<std::pair<size_t, size_t>>& input_sizes) override {
+#if JXL_ENABLE_ASSERT
+ JXL_ASSERT(input_sizes.size() >= 3);
+ for (size_t c = 1; c < input_sizes.size(); c++) {
+ JXL_ASSERT(input_sizes[c].first == input_sizes[0].first);
+ JXL_ASSERT(input_sizes[c].second == input_sizes[0].second);
+ }
+#endif
+ xsize_ = input_sizes[0].first;
+ }
+
+ Status PrepareForThreads(size_t num_threads) override {
+ color_space_transform = jxl::make_unique<jxl::ColorSpaceTransform>(
+ output_encoding_info_.color_management_system);
+ JXL_RETURN_IF_ERROR(color_space_transform->Init(
+ c_src_, output_encoding_info_.color_encoding,
+ output_encoding_info_.desired_intensity_target, xsize_, num_threads));
+ return true;
+ }
+};
+
+std::unique_ptr<RenderPipelineStage> GetCmsStage(
+ const OutputEncodingInfo& output_encoding_info) {
+ auto stage = jxl::make_unique<CmsStage>(output_encoding_info);
+ if (!stage->IsNeeded()) return nullptr;
+ return stage;
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(GetCmsStage);
+
+std::unique_ptr<RenderPipelineStage> GetCmsStage(
+ const OutputEncodingInfo& output_encoding_info) {
+ return HWY_DYNAMIC_DISPATCH(GetCmsStage)(output_encoding_info);
+}
+
+} // namespace jxl
+#endif
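
CmsStage::ProcessRow converts the pipeline's planar rows into the interleaved layout consumed by ColorSpaceTransform::Run and then converts back. A stripped-down scalar version of that round trip (a sketch, not part of the patch; `transform` stands in for the per-thread Run call and error handling is reduced to a bool):

#include <cstddef>

template <typename TransformFn>
bool TransformRowPlanar(float* r, float* g, float* b, size_t xsize,
                        float* buf_src, float* buf_dst,
                        const TransformFn& transform) {
  // Planar -> interleaved.
  for (size_t x = 0; x < xsize; x++) {
    buf_src[3 * x + 0] = r[x];
    buf_src[3 * x + 1] = g[x];
    buf_src[3 * x + 2] = b[x];
  }
  if (!transform(buf_src, buf_dst)) return false;
  // Interleaved -> planar, writing back in place.
  for (size_t x = 0; x < xsize; x++) {
    r[x] = buf_dst[3 * x + 0];
    g[x] = buf_dst[3 * x + 1];
    b[x] = buf_dst[3 * x + 2];
  }
  return true;
}
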
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_cms.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_cms.h
new file mode 100644
index 0000000000..23277ae6f7
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_cms.h
@@ -0,0 +1,21 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_CMS_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_CMS_H_
+
+#include <memory>
+
+#include "lib/jxl/dec_xyb.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+std::unique_ptr<RenderPipelineStage> GetCmsStage(
+ const OutputEncodingInfo& output_encoding_info);
+
+} // namespace jxl
+
+#endif // LIB_JXL_RENDER_PIPELINE_STAGE_CMS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_epf.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_epf.cc
new file mode 100644
index 0000000000..5d1a379ede
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_epf.cc
@@ -0,0 +1,526 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_epf.h"
+
+#include "lib/jxl/base/common.h"
+#include "lib/jxl/common.h" // JXL_HIGH_PRECISION
+#include "lib/jxl/epf.h"
+#include "lib/jxl/sanitizers.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_epf.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+// TODO(veluca): In principle, vectors need not be capped if we want to deal
+// with having two different sigma values in a single vector.
+using DF = HWY_CAPPED(float, 8);
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::AbsDiff;
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::Div;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::MulAdd;
+using hwy::HWY_NAMESPACE::Vec;
+using hwy::HWY_NAMESPACE::VFromD;
+using hwy::HWY_NAMESPACE::ZeroIfNegative;
+
+JXL_INLINE Vec<DF> Weight(Vec<DF> sad, Vec<DF> inv_sigma, Vec<DF> thres) {
+ auto v = MulAdd(sad, inv_sigma, Set(DF(), 1.0f));
+ return ZeroIfNegative(v);
+}
+
+// 5x5 plus-shaped kernel with 5 SADs per pixel (3x3 plus-shaped), which makes
+// this a 7x7 filter overall.
+class EPF0Stage : public RenderPipelineStage {
+ public:
+ EPF0Stage(const LoopFilter& lf, const ImageF& sigma)
+ : RenderPipelineStage(RenderPipelineStage::Settings::Symmetric(
+ /*shift=*/0, /*border=*/3)),
+ lf_(lf),
+ sigma_(&sigma) {}
+
+ template <bool aligned>
+ JXL_INLINE void AddPixel(int row, float* JXL_RESTRICT rows[3][7], ssize_t x,
+ Vec<DF> sad, Vec<DF> inv_sigma,
+ Vec<DF>* JXL_RESTRICT X, Vec<DF>* JXL_RESTRICT Y,
+ Vec<DF>* JXL_RESTRICT B,
+ Vec<DF>* JXL_RESTRICT w) const {
+ auto cx = aligned ? Load(DF(), rows[0][3 + row] + x)
+ : LoadU(DF(), rows[0][3 + row] + x);
+ auto cy = aligned ? Load(DF(), rows[1][3 + row] + x)
+ : LoadU(DF(), rows[1][3 + row] + x);
+ auto cb = aligned ? Load(DF(), rows[2][3 + row] + x)
+ : LoadU(DF(), rows[2][3 + row] + x);
+
+ auto weight = Weight(sad, inv_sigma, Set(DF(), lf_.epf_pass1_zeroflush));
+ *w = Add(*w, weight);
+ *X = MulAdd(weight, cx, *X);
+ *Y = MulAdd(weight, cy, *Y);
+ *B = MulAdd(weight, cb, *B);
+ }
+
+ void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+ size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+ size_t thread_id) const final {
+ DF df;
+
+ using V = decltype(Zero(df));
+ V t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, tA, tB;
+ V* sads[12] = {&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7, &t8, &t9, &tA, &tB};
+
+ xextra = RoundUpTo(xextra, Lanes(df));
+ const float* JXL_RESTRICT row_sigma =
+ sigma_->Row(ypos / kBlockDim + kSigmaPadding);
+
+ float sm = lf_.epf_pass0_sigma_scale * 1.65;
+ float bsm = sm * lf_.epf_border_sad_mul;
+
+ HWY_ALIGN float sad_mul_center[kBlockDim] = {bsm, sm, sm, sm,
+ sm, sm, sm, bsm};
+ HWY_ALIGN float sad_mul_border[kBlockDim] = {bsm, bsm, bsm, bsm,
+ bsm, bsm, bsm, bsm};
+ float* JXL_RESTRICT rows[3][7];
+ for (size_t c = 0; c < 3; c++) {
+ for (int i = 0; i < 7; i++) {
+ rows[c][i] = GetInputRow(input_rows, c, i - 3);
+ }
+ }
+
+ const float* sad_mul =
+ (ypos % kBlockDim == 0 || ypos % kBlockDim == kBlockDim - 1)
+ ? sad_mul_border
+ : sad_mul_center;
+
+ for (ssize_t x = -xextra; x < static_cast<ssize_t>(xsize + xextra);
+ x += Lanes(df)) {
+ size_t bx = (x + xpos + kSigmaPadding * kBlockDim) / kBlockDim;
+ size_t ix = (x + xpos) % kBlockDim;
+
+ if (row_sigma[bx] < kMinSigma) {
+ for (size_t c = 0; c < 3; c++) {
+ auto px = Load(df, rows[c][3 + 0] + x);
+ StoreU(px, df, GetOutputRow(output_rows, c, 0) + x);
+ }
+ continue;
+ }
+
+ const auto sm = Load(df, sad_mul + ix);
+ const auto inv_sigma = Mul(Set(df, row_sigma[bx]), sm);
+
+ for (size_t i = 0; i < 12; i++) *sads[i] = Zero(df);
+ constexpr std::array<int, 2> sads_off[12] = {
+ {{-2, 0}}, {{-1, -1}}, {{-1, 0}}, {{-1, 1}}, {{0, -2}}, {{0, -1}},
+ {{0, 1}}, {{0, 2}}, {{1, -1}}, {{1, 0}}, {{1, 1}}, {{2, 0}},
+ };
+
+ // compute sads
+ // TODO(veluca): consider unrolling and optimizing this.
+ for (size_t c = 0; c < 3; c++) {
+ auto scale = Set(df, lf_.epf_channel_scale[c]);
+ for (size_t i = 0; i < 12; i++) {
+ auto sad = Zero(df);
+ constexpr std::array<int, 2> plus_off[] = {
+ {{0, 0}}, {{-1, 0}}, {{0, -1}}, {{1, 0}}, {{0, 1}}};
+ for (size_t j = 0; j < 5; j++) {
+ const auto r11 =
+ LoadU(df, rows[c][3 + plus_off[j][0]] + x + plus_off[j][1]);
+ const auto c11 =
+ LoadU(df, rows[c][3 + sads_off[i][0] + plus_off[j][0]] + x +
+ sads_off[i][1] + plus_off[j][1]);
+ sad = Add(sad, AbsDiff(r11, c11));
+ }
+ *sads[i] = MulAdd(sad, scale, *sads[i]);
+ }
+ }
+ const auto x_cc = Load(df, rows[0][3 + 0] + x);
+ const auto y_cc = Load(df, rows[1][3 + 0] + x);
+ const auto b_cc = Load(df, rows[2][3 + 0] + x);
+
+ auto w = Set(df, 1);
+ auto X = x_cc;
+ auto Y = y_cc;
+ auto B = b_cc;
+
+ for (size_t i = 0; i < 12; i++) {
+ AddPixel</*aligned=*/false>(/*row=*/sads_off[i][0], rows,
+ x + sads_off[i][1], *sads[i], inv_sigma, &X,
+ &Y, &B, &w);
+ }
+#if JXL_HIGH_PRECISION
+ auto inv_w = Div(Set(df, 1.0f), w);
+#else
+ auto inv_w = ApproximateReciprocal(w);
+#endif
+ StoreU(Mul(X, inv_w), df, GetOutputRow(output_rows, 0, 0) + x);
+ StoreU(Mul(Y, inv_w), df, GetOutputRow(output_rows, 1, 0) + x);
+ StoreU(Mul(B, inv_w), df, GetOutputRow(output_rows, 2, 0) + x);
+ }
+ }
+
+ RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+ return c < 3 ? RenderPipelineChannelMode::kInOut
+ : RenderPipelineChannelMode::kIgnored;
+ }
+
+ const char* GetName() const override { return "EPF0"; }
+
+ private:
+ LoopFilter lf_;
+ const ImageF* sigma_;
+};
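
All three EPF stages weight each kernel tap with Weight() above; per lane this reduces to the scalar form below (an illustrative sketch, not part of the patch; the `thres` argument stays unused here, just as in Weight()). The filtered sample is then the weighted average of the taps, with the center pixel always contributing weight 1.

// w = max(0, 1 + sad * inv_sigma): taps for which 1 + sad * inv_sigma drops
// below zero receive weight 0 and are excluded from the average.
float EpfTapWeight(float sad, float inv_sigma) {
  const float v = 1.0f + sad * inv_sigma;
  return v < 0.0f ? 0.0f : v;
}
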
+
+// 3x3 plus-shaped kernel with 5 SADs per pixel (also 3x3 plus-shaped), which
+// makes this a 5x5 filter overall.
+class EPF1Stage : public RenderPipelineStage {
+ public:
+ EPF1Stage(const LoopFilter& lf, const ImageF& sigma)
+ : RenderPipelineStage(RenderPipelineStage::Settings::Symmetric(
+ /*shift=*/0, /*border=*/2)),
+ lf_(lf),
+ sigma_(&sigma) {}
+
+ template <bool aligned>
+ JXL_INLINE void AddPixel(int row, float* JXL_RESTRICT rows[3][5], ssize_t x,
+ Vec<DF> sad, Vec<DF> inv_sigma,
+ Vec<DF>* JXL_RESTRICT X, Vec<DF>* JXL_RESTRICT Y,
+ Vec<DF>* JXL_RESTRICT B,
+ Vec<DF>* JXL_RESTRICT w) const {
+ auto cx = aligned ? Load(DF(), rows[0][2 + row] + x)
+ : LoadU(DF(), rows[0][2 + row] + x);
+ auto cy = aligned ? Load(DF(), rows[1][2 + row] + x)
+ : LoadU(DF(), rows[1][2 + row] + x);
+ auto cb = aligned ? Load(DF(), rows[2][2 + row] + x)
+ : LoadU(DF(), rows[2][2 + row] + x);
+
+ auto weight = Weight(sad, inv_sigma, Set(DF(), lf_.epf_pass1_zeroflush));
+ *w = Add(*w, weight);
+ *X = MulAdd(weight, cx, *X);
+ *Y = MulAdd(weight, cy, *Y);
+ *B = MulAdd(weight, cb, *B);
+ }
+
+ void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+ size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+ size_t thread_id) const final {
+ DF df;
+ xextra = RoundUpTo(xextra, Lanes(df));
+ const float* JXL_RESTRICT row_sigma =
+ sigma_->Row(ypos / kBlockDim + kSigmaPadding);
+
+ float sm = 1.65f;
+ float bsm = sm * lf_.epf_border_sad_mul;
+
+ HWY_ALIGN float sad_mul_center[kBlockDim] = {bsm, sm, sm, sm,
+ sm, sm, sm, bsm};
+ HWY_ALIGN float sad_mul_border[kBlockDim] = {bsm, bsm, bsm, bsm,
+ bsm, bsm, bsm, bsm};
+
+ float* JXL_RESTRICT rows[3][5];
+ for (size_t c = 0; c < 3; c++) {
+ for (int i = 0; i < 5; i++) {
+ rows[c][i] = GetInputRow(input_rows, c, i - 2);
+ }
+ }
+
+ const float* sad_mul =
+ (ypos % kBlockDim == 0 || ypos % kBlockDim == kBlockDim - 1)
+ ? sad_mul_border
+ : sad_mul_center;
+
+ for (ssize_t x = -xextra; x < static_cast<ssize_t>(xsize + xextra);
+ x += Lanes(df)) {
+ size_t bx = (x + xpos + kSigmaPadding * kBlockDim) / kBlockDim;
+ size_t ix = (x + xpos) % kBlockDim;
+
+ if (row_sigma[bx] < kMinSigma) {
+ for (size_t c = 0; c < 3; c++) {
+ auto px = Load(df, rows[c][2 + 0] + x);
+ Store(px, df, GetOutputRow(output_rows, c, 0) + x);
+ }
+ continue;
+ }
+
+ const auto sm = Load(df, sad_mul + ix);
+ const auto inv_sigma = Mul(Set(df, row_sigma[bx]), sm);
+ auto sad0 = Zero(df);
+ auto sad1 = Zero(df);
+ auto sad2 = Zero(df);
+ auto sad3 = Zero(df);
+
+      // Compute the four SADs between the plus-shaped patch centered on the
+      // current pixel and the patches centered on its top, left, right and
+      // bottom neighbors, accumulated over all three channels.
+ for (size_t c = 0; c < 3; c++) {
+        // Pixel naming: pXY is the pixel at column X, row Y, so the center is
+        // p22 and the pixel above it is p21.
+ auto t = Undefined(df);
+
+ const auto p20 = Load(df, rows[c][2 + -2] + x);
+ const auto p21 = Load(df, rows[c][2 + -1] + x);
+ auto sad0c = AbsDiff(p20, p21); // SAD 2, 1
+
+ const auto p11 = LoadU(df, rows[c][2 + -1] + x - 1);
+ auto sad1c = AbsDiff(p11, p21); // SAD 1, 2
+
+ const auto p31 = LoadU(df, rows[c][2 + -1] + x + 1);
+ auto sad2c = AbsDiff(p31, p21); // SAD 3, 2
+
+ const auto p02 = LoadU(df, rows[c][2 + 0] + x - 2);
+ const auto p12 = LoadU(df, rows[c][2 + 0] + x - 1);
+ sad1c = Add(sad1c, AbsDiff(p02, p12)); // SAD 1, 2
+ sad0c = Add(sad0c, AbsDiff(p11, p12)); // SAD 2, 1
+
+ const auto p22 = LoadU(df, rows[c][2 + 0] + x);
+ t = AbsDiff(p12, p22);
+ sad1c = Add(sad1c, t); // SAD 1, 2
+ sad2c = Add(sad2c, t); // SAD 3, 2
+ t = AbsDiff(p22, p21);
+ auto sad3c = t; // SAD 2, 3
+ sad0c = Add(sad0c, t); // SAD 2, 1
+
+ const auto p32 = LoadU(df, rows[c][2 + 0] + x + 1);
+ sad0c = Add(sad0c, AbsDiff(p31, p32)); // SAD 2, 1
+ t = AbsDiff(p22, p32);
+ sad1c = Add(sad1c, t); // SAD 1, 2
+ sad2c = Add(sad2c, t); // SAD 3, 2
+
+ const auto p42 = LoadU(df, rows[c][2 + 0] + x + 2);
+ sad2c = Add(sad2c, AbsDiff(p42, p32)); // SAD 3, 2
+
+ const auto p13 = LoadU(df, rows[c][2 + 1] + x - 1);
+ sad3c = Add(sad3c, AbsDiff(p13, p12)); // SAD 2, 3
+
+ const auto p23 = Load(df, rows[c][2 + 1] + x);
+ t = AbsDiff(p22, p23);
+ sad0c = Add(sad0c, t); // SAD 2, 1
+ sad3c = Add(sad3c, t); // SAD 2, 3
+ sad1c = Add(sad1c, AbsDiff(p13, p23)); // SAD 1, 2
+
+ const auto p33 = LoadU(df, rows[c][2 + 1] + x + 1);
+ sad2c = Add(sad2c, AbsDiff(p33, p23)); // SAD 3, 2
+ sad3c = Add(sad3c, AbsDiff(p33, p32)); // SAD 2, 3
+
+ const auto p24 = Load(df, rows[c][2 + 2] + x);
+ sad3c = Add(sad3c, AbsDiff(p24, p23)); // SAD 2, 3
+
+ auto scale = Set(df, lf_.epf_channel_scale[c]);
+ sad0 = MulAdd(sad0c, scale, sad0);
+ sad1 = MulAdd(sad1c, scale, sad1);
+ sad2 = MulAdd(sad2c, scale, sad2);
+ sad3 = MulAdd(sad3c, scale, sad3);
+ }
+ const auto x_cc = Load(df, rows[0][2 + 0] + x);
+ const auto y_cc = Load(df, rows[1][2 + 0] + x);
+ const auto b_cc = Load(df, rows[2][2 + 0] + x);
+
+ auto w = Set(df, 1);
+ auto X = x_cc;
+ auto Y = y_cc;
+ auto B = b_cc;
+
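+      // Each neighboring plus-shaped patch contributes with a weight that
+      // decreases as its SAD grows relative to the local sigma; the center
+      // pixel always has weight 1.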
+ // Top row
+ AddPixel</*aligned=*/true>(/*row=*/-1, rows, x, sad0, inv_sigma, &X, &Y,
+ &B, &w);
+ // Center
+ AddPixel</*aligned=*/false>(/*row=*/0, rows, x - 1, sad1, inv_sigma, &X,
+ &Y, &B, &w);
+ AddPixel</*aligned=*/false>(/*row=*/0, rows, x + 1, sad2, inv_sigma, &X,
+ &Y, &B, &w);
+ // Bottom
+ AddPixel</*aligned=*/true>(/*row=*/1, rows, x, sad3, inv_sigma, &X, &Y,
+ &B, &w);
+#if JXL_HIGH_PRECISION
+ auto inv_w = Div(Set(df, 1.0f), w);
+#else
+ auto inv_w = ApproximateReciprocal(w);
+#endif
+ Store(Mul(X, inv_w), df, GetOutputRow(output_rows, 0, 0) + x);
+ Store(Mul(Y, inv_w), df, GetOutputRow(output_rows, 1, 0) + x);
+ Store(Mul(B, inv_w), df, GetOutputRow(output_rows, 2, 0) + x);
+ }
+ }
+
+ RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+ return c < 3 ? RenderPipelineChannelMode::kInOut
+ : RenderPipelineChannelMode::kIgnored;
+ }
+
+ const char* GetName() const override { return "EPF1"; }
+
+ private:
+ LoopFilter lf_;
+ const ImageF* sigma_;
+};
+
+// 3x3 plus-shaped kernel with 1 SAD per pixel, so the overall footprint of this
+// filter is 3x3.
+class EPF2Stage : public RenderPipelineStage {
+ public:
+ EPF2Stage(const LoopFilter& lf, const ImageF& sigma)
+ : RenderPipelineStage(RenderPipelineStage::Settings::Symmetric(
+ /*shift=*/0, /*border=*/1)),
+ lf_(lf),
+ sigma_(&sigma) {}
+
+ template <bool aligned>
+ JXL_INLINE void AddPixel(int row, float* JXL_RESTRICT rows[3][3], ssize_t x,
+ Vec<DF> rx, Vec<DF> ry, Vec<DF> rb,
+ Vec<DF> inv_sigma, Vec<DF>* JXL_RESTRICT X,
+ Vec<DF>* JXL_RESTRICT Y, Vec<DF>* JXL_RESTRICT B,
+ Vec<DF>* JXL_RESTRICT w) const {
+ auto cx = aligned ? Load(DF(), rows[0][1 + row] + x)
+ : LoadU(DF(), rows[0][1 + row] + x);
+ auto cy = aligned ? Load(DF(), rows[1][1 + row] + x)
+ : LoadU(DF(), rows[1][1 + row] + x);
+ auto cb = aligned ? Load(DF(), rows[2][1 + row] + x)
+ : LoadU(DF(), rows[2][1 + row] + x);
+
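+    // Pass 2 uses a single per-pixel SAD: the channel-weighted absolute
+    // difference between this pixel and the center pixel (rx, ry, rb).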
+ auto sad = Mul(AbsDiff(cx, rx), Set(DF(), lf_.epf_channel_scale[0]));
+ sad = MulAdd(AbsDiff(cy, ry), Set(DF(), lf_.epf_channel_scale[1]), sad);
+ sad = MulAdd(AbsDiff(cb, rb), Set(DF(), lf_.epf_channel_scale[2]), sad);
+
+ auto weight = Weight(sad, inv_sigma, Set(DF(), lf_.epf_pass2_zeroflush));
+
+ *w = Add(*w, weight);
+ *X = MulAdd(weight, cx, *X);
+ *Y = MulAdd(weight, cy, *Y);
+ *B = MulAdd(weight, cb, *B);
+ }
+
+ void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+ size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+ size_t thread_id) const final {
+ DF df;
+ xextra = RoundUpTo(xextra, Lanes(df));
+ const float* JXL_RESTRICT row_sigma =
+ sigma_->Row(ypos / kBlockDim + kSigmaPadding);
+
+ float sm = lf_.epf_pass2_sigma_scale * 1.65;
+ float bsm = sm * lf_.epf_border_sad_mul;
+
+ HWY_ALIGN float sad_mul_center[kBlockDim] = {bsm, sm, sm, sm,
+ sm, sm, sm, bsm};
+ HWY_ALIGN float sad_mul_border[kBlockDim] = {bsm, bsm, bsm, bsm,
+ bsm, bsm, bsm, bsm};
+
+ float* JXL_RESTRICT rows[3][3];
+ for (size_t c = 0; c < 3; c++) {
+ for (int i = 0; i < 3; i++) {
+ rows[c][i] = GetInputRow(input_rows, c, i - 1);
+ }
+ }
+
+ const float* sad_mul =
+ (ypos % kBlockDim == 0 || ypos % kBlockDim == kBlockDim - 1)
+ ? sad_mul_border
+ : sad_mul_center;
+
+ for (ssize_t x = -xextra; x < static_cast<ssize_t>(xsize + xextra);
+ x += Lanes(df)) {
+ size_t bx = (x + xpos + kSigmaPadding * kBlockDim) / kBlockDim;
+ size_t ix = (x + xpos) % kBlockDim;
+
+ if (row_sigma[bx] < kMinSigma) {
+ for (size_t c = 0; c < 3; c++) {
+ auto px = Load(df, rows[c][1 + 0] + x);
+ Store(px, df, GetOutputRow(output_rows, c, 0) + x);
+ }
+ continue;
+ }
+
+ const auto sm = Load(df, sad_mul + ix);
+ const auto inv_sigma = Mul(Set(df, row_sigma[bx]), sm);
+
+ const auto x_cc = Load(df, rows[0][1 + 0] + x);
+ const auto y_cc = Load(df, rows[1][1 + 0] + x);
+ const auto b_cc = Load(df, rows[2][1 + 0] + x);
+
+ auto w = Set(df, 1);
+ auto X = x_cc;
+ auto Y = y_cc;
+ auto B = b_cc;
+
+ // Top row
+ AddPixel</*aligned=*/true>(/*row=*/-1, rows, x, x_cc, y_cc, b_cc,
+ inv_sigma, &X, &Y, &B, &w);
+ // Center
+ AddPixel</*aligned=*/false>(/*row=*/0, rows, x - 1, x_cc, y_cc, b_cc,
+ inv_sigma, &X, &Y, &B, &w);
+ AddPixel</*aligned=*/false>(/*row=*/0, rows, x + 1, x_cc, y_cc, b_cc,
+ inv_sigma, &X, &Y, &B, &w);
+ // Bottom
+ AddPixel</*aligned=*/true>(/*row=*/1, rows, x, x_cc, y_cc, b_cc,
+ inv_sigma, &X, &Y, &B, &w);
+#if JXL_HIGH_PRECISION
+ auto inv_w = Div(Set(df, 1.0f), w);
+#else
+ auto inv_w = ApproximateReciprocal(w);
+#endif
+ Store(Mul(X, inv_w), df, GetOutputRow(output_rows, 0, 0) + x);
+ Store(Mul(Y, inv_w), df, GetOutputRow(output_rows, 1, 0) + x);
+ Store(Mul(B, inv_w), df, GetOutputRow(output_rows, 2, 0) + x);
+ }
+ }
+
+ RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+ return c < 3 ? RenderPipelineChannelMode::kInOut
+ : RenderPipelineChannelMode::kIgnored;
+ }
+
+ const char* GetName() const override { return "EPF2"; }
+
+ private:
+ LoopFilter lf_;
+ const ImageF* sigma_;
+};
+
+std::unique_ptr<RenderPipelineStage> GetEPFStage0(const LoopFilter& lf,
+ const ImageF& sigma) {
+ return jxl::make_unique<EPF0Stage>(lf, sigma);
+}
+
+std::unique_ptr<RenderPipelineStage> GetEPFStage1(const LoopFilter& lf,
+ const ImageF& sigma) {
+ return jxl::make_unique<EPF1Stage>(lf, sigma);
+}
+
+std::unique_ptr<RenderPipelineStage> GetEPFStage2(const LoopFilter& lf,
+ const ImageF& sigma) {
+ return jxl::make_unique<EPF2Stage>(lf, sigma);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(GetEPFStage0);
+HWY_EXPORT(GetEPFStage1);
+HWY_EXPORT(GetEPFStage2);
+
+std::unique_ptr<RenderPipelineStage> GetEPFStage(const LoopFilter& lf,
+ const ImageF& sigma,
+ size_t epf_stage) {
+ JXL_ASSERT(lf.epf_iters != 0);
+ switch (epf_stage) {
+ case 0:
+ return HWY_DYNAMIC_DISPATCH(GetEPFStage0)(lf, sigma);
+ case 1:
+ return HWY_DYNAMIC_DISPATCH(GetEPFStage1)(lf, sigma);
+ case 2:
+ return HWY_DYNAMIC_DISPATCH(GetEPFStage2)(lf, sigma);
+ default:
+ JXL_UNREACHABLE("Invalid EPF stage");
+ }
+}
+
+} // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_epf.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_epf.h
new file mode 100644
index 0000000000..c9d0d0c785
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_epf.h
@@ -0,0 +1,31 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_EPF_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_EPF_H_
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/image.h"
+#include "lib/jxl/loop_filter.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+// Applies the `epf_stage`-th EPF step with the given settings and `sigma`.
+// `sigma` will be accessed with an offset of (kSigmaPadding, kSigmaPadding),
+// and should have (kSigmaBorder, kSigmaBorder) mirrored sigma values available
+// around the main image. See also filters.(h|cc)
+std::unique_ptr<RenderPipelineStage> GetEPFStage(const LoopFilter& lf,
+ const ImageF& sigma,
+ size_t epf_stage);
+} // namespace jxl
+
+#endif // LIB_JXL_RENDER_PIPELINE_STAGE_EPF_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_from_linear.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_from_linear.cc
new file mode 100644
index 0000000000..6b1f646cd5
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_from_linear.cc
@@ -0,0 +1,194 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_from_linear.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_from_linear.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/cms/tone_mapping-inl.h"
+#include "lib/jxl/cms/transfer_functions-inl.h"
+#include "lib/jxl/common.h" // JXL_HIGH_PRECISION
+#include "lib/jxl/sanitizers.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+namespace {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::IfThenZeroElse;
+
+template <typename Op>
+struct PerChannelOp {
+ explicit PerChannelOp(Op op) : op(op) {}
+ template <typename D, typename T>
+ void Transform(D d, T* r, T* g, T* b) const {
+ *r = op.Transform(d, *r);
+ *g = op.Transform(d, *g);
+ *b = op.Transform(d, *b);
+ }
+
+ Op op;
+};
+template <typename Op>
+PerChannelOp<Op> MakePerChannelOp(Op&& op) {
+ return PerChannelOp<Op>(std::forward<Op>(op));
+}
+
+struct OpLinear {
+ template <typename D, typename T>
+ T Transform(D d, const T& linear) const {
+ return linear;
+ }
+};
+
+struct OpRgb {
+ template <typename D, typename T>
+ T Transform(D d, const T& linear) const {
+#if JXL_HIGH_PRECISION
+ return TF_SRGB().EncodedFromDisplay(d, linear);
+#else
+ return FastLinearToSRGB(d, linear);
+#endif
+ }
+};
+
+struct OpPq {
+ explicit OpPq(const float intensity_target) : tf_pq_(intensity_target) {}
+ template <typename D, typename T>
+ T Transform(D d, const T& linear) const {
+ return tf_pq_.EncodedFromDisplay(d, linear);
+ }
+ TF_PQ tf_pq_;
+};
+
+struct OpHlg {
+ explicit OpHlg(const float luminances[3], const float intensity_target)
+ : hlg_ootf_(HlgOOTF::ToSceneLight(/*display_luminance=*/intensity_target,
+ luminances)) {}
+
+ template <typename D, typename T>
+ void Transform(D d, T* r, T* g, T* b) const {
+ hlg_ootf_.Apply(r, g, b);
+ *r = TF_HLG().EncodedFromDisplay(d, *r);
+ *g = TF_HLG().EncodedFromDisplay(d, *g);
+ *b = TF_HLG().EncodedFromDisplay(d, *b);
+ }
+ HlgOOTF hlg_ootf_;
+};
+
+struct Op709 {
+ template <typename D, typename T>
+ T Transform(D d, const T& linear) const {
+ return TF_709().EncodedFromDisplay(d, linear);
+ }
+};
+
+struct OpGamma {
+ const float inverse_gamma;
+ template <typename D, typename T>
+ T Transform(D d, const T& linear) const {
+ return IfThenZeroElse(Le(linear, Set(d, 1e-5f)),
+ FastPowf(d, linear, Set(d, inverse_gamma)));
+ }
+};
+
+template <typename Op>
+class FromLinearStage : public RenderPipelineStage {
+ public:
+ explicit FromLinearStage(Op op)
+ : RenderPipelineStage(RenderPipelineStage::Settings()),
+ op_(std::move(op)) {}
+
+ void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+ size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+ size_t thread_id) const final {
+ const HWY_FULL(float) d;
+ const size_t xsize_v = RoundUpTo(xsize, Lanes(d));
+ float* JXL_RESTRICT row0 = GetInputRow(input_rows, 0, 0);
+ float* JXL_RESTRICT row1 = GetInputRow(input_rows, 1, 0);
+ float* JXL_RESTRICT row2 = GetInputRow(input_rows, 2, 0);
+    // All calculations are lane-wise, but some might require value-dependent
+    // behaviour (e.g. NearestInt). Temporarily unpoison the last vector
+    // tail.
+ msan::UnpoisonMemory(row0 + xsize, sizeof(float) * (xsize_v - xsize));
+ msan::UnpoisonMemory(row1 + xsize, sizeof(float) * (xsize_v - xsize));
+ msan::UnpoisonMemory(row2 + xsize, sizeof(float) * (xsize_v - xsize));
+ for (ssize_t x = -xextra; x < (ssize_t)(xsize + xextra); x += Lanes(d)) {
+ auto r = LoadU(d, row0 + x);
+ auto g = LoadU(d, row1 + x);
+ auto b = LoadU(d, row2 + x);
+ op_.Transform(d, &r, &g, &b);
+ StoreU(r, d, row0 + x);
+ StoreU(g, d, row1 + x);
+ StoreU(b, d, row2 + x);
+ }
+ msan::PoisonMemory(row0 + xsize, sizeof(float) * (xsize_v - xsize));
+ msan::PoisonMemory(row1 + xsize, sizeof(float) * (xsize_v - xsize));
+ msan::PoisonMemory(row2 + xsize, sizeof(float) * (xsize_v - xsize));
+ }
+
+ RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+ return c < 3 ? RenderPipelineChannelMode::kInPlace
+ : RenderPipelineChannelMode::kIgnored;
+ }
+
+ const char* GetName() const override { return "FromLinear"; }
+
+ private:
+ Op op_;
+};
+
+template <typename Op>
+std::unique_ptr<FromLinearStage<Op>> MakeFromLinearStage(Op&& op) {
+ return jxl::make_unique<FromLinearStage<Op>>(std::forward<Op>(op));
+}
+
+std::unique_ptr<RenderPipelineStage> GetFromLinearStage(
+ const OutputEncodingInfo& output_encoding_info) {
+ const auto& tf = output_encoding_info.color_encoding.Tf();
+ if (tf.IsLinear()) {
+ return MakeFromLinearStage(MakePerChannelOp(OpLinear()));
+ } else if (tf.IsSRGB()) {
+ return MakeFromLinearStage(MakePerChannelOp(OpRgb()));
+ } else if (tf.IsPQ()) {
+ return MakeFromLinearStage(
+ MakePerChannelOp(OpPq(output_encoding_info.orig_intensity_target)));
+ } else if (tf.IsHLG()) {
+ return MakeFromLinearStage(
+ OpHlg(output_encoding_info.luminances,
+ output_encoding_info.desired_intensity_target));
+ } else if (tf.Is709()) {
+ return MakeFromLinearStage(MakePerChannelOp(Op709()));
+ } else if (tf.have_gamma || tf.IsDCI()) {
+ return MakeFromLinearStage(
+ MakePerChannelOp(OpGamma{output_encoding_info.inverse_gamma}));
+ } else {
+ // This is a programming error.
+ JXL_UNREACHABLE("Invalid target encoding");
+ }
+}
+
+} // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(GetFromLinearStage);
+
+std::unique_ptr<RenderPipelineStage> GetFromLinearStage(
+ const OutputEncodingInfo& output_encoding_info) {
+ return HWY_DYNAMIC_DISPATCH(GetFromLinearStage)(output_encoding_info);
+}
+
+} // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_from_linear.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_from_linear.h
new file mode 100644
index 0000000000..548ab50b8c
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_from_linear.h
@@ -0,0 +1,20 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_FROM_LINEAR_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_FROM_LINEAR_H_
+
+#include "lib/jxl/dec_xyb.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+// Converts the color channels from linear to the specified output encoding.
+std::unique_ptr<RenderPipelineStage> GetFromLinearStage(
+ const OutputEncodingInfo& output_encoding_info);
+
+} // namespace jxl
+
+#endif // LIB_JXL_RENDER_PIPELINE_STAGE_FROM_LINEAR_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_gaborish.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_gaborish.cc
new file mode 100644
index 0000000000..0917db3f9a
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_gaborish.cc
@@ -0,0 +1,120 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_gaborish.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_gaborish.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::MulAdd;
+
+class GaborishStage : public RenderPipelineStage {
+ public:
+ explicit GaborishStage(const LoopFilter& lf)
+ : RenderPipelineStage(RenderPipelineStage::Settings::Symmetric(
+ /*shift=*/0, /*border=*/1)) {
+ weights_[0] = 1;
+ weights_[1] = lf.gab_x_weight1;
+ weights_[2] = lf.gab_x_weight2;
+ weights_[3] = 1;
+ weights_[4] = lf.gab_y_weight1;
+ weights_[5] = lf.gab_y_weight2;
+ weights_[6] = 1;
+ weights_[7] = lf.gab_b_weight1;
+ weights_[8] = lf.gab_b_weight2;
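+    // weights_[3 * c + {0, 1, 2}] are the center, edge-neighbor and diagonal
+    // coefficients of the 3x3 kernel for channel c; after the normalization
+    // below each kernel sums to 1.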
+ // Normalize
+ for (size_t c = 0; c < 3; c++) {
+ const float div =
+ weights_[3 * c] + 4 * (weights_[3 * c + 1] + weights_[3 * c + 2]);
+ const float mul = 1.0f / div;
+ weights_[3 * c] *= mul;
+ weights_[3 * c + 1] *= mul;
+ weights_[3 * c + 2] *= mul;
+ }
+ }
+
+ void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+ size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+ size_t thread_id) const final {
+ const HWY_FULL(float) d;
+ for (size_t c = 0; c < 3; c++) {
+ float* JXL_RESTRICT row_t = GetInputRow(input_rows, c, -1);
+ float* JXL_RESTRICT row_m = GetInputRow(input_rows, c, 0);
+ float* JXL_RESTRICT row_b = GetInputRow(input_rows, c, 1);
+ float* JXL_RESTRICT row_out = GetOutputRow(output_rows, c, 0);
+ const auto w0 = Set(d, weights_[3 * c + 0]);
+ const auto w1 = Set(d, weights_[3 * c + 1]);
+ const auto w2 = Set(d, weights_[3 * c + 2]);
+// Group data need only be aligned to a block; for >=512 bit vectors, this may
+// result in unaligned loads.
+#if HWY_CAP_GE512
+#define LoadMaybeU LoadU
+#else
+#define LoadMaybeU Load
+#endif
+ // Since GetInputRow(input_rows, c, {-1, 0, 1}) is aligned, rounding
+ // xextra up to Lanes(d) doesn't access anything problematic.
+ for (ssize_t x = -RoundUpTo(xextra, Lanes(d));
+ x < (ssize_t)(xsize + xextra); x += Lanes(d)) {
+ const auto t = LoadMaybeU(d, row_t + x);
+ const auto tl = LoadU(d, row_t + x - 1);
+ const auto tr = LoadU(d, row_t + x + 1);
+ const auto m = LoadMaybeU(d, row_m + x);
+ const auto l = LoadU(d, row_m + x - 1);
+ const auto r = LoadU(d, row_m + x + 1);
+ const auto b = LoadMaybeU(d, row_b + x);
+ const auto bl = LoadU(d, row_b + x - 1);
+ const auto br = LoadU(d, row_b + x + 1);
+ const auto sum0 = m;
+ const auto sum1 = Add(Add(l, r), Add(t, b));
+ const auto sum2 = Add(Add(tl, tr), Add(bl, br));
+ auto pixels = MulAdd(sum2, w2, MulAdd(sum1, w1, Mul(sum0, w0)));
+ Store(pixels, d, row_out + x);
+ }
+ }
+ }
+#undef LoadMaybeU
+
+ RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+ return c < 3 ? RenderPipelineChannelMode::kInOut
+ : RenderPipelineChannelMode::kIgnored;
+ }
+
+ const char* GetName() const override { return "Gab"; }
+
+ private:
+ float weights_[9];
+};
+
+std::unique_ptr<RenderPipelineStage> GetGaborishStage(const LoopFilter& lf) {
+ return jxl::make_unique<GaborishStage>(lf);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(GetGaborishStage);
+
+std::unique_ptr<RenderPipelineStage> GetGaborishStage(const LoopFilter& lf) {
+ JXL_ASSERT(lf.gab == 1);
+ return HWY_DYNAMIC_DISPATCH(GetGaborishStage)(lf);
+}
+
+} // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_gaborish.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_gaborish.h
new file mode 100644
index 0000000000..55166e3ed8
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_gaborish.h
@@ -0,0 +1,24 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_GABORISH_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_GABORISH_H_
+#include <math.h>
+#include <stdint.h>
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/loop_filter.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+// Applies decoder-side Gaborish with the given settings. `lf.gab` must be 1.
+std::unique_ptr<RenderPipelineStage> GetGaborishStage(const LoopFilter& lf);
+} // namespace jxl
+
+#endif // LIB_JXL_RENDER_PIPELINE_STAGE_GABORISH_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_noise.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_noise.cc
new file mode 100644
index 0000000000..5cf8a6ed51
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_noise.cc
@@ -0,0 +1,316 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_noise.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_noise.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/sanitizers.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::And;
+using hwy::HWY_NAMESPACE::Floor;
+using hwy::HWY_NAMESPACE::Ge;
+using hwy::HWY_NAMESPACE::IfThenElse;
+using hwy::HWY_NAMESPACE::Max;
+using hwy::HWY_NAMESPACE::Min;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::MulAdd;
+using hwy::HWY_NAMESPACE::Or;
+using hwy::HWY_NAMESPACE::Sub;
+using hwy::HWY_NAMESPACE::TableLookupBytes;
+using hwy::HWY_NAMESPACE::Vec;
+using hwy::HWY_NAMESPACE::ZeroIfNegative;
+
+using D = HWY_CAPPED(float, kBlockDim);
+using DI = hwy::HWY_NAMESPACE::Rebind<int32_t, D>;
+using DI8 = hwy::HWY_NAMESPACE::Repartition<uint8_t, D>;
+
+// [0, max_value]
+template <class D, class V>
+static HWY_INLINE V Clamp0ToMax(D d, const V x, const V max_value) {
+ const auto clamped = Min(x, max_value);
+ return ZeroIfNegative(clamped);
+}
+
+// x is in [0+delta, 1+delta], delta ~= 0.06
+template <class StrengthEval>
+typename StrengthEval::V NoiseStrength(const StrengthEval& eval,
+ const typename StrengthEval::V x) {
+ return Clamp0ToMax(D(), eval(x), Set(D(), 1.0f));
+}
+
+// TODO(veluca): SIMD-fy.
+class StrengthEvalLut {
+ public:
+ using V = Vec<D>;
+
+ explicit StrengthEvalLut(const NoiseParams& noise_params)
+#if HWY_TARGET == HWY_SCALAR
+ : noise_params_(noise_params)
+#endif
+ {
+#if HWY_TARGET != HWY_SCALAR
+ uint32_t lut[8];
+ memcpy(lut, noise_params.lut, sizeof(lut));
+ for (size_t i = 0; i < 8; i++) {
+ low16_lut[2 * i] = (lut[i] >> 0) & 0xFF;
+ low16_lut[2 * i + 1] = (lut[i] >> 8) & 0xFF;
+ high16_lut[2 * i] = (lut[i] >> 16) & 0xFF;
+ high16_lut[2 * i + 1] = (lut[i] >> 24) & 0xFF;
+ }
+#endif
+ }
+
+ V operator()(const V vx) const {
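+    // Piecewise-linear interpolation of the noise LUT: clamp and scale the
+    // input, pick the two neighboring LUT entries and blend by the fractional
+    // part.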
+ constexpr size_t kScale = NoiseParams::kNumNoisePoints - 2;
+ auto scaled_vx = Max(Zero(D()), Mul(vx, Set(D(), kScale)));
+ auto floor_x = Floor(scaled_vx);
+ auto frac_x = Sub(scaled_vx, floor_x);
+ floor_x = IfThenElse(Ge(scaled_vx, Set(D(), kScale + 1)), Set(D(), kScale),
+ floor_x);
+ frac_x =
+ IfThenElse(Ge(scaled_vx, Set(D(), kScale + 1)), Set(D(), 1), frac_x);
+ auto floor_x_int = ConvertTo(DI(), floor_x);
+#if HWY_TARGET == HWY_SCALAR
+ auto low = Set(D(), noise_params_.lut[floor_x_int.raw]);
+ auto hi = Set(D(), noise_params_.lut[floor_x_int.raw + 1]);
+#else
+ // Set each lane's bytes to {0, 0, 2x+1, 2x}.
+ auto floorx_indices_low =
+ Add(Mul(floor_x_int, Set(DI(), 0x0202)), Set(DI(), 0x0100));
+ // Set each lane's bytes to {2x+1, 2x, 0, 0}.
+ auto floorx_indices_hi =
+ Add(Mul(floor_x_int, Set(DI(), 0x02020000)), Set(DI(), 0x01000000));
+ // load LUT
+ auto low16 = BitCast(DI(), LoadDup128(DI8(), low16_lut));
+ auto lowm = Set(DI(), 0xFFFF);
+ auto hi16 = BitCast(DI(), LoadDup128(DI8(), high16_lut));
+ auto him = Set(DI(), 0xFFFF0000);
+ // low = noise_params.lut[floor_x]
+ auto low =
+ BitCast(D(), Or(And(TableLookupBytes(low16, floorx_indices_low), lowm),
+ And(TableLookupBytes(hi16, floorx_indices_hi), him)));
+ // hi = noise_params.lut[floor_x+1]
+ floorx_indices_low = Add(floorx_indices_low, Set(DI(), 0x0202));
+ floorx_indices_hi = Add(floorx_indices_hi, Set(DI(), 0x02020000));
+ auto hi =
+ BitCast(D(), Or(And(TableLookupBytes(low16, floorx_indices_low), lowm),
+ And(TableLookupBytes(hi16, floorx_indices_hi), him)));
+#endif
+ return MulAdd(Sub(hi, low), frac_x, low);
+ }
+
+ private:
+#if HWY_TARGET != HWY_SCALAR
+ // noise_params.lut transformed into two 16-bit lookup tables.
+ HWY_ALIGN uint8_t high16_lut[16];
+ HWY_ALIGN uint8_t low16_lut[16];
+#else
+ const NoiseParams& noise_params_;
+#endif
+};
+
+template <class D>
+void AddNoiseToRGB(const D d, const Vec<D> rnd_noise_r,
+ const Vec<D> rnd_noise_g, const Vec<D> rnd_noise_cor,
+ const Vec<D> noise_strength_g, const Vec<D> noise_strength_r,
+ float ytox, float ytob, float* JXL_RESTRICT out_x,
+ float* JXL_RESTRICT out_y, float* JXL_RESTRICT out_b) {
+ const auto kRGCorr = Set(d, 0.9921875f); // 127/128
+ const auto kRGNCorr = Set(d, 0.0078125f); // 1/128
+
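+  // Red and green noise are mostly the shared correlated component (127/128)
+  // plus a small independent part (1/128), each scaled by the per-channel
+  // noise strength.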
+ const auto red_noise =
+ Mul(noise_strength_r,
+ MulAdd(kRGNCorr, rnd_noise_r, Mul(kRGCorr, rnd_noise_cor)));
+ const auto green_noise =
+ Mul(noise_strength_g,
+ MulAdd(kRGNCorr, rnd_noise_g, Mul(kRGCorr, rnd_noise_cor)));
+
+ auto vx = LoadU(d, out_x);
+ auto vy = LoadU(d, out_y);
+ auto vb = LoadU(d, out_b);
+
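+  // The noise is applied in XYB: Y receives the summed red+green noise, X the
+  // red-green difference, and X/B additionally get the chroma-from-luma
+  // contribution scaled by ytox / ytob.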
+ const auto rg_noise = Add(red_noise, green_noise);
+ vx = Add(MulAdd(Set(d, ytox), rg_noise, Sub(red_noise, green_noise)), vx);
+ vy = Add(vy, rg_noise);
+ vb = MulAdd(Set(d, ytob), rg_noise, vb);
+
+ StoreU(vx, d, out_x);
+ StoreU(vy, d, out_y);
+ StoreU(vb, d, out_b);
+}
+
+class AddNoiseStage : public RenderPipelineStage {
+ public:
+ AddNoiseStage(const NoiseParams& noise_params,
+ const ColorCorrelationMap& cmap, size_t first_c)
+ : RenderPipelineStage(RenderPipelineStage::Settings::Symmetric(
+ /*shift=*/0, /*border=*/0)),
+ noise_params_(noise_params),
+ cmap_(cmap),
+ first_c_(first_c) {}
+
+ void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+ size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+ size_t thread_id) const final {
+ if (!noise_params_.HasAny()) return;
+ const StrengthEvalLut noise_model(noise_params_);
+ D d;
+ const auto half = Set(d, 0.5f);
+
+ // With the prior subtract-random Laplacian approximation, rnd_* ranges were
+ // about [-1.5, 1.6]; Laplacian3 about doubles this to [-3.6, 3.6], so the
+ // normalizer is half of what it was before (0.5).
+ const auto norm_const = Set(d, 0.22f);
+
+ float ytox = cmap_.YtoXRatio(0);
+ float ytob = cmap_.YtoBRatio(0);
+
+ const size_t xsize_v = RoundUpTo(xsize, Lanes(d));
+
+ float* JXL_RESTRICT row_x = GetInputRow(input_rows, 0, 0);
+ float* JXL_RESTRICT row_y = GetInputRow(input_rows, 1, 0);
+ float* JXL_RESTRICT row_b = GetInputRow(input_rows, 2, 0);
+ const float* JXL_RESTRICT row_rnd_r =
+ GetInputRow(input_rows, first_c_ + 0, 0);
+ const float* JXL_RESTRICT row_rnd_g =
+ GetInputRow(input_rows, first_c_ + 1, 0);
+ const float* JXL_RESTRICT row_rnd_c =
+ GetInputRow(input_rows, first_c_ + 2, 0);
+ // Needed by the calls to Floor() in StrengthEvalLut. Only arithmetic and
+ // shuffles are otherwise done on the data, so this is safe.
+ msan::UnpoisonMemory(row_x + xsize, (xsize_v - xsize) * sizeof(float));
+ msan::UnpoisonMemory(row_y + xsize, (xsize_v - xsize) * sizeof(float));
+ for (size_t x = 0; x < xsize_v; x += Lanes(d)) {
+ const auto vx = LoadU(d, row_x + x);
+ const auto vy = LoadU(d, row_y + x);
+ const auto in_g = Sub(vy, vx);
+ const auto in_r = Add(vy, vx);
+ const auto noise_strength_g = NoiseStrength(noise_model, Mul(in_g, half));
+ const auto noise_strength_r = NoiseStrength(noise_model, Mul(in_r, half));
+ const auto addit_rnd_noise_red = Mul(LoadU(d, row_rnd_r + x), norm_const);
+ const auto addit_rnd_noise_green =
+ Mul(LoadU(d, row_rnd_g + x), norm_const);
+ const auto addit_rnd_noise_correlated =
+ Mul(LoadU(d, row_rnd_c + x), norm_const);
+ AddNoiseToRGB(D(), addit_rnd_noise_red, addit_rnd_noise_green,
+ addit_rnd_noise_correlated, noise_strength_g,
+ noise_strength_r, ytox, ytob, row_x + x, row_y + x,
+ row_b + x);
+ }
+ msan::PoisonMemory(row_x + xsize, (xsize_v - xsize) * sizeof(float));
+ msan::PoisonMemory(row_y + xsize, (xsize_v - xsize) * sizeof(float));
+ msan::PoisonMemory(row_b + xsize, (xsize_v - xsize) * sizeof(float));
+ }
+
+ RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+ return c >= first_c_ ? RenderPipelineChannelMode::kInput
+ : c < 3 ? RenderPipelineChannelMode::kInPlace
+ : RenderPipelineChannelMode::kIgnored;
+ }
+
+ const char* GetName() const override { return "AddNoise"; }
+
+ private:
+ const NoiseParams& noise_params_;
+ const ColorCorrelationMap& cmap_;
+ size_t first_c_;
+};
+
+std::unique_ptr<RenderPipelineStage> GetAddNoiseStage(
+ const NoiseParams& noise_params, const ColorCorrelationMap& cmap,
+ size_t noise_c_start) {
+ return jxl::make_unique<AddNoiseStage>(noise_params, cmap, noise_c_start);
+}
+
+class ConvolveNoiseStage : public RenderPipelineStage {
+ public:
+ explicit ConvolveNoiseStage(size_t first_c)
+ : RenderPipelineStage(RenderPipelineStage::Settings::Symmetric(
+ /*shift=*/0, /*border=*/2)),
+ first_c_(first_c) {}
+
+ void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+ size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+ size_t thread_id) const final {
+ const HWY_FULL(float) d;
+ for (size_t c = first_c_; c < first_c_ + 3; c++) {
+ float* JXL_RESTRICT rows[5];
+ for (size_t i = 0; i < 5; i++) {
+ rows[i] = GetInputRow(input_rows, c, i - 2);
+ }
+ float* JXL_RESTRICT row_out = GetOutputRow(output_rows, c, 0);
+ for (ssize_t x = -RoundUpTo(xextra, Lanes(d));
+ x < (ssize_t)(xsize + xextra); x += Lanes(d)) {
+ const auto p00 = LoadU(d, rows[2] + x);
+ auto others = Zero(d);
+ // TODO(eustas): sum loaded values to reduce the calculation chain
+ for (ssize_t i = -2; i <= 2; i++) {
+ others = Add(others, LoadU(d, rows[0] + x + i));
+ others = Add(others, LoadU(d, rows[1] + x + i));
+ others = Add(others, LoadU(d, rows[3] + x + i));
+ others = Add(others, LoadU(d, rows[4] + x + i));
+ }
+ others = Add(others, LoadU(d, rows[2] + x - 2));
+ others = Add(others, LoadU(d, rows[2] + x - 1));
+ others = Add(others, LoadU(d, rows[2] + x + 1));
+ others = Add(others, LoadU(d, rows[2] + x + 2));
+ // 4 * (1 - box kernel)
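+          // The 25 coefficients sum to zero (24 * 0.16 = 3.84), so this is a
+          // zero-mean kernel; for the symmetric noise input the overall sign
+          // does not matter.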
+ auto pixels = MulAdd(others, Set(d, 0.16), Mul(p00, Set(d, -3.84)));
+ StoreU(pixels, d, row_out + x);
+ }
+ }
+ }
+
+ RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+ return c >= first_c_ ? RenderPipelineChannelMode::kInOut
+ : RenderPipelineChannelMode::kIgnored;
+ }
+
+ const char* GetName() const override { return "ConvNoise"; }
+
+ private:
+ size_t first_c_;
+};
+
+std::unique_ptr<RenderPipelineStage> GetConvolveNoiseStage(
+ size_t noise_c_start) {
+ return jxl::make_unique<ConvolveNoiseStage>(noise_c_start);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(GetAddNoiseStage);
+HWY_EXPORT(GetConvolveNoiseStage);
+
+std::unique_ptr<RenderPipelineStage> GetAddNoiseStage(
+ const NoiseParams& noise_params, const ColorCorrelationMap& cmap,
+ size_t noise_c_start) {
+ return HWY_DYNAMIC_DISPATCH(GetAddNoiseStage)(noise_params, cmap,
+ noise_c_start);
+}
+
+std::unique_ptr<RenderPipelineStage> GetConvolveNoiseStage(
+ size_t noise_c_start) {
+ return HWY_DYNAMIC_DISPATCH(GetConvolveNoiseStage)(noise_c_start);
+}
+
+} // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_noise.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_noise.h
new file mode 100644
index 0000000000..bd7797f991
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_noise.h
@@ -0,0 +1,32 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_NOISE_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_NOISE_H_
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/dec_noise.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+// Adds noise to color channels.
+std::unique_ptr<RenderPipelineStage> GetAddNoiseStage(
+ const NoiseParams& noise_params, const ColorCorrelationMap& cmap,
+ size_t noise_c_start);
+
+// Applies a 5x5 subtract-box-filter convolution to the noise input channels.
+std::unique_ptr<RenderPipelineStage> GetConvolveNoiseStage(
+ size_t noise_c_start);
+
+} // namespace jxl
+
+#endif // LIB_JXL_RENDER_PIPELINE_STAGE_NOISE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_patches.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_patches.cc
new file mode 100644
index 0000000000..c5a75b09f7
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_patches.cc
@@ -0,0 +1,47 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_patches.h"
+
+namespace jxl {
+namespace {
+class PatchDictionaryStage : public RenderPipelineStage {
+ public:
+ PatchDictionaryStage(const PatchDictionary* patches, size_t num_channels)
+ : RenderPipelineStage(RenderPipelineStage::Settings()),
+ patches_(*patches),
+ num_channels_(num_channels) {}
+
+ void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+ size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+ size_t thread_id) const final {
+ JXL_ASSERT(xpos == 0 || xpos >= xextra);
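+    // Extend the processed span by xextra pixels on each side, except on the
+    // left at the image border (xpos == 0).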
+ size_t x0 = xpos ? xpos - xextra : 0;
+ std::vector<float*> row_ptrs(num_channels_);
+ for (size_t i = 0; i < num_channels_; i++) {
+ row_ptrs[i] = GetInputRow(input_rows, i, 0) + x0 - xpos;
+ }
+ patches_.AddOneRow(row_ptrs.data(), ypos, x0, xsize + xextra + xpos - x0);
+ }
+
+ RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+ return c < num_channels_ ? RenderPipelineChannelMode::kInPlace
+ : RenderPipelineChannelMode::kIgnored;
+ }
+
+ const char* GetName() const override { return "Patches"; }
+
+ private:
+ const PatchDictionary& patches_;
+ const size_t num_channels_;
+};
+} // namespace
+
+std::unique_ptr<RenderPipelineStage> GetPatchesStage(
+ const PatchDictionary* patches, size_t num_channels) {
+ return jxl::make_unique<PatchDictionaryStage>(patches, num_channels);
+}
+
+} // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_patches.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_patches.h
new file mode 100644
index 0000000000..b35abdc2eb
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_patches.h
@@ -0,0 +1,22 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_PATCHES_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_PATCHES_H_
+
+#include <utility>
+
+#include "lib/jxl/patch_dictionary_internal.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+// Draws patches if applicable.
+std::unique_ptr<RenderPipelineStage> GetPatchesStage(
+ const PatchDictionary* patches, size_t num_channels);
+
+} // namespace jxl
+
+#endif // LIB_JXL_RENDER_PIPELINE_STAGE_PATCHES_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_splines.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_splines.cc
new file mode 100644
index 0000000000..4a0529ce2c
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_splines.cc
@@ -0,0 +1,62 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_splines.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_splines.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+class SplineStage : public RenderPipelineStage {
+ public:
+ explicit SplineStage(const Splines* splines)
+ : RenderPipelineStage(RenderPipelineStage::Settings()),
+ splines_(*splines) {}
+
+ void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+ size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+ size_t thread_id) const final {
+ float* row_x = GetInputRow(input_rows, 0, 0);
+ float* row_y = GetInputRow(input_rows, 1, 0);
+ float* row_b = GetInputRow(input_rows, 2, 0);
+ splines_.AddToRow(row_x, row_y, row_b, Rect(xpos, ypos, xsize, 1));
+ }
+
+ RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+ return c < 3 ? RenderPipelineChannelMode::kInPlace
+ : RenderPipelineChannelMode::kIgnored;
+ }
+
+ const char* GetName() const override { return "Splines"; }
+
+ private:
+ const Splines& splines_;
+};
+
+std::unique_ptr<RenderPipelineStage> GetSplineStage(const Splines* splines) {
+ return jxl::make_unique<SplineStage>(splines);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(GetSplineStage);
+
+std::unique_ptr<RenderPipelineStage> GetSplineStage(const Splines* splines) {
+ return HWY_DYNAMIC_DISPATCH(GetSplineStage)(splines);
+}
+
+} // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_splines.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_splines.h
new file mode 100644
index 0000000000..363af393ec
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_splines.h
@@ -0,0 +1,21 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_SPLINES_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_SPLINES_H_
+
+#include <utility>
+
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+#include "lib/jxl/splines.h"
+
+namespace jxl {
+
+// Draws splines if applicable.
+std::unique_ptr<RenderPipelineStage> GetSplineStage(const Splines* splines);
+
+} // namespace jxl
+
+#endif // LIB_JXL_RENDER_PIPELINE_STAGE_SPLINES_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_spot.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_spot.cc
new file mode 100644
index 0000000000..a43cb4e1ab
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_spot.cc
@@ -0,0 +1,51 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_spot.h"
+
+namespace jxl {
+class SpotColorStage : public RenderPipelineStage {
+ public:
+ explicit SpotColorStage(size_t spot_c, const float* spot_color)
+ : RenderPipelineStage(RenderPipelineStage::Settings()),
+ spot_c_(spot_c),
+ spot_color_(spot_color) {
+ JXL_ASSERT(spot_c_ >= 3);
+ }
+
+ void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+ size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+ size_t thread_id) const final {
+ // TODO(veluca): add SIMD.
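+    // spot_color_ holds {R, G, B, scale}; each color channel is blended
+    // towards the spot color by mix = scale * spot_channel_value.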
+ float scale = spot_color_[3];
+ for (size_t c = 0; c < 3; c++) {
+ float* JXL_RESTRICT p = GetInputRow(input_rows, c, 0);
+ const float* JXL_RESTRICT s = GetInputRow(input_rows, spot_c_, 0);
+ for (ssize_t x = -xextra; x < ssize_t(xsize + xextra); x++) {
+ float mix = scale * s[x];
+ p[x] = mix * spot_color_[c] + (1.0f - mix) * p[x];
+ }
+ }
+ }
+
+ RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+ return c < 3 ? RenderPipelineChannelMode::kInPlace
+ : c == spot_c_ ? RenderPipelineChannelMode::kInput
+ : RenderPipelineChannelMode::kIgnored;
+ }
+
+ const char* GetName() const override { return "Spot"; }
+
+ private:
+ size_t spot_c_;
+ const float* spot_color_;
+};
+
+std::unique_ptr<RenderPipelineStage> GetSpotColorStage(
+ size_t spot_c, const float* spot_color) {
+ return jxl::make_unique<SpotColorStage>(spot_c, spot_color);
+}
+
+} // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_spot.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_spot.h
new file mode 100644
index 0000000000..3e79c75823
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_spot.h
@@ -0,0 +1,21 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_SPOT_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_SPOT_H_
+
+#include <utility>
+
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+// Render the spot color channels.
+std::unique_ptr<RenderPipelineStage> GetSpotColorStage(size_t spot_c,
+ const float* spot_color);
+
+} // namespace jxl
+
+#endif // LIB_JXL_RENDER_PIPELINE_STAGE_SPOT_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_to_linear.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_to_linear.cc
new file mode 100644
index 0000000000..85eca2f039
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_to_linear.cc
@@ -0,0 +1,203 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_to_linear.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_to_linear.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/cms/tone_mapping-inl.h"
+#include "lib/jxl/cms/transfer_functions-inl.h"
+#include "lib/jxl/sanitizers.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+namespace {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::IfThenZeroElse;
+
+template <typename Op>
+struct PerChannelOp {
+ explicit PerChannelOp(Op op) : op(op) {}
+ template <typename D, typename T>
+ void Transform(D d, T* r, T* g, T* b) const {
+ *r = op.Transform(d, *r);
+ *g = op.Transform(d, *g);
+ *b = op.Transform(d, *b);
+ }
+
+ Op op;
+};
+template <typename Op>
+PerChannelOp<Op> MakePerChannelOp(Op&& op) {
+ return PerChannelOp<Op>(std::forward<Op>(op));
+}
+
+struct OpLinear {
+ template <typename D, typename T>
+ T Transform(D d, const T& encoded) const {
+ return encoded;
+ }
+};
+
+struct OpRgb {
+ template <typename D, typename T>
+ T Transform(D d, const T& encoded) const {
+ return TF_SRGB().DisplayFromEncoded(encoded);
+ }
+};
+
+struct OpPq {
+ explicit OpPq(const float intensity_target) : tf_pq_(intensity_target) {}
+ template <typename D, typename T>
+ T Transform(D d, const T& encoded) const {
+ return tf_pq_.DisplayFromEncoded(d, encoded);
+ }
+ TF_PQ tf_pq_;
+};
+
+struct OpHlg {
+ explicit OpHlg(const float luminances[3], const float intensity_target)
+ : hlg_ootf_(HlgOOTF::FromSceneLight(
+ /*display_luminance=*/intensity_target, luminances)) {}
+
+ template <typename D, typename T>
+ void Transform(D d, T* r, T* g, T* b) const {
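+    // The HLG inverse OETF is applied per lane through a scalar helper and a
+    // small stack buffer, then the OOTF is applied to the whole vectors.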
+ for (T* val : {r, g, b}) {
+ HWY_ALIGN float vals[MaxLanes(d)];
+ Store(*val, d, vals);
+ for (size_t i = 0; i < Lanes(d); ++i) {
+ vals[i] = TF_HLG_Base::DisplayFromEncoded(vals[i]);
+ }
+ *val = Load(d, vals);
+ }
+ hlg_ootf_.Apply(r, g, b);
+ }
+ HlgOOTF hlg_ootf_;
+};
+
+struct Op709 {
+ template <typename D, typename T>
+ T Transform(D d, const T& encoded) const {
+ return TF_709().DisplayFromEncoded(d, encoded);
+ }
+};
+
+struct OpGamma {
+ const float gamma;
+ template <typename D, typename T>
+ T Transform(D d, const T& encoded) const {
+ return IfThenZeroElse(Le(encoded, Set(d, 1e-5f)),
+ FastPowf(d, encoded, Set(d, gamma)));
+ }
+};
+
+struct OpInvalid {
+ template <typename D, typename T>
+ void Transform(D d, T* r, T* g, T* b) const {}
+};
+
+template <typename Op>
+class ToLinearStage : public RenderPipelineStage {
+ public:
+ explicit ToLinearStage(Op op)
+ : RenderPipelineStage(RenderPipelineStage::Settings()),
+ op_(std::move(op)) {}
+
+ explicit ToLinearStage()
+ : RenderPipelineStage(RenderPipelineStage::Settings()), valid_(false) {}
+
+ void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+ size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+ size_t thread_id) const final {
+ const HWY_FULL(float) d;
+ const size_t xsize_v = RoundUpTo(xsize, Lanes(d));
+ float* JXL_RESTRICT row0 = GetInputRow(input_rows, 0, 0);
+ float* JXL_RESTRICT row1 = GetInputRow(input_rows, 1, 0);
+ float* JXL_RESTRICT row2 = GetInputRow(input_rows, 2, 0);
+    // All calculations are lane-wise, but some might require value-dependent
+    // behaviour (e.g. NearestInt). Temporarily unpoison the last vector
+    // tail.
+ msan::UnpoisonMemory(row0 + xsize, sizeof(float) * (xsize_v - xsize));
+ msan::UnpoisonMemory(row1 + xsize, sizeof(float) * (xsize_v - xsize));
+ msan::UnpoisonMemory(row2 + xsize, sizeof(float) * (xsize_v - xsize));
+ for (ssize_t x = -xextra; x < (ssize_t)(xsize + xextra); x += Lanes(d)) {
+ auto r = LoadU(d, row0 + x);
+ auto g = LoadU(d, row1 + x);
+ auto b = LoadU(d, row2 + x);
+ op_.Transform(d, &r, &g, &b);
+ StoreU(r, d, row0 + x);
+ StoreU(g, d, row1 + x);
+ StoreU(b, d, row2 + x);
+ }
+ msan::PoisonMemory(row0 + xsize, sizeof(float) * (xsize_v - xsize));
+ msan::PoisonMemory(row1 + xsize, sizeof(float) * (xsize_v - xsize));
+ msan::PoisonMemory(row2 + xsize, sizeof(float) * (xsize_v - xsize));
+ }
+
+ RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+ return c < 3 ? RenderPipelineChannelMode::kInPlace
+ : RenderPipelineChannelMode::kIgnored;
+ }
+
+ const char* GetName() const override { return "ToLinear"; }
+
+ private:
+ Status IsInitialized() const override { return valid_; }
+
+ Op op_;
+ bool valid_ = true;
+};
+
+template <typename Op>
+std::unique_ptr<ToLinearStage<Op>> MakeToLinearStage(Op&& op) {
+ return jxl::make_unique<ToLinearStage<Op>>(std::forward<Op>(op));
+}
+
+std::unique_ptr<RenderPipelineStage> GetToLinearStage(
+ const OutputEncodingInfo& output_encoding_info) {
+ const auto& tf = output_encoding_info.color_encoding.Tf();
+ if (tf.IsLinear()) {
+ return MakeToLinearStage(MakePerChannelOp(OpLinear()));
+ } else if (tf.IsSRGB()) {
+ return MakeToLinearStage(MakePerChannelOp(OpRgb()));
+ } else if (tf.IsPQ()) {
+ return MakeToLinearStage(
+ MakePerChannelOp(OpPq(output_encoding_info.orig_intensity_target)));
+ } else if (tf.IsHLG()) {
+ return MakeToLinearStage(OpHlg(output_encoding_info.luminances,
+ output_encoding_info.orig_intensity_target));
+ } else if (tf.Is709()) {
+ return MakeToLinearStage(MakePerChannelOp(Op709()));
+ } else if (tf.have_gamma || tf.IsDCI()) {
+ return MakeToLinearStage(
+ MakePerChannelOp(OpGamma{1.f / output_encoding_info.inverse_gamma}));
+ } else {
+ return jxl::make_unique<ToLinearStage<OpInvalid>>();
+ }
+}
+
+} // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(GetToLinearStage);
+
+std::unique_ptr<RenderPipelineStage> GetToLinearStage(
+ const OutputEncodingInfo& output_encoding_info) {
+ return HWY_DYNAMIC_DISPATCH(GetToLinearStage)(output_encoding_info);
+}
+
+} // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_to_linear.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_to_linear.h
new file mode 100644
index 0000000000..ccee7b09f0
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_to_linear.h
@@ -0,0 +1,21 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_TO_LINEAR_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_TO_LINEAR_H_
+
+#include "lib/jxl/dec_xyb.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+// Converts the color channels from `output_encoding_info.color_encoding` to
+// linear.
+std::unique_ptr<RenderPipelineStage> GetToLinearStage(
+ const OutputEncodingInfo& output_encoding_info);
+
+} // namespace jxl
+
+#endif // LIB_JXL_RENDER_PIPELINE_STAGE_TO_LINEAR_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_tone_mapping.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_tone_mapping.cc
new file mode 100644
index 0000000000..2a272e15dc
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_tone_mapping.cc
@@ -0,0 +1,147 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_tone_mapping.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_tone_mapping.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/cms/tone_mapping-inl.h"
+#include "lib/jxl/dec_xyb-inl.h"
+#include "lib/jxl/sanitizers.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+class ToneMappingStage : public RenderPipelineStage {
+ public:
+ explicit ToneMappingStage(OutputEncodingInfo output_encoding_info)
+ : RenderPipelineStage(RenderPipelineStage::Settings()),
+ output_encoding_info_(std::move(output_encoding_info)) {
+ if (output_encoding_info_.desired_intensity_target ==
+ output_encoding_info_.orig_intensity_target) {
+ // No tone mapping requested.
+ return;
+ }
+ const auto& orig_tf = output_encoding_info_.orig_color_encoding.Tf();
+ const auto& dest_tf = output_encoding_info_.color_encoding.Tf();
+ if (orig_tf.IsPQ() && output_encoding_info_.desired_intensity_target <
+ output_encoding_info_.orig_intensity_target) {
+ tone_mapper_ = jxl::make_unique<ToneMapper>(
+ /*source_range=*/std::pair<float, float>(
+ 0, output_encoding_info_.orig_intensity_target),
+ /*target_range=*/
+ std::pair<float, float>(
+ 0, output_encoding_info_.desired_intensity_target),
+ output_encoding_info_.luminances);
+ } else if (orig_tf.IsHLG() && !dest_tf.IsHLG()) {
+ hlg_ootf_ = jxl::make_unique<HlgOOTF>(
+ /*source_luminance=*/output_encoding_info_.orig_intensity_target,
+ /*target_luminance=*/output_encoding_info_.desired_intensity_target,
+ output_encoding_info_.luminances);
+ }
+
+ if (dest_tf.IsPQ() && (tone_mapper_ || hlg_ootf_)) {
+ to_intensity_target_ =
+ 10000.f / output_encoding_info_.orig_intensity_target;
+ from_desired_intensity_target_ =
+ output_encoding_info_.desired_intensity_target / 10000.f;
+ }
+ }
+
+ bool IsNeeded() const { return tone_mapper_ || hlg_ootf_; }
+
+ void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+ size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+ size_t thread_id) const final {
+ if (!(tone_mapper_ || hlg_ootf_)) return;
+
+ const HWY_FULL(float) d;
+ const size_t xsize_v = RoundUpTo(xsize, Lanes(d));
+ float* JXL_RESTRICT row0 = GetInputRow(input_rows, 0, 0);
+ float* JXL_RESTRICT row1 = GetInputRow(input_rows, 1, 0);
+ float* JXL_RESTRICT row2 = GetInputRow(input_rows, 2, 0);
+    // All calculations are lane-wise, but some might require value-dependent
+    // behaviour (e.g. NearestInt). Temporarily unpoison the last vector
+    // tail.
+ msan::UnpoisonMemory(row0 + xsize, sizeof(float) * (xsize_v - xsize));
+ msan::UnpoisonMemory(row1 + xsize, sizeof(float) * (xsize_v - xsize));
+ msan::UnpoisonMemory(row2 + xsize, sizeof(float) * (xsize_v - xsize));
+ for (ssize_t x = -xextra; x < (ssize_t)(xsize + xextra); x += Lanes(d)) {
+ auto r = LoadU(d, row0 + x);
+ auto g = LoadU(d, row1 + x);
+ auto b = LoadU(d, row2 + x);
+ if (tone_mapper_ || hlg_ootf_) {
+ r = Mul(r, Set(d, to_intensity_target_));
+ g = Mul(g, Set(d, to_intensity_target_));
+ b = Mul(b, Set(d, to_intensity_target_));
+ if (tone_mapper_) {
+ tone_mapper_->ToneMap(&r, &g, &b);
+ } else {
+ JXL_ASSERT(hlg_ootf_);
+ hlg_ootf_->Apply(&r, &g, &b);
+ }
+ if (tone_mapper_ || hlg_ootf_->WarrantsGamutMapping()) {
+ GamutMap(&r, &g, &b, output_encoding_info_.luminances);
+ }
+ r = Mul(r, Set(d, from_desired_intensity_target_));
+ g = Mul(g, Set(d, from_desired_intensity_target_));
+ b = Mul(b, Set(d, from_desired_intensity_target_));
+ }
+ StoreU(r, d, row0 + x);
+ StoreU(g, d, row1 + x);
+ StoreU(b, d, row2 + x);
+ }
+ msan::PoisonMemory(row0 + xsize, sizeof(float) * (xsize_v - xsize));
+ msan::PoisonMemory(row1 + xsize, sizeof(float) * (xsize_v - xsize));
+ msan::PoisonMemory(row2 + xsize, sizeof(float) * (xsize_v - xsize));
+ }
+
+ RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+ return c < 3 ? RenderPipelineChannelMode::kInPlace
+ : RenderPipelineChannelMode::kIgnored;
+ }
+
+ const char* GetName() const override { return "ToneMapping"; }
+
+ private:
+ using ToneMapper = Rec2408ToneMapper<HWY_FULL(float)>;
+ OutputEncodingInfo output_encoding_info_;
+ std::unique_ptr<ToneMapper> tone_mapper_;
+ std::unique_ptr<HlgOOTF> hlg_ootf_;
+  // When the target colorspace is PQ, 1 represents 10000 nits instead of
+  // orig_intensity_target. These two factors temporarily convert between that
+  // convention and the range the tone mappers expect, when needed.
+ float to_intensity_target_ = 1.f;
+ float from_desired_intensity_target_ = 1.f;
+};
+
+std::unique_ptr<RenderPipelineStage> GetToneMappingStage(
+ const OutputEncodingInfo& output_encoding_info) {
+ auto stage = jxl::make_unique<ToneMappingStage>(output_encoding_info);
+ if (!stage->IsNeeded()) return nullptr;
+ return stage;
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(GetToneMappingStage);
+
+std::unique_ptr<RenderPipelineStage> GetToneMappingStage(
+ const OutputEncodingInfo& output_encoding_info) {
+ return HWY_DYNAMIC_DISPATCH(GetToneMappingStage)(output_encoding_info);
+}
+
+} // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_tone_mapping.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_tone_mapping.h
new file mode 100644
index 0000000000..57eb9a9abf
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_tone_mapping.h
@@ -0,0 +1,36 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_TONE_MAPPING_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_TONE_MAPPING_H_
+#include <math.h>
+#include <stdint.h>
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/dec_xyb.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+// Tone maps the image if appropriate. It must be in linear space and
+// `output_encoding_info.luminances` must contain the luminance for the
+// primaries of that space. It must also be encoded such that (1, 1, 1)
+// represents `output_encoding_info.orig_intensity_target` nits, unless
+// `output_encoding_info.color_encoding.tf.IsPQ()`, in which case (1, 1, 1) must
+// represent 10000 nits. This corresponds to what XYBStage outputs. After this
+// stage, (1, 1, 1) will represent
+// `output_encoding_info.desired_intensity_target` nits, except in the PQ
+// special case in which it remains 10000.
+//
+// If no tone mapping is necessary, this will return nullptr.
+std::unique_ptr<RenderPipelineStage> GetToneMappingStage(
+ const OutputEncodingInfo& output_encoding_info);
+
+} // namespace jxl
+
+#endif // LIB_JXL_RENDER_PIPELINE_STAGE_TONE_MAPPING_H_
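
The "(1, 1, 1) == intensity target nits" convention described above can be
illustrated with a small standalone helper (hypothetical, not part of libjxl):
re-expressing a linear sample under a different intensity target is a plain ratio.

#include <cstdio>

// Converts a linear sample encoded as "1.0 == source_target_nits" into one
// encoded as "1.0 == desired_target_nits".
float RescaleIntensity(float sample, float source_target_nits,
                       float desired_target_nits) {
  return sample * source_target_nits / desired_target_nits;
}

int main() {
  // Diffuse white (1.0) mastered for 4000 nits, re-expressed for a 255-nit
  // output: ~15.7 before tone mapping compresses it back into range.
  std::printf("%f\n", RescaleIntensity(1.0f, 4000.0f, 255.0f));
  return 0;
}
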
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_upsampling.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_upsampling.cc
new file mode 100644
index 0000000000..ade37d59a6
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_upsampling.cc
@@ -0,0 +1,192 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_upsampling.h"
+
+#include "lib/jxl/base/status.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_upsampling.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/sanitizers.h"
+#include "lib/jxl/simd_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Clamp;
+using hwy::HWY_NAMESPACE::Max;
+using hwy::HWY_NAMESPACE::Min;
+using hwy::HWY_NAMESPACE::MulAdd;
+
+class UpsamplingStage : public RenderPipelineStage {
+ public:
+ explicit UpsamplingStage(const CustomTransformData& ups_factors, size_t c,
+ size_t shift)
+ : RenderPipelineStage(RenderPipelineStage::Settings::Symmetric(
+ /*shift=*/shift, /*border=*/2)),
+ c_(c) {
+ const float* weights = shift == 1 ? ups_factors.upsampling2_weights
+ : shift == 2 ? ups_factors.upsampling4_weights
+ : ups_factors.upsampling8_weights;
+ size_t N = 1 << (shift - 1);
+ for (size_t i = 0; i < 5 * N; i++) {
+ for (size_t j = 0; j < 5 * N; j++) {
+ size_t y = std::min(i, j);
+ size_t x = std::max(i, j);
+ kernel_[j / 5][i / 5][j % 5][i % 5] =
+ weights[5 * N * y - y * (y - 1) / 2 + x - y];
+ }
+ }
+ }
+
+ void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+ size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+ size_t thread_id) const final {
+ static HWY_FULL(float) df;
+ size_t shift = settings_.shift_x;
+ size_t N = 1 << shift;
+ const size_t xsize_v = RoundUpTo(xsize, Lanes(df));
+ for (ssize_t iy = -2; iy <= 2; iy++) {
+ msan::UnpoisonMemory(GetInputRow(input_rows, c_, iy) + xsize + 2,
+ sizeof(float) * (xsize_v - xsize));
+ }
+ JXL_ASSERT(xextra == 0);
+ ssize_t x0 = 0;
+ ssize_t x1 = xsize;
+ if (N == 2) {
+ ProcessRowImpl<2>(input_rows, output_rows, x0, x1);
+ }
+ if (N == 4) {
+ ProcessRowImpl<4>(input_rows, output_rows, x0, x1);
+ }
+ if (N == 8) {
+ ProcessRowImpl<8>(input_rows, output_rows, x0, x1);
+ }
+ for (size_t oy = 0; oy < N; oy++) {
+ float* dst_row = GetOutputRow(output_rows, c_, oy);
+ msan::PoisonMemory(dst_row + xsize * N,
+ sizeof(float) * (xsize_v - xsize) * N);
+ }
+ }
+
+ RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+ return c == c_ ? RenderPipelineChannelMode::kInOut
+ : RenderPipelineChannelMode::kIgnored;
+ }
+
+ const char* GetName() const override { return "Upsample"; }
+
+ private:
+ template <size_t N>
+ JXL_INLINE float Kernel(size_t x, size_t y, ssize_t ix, ssize_t iy) const {
+ ix += 2;
+ iy += 2;
+ if (N == 2) {
+ return kernel_[0][0][y % 2 ? 4 - iy : iy][x % 2 ? 4 - ix : ix];
+ }
+ if (N == 4) {
+ return kernel_[y % 4 < 2 ? y % 2 : 1 - y % 2]
+ [x % 4 < 2 ? x % 2 : 1 - x % 2][y % 4 < 2 ? iy : 4 - iy]
+ [x % 4 < 2 ? ix : 4 - ix];
+ }
+ if (N == 8) {
+ return kernel_[y % 8 < 4 ? y % 4 : 3 - y % 4]
+ [x % 8 < 4 ? x % 4 : 3 - x % 4][y % 8 < 4 ? iy : 4 - iy]
+ [x % 8 < 4 ? ix : 4 - ix];
+ }
+ JXL_UNREACHABLE("Invalid upsample");
+ }
+
+ template <ssize_t N>
+ void ProcessRowImpl(const RowInfo& input_rows, const RowInfo& output_rows,
+ ssize_t x0, ssize_t x1) const {
+ static HWY_FULL(float) df;
+ using V = hwy::HWY_NAMESPACE::Vec<HWY_FULL(float)>;
+ V ups0, ups1, ups2, ups3, ups4, ups5, ups6, ups7;
+ (void)ups2, (void)ups3, (void)ups4, (void)ups5, (void)ups6, (void)ups7;
+ // Once we have C++17 available, change this back to `V* ups[N]` and
+ // initialize using `if constexpr` below.
+ V* ups[8] = {};
+ static_assert(N == 2 || N == 4 || N == 8, "N must be 2, 4, or 8");
+ if (N >= 2) {
+ ups[0] = &ups0;
+ ups[1] = &ups1;
+ }
+ if (N >= 4) {
+ ups[2] = &ups2;
+ ups[3] = &ups3;
+ }
+ if (N == 8) {
+ ups[4] = &ups4;
+ ups[5] = &ups5;
+ ups[6] = &ups6;
+ ups[7] = &ups7;
+ }
+
+ for (size_t oy = 0; oy < N; oy++) {
+ float* dst_row = GetOutputRow(output_rows, c_, oy);
+ for (ssize_t x = x0; x < x1; x += Lanes(df)) {
+ for (size_t ox = 0; ox < N; ox++) {
+ auto result = Zero(df);
+ auto min = LoadU(df, GetInputRow(input_rows, c_, 0) + x);
+ auto max = min;
+ for (ssize_t iy = -2; iy <= 2; iy++) {
+ for (ssize_t ix = -2; ix <= 2; ix++) {
+ auto v = LoadU(df, GetInputRow(input_rows, c_, iy) + x + ix);
+ result = MulAdd(Set(df, Kernel<N>(ox, oy, ix, iy)), v, result);
+ min = Min(v, min);
+ max = Max(v, max);
+ }
+ }
+ // Avoid overshooting.
+ *ups[ox] = Clamp(result, min, max);
+ }
+ if (N == 2) {
+ StoreInterleaved(df, ups0, ups1, dst_row + x * N);
+ }
+ if (N == 4) {
+ StoreInterleaved(df, ups0, ups1, ups2, ups3, dst_row + x * N);
+ }
+ if (N == 8) {
+ StoreInterleaved(df, ups0, ups1, ups2, ups3, ups4, ups5, ups6, ups7,
+ dst_row + x * N);
+ }
+ }
+ }
+ }
+
+ size_t c_;
+ float kernel_[4][4][5][5];
+};
+
+std::unique_ptr<RenderPipelineStage> GetUpsamplingStage(
+ const CustomTransformData& ups_factors, size_t c, size_t shift) {
+ return jxl::make_unique<UpsamplingStage>(ups_factors, c, shift);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(GetUpsamplingStage);
+
+std::unique_ptr<RenderPipelineStage> GetUpsamplingStage(
+ const CustomTransformData& ups_factors, size_t c, size_t shift) {
+ JXL_ASSERT(shift != 0);
+ JXL_ASSERT(shift <= 3);
+ return HWY_DYNAMIC_DISPATCH(GetUpsamplingStage)(ups_factors, c, shift);
+}
+
+} // namespace jxl
+#endif
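
The UpsamplingStage constructor above expands the symmetric 5N x 5N kernel from a
packed upper-triangular weight array via the index `5 * N * y - y * (y - 1) / 2 + x - y`
with `y = min(i, j)` and `x = max(i, j)`. A standalone sketch of the same indexing
(independent of libjxl, with M standing for 5 * N):

#include <algorithm>
#include <cstddef>
#include <vector>

// Expands packed upper-triangular storage of a symmetric M x M matrix (row y
// stores columns y..M-1) into the full matrix, using the same index formula as
// UpsamplingStage's constructor.
std::vector<float> ExpandSymmetric(const std::vector<float>& packed,
                                   std::size_t M) {
  std::vector<float> full(M * M);
  for (std::size_t i = 0; i < M; i++) {
    for (std::size_t j = 0; j < M; j++) {
      std::size_t y = std::min(i, j);  // row within the packed triangle
      std::size_t x = std::max(i, j);  // column within the packed triangle
      full[i * M + j] = packed[M * y - y * (y - 1) / 2 + x - y];
    }
  }
  return full;  // full[i * M + j] == full[j * M + i] by construction
}
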
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_upsampling.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_upsampling.h
new file mode 100644
index 0000000000..7d5defd23c
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_upsampling.h
@@ -0,0 +1,26 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_UPSAMPLING_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_UPSAMPLING_H_
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/image_metadata.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+// Upsamples the given channel by the given factor.
+std::unique_ptr<RenderPipelineStage> GetUpsamplingStage(
+ const CustomTransformData& ups_factors, size_t c, size_t shift);
+} // namespace jxl
+
+#endif // LIB_JXL_RENDER_PIPELINE_STAGE_UPSAMPLING_H_
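
Each upsampled output sample produced by ProcessRowImpl in stage_upsampling.cc above
is a 5x5 weighted sum of input samples, clamped to the minimum and maximum of that
window so the kernel cannot overshoot. A scalar sketch of one output tap; `in` and
`w` are placeholder callables standing in for the real row pointers and the
per-phase Kernel<N>() lookup:

#include <algorithm>
#include <functional>

// Scalar form of the inner loop of UpsamplingStage::ProcessRowImpl for a
// single output sample.
float UpsampleTap(const std::function<float(int, int)>& in,
                  const std::function<float(int, int)>& w) {
  float result = 0.0f;
  float lo = in(0, 0);
  float hi = in(0, 0);
  for (int dy = -2; dy <= 2; dy++) {
    for (int dx = -2; dx <= 2; dx++) {
      float v = in(dy, dx);
      result += w(dy, dx) * v;
      lo = std::min(lo, v);
      hi = std::max(hi, v);
    }
  }
  return std::min(std::max(result, lo), hi);  // avoid overshooting
}
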
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_write.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_write.cc
new file mode 100644
index 0000000000..847972acc8
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_write.cc
@@ -0,0 +1,671 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_write.h"
+
+#include <type_traits>
+
+#include "lib/jxl/alpha.h"
+#include "lib/jxl/base/common.h"
+#include "lib/jxl/dec_cache.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/sanitizers.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_write.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::Clamp;
+using hwy::HWY_NAMESPACE::Div;
+using hwy::HWY_NAMESPACE::Max;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::NearestInt;
+using hwy::HWY_NAMESPACE::Or;
+using hwy::HWY_NAMESPACE::Rebind;
+using hwy::HWY_NAMESPACE::ShiftLeftSame;
+using hwy::HWY_NAMESPACE::ShiftRightSame;
+using hwy::HWY_NAMESPACE::VFromD;
+
+// 8x8 ordered dithering pattern from
+// https://en.wikipedia.org/wiki/Ordered_dithering
+// scaled to have an average of 0 and be fully contained in (-0.5, 0.5).
+// The matrix is duplicated in width to avoid inconsistencies or out-of-bounds
+// reads when doing unaligned operations.
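+// Each entry equals (bayer8[y][x % 8] + 0.5) / 64 - 0.5, where bayer8 is the
+// classic 8x8 Bayer index matrix with values 0..63 (its first row being
+// 0, 32, 8, 40, 2, 34, 10, 42).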
+const float kDither[(2 * 8) * 8] = {
+ -0.4921875, 0.0078125, -0.3671875, 0.1328125, //
+ -0.4609375, 0.0390625, -0.3359375, 0.1640625, //
+ -0.4921875, 0.0078125, -0.3671875, 0.1328125, //
+ -0.4609375, 0.0390625, -0.3359375, 0.1640625, //
+ //
+ 0.2578125, -0.2421875, 0.3828125, -0.1171875, //
+ 0.2890625, -0.2109375, 0.4140625, -0.0859375, //
+ 0.2578125, -0.2421875, 0.3828125, -0.1171875, //
+ 0.2890625, -0.2109375, 0.4140625, -0.0859375, //
+ //
+ -0.3046875, 0.1953125, -0.4296875, 0.0703125, //
+ -0.2734375, 0.2265625, -0.3984375, 0.1015625, //
+ -0.3046875, 0.1953125, -0.4296875, 0.0703125, //
+ -0.2734375, 0.2265625, -0.3984375, 0.1015625, //
+ //
+ 0.4453125, -0.0546875, 0.3203125, -0.1796875, //
+ 0.4765625, -0.0234375, 0.3515625, -0.1484375, //
+ 0.4453125, -0.0546875, 0.3203125, -0.1796875, //
+ 0.4765625, -0.0234375, 0.3515625, -0.1484375, //
+ //
+ -0.4453125, 0.0546875, -0.3203125, 0.1796875, //
+ -0.4765625, 0.0234375, -0.3515625, 0.1484375, //
+ -0.4453125, 0.0546875, -0.3203125, 0.1796875, //
+ -0.4765625, 0.0234375, -0.3515625, 0.1484375, //
+ //
+ 0.3046875, -0.1953125, 0.4296875, -0.0703125, //
+ 0.2734375, -0.2265625, 0.3984375, -0.1015625, //
+ 0.3046875, -0.1953125, 0.4296875, -0.0703125, //
+ 0.2734375, -0.2265625, 0.3984375, -0.1015625, //
+ //
+ -0.2578125, 0.2421875, -0.3828125, 0.1171875, //
+ -0.2890625, 0.2109375, -0.4140625, 0.0859375, //
+ -0.2578125, 0.2421875, -0.3828125, 0.1171875, //
+ -0.2890625, 0.2109375, -0.4140625, 0.0859375, //
+ //
+ 0.4921875, -0.0078125, 0.3671875, -0.1328125, //
+ 0.4609375, -0.0390625, 0.3359375, -0.1640625, //
+ 0.4921875, -0.0078125, 0.3671875, -0.1328125, //
+ 0.4609375, -0.0390625, 0.3359375, -0.1640625, //
+};
+
+using DF = HWY_FULL(float);
+
+// Converts `v` to an appropriate value for the given unsigned type.
+// If the unsigned type is an 8-bit type, performs ordered dithering.
+template <typename T>
+VFromD<Rebind<T, DF>> MakeUnsigned(VFromD<DF> v, size_t x0, size_t y0,
+ VFromD<DF> mul) {
+ static_assert(std::is_unsigned<T>::value, "T must be an unsigned type");
+ using DU = Rebind<T, DF>;
+ v = Mul(v, mul);
+ // TODO(veluca): if constexpr with C++17
+ if (sizeof(T) == 1) {
+ size_t pos = (y0 % 8) * (2 * 8) + (x0 % 8);
+#if HWY_TARGET != HWY_SCALAR
+ auto dither = LoadDup128(DF(), kDither + pos);
+#else
+ auto dither = LoadU(DF(), kDither + pos);
+#endif
+ v = Add(v, dither);
+ }
+ v = Clamp(Zero(DF()), v, mul);
+ return DemoteTo(DU(), NearestInt(v));
+}
+
+class WriteToOutputStage : public RenderPipelineStage {
+ public:
+ WriteToOutputStage(const ImageOutput& main_output, size_t width,
+ size_t height, bool has_alpha, bool unpremul_alpha,
+ size_t alpha_c, Orientation undo_orientation,
+ const std::vector<ImageOutput>& extra_output)
+ : RenderPipelineStage(RenderPipelineStage::Settings()),
+ width_(width),
+ height_(height),
+ main_(main_output),
+ num_color_(main_.num_channels_ < 3 ? 1 : 3),
+ want_alpha_(main_.num_channels_ == 2 || main_.num_channels_ == 4),
+ has_alpha_(has_alpha),
+ unpremul_alpha_(unpremul_alpha),
+ alpha_c_(alpha_c),
+ flip_x_(ShouldFlipX(undo_orientation)),
+ flip_y_(ShouldFlipY(undo_orientation)),
+ transpose_(ShouldTranspose(undo_orientation)),
+ opaque_alpha_(kMaxPixelsPerCall, 1.0f) {
+ for (size_t ec = 0; ec < extra_output.size(); ++ec) {
+ if (extra_output[ec].callback.IsPresent() || extra_output[ec].buffer) {
+ Output extra(extra_output[ec]);
+ extra.channel_index_ = 3 + ec;
+ extra_channels_.push_back(extra);
+ }
+ }
+ }
+
+ WriteToOutputStage(const WriteToOutputStage&) = delete;
+ WriteToOutputStage& operator=(const WriteToOutputStage&) = delete;
+ WriteToOutputStage(WriteToOutputStage&&) = delete;
+ WriteToOutputStage& operator=(WriteToOutputStage&&) = delete;
+
+ ~WriteToOutputStage() override {
+ if (main_.run_opaque_) {
+ main_.pixel_callback_.destroy(main_.run_opaque_);
+ }
+ for (auto& extra : extra_channels_) {
+ if (extra.run_opaque_) {
+ extra.pixel_callback_.destroy(extra.run_opaque_);
+ }
+ }
+ }
+
+ void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+ size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+ size_t thread_id) const final {
+ JXL_DASSERT(xextra == 0);
+ JXL_DASSERT(main_.run_opaque_ || main_.buffer_);
+ if (ypos >= height_) return;
+ if (xpos >= width_) return;
+ if (flip_y_) {
+ ypos = height_ - 1u - ypos;
+ }
+ size_t limit = std::min(xsize, width_ - xpos);
+ for (size_t x0 = 0; x0 < limit; x0 += kMaxPixelsPerCall) {
+ size_t xstart = xpos + x0;
+ size_t len = std::min<size_t>(kMaxPixelsPerCall, limit - x0);
+
+ const float* line_buffers[4];
+ for (size_t c = 0; c < num_color_; c++) {
+ line_buffers[c] = GetInputRow(input_rows, c, 0) + x0;
+ }
+ if (has_alpha_) {
+ line_buffers[num_color_] = GetInputRow(input_rows, alpha_c_, 0) + x0;
+ } else {
+ // opaque_alpha_ is a way to set all values to 1.0f.
+ line_buffers[num_color_] = opaque_alpha_.data();
+ }
+ if (has_alpha_ && want_alpha_ && unpremul_alpha_) {
+ UnpremulAlpha(thread_id, len, line_buffers);
+ }
+ OutputBuffers(main_, thread_id, ypos, xstart, len, line_buffers);
+ for (const auto& extra : extra_channels_) {
+ line_buffers[0] = GetInputRow(input_rows, extra.channel_index_, 0) + x0;
+ OutputBuffers(extra, thread_id, ypos, xstart, len, line_buffers);
+ }
+ }
+ }
+
+ RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+ if (c < num_color_ || (has_alpha_ && c == alpha_c_)) {
+ return RenderPipelineChannelMode::kInput;
+ }
+ for (const auto& extra : extra_channels_) {
+ if (c == extra.channel_index_) {
+ return RenderPipelineChannelMode::kInput;
+ }
+ }
+ return RenderPipelineChannelMode::kIgnored;
+ }
+
+ const char* GetName() const override { return "WritePixelCB"; }
+
+ private:
+ struct Output {
+ Output(const ImageOutput& image_out)
+ : pixel_callback_(image_out.callback),
+ buffer_(image_out.buffer),
+ buffer_size_(image_out.buffer_size),
+ stride_(image_out.stride),
+ num_channels_(image_out.format.num_channels),
+ swap_endianness_(SwapEndianness(image_out.format.endianness)),
+ data_type_(image_out.format.data_type),
+ bits_per_sample_(image_out.bits_per_sample) {}
+
+ Status PrepareForThreads(size_t num_threads) {
+ if (pixel_callback_.IsPresent()) {
+ run_opaque_ =
+ pixel_callback_.Init(num_threads, /*num_pixels=*/kMaxPixelsPerCall);
+ JXL_RETURN_IF_ERROR(run_opaque_ != nullptr);
+ } else {
+ JXL_RETURN_IF_ERROR(buffer_ != nullptr);
+ }
+ return true;
+ }
+
+ PixelCallback pixel_callback_;
+ void* run_opaque_ = nullptr;
+ void* buffer_ = nullptr;
+ size_t buffer_size_;
+ size_t stride_;
+ size_t num_channels_;
+ bool swap_endianness_;
+ JxlDataType data_type_;
+ size_t bits_per_sample_;
+ size_t channel_index_; // used for extra_channels
+ };
+
+ Status PrepareForThreads(size_t num_threads) override {
+ JXL_RETURN_IF_ERROR(main_.PrepareForThreads(num_threads));
+ for (auto& extra : extra_channels_) {
+ JXL_RETURN_IF_ERROR(extra.PrepareForThreads(num_threads));
+ }
+ temp_out_.resize(num_threads);
+ for (CacheAlignedUniquePtr& temp : temp_out_) {
+ temp = AllocateArray(sizeof(float) * kMaxPixelsPerCall *
+ main_.num_channels_);
+ }
+ if ((has_alpha_ && want_alpha_ && unpremul_alpha_) || flip_x_) {
+ temp_in_.resize(num_threads * main_.num_channels_);
+ for (CacheAlignedUniquePtr& temp : temp_in_) {
+ temp = AllocateArray(sizeof(float) * kMaxPixelsPerCall);
+ }
+ }
+ return true;
+ }
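+  // Undoing any of the eight EXIF orientations decomposes into an optional
+  // transposition plus optional flips along the horizontal and vertical axes;
+  // the helpers below select which of those apply.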
+ static bool ShouldFlipX(Orientation undo_orientation) {
+ return (undo_orientation == Orientation::kFlipHorizontal ||
+ undo_orientation == Orientation::kRotate180 ||
+ undo_orientation == Orientation::kRotate270 ||
+ undo_orientation == Orientation::kAntiTranspose);
+ }
+ static bool ShouldFlipY(Orientation undo_orientation) {
+ return (undo_orientation == Orientation::kFlipVertical ||
+ undo_orientation == Orientation::kRotate180 ||
+ undo_orientation == Orientation::kRotate90 ||
+ undo_orientation == Orientation::kAntiTranspose);
+ }
+ static bool ShouldTranspose(Orientation undo_orientation) {
+ return (undo_orientation == Orientation::kTranspose ||
+ undo_orientation == Orientation::kRotate90 ||
+ undo_orientation == Orientation::kRotate270 ||
+ undo_orientation == Orientation::kAntiTranspose);
+ }
+
+ void UnpremulAlpha(size_t thread_id, size_t len,
+ const float** line_buffers) const {
+ const HWY_FULL(float) d;
+ auto one = Set(d, 1.0f);
+ float* temp_in[4];
+ for (size_t c = 0; c < main_.num_channels_; ++c) {
+ size_t tix = thread_id * main_.num_channels_ + c;
+ temp_in[c] = reinterpret_cast<float*>(temp_in_[tix].get());
+ memcpy(temp_in[c], line_buffers[c], sizeof(float) * len);
+ }
+ auto small_alpha = Set(d, kSmallAlpha);
+ for (size_t ix = 0; ix < len; ix += Lanes(d)) {
+ auto alpha = LoadU(d, temp_in[num_color_] + ix);
+ auto mul = Div(one, Max(small_alpha, alpha));
+ for (size_t c = 0; c < num_color_; ++c) {
+ auto val = LoadU(d, temp_in[c] + ix);
+ StoreU(Mul(val, mul), d, temp_in[c] + ix);
+ }
+ }
+ for (size_t c = 0; c < main_.num_channels_; ++c) {
+ line_buffers[c] = temp_in[c];
+ }
+ }
+
+ void OutputBuffers(const Output& out, size_t thread_id, size_t ypos,
+ size_t xstart, size_t len, const float* input[4]) const {
+ if (flip_x_) {
+ FlipX(out, thread_id, len, &xstart, input);
+ }
+ if (out.data_type_ == JXL_TYPE_UINT8) {
+ uint8_t* JXL_RESTRICT temp =
+ reinterpret_cast<uint8_t*>(temp_out_[thread_id].get());
+ StoreUnsignedRow(out, input, len, temp, xstart, ypos);
+ WriteToOutput(out, thread_id, ypos, xstart, len, temp);
+ } else if (out.data_type_ == JXL_TYPE_UINT16 ||
+ out.data_type_ == JXL_TYPE_FLOAT16) {
+ uint16_t* JXL_RESTRICT temp =
+ reinterpret_cast<uint16_t*>(temp_out_[thread_id].get());
+ if (out.data_type_ == JXL_TYPE_UINT16) {
+ StoreUnsignedRow(out, input, len, temp, xstart, ypos);
+ } else {
+ StoreFloat16Row(out, input, len, temp);
+ }
+ if (out.swap_endianness_) {
+ const HWY_FULL(uint16_t) du;
+ size_t output_len = len * out.num_channels_;
+ for (size_t j = 0; j < output_len; j += Lanes(du)) {
+ auto v = LoadU(du, temp + j);
+ auto vswap = Or(ShiftRightSame(v, 8), ShiftLeftSame(v, 8));
+ StoreU(vswap, du, temp + j);
+ }
+ }
+ WriteToOutput(out, thread_id, ypos, xstart, len, temp);
+ } else if (out.data_type_ == JXL_TYPE_FLOAT) {
+ float* JXL_RESTRICT temp =
+ reinterpret_cast<float*>(temp_out_[thread_id].get());
+ StoreFloatRow(out, input, len, temp);
+ if (out.swap_endianness_) {
+ size_t output_len = len * out.num_channels_;
+ for (size_t j = 0; j < output_len; ++j) {
+ temp[j] = BSwapFloat(temp[j]);
+ }
+ }
+ WriteToOutput(out, thread_id, ypos, xstart, len, temp);
+ }
+ }
+
+ void FlipX(const Output& out, size_t thread_id, size_t len, size_t* xstart,
+ const float** line_buffers) const {
+ float* temp_in[4];
+ for (size_t c = 0; c < out.num_channels_; ++c) {
+ size_t tix = thread_id * main_.num_channels_ + c;
+ temp_in[c] = reinterpret_cast<float*>(temp_in_[tix].get());
+ if (temp_in[c] != line_buffers[c]) {
+ memcpy(temp_in[c], line_buffers[c], sizeof(float) * len);
+ }
+ }
+ size_t last = (len - 1u);
+ size_t num = (len / 2);
+ for (size_t i = 0; i < num; ++i) {
+ for (size_t c = 0; c < out.num_channels_; ++c) {
+ std::swap(temp_in[c][i], temp_in[c][last - i]);
+ }
+ }
+ for (size_t c = 0; c < out.num_channels_; ++c) {
+ line_buffers[c] = temp_in[c];
+ }
+ *xstart = width_ - *xstart - len;
+ }
+
+ template <typename T>
+ void StoreUnsignedRow(const Output& out, const float* input[4], size_t len,
+ T* output, size_t xstart, size_t ypos) const {
+ const HWY_FULL(float) d;
+ auto mul = Set(d, (1u << (out.bits_per_sample_)) - 1);
+ const Rebind<T, decltype(d)> du;
+ const size_t padding = RoundUpTo(len, Lanes(d)) - len;
+ for (size_t c = 0; c < out.num_channels_; ++c) {
+ msan::UnpoisonMemory(input[c] + len, sizeof(input[c][0]) * padding);
+ }
+ if (out.num_channels_ == 1) {
+ for (size_t i = 0; i < len; i += Lanes(d)) {
+ StoreU(MakeUnsigned<T>(LoadU(d, &input[0][i]), xstart + i, ypos, mul),
+ du, &output[i]);
+ }
+ } else if (out.num_channels_ == 2) {
+ for (size_t i = 0; i < len; i += Lanes(d)) {
+ StoreInterleaved2(
+ MakeUnsigned<T>(LoadU(d, &input[0][i]), xstart + i, ypos, mul),
+ MakeUnsigned<T>(LoadU(d, &input[1][i]), xstart + i, ypos, mul), du,
+ &output[2 * i]);
+ }
+ } else if (out.num_channels_ == 3) {
+ for (size_t i = 0; i < len; i += Lanes(d)) {
+ StoreInterleaved3(
+ MakeUnsigned<T>(LoadU(d, &input[0][i]), xstart + i, ypos, mul),
+ MakeUnsigned<T>(LoadU(d, &input[1][i]), xstart + i, ypos, mul),
+ MakeUnsigned<T>(LoadU(d, &input[2][i]), xstart + i, ypos, mul), du,
+ &output[3 * i]);
+ }
+ } else if (out.num_channels_ == 4) {
+ for (size_t i = 0; i < len; i += Lanes(d)) {
+ StoreInterleaved4(
+ MakeUnsigned<T>(LoadU(d, &input[0][i]), xstart + i, ypos, mul),
+ MakeUnsigned<T>(LoadU(d, &input[1][i]), xstart + i, ypos, mul),
+ MakeUnsigned<T>(LoadU(d, &input[2][i]), xstart + i, ypos, mul),
+ MakeUnsigned<T>(LoadU(d, &input[3][i]), xstart + i, ypos, mul), du,
+ &output[4 * i]);
+ }
+ }
+ msan::PoisonMemory(output + out.num_channels_ * len,
+ sizeof(output[0]) * out.num_channels_ * padding);
+ }
+
+ void StoreFloat16Row(const Output& out, const float* input[4], size_t len,
+ uint16_t* output) const {
+ const HWY_FULL(float) d;
+ const Rebind<uint16_t, decltype(d)> du;
+ const Rebind<hwy::float16_t, decltype(d)> df16;
+ const size_t padding = RoundUpTo(len, Lanes(d)) - len;
+ for (size_t c = 0; c < out.num_channels_; ++c) {
+ msan::UnpoisonMemory(input[c] + len, sizeof(input[c][0]) * padding);
+ }
+ if (out.num_channels_ == 1) {
+ for (size_t i = 0; i < len; i += Lanes(d)) {
+ auto v0 = LoadU(d, &input[0][i]);
+ StoreU(BitCast(du, DemoteTo(df16, v0)), du, &output[i]);
+ }
+ } else if (out.num_channels_ == 2) {
+ for (size_t i = 0; i < len; i += Lanes(d)) {
+ auto v0 = LoadU(d, &input[0][i]);
+ auto v1 = LoadU(d, &input[1][i]);
+ StoreInterleaved2(BitCast(du, DemoteTo(df16, v0)),
+ BitCast(du, DemoteTo(df16, v1)), du, &output[2 * i]);
+ }
+ } else if (out.num_channels_ == 3) {
+ for (size_t i = 0; i < len; i += Lanes(d)) {
+ auto v0 = LoadU(d, &input[0][i]);
+ auto v1 = LoadU(d, &input[1][i]);
+ auto v2 = LoadU(d, &input[2][i]);
+ StoreInterleaved3(BitCast(du, DemoteTo(df16, v0)),
+ BitCast(du, DemoteTo(df16, v1)),
+ BitCast(du, DemoteTo(df16, v2)), du, &output[3 * i]);
+ }
+ } else if (out.num_channels_ == 4) {
+ for (size_t i = 0; i < len; i += Lanes(d)) {
+ auto v0 = LoadU(d, &input[0][i]);
+ auto v1 = LoadU(d, &input[1][i]);
+ auto v2 = LoadU(d, &input[2][i]);
+ auto v3 = LoadU(d, &input[3][i]);
+ StoreInterleaved4(BitCast(du, DemoteTo(df16, v0)),
+ BitCast(du, DemoteTo(df16, v1)),
+ BitCast(du, DemoteTo(df16, v2)),
+ BitCast(du, DemoteTo(df16, v3)), du, &output[4 * i]);
+ }
+ }
+ msan::PoisonMemory(output + out.num_channels_ * len,
+ sizeof(output[0]) * out.num_channels_ * padding);
+ }
+
+ void StoreFloatRow(const Output& out, const float* input[4], size_t len,
+ float* output) const {
+ const HWY_FULL(float) d;
+ if (out.num_channels_ == 1) {
+ memcpy(output, input[0], len * sizeof(output[0]));
+ } else if (out.num_channels_ == 2) {
+ for (size_t i = 0; i < len; i += Lanes(d)) {
+ StoreInterleaved2(LoadU(d, &input[0][i]), LoadU(d, &input[1][i]), d,
+ &output[2 * i]);
+ }
+ } else if (out.num_channels_ == 3) {
+ for (size_t i = 0; i < len; i += Lanes(d)) {
+ StoreInterleaved3(LoadU(d, &input[0][i]), LoadU(d, &input[1][i]),
+ LoadU(d, &input[2][i]), d, &output[3 * i]);
+ }
+ } else {
+ for (size_t i = 0; i < len; i += Lanes(d)) {
+ StoreInterleaved4(LoadU(d, &input[0][i]), LoadU(d, &input[1][i]),
+ LoadU(d, &input[2][i]), LoadU(d, &input[3][i]), d,
+ &output[4 * i]);
+ }
+ }
+ }
+
+ template <typename T>
+ void WriteToOutput(const Output& out, size_t thread_id, size_t ypos,
+ size_t xstart, size_t len, T* output) const {
+ if (transpose_) {
+ // TODO(szabadka) Buffer 8x8 chunks and transpose with SIMD.
+ if (out.run_opaque_) {
+ for (size_t i = 0, j = 0; i < len; ++i, j += out.num_channels_) {
+ out.pixel_callback_.run(out.run_opaque_, thread_id, ypos, xstart + i,
+ 1, output + j);
+ }
+ } else {
+ const size_t pixel_stride = out.num_channels_ * sizeof(T);
+ const size_t offset = xstart * out.stride_ + ypos * pixel_stride;
+ for (size_t i = 0, j = 0; i < len; ++i, j += out.num_channels_) {
+ const size_t ix = offset + i * out.stride_;
+ JXL_DASSERT(ix + pixel_stride <= out.buffer_size_);
+ memcpy(reinterpret_cast<uint8_t*>(out.buffer_) + ix, output + j,
+ pixel_stride);
+ }
+ }
+ } else {
+ if (out.run_opaque_) {
+ out.pixel_callback_.run(out.run_opaque_, thread_id, xstart, ypos, len,
+ output);
+ } else {
+ const size_t pixel_stride = out.num_channels_ * sizeof(T);
+ const size_t offset = ypos * out.stride_ + xstart * pixel_stride;
+ JXL_DASSERT(offset + len * pixel_stride <= out.buffer_size_);
+ memcpy(reinterpret_cast<uint8_t*>(out.buffer_) + offset, output,
+ len * pixel_stride);
+ }
+ }
+ }
+
+ static constexpr size_t kMaxPixelsPerCall = 1024;
+ size_t width_;
+ size_t height_;
+ Output main_; // color + alpha
+ size_t num_color_;
+ bool want_alpha_;
+ bool has_alpha_;
+ bool unpremul_alpha_;
+ size_t alpha_c_;
+ bool flip_x_;
+ bool flip_y_;
+ bool transpose_;
+ std::vector<Output> extra_channels_;
+ std::vector<float> opaque_alpha_;
+ std::vector<CacheAlignedUniquePtr> temp_in_;
+ std::vector<CacheAlignedUniquePtr> temp_out_;
+};
+
+constexpr size_t WriteToOutputStage::kMaxPixelsPerCall;
+
+std::unique_ptr<RenderPipelineStage> GetWriteToOutputStage(
+ const ImageOutput& main_output, size_t width, size_t height, bool has_alpha,
+ bool unpremul_alpha, size_t alpha_c, Orientation undo_orientation,
+ std::vector<ImageOutput>& extra_output) {
+ return jxl::make_unique<WriteToOutputStage>(
+ main_output, width, height, has_alpha, unpremul_alpha, alpha_c,
+ undo_orientation, extra_output);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace jxl {
+
+HWY_EXPORT(GetWriteToOutputStage);
+
+namespace {
+class WriteToImageBundleStage : public RenderPipelineStage {
+ public:
+ explicit WriteToImageBundleStage(ImageBundle* image_bundle,
+ ColorEncoding color_encoding)
+ : RenderPipelineStage(RenderPipelineStage::Settings()),
+ image_bundle_(image_bundle),
+ color_encoding_(std::move(color_encoding)) {}
+
+ void SetInputSizes(
+ const std::vector<std::pair<size_t, size_t>>& input_sizes) override {
+#if JXL_ENABLE_ASSERT
+ JXL_ASSERT(input_sizes.size() >= 3);
+ for (size_t c = 1; c < input_sizes.size(); c++) {
+ JXL_ASSERT(input_sizes[c].first == input_sizes[0].first);
+ JXL_ASSERT(input_sizes[c].second == input_sizes[0].second);
+ }
+#endif
+ // TODO(eustas): what should we do in the case of "want only ECs"?
+ image_bundle_->SetFromImage(
+ Image3F(input_sizes[0].first, input_sizes[0].second), color_encoding_);
+ // TODO(veluca): consider not reallocating ECs if not needed.
+ image_bundle_->extra_channels().clear();
+ for (size_t c = 3; c < input_sizes.size(); c++) {
+ image_bundle_->extra_channels().emplace_back(input_sizes[c].first,
+ input_sizes[c].second);
+ }
+ }
+
+ void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+ size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+ size_t thread_id) const final {
+ for (size_t c = 0; c < 3; c++) {
+ memcpy(image_bundle_->color()->PlaneRow(c, ypos) + xpos - xextra,
+ GetInputRow(input_rows, c, 0) - xextra,
+ sizeof(float) * (xsize + 2 * xextra));
+ }
+ for (size_t ec = 0; ec < image_bundle_->extra_channels().size(); ec++) {
+ JXL_ASSERT(image_bundle_->extra_channels()[ec].xsize() >=
+ xpos + xsize + xextra);
+ memcpy(image_bundle_->extra_channels()[ec].Row(ypos) + xpos - xextra,
+ GetInputRow(input_rows, 3 + ec, 0) - xextra,
+ sizeof(float) * (xsize + 2 * xextra));
+ }
+ }
+
+ RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+ return RenderPipelineChannelMode::kInput;
+ }
+
+ const char* GetName() const override { return "WriteIB"; }
+
+ private:
+ ImageBundle* image_bundle_;
+ ColorEncoding color_encoding_;
+};
+
+class WriteToImage3FStage : public RenderPipelineStage {
+ public:
+ explicit WriteToImage3FStage(Image3F* image)
+ : RenderPipelineStage(RenderPipelineStage::Settings()), image_(image) {}
+
+ void SetInputSizes(
+ const std::vector<std::pair<size_t, size_t>>& input_sizes) override {
+#if JXL_ENABLE_ASSERT
+ JXL_ASSERT(input_sizes.size() >= 3);
+ for (size_t c = 1; c < 3; ++c) {
+ JXL_ASSERT(input_sizes[c].first == input_sizes[0].first);
+ JXL_ASSERT(input_sizes[c].second == input_sizes[0].second);
+ }
+#endif
+ *image_ = Image3F(input_sizes[0].first, input_sizes[0].second);
+ }
+
+ void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+ size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+ size_t thread_id) const final {
+ for (size_t c = 0; c < 3; c++) {
+ memcpy(image_->PlaneRow(c, ypos) + xpos - xextra,
+ GetInputRow(input_rows, c, 0) - xextra,
+ sizeof(float) * (xsize + 2 * xextra));
+ }
+ }
+
+ RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+ return c < 3 ? RenderPipelineChannelMode::kInput
+ : RenderPipelineChannelMode::kIgnored;
+ }
+
+ const char* GetName() const override { return "WriteI3F"; }
+
+ private:
+ Image3F* image_;
+};
+
+} // namespace
+
+std::unique_ptr<RenderPipelineStage> GetWriteToImageBundleStage(
+ ImageBundle* image_bundle, ColorEncoding color_encoding) {
+ return jxl::make_unique<WriteToImageBundleStage>(image_bundle,
+ std::move(color_encoding));
+}
+
+std::unique_ptr<RenderPipelineStage> GetWriteToImage3FStage(Image3F* image) {
+ return jxl::make_unique<WriteToImage3FStage>(image);
+}
+
+std::unique_ptr<RenderPipelineStage> GetWriteToOutputStage(
+ const ImageOutput& main_output, size_t width, size_t height, bool has_alpha,
+ bool unpremul_alpha, size_t alpha_c, Orientation undo_orientation,
+ std::vector<ImageOutput>& extra_output) {
+ return HWY_DYNAMIC_DISPATCH(GetWriteToOutputStage)(
+ main_output, width, height, has_alpha, unpremul_alpha, alpha_c,
+ undo_orientation, extra_output);
+}
+
+} // namespace jxl
+
+#endif
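
For a single 8-bit sample, MakeUnsigned above boils down to the following scalar
computation (a sketch, not part of the library; `dither` stands in for kDither with
its duplicated-width layout, and rounding is shown with std::lround, which differs
from the vector NearestInt only in tie-breaking):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>

// Scalar equivalent of MakeUnsigned<uint8_t> for one sample at (x0, y0):
// scale to [0, 255], add the ordered-dither offset, clamp, round.
std::uint8_t QuantizeDithered8(float v, std::size_t x0, std::size_t y0,
                               const float dither[(2 * 8) * 8]) {
  const float mul = 255.0f;  // (1 << bits_per_sample) - 1 for 8-bit output
  v *= mul;
  v += dither[(y0 % 8) * (2 * 8) + (x0 % 8)];
  v = std::min(std::max(v, 0.0f), mul);
  return static_cast<std::uint8_t>(std::lround(v));
}
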
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_write.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_write.h
new file mode 100644
index 0000000000..c5f844ebe8
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_write.h
@@ -0,0 +1,31 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_WRITE_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_WRITE_H_
+
+#include <functional>
+
+#include "lib/jxl/dec_cache.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+std::unique_ptr<RenderPipelineStage> GetWriteToImageBundleStage(
+ ImageBundle* image_bundle, ColorEncoding color_encoding);
+
+// Gets a stage to write color channels to an Image3F.
+std::unique_ptr<RenderPipelineStage> GetWriteToImage3FStage(Image3F* image);
+
+// Gets a stage to write to a pixel callback or image buffer.
+std::unique_ptr<RenderPipelineStage> GetWriteToOutputStage(
+ const ImageOutput& main_output, size_t width, size_t height, bool has_alpha,
+ bool unpremul_alpha, size_t alpha_c, Orientation undo_orientation,
+ std::vector<ImageOutput>& extra_output);
+
+} // namespace jxl
+
+#endif // LIB_JXL_RENDER_PIPELINE_STAGE_WRITE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_xyb.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_xyb.cc
new file mode 100644
index 0000000000..56e86e6095
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_xyb.cc
@@ -0,0 +1,178 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_xyb.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_xyb.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/base/common.h"
+#include "lib/jxl/cms/opsin_params.h"
+#include "lib/jxl/common.h" // JXL_HIGH_PRECISION
+#include "lib/jxl/dec_xyb-inl.h"
+#include "lib/jxl/sanitizers.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+class XYBStage : public RenderPipelineStage {
+ public:
+ explicit XYBStage(const OutputEncodingInfo& output_encoding_info)
+ : RenderPipelineStage(RenderPipelineStage::Settings()),
+ opsin_params_(output_encoding_info.opsin_params),
+ output_is_xyb_(output_encoding_info.color_encoding.GetColorSpace() ==
+ ColorSpace::kXYB) {}
+
+ void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+ size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+ size_t thread_id) const final {
+ const HWY_FULL(float) d;
+ JXL_ASSERT(xextra == 0);
+ const size_t xsize_v = RoundUpTo(xsize, Lanes(d));
+ float* JXL_RESTRICT row0 = GetInputRow(input_rows, 0, 0);
+ float* JXL_RESTRICT row1 = GetInputRow(input_rows, 1, 0);
+ float* JXL_RESTRICT row2 = GetInputRow(input_rows, 2, 0);
+    // All calculations are lane-wise, but some might exhibit value-dependent
+    // behaviour (e.g. NearestInt). Temporarily unpoison the last vector tail.
+ msan::UnpoisonMemory(row0 + xsize, sizeof(float) * (xsize_v - xsize));
+ msan::UnpoisonMemory(row1 + xsize, sizeof(float) * (xsize_v - xsize));
+ msan::UnpoisonMemory(row2 + xsize, sizeof(float) * (xsize_v - xsize));
+ // TODO(eustas): when using frame origin, addresses might be unaligned;
+    // making them aligned would avoid the performance penalty.
+ if (output_is_xyb_) {
+ const auto scale_x = Set(d, jxl::cms::kScaledXYBScale[0]);
+ const auto scale_y = Set(d, jxl::cms::kScaledXYBScale[1]);
+ const auto scale_bmy = Set(d, jxl::cms::kScaledXYBScale[2]);
+ const auto offset_x = Set(d, jxl::cms::kScaledXYBOffset[0]);
+ const auto offset_y = Set(d, jxl::cms::kScaledXYBOffset[1]);
+ const auto offset_bmy = Set(d, jxl::cms::kScaledXYBOffset[2]);
+ for (ssize_t x = -xextra; x < (ssize_t)(xsize + xextra); x += Lanes(d)) {
+ const auto in_x = LoadU(d, row0 + x);
+ const auto in_y = LoadU(d, row1 + x);
+ const auto in_b = LoadU(d, row2 + x);
+ auto out_x = Mul(Add(in_x, offset_x), scale_x);
+ auto out_y = Mul(Add(in_y, offset_y), scale_y);
+ auto out_b = Mul(Add(Sub(in_b, in_y), offset_bmy), scale_bmy);
+ StoreU(out_x, d, row0 + x);
+ StoreU(out_y, d, row1 + x);
+ StoreU(out_b, d, row2 + x);
+ }
+ } else {
+ for (ssize_t x = -xextra; x < (ssize_t)(xsize + xextra); x += Lanes(d)) {
+ const auto in_opsin_x = LoadU(d, row0 + x);
+ const auto in_opsin_y = LoadU(d, row1 + x);
+ const auto in_opsin_b = LoadU(d, row2 + x);
+ auto r = Undefined(d);
+ auto g = Undefined(d);
+ auto b = Undefined(d);
+ XybToRgb(d, in_opsin_x, in_opsin_y, in_opsin_b, opsin_params_, &r, &g,
+ &b);
+ StoreU(r, d, row0 + x);
+ StoreU(g, d, row1 + x);
+ StoreU(b, d, row2 + x);
+ }
+ }
+ msan::PoisonMemory(row0 + xsize, sizeof(float) * (xsize_v - xsize));
+ msan::PoisonMemory(row1 + xsize, sizeof(float) * (xsize_v - xsize));
+ msan::PoisonMemory(row2 + xsize, sizeof(float) * (xsize_v - xsize));
+ }
+
+ RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+ return c < 3 ? RenderPipelineChannelMode::kInPlace
+ : RenderPipelineChannelMode::kIgnored;
+ }
+
+ const char* GetName() const override { return "XYB"; }
+
+ private:
+ const OpsinParams opsin_params_;
+ const bool output_is_xyb_;
+};
+
+std::unique_ptr<RenderPipelineStage> GetXYBStage(
+ const OutputEncodingInfo& output_encoding_info) {
+ return jxl::make_unique<XYBStage>(output_encoding_info);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(GetXYBStage);
+
+std::unique_ptr<RenderPipelineStage> GetXYBStage(
+ const OutputEncodingInfo& output_encoding_info) {
+ return HWY_DYNAMIC_DISPATCH(GetXYBStage)(output_encoding_info);
+}
+
+#if !JXL_HIGH_PRECISION
+namespace {
+class FastXYBStage : public RenderPipelineStage {
+ public:
+ FastXYBStage(uint8_t* rgb, size_t stride, size_t width, size_t height,
+ bool rgba, bool has_alpha, size_t alpha_c)
+ : RenderPipelineStage(RenderPipelineStage::Settings()),
+ rgb_(rgb),
+ stride_(stride),
+ width_(width),
+ height_(height),
+ rgba_(rgba),
+ has_alpha_(has_alpha),
+ alpha_c_(alpha_c) {}
+
+ void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+ size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+ size_t thread_id) const final {
+ if (ypos >= height_) return;
+ JXL_ASSERT(xextra == 0);
+ const float* xyba[4] = {
+ GetInputRow(input_rows, 0, 0), GetInputRow(input_rows, 1, 0),
+ GetInputRow(input_rows, 2, 0),
+ has_alpha_ ? GetInputRow(input_rows, alpha_c_, 0) : nullptr};
+ uint8_t* out_buf = rgb_ + stride_ * ypos + (rgba_ ? 4 : 3) * xpos;
+ FastXYBTosRGB8(xyba, out_buf, rgba_,
+ xsize + xpos <= width_ ? xsize : width_ - xpos);
+ }
+
+ RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+ return c < 3 || (has_alpha_ && c == alpha_c_)
+ ? RenderPipelineChannelMode::kInput
+ : RenderPipelineChannelMode::kIgnored;
+ }
+
+ const char* GetName() const override { return "FastXYB"; }
+
+ private:
+ uint8_t* rgb_;
+ size_t stride_;
+ size_t width_;
+ size_t height_;
+ bool rgba_;
+ bool has_alpha_;
+ size_t alpha_c_;
+ std::vector<float> opaque_alpha_;
+};
+
+} // namespace
+
+std::unique_ptr<RenderPipelineStage> GetFastXYBTosRGB8Stage(
+ uint8_t* rgb, size_t stride, size_t width, size_t height, bool rgba,
+ bool has_alpha, size_t alpha_c) {
+ JXL_ASSERT(HasFastXYBTosRGB8());
+ return make_unique<FastXYBStage>(rgb, stride, width, height, rgba, has_alpha,
+ alpha_c);
+}
+#endif
+
+} // namespace jxl
+#endif
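
When the output colorspace is itself XYB, ProcessRow above only re-encodes the
channels into the scaled XYB representation. A scalar sketch of that path; `offset`
and `scale` stand in for jxl::cms::kScaledXYBOffset and jxl::cms::kScaledXYBScale:

// Scalar form of the output_is_xyb_ branch of XYBStage::ProcessRow. Note that
// the third output channel stores B minus Y before offsetting and scaling.
void ScaleXYB(float x, float y, float b, const float offset[3],
              const float scale[3], float out[3]) {
  out[0] = (x + offset[0]) * scale[0];
  out[1] = (y + offset[1]) * scale[1];
  out[2] = ((b - y) + offset[2]) * scale[2];
}
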
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_xyb.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_xyb.h
new file mode 100644
index 0000000000..7b06345c36
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_xyb.h
@@ -0,0 +1,26 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_XYB_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_XYB_H_
+#include <stdint.h>
+
+#include "lib/jxl/dec_xyb.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+// Converts the color channels from XYB to linear with appropriate primaries.
+std::unique_ptr<RenderPipelineStage> GetXYBStage(
+ const OutputEncodingInfo& output_encoding_info);
+
+// Gets a stage to convert with fixed point arithmetic from XYB to sRGB8 and
+// write to a uint8 buffer.
+std::unique_ptr<RenderPipelineStage> GetFastXYBTosRGB8Stage(
+ uint8_t* rgb, size_t stride, size_t width, size_t height, bool rgba,
+ bool has_alpha, size_t alpha_c);
+} // namespace jxl
+
+#endif // LIB_JXL_RENDER_PIPELINE_STAGE_XYB_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_ycbcr.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_ycbcr.cc
new file mode 100644
index 0000000000..30ad327221
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_ycbcr.cc
@@ -0,0 +1,83 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_ycbcr.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_ycbcr.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::MulAdd;
+
+class kYCbCrStage : public RenderPipelineStage {
+ public:
+ kYCbCrStage() : RenderPipelineStage(RenderPipelineStage::Settings()) {}
+
+ void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+ size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+ size_t thread_id) const final {
+ const HWY_FULL(float) df;
+
+ // Full-range BT.601 as defined by JFIF Clause 7:
+ // https://www.itu.int/rec/T-REC-T.871-201105-I/en
+ const auto c128 = Set(df, 128.0f / 255);
+ const auto crcr = Set(df, 1.402f);
+ const auto cgcb = Set(df, -0.114f * 1.772f / 0.587f);
+ const auto cgcr = Set(df, -0.299f * 1.402f / 0.587f);
+ const auto cbcb = Set(df, 1.772f);
+
+ float* JXL_RESTRICT row0 = GetInputRow(input_rows, 0, 0);
+ float* JXL_RESTRICT row1 = GetInputRow(input_rows, 1, 0);
+ float* JXL_RESTRICT row2 = GetInputRow(input_rows, 2, 0);
+ // TODO(eustas): when using frame origin, addresses might be unaligned;
+    // making them aligned would avoid the performance penalty.
+ for (size_t x = 0; x < xsize; x += Lanes(df)) {
+ const auto y_vec = Add(LoadU(df, row1 + x), c128);
+ const auto cb_vec = LoadU(df, row0 + x);
+ const auto cr_vec = LoadU(df, row2 + x);
+ const auto r_vec = MulAdd(crcr, cr_vec, y_vec);
+ const auto g_vec = MulAdd(cgcr, cr_vec, MulAdd(cgcb, cb_vec, y_vec));
+ const auto b_vec = MulAdd(cbcb, cb_vec, y_vec);
+ StoreU(r_vec, df, row0 + x);
+ StoreU(g_vec, df, row1 + x);
+ StoreU(b_vec, df, row2 + x);
+ }
+ }
+
+ RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+ return c < 3 ? RenderPipelineChannelMode::kInPlace
+ : RenderPipelineChannelMode::kIgnored;
+ }
+
+ const char* GetName() const override { return "YCbCr"; }
+};
+
+std::unique_ptr<RenderPipelineStage> GetYCbCrStage() {
+ return jxl::make_unique<kYCbCrStage>();
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(GetYCbCrStage);
+
+std::unique_ptr<RenderPipelineStage> GetYCbCrStage() {
+ return HWY_DYNAMIC_DISPATCH(GetYCbCrStage)();
+}
+
+} // namespace jxl
+#endif
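
Per sample, the vector math above is the standard full-range BT.601 inverse
transform; a scalar sketch (not part of the library) reads:

// Scalar equivalent of kYCbCrStage::ProcessRow. In the pipeline the channel
// order across rows 0..2 is (Cb, Y, Cr), and Y is stored centered around zero,
// hence the +128/255 offset; Cb and Cr are already centered.
void YCbCrToRgb(float y, float cb, float cr, float* r, float* g, float* b) {
  const float yp = y + 128.0f / 255.0f;
  *r = yp + 1.402f * cr;
  *g = yp + (-0.114f * 1.772f / 0.587f) * cb + (-0.299f * 1.402f / 0.587f) * cr;
  *b = yp + 1.772f * cb;
}
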
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_ycbcr.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_ycbcr.h
new file mode 100644
index 0000000000..3e99af7a38
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_ycbcr.h
@@ -0,0 +1,24 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_YCBCR_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_YCBCR_H_
+#include <math.h>
+#include <stdint.h>
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/dec_xyb.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+// Converts the color channels from YCbCr to RGB.
+std::unique_ptr<RenderPipelineStage> GetYCbCrStage();
+} // namespace jxl
+
+#endif // LIB_JXL_RENDER_PIPELINE_STAGE_YCBCR_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/test_render_pipeline_stages.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/test_render_pipeline_stages.h
new file mode 100644
index 0000000000..789a52f8b2
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/test_render_pipeline_stages.h
@@ -0,0 +1,101 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+class UpsampleXSlowStage : public RenderPipelineStage {
+ public:
+ UpsampleXSlowStage()
+ : RenderPipelineStage(RenderPipelineStage::Settings::ShiftX(1, 1)) {}
+
+ void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+ size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+ size_t thread_id) const final {
+ for (size_t c = 0; c < input_rows.size(); c++) {
+ const float* row = GetInputRow(input_rows, c, 0);
+ float* row_out = GetOutputRow(output_rows, c, 0);
+ for (int64_t x = -xextra; x < (int64_t)(xsize + xextra); x++) {
+ float xp = *(row + x - 1);
+ float xc = *(row + x);
+ float xn = *(row + x + 1);
+ float xout0 = xp * 0.25f + xc * 0.75f;
+ float xout1 = xc * 0.75f + xn * 0.25f;
+ *(row_out + 2 * x + 0) = xout0;
+ *(row_out + 2 * x + 1) = xout1;
+ }
+ }
+ }
+
+ const char* GetName() const override { return "TEST::UpsampleXSlowStage"; }
+
+ RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+ return RenderPipelineChannelMode::kInOut;
+ }
+};
+
+class UpsampleYSlowStage : public RenderPipelineStage {
+ public:
+ UpsampleYSlowStage()
+ : RenderPipelineStage(RenderPipelineStage::Settings::ShiftY(1, 1)) {}
+
+ void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+ size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+ size_t thread_id) const final {
+ for (size_t c = 0; c < input_rows.size(); c++) {
+ const float* rowp = GetInputRow(input_rows, c, -1);
+ const float* rowc = GetInputRow(input_rows, c, 0);
+ const float* rown = GetInputRow(input_rows, c, 1);
+ float* row_out0 = GetOutputRow(output_rows, c, 0);
+ float* row_out1 = GetOutputRow(output_rows, c, 1);
+ for (int64_t x = -xextra; x < (int64_t)(xsize + xextra); x++) {
+ float xp = *(rowp + x);
+ float xc = *(rowc + x);
+ float xn = *(rown + x);
+ float yout0 = xp * 0.25f + xc * 0.75f;
+ float yout1 = xc * 0.75f + xn * 0.25f;
+ *(row_out0 + x) = yout0;
+ *(row_out1 + x) = yout1;
+ }
+ }
+ }
+
+ RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+ return RenderPipelineChannelMode::kInOut;
+ }
+
+ const char* GetName() const override { return "TEST::UpsampleYSlowStage"; }
+};
+
+class Check0FinalStage : public RenderPipelineStage {
+ public:
+ Check0FinalStage() : RenderPipelineStage(RenderPipelineStage::Settings()) {}
+
+ void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+ size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+ size_t thread_id) const final {
+ for (size_t c = 0; c < input_rows.size(); c++) {
+ for (size_t x = 0; x < xsize; x++) {
+ JXL_CHECK(fabsf(GetInputRow(input_rows, c, 0)[x]) < 1e-8);
+ }
+ }
+ }
+
+ RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+ return RenderPipelineChannelMode::kInput;
+ }
+ const char* GetName() const override { return "TEST::Check0FinalStage"; }
+};
+
+} // namespace jxl
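
Both test upsamplers above use the same 0.75/0.25 linear-interpolation weights, in x
and in y respectively; for one input sample and its neighbors the doubling step is
(scalar sketch):

#include <utility>

// Scalar form of the 2x upsampling used by the TEST:: stages: each output pair
// mixes the center sample with the previous or next neighbor at 3:1.
std::pair<float, float> Upsample2x(float prev, float cur, float next) {
  return {0.25f * prev + 0.75f * cur, 0.75f * cur + 0.25f * next};
}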