From 36d22d82aa202bb199967e9512281e9a53db42c9 Mon Sep 17 00:00:00 2001
From: Daniel Baumann <daniel.baumann@progress-linux.org>
Date: Sun, 7 Apr 2024 21:33:14 +0200
Subject: Adding upstream version 115.7.0esr.

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
---
 .../render_pipeline/low_memory_render_pipeline.cc  | 865 +++++++++++++++++++++
 .../render_pipeline/low_memory_render_pipeline.h   | 111 +++
 .../lib/jxl/render_pipeline/render_pipeline.cc     | 132 ++++
 .../lib/jxl/render_pipeline/render_pipeline.h      | 139 ++++
 .../jxl/render_pipeline/render_pipeline_stage.h    | 171 ++++
 .../jxl/render_pipeline/render_pipeline_test.cc    | 562 +++++++++++++
 .../jxl/render_pipeline/simple_render_pipeline.cc  | 266 +++++++
 .../jxl/render_pipeline/simple_render_pipeline.h   |  37 +
 .../lib/jxl/render_pipeline/stage_blending.cc      | 247 ++++++
 .../lib/jxl/render_pipeline/stage_blending.h       |  24 +
 .../jxl/render_pipeline/stage_chroma_upsampling.cc | 129 +++
 .../jxl/render_pipeline/stage_chroma_upsampling.h  |  27 +
 .../jpeg-xl/lib/jxl/render_pipeline/stage_epf.cc   | 524 +++++++++++++
 .../jpeg-xl/lib/jxl/render_pipeline/stage_epf.h    |  31 +
 .../lib/jxl/render_pipeline/stage_from_linear.cc   | 191 +++++
 .../lib/jxl/render_pipeline/stage_from_linear.h    |  20 +
 .../lib/jxl/render_pipeline/stage_gaborish.cc      | 122 +++
 .../lib/jxl/render_pipeline/stage_gaborish.h       |  25 +
 .../jpeg-xl/lib/jxl/render_pipeline/stage_noise.cc | 311 ++++++++
 .../jpeg-xl/lib/jxl/render_pipeline/stage_noise.h  |  32 +
 .../lib/jxl/render_pipeline/stage_patches.cc       |  48 ++
 .../lib/jxl/render_pipeline/stage_patches.h        |  22 +
 .../lib/jxl/render_pipeline/stage_splines.cc       |  63 ++
 .../lib/jxl/render_pipeline/stage_splines.h        |  21 +
 .../jpeg-xl/lib/jxl/render_pipeline/stage_spot.cc  |  52 ++
 .../jpeg-xl/lib/jxl/render_pipeline/stage_spot.h   |  21 +
 .../lib/jxl/render_pipeline/stage_to_linear.cc     | 202 +++++
 .../lib/jxl/render_pipeline/stage_to_linear.h      |  21 +
 .../lib/jxl/render_pipeline/stage_tone_mapping.cc  | 151 ++++
 .../lib/jxl/render_pipeline/stage_tone_mapping.h   |  37 +
 .../lib/jxl/render_pipeline/stage_upsampling.cc    | 187 +++++
 .../lib/jxl/render_pipeline/stage_upsampling.h     |  26 +
 .../jpeg-xl/lib/jxl/render_pipeline/stage_write.cc | 601 ++++++++++++++
 .../jpeg-xl/lib/jxl/render_pipeline/stage_write.h  |  31 +
 .../jpeg-xl/lib/jxl/render_pipeline/stage_xyb.cc   | 176 +++++
 .../jpeg-xl/lib/jxl/render_pipeline/stage_xyb.h    |  26 +
 .../jpeg-xl/lib/jxl/render_pipeline/stage_ycbcr.cc |  85 ++
 .../jpeg-xl/lib/jxl/render_pipeline/stage_ycbcr.h  |  25 +
 .../render_pipeline/test_render_pipeline_stages.h  | 101 +++
 39 files changed, 5862 insertions(+)
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/low_memory_render_pipeline.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/low_memory_render_pipeline.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline_stage.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline_test.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/simple_render_pipeline.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/simple_render_pipeline.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_blending.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_blending.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_chroma_upsampling.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_chroma_upsampling.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_epf.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_epf.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_from_linear.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_from_linear.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_gaborish.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_gaborish.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_noise.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_noise.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_patches.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_patches.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_splines.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_splines.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_spot.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_spot.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_to_linear.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_to_linear.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_tone_mapping.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_tone_mapping.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_upsampling.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_upsampling.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_write.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_write.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_xyb.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_xyb.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_ycbcr.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_ycbcr.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/test_render_pipeline_stages.h

(limited to 'third_party/jpeg-xl/lib/jxl/render_pipeline')

diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/low_memory_render_pipeline.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/low_memory_render_pipeline.cc
new file mode 100644
index 0000000000..db60a458db
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/low_memory_render_pipeline.cc
@@ -0,0 +1,865 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/low_memory_render_pipeline.h"
+
+#include <algorithm>
+#include <queue>
+#include <tuple>
+
+#include "lib/jxl/base/arch_macros.h"
+#include "lib/jxl/image_ops.h"
+
+namespace jxl {
+std::pair<size_t, size_t>  // Returns `in` rescaled for channel c at `stage`.
+LowMemoryRenderPipeline::ColorDimensionsToChannelDimensions(
+    std::pair<size_t, size_t> in, size_t c, size_t stage) const {
+  std::pair<size_t, size_t> ret;
+  std::pair<size_t, size_t> shift = channel_shifts_[stage][c];  // per-component shift of channel c
+  ret.first =  // (in.first << base_color_shift_) divided by 2^shift.first, rounded up
+      ((in.first << base_color_shift_) + (1 << shift.first) - 1) >> shift.first;
+  ret.second = ((in.second << base_color_shift_) + (1 << shift.second) - 1) >>
+               shift.second;  // same rounded-up division for the second component
+  return ret;
+}
+
+std::pair<size_t, size_t> LowMemoryRenderPipeline::BorderToStore(
+    size_t c) const {  // Per-channel border size: rescaled group_border_ plus stage-0 padding.
+  auto ret = ColorDimensionsToChannelDimensions(group_border_, c, 0);  // group_border_ in stage-0 units of channel c
+  ret.first += padding_[0][c].first;    // add stage-0 padding of channel c (first component)
+  ret.second += padding_[0][c].second;  // add stage-0 padding of channel c (second component)
+  return ret;
+}
+
+void LowMemoryRenderPipeline::SaveBorders(size_t group_id, size_t c,
+                                          const ImageF& in) {  // Copies edge strips of group `in` into the border images.
+  size_t gy = group_id / frame_dimensions_.xsize_groups;  // group row index
+  size_t gx = group_id % frame_dimensions_.xsize_groups;  // group column index
+  size_t hshift = channel_shifts_[0][c].first;   // stage-0 shift of channel c (first component)
+  size_t vshift = channel_shifts_[0][c].second;  // stage-0 shift of channel c (second component)
+  size_t x0 = gx * GroupInputXSize(c);  // group start in channel coordinates
+  size_t x1 = std::min((gx + 1) * GroupInputXSize(c),  // group end, clamped to the channel width
+                       DivCeil(frame_dimensions_.xsize_upsampled, 1 << hshift));
+  size_t y0 = gy * GroupInputYSize(c);
+  size_t y1 = std::min((gy + 1) * GroupInputYSize(c),  // group end, clamped to the channel height
+                       DivCeil(frame_dimensions_.ysize_upsampled, 1 << vshift));
+
+  auto borders = BorderToStore(c);
+  size_t borderx_write = borders.first;   // width of each stored vertical strip
+  size_t bordery_write = borders.second;  // height of each stored horizontal strip
+
+  if (gy > 0) {  // not in the first group row: save the top strip of this group
+    Rect from(group_data_x_border_, group_data_y_border_, x1 - x0,
+              bordery_write);
+    Rect to(x0, (gy * 2 - 1) * bordery_write, x1 - x0, bordery_write);  // slot 2*gy-1 of borders_horizontal_
+    CopyImageTo(from, in, to, &borders_horizontal_[c]);
+  }
+  if (gy + 1 < frame_dimensions_.ysize_groups) {  // not in the last group row: save the bottom strip
+    Rect from(group_data_x_border_,
+              group_data_y_border_ + y1 - y0 - bordery_write, x1 - x0,
+              bordery_write);
+    Rect to(x0, (gy * 2) * bordery_write, x1 - x0, bordery_write);  // slot 2*gy of borders_horizontal_
+    CopyImageTo(from, in, to, &borders_horizontal_[c]);
+  }
+  if (gx > 0) {  // not in the first group column: save the left strip
+    Rect from(group_data_x_border_, group_data_y_border_, borderx_write,
+              y1 - y0);
+    Rect to((gx * 2 - 1) * borderx_write, y0, borderx_write, y1 - y0);  // slot 2*gx-1 of borders_vertical_
+    CopyImageTo(from, in, to, &borders_vertical_[c]);
+  }
+  if (gx + 1 < frame_dimensions_.xsize_groups) {  // not in the last group column: save the right strip
+    Rect from(group_data_x_border_ + x1 - x0 - borderx_write,
+              group_data_y_border_, borderx_write, y1 - y0);
+    Rect to((gx * 2) * borderx_write, y0, borderx_write, y1 - y0);  // slot 2*gx of borders_vertical_
+    CopyImageTo(from, in, to, &borders_vertical_[c]);
+  }
+}
+
+void LowMemoryRenderPipeline::LoadBorders(size_t group_id, size_t c,
+                                          const Rect& r, ImageF* out) {  // Copies stored neighbour strips around the group data in `out`.
+  size_t gy = group_id / frame_dimensions_.xsize_groups;  // group row index
+  size_t gx = group_id % frame_dimensions_.xsize_groups;  // group column index
+  size_t hshift = channel_shifts_[0][c].first;
+  size_t vshift = channel_shifts_[0][c].second;
+  // Coordinates of the group in the image.
+  size_t x0 = gx * GroupInputXSize(c);
+  size_t x1 = std::min((gx + 1) * GroupInputXSize(c),
+                       DivCeil(frame_dimensions_.xsize_upsampled, 1 << hshift));
+  size_t y0 = gy * GroupInputYSize(c);
+  size_t y1 = std::min((gy + 1) * GroupInputYSize(c),
+                       DivCeil(frame_dimensions_.ysize_upsampled, 1 << vshift));
+
+  size_t paddingx = padding_[0][c].first;   // stage-0 padding of channel c
+  size_t paddingy = padding_[0][c].second;
+
+  auto borders = BorderToStore(c);
+  size_t borderx_write = borders.first;   // strip sizes; same values SaveBorders uses
+  size_t bordery_write = borders.second;
+
+  // Limits of the area to copy from, in image coordinates.
+  JXL_DASSERT(r.x0() == 0 || (r.x0() << base_color_shift_) >= paddingx);
+  size_t x0src = DivCeil(r.x0() << base_color_shift_, 1 << hshift);
+  if (x0src != 0) {  // a zero start is left alone so the subtraction cannot wrap
+    x0src -= paddingx;
+  }
+  // r may be such that r.x1 (namely x0() + xsize()) is within paddingx of the
+  // right side of the image, so we use min() here.
+  size_t x1src =
+      DivCeil((r.x0() + r.xsize()) << base_color_shift_, 1 << hshift);
+  x1src = std::min(x1src + paddingx,
+                   DivCeil(frame_dimensions_.xsize_upsampled, 1 << hshift));
+
+  // Similar computation for y.
+  JXL_DASSERT(r.y0() == 0 || (r.y0() << base_color_shift_) >= paddingy);
+  size_t y0src = DivCeil(r.y0() << base_color_shift_, 1 << vshift);
+  if (y0src != 0) {
+    y0src -= paddingy;
+  }
+  size_t y1src =
+      DivCeil((r.y0() + r.ysize()) << base_color_shift_, 1 << vshift);
+  y1src = std::min(y1src + paddingy,
+                   DivCeil(frame_dimensions_.ysize_upsampled, 1 << vshift));
+
+  // Copy other groups' borders from the border storage.
+  if (y0src < y0) {  // source area starts above this group
+    JXL_DASSERT(gy > 0);
+    CopyImageTo(
+        Rect(x0src, (gy * 2 - 2) * bordery_write, x1src - x0src, bordery_write),  // slot 2*(gy-1): bottom strip saved by group row gy-1
+        borders_horizontal_[c],
+        Rect(group_data_x_border_ + x0src - x0,
+             group_data_y_border_ - bordery_write, x1src - x0src,
+             bordery_write),
+        out);
+  }
+  if (y1src > y1) {  // source area ends below this group
+    // When copying the bottom border we must not be on the bottom groups.
+    JXL_DASSERT(gy + 1 < frame_dimensions_.ysize_groups);
+    CopyImageTo(
+        Rect(x0src, (gy * 2 + 1) * bordery_write, x1src - x0src, bordery_write),  // slot 2*(gy+1)-1: top strip saved by group row gy+1
+        borders_horizontal_[c],
+        Rect(group_data_x_border_ + x0src - x0, group_data_y_border_ + y1 - y0,
+             x1src - x0src, bordery_write),
+        out);
+  }
+  if (x0src < x0) {  // source area starts left of this group
+    JXL_DASSERT(gx > 0);
+    CopyImageTo(
+        Rect((gx * 2 - 2) * borderx_write, y0src, borderx_write, y1src - y0src),  // slot 2*(gx-1): right strip saved by group column gx-1
+        borders_vertical_[c],
+        Rect(group_data_x_border_ - borderx_write,
+             group_data_y_border_ + y0src - y0, borderx_write, y1src - y0src),
+        out);
+  }
+  if (x1src > x1) {  // source area ends right of this group
+    // When copying the right border we must not be on the rightmost groups.
+    JXL_DASSERT(gx + 1 < frame_dimensions_.xsize_groups);
+    CopyImageTo(
+        Rect((gx * 2 + 1) * borderx_write, y0src, borderx_write, y1src - y0src),  // slot 2*(gx+1)-1: left strip saved by group column gx+1
+        borders_vertical_[c],
+        Rect(group_data_x_border_ + x1 - x0, group_data_y_border_ + y0src - y0,
+             borderx_write, y1src - y0src),
+        out);
+  }
+}
+
+size_t LowMemoryRenderPipeline::GroupInputXSize(size_t c) const {  // Group width for channel c at stage 0.
+  return (frame_dimensions_.group_dim << base_color_shift_) >>  // group_dim scaled up by the base color shift...
+         channel_shifts_[0][c].first;  // ...then down by the channel's stage-0 shift (first component)
+}
+
+size_t LowMemoryRenderPipeline::GroupInputYSize(size_t c) const {  // Group height for channel c at stage 0.
+  return (frame_dimensions_.group_dim << base_color_shift_) >>  // group_dim scaled up by the base color shift...
+         channel_shifts_[0][c].second;  // ...then down by the channel's stage-0 shift (second component)
+}
+
+void LowMemoryRenderPipeline::EnsureBordersStorage() {  // (Re)allocates the per-channel border images when their size changes.
+  const auto& shifts = channel_shifts_[0];  // one entry per channel
+  if (borders_horizontal_.size() < shifts.size()) {  // the vectors only ever grow here
+    borders_horizontal_.resize(shifts.size());
+    borders_vertical_.resize(shifts.size());
+  }
+  for (size_t c = 0; c < shifts.size(); c++) {
+    auto borders = BorderToStore(c);
+    size_t borderx = borders.first;
+    size_t bordery = borders.second;
+    JXL_DASSERT(frame_dimensions_.xsize_groups > 0);  // guards the unsigned "- 1" below
+    size_t num_xborders = (frame_dimensions_.xsize_groups - 1) * 2;  // two strips per boundary between group columns
+    JXL_DASSERT(frame_dimensions_.ysize_groups > 0);  // guards the unsigned "- 1" below
+    size_t num_yborders = (frame_dimensions_.ysize_groups - 1) * 2;  // two strips per boundary between group rows
+    size_t downsampled_xsize =
+        DivCeil(frame_dimensions_.xsize_upsampled_padded, 1 << shifts[c].first);
+    size_t downsampled_ysize = DivCeil(frame_dimensions_.ysize_upsampled_padded,
+                                       1 << shifts[c].second);
+    Rect horizontal = Rect(0, 0, downsampled_xsize, bordery * num_yborders);  // full-width strips stacked vertically
+    if (!SameSize(horizontal, borders_horizontal_[c])) {  // keep the existing image when the size already matches
+      borders_horizontal_[c] = ImageF(horizontal.xsize(), horizontal.ysize());
+    }
+    Rect vertical = Rect(0, 0, borderx * num_xborders, downsampled_ysize);  // full-height strips laid side by side
+    if (!SameSize(vertical, borders_vertical_[c])) {
+      borders_vertical_[c] = ImageF(vertical.xsize(), vertical.ysize());
+    }
+  }
+}
+
+void LowMemoryRenderPipeline::Init() {  // Computes border sizes, border storage and the per-stage lookup tables.
+  group_border_ = {0, 0};
+  base_color_shift_ = CeilLog2Nonzero(frame_dimensions_.xsize_upsampled_padded /  // derived from the ratio of the
+                                      frame_dimensions_.xsize_padded);            // upsampled to non-upsampled padded width
+
+  const auto& shifts = channel_shifts_[0];  // stage-0 shifts; shifts.size() is used as the channel count
+
+  // Ensure that each channel has sufficiently many border pixels.
+  for (size_t c = 0; c < shifts.size(); c++) {
+    group_border_.first =
+        std::max(group_border_.first,
+                 DivCeil(padding_[0][c].first << channel_shifts_[0][c].first,  // stage-0 padding scaled by the channel shift,
+                         1 << base_color_shift_));                             // then divided by 2^base_color_shift_
+    group_border_.second =
+        std::max(group_border_.second,
+                 DivCeil(padding_[0][c].second << channel_shifts_[0][c].second,
+                         1 << base_color_shift_));
+  }
+
+  // Ensure that all channels have an integer number of border pixels in the
+  // input.
+  for (size_t c = 0; c < shifts.size(); c++) {
+    if (channel_shifts_[0][c].first >= base_color_shift_) {  // also keeps the shift amount below non-negative
+      group_border_.first =
+          RoundUpTo(group_border_.first,
+                    1 << (channel_shifts_[0][c].first - base_color_shift_));
+    }
+    if (channel_shifts_[0][c].second >= base_color_shift_) {
+      group_border_.second =
+          RoundUpTo(group_border_.second,
+                    1 << (channel_shifts_[0][c].second - base_color_shift_));
+    }
+  }
+  // Ensure that the X border on color channels is a multiple of kBlockDim or
+  // the vector size (required for EPF stages). Vectors on ARM NEON are never
+  // wider than 4 floats, so rounding to multiples of 4 is enough.
+#if JXL_ARCH_ARM
+  constexpr size_t kGroupXAlign = 4;
+#else
+  constexpr size_t kGroupXAlign = 16;
+#endif
+  group_border_.first = RoundUpTo(group_border_.first, kGroupXAlign);
+  // Allocate borders in group images that are just enough for storing the
+  // borders to be copied in, plus any rounding to ensure alignment.
+  std::pair<size_t, size_t> max_border = {0, 0};
+  for (size_t c = 0; c < shifts.size(); c++) {  // largest BorderToStore() over all channels
+    max_border.first = std::max(BorderToStore(c).first, max_border.first);
+    max_border.second = std::max(BorderToStore(c).second, max_border.second);
+  }
+  group_data_x_border_ = RoundUpTo(max_border.first, kGroupXAlign);
+  group_data_y_border_ = max_border.second;
+
+  EnsureBordersStorage();
+  group_border_assigner_.Init(frame_dimensions_);
+
+  for (first_trailing_stage_ = stages_.size(); first_trailing_stage_ > 0;  // walk back from the end until a stage
+       first_trailing_stage_--) {                                          // with a kInOut channel is found
+    bool has_inout_c = false;
+    for (size_t c = 0; c < shifts.size(); c++) {
+      if (stages_[first_trailing_stage_ - 1]->GetChannelMode(c) ==
+          RenderPipelineChannelMode::kInOut) {
+        has_inout_c = true;
+      }
+    }
+    if (has_inout_c) {  // first_trailing_stage_ is now one past the last stage having a kInOut channel
+      break;
+    }
+  }
+
+  first_image_dim_stage_ = stages_.size();  // stays at stages_.size() when no stage switches to image dimensions
+  for (size_t i = 0; i < stages_.size(); i++) {
+    std::vector<std::pair<size_t, size_t>> input_sizes(shifts.size());
+    for (size_t c = 0; c < shifts.size(); c++) {
+      input_sizes[c] =
+          std::make_pair(DivCeil(frame_dimensions_.xsize_upsampled,  // frame size divided by the shift
+                                 1 << channel_shifts_[i][c].first),  // of channel c at stage i
+                         DivCeil(frame_dimensions_.ysize_upsampled,
+                                 1 << channel_shifts_[i][c].second));
+    }
+    stages_[i]->SetInputSizes(input_sizes);
+    if (stages_[i]->SwitchToImageDimensions()) {
+      // We don't allow kInOut after switching to image dimensions.
+      JXL_ASSERT(i >= first_trailing_stage_);
+      first_image_dim_stage_ = i + 1;  // stages from here on receive full-image sizes below
+      stages_[i]->GetImageDimensions(&full_image_xsize_, &full_image_ysize_,
+                                     &frame_origin_);
+      break;
+    }
+  }
+  for (size_t i = first_image_dim_stage_; i < stages_.size(); i++) {
+    if (stages_[i]->SwitchToImageDimensions()) {
+      JXL_ABORT("Cannot switch to image dimensions multiple times");
+    }
+    std::vector<std::pair<size_t, size_t>> input_sizes(shifts.size());
+    for (size_t c = 0; c < shifts.size(); c++) {
+      input_sizes[c] = {full_image_xsize_, full_image_ysize_};  // same size for every channel
+    }
+    stages_[i]->SetInputSizes(input_sizes);
+  }
+
+  anyc_.resize(stages_.size());
+  for (size_t i = 0; i < stages_.size(); i++) {
+    for (size_t c = 0; c < shifts.size(); c++) {
+      if (stages_[i]->GetChannelMode(c) !=
+          RenderPipelineChannelMode::kIgnored) {
+        anyc_[i] = c;  // overwritten each time, so the highest non-ignored channel index is kept
+      }
+    }
+  }
+
+  stage_input_for_channel_ = std::vector<std::vector<int32_t>>(
+      stages_.size(), std::vector<int32_t>(shifts.size()));
+  for (size_t c = 0; c < shifts.size(); c++) {
+    int input = -1;  // -1 until some earlier stage has kInOut mode for channel c
+    for (size_t i = 0; i < stages_.size(); i++) {
+      stage_input_for_channel_[i][c] = input;  // recorded before stage i itself is considered
+      if (stages_[i]->GetChannelMode(c) == RenderPipelineChannelMode::kInOut) {
+        input = i;
+      }
+    }
+  }
+
+  image_rect_.resize(stages_.size());
+  for (size_t i = 0; i < stages_.size(); i++) {
+    size_t x1 = DivCeil(frame_dimensions_.xsize_upsampled,
+                        1 << channel_shifts_[i][anyc_[i]].first);  // uses the shift of channel anyc_[i]
+    size_t y1 = DivCeil(frame_dimensions_.ysize_upsampled,
+                        1 << channel_shifts_[i][anyc_[i]].second);
+    image_rect_[i] = Rect(0, 0, x1, y1);
+  }
+
+  virtual_ypadding_for_output_.resize(stages_.size());
+  xpadding_for_output_.resize(stages_.size());
+  for (size_t c = 0; c < shifts.size(); c++) {
+    int ypad = 0;
+    int xpad = 0;
+    for (size_t i = stages_.size(); i-- > 0;) {  // last stage first; the padding accumulates towards earlier stages
+      if (stages_[i]->GetChannelMode(c) !=
+          RenderPipelineChannelMode::kIgnored) {
+        virtual_ypadding_for_output_[i] =
+            std::max(ypad, virtual_ypadding_for_output_[i]);  // maximum over all channels
+        xpadding_for_output_[i] = std::max(xpad, xpadding_for_output_[i]);
+      }
+      if (stages_[i]->GetChannelMode(c) == RenderPipelineChannelMode::kInOut) {
+        ypad = (DivCeil(ypad, 1 << channel_shifts_[i][c].second) +
+                stages_[i]->settings_.border_y)  // add this stage's own border_y
+               << channel_shifts_[i][c].second;
+        xpad = DivCeil(xpad, 1 << stages_[i]->settings_.shift_x) +  // NOTE(review): x uses settings_.shift_x, y uses channel_shifts_
+               stages_[i]->settings_.border_x;
+      }
+    }
+  }
+}
+
+void LowMemoryRenderPipeline::PrepareForThreadsInternal(size_t num,
+                                                        bool use_group_ids) {  // Allocates group, stage and out-of-frame buffers.
+  const auto& shifts = channel_shifts_[0];  // shifts.size() is used as the channel count
+
+  use_group_ids_ = use_group_ids;
+  size_t num_buffers = use_group_ids_ ? frame_dimensions_.num_groups : num;  // one group buffer per group, or one per `num`
+  for (size_t t = group_data_.size(); t < num_buffers; t++) {  // only appends; existing buffers are kept
+    group_data_.emplace_back();
+    group_data_[t].resize(shifts.size());
+    for (size_t c = 0; c < shifts.size(); c++) {
+      group_data_[t][c] = ImageF(GroupInputXSize(c) + group_data_x_border_ * 2,  // group size plus a border on both sides
+                                 GroupInputYSize(c) + group_data_y_border_ * 2);
+    }
+  }
+  // TODO(veluca): avoid reallocating buffers if not needed.
+  stage_data_.resize(num);
+  size_t upsampling = 1u << base_color_shift_;
+  size_t group_dim = frame_dimensions_.group_dim * upsampling;
+  size_t padding =
+      2 * group_data_x_border_ * upsampling +  // maximum size of a rect
+      2 * kRenderPipelineXOffset;              // extra padding for processing
+  size_t stage_buffer_xsize = group_dim + padding;
+  for (size_t t = 0; t < num; t++) {
+    stage_data_[t].resize(shifts.size());
+    for (size_t c = 0; c < shifts.size(); c++) {
+      stage_data_[t][c].resize(stages_.size());
+      size_t next_y_border = 0;  // border_y of the nearest later kInOut stage on this channel (0 if none)
+      for (size_t i = stages_.size(); i-- > 0;) {  // iterate stages from last to first
+        if (stages_[i]->GetChannelMode(c) ==
+            RenderPipelineChannelMode::kInOut) {  // only kInOut stages get an output buffer
+          size_t stage_buffer_ysize =
+              2 * next_y_border + (1 << stages_[i]->settings_.shift_y);
+          stage_buffer_ysize = 1 << CeilLog2Nonzero(stage_buffer_ysize);  // presumably rounds up to a power of two — confirm CeilLog2Nonzero
+          next_y_border = stages_[i]->settings_.border_y;
+          stage_data_[t][c][i] = ImageF(stage_buffer_xsize, stage_buffer_ysize);
+        }
+      }
+    }
+  }
+  if (first_image_dim_stage_ != stages_.size()) {  // Init() leaves these equal when no stage switches to image dimensions
+    RectT<ssize_t> image_rect(0, 0, frame_dimensions_.xsize_upsampled,
+                              frame_dimensions_.ysize_upsampled);
+    RectT<ssize_t> full_image_rect(0, 0, full_image_xsize_, full_image_ysize_);
+    image_rect = image_rect.Translate(frame_origin_.x0, frame_origin_.y0);  // move the frame to its origin
+    image_rect = image_rect.Intersection(full_image_rect);  // then intersect it with the full image
+    if (image_rect.xsize() == 0 || image_rect.ysize() == 0) {  // empty intersection: reset to an all-zero rect
+      image_rect = RectT<ssize_t>(0, 0, 0, 0);
+    }
+    size_t left_padding = image_rect.x0();
+    size_t middle_padding = group_dim;
+    size_t right_padding = full_image_xsize_ - image_rect.x1();
+    size_t out_of_frame_xsize =
+        padding +
+        std::max(left_padding, std::max(middle_padding, right_padding));  // the widest of the three cases
+    out_of_frame_data_.resize(num);
+    for (size_t t = 0; t < num; t++) {
+      out_of_frame_data_[t] = ImageF(out_of_frame_xsize, shifts.size());  // height equals the channel count
+    }
+  }
+}
+
+std::vector<std::pair<ImageF*, Rect>> LowMemoryRenderPipeline::PrepareBuffers(
+    size_t group_id, size_t thread_id) {  // Returns, per channel, the group buffer and the rect inside it.
+  std::vector<std::pair<ImageF*, Rect>> ret(channel_shifts_[0].size());  // one entry per channel
+  const size_t gx = group_id % frame_dimensions_.xsize_groups;  // group column index
+  const size_t gy = group_id / frame_dimensions_.xsize_groups;  // group row index
+  for (size_t c = 0; c < channel_shifts_[0].size(); c++) {
+    ret[c].first = &group_data_[use_group_ids_ ? group_id : thread_id][c];  // buffer indexed by group or by thread
+    ret[c].second = Rect(group_data_x_border_, group_data_y_border_,  // rect starts just past the border
+                         GroupInputXSize(c), GroupInputYSize(c),
+                         DivCeil(frame_dimensions_.xsize_upsampled,  // presumably an upper x bound for the rect — confirm against Rect
+                                 1 << channel_shifts_[0][c].first) -
+                             gx * GroupInputXSize(c) + group_data_x_border_,
+                         DivCeil(frame_dimensions_.ysize_upsampled,  // presumably an upper y bound for the rect — confirm against Rect
+                                 1 << channel_shifts_[0][c].second) -
+                             gy * GroupInputYSize(c) + group_data_y_border_);
+  }
+  return ret;
+}
+
+namespace {
+
+JXL_INLINE int GetMirroredY(int y, ssize_t group_y0, ssize_t image_ysize) {  // y is relative to group_y0.
+  if (group_y0 == 0 && (y < 0 || y + group_y0 >= image_ysize)) {  // first group row and y outside [0, image_ysize)
+    return Mirror(y, image_ysize);  // presumably handles repeated reflection — confirm against Mirror()
+  }
+  if (y + group_y0 >= image_ysize) {  // absolute row is at or past image_ysize
+    // Here we know that the one mirroring step is sufficient.
+    return 2 * image_ysize - (y + group_y0) - 1 - group_y0;  // reflected absolute row, converted back to group-relative
+  }
+  return y;  // otherwise unchanged, including negative y when group_y0 != 0
+}
+
+JXL_INLINE void ApplyXMirroring(float* row, ssize_t borderx, ssize_t group_x0,  // Fills up to borderx samples on each side
+                                ssize_t group_xsize, ssize_t image_xsize) {     // of `row` with mirrored samples.
+  if (image_xsize <= borderx) {  // image no wider than the border: go through Mirror() for every sample
+    if (group_x0 == 0) {  // group starts at x = 0: fill the samples before the offset
+      for (ssize_t ix = 0; ix < borderx; ix++) {
+        row[kRenderPipelineXOffset - ix - 1] =
+            row[kRenderPipelineXOffset + Mirror(-ix - 1, image_xsize)];
+      }
+    }
+    if (group_xsize + borderx + group_x0 >= image_xsize) {  // group plus border reaches image_xsize: fill the right side
+      for (ssize_t ix = 0; ix < borderx; ix++) {
+        row[kRenderPipelineXOffset + image_xsize + ix - group_x0] =
+            row[kRenderPipelineXOffset + Mirror(image_xsize + ix, image_xsize) -
+                group_x0];
+      }
+    }
+  } else {
+    // Here we know that the one mirroring step is sufficient.
+    if (group_x0 == 0) {
+      for (ssize_t ix = 0; ix < borderx; ix++) {
+        row[kRenderPipelineXOffset - ix - 1] = row[kRenderPipelineXOffset + ix];  // offset -(ix+1) takes the value at offset ix
+      }
+    }
+    if (group_xsize + borderx + group_x0 >= image_xsize) {
+      for (ssize_t ix = 0; ix < borderx; ix++) {
+        row[kRenderPipelineXOffset + image_xsize - group_x0 + ix] =  // first sample past the edge takes the last in-image value, and so on
+            row[kRenderPipelineXOffset + image_xsize - group_x0 - ix - 1];
+      }
+    }
+  }
+}
+
+// Information about where the *output* of each stage is stored.
+class Rows {
+ public:
+  Rows(const std::vector<std::unique_ptr<RenderPipelineStage>>& stages,
+       const Rect data_max_color_channel_rect, int group_data_x_border,
+       int group_data_y_border,
+       const std::vector<std::pair<size_t, size_t>>& group_data_shift,
+       size_t base_color_shift, std::vector<std::vector<ImageF>>& thread_data,
+       std::vector<ImageF>& input_data) {
+    size_t num_stages = stages.size();
+    size_t num_channels = input_data.size();  // one input image per channel
+
+    JXL_ASSERT(thread_data.size() == num_channels);  // thread_data is indexed [channel][stage]
+    JXL_ASSERT(group_data_shift.size() == num_channels);
+
+#if JXL_ENABLE_ASSERT
+    for (const auto& td : thread_data) {
+      JXL_ASSERT(td.size() == num_stages);
+    }
+#endif
+
+    rows_.resize(num_stages + 1, std::vector<RowInfo>(num_channels));  // slot 0 is the input, slot i + 1 is stage i's output
+
+    for (size_t i = 0; i < num_stages; i++) {
+      for (size_t c = 0; c < input_data.size(); c++) {
+        if (stages[i]->GetChannelMode(c) == RenderPipelineChannelMode::kInOut) {  // only kInOut stages get a filled slot
+          rows_[i + 1][c].ymod_minus_1 = thread_data[c][i].ysize() - 1;  // used as a bit mask in GetBuffer()
+          rows_[i + 1][c].base_ptr = thread_data[c][i].Row(0);
+          rows_[i + 1][c].stride = thread_data[c][i].PixelsPerRow();
+        }
+      }
+    }
+
+    for (size_t c = 0; c < input_data.size(); c++) {
+      auto channel_group_data_rect =
+          data_max_color_channel_rect.As<ssize_t>()
+              .Translate(-group_data_x_border, -group_data_y_border)  // remove the border offset
+              .ShiftLeft(base_color_shift)
+              .CeilShiftRight(group_data_shift[c])  // rescale by the shift of channel c
+              .Translate(group_data_x_border - ssize_t(kRenderPipelineXOffset),  // restore the border, move x back by the offset
+                         group_data_y_border);
+      rows_[0][c].base_ptr = channel_group_data_rect.Row(&input_data[c], 0);
+      rows_[0][c].stride = input_data[c].PixelsPerRow();
+      rows_[0][c].ymod_minus_1 = -1;  // all bits set: "y & -1" leaves y unchanged
+    }
+  }
+
+  // Stage -1 refers to the input data; all other values must be nonnegative and
+  // refer to the data for the output of that stage.
+  JXL_INLINE float* GetBuffer(int stage, int y, size_t c) const {
+    JXL_DASSERT(stage >= -1);
+    const RowInfo& info = rows_[stage + 1][c];  // + 1 because slot 0 holds the input
+    return info.base_ptr + ssize_t(info.stride) * (y & info.ymod_minus_1);  // row index is y masked by ymod_minus_1
+  }
+
+ private:
+  struct RowInfo {
+    // Pointer to beginning of the first row.
+    float* base_ptr;
+    // Modulo value for the y axis minus 1 (ymod is guaranteed to be a power of
+    // 2, which allows efficient mod computation by masking).
+    int ymod_minus_1;
+    // Number of floats per row.
+    size_t stride;
+  };
+  std::vector<std::vector<RowInfo>> rows_;  // indexed [stage + 1][channel]
+};
+
+}  // namespace
+
+void LowMemoryRenderPipeline::RenderRect(size_t thread_id,
+                                         std::vector<ImageF>& input_data,
+                                         Rect data_max_color_channel_rect,
+                                         Rect image_max_color_channel_rect) {
+  // For each stage, the rect corresponding to the image area currently being
+  // processed, in the coordinates of that stage (i.e. with the scaling factor
+  // that that stage has).
+  std::vector<Rect> group_rect;
+  group_rect.resize(stages_.size());
+  Rect image_area_rect =
+      image_max_color_channel_rect.ShiftLeft(base_color_shift_)
+          .Crop(frame_dimensions_.xsize_upsampled,
+                frame_dimensions_.ysize_upsampled);
+  for (size_t i = 0; i < stages_.size(); i++) {
+    group_rect[i] =
+        image_area_rect.CeilShiftRight(channel_shifts_[i][anyc_[i]]);
+  }
+
+  ssize_t frame_x0 =
+      first_image_dim_stage_ == stages_.size() ? 0 : frame_origin_.x0;
+  ssize_t frame_y0 =
+      first_image_dim_stage_ == stages_.size() ? 0 : frame_origin_.y0;
+  size_t full_image_xsize = first_image_dim_stage_ == stages_.size()
+                                ? frame_dimensions_.xsize_upsampled
+                                : full_image_xsize_;
+  size_t full_image_ysize = first_image_dim_stage_ == stages_.size()
+                                ? frame_dimensions_.ysize_upsampled
+                                : full_image_ysize_;
+
+  // Compute actual x-axis bounds for the current image area in the context of
+  // the full image this frame is part of. As the left boundary may be negative,
+  // we also create the x_pixels_skip value, defined as follows:
+  // - both x_pixels_skip and full_image_x0 are >= 0, and at least one is 0;
+  // - full_image_x0 - x_pixels_skip is the position of the current frame area
+  //   in the full image.
+  ssize_t full_image_x0 = frame_x0 + image_area_rect.x0();
+  ssize_t x_pixels_skip = 0;
+  if (full_image_x0 < 0) {
+    x_pixels_skip = -full_image_x0;
+    full_image_x0 = 0;
+  }
+  ssize_t full_image_x1 = frame_x0 + image_area_rect.x1();
+  full_image_x1 = std::min<ssize_t>(full_image_x1, full_image_xsize);
+
+  // If the current image area is entirely outside of the visible image, there
+  // is no point in proceeding. Note: this uses the assumption that if there is
+  // a stage with observable effects (i.e. a kInput stage), it only appears
+  // after the stage that switches to image dimensions.
+  if (full_image_x1 <= full_image_x0) return;
+
+  // Data structures to hold information about input/output rows and their
+  // buffers.
+  Rows rows(stages_, data_max_color_channel_rect, group_data_x_border_,
+            group_data_y_border_, channel_shifts_[0], base_color_shift_,
+            stage_data_[thread_id], input_data);
+
+  std::vector<RenderPipelineStage::RowInfo> input_rows(first_trailing_stage_ +
+                                                       1);
+  for (size_t i = 0; i < first_trailing_stage_; i++) {
+    input_rows[i].resize(input_data.size());
+  }
+  input_rows[first_trailing_stage_].resize(input_data.size(),
+                                           std::vector<float*>(1));
+
+  // Maximum possible shift is 3.
+  RenderPipelineStage::RowInfo output_rows(input_data.size(),
+                                           std::vector<float*>(8));
+
+  // Fills in input_rows and output_rows for a given y value (relative to the
+  // start of the group, measured in actual pixels at the appropriate vertical
+  // scaling factor) and a given stage, applying mirroring if necessary. This
+  // function is somewhat inefficient for trailing kInOut or kInput stages,
+  // where just filling the input row once ought to be sufficient.
+  auto prepare_io_rows = [&](int y, size_t i) {
+    ssize_t bordery = stages_[i]->settings_.border_y;
+    size_t shifty = stages_[i]->settings_.shift_y;
+    auto make_row = [&](size_t c, ssize_t iy) {
+      size_t mirrored_y = GetMirroredY(y + iy - bordery, group_rect[i].y0(),
+                                       image_rect_[i].ysize());
+      input_rows[i][c][iy] =
+          rows.GetBuffer(stage_input_for_channel_[i][c], mirrored_y, c);
+      ApplyXMirroring(input_rows[i][c][iy], stages_[i]->settings_.border_x,
+                      group_rect[i].x0(), group_rect[i].xsize(),
+                      image_rect_[i].xsize());
+    };
+    for (size_t c = 0; c < input_data.size(); c++) {
+      RenderPipelineChannelMode mode = stages_[i]->GetChannelMode(c);
+      if (mode == RenderPipelineChannelMode::kIgnored) {
+        continue;
+      }
+      // If we already have rows from a previous iteration, we can just shift
+      // the rows by 1 and insert the new one.
+      if (input_rows[i][c].size() == 2 * size_t(bordery) + 1) {
+        for (ssize_t iy = 0; iy < 2 * bordery; iy++) {
+          input_rows[i][c][iy] = input_rows[i][c][iy + 1];
+        }
+        make_row(c, bordery * 2);
+      } else {
+        input_rows[i][c].resize(2 * bordery + 1);
+        for (ssize_t iy = 0; iy < 2 * bordery + 1; iy++) {
+          make_row(c, iy);
+        }
+      }
+
+      // If necessary, get the output buffers.
+      if (mode == RenderPipelineChannelMode::kInOut) {
+        for (size_t iy = 0; iy < (1u << shifty); iy++) {
+          output_rows[c][iy] = rows.GetBuffer(i, y * (1 << shifty) + iy, c);
+        }
+      }
+    }
+  };
+
+  // We pretend that every stage has a vertical shift of 0, i.e. it is as tall
+  // as the final image.
+  // We call each such row a "virtual" row, because it may or may not correspond
+  // to an actual row of the current processing stage; actual processing happens
+  // when vy % (1<<vshift) == 0.
+
+  int num_extra_rows = *std::max_element(virtual_ypadding_for_output_.begin(),
+                                         virtual_ypadding_for_output_.end());
+
+  for (int vy = -num_extra_rows;
+       vy < int(image_area_rect.ysize()) + num_extra_rows; vy++) {
+    for (size_t i = 0; i < first_trailing_stage_; i++) {
+      int stage_vy = vy - num_extra_rows + virtual_ypadding_for_output_[i];
+
+      if (stage_vy % (1 << channel_shifts_[i][anyc_[i]].second) != 0) {
+        continue;
+      }
+
+      if (stage_vy < -virtual_ypadding_for_output_[i]) {
+        continue;
+      }
+
+      int y = stage_vy >> channel_shifts_[i][anyc_[i]].second;
+
+      ssize_t image_y = ssize_t(group_rect[i].y0()) + y;
+      // Do not produce rows in out-of-bounds areas.
+      if (image_y < 0 || image_y >= ssize_t(image_rect_[i].ysize())) {
+        continue;
+      }
+
+      // Get the input/output rows and potentially apply mirroring to the input.
+      prepare_io_rows(y, i);
+
+      // Produce output rows.
+      stages_[i]->ProcessRow(input_rows[i], output_rows,
+                             xpadding_for_output_[i], group_rect[i].xsize(),
+                             group_rect[i].x0(), image_y, thread_id);
+    }
+
+    // Process trailing stages, i.e. the final set of non-kInOut stages; they
+    // all have the same input buffer and no need to use any mirroring.
+
+    int y = vy - num_extra_rows;
+
+    for (size_t c = 0; c < input_data.size(); c++) {
+      // Skip pixels that are not part of the actual final image area.
+      input_rows[first_trailing_stage_][c][0] =
+          rows.GetBuffer(stage_input_for_channel_[first_trailing_stage_][c], y,
+                         c) +
+          x_pixels_skip;
+    }
+
+    // Check that we are not outside of the bounds for the current rendering
+    // rect. Not doing so might result in overwriting some rows that have been
+    // written (or will be written) by other threads.
+    if (y < 0 || y >= ssize_t(image_area_rect.ysize())) {
+      continue;
+    }
+
+    // Avoid running pipeline stages on pixels that are outside the full image
+    // area. As trailing stages have no borders, this is a free optimization
+    // (and may be necessary for correctness, as some stages assume coordinates
+    // are within bounds).
+    ssize_t full_image_y = frame_y0 + image_area_rect.y0() + y;
+    if (full_image_y < 0 || full_image_y >= ssize_t(full_image_ysize)) {
+      continue;
+    }
+
+    for (size_t i = first_trailing_stage_; i < stages_.size(); i++) {
+      // Before the first_image_dim_stage_, coordinates are relative to the
+      // current frame.
+      size_t x0 =
+          i < first_image_dim_stage_ ? full_image_x0 - frame_x0 : full_image_x0;
+      size_t y =
+          i < first_image_dim_stage_ ? full_image_y - frame_y0 : full_image_y;
+      stages_[i]->ProcessRow(input_rows[first_trailing_stage_], output_rows,
+                             /*xextra=*/0, full_image_x1 - full_image_x0, x0, y,
+                             thread_id);
+    }
+  }
+}
+
+void LowMemoryRenderPipeline::RenderPadding(size_t thread_id, Rect rect) {
+  if (rect.xsize() == 0) return;
+  // One row per channel, taken from this thread's out-of-frame buffer.
+  const size_t num_channels = channel_shifts_[0].size();
+  RenderPipelineStage::RowInfo rows_in(num_channels, std::vector<float*>(1));
+  RenderPipelineStage::RowInfo rows_out;
+  for (size_t c = 0; c < num_channels; ++c) {
+    rows_in[c][0] = out_of_frame_data_[thread_id].Row(c);
+  }
+  for (size_t dy = 0; dy < rect.ysize(); ++dy) {
+    const size_t ypos = rect.y0() + dy;
+    // The stage before first_image_dim_stage_ fills in the padding row.
+    stages_[first_image_dim_stage_ - 1]->ProcessPaddingRow(
+        rows_in, rect.xsize(), rect.x0(), ypos);
+    for (size_t i = first_image_dim_stage_; i < stages_.size(); ++i) {
+      stages_[i]->ProcessRow(rows_in, rows_out, /*xextra=*/0, rect.xsize(),
+                             rect.x0(), ypos, thread_id);
+    }
+  }
+}
+
+// Called when the input of group `group_id` is complete: saves the group
+// borders, renders the padding around the frame (if the pipeline switches to
+// image dimensions) and runs the stages on every rect that became ready.
+void LowMemoryRenderPipeline::ProcessBuffers(size_t group_id,
+                                             size_t thread_id) {
+  // Buffers are indexed by group or by thread, depending on use_group_ids_.
+  std::vector<ImageF>& input_data =
+      group_data_[use_group_ids_ ? group_id : thread_id];
+
+  // Save the borders of every channel of this group in the border storage.
+  for (size_t c = 0; c < input_data.size(); ++c) {
+    SaveBorders(group_id, c, input_data[c]);
+  }
+
+  // Coordinates of this group in the grid of groups.
+  const size_t gx = group_id % frame_dimensions_.xsize_groups;
+  const size_t gy = group_id / frame_dimensions_.xsize_groups;
+
+  if (first_image_dim_stage_ != stages_.size()) {
+    const size_t group_dim = frame_dimensions_.group_dim << base_color_shift_;
+    RectT<ssize_t> canvas(0, 0, full_image_xsize_, full_image_ysize_);
+    // Area of the frame, in full image coordinates, clamped to the image.
+    RectT<ssize_t> frame_area(0, 0, frame_dimensions_.xsize_upsampled,
+                              frame_dimensions_.ysize_upsampled);
+    frame_area = frame_area.Translate(frame_origin_.x0, frame_origin_.y0);
+    frame_area = frame_area.Intersection(canvas);
+    // Area of this group, in full image coordinates, clamped to the frame.
+    RectT<ssize_t> group_area(gx * group_dim, gy * group_dim, group_dim,
+                              group_dim);
+    group_area = group_area.Translate(frame_origin_.x0, frame_origin_.y0);
+    group_area = group_area.Intersection(frame_area);
+    const size_t x0 = group_area.x0();
+    const size_t y0 = group_area.y0();
+    const size_t x1 = group_area.x1();
+    const size_t y1 = group_area.y1();
+    JXL_DEBUG_V(6,
+                "Rendering padding for full image rect %s "
+                "outside group rect %s",
+                Description(canvas).c_str(),
+                Description(group_area).c_str());
+
+    // If the frame does not intersect the full image at all, the whole image
+    // area has to be initialized with RenderPadding; group 0 takes care of it.
+    if (group_id == 0 &&
+        (frame_area.xsize() == 0 || frame_area.ysize() == 0)) {
+      RenderPadding(thread_id,
+                    Rect(0, 0, full_image_xsize_, full_image_ysize_));
+    }
+
+    // Groups that intersect the full image render the padding next to them:
+    // groups on the boundary of the group grid cover the strips (and corners)
+    // between their own area and the edges of the full image.
+    if (group_area.xsize() > 0 && group_area.ysize() > 0) {
+      const bool left = gx == 0;
+      const bool top = gy == 0;
+      const bool right = gx + 1 == frame_dimensions_.xsize_groups;
+      const bool bottom = gy + 1 == frame_dimensions_.ysize_groups;
+      // Extent of the full image past the right/bottom edge of this group.
+      const size_t xrest = full_image_xsize_ - x1;
+      const size_t yrest = full_image_ysize_ - y1;
+      if (left && top) RenderPadding(thread_id, Rect(0, 0, x0, y0));
+      if (top) RenderPadding(thread_id, Rect(x0, 0, x1 - x0, y0));
+      if (left) RenderPadding(thread_id, Rect(0, y0, x0, y1 - y0));
+      if (left && bottom) RenderPadding(thread_id, Rect(0, y1, x0, yrest));
+      if (bottom) RenderPadding(thread_id, Rect(x0, y1, x1 - x0, yrest));
+      if (top && right) RenderPadding(thread_id, Rect(x1, 0, xrest, y0));
+      if (right) RenderPadding(thread_id, Rect(x1, y0, xrest, y1 - y0));
+      if (bottom && right) RenderPadding(thread_id, Rect(x1, y1, xrest, yrest));
+    }
+  }
+
+  // The border assigner reports which rects can be rendered now that this
+  // group is done.
+  Rect ready_rects[GroupBorderAssigner::kMaxToFinalize];
+  size_t num_ready_rects = 0;
+  group_border_assigner_.GroupDone(group_id, group_border_.first,
+                                   group_border_.second, ready_rects,
+                                   &num_ready_rects);
+
+  // Origin of this group; it is subtracted from the ready rects below.
+  const size_t group_x0 = gx * frame_dimensions_.group_dim;
+  const size_t group_y0 = gy * frame_dimensions_.group_dim;
+  for (size_t r = 0; r < num_ready_rects; ++r) {
+    const Rect& image_rect = ready_rects[r];
+    for (size_t c = 0; c < input_data.size(); ++c) {
+      LoadBorders(group_id, c, image_rect, &input_data[c]);
+    }
+    // The same rect, relative to the group buffer (which has its own border).
+    const size_t data_x0 = group_data_x_border_ + image_rect.x0() - group_x0;
+    const size_t data_y0 = group_data_y_border_ + image_rect.y0() - group_y0;
+    Rect data_rect(data_x0, data_y0, image_rect.xsize(), image_rect.ysize());
+    RenderRect(thread_id, input_data, data_rect, image_rect);
+  }
+}
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/low_memory_render_pipeline.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/low_memory_render_pipeline.h
new file mode 100644
index 0000000000..b386f7c078
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/low_memory_render_pipeline.h
@@ -0,0 +1,111 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_LOW_MEMORY_RENDER_PIPELINE_H_
+#define LIB_JXL_RENDER_PIPELINE_LOW_MEMORY_RENDER_PIPELINE_H_
+
+#include <stdint.h>
+
+#include "lib/jxl/dec_group_border.h"
+#include "lib/jxl/render_pipeline/render_pipeline.h"
+
+namespace jxl {
+
+// A multithreaded, low-memory rendering pipeline that only allocates a minimal
+// amount of buffers.
+class LowMemoryRenderPipeline final : public RenderPipeline {
+ private:
+  std::vector<std::pair<ImageF*, Rect>> PrepareBuffers(
+      size_t group_id, size_t thread_id) override;
+
+  void PrepareForThreadsInternal(size_t num, bool use_group_ids) override;
+  // Saves group borders, renders padding, then renders the rects now ready.
+  void ProcessBuffers(size_t group_id, size_t thread_id) override;
+  // Forwards the request to the border assigner.
+  void ClearDone(size_t i) override { group_border_assigner_.ClearDone(i); }
+
+  void Init() override;
+
+  void EnsureBordersStorage();
+  size_t GroupInputXSize(size_t c) const;
+  size_t GroupInputYSize(size_t c) const;
+  void RenderRect(size_t thread_id, std::vector<ImageF>& input_data,
+                  Rect data_max_color_channel_rect,
+                  Rect image_max_color_channel_rect);
+  void RenderPadding(size_t thread_id, Rect rect);  // Full-image coordinates.
+  // Save copies borders into the storage below; Load presumably reads them.
+  void SaveBorders(size_t group_id, size_t c, const ImageF& in);
+  void LoadBorders(size_t group_id, size_t c, const Rect& r, ImageF* out);
+
+  std::pair<size_t, size_t> ColorDimensionsToChannelDimensions(
+      std::pair<size_t, size_t> in, size_t c, size_t stage) const;
+
+  std::pair<size_t, size_t> BorderToStore(size_t c) const;
+
+  bool use_group_ids_;  // If true, group_data_ is indexed by group.
+
+  // Storage for borders between groups. Borders of adjacent groups are stacked
+  // together, e.g. bottom border of current group is followed by top border
+  // of next group.
+  std::vector<ImageF> borders_horizontal_;
+  std::vector<ImageF> borders_vertical_;
+
+  // Manages the status of borders.
+  GroupBorderAssigner group_border_assigner_;
+
+  // Size (in color-channel-pixels) of the border around each group that might
+  // be assigned to that group.
+  std::pair<size_t, size_t> group_border_;
+  // base_color_shift_ defines the size of groups in terms of final image
+  // pixels.
+  size_t base_color_shift_;
+
+  // Buffer for decoded pixel data for a group, indexed by [thread][channel] or
+  // [group][channel] depending on `use_group_ids_`.
+  std::vector<std::vector<ImageF>> group_data_;
+
+  // Borders for storing group data.
+  size_t group_data_x_border_;
+  size_t group_data_y_border_;
+
+  // Buffers for intermediate rows for the various stages, indexed by
+  // [thread][channel][stage].
+  std::vector<std::vector<std::vector<ImageF>>> stage_data_;
+
+  // Buffers for out-of-frame data, indexed by [thread]; every row is a
+  // different channel.
+  std::vector<ImageF> out_of_frame_data_;
+
+  // For each stage, a non-kIgnored channel.
+  std::vector<int32_t> anyc_;
+
+  // Size of the image at each stage.
+  std::vector<Rect> image_rect_;
+
+  // For each stage, for each channel, keep track of the kInOut stage that
+  // produced the input to that stage (which corresponds to the buffer index
+  // containing the data). -1 if data comes from the original input.
+  std::vector<std::vector<int32_t>> stage_input_for_channel_;
+
+  // Number of (virtual) extra rows that must be processed at each stage
+  // to produce sufficient output for future stages.
+  std::vector<int> virtual_ypadding_for_output_;
+
+  // Same thing for columns, except these are real columns and not virtual ones.
+  std::vector<int> xpadding_for_output_;
+
+  // First stage that doesn't have any kInOut channel.
+  size_t first_trailing_stage_;
+
+  // Origin of the frame in the full image, and size of the full image.
+  FrameOrigin frame_origin_;
+  size_t full_image_xsize_;
+  size_t full_image_ysize_;
+  size_t first_image_dim_stage_;  // First stage in image dimensions, if any.
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_LOW_MEMORY_RENDER_PIPELINE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline.cc
new file mode 100644
index 0000000000..68b6ef613f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline.cc
@@ -0,0 +1,132 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/render_pipeline.h"
+
+#include <algorithm>
+
+#include "lib/jxl/render_pipeline/low_memory_render_pipeline.h"
+#include "lib/jxl/render_pipeline/simple_render_pipeline.h"
+#include "lib/jxl/sanitizers.h"
+
+namespace jxl {
+
+void RenderPipeline::Builder::AddStage(
+    std::unique_ptr<RenderPipelineStage> stage) {
+  stages_.emplace_back(std::move(stage));  // The builder now owns the stage.
+}
+
+std::unique_ptr<RenderPipeline> RenderPipeline::Builder::Finalize(
+    FrameDimensions frame_dimensions) && {
+#if JXL_ENABLE_ASSERT
+  // There must be at least one stage, and the last stage must not be a kInOut
+  // stage for any channel.
+  JXL_ASSERT(!stages_.empty());
+  for (size_t c = 0; c < num_c_; c++) {
+    JXL_ASSERT(stages_.back()->GetChannelMode(c) !=
+               RenderPipelineChannelMode::kInOut);
+  }
+#endif
+  const auto kInOut = RenderPipelineChannelMode::kInOut;
+  const size_t num_stages = stages_.size();
+  std::unique_ptr<RenderPipeline> res;
+  if (!use_simple_implementation_) {
+    res = jxl::make_unique<LowMemoryRenderPipeline>();
+  } else {
+    res = jxl::make_unique<SimpleRenderPipeline>();
+  }
+
+  // Padding is accumulated from the last stage (which needs none) backwards:
+  // a kInOut stage needs the padding of the next stage, scaled down by its
+  // shift, plus its own border; any other stage needs what the next one needs.
+  res->padding_.resize(num_stages);
+  for (auto& stage_padding : res->padding_) stage_padding.resize(num_c_);
+  for (size_t next = num_stages; next-- > 1;) {
+    const size_t i = next - 1;
+    const auto& stage = stages_[i];
+    for (size_t c = 0; c < num_c_; c++) {
+      const auto& after = res->padding_[next][c];
+      auto& here = res->padding_[i][c];
+      if (stage->GetChannelMode(c) != kInOut) {
+        here = after;
+        continue;
+      }
+      here.first = DivCeil(after.first, 1 << stage->settings_.shift_x) +
+                   stage->settings_.border_x;
+      here.second = DivCeil(after.second, 1 << stage->settings_.shift_y) +
+                    stage->settings_.border_y;
+    }
+  }
+
+  res->frame_dimensions_ = frame_dimensions;
+  res->group_completed_passes_.resize(frame_dimensions.num_groups);
+
+  // The shift at the input of the pipeline is the sum of the shifts of all the
+  // kInOut stages that a channel goes through.
+  auto& shifts = res->channel_shifts_;
+  shifts.resize(num_stages);
+  shifts[0].resize(num_c_);
+  for (size_t s = 0; s + 1 < num_stages; s++) {
+    const auto& stage = stages_[s];
+    for (size_t c = 0; c < num_c_; c++) {
+      if (stage->GetChannelMode(c) != kInOut) continue;
+      shifts[0][c].first += stage->settings_.shift_x;
+      shifts[0][c].second += stage->settings_.shift_y;
+    }
+  }
+  // Every kInOut stage then removes its own shift from the running value.
+  for (size_t s = 0; s + 1 < num_stages; s++) {
+    const auto& stage = stages_[s];
+    shifts[s + 1].resize(num_c_);
+    for (size_t c = 0; c < num_c_; c++) {
+      shifts[s + 1][c] = shifts[s][c];
+      if (stage->GetChannelMode(c) != kInOut) continue;
+      shifts[s + 1][c].first -= stage->settings_.shift_x;
+      shifts[s + 1][c].second -= stage->settings_.shift_y;
+    }
+  }
+  res->stages_ = std::move(stages_);
+  res->Init();
+  return res;
+}
+
+RenderPipelineInput RenderPipeline::GetInputBuffers(size_t group_id,
+                                                    size_t thread_id) {
+  JXL_DASSERT(group_id < group_completed_passes_.size());
+  RenderPipelineInput input;
+  input.pipeline_ = this;  // Done() reports back to this pipeline.
+  input.group_id_ = group_id;
+  input.thread_id_ = thread_id;
+  input.buffers_ = PrepareBuffers(group_id, thread_id);
+  return input;
+}
+
+void RenderPipeline::InputReady(
+    size_t group_id, size_t thread_id,
+    const std::vector<std::pair<ImageF*, Rect>>& buffers) {
+  JXL_DASSERT(group_id < group_completed_passes_.size());
+  group_completed_passes_[group_id]++;  // One more input for this group.
+  for (size_t i = 0; i < buffers.size(); ++i) {
+    (void)i;  // Presumably needed when the check below compiles to nothing.
+    JXL_CHECK_PLANE_INITIALIZED(*buffers[i].first, buffers[i].second, i);
+  }
+  // Hand the group over to the pipeline implementation.
+  ProcessBuffers(group_id, thread_id);
+}
+
+Status RenderPipeline::PrepareForThreads(size_t num, bool use_group_ids) {
+  for (const auto& stage : stages_) {
+    JXL_RETURN_IF_ERROR(stage->PrepareForThreads(num));  // Stops on failure.
+  }
+  PrepareForThreadsInternal(num, use_group_ids);  // Subclass-specific part.
+  return true;
+}
+
+void RenderPipelineInput::Done() {
+  JXL_ASSERT(pipeline_);  // Null for default-constructed or moved-from inputs.
+  pipeline_->InputReady(group_id_, thread_id_, buffers_);
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline.h
new file mode 100644
index 0000000000..bf3ad4975e
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline.h
@@ -0,0 +1,139 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_RENDER_PIPELINE_H_
+#define LIB_JXL_RENDER_PIPELINE_RENDER_PIPELINE_H_
+
+#include <stdint.h>
+
+#include "lib/jxl/image.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+// Interface to provide input to the rendering pipeline. When this object is
+// destroyed, all the data in the provided ImageF's Rects must have been
+// initialized.
+class RenderPipelineInput {
+ public:
+  RenderPipelineInput(const RenderPipelineInput&) = delete;
+  RenderPipelineInput(RenderPipelineInput&& other) noexcept {
+    *this = std::move(other);
+  }
+  RenderPipelineInput& operator=(RenderPipelineInput&& other) noexcept {
+    if (this == &other) return *this;  // Self-move must keep pipeline_.
+    pipeline_ = other.pipeline_;
+    group_id_ = other.group_id_;
+    thread_id_ = other.thread_id_;
+    buffers_ = std::move(other.buffers_);
+    other.pipeline_ = nullptr;  // `other` can no longer be used for Done().
+    return *this;
+  }
+  RenderPipelineInput() = default;
+  void Done();  // Passes the buffers to pipeline_, which must not be null.
+
+  const std::pair<ImageF*, Rect>& GetBuffer(size_t c) const {
+    JXL_ASSERT(c < buffers_.size());
+    return buffers_[c];
+  }
+
+ private:
+  RenderPipeline* pipeline_ = nullptr;
+  size_t group_id_ = 0;  // Ids are zeroed so moving an unset input is defined.
+  size_t thread_id_ = 0;
+  std::vector<std::pair<ImageF*, Rect>> buffers_;
+  friend class RenderPipeline;
+};
+
+class RenderPipeline {
+ public:
+  class Builder {
+   public:
+    explicit Builder(size_t num_c) : num_c_(num_c) { JXL_ASSERT(num_c > 0); }
+
+    // Adds a stage to the pipeline. Must be called at least once; the last
+    // added stage cannot have kInOut channels.
+    void AddStage(std::unique_ptr<RenderPipelineStage> stage);
+
+    // Enables using the simple (i.e. non-memory-efficient) implementation of
+    // the pipeline.
+    void UseSimpleImplementation() { use_simple_implementation_ = true; }
+
+    // Finalizes setup of the pipeline. Shifts for all channels should be 0 at
+    // this point.
+    std::unique_ptr<RenderPipeline> Finalize(
+        FrameDimensions frame_dimensions) &&;
+
+   private:
+    std::vector<std::unique_ptr<RenderPipelineStage>> stages_;
+    size_t num_c_;  // Number of channels; always positive.
+    bool use_simple_implementation_ = false;
+  };
+
+  friend class Builder;
+
+  virtual ~RenderPipeline() = default;
+
+  Status IsInitialized() const {
+    for (const auto& stage : stages_) {
+      JXL_RETURN_IF_ERROR(stage->IsInitialized());
+    }
+    return true;
+  }
+
+  // Allocates storage to run with `num` threads. If `use_group_ids` is true,
+  // storage is allocated for each group, not each thread. The behaviour is
+  // undefined if calling this function multiple times with a different value
+  // for `use_group_ids`.
+  Status PrepareForThreads(size_t num, bool use_group_ids);
+
+  // Retrieves a buffer where input data should be stored by the caller. When
+  // input has been provided for all buffers, the pipeline will complete its
+  // processing. This method may be called multiple times concurrently from
+  // different threads, provided that a different `thread_id` is given.
+  RenderPipelineInput GetInputBuffers(size_t group_id, size_t thread_id);
+  // Minimum of group_completed_passes_; must not be called with 0 groups.
+  size_t PassesWithAllInput() const {
+    return *std::min_element(group_completed_passes_.begin(),
+                             group_completed_passes_.end());
+  }
+  // Does nothing by default; implementations may override it.
+  virtual void ClearDone(size_t i) {}
+
+ protected:
+  std::vector<std::unique_ptr<RenderPipelineStage>> stages_;
+  // Shifts for every channel at the input of each stage.
+  std::vector<std::vector<std::pair<size_t, size_t>>> channel_shifts_;
+
+  // Amount of (cumulative) padding required by each stage and channel, in
+  // either direction.
+  std::vector<std::vector<std::pair<size_t, size_t>>> padding_;
+
+  FrameDimensions frame_dimensions_;
+  // For each group, the number of InputReady() calls received so far.
+  std::vector<uint8_t> group_completed_passes_;
+
+  friend class RenderPipelineInput;
+
+ private:
+  void InputReady(size_t group_id, size_t thread_id,
+                  const std::vector<std::pair<ImageF*, Rect>>& buffers);
+  // Provides the buffers that GetInputBuffers() hands out.
+  virtual std::vector<std::pair<ImageF*, Rect>> PrepareBuffers(
+      size_t group_id, size_t thread_id) = 0;
+  // Called by InputReady() once the input of a group has been provided.
+  virtual void ProcessBuffers(size_t group_id, size_t thread_id) = 0;
+
+  // Note that this method may be called multiple times with different (or
+  // equal) `num`.
+  virtual void PrepareForThreadsInternal(size_t num, bool use_group_ids) = 0;
+
+  // Called once frame dimensions and stages are known.
+  virtual void Init() {}
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_RENDER_PIPELINE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline_stage.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline_stage.h
new file mode 100644
index 0000000000..d1a0074161
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline_stage.h
@@ -0,0 +1,171 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_RENDER_PIPELINE_STAGE_H_
+#define LIB_JXL_RENDER_PIPELINE_RENDER_PIPELINE_STAGE_H_
+
+#include <stdint.h>
+
+#include "lib/jxl/base/arch_macros.h"
+#include "lib/jxl/frame_header.h"
+
+namespace jxl {
+
+// The first pixel in the input to RenderPipelineStage will be located at
+// this position. Pixels before this position may be accessed as padding.
+// This should be at least the RoundUpTo(maximum padding / 2, maximum vector
+// size) times 2: this is realized when using Gaborish + EPF + upsampling +
+// chroma subsampling.
+#if JXL_ARCH_ARM
+constexpr size_t kRenderPipelineXOffset = 16;
+#else
+constexpr size_t kRenderPipelineXOffset = 32;
+#endif
+
+enum class RenderPipelineChannelMode {
+  // This channel is not modified by this stage (GetInputRow is not allowed).
+  kIgnored = 0,
+  // This channel is modified in-place (no output rows are provided for it).
+  kInPlace = 1,
+  // This channel is modified and written to a new buffer (see GetOutputRow).
+  kInOut = 2,
+  // This channel is only read. These are the only stages that are assumed to
+  // have observable effects, i.e. calls to ProcessRow for other stages may be
+  // omitted if it can be shown they can't affect any kInput stage ProcessRow
+  // call that happens inside image boundaries.
+  kInput = 3,
+};
+
+class RenderPipeline;
+
+class RenderPipelineStage {
+ protected:
+  using Row = float*;
+  using ChannelRows = std::vector<Row>;
+
+ public:
+  using RowInfo = std::vector<ChannelRows>;
+  struct Settings {
+    // Amount of padding required in the various directions by all channels
+    // that have kInOut mode.
+    size_t border_x = 0;
+    size_t border_y = 0;
+
+    // Log2 of the number of columns/rows of output that this stage will produce
+    // for every input row for kInOut channels.
+    size_t shift_x = 0;
+    size_t shift_y = 0;
+
+    static Settings ShiftX(size_t shift, size_t border) {
+      Settings settings;
+      settings.border_x = border;
+      settings.shift_x = shift;
+      return settings;
+    }
+
+    static Settings ShiftY(size_t shift, size_t border) {
+      Settings settings;
+      settings.border_y = border;
+      settings.shift_y = shift;
+      return settings;
+    }
+
+    static Settings Symmetric(size_t shift, size_t border) {
+      Settings settings;
+      settings.border_x = settings.border_y = border;
+      settings.shift_x = settings.shift_y = shift;
+      return settings;
+    }
+
+    static Settings SymmetricBorderOnly(size_t border) {
+      return Symmetric(0, border);
+    }
+  };
+
+  virtual ~RenderPipelineStage() = default;
+
+  // Processes one row of input, producing the appropriate number of rows of
+  // output. Input/output rows can be obtained by calls to
+  // `GetInputRow`/`GetOutputRow`. `xsize+2*xextra` represents the total number
+  // of pixels to be processed in the input row, where the first pixel is at
+  // position `kRenderPipelineXOffset-xextra`. All pixels in the
+  // `[kRenderPipelineXOffset-xextra-border_x,
+  // kRenderPipelineXOffset+xsize+xextra+border_x)` range are initialized and
+  // accessible. `xpos` and `ypos` represent the position of the first
+  // (non-extra, i.e. in position kRenderPipelineXOffset) pixel in the center
+  // row of the input in the full image. `xpos` is a multiple of
+  // `GroupBorderAssigner::kPaddingXRound`. `thread_id` is the id that the
+  // caller passed to the pipeline for the current thread; concurrent calls
+  // are made with different values of `thread_id`.
+  virtual void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                          size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                          size_t thread_id) const = 0;
+
+  // How each channel will be processed. Channels are numbered starting from
+  // color channels (always 3) and followed by all other channels.
+  virtual RenderPipelineChannelMode GetChannelMode(size_t c) const = 0;
+
+ protected:
+  explicit RenderPipelineStage(Settings settings) : settings_(settings) {}
+
+  virtual Status IsInitialized() const { return true; }
+
+  // Informs the stage about the total size of each channel. Few stages will
+  // actually need to use this information.
+  virtual void SetInputSizes(
+      const std::vector<std::pair<size_t, size_t>>& input_sizes) {}
+
+  virtual Status PrepareForThreads(size_t num_threads) { return true; }
+
+  // Returns a pointer to the input row of channel `c` with vertical offset
+  // `offset`, which must be in [-settings_.border_y, settings_.border_y]. `c`
+  // must be such that `GetChannelMode(c) != kIgnored`. The returned pointer
+  // points to the offset-ed row (i.e. kRenderPipelineXOffset has been applied).
+  float* GetInputRow(const RowInfo& input_rows, size_t c, int offset) const {
+    JXL_DASSERT(GetChannelMode(c) != RenderPipelineChannelMode::kIgnored);
+    JXL_DASSERT(-offset <= static_cast<int>(settings_.border_y));
+    JXL_DASSERT(offset <= static_cast<int>(settings_.border_y));
+    return input_rows[c][settings_.border_y + offset] + kRenderPipelineXOffset;
+  }
+  // Similar to `GetInputRow`, but can only be used if `GetChannelMode(c) ==
+  // kInOut`. Offset must be less than `1<<settings_.shift_y`. The returned
+  // pointer points to the offset-ed row (i.e. kRenderPipelineXOffset has been
+  // applied).
+  float* GetOutputRow(const RowInfo& output_rows, size_t c,
+                      size_t offset) const {
+    JXL_DASSERT(GetChannelMode(c) == RenderPipelineChannelMode::kInOut);
+    JXL_DASSERT(offset < (1ul << settings_.shift_y));
+    return output_rows[c][offset] + kRenderPipelineXOffset;
+  }
+
+  // Indicates whether, from this stage on, the pipeline will operate on an
+  // image- rather than frame-sized buffer. Only one stage in the pipeline
+  // should return true, and it should implement ProcessPaddingRow below too.
+  // It is assumed that, if there is a SwitchToImageDimensions() == true stage,
+  // all kInput stages appear after it.
+  virtual bool SwitchToImageDimensions() const { return false; }
+
+  // If SwitchToImageDimensions returns true, then this should set xsize and
+  // ysize to the image size, and frame_origin to the location of the frame
+  // within the image. Otherwise, this is not called at all.
+  virtual void GetImageDimensions(size_t* xsize, size_t* ysize,
+                                  FrameOrigin* frame_origin) const {}
+
+  // Produces the appropriate output data outside of the frame dimensions. xpos
+  // and ypos are now relative to the full image.
+  virtual void ProcessPaddingRow(const RowInfo& output_rows, size_t xsize,
+                                 size_t xpos, size_t ypos) const {}
+
+  virtual const char* GetName() const = 0;
+
+  Settings settings_;
+  friend class RenderPipeline;
+  friend class SimpleRenderPipeline;
+  friend class LowMemoryRenderPipeline;
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_RENDER_PIPELINE_STAGE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline_test.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline_test.cc
new file mode 100644
index 0000000000..f638807be9
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline_test.cc
@@ -0,0 +1,562 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/render_pipeline.h"
+
+#include <stdint.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "lib/extras/codec.h"
+#include "lib/jxl/dec_frame.h"
+#include "lib/jxl/enc_cache.h"
+#include "lib/jxl/enc_color_management.h"
+#include "lib/jxl/enc_file.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/fake_parallel_runner_testonly.h"
+#include "lib/jxl/icc_codec.h"
+#include "lib/jxl/image_test_utils.h"
+#include "lib/jxl/jpeg/enc_jpeg_data.h"
+#include "lib/jxl/render_pipeline/test_render_pipeline_stages.h"
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+Status DecodeFile(const Span<const uint8_t> file, bool use_slow_pipeline,
+                  CodecInOut* io, ThreadPool* pool) {
+  Status ret = true;  // Handed to reader_closer, which may update it.
+  {
+    BitReader reader(file);
+    BitReaderScopedCloser reader_closer(&reader, &ret);
+    JXL_RETURN_IF_ERROR(reader.ReadFixedBits<16>() == 0x0AFF);  // Signature.
+    JXL_RETURN_IF_ERROR(ReadSizeHeader(&reader, &io->metadata.size));
+    JXL_RETURN_IF_ERROR(ReadImageMetadata(&reader, &io->metadata.m));
+    io->metadata.transform_data.nonserialized_xyb_encoded =
+        io->metadata.m.xyb_encoded;
+    JXL_RETURN_IF_ERROR(Bundle::Read(&reader, &io->metadata.transform_data));
+    if (io->metadata.m.color_encoding.WantICC()) {
+      PaddedBytes icc;
+      JXL_RETURN_IF_ERROR(ReadICC(&reader, &icc));
+      JXL_RETURN_IF_ERROR(io->metadata.m.color_encoding.SetICC(std::move(icc)));
+    }
+    PassesDecoderState dec_state;
+    JXL_RETURN_IF_ERROR(
+        dec_state.output_encoding_info.SetFromMetadata(io->metadata));
+    JXL_RETURN_IF_ERROR(reader.JumpToByteBoundary());
+    io->frames.clear();
+    do {  // One iteration per frame kept in io->frames.
+      io->frames.emplace_back(&io->metadata.m);
+      // Keep decoding into the same slot until a displayed frame is reached.
+      do {
+        size_t frame_start = reader.TotalBitsConsumed() / kBitsPerByte;
+        size_t size_left = file.size() - frame_start;
+        JXL_RETURN_IF_ERROR(
+            DecodeFrame(&dec_state, pool, file.data() + frame_start, size_left,
+                        &io->frames.back(), io->metadata, use_slow_pipeline));
+        reader.SkipBits(io->frames.back().decoded_bytes() * kBitsPerByte);
+      } while (dec_state.shared->frame_header.frame_type !=
+                   FrameType::kRegularFrame &&
+               dec_state.shared->frame_header.frame_type !=
+                   FrameType::kSkipProgressive);
+    } while (!dec_state.shared->frame_header.is_last);
+
+    if (io->frames.empty()) return JXL_FAILURE("Not enough data.");
+
+    if (reader.TotalBitsConsumed() != file.size() * kBitsPerByte) {
+      return JXL_FAILURE("Reader position not at EOF.");
+    }
+    if (!reader.AllReadsWithinBounds()) {
+      return JXL_FAILURE("Reader out of bounds read.");
+    }
+    io->CheckMetadata();
+    // reader_closer leaves scope here, which closes the reader.
+  }
+  return ret;
+}
+
+TEST(RenderPipelineTest, Build) {  // Only builds the pipeline; no checks.
+  RenderPipeline::Builder builder(/*num_c=*/1);
+  builder.AddStage(jxl::make_unique<UpsampleXSlowStage>());
+  builder.AddStage(jxl::make_unique<UpsampleYSlowStage>());
+  builder.AddStage(jxl::make_unique<Check0FinalStage>());
+  builder.UseSimpleImplementation();
+  FrameDimensions frame_dimensions;
+  frame_dimensions.Set(/*xsize=*/1024, /*ysize=*/1024, /*group_size_shift=*/0,
+                       /*max_hshift=*/0, /*max_vshift=*/0,
+                       /*modular_mode=*/false, /*upsampling=*/1);
+  std::move(builder).Finalize(frame_dimensions);  // Result is discarded.
+}
+
+TEST(RenderPipelineTest, CallAllGroups) {
+  RenderPipeline::Builder builder(/*num_c=*/1);
+  builder.AddStage(jxl::make_unique<UpsampleXSlowStage>());
+  builder.AddStage(jxl::make_unique<UpsampleYSlowStage>());
+  builder.AddStage(jxl::make_unique<Check0FinalStage>());
+  builder.UseSimpleImplementation();
+  FrameDimensions frame_dimensions;
+  frame_dimensions.Set(/*xsize=*/1024, /*ysize=*/1024, /*group_size_shift=*/0,
+                       /*max_hshift=*/0, /*max_vshift=*/0,
+                       /*modular_mode=*/false, /*upsampling=*/1);
+  auto pipeline = std::move(builder).Finalize(frame_dimensions);
+  ASSERT_TRUE(pipeline->PrepareForThreads(1, /*use_group_ids=*/false));
+
+  for (size_t i = 0; i < frame_dimensions.num_groups; i++) {  // Every group.
+    auto input_buffers = pipeline->GetInputBuffers(i, 0);
+    FillPlane(0.0f, input_buffers.GetBuffer(0).first,
+              input_buffers.GetBuffer(0).second);  // All-zero input.
+    input_buffers.Done();
+  }
+
+  EXPECT_EQ(pipeline->PassesWithAllInput(), 1);  // Each group was fed once.
+}
+
+TEST(RenderPipelineTest, BuildFast) {  // Like Build, on the default pipeline.
+  RenderPipeline::Builder builder(/*num_c=*/1);
+  builder.AddStage(jxl::make_unique<UpsampleXSlowStage>());
+  builder.AddStage(jxl::make_unique<UpsampleYSlowStage>());
+  builder.AddStage(jxl::make_unique<Check0FinalStage>());
+  FrameDimensions frame_dimensions;  // No UseSimpleImplementation() call here.
+  frame_dimensions.Set(/*xsize=*/1024, /*ysize=*/1024, /*group_size_shift=*/0,
+                       /*max_hshift=*/0, /*max_vshift=*/0,
+                       /*modular_mode=*/false, /*upsampling=*/1);
+  std::move(builder).Finalize(frame_dimensions);  // Result is discarded.
+}
+
+TEST(RenderPipelineTest, CallAllGroupsFast) {
+  RenderPipeline::Builder builder(/*num_c=*/1);
+  builder.AddStage(jxl::make_unique<UpsampleXSlowStage>());
+  builder.AddStage(jxl::make_unique<UpsampleYSlowStage>());
+  builder.AddStage(jxl::make_unique<Check0FinalStage>());
+  builder.UseSimpleImplementation();  // NOTE(review): BuildFast omits this.
+  FrameDimensions frame_dimensions;
+  frame_dimensions.Set(/*xsize=*/1024, /*ysize=*/1024, /*group_size_shift=*/0,
+                       /*max_hshift=*/0, /*max_vshift=*/0,
+                       /*modular_mode=*/false, /*upsampling=*/1);
+  auto pipeline = std::move(builder).Finalize(frame_dimensions);
+  ASSERT_TRUE(pipeline->PrepareForThreads(1, /*use_group_ids=*/false));
+
+  for (size_t i = 0; i < frame_dimensions.num_groups; i++) {  // Every group.
+    auto input_buffers = pipeline->GetInputBuffers(i, 0);
+    FillPlane(0.0f, input_buffers.GetBuffer(0).first,
+              input_buffers.GetBuffer(0).second);  // All-zero input.
+    input_buffers.Done();
+  }
+
+  EXPECT_EQ(pipeline->PassesWithAllInput(), 1);  // Each group was fed once.
+}
+
+struct RenderPipelineTestInputSettings {  // One PipelineTest configuration.
+  // Input image path, as passed to ReadTestData().
+  std::string input_path;
+  size_t xsize = 0, ysize = 0;  // The decoded input is shrunk to this size.
+  bool jpeg_transcode = false;  // Decode the input with the JPEG decoder.
+  // Encoding settings.
+  CompressParams cparams;
+  // Short name for the encoder settings; becomes part of the test name.
+  std::string cparams_descr;
+
+  bool add_spot_color = false;  // Attach a synthetic spot-color channel.
+
+  Splines splines;  // Copied into the encoder state before encoding.
+};
+
+class RenderPipelineTestParam  // Fixture with one settings object per case.
+    : public ::testing::TestWithParam<RenderPipelineTestInputSettings> {};
+
+TEST_P(RenderPipelineTestParam, PipelineTest) {
+  RenderPipelineTestInputSettings config = GetParam();
+
+  // Use a parallel runner that randomly shuffles tasks to detect possible
+  // border handling bugs.
+  FakeParallelRunner fake_pool(/*order_seed=*/123, /*num_threads=*/8);
+  ThreadPool pool(&JxlFakeParallelRunner, &fake_pool);
+  const PaddedBytes orig = jxl::test::ReadTestData(config.input_path);
+
+  CodecInOut io;
+  if (config.jpeg_transcode) {
+    ASSERT_TRUE(jpeg::DecodeImageJPG(Span<const uint8_t>(orig), &io));
+  } else {
+    ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, &pool));
+  }
+  io.ShrinkTo(config.xsize, config.ysize);  // Use the configured test size.
+
+  if (config.add_spot_color) {
+    jxl::ImageF spot(config.xsize, config.ysize);
+    jxl::ZeroFillImage(&spot);
+
+    for (size_t y = 0; y < config.ysize; y++) {
+      float* JXL_RESTRICT row = spot.Row(y);
+      for (size_t x = 0; x < config.xsize; x++) {
+        row[x] = ((x ^ y) & 255) * (1.f / 255.f);  // XOR pattern in [0, 1].
+      }
+    }
+    ExtraChannelInfo info;
+    info.bit_depth.bits_per_sample = 8;
+    info.dim_shift = 0;
+    info.type = jxl::ExtraChannel::kSpotColor;
+    info.spot_color[0] = 0.5f;
+    info.spot_color[1] = 0.2f;
+    info.spot_color[2] = 1.f;
+    info.spot_color[3] = 0.5f;
+
+    io.metadata.m.extra_channel_info.push_back(info);
+    std::vector<jxl::ImageF> ec;
+    ec.push_back(std::move(spot));
+    io.frames[0].SetExtraChannels(std::move(ec));
+  }
+
+  PaddedBytes compressed;
+
+  PassesEncoderState enc_state;
+  enc_state.shared.image_features.splines = config.splines;
+  ASSERT_TRUE(EncodeFile(config.cparams, &io, &enc_state, &compressed,
+                         GetJxlCms(), /*aux_out=*/nullptr, &pool));
+
+  // Decode the same bytes with use_slow_pipeline off and on, then compare.
+  CodecInOut io_default;
+  ASSERT_TRUE(DecodeFile(Span<const uint8_t>(compressed),
+                         /*use_slow_pipeline=*/false, &io_default, &pool));
+  CodecInOut io_slow_pipeline;
+  ASSERT_TRUE(DecodeFile(Span<const uint8_t>(compressed),
+                         /*use_slow_pipeline=*/true, &io_slow_pipeline, &pool));
+
+  ASSERT_EQ(io_default.frames.size(), io_slow_pipeline.frames.size());
+  for (size_t i = 0; i < io_default.frames.size(); i++) {
+#if JXL_HIGH_PRECISION
+    constexpr float kMaxError = 1e-5;
+#else
+    constexpr float kMaxError = 1e-4;
+#endif
+    Image3F def = std::move(*io_default.frames[i].color());  // Default.
+    Image3F pip = std::move(*io_slow_pipeline.frames[i].color());  // Slow.
+    JXL_ASSERT_OK(VerifyRelativeError(pip, def, kMaxError, kMaxError, _));
+    for (size_t ec = 0; ec < io_default.frames[i].extra_channels().size();
+         ec++) {
+      JXL_ASSERT_OK(VerifyRelativeError(
+          io_slow_pipeline.frames[i].extra_channels()[ec],
+          io_default.frames[i].extra_channels()[ec], kMaxError, kMaxError, _));
+    }
+  }
+}
+
+Splines CreateTestSplines() {
+  // Builds a Splines object holding one fixed spline, quantized with
+  // adjustment 0.
+  const ColorCorrelationMap color_map;
+  std::vector<Spline::Point> points{{9, 54},  {118, 159}, {97, 3},
+                                    {10, 40}, {150, 25},  {120, 300}};
+  const Spline test_spline{
+      points,
+      /*color_dct=*/
+      {{0.03125f, 0.00625f, 0.003125f}, {1.f, 0.321875f}, {1.f, 0.24375f}},
+      /*sigma_dct=*/{0.3125f, 0.f, 0.f, 0.0625f}};
+  std::vector<QuantizedSpline> quantized;
+  std::vector<Spline::Point> starts;
+  for (const Spline& s : std::vector<Spline>{test_spline}) {
+    quantized.emplace_back(s, /*quantization_adjustment=*/0,
+                           color_map.YtoXRatio(0), color_map.YtoBRatio(0));
+    starts.push_back(s.control_points.front());
+  }
+  return Splines(/*quantization_adjustment=*/0, std::move(quantized),
+                 std::move(starts));
+}
+
+std::vector<RenderPipelineTestInputSettings> GeneratePipelineTests() {
+  std::vector<RenderPipelineTestInputSettings> all_tests;
+
+  std::pair<size_t, size_t> sizes[] = {
+      {3, 8}, {128, 128}, {256, 256}, {258, 258}, {533, 401}, {777, 777},
+  };
+
+  for (auto size : sizes) {  // Every variant below is added at every size.
+    RenderPipelineTestInputSettings settings;
+    settings.input_path = "jxl/flower/flower.png";
+    settings.xsize = size.first;
+    settings.ysize = size.second;
+
+    // Base settings that each variant below copies and then modifies.
+    settings.cparams.butteraugli_distance = 1.0;
+    settings.cparams.patches = Override::kOff;
+    settings.cparams.dots = Override::kOff;
+    settings.cparams.gaborish = Override::kOff;
+    settings.cparams.epf = 0;
+    settings.cparams.color_transform = ColorTransform::kXYB;
+
+    {
+      auto s = settings;
+      s.cparams_descr = "NoGabNoEpfNoPatches";
+      all_tests.push_back(s);
+    }
+
+    {
+      auto s = settings;
+      s.cparams.color_transform = ColorTransform::kNone;
+      s.cparams_descr = "NoGabNoEpfNoPatchesNoXYB";
+      all_tests.push_back(s);
+    }
+
+    {
+      auto s = settings;
+      s.cparams.gaborish = Override::kOn;
+      s.cparams_descr = "GabNoEpfNoPatches";
+      all_tests.push_back(s);
+    }
+
+    {
+      auto s = settings;
+      s.cparams.epf = 1;
+      s.cparams_descr = "NoGabEpf1NoPatches";
+      all_tests.push_back(s);
+    }
+
+    {
+      auto s = settings;
+      s.cparams.epf = 2;
+      s.cparams_descr = "NoGabEpf2NoPatches";
+      all_tests.push_back(s);
+    }
+
+    {
+      auto s = settings;
+      s.cparams.epf = 3;
+      s.cparams_descr = "NoGabEpf3NoPatches";
+      all_tests.push_back(s);
+    }
+
+    {
+      auto s = settings;
+      s.cparams.gaborish = Override::kOn;
+      s.cparams.epf = 3;
+      s.cparams_descr = "GabEpf3NoPatches";
+      all_tests.push_back(s);
+    }
+
+    {
+      auto s = settings;
+      s.cparams_descr = "Splines";
+      s.splines = CreateTestSplines();
+      all_tests.push_back(s);
+    }
+
+    for (size_t ups : {2, 4, 8}) {  // Three variants per resampling factor.
+      {
+        auto s = settings;
+        s.cparams.resampling = ups;
+        s.cparams_descr = "Ups" + std::to_string(ups);
+        all_tests.push_back(s);
+      }
+      {
+        auto s = settings;
+        s.cparams.resampling = ups;
+        s.cparams.epf = 1;
+        s.cparams_descr = "Ups" + std::to_string(ups) + "EPF1";
+        all_tests.push_back(s);
+      }
+      {
+        auto s = settings;
+        s.cparams.resampling = ups;
+        s.cparams.gaborish = Override::kOn;
+        s.cparams.epf = 1;
+        s.cparams_descr = "Ups" + std::to_string(ups) + "GabEPF1";
+        all_tests.push_back(s);
+      }
+    }
+
+    {
+      auto s = settings;
+      s.cparams_descr = "Noise";
+      s.cparams.photon_noise_iso = 3200;
+      all_tests.push_back(s);
+    }
+
+    {
+      auto s = settings;
+      s.cparams_descr = "NoiseUps";
+      s.cparams.photon_noise_iso = 3200;
+      s.cparams.resampling = 2;
+      all_tests.push_back(s);
+    }
+
+    {
+      auto s = settings;
+      s.cparams_descr = "ModularLossless";
+      s.cparams.modular_mode = true;
+      s.cparams.butteraugli_distance = 0;
+      all_tests.push_back(s);
+    }
+
+    {
+      auto s = settings;
+      s.cparams_descr = "ProgressiveDC";
+      s.cparams.progressive_dc = 1;
+      all_tests.push_back(s);
+    }
+
+    {
+      auto s = settings;
+      s.cparams_descr = "ModularLossy";
+      s.cparams.modular_mode = true;
+      s.cparams.butteraugli_distance = 1.f;
+      all_tests.push_back(s);
+    }
+
+    {
+      auto s = settings;
+      s.input_path = "jxl/flower/flower_alpha.png";
+      s.cparams_descr = "AlphaVarDCT";
+      all_tests.push_back(s);
+    }
+
+    {
+      auto s = settings;
+      s.input_path = "jxl/flower/flower_alpha.png";
+      s.cparams_descr = "AlphaVarDCTUpsamplingEPF";
+      s.cparams.epf = 1;
+      s.cparams.ec_resampling = 2;
+      all_tests.push_back(s);
+    }
+
+    {
+      auto s = settings;
+      s.cparams.modular_mode = true;
+      s.cparams.butteraugli_distance = 0;
+      s.input_path = "jxl/flower/flower_alpha.png";
+      s.cparams_descr = "AlphaLossless";
+      all_tests.push_back(s);
+    }
+
+    {
+      auto s = settings;
+      s.input_path = "jxl/flower/flower_alpha.png";
+      s.cparams_descr = "AlphaDownsample";
+      s.cparams.ec_resampling = 2;
+      all_tests.push_back(s);
+    }
+
+    {
+      auto s = settings;
+      s.cparams_descr = "SpotColor";
+      s.add_spot_color = true;
+      all_tests.push_back(s);
+    }
+  }
+
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+  for (const char* input : {"jxl/flower/flower.png.im_q85_444.jpg",
+                            "jxl/flower/flower.png.im_q85_420.jpg",
+                            "jxl/flower/flower.png.im_q85_422.jpg",
+                            "jxl/flower/flower.png.im_q85_440.jpg"}) {
+    RenderPipelineTestInputSettings settings;
+    settings.input_path = input;
+    settings.jpeg_transcode = true;
+    settings.xsize = 2268;
+    settings.ysize = 1512;
+    settings.cparams_descr = "Default";
+    all_tests.push_back(settings);
+  }
+
+#endif
+
+  {
+    RenderPipelineTestInputSettings settings;
+    settings.input_path = "jxl/grayscale_patches.png";
+    settings.xsize = 1011;
+    settings.ysize = 277;
+    settings.cparams_descr = "Patches";
+    all_tests.push_back(settings);
+  }
+
+  {
+    RenderPipelineTestInputSettings settings;
+    settings.input_path = "jxl/grayscale_patches.png";
+    settings.xsize = 1011;
+    settings.ysize = 277;
+    settings.cparams.photon_noise_iso = 1000;
+    settings.cparams_descr = "PatchesAndNoise";
+    all_tests.push_back(settings);
+  }
+
+  {
+    RenderPipelineTestInputSettings settings;
+    settings.input_path = "jxl/grayscale_patches.png";
+    settings.xsize = 1011;
+    settings.ysize = 277;
+    settings.cparams.resampling = 2;
+    settings.cparams_descr = "PatchesAndUps2";
+    all_tests.push_back(settings);
+  }
+
+  return all_tests;
+}
+
+std::ostream& operator<<(std::ostream& os,
+                         const RenderPipelineTestInputSettings& c) {
+  // Basename of the input path; non-alphanumeric characters become '_'.
+  const size_t pos = c.input_path.find_last_of('/');
+  std::string filename =
+      pos == std::string::npos ? c.input_path : c.input_path.substr(pos + 1);
+  // The lambda takes unsigned char because isalnum() has undefined behavior
+  // for negative char values (e.g. non-ASCII bytes where char is signed).
+  std::replace_if(
+      filename.begin(), filename.end(),
+      [](unsigned char ch) { return !isalnum(ch); }, '_');
+  // Output: <basename>_[JPEG_]<xsize>x<ysize>_<cparams_descr>
+  os << filename << "_" << (c.jpeg_transcode ? "JPEG_" : "") << c.xsize << "x"
+     << c.ysize << "_" << c.cparams_descr;
+  return os;
+}
+
+std::string PipelineTestDescription(
+    const testing::TestParamInfo<RenderPipelineTestParam::ParamType>& info) {
+  std::stringstream description;  // Receives the streamed form of the param.
+  description << info.param;
+  return description.str();
+}
+
+JXL_GTEST_INSTANTIATE_TEST_SUITE_P(RenderPipelineTest, RenderPipelineTestParam,
+                                   testing::ValuesIn(GeneratePipelineTests()),
+                                   PipelineTestDescription);  // Name generator.
+
+TEST(RenderPipelineDecodingTest, Animation) {
+  FakeParallelRunner fake_pool(/*order_seed=*/123, /*num_threads=*/8);
+  ThreadPool pool(&JxlFakeParallelRunner, &fake_pool);
+
+  PaddedBytes compressed =
+      jxl::test::ReadTestData("jxl/blending/cropped_traffic_light.jxl");
+
+  CodecInOut io_default;  // Decoded with use_slow_pipeline == false.
+  ASSERT_TRUE(DecodeFile(Span<const uint8_t>(compressed),
+                         /*use_slow_pipeline=*/false, &io_default, &pool));
+  CodecInOut io_slow_pipeline;  // Decoded with use_slow_pipeline == true.
+  ASSERT_TRUE(DecodeFile(Span<const uint8_t>(compressed),
+                         /*use_slow_pipeline=*/true, &io_slow_pipeline, &pool));
+
+  ASSERT_EQ(io_default.frames.size(), io_slow_pipeline.frames.size());
+  for (size_t i = 0; i < io_default.frames.size(); i++) {
+#if JXL_HIGH_PRECISION
+    constexpr float kMaxError = 1e-5;
+#else
+    constexpr float kMaxError = 1e-4;
+#endif
+
+    Image3F fast_pipeline = std::move(*io_default.frames[i].color());
+    Image3F slow_pipeline = std::move(*io_slow_pipeline.frames[i].color());
+    JXL_ASSERT_OK(VerifyRelativeError(slow_pipeline, fast_pipeline, kMaxError,
+                                      kMaxError, _));
+    for (size_t ec = 0; ec < io_default.frames[i].extra_channels().size();
+         ec++) {
+      JXL_ASSERT_OK(VerifyRelativeError(
+          io_slow_pipeline.frames[i].extra_channels()[ec],
+          io_default.frames[i].extra_channels()[ec], kMaxError, kMaxError, _));
+    }
+  }
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/simple_render_pipeline.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/simple_render_pipeline.cc
new file mode 100644
index 0000000000..4495288860
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/simple_render_pipeline.cc
@@ -0,0 +1,266 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/simple_render_pipeline.h"
+
+#include <hwy/base.h>
+
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+#include "lib/jxl/sanitizers.h"
+
+namespace jxl {
+
+void SimpleRenderPipeline::PrepareForThreadsInternal(size_t num,
+                                                     bool use_group_ids) {
+  // The buffers are allocated once; later calls leave them untouched.
+  if (!channel_data_.empty()) return;
+  const auto pad = [](size_t frame_size, size_t shift) {
+    return DivCeil(frame_size, 1 << shift) + kRenderPipelineXOffset * 2;
+  };
+  const auto& shifts = channel_shifts_[0];
+  for (size_t c = 0; c < shifts.size(); c++) {
+    const size_t xs = pad(frame_dimensions_.xsize_upsampled, shifts[c].first);
+    const size_t ys = pad(frame_dimensions_.ysize_upsampled, shifts[c].second);
+    channel_data_.push_back(ImageF(xs, ys));
+    // Poison the new plane so that MSAN can flag reads of unwritten pixels.
+    msan::PoisonImage(channel_data_.back());
+  }
+}
+
+Rect SimpleRenderPipeline::MakeChannelRect(size_t group_id, size_t channel) {
+  // log2 of the ratio between the upsampled and the plain padded frame width.
+  size_t base_color_shift =
+      CeilLog2Nonzero(frame_dimensions_.xsize_upsampled_padded /
+                      frame_dimensions_.xsize_padded);
+  const auto& shift = channel_shifts_[0][channel];
+  const auto group_dim = frame_dimensions_.group_dim << base_color_shift;
+  const size_t xgroupdim = group_dim >> shift.first;
+  const size_t ygroupdim = group_dim >> shift.second;
+  const size_t gx = group_id % frame_dimensions_.xsize_groups;
+  const size_t gy = group_id / frame_dimensions_.xsize_groups;
+  // Size of this channel at its own resolution; bounds the returned Rect.
+  const auto xend =
+      DivCeil(frame_dimensions_.xsize_upsampled, 1 << shift.first);
+  const auto yend =
+      DivCeil(frame_dimensions_.ysize_upsampled, 1 << shift.second);
+  return Rect(kRenderPipelineXOffset + gx * xgroupdim,
+              kRenderPipelineXOffset + gy * ygroupdim, xgroupdim, ygroupdim,
+              kRenderPipelineXOffset + xend, kRenderPipelineXOffset + yend);
+}
+
+std::vector<std::pair<ImageF*, Rect>> SimpleRenderPipeline::PrepareBuffers(
+    size_t group_id, size_t thread_id) {
+  std::vector<std::pair<ImageF*, Rect>> buffers;  // One entry per channel.
+  for (size_t ch = 0; ch < channel_data_.size(); ++ch) {
+    buffers.emplace_back(&channel_data_[ch], MakeChannelRect(group_id, ch));
+  }
+  return buffers;
+}
+
+void SimpleRenderPipeline::ProcessBuffers(size_t group_id, size_t thread_id) {
+  for (size_t c = 0; c < channel_data_.size(); c++) {
+    Rect r = MakeChannelRect(group_id, c);
+    (void)r;
+    JXL_CHECK_PLANE_INITIALIZED(channel_data_[c], r, c);
+  }
+  // Run all stages over the whole frame, but only when a new pass is ready.
+  if (PassesWithAllInput() <= processed_passes_) return;
+  processed_passes_++;
+
+  for (size_t stage_id = 0; stage_id < stages_.size(); stage_id++) {
+    const auto& stage = stages_[stage_id];
+    // Prepare buffers for kInOut channels.
+    std::vector<ImageF> new_channels(channel_data_.size());
+    std::vector<ImageF*> output_channels(channel_data_.size());
+
+    std::vector<std::pair<size_t, size_t>> input_sizes(channel_data_.size());
+    for (size_t c = 0; c < channel_data_.size(); c++) {
+      input_sizes[c] =
+          std::make_pair(channel_data_[c].xsize() - kRenderPipelineXOffset * 2,
+                         channel_data_[c].ysize() - kRenderPipelineXOffset * 2);
+    }
+
+    for (size_t c = 0; c < channel_data_.size(); c++) {
+      if (stage->GetChannelMode(c) != RenderPipelineChannelMode::kInOut) {
+        continue;
+      }
+      // Ensure that the newly allocated channels are large enough to avoid
+      // problems with padding.
+      new_channels[c] =
+          ImageF(frame_dimensions_.xsize_upsampled_padded +
+                     kRenderPipelineXOffset * 2 + hwy::kMaxVectorSize * 8,
+                 frame_dimensions_.ysize_upsampled_padded +
+                     kRenderPipelineXOffset * 2);
+      new_channels[c].ShrinkTo(
+          (input_sizes[c].first << stage->settings_.shift_x) +
+              kRenderPipelineXOffset * 2,
+          (input_sizes[c].second << stage->settings_.shift_y) +
+              kRenderPipelineXOffset * 2);
+      output_channels[c] = &new_channels[c];
+    }
+
+    auto get_row = [&](size_t c, int64_t y) {
+      return channel_data_[c].Row(kRenderPipelineXOffset + y) +
+             kRenderPipelineXOffset;
+    };
+
+    // Add mirrored pixels to all kInOut channels.
+    for (size_t c = 0; c < channel_data_.size(); c++) {
+      if (stage->GetChannelMode(c) != RenderPipelineChannelMode::kInOut) {
+        continue;
+      }
+      // Horizontal mirroring.
+      for (size_t y = 0; y < input_sizes[c].second; y++) {
+        float* row = get_row(c, y);
+        for (size_t ix = 0; ix < stage->settings_.border_x; ix++) {
+          *(row - ix - 1) = row[Mirror(-ssize_t(ix) - 1, input_sizes[c].first)];
+        }
+        for (size_t ix = 0; ix < stage->settings_.border_x; ix++) {
+          *(row + ix + input_sizes[c].first) =
+              row[Mirror(ix + input_sizes[c].first, input_sizes[c].first)];
+        }
+      }
+      // Vertical mirroring.
+      for (int y = 0; y < static_cast<int>(stage->settings_.border_y); y++) {
+        memcpy(get_row(c, -y - 1) - stage->settings_.border_x,
+               get_row(c, Mirror(-ssize_t(y) - 1, input_sizes[c].second)) -
+                   stage->settings_.border_x,
+               sizeof(float) *
+                   (input_sizes[c].first + 2 * stage->settings_.border_x));
+      }
+      for (int y = 0; y < static_cast<int>(stage->settings_.border_y); y++) {
+        memcpy(
+            get_row(c, input_sizes[c].second + y) - stage->settings_.border_x,
+            get_row(c,
+                    Mirror(input_sizes[c].second + y, input_sizes[c].second)) -
+                stage->settings_.border_x,
+            sizeof(float) *
+                (input_sizes[c].first + 2 * stage->settings_.border_x));
+      }
+    }
+
+    size_t ysize = 0;
+    size_t xsize = 0;
+    for (size_t c = 0; c < channel_data_.size(); c++) {
+      if (stage->GetChannelMode(c) == RenderPipelineChannelMode::kIgnored) {
+        continue;
+      }
+      ysize = std::max(input_sizes[c].second, ysize);
+      xsize = std::max(input_sizes[c].first, xsize);
+    }
+
+    JXL_ASSERT(ysize != 0);
+    JXL_ASSERT(xsize != 0);
+
+    RenderPipelineStage::RowInfo input_rows(channel_data_.size());
+    RenderPipelineStage::RowInfo output_rows(channel_data_.size());
+
+    // Run the pipeline.
+    {
+      stage->SetInputSizes(input_sizes);
+      int border_y = stage->settings_.border_y;
+      for (size_t y = 0; y < ysize; y++) {
+        // Prepare input rows.
+        for (size_t c = 0; c < channel_data_.size(); c++) {
+          if (stage->GetChannelMode(c) == RenderPipelineChannelMode::kIgnored) {
+            continue;
+          }
+          input_rows[c].resize(2 * border_y + 1);
+          for (int iy = -border_y; iy <= border_y; iy++) {
+            input_rows[c][iy + border_y] =
+                channel_data_[c].Row(y + kRenderPipelineXOffset + iy);
+          }
+        }
+        // Prepare output rows.
+        for (size_t c = 0; c < channel_data_.size(); c++) {
+          if (!output_channels[c]) continue;
+          output_rows[c].resize(1 << stage->settings_.shift_y);
+          for (size_t iy = 0; iy < output_rows[c].size(); iy++) {
+            output_rows[c][iy] = output_channels[c]->Row(
+                (y << stage->settings_.shift_y) + iy + kRenderPipelineXOffset);
+          }
+        }
+        stage->ProcessRow(input_rows, output_rows, /*xextra=*/0, xsize,
+                          /*xpos=*/0, y, thread_id);
+      }
+    }
+
+    // Move new channels to current channels.
+    for (size_t c = 0; c < channel_data_.size(); c++) {
+      if (stage->GetChannelMode(c) != RenderPipelineChannelMode::kInOut) {
+        continue;
+      }
+      channel_data_[c] = std::move(new_channels[c]);
+    }
+    for (size_t c = 0; c < channel_data_.size(); c++) {
+      size_t next_stage = std::min(stage_id + 1, channel_shifts_.size() - 1);
+      size_t xsize = DivCeil(frame_dimensions_.xsize_upsampled,
+                             1 << channel_shifts_[next_stage][c].first);
+      size_t ysize = DivCeil(frame_dimensions_.ysize_upsampled,
+                             1 << channel_shifts_[next_stage][c].second);
+      channel_data_[c].ShrinkTo(xsize + 2 * kRenderPipelineXOffset,
+                                ysize + 2 * kRenderPipelineXOffset);
+      JXL_CHECK_PLANE_INITIALIZED(
+          channel_data_[c],
+          Rect(kRenderPipelineXOffset, kRenderPipelineXOffset, xsize, ysize),
+          c);
+    }
+    // From this stage on, buffers are image-sized rather than frame-sized.
+    if (stage->SwitchToImageDimensions()) {
+      size_t image_xsize, image_ysize;
+      FrameOrigin frame_origin;
+      stage->GetImageDimensions(&image_xsize, &image_ysize, &frame_origin);
+      frame_dimensions_.Set(image_xsize, image_ysize, 0, 0, 0, false, 1);
+      std::vector<ImageF> old_channels = std::move(channel_data_);
+      channel_data_.clear();
+      channel_data_.reserve(old_channels.size());
+      for (size_t c = 0; c < old_channels.size(); c++) {
+        channel_data_.emplace_back(2 * kRenderPipelineXOffset + image_xsize,
+                                   2 * kRenderPipelineXOffset + image_ysize);
+      }
+      for (size_t y = 0; y < image_ysize; ++y) {
+        for (size_t c = 0; c < channel_data_.size(); c++) {
+          output_rows[c].resize(1);
+          output_rows[c][0] = channel_data_[c].Row(kRenderPipelineXOffset + y);
+        }
+        // TODO(sboukortt): consider doing this only on the parts of the
+        // background that won't be occluded.
+        stage->ProcessPaddingRow(output_rows, image_xsize, 0, y);
+      }
+      ssize_t x0 = frame_origin.x0;
+      ssize_t y0 = frame_origin.y0;
+      size_t x0_fg = 0;  // First frame column that lies inside the image.
+      size_t y0_fg = 0;  // First frame row that lies inside the image.
+      if (x0 < 0) {
+        xsize += x0;
+        x0_fg -= x0;
+        x0 = 0;
+      }
+      if (x0 + xsize > image_xsize) {
+        xsize = image_xsize - x0;
+      }
+      if (y0 < 0) {
+        ysize += y0;
+        y0_fg -= y0;  // Skip the frame rows that lie above the image.
+        y0 = 0;
+      }
+      if (y0 + ysize > image_ysize) {
+        ysize = image_ysize - y0;
+      }
+      const Rect rect_fg_relative_to_image =
+          Rect(x0, y0, xsize, ysize)
+              .Translate(kRenderPipelineXOffset, kRenderPipelineXOffset);
+      const Rect rect_fg =
+          Rect(x0_fg, y0_fg, xsize, ysize)
+              .Translate(kRenderPipelineXOffset, kRenderPipelineXOffset);
+      for (size_t c = 0; c < channel_data_.size(); c++) {
+        CopyImageTo(rect_fg, old_channels[c], rect_fg_relative_to_image,
+                    &channel_data_[c]);
+      }
+    }
+  }
+}
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/simple_render_pipeline.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/simple_render_pipeline.h
new file mode 100644
index 0000000000..10f4505912
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/simple_render_pipeline.h
@@ -0,0 +1,37 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_SIMPLE_RENDER_PIPELINE_H_
+#define LIB_JXL_RENDER_PIPELINE_SIMPLE_RENDER_PIPELINE_H_
+
+#include <stdint.h>
+
+#include "lib/jxl/render_pipeline/render_pipeline.h"
+
+namespace jxl {
+
+// A RenderPipeline that is "obviously correct"; it may use potentially large
+// amounts of memory and be slow. It is intended to be used mostly for testing
+// purposes.
+class SimpleRenderPipeline : public RenderPipeline {
+  std::vector<std::pair<ImageF*, Rect>> PrepareBuffers(
+      size_t group_id, size_t thread_id) override;
+
+  void ProcessBuffers(size_t group_id, size_t thread_id) override;
+
+  void PrepareForThreadsInternal(size_t num, bool use_group_ids) override;
+
+  // Full frame buffers, one per channel. Each dimension is allocated with
+  // 2 * kRenderPipelineXOffset extra pixels of padding.
+  std::vector<ImageF> channel_data_;
+  size_t processed_passes_ = 0;  // Passes already run through the stages.
+
+ private:
+  Rect MakeChannelRect(size_t group_id, size_t channel);  // Group's rectangle.
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_SIMPLE_RENDER_PIPELINE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_blending.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_blending.cc
new file mode 100644
index 0000000000..b6668c5625
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_blending.cc
@@ -0,0 +1,247 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_blending.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_blending.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/blending.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+class BlendingStage : public RenderPipelineStage {  // blends in place
+ public:
+  explicit BlendingStage(const PassesDecoderState* dec_state,
+                         const ColorEncoding& frame_color_encoding)  // unused
+      : RenderPipelineStage(RenderPipelineStage::Settings()),
+        state_(*dec_state->shared) {
+    image_xsize_ = state_.frame_header.nonserialized_metadata->xsize();
+    image_ysize_ = state_.frame_header.nonserialized_metadata->ysize();
+    extra_channel_info_ =
+        &state_.frame_header.nonserialized_metadata->m.extra_channel_info;
+    info_ = state_.frame_header.blending_info;
+    const std::vector<BlendingInfo>& ec_info =
+        state_.frame_header.extra_channel_blending_info;
+    const ImageBundle& bg = state_.reference_frames[info_.source].frame;
+    bg_ = &bg;  // color background; may be empty
+    if (bg.xsize() == 0 || bg.ysize() == 0) {
+      zeroes_.resize(image_xsize_, 0.f);  // no background: blend onto zeros
+    } else if (state_.reference_frames[info_.source].ib_is_in_xyb) {
+      initialized_ = JXL_FAILURE(
+          "Trying to blend XYB reference frame %i and non-XYB frame",
+          info_.source);
+      return;  // the error is reported through IsInitialized()
+    } else if (std::any_of(ec_info.begin(), ec_info.end(),
+                           [this](const BlendingInfo& info) {
+                             const ImageBundle& bg =
+                                 state_.reference_frames[info.source].frame;
+                             return bg.xsize() == 0 || bg.ysize() == 0;
+                           })) {
+      zeroes_.resize(image_xsize_, 0.f);  // some EC background is empty
+    }
+    // A non-empty background must cover the image and have origin (0, 0).
+    auto verify_bg_size = [&](const ImageBundle& bg) -> Status {
+      if (bg.xsize() != 0 && bg.ysize() != 0 &&
+          (bg.xsize() < image_xsize_ || bg.ysize() < image_ysize_ ||
+           bg.origin.x0 != 0 || bg.origin.y0 != 0)) {
+        return JXL_FAILURE("Trying to use a %" PRIuS "x%" PRIuS
+                           " crop as a background",
+                           bg.xsize(), bg.ysize());
+      }
+      return true;
+    };
+
+    Status ok = verify_bg_size(bg);
+    for (const auto& info : ec_info) {
+      const ImageBundle& bg = state_.reference_frames[info.source].frame;
+      if (!!ok) ok = verify_bg_size(bg);  // keep the first failure
+    }
+    if (!ok) {
+      initialized_ = ok;
+      return;
+    }
+
+    if (state_.metadata->m.xyb_encoded) {
+      if (!dec_state->output_encoding_info.color_encoding_is_original) {
+        initialized_ = JXL_FAILURE("Blending in unsupported color space");
+        return;
+      }
+    }
+    // Slot 0 describes the color channels, slot 1 + i extra channel i.
+    blending_info_.resize(ec_info.size() + 1);
+    auto make_blending = [&](const BlendingInfo& info, PatchBlending* pb) {
+      pb->alpha_channel = info.alpha_channel;
+      pb->clamp = info.clamp;
+      switch (info.mode) {  // translate BlendMode into PatchBlendMode
+        case BlendMode::kReplace: {
+          pb->mode = PatchBlendMode::kReplace;
+          break;
+        }
+        case BlendMode::kAdd: {
+          pb->mode = PatchBlendMode::kAdd;
+          break;
+        }
+        case BlendMode::kMul: {
+          pb->mode = PatchBlendMode::kMul;
+          break;
+        }
+        case BlendMode::kBlend: {
+          pb->mode = PatchBlendMode::kBlendAbove;
+          break;
+        }
+        case BlendMode::kAlphaWeightedAdd: {
+          pb->mode = PatchBlendMode::kAlphaWeightedAddAbove;
+          break;
+        }
+        default: {
+          JXL_ABORT("Invalid blend mode");  // should have failed to decode
+        }
+      }
+    };
+    make_blending(info_, &blending_info_[0]);
+    for (size_t i = 0; i < ec_info.size(); i++) {
+      make_blending(ec_info[i], &blending_info_[1 + i]);
+    }
+  }
+
+  Status IsInitialized() const override { return initialized_; }
+  // Blends one foreground row onto the matching background row, in place.
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    PROFILER_ZONE("Blend");
+    JXL_ASSERT(initialized_);
+    const FrameOrigin& frame_origin = state_.frame_header.frame_origin;
+    ssize_t bg_xpos = frame_origin.x0 + static_cast<ssize_t>(xpos);
+    ssize_t bg_ypos = frame_origin.y0 + static_cast<ssize_t>(ypos);
+    int offset = 0;  // foreground samples skipped on the left
+    if (bg_xpos + static_cast<ssize_t>(xsize) <= 0 ||
+        frame_origin.x0 >= static_cast<ssize_t>(image_xsize_) || bg_ypos < 0 ||
+        bg_ypos >= static_cast<ssize_t>(image_ysize_)) {
+      return;  // nothing of this row falls inside the image
+    }
+    if (bg_xpos < 0) {  // clip the part left of the image
+      offset -= bg_xpos;
+      xsize += bg_xpos;
+      bg_xpos = 0;
+    }
+    if (bg_xpos + xsize > image_xsize_) {  // clip the part right of the image
+      xsize =
+          std::max<ssize_t>(0, static_cast<ssize_t>(image_xsize_) - bg_xpos);
+    }
+    std::vector<const float*> bg_row_ptrs_(input_rows.size());  // locals
+    std::vector<float*> fg_row_ptrs_(input_rows.size());
+    size_t num_c = std::min(input_rows.size(), extra_channel_info_->size() + 3);
+    for (size_t c = 0; c < num_c; ++c) {
+      fg_row_ptrs_[c] = GetInputRow(input_rows, c, 0) + offset;
+      if (c < 3) {  // color channel: use the color reference frame
+        bg_row_ptrs_[c] = bg_->xsize() != 0 && bg_->ysize() != 0
+                              ? bg_->color().ConstPlaneRow(c, bg_ypos) + bg_xpos
+                              : zeroes_.data();
+      } else {  // extra channel: it has its own reference frame
+        const ImageBundle& ec_bg =
+            state_
+                .reference_frames[state_.frame_header
+                                      .extra_channel_blending_info[c - 3]
+                                      .source]
+                .frame;
+        bg_row_ptrs_[c] =
+            ec_bg.xsize() != 0 && ec_bg.ysize() != 0
+                ? ec_bg.extra_channels()[c - 3].ConstRow(bg_ypos) + bg_xpos
+                : zeroes_.data();
+      }
+    }
+    PerformBlending(bg_row_ptrs_.data(), fg_row_ptrs_.data(),
+                    fg_row_ptrs_.data(), 0, xsize, blending_info_[0],
+                    blending_info_.data() + 1, *extra_channel_info_);
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return RenderPipelineChannelMode::kInPlace;
+  }
+
+  bool SwitchToImageDimensions() const override { return true; }
+  // Reports the full image size and the origin of the current frame.
+  void GetImageDimensions(size_t* xsize, size_t* ysize,
+                          FrameOrigin* frame_origin) const override {
+    *xsize = image_xsize_;
+    *ysize = image_ysize_;
+    *frame_origin = state_.frame_header.frame_origin;
+  }
+  // Fills xsize samples per channel from the background, or zeros if empty.
+  void ProcessPaddingRow(const RowInfo& output_rows, size_t xsize, size_t xpos,
+                         size_t ypos) const override {
+    if (bg_->xsize() == 0 || bg_->ysize() == 0) {
+      for (size_t c = 0; c < 3; ++c) {
+        memset(GetInputRow(output_rows, c, 0), 0, xsize * sizeof(float));
+      }
+    } else {
+      for (size_t c = 0; c < 3; ++c) {
+        memcpy(GetInputRow(output_rows, c, 0),
+               bg_->color().ConstPlaneRow(c, ypos) + xpos,
+               xsize * sizeof(float));
+      }
+    }
+    for (size_t ec = 0; ec < extra_channel_info_->size(); ++ec) {
+      const ImageBundle& ec_bg =
+          state_
+              .reference_frames
+                  [state_.frame_header.extra_channel_blending_info[ec].source]
+              .frame;
+      if (ec_bg.xsize() == 0 || ec_bg.ysize() == 0) {
+        memset(GetInputRow(output_rows, 3 + ec, 0), 0, xsize * sizeof(float));
+      } else {
+        memcpy(GetInputRow(output_rows, 3 + ec, 0),
+               ec_bg.extra_channels()[ec].ConstRow(ypos) + xpos,
+               xsize * sizeof(float));
+      }
+    }
+  }
+
+  const char* GetName() const override { return "Blending"; }
+
+ private:
+  const PassesSharedState& state_;
+  BlendingInfo info_;  // blending info of the color channels
+  const ImageBundle* bg_;  // reference frame behind the color channels
+  Status initialized_ = true;  // outcome of the constructor's checks
+  size_t image_xsize_;
+  size_t image_ysize_;
+  std::vector<PatchBlending> blending_info_;  // [0] color, [1 + i] extra ch. i
+  const std::vector<ExtraChannelInfo>* extra_channel_info_;
+  std::vector<float> zeroes_;  // zero row standing in for empty backgrounds
+};
+
+std::unique_ptr<RenderPipelineStage> GetBlendingStage(
+    const PassesDecoderState* dec_state,
+    const ColorEncoding& frame_color_encoding) {  // per-SIMD-target factory
+  return jxl::make_unique<BlendingStage>(dec_state, frame_color_encoding);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(GetBlendingStage);  // registers the per-target implementations
+// Public entry point: forwards to an implementation via HWY_DYNAMIC_DISPATCH.
+std::unique_ptr<RenderPipelineStage> GetBlendingStage(
+    const PassesDecoderState* dec_state,
+    const ColorEncoding& frame_color_encoding) {
+  return HWY_DYNAMIC_DISPATCH(GetBlendingStage)(dec_state,
+                                                frame_color_encoding);
+}
+
+}  // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_blending.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_blending.h
new file mode 100644
index 0000000000..c8db7490cd
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_blending.h
@@ -0,0 +1,24 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_BLENDING_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_BLENDING_H_
+
+#include <utility>
+
+#include "lib/jxl/dec_cache.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+#include "lib/jxl/splines.h"
+
+namespace jxl {
+
+// Creates the stage that blends the decoded frame onto its reference frames.
+std::unique_ptr<RenderPipelineStage> GetBlendingStage(
+    const PassesDecoderState* dec_state,
+    const ColorEncoding& frame_color_encoding);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_STAGE_BLENDING_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_chroma_upsampling.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_chroma_upsampling.cc
new file mode 100644
index 0000000000..9b73ee91f1
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_chroma_upsampling.cc
@@ -0,0 +1,129 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_chroma_upsampling.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_chroma_upsampling.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/simd_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::MulAdd;
+
+class HorizontalChromaUpsamplingStage : public RenderPipelineStage {
+ public:
+  explicit HorizontalChromaUpsamplingStage(size_t channel)
+      : RenderPipelineStage(RenderPipelineStage::Settings::ShiftX(
+            /*shift=*/1, /*border=*/1)),
+        c_(channel) {}
+  // Each input pixel yields two outputs: 3/4 of it plus 1/4 of a neighbour.
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    PROFILER_ZONE("HorizontalChromaUpsampling");
+    HWY_FULL(float) df;
+    auto quarter = Set(df, 0.25f);
+    auto three_quarters = Set(df, 0.75f);
+    const float* src = GetInputRow(input_rows, c_, 0);
+    float* dst = GetOutputRow(output_rows, c_, 0);
+    xextra = RoundUpTo(xextra, Lanes(df));
+    const ssize_t x_end = static_cast<ssize_t>(xsize + xextra);
+    for (ssize_t x = -xextra; x < x_end; x += Lanes(df)) {
+      auto before = LoadU(df, src + x - 1);
+      auto after = LoadU(df, src + x + 1);
+      auto center = Mul(LoadU(df, src + x), three_quarters);
+      auto left = MulAdd(quarter, before, center);
+      auto right = MulAdd(quarter, after, center);
+      StoreInterleaved(df, left, right, dst + x * 2);
+    }
+  }
+  // Only the configured channel is read and rewritten; all others are ignored.
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    if (c != c_) return RenderPipelineChannelMode::kIgnored;
+    return RenderPipelineChannelMode::kInOut;
+  }
+
+  const char* GetName() const override { return "HChromaUps"; }
+
+ private:
+  size_t c_;  // index of the channel this stage upsamples
+};
+
+class VerticalChromaUpsamplingStage : public RenderPipelineStage {
+ public:
+  explicit VerticalChromaUpsamplingStage(size_t channel)
+      : RenderPipelineStage(RenderPipelineStage::Settings::ShiftY(
+            /*shift=*/1, /*border=*/1)),
+        c_(channel) {}
+  // Turns one input row into two: 3/4 of it plus 1/4 of the row above/below.
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    PROFILER_ZONE("VerticalChromaUpsampling");
+    HWY_FULL(float) df;
+    auto quarter = Set(df, 0.25f);
+    auto three_quarters = Set(df, 0.75f);
+    const float* above = GetInputRow(input_rows, c_, -1);
+    const float* center = GetInputRow(input_rows, c_, 0);
+    const float* below = GetInputRow(input_rows, c_, 1);
+    float* out_upper = GetOutputRow(output_rows, c_, 0);
+    float* out_lower = GetOutputRow(output_rows, c_, 1);
+    xextra = RoundUpTo(xextra, Lanes(df));
+    const ssize_t x_end = static_cast<ssize_t>(xsize + xextra);
+    for (ssize_t x = -xextra; x < x_end; x += Lanes(df)) {
+      auto base = Mul(LoadU(df, center + x), three_quarters);
+      auto from_above = LoadU(df, above + x);
+      auto from_below = LoadU(df, below + x);
+      Store(MulAdd(from_above, quarter, base), df, out_upper + x);
+      Store(MulAdd(from_below, quarter, base), df, out_lower + x);
+    }
+  }
+
+  // Only the configured channel is read and rewritten; all others are ignored.
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    if (c != c_) return RenderPipelineChannelMode::kIgnored;
+    return RenderPipelineChannelMode::kInOut;
+  }
+
+  const char* GetName() const override { return "VChromaUps"; }
+
+ private:
+  size_t c_;  // index of the channel this stage upsamples
+};
+
+// Per-target factory: picks the horizontal or the vertical upsampling stage.
+std::unique_ptr<RenderPipelineStage> GetChromaUpsamplingStage(size_t channel,
+                                                              bool horizontal) {
+  if (!horizontal) {
+    return jxl::make_unique<VerticalChromaUpsamplingStage>(channel);
+  }
+  return jxl::make_unique<HorizontalChromaUpsamplingStage>(channel);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(GetChromaUpsamplingStage);  // registers per-target implementations
+// Public entry point: forwards to an implementation via HWY_DYNAMIC_DISPATCH.
+std::unique_ptr<RenderPipelineStage> GetChromaUpsamplingStage(size_t channel,
+                                                              bool horizontal) {
+  return HWY_DYNAMIC_DISPATCH(GetChromaUpsamplingStage)(channel, horizontal);
+}
+
+}  // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_chroma_upsampling.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_chroma_upsampling.h
new file mode 100644
index 0000000000..b8bfc15f5f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_chroma_upsampling.h
@@ -0,0 +1,27 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_CHROMA_UPSAMPLING_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_CHROMA_UPSAMPLING_H_
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/loop_filter.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+// Creates a stage that doubles the resolution of `channel`, horizontally if
+// `horizontal` is true and vertically otherwise.
+std::unique_ptr<RenderPipelineStage> GetChromaUpsamplingStage(size_t channel,
+                                                              bool horizontal);
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_STAGE_CHROMA_UPSAMPLING_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_epf.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_epf.cc
new file mode 100644
index 0000000000..d59c497843
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_epf.cc
@@ -0,0 +1,524 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_epf.h"
+
+#include "lib/jxl/epf.h"
+#include "lib/jxl/sanitizers.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_epf.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+// TODO(veluca): In principle, vectors could be not capped, if we want to deal
+// with having two different sigma values in a single vector.
+using DF = HWY_CAPPED(float, 8);
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::AbsDiff;
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::Div;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::MulAdd;
+using hwy::HWY_NAMESPACE::Vec;
+using hwy::HWY_NAMESPACE::VFromD;
+using hwy::HWY_NAMESPACE::ZeroIfNegative;
+
+// Computes sad * inv_sigma + 1 and zeroes negative lanes; `thres` is unused.
+JXL_INLINE Vec<DF> Weight(Vec<DF> sad, Vec<DF> inv_sigma, Vec<DF> thres) {
+  return ZeroIfNegative(MulAdd(sad, inv_sigma, Set(DF(), 1.0f)));
+}
+
+// 5x5 plus-shaped kernel with 5 SADs per pixel (3x3 plus-shaped). So this makes
+// this filter a 7x7 filter.
+class EPF0Stage : public RenderPipelineStage {
+ public:
+  EPF0Stage(const LoopFilter& lf, const ImageF& sigma)
+      : RenderPipelineStage(RenderPipelineStage::Settings::Symmetric(
+            /*shift=*/0, /*border=*/3)),
+        lf_(lf),
+        sigma_(&sigma) {}
+  // Adds pixel (row, x) of channels 0-2 to X/Y/B, and its weight to w.
+  template <bool aligned>
+  JXL_INLINE void AddPixel(int row, float* JXL_RESTRICT rows[3][7], ssize_t x,
+                           Vec<DF> sad, Vec<DF> inv_sigma,
+                           Vec<DF>* JXL_RESTRICT X, Vec<DF>* JXL_RESTRICT Y,
+                           Vec<DF>* JXL_RESTRICT B,
+                           Vec<DF>* JXL_RESTRICT w) const {
+    auto cx = aligned ? Load(DF(), rows[0][3 + row] + x)
+                      : LoadU(DF(), rows[0][3 + row] + x);
+    auto cy = aligned ? Load(DF(), rows[1][3 + row] + x)
+                      : LoadU(DF(), rows[1][3 + row] + x);
+    auto cb = aligned ? Load(DF(), rows[2][3 + row] + x)
+                      : LoadU(DF(), rows[2][3 + row] + x);
+
+    auto weight = Weight(sad, inv_sigma, Set(DF(), lf_.epf_pass1_zeroflush));
+    *w = Add(*w, weight);
+    *X = MulAdd(weight, cx, *X);
+    *Y = MulAdd(weight, cy, *Y);
+    *B = MulAdd(weight, cb, *B);
+  }
+  // Filters one row of channels 0-2 as a weighted average over sads_off.
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    DF df;
+    // One SAD accumulator per entry of sads_off below.
+    using V = decltype(Zero(df));
+    V t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, tA, tB;
+    V* sads[12] = {&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7, &t8, &t9, &tA, &tB};
+
+    xextra = RoundUpTo(xextra, Lanes(df));
+    const float* JXL_RESTRICT row_sigma =
+        sigma_->Row(ypos / kBlockDim + kSigmaPadding);
+
+    float sm = lf_.epf_pass0_sigma_scale * 1.65;
+    float bsm = sm * lf_.epf_border_sad_mul;  // used on block borders
+    // Per-column multipliers, indexed by the column within a block.
+    HWY_ALIGN float sad_mul_center[kBlockDim] = {bsm, sm, sm, sm,
+                                                 sm,  sm, sm, bsm};
+    HWY_ALIGN float sad_mul_border[kBlockDim] = {bsm, bsm, bsm, bsm,
+                                                 bsm, bsm, bsm, bsm};
+    float* JXL_RESTRICT rows[3][7];  // rows[c][3] is the current row
+    for (size_t c = 0; c < 3; c++) {
+      for (int i = 0; i < 7; i++) {
+        rows[c][i] = GetInputRow(input_rows, c, i - 3);
+      }
+    }
+    // The first and last row of a block use the border multiplier throughout.
+    const float* sad_mul =
+        (ypos % kBlockDim == 0 || ypos % kBlockDim == kBlockDim - 1)
+            ? sad_mul_border
+            : sad_mul_center;
+
+    for (ssize_t x = -xextra; x < static_cast<ssize_t>(xsize + xextra);
+         x += Lanes(df)) {
+      size_t bx = (x + xpos + kSigmaPadding * kBlockDim) / kBlockDim;
+      size_t ix = (x + xpos) % kBlockDim;
+      // Where the stored sigma value is below kMinSigma, copy the input.
+      if (row_sigma[bx] < kMinSigma) {
+        for (size_t c = 0; c < 3; c++) {
+          auto px = Load(df, rows[c][3 + 0] + x);
+          StoreU(px, df, GetOutputRow(output_rows, c, 0) + x);
+        }
+        continue;
+      }
+
+      const auto sm = Load(df, sad_mul + ix);
+      const auto inv_sigma = Mul(Set(df, row_sigma[bx]), sm);
+
+      for (size_t i = 0; i < 12; i++) *sads[i] = Zero(df);
+      constexpr std::array<int, 2> sads_off[12] = {  // {row, column} offsets
+          {{-2, 0}}, {{-1, -1}}, {{-1, 0}}, {{-1, 1}}, {{0, -2}}, {{0, -1}},
+          {{0, 1}},  {{0, 2}},   {{1, -1}}, {{1, 0}},  {{1, 1}},  {{2, 0}},
+      };
+
+      // compute sads
+      // TODO(veluca): consider unrolling and optimizing this.
+      for (size_t c = 0; c < 3; c++) {
+        auto scale = Set(df, lf_.epf_channel_scale[c]);
+        for (size_t i = 0; i < 12; i++) {
+          auto sad = Zero(df);
+          constexpr std::array<int, 2> plus_off[] = {  // {row, column}
+              {{0, 0}}, {{-1, 0}}, {{0, -1}}, {{1, 0}}, {{0, 1}}};
+          for (size_t j = 0; j < 5; j++) {
+            const auto r11 =
+                LoadU(df, rows[c][3 + plus_off[j][0]] + x + plus_off[j][1]);
+            const auto c11 =
+                LoadU(df, rows[c][3 + sads_off[i][0] + plus_off[j][0]] + x +
+                              sads_off[i][1] + plus_off[j][1]);
+            sad = Add(sad, AbsDiff(r11, c11));
+          }
+          *sads[i] = MulAdd(sad, scale, *sads[i]);
+        }
+      }
+      const auto x_cc = Load(df, rows[0][3 + 0] + x);
+      const auto y_cc = Load(df, rows[1][3 + 0] + x);
+      const auto b_cc = Load(df, rows[2][3 + 0] + x);
+      // The center pixel itself enters the average with weight 1.
+      auto w = Set(df, 1);
+      auto X = x_cc;
+      auto Y = y_cc;
+      auto B = b_cc;
+
+      for (size_t i = 0; i < 12; i++) {
+        AddPixel</*aligned=*/false>(/*row=*/sads_off[i][0], rows,
+                                    x + sads_off[i][1], *sads[i], inv_sigma, &X,
+                                    &Y, &B, &w);
+      }
+#if JXL_HIGH_PRECISION
+      auto inv_w = Div(Set(df, 1.0f), w);
+#else
+      auto inv_w = ApproximateReciprocal(w);
+#endif
+      StoreU(Mul(X, inv_w), df, GetOutputRow(output_rows, 0, 0) + x);
+      StoreU(Mul(Y, inv_w), df, GetOutputRow(output_rows, 1, 0) + x);
+      StoreU(Mul(B, inv_w), df, GetOutputRow(output_rows, 2, 0) + x);
+    }
+  }
+  // Only the first three channels are filtered; the rest are ignored.
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return c < 3 ? RenderPipelineChannelMode::kInOut
+                 : RenderPipelineChannelMode::kIgnored;
+  }
+
+  const char* GetName() const override { return "EPF0"; }
+
+ private:
+  LoopFilter lf_;  // copy of the filter parameters
+  const ImageF* sigma_;  // not owned; read per block via Row()
+};
+
+// 3x3 plus-shaped kernel with 5 SADs per pixel (also 3x3 plus-shaped). So this
+// makes this filter a 5x5 filter.
+class EPF1Stage : public RenderPipelineStage {
+ public:
+  EPF1Stage(const LoopFilter& lf, const ImageF& sigma)
+      : RenderPipelineStage(RenderPipelineStage::Settings::Symmetric(
+            /*shift=*/0, /*border=*/2)),
+        lf_(lf),
+        sigma_(&sigma) {}
+  // Adds pixel (row, x) of channels 0-2 to X/Y/B, and its weight to w.
+  template <bool aligned>
+  JXL_INLINE void AddPixel(int row, float* JXL_RESTRICT rows[3][5], ssize_t x,
+                           Vec<DF> sad, Vec<DF> inv_sigma,
+                           Vec<DF>* JXL_RESTRICT X, Vec<DF>* JXL_RESTRICT Y,
+                           Vec<DF>* JXL_RESTRICT B,
+                           Vec<DF>* JXL_RESTRICT w) const {
+    auto cx = aligned ? Load(DF(), rows[0][2 + row] + x)
+                      : LoadU(DF(), rows[0][2 + row] + x);
+    auto cy = aligned ? Load(DF(), rows[1][2 + row] + x)
+                      : LoadU(DF(), rows[1][2 + row] + x);
+    auto cb = aligned ? Load(DF(), rows[2][2 + row] + x)
+                      : LoadU(DF(), rows[2][2 + row] + x);
+
+    auto weight = Weight(sad, inv_sigma, Set(DF(), lf_.epf_pass1_zeroflush));
+    *w = Add(*w, weight);
+    *X = MulAdd(weight, cx, *X);
+    *Y = MulAdd(weight, cy, *Y);
+    *B = MulAdd(weight, cb, *B);
+  }
+  // Filters one row of channels 0-2 using the four direct neighbours.
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    DF df;
+    xextra = RoundUpTo(xextra, Lanes(df));
+    const float* JXL_RESTRICT row_sigma =
+        sigma_->Row(ypos / kBlockDim + kSigmaPadding);
+
+    float sm = 1.65f;
+    float bsm = sm * lf_.epf_border_sad_mul;  // used on block borders
+    // Per-column multipliers, indexed by the column within a block.
+    HWY_ALIGN float sad_mul_center[kBlockDim] = {bsm, sm, sm, sm,
+                                                 sm,  sm, sm, bsm};
+    HWY_ALIGN float sad_mul_border[kBlockDim] = {bsm, bsm, bsm, bsm,
+                                                 bsm, bsm, bsm, bsm};
+
+    float* JXL_RESTRICT rows[3][5];  // rows[c][2] is the current row
+    for (size_t c = 0; c < 3; c++) {
+      for (int i = 0; i < 5; i++) {
+        rows[c][i] = GetInputRow(input_rows, c, i - 2);
+      }
+    }
+    // The first and last row of a block use the border multiplier throughout.
+    const float* sad_mul =
+        (ypos % kBlockDim == 0 || ypos % kBlockDim == kBlockDim - 1)
+            ? sad_mul_border
+            : sad_mul_center;
+
+    for (ssize_t x = -xextra; x < static_cast<ssize_t>(xsize + xextra);
+         x += Lanes(df)) {
+      size_t bx = (x + xpos + kSigmaPadding * kBlockDim) / kBlockDim;
+      size_t ix = (x + xpos) % kBlockDim;
+      // Where the stored sigma value is below kMinSigma, copy the input.
+      if (row_sigma[bx] < kMinSigma) {
+        for (size_t c = 0; c < 3; c++) {
+          auto px = Load(df, rows[c][2 + 0] + x);
+          Store(px, df, GetOutputRow(output_rows, c, 0) + x);
+        }
+        continue;
+      }
+
+      const auto sm = Load(df, sad_mul + ix);
+      const auto inv_sigma = Mul(Set(df, row_sigma[bx]), sm);
+      auto sad0 = Zero(df);  // neighbour above
+      auto sad1 = Zero(df);  // neighbour to the left
+      auto sad2 = Zero(df);  // neighbour to the right
+      auto sad3 = Zero(df);  // neighbour below
+
+      // compute sads
+      for (size_t c = 0; c < 3; c++) {
+        // pXY = pixel in column X, row Y; center px = 22, px above = 21
+        auto t = Undefined(df);
+
+        const auto p20 = Load(df, rows[c][2 + -2] + x);
+        const auto p21 = Load(df, rows[c][2 + -1] + x);
+        auto sad0c = AbsDiff(p20, p21);  // SAD 2, 1
+
+        const auto p11 = LoadU(df, rows[c][2 + -1] + x - 1);
+        auto sad1c = AbsDiff(p11, p21);  // SAD 1, 2
+
+        const auto p31 = LoadU(df, rows[c][2 + -1] + x + 1);
+        auto sad2c = AbsDiff(p31, p21);  // SAD 3, 2
+
+        const auto p02 = LoadU(df, rows[c][2 + 0] + x - 2);
+        const auto p12 = LoadU(df, rows[c][2 + 0] + x - 1);
+        sad1c = Add(sad1c, AbsDiff(p02, p12));  // SAD 1, 2
+        sad0c = Add(sad0c, AbsDiff(p11, p12));  // SAD 2, 1
+
+        const auto p22 = LoadU(df, rows[c][2 + 0] + x);
+        t = AbsDiff(p12, p22);
+        sad1c = Add(sad1c, t);  // SAD 1, 2
+        sad2c = Add(sad2c, t);  // SAD 3, 2
+        t = AbsDiff(p22, p21);
+        auto sad3c = t;  // SAD 2, 3
+        sad0c = Add(sad0c, t);  // SAD 2, 1
+
+        const auto p32 = LoadU(df, rows[c][2 + 0] + x + 1);
+        sad0c = Add(sad0c, AbsDiff(p31, p32));  // SAD 2, 1
+        t = AbsDiff(p22, p32);
+        sad1c = Add(sad1c, t);  // SAD 1, 2
+        sad2c = Add(sad2c, t);  // SAD 3, 2
+
+        const auto p42 = LoadU(df, rows[c][2 + 0] + x + 2);
+        sad2c = Add(sad2c, AbsDiff(p42, p32));  // SAD 3, 2
+
+        const auto p13 = LoadU(df, rows[c][2 + 1] + x - 1);
+        sad3c = Add(sad3c, AbsDiff(p13, p12));  // SAD 2, 3
+
+        const auto p23 = Load(df, rows[c][2 + 1] + x);
+        t = AbsDiff(p22, p23);
+        sad0c = Add(sad0c, t);                  // SAD 2, 1
+        sad3c = Add(sad3c, t);                  // SAD 2, 3
+        sad1c = Add(sad1c, AbsDiff(p13, p23));  // SAD 1, 2
+
+        const auto p33 = LoadU(df, rows[c][2 + 1] + x + 1);
+        sad2c = Add(sad2c, AbsDiff(p33, p23));  // SAD 3, 2
+        sad3c = Add(sad3c, AbsDiff(p33, p32));  // SAD 2, 3
+
+        const auto p24 = Load(df, rows[c][2 + 2] + x);
+        sad3c = Add(sad3c, AbsDiff(p24, p23));  // SAD 2, 3
+
+        auto scale = Set(df, lf_.epf_channel_scale[c]);
+        sad0 = MulAdd(sad0c, scale, sad0);
+        sad1 = MulAdd(sad1c, scale, sad1);
+        sad2 = MulAdd(sad2c, scale, sad2);
+        sad3 = MulAdd(sad3c, scale, sad3);
+      }
+      const auto x_cc = Load(df, rows[0][2 + 0] + x);
+      const auto y_cc = Load(df, rows[1][2 + 0] + x);
+      const auto b_cc = Load(df, rows[2][2 + 0] + x);
+      // The center pixel itself enters the average with weight 1.
+      auto w = Set(df, 1);
+      auto X = x_cc;
+      auto Y = y_cc;
+      auto B = b_cc;
+
+      // Top row
+      AddPixel</*aligned=*/true>(/*row=*/-1, rows, x, sad0, inv_sigma, &X, &Y,
+                                 &B, &w);
+      // Center
+      AddPixel</*aligned=*/false>(/*row=*/0, rows, x - 1, sad1, inv_sigma, &X,
+                                  &Y, &B, &w);
+      AddPixel</*aligned=*/false>(/*row=*/0, rows, x + 1, sad2, inv_sigma, &X,
+                                  &Y, &B, &w);
+      // Bottom
+      AddPixel</*aligned=*/true>(/*row=*/1, rows, x, sad3, inv_sigma, &X, &Y,
+                                 &B, &w);
+#if JXL_HIGH_PRECISION
+      auto inv_w = Div(Set(df, 1.0f), w);
+#else
+      auto inv_w = ApproximateReciprocal(w);
+#endif
+      Store(Mul(X, inv_w), df, GetOutputRow(output_rows, 0, 0) + x);
+      Store(Mul(Y, inv_w), df, GetOutputRow(output_rows, 1, 0) + x);
+      Store(Mul(B, inv_w), df, GetOutputRow(output_rows, 2, 0) + x);
+    }
+  }
+  // Only the first three channels are filtered; the rest are ignored.
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return c < 3 ? RenderPipelineChannelMode::kInOut
+                 : RenderPipelineChannelMode::kIgnored;
+  }
+
+  const char* GetName() const override { return "EPF1"; }
+
+ private:
+  LoopFilter lf_;  // copy of the filter parameters
+  const ImageF* sigma_;  // not owned; read per block via Row()
+};
+
+// 3x3 plus-shaped kernel with 1 SAD per pixel. So this makes this filter a 3x3
+// filter.
+class EPF2Stage : public RenderPipelineStage {  // EPF step with index 2.
+ public:
+  EPF2Stage(const LoopFilter& lf, const ImageF& sigma)
+      : RenderPipelineStage(RenderPipelineStage::Settings::Symmetric(
+            /*shift=*/0, /*border=*/1)),
+        lf_(lf),  // Copied by value.
+        sigma_(&sigma) {}  // Only the address is kept; `sigma` must stay alive.
+  // Adds the neighbour at vertical offset `row`, column `x` to the running sums.
+  template <bool aligned>  // aligned: read with Load() instead of LoadU().
+  JXL_INLINE void AddPixel(int row, float* JXL_RESTRICT rows[3][3], ssize_t x,
+                           Vec<DF> rx, Vec<DF> ry, Vec<DF> rb,
+                           Vec<DF> inv_sigma, Vec<DF>* JXL_RESTRICT X,
+                           Vec<DF>* JXL_RESTRICT Y, Vec<DF>* JXL_RESTRICT B,
+                           Vec<DF>* JXL_RESTRICT w) const {
+    auto cx = aligned ? Load(DF(), rows[0][1 + row] + x)
+                      : LoadU(DF(), rows[0][1 + row] + x);
+    auto cy = aligned ? Load(DF(), rows[1][1 + row] + x)
+                      : LoadU(DF(), rows[1][1 + row] + x);
+    auto cb = aligned ? Load(DF(), rows[2][1 + row] + x)
+                      : LoadU(DF(), rows[2][1 + row] + x);
+    // Per-channel-scaled absolute difference to the reference (rx, ry, rb).
+    auto sad = Mul(AbsDiff(cx, rx), Set(DF(), lf_.epf_channel_scale[0]));
+    sad = MulAdd(AbsDiff(cy, ry), Set(DF(), lf_.epf_channel_scale[1]), sad);
+    sad = MulAdd(AbsDiff(cb, rb), Set(DF(), lf_.epf_channel_scale[2]), sad);
+    // Convert the SAD into this neighbour's filter weight.
+    auto weight = Weight(sad, inv_sigma, Set(DF(), lf_.epf_pass2_zeroflush));
+    // Accumulate the total weight and the weighted neighbour values.
+    *w = Add(*w, weight);
+    *X = MulAdd(weight, cx, *X);
+    *Y = MulAdd(weight, cy, *Y);
+    *B = MulAdd(weight, cb, *B);
+  }
+  // Filters one row of channels 0-2 using input rows y - 1, y and y + 1.
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    DF df;
+    xextra = RoundUpTo(xextra, Lanes(df));  // Process whole vectors only.
+    const float* JXL_RESTRICT row_sigma =
+        sigma_->Row(ypos / kBlockDim + kSigmaPadding);  // One value per block.
+    // SAD multipliers: `sm` for block-interior pixels, `bsm` for border ones.
+    float sm = lf_.epf_pass2_sigma_scale * 1.65;
+    float bsm = sm * lf_.epf_border_sad_mul;
+
+    HWY_ALIGN float sad_mul_center[kBlockDim] = {bsm, sm, sm, sm,
+                                                 sm,  sm, sm, bsm};
+    HWY_ALIGN float sad_mul_border[kBlockDim] = {bsm, bsm, bsm, bsm,
+                                                 bsm, bsm, bsm, bsm};
+    // rows[c][1 + dy] is channel c at vertical offset dy in {-1, 0, 1}.
+    float* JXL_RESTRICT rows[3][3];
+    for (size_t c = 0; c < 3; c++) {
+      for (int i = 0; i < 3; i++) {
+        rows[c][i] = GetInputRow(input_rows, c, i - 1);
+      }
+    }
+    // The first and last row of a block use the border multiplier everywhere.
+    const float* sad_mul =
+        (ypos % kBlockDim == 0 || ypos % kBlockDim == kBlockDim - 1)
+            ? sad_mul_border
+            : sad_mul_center;
+    // bx: padded sigma column of the block at x; ix: column inside the block.
+    for (ssize_t x = -xextra; x < static_cast<ssize_t>(xsize + xextra);
+         x += Lanes(df)) {
+      size_t bx = (x + xpos + kSigmaPadding * kBlockDim) / kBlockDim;
+      size_t ix = (x + xpos) % kBlockDim;
+      // Sigma below kMinSigma: copy this vector through unfiltered.
+      if (row_sigma[bx] < kMinSigma) {
+        for (size_t c = 0; c < 3; c++) {
+          auto px = Load(df, rows[c][1 + 0] + x);
+          Store(px, df, GetOutputRow(output_rows, c, 0) + x);
+        }
+        continue;
+      }
+
+      const auto sm = Load(df, sad_mul + ix);  // Shadows the scalar `sm`.
+      const auto inv_sigma = Mul(Set(df, row_sigma[bx]), sm);
+      // Reference (centre) pixels of this vector.
+      const auto x_cc = Load(df, rows[0][1 + 0] + x);
+      const auto y_cc = Load(df, rows[1][1 + 0] + x);
+      const auto b_cc = Load(df, rows[2][1 + 0] + x);
+      // The centre pixel itself enters the sums with weight 1.
+      auto w = Set(df, 1);
+      auto X = x_cc;
+      auto Y = y_cc;
+      auto B = b_cc;
+
+      // Top neighbour (row y - 1, same column).
+      AddPixel</*aligned=*/true>(/*row=*/-1, rows, x, x_cc, y_cc, b_cc,
+                                 inv_sigma, &X, &Y, &B, &w);
+      // Left and right neighbours (same row, columns x - 1 and x + 1).
+      AddPixel</*aligned=*/false>(/*row=*/0, rows, x - 1, x_cc, y_cc, b_cc,
+                                  inv_sigma, &X, &Y, &B, &w);
+      AddPixel</*aligned=*/false>(/*row=*/0, rows, x + 1, x_cc, y_cc, b_cc,
+                                  inv_sigma, &X, &Y, &B, &w);
+      // Bottom neighbour (row y + 1, same column).
+      AddPixel</*aligned=*/true>(/*row=*/1, rows, x, x_cc, y_cc, b_cc,
+                                 inv_sigma, &X, &Y, &B, &w);
+#if JXL_HIGH_PRECISION
+      auto inv_w = Div(Set(df, 1.0f), w);
+#else
+      auto inv_w = ApproximateReciprocal(w);
+#endif
+      Store(Mul(X, inv_w), df, GetOutputRow(output_rows, 0, 0) + x);
+      Store(Mul(Y, inv_w), df, GetOutputRow(output_rows, 1, 0) + x);
+      Store(Mul(B, inv_w), df, GetOutputRow(output_rows, 2, 0) + x);
+    }
+  }
+  // Channels 0-2 are read and written (kInOut); all others are ignored.
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return c < 3 ? RenderPipelineChannelMode::kInOut
+                 : RenderPipelineChannelMode::kIgnored;
+  }
+
+  const char* GetName() const override { return "EPF2"; }
+
+ private:
+  LoopFilter lf_;
+  const ImageF* sigma_;  // Not owned.
+};
+
+std::unique_ptr<RenderPipelineStage> GetEPFStage0(const LoopFilter& lf,
+                                                  const ImageF& sigma) {
+  return jxl::make_unique<EPF0Stage>(lf, sigma);  // Per-target factory.
+}
+
+std::unique_ptr<RenderPipelineStage> GetEPFStage1(const LoopFilter& lf,
+                                                  const ImageF& sigma) {
+  return jxl::make_unique<EPF1Stage>(lf, sigma);  // Per-target factory.
+}
+
+std::unique_ptr<RenderPipelineStage> GetEPFStage2(const LoopFilter& lf,
+                                                  const ImageF& sigma) {
+  return jxl::make_unique<EPF2Stage>(lf, sigma);  // Per-target factory.
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(GetEPFStage0);
+HWY_EXPORT(GetEPFStage1);
+HWY_EXPORT(GetEPFStage2);
+
+std::unique_ptr<RenderPipelineStage> GetEPFStage(const LoopFilter& lf,
+                                                 const ImageF& sigma,
+                                                 size_t epf_stage) {
+  // Asking for an EPF stage while `lf.epf_iters` is 0 is a caller bug.
+  JXL_ASSERT(lf.epf_iters != 0);
+  // Forward to the factory for `epf_stage` through Highway's dispatch table.
+  if (epf_stage == 0) {
+    return HWY_DYNAMIC_DISPATCH(GetEPFStage0)(lf, sigma);
+  } else if (epf_stage == 1) {
+    return HWY_DYNAMIC_DISPATCH(GetEPFStage1)(lf, sigma);
+  } else if (epf_stage == 2) {
+    return HWY_DYNAMIC_DISPATCH(GetEPFStage2)(lf, sigma);
+  }
+  JXL_ABORT("Invalid EPF stage");
+}
+
+}  // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_epf.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_epf.h
new file mode 100644
index 0000000000..c9d0d0c785
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_epf.h
@@ -0,0 +1,31 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_EPF_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_EPF_H_
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/image.h"
+#include "lib/jxl/loop_filter.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+// Applies the `epf_stage`-th EPF step (0, 1 or 2; `lf.epf_iters` must be != 0)
+// with the given settings. `sigma` is read at offset (kSigmaPadding,
+// kSigmaPadding) and needs (kSigmaBorder, kSigmaBorder) mirrored sigma values
+// available around the main image. See also filters.(h|cc)
+std::unique_ptr<RenderPipelineStage> GetEPFStage(const LoopFilter& lf,
+                                                 const ImageF& sigma,
+                                                 size_t epf_stage);
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_STAGE_EPF_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_from_linear.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_from_linear.cc
new file mode 100644
index 0000000000..c7b22c663b
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_from_linear.cc
@@ -0,0 +1,191 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_from_linear.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_from_linear.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/dec_tone_mapping-inl.h"
+#include "lib/jxl/sanitizers.h"
+#include "lib/jxl/transfer_functions-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+namespace {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::IfThenZeroElse;
+
+template <typename Op>
+struct PerChannelOp {
+  explicit PerChannelOp(Op op) : op(op) {}
+  // Runs the wrapped single-channel op on r, then g, then b, in place.
+  template <typename D, typename T>
+  void Transform(D d, T* r, T* g, T* b) const {
+    T* const channels[] = {r, g, b};
+    for (T* channel : channels) *channel = op.Transform(d, *channel);
+  }
+
+  Op op;
+};
+template <typename Op>  // Deduced; an lvalue argument makes Op a reference.
+PerChannelOp<Op> MakePerChannelOp(Op&& op) {
+  return PerChannelOp<Op>(std::forward<Op>(op));
+}
+
+struct OpLinear {  // Identity op: returns the linear value unchanged.
+  template <typename D, typename T>
+  T Transform(D d, const T& linear) const {
+    return linear;
+  }
+};
+
+struct OpRgb {  // Linear -> sRGB; TF_SRGB or FastLinearToSRGB by build flag.
+  template <typename D, typename T>
+  T Transform(D d, const T& linear) const {
+#if JXL_HIGH_PRECISION
+    return TF_SRGB().EncodedFromDisplay(d, linear);
+#else
+    return FastLinearToSRGB(d, linear);
+#endif
+  }
+};
+
+struct OpPq {  // Linear -> PQ via TF_PQ.
+  template <typename D, typename T>
+  T Transform(D d, const T& linear) const {
+    return TF_PQ().EncodedFromDisplay(d, linear);
+  }
+};
+
+struct OpHlg {  // Linear -> HLG; works on all three channels at once.
+  explicit OpHlg(const float luminances[3], const float intensity_target)
+      : hlg_ootf_(HlgOOTF::ToSceneLight(/*display_luminance=*/intensity_target,
+                                        luminances)) {}
+  // Applies hlg_ootf_ to (r, g, b) first, then TF_HLG to each channel.
+  template <typename D, typename T>
+  void Transform(D d, T* r, T* g, T* b) const {
+    hlg_ootf_.Apply(r, g, b);
+    *r = TF_HLG().EncodedFromDisplay(d, *r);
+    *g = TF_HLG().EncodedFromDisplay(d, *g);
+    *b = TF_HLG().EncodedFromDisplay(d, *b);
+  }
+  HlgOOTF hlg_ootf_;
+};
+
+struct Op709 {  // Linear -> Rec. 709 encoding via TF_709.
+  template <typename D, typename T>
+  T Transform(D d, const T& linear) const {
+    return TF_709().EncodedFromDisplay(d, linear);
+  }
+};
+
+struct OpGamma {  // Power-law encoding: linear ^ inverse_gamma.
+  const float inverse_gamma;
+  template <typename D, typename T>
+  T Transform(D d, const T& linear) const {
+    return IfThenZeroElse(Le(linear, Set(d, 1e-5f)),  // <= 1e-5 maps to 0.
+                          FastPowf(d, linear, Set(d, inverse_gamma)));
+  }
+};
+
+template <typename Op>  // Op provides Transform(d, &r, &g, &b).
+class FromLinearStage : public RenderPipelineStage {
+ public:
+  explicit FromLinearStage(Op op)
+      : RenderPipelineStage(RenderPipelineStage::Settings()),
+        op_(std::move(op)) {}
+  // Applies op_ to channels 0-2 of the current row, one vector at a time.
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    PROFILER_ZONE("FromLinear");
+    const HWY_FULL(float) d;
+    const size_t xsize_v = RoundUpTo(xsize, Lanes(d));  // Whole vectors.
+    float* JXL_RESTRICT row0 = GetInputRow(input_rows, 0, 0);
+    float* JXL_RESTRICT row1 = GetInputRow(input_rows, 1, 0);
+    float* JXL_RESTRICT row2 = GetInputRow(input_rows, 2, 0);
+    // All calculations are lane-wise, but some may still have value-dependent
+    // behaviour (e.g. NearestInt), so temporarily unpoison the tail of the
+    // last vector; it is poisoned again after the loop.
+    msan::UnpoisonMemory(row0 + xsize, sizeof(float) * (xsize_v - xsize));
+    msan::UnpoisonMemory(row1 + xsize, sizeof(float) * (xsize_v - xsize));
+    msan::UnpoisonMemory(row2 + xsize, sizeof(float) * (xsize_v - xsize));
+    for (ssize_t x = -xextra; x < (ssize_t)(xsize + xextra); x += Lanes(d)) {
+      auto r = LoadU(d, row0 + x);
+      auto g = LoadU(d, row1 + x);
+      auto b = LoadU(d, row2 + x);
+      op_.Transform(d, &r, &g, &b);
+      StoreU(r, d, row0 + x);  // Results overwrite the input rows.
+      StoreU(g, d, row1 + x);
+      StoreU(b, d, row2 + x);
+    }
+    msan::PoisonMemory(row0 + xsize, sizeof(float) * (xsize_v - xsize));
+    msan::PoisonMemory(row1 + xsize, sizeof(float) * (xsize_v - xsize));
+    msan::PoisonMemory(row2 + xsize, sizeof(float) * (xsize_v - xsize));
+  }
+  // Channels 0-2 are processed in place (kInPlace); all others are ignored.
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return c < 3 ? RenderPipelineChannelMode::kInPlace
+                 : RenderPipelineChannelMode::kIgnored;
+  }
+
+  const char* GetName() const override { return "FromLinear"; }
+
+ private:
+  Op op_;
+};
+
+template <typename Op>  // Deduced; callers in this file pass temporaries.
+std::unique_ptr<FromLinearStage<Op>> MakeFromLinearStage(Op&& op) {
+  return jxl::make_unique<FromLinearStage<Op>>(std::forward<Op>(op));
+}
+
+std::unique_ptr<RenderPipelineStage> GetFromLinearStage(
+    const OutputEncodingInfo& output_encoding_info) {
+  // Build the stage whose op matches the output transfer function.
+  const auto& tf = output_encoding_info.color_encoding.tf;
+  if (tf.IsLinear()) {
+    return MakeFromLinearStage(MakePerChannelOp(OpLinear()));
+  } else if (tf.IsSRGB()) {
+    return MakeFromLinearStage(MakePerChannelOp(OpRgb()));
+  } else if (tf.IsPQ()) {
+    return MakeFromLinearStage(MakePerChannelOp(OpPq()));
+  } else if (tf.IsHLG()) {
+    return MakeFromLinearStage(
+        OpHlg(output_encoding_info.luminances,
+              output_encoding_info.desired_intensity_target));
+  } else if (tf.Is709()) {
+    return MakeFromLinearStage(MakePerChannelOp(Op709()));
+  } else if (tf.IsGamma() || tf.IsDCI()) {
+    return MakeFromLinearStage(
+        MakePerChannelOp(OpGamma{output_encoding_info.inverse_gamma}));
+  }
+  // This is a programming error.
+  JXL_ABORT("Invalid target encoding");
+}
+
+}  // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(GetFromLinearStage);
+// Public entry point: forwards to the per-target GetFromLinearStage.
+std::unique_ptr<RenderPipelineStage> GetFromLinearStage(
+    const OutputEncodingInfo& output_encoding_info) {
+  return HWY_DYNAMIC_DISPATCH(GetFromLinearStage)(output_encoding_info);
+}
+
+}  // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_from_linear.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_from_linear.h
new file mode 100644
index 0000000000..548ab50b8c
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_from_linear.h
@@ -0,0 +1,20 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_FROM_LINEAR_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_FROM_LINEAR_H_
+
+#include "lib/jxl/dec_xyb.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+// Converts color channels 0-2 in place from linear to the output encoding.
+std::unique_ptr<RenderPipelineStage> GetFromLinearStage(
+    const OutputEncodingInfo& output_encoding_info);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_STAGE_FROM_LINEAR_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_gaborish.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_gaborish.cc
new file mode 100644
index 0000000000..fc90acb476
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_gaborish.cc
@@ -0,0 +1,122 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_gaborish.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_gaborish.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::MulAdd;
+
+class GaborishStage : public RenderPipelineStage {  // 3x3 filter on c < 3.
+ public:
+  explicit GaborishStage(const LoopFilter& lf)
+      : RenderPipelineStage(RenderPipelineStage::Settings::Symmetric(
+            /*shift=*/0, /*border=*/1)) {
+    weights_[0] = 1;  // Per channel c: [3c] centre, [3c+1] edge, [3c+2] corner.
+    weights_[1] = lf.gab_x_weight1;
+    weights_[2] = lf.gab_x_weight2;
+    weights_[3] = 1;
+    weights_[4] = lf.gab_y_weight1;
+    weights_[5] = lf.gab_y_weight2;
+    weights_[6] = 1;
+    weights_[7] = lf.gab_b_weight1;
+    weights_[8] = lf.gab_b_weight2;
+    // Normalize so the 9 taps (1 centre + 4 edge + 4 corner) sum to 1.
+    for (size_t c = 0; c < 3; c++) {
+      const float div =
+          weights_[3 * c] + 4 * (weights_[3 * c + 1] + weights_[3 * c + 2]);
+      const float mul = 1.0f / div;
+      weights_[3 * c] *= mul;
+      weights_[3 * c + 1] *= mul;
+      weights_[3 * c + 2] *= mul;
+    }
+  }
+  // Filters one row of channels 0-2 from input rows y - 1, y and y + 1.
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    PROFILER_ZONE("Gaborish");
+
+    const HWY_FULL(float) d;
+    for (size_t c = 0; c < 3; c++) {
+      float* JXL_RESTRICT row_t = GetInputRow(input_rows, c, -1);
+      float* JXL_RESTRICT row_m = GetInputRow(input_rows, c, 0);
+      float* JXL_RESTRICT row_b = GetInputRow(input_rows, c, 1);
+      float* JXL_RESTRICT row_out = GetOutputRow(output_rows, c, 0);
+      const auto w0 = Set(d, weights_[3 * c + 0]);
+      const auto w1 = Set(d, weights_[3 * c + 1]);
+      const auto w2 = Set(d, weights_[3 * c + 2]);
+// Group data need only be aligned to a block; for >=512 bit vectors, this may
+// result in unaligned loads. LoadMaybeU is #undef'd after this function.
+#if HWY_CAP_GE512
+#define LoadMaybeU LoadU
+#else
+#define LoadMaybeU Load
+#endif
+      // Since GetInputRow(input_rows, c, {-1, 0, 1}) is aligned, rounding
+      // xextra up to Lanes(d) doesn't access anything problematic.
+      for (ssize_t x = -RoundUpTo(xextra, Lanes(d));
+           x < (ssize_t)(xsize + xextra); x += Lanes(d)) {
+        const auto t = LoadMaybeU(d, row_t + x);
+        const auto tl = LoadU(d, row_t + x - 1);
+        const auto tr = LoadU(d, row_t + x + 1);
+        const auto m = LoadMaybeU(d, row_m + x);
+        const auto l = LoadU(d, row_m + x - 1);
+        const auto r = LoadU(d, row_m + x + 1);
+        const auto b = LoadMaybeU(d, row_b + x);
+        const auto bl = LoadU(d, row_b + x - 1);
+        const auto br = LoadU(d, row_b + x + 1);
+        const auto sum0 = m;                              // Centre tap.
+        const auto sum1 = Add(Add(l, r), Add(t, b));      // Edge neighbours.
+        const auto sum2 = Add(Add(tl, tr), Add(bl, br));  // Corner neighbours.
+        auto pixels = MulAdd(sum2, w2, MulAdd(sum1, w1, Mul(sum0, w0)));
+        Store(pixels, d, row_out + x);
+      }
+    }
+  }
+#undef LoadMaybeU
+  // Channels 0-2 are read and written (kInOut); all others are ignored.
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return c < 3 ? RenderPipelineChannelMode::kInOut
+                 : RenderPipelineChannelMode::kIgnored;
+  }
+
+  const char* GetName() const override { return "Gab"; }
+
+ private:
+  float weights_[9];  // Normalized taps, three per channel (see constructor).
+};
+
+std::unique_ptr<RenderPipelineStage> GetGaborishStage(const LoopFilter& lf) {
+  return jxl::make_unique<GaborishStage>(lf);  // Per-target factory.
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(GetGaborishStage);
+// Public entry point: checks `lf.gab`, then forwards to the per-target factory.
+std::unique_ptr<RenderPipelineStage> GetGaborishStage(const LoopFilter& lf) {
+  JXL_ASSERT(lf.gab == 1);
+  return HWY_DYNAMIC_DISPATCH(GetGaborishStage)(lf);
+}
+
+}  // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_gaborish.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_gaborish.h
new file mode 100644
index 0000000000..761800f668
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_gaborish.h
@@ -0,0 +1,25 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_GABORISH_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_GABORISH_H_
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/loop_filter.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+// Applies decoder-side Gaborish with the given settings. `lf.gab` must be 1.
+std::unique_ptr<RenderPipelineStage> GetGaborishStage(const LoopFilter& lf);
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_STAGE_GABORISH_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_noise.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_noise.cc
new file mode 100644
index 0000000000..187095cf61
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_noise.cc
@@ -0,0 +1,311 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_noise.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_noise.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/sanitizers.h"
+#include "lib/jxl/transfer_functions-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Max;
+using hwy::HWY_NAMESPACE::ShiftRight;
+using hwy::HWY_NAMESPACE::Vec;
+using hwy::HWY_NAMESPACE::ZeroIfNegative;
+
+using D = HWY_CAPPED(float, kBlockDim);
+using DI = hwy::HWY_NAMESPACE::Rebind<int32_t, D>;
+using DI8 = hwy::HWY_NAMESPACE::Repartition<uint8_t, D>;
+
+// Clamps each lane of `x` to [0, max_value].
+template <class D, class V>
+static HWY_INLINE V Clamp0ToMax(D d, const V x, const V max_value) {
+  // Cap from above first, then flush negative lanes to zero.
+  return ZeroIfNegative(Min(x, max_value));
+}
+
+// x is in [0+delta, 1+delta], delta ~= 0.06
+template <class StrengthEval>
+typename StrengthEval::V NoiseStrength(const StrengthEval& eval,
+                                       const typename StrengthEval::V x) {
+  return Clamp0ToMax(D(), eval(x), Set(D(), 1.0f));  // eval(x) kept in [0, 1].
+}
+
+// TODO(veluca): SIMD-fy.
+class StrengthEvalLut {
+ public:
+  using V = Vec<D>;
+  // Scalar builds keep a reference to the params; others build byte tables.
+  explicit StrengthEvalLut(const NoiseParams& noise_params)
+#if HWY_TARGET == HWY_SCALAR
+      : noise_params_(noise_params)
+#endif
+  {
+#if HWY_TARGET != HWY_SCALAR
+    uint32_t lut[8];  // Raw bytes of noise_params.lut.
+    memcpy(lut, noise_params.lut, sizeof(lut));
+    for (size_t i = 0; i < 8; i++) {
+      low16_lut[2 * i] = (lut[i] >> 0) & 0xFF;
+      low16_lut[2 * i + 1] = (lut[i] >> 8) & 0xFF;
+      high16_lut[2 * i] = (lut[i] >> 16) & 0xFF;
+      high16_lut[2 * i + 1] = (lut[i] >> 24) & 0xFF;
+    }
+#endif
+  }
+  // Linearly interpolates noise_params.lut at position vx * kScale.
+  V operator()(const V vx) const {
+    constexpr size_t kScale = NoiseParams::kNumNoisePoints - 2;
+    auto scaled_vx = Max(Zero(D()), Mul(vx, Set(D(), kScale)));
+    auto floor_x = Floor(scaled_vx);
+    auto frac_x = Sub(scaled_vx, floor_x);  // Next: x >= kScale+1 -> last entry.
+    floor_x = IfThenElse(Ge(scaled_vx, Set(D(), kScale + 1)), Set(D(), kScale),
+                         floor_x);
+    frac_x =
+        IfThenElse(Ge(scaled_vx, Set(D(), kScale + 1)), Set(D(), 1), frac_x);
+    auto floor_x_int = ConvertTo(DI(), floor_x);
+#if HWY_TARGET == HWY_SCALAR
+    auto low = Set(D(), noise_params_.lut[floor_x_int.raw]);
+    auto hi = Set(D(), noise_params_.lut[floor_x_int.raw + 1]);
+#else
+    // Set each lane's bytes to {0, 0, 2x+1, 2x}.
+    auto floorx_indices_low =
+        Add(Mul(floor_x_int, Set(DI(), 0x0202)), Set(DI(), 0x0100));
+    // Set each lane's bytes to {2x+1, 2x, 0, 0}.
+    auto floorx_indices_hi =
+        Add(Mul(floor_x_int, Set(DI(), 0x02020000)), Set(DI(), 0x01000000));
+    // Load both byte tables and the masks selecting each 16-bit half.
+    auto low16 = BitCast(DI(), LoadDup128(DI8(), low16_lut));
+    auto lowm = Set(DI(), 0xFFFF);
+    auto hi16 = BitCast(DI(), LoadDup128(DI8(), high16_lut));
+    auto him = Set(DI(), 0xFFFF0000);
+    // low = noise_params.lut[floor_x]
+    auto low =
+        BitCast(D(), Or(And(TableLookupBytes(low16, floorx_indices_low), lowm),
+                        And(TableLookupBytes(hi16, floorx_indices_hi), him)));
+    // hi = noise_params.lut[floor_x+1] (byte indices advanced by one entry)
+    floorx_indices_low = Add(floorx_indices_low, Set(DI(), 0x0202));
+    floorx_indices_hi = Add(floorx_indices_hi, Set(DI(), 0x02020000));
+    auto hi =
+        BitCast(D(), Or(And(TableLookupBytes(low16, floorx_indices_low), lowm),
+                        And(TableLookupBytes(hi16, floorx_indices_hi), him)));
+#endif
+    return MulAdd(Sub(hi, low), frac_x, low);  // low + frac_x * (hi - low).
+  }
+
+ private:
+#if HWY_TARGET != HWY_SCALAR
+  // noise_params.lut split into two tables: low / high 16 bits of each entry.
+  HWY_ALIGN uint8_t high16_lut[16];
+  HWY_ALIGN uint8_t low16_lut[16];
+#else
+  const NoiseParams& noise_params_;  // Not owned.
+#endif
+};
+
+template <class D>  // Adds noise in place to one vector at out_x/out_y/out_b.
+void AddNoiseToRGB(const D d, const Vec<D> rnd_noise_r,
+                   const Vec<D> rnd_noise_g, const Vec<D> rnd_noise_cor,
+                   const Vec<D> noise_strength_g, const Vec<D> noise_strength_r,
+                   float ytox, float ytob, float* JXL_RESTRICT out_x,
+                   float* JXL_RESTRICT out_y, float* JXL_RESTRICT out_b) {
+  const auto kRGCorr = Set(d, 0.9921875f);   // 127/128
+  const auto kRGNCorr = Set(d, 0.0078125f);  // 1/128
+  // Noise = strength * (1/128 * own source + 127/128 * correlated source).
+  const auto red_noise =
+      Mul(noise_strength_r,
+          MulAdd(kRGNCorr, rnd_noise_r, Mul(kRGCorr, rnd_noise_cor)));
+  const auto green_noise =
+      Mul(noise_strength_g,
+          MulAdd(kRGNCorr, rnd_noise_g, Mul(kRGCorr, rnd_noise_cor)));
+  // Current values of the three output channels.
+  auto vx = LoadU(d, out_x);
+  auto vy = LoadU(d, out_y);
+  auto vb = LoadU(d, out_b);
+  // y += r + g;  x += (r - g) + ytox * (r + g);  b += ytob * (r + g).
+  const auto rg_noise = Add(red_noise, green_noise);
+  vx = Add(MulAdd(Set(d, ytox), rg_noise, Sub(red_noise, green_noise)), vx);
+  vy = Add(vy, rg_noise);
+  vb = MulAdd(Set(d, ytob), rg_noise, vb);
+
+  StoreU(vx, d, out_x);
+  StoreU(vy, d, out_y);
+  StoreU(vb, d, out_b);
+}
+
+class AddNoiseStage : public RenderPipelineStage {
+ public:
+  AddNoiseStage(const NoiseParams& noise_params,
+                const ColorCorrelationMap& cmap, size_t first_c)
+      : RenderPipelineStage(RenderPipelineStage::Settings::Symmetric(
+            /*shift=*/0, /*border=*/0)),
+        noise_params_(noise_params),  // Stored by reference.
+        cmap_(cmap),                  // Stored by reference.
+        first_c_(first_c) {}
+  // Adds noise from channels first_c_ .. first_c_ + 2 to channels 0-2 in place.
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    PROFILER_ZONE("Noise apply");
+
+    if (!noise_params_.HasAny()) return;  // Nothing to add.
+    const StrengthEvalLut noise_model(noise_params_);
+    D d;
+    const auto half = Set(d, 0.5f);
+
+    // With the prior subtract-random Laplacian approximation, rnd_* ranges were
+    // about [-1.5, 1.6]; Laplacian3 about doubles this to [-3.6, 3.6], so the
+    // normalizer is half of what it was before (0.5).
+    const auto norm_const = Set(d, 0.22f);
+
+    float ytox = cmap_.YtoXRatio(0);
+    float ytob = cmap_.YtoBRatio(0);
+
+    const size_t xsize_v = RoundUpTo(xsize, Lanes(d));  // Whole vectors.
+
+    float* JXL_RESTRICT row_x = GetInputRow(input_rows, 0, 0);
+    float* JXL_RESTRICT row_y = GetInputRow(input_rows, 1, 0);
+    float* JXL_RESTRICT row_b = GetInputRow(input_rows, 2, 0);
+    const float* JXL_RESTRICT row_rnd_r =
+        GetInputRow(input_rows, first_c_ + 0, 0);
+    const float* JXL_RESTRICT row_rnd_g =
+        GetInputRow(input_rows, first_c_ + 1, 0);
+    const float* JXL_RESTRICT row_rnd_c =
+        GetInputRow(input_rows, first_c_ + 2, 0);
+    // Needed by the calls to Floor() in StrengthEvalLut. Only arithmetic and
+    // shuffles are otherwise done on the data, so this is safe.
+    msan::UnpoisonMemory(row_x + xsize, (xsize_v - xsize) * sizeof(float));
+    msan::UnpoisonMemory(row_y + xsize, (xsize_v - xsize) * sizeof(float));
+    for (size_t x = 0; x < xsize_v; x += Lanes(d)) {
+      const auto vx = LoadU(d, row_x + x);
+      const auto vy = LoadU(d, row_y + x);
+      const auto in_g = Sub(vy, vx);
+      const auto in_r = Add(vy, vx);
+      const auto noise_strength_g = NoiseStrength(noise_model, Mul(in_g, half));
+      const auto noise_strength_r = NoiseStrength(noise_model, Mul(in_r, half));
+      const auto addit_rnd_noise_red = Mul(LoadU(d, row_rnd_r + x), norm_const);
+      const auto addit_rnd_noise_green =
+          Mul(LoadU(d, row_rnd_g + x), norm_const);
+      const auto addit_rnd_noise_correlated =
+          Mul(LoadU(d, row_rnd_c + x), norm_const);
+      AddNoiseToRGB(D(), addit_rnd_noise_red, addit_rnd_noise_green,
+                    addit_rnd_noise_correlated, noise_strength_g,
+                    noise_strength_r, ytox, ytob, row_x + x, row_y + x,
+                    row_b + x);
+    }
+    msan::PoisonMemory(row_x + xsize, (xsize_v - xsize) * sizeof(float));
+    msan::PoisonMemory(row_y + xsize, (xsize_v - xsize) * sizeof(float));
+    msan::PoisonMemory(row_b + xsize, (xsize_v - xsize) * sizeof(float));
+  }
+  // c >= first_c_: noise inputs; else c < 3: in place; everything else ignored.
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return c >= first_c_ ? RenderPipelineChannelMode::kInput
+           : c < 3       ? RenderPipelineChannelMode::kInPlace
+                         : RenderPipelineChannelMode::kIgnored;
+  }
+
+  const char* GetName() const override { return "AddNoise"; }
+
+ private:
+  const NoiseParams& noise_params_;  // Not owned; must outlive the stage.
+  const ColorCorrelationMap& cmap_;  // Not owned; must outlive the stage.
+  size_t first_c_;                   // Index of the first noise channel.
+};
+
+std::unique_ptr<RenderPipelineStage> GetAddNoiseStage(
+    const NoiseParams& noise_params, const ColorCorrelationMap& cmap,
+    size_t noise_c_start) {  // Per-target factory.
+  return jxl::make_unique<AddNoiseStage>(noise_params, cmap, noise_c_start);
+}
+
+class ConvolveNoiseStage : public RenderPipelineStage {
+ public:
+  explicit ConvolveNoiseStage(size_t first_c)
+      : RenderPipelineStage(RenderPipelineStage::Settings::Symmetric(
+            /*shift=*/0, /*border=*/2)),
+        first_c_(first_c) {}
+  // Filters channels first_c_ .. first_c_ + 2 with the 5x5 kernel noted below.
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    PROFILER_ZONE("Noise convolve");
+
+    const HWY_FULL(float) d;
+    for (size_t c = first_c_; c < first_c_ + 3; c++) {
+      float* JXL_RESTRICT rows[5];  // rows[2 + dy] is the row at offset dy.
+      for (int i = 0; i < 5; i++) {  // Signed: i - 2 is negative for i < 2.
+        rows[i] = GetInputRow(input_rows, c, i - 2);
+      }
+      float* JXL_RESTRICT row_out = GetOutputRow(output_rows, c, 0);
+      for (ssize_t x = -RoundUpTo(xextra, Lanes(d));
+           x < (ssize_t)(xsize + xextra); x += Lanes(d)) {
+        const auto p00 = LoadU(d, rows[2] + x);
+        auto others = Zero(d);  // Sum of the 24 non-centre taps.
+        // TODO(eustas): sum loaded values to reduce the calculation chain
+        for (ssize_t i = -2; i <= 2; i++) {
+          others = Add(others, LoadU(d, rows[0] + x + i));
+          others = Add(others, LoadU(d, rows[1] + x + i));
+          others = Add(others, LoadU(d, rows[3] + x + i));
+          others = Add(others, LoadU(d, rows[4] + x + i));
+        }
+        others = Add(others, LoadU(d, rows[2] + x - 2));
+        others = Add(others, LoadU(d, rows[2] + x - 1));
+        others = Add(others, LoadU(d, rows[2] + x + 1));
+        others = Add(others, LoadU(d, rows[2] + x + 2));
+        // 4 * (5x5 box mean - centre): +0.16 per neighbour, -3.84 on centre.
+        auto pixels = MulAdd(others, Set(d, 0.16), Mul(p00, Set(d, -3.84)));
+        StoreU(pixels, d, row_out + x);
+      }
+    }
+  }
+  // Channels at or after first_c_ are kInOut; all earlier ones are ignored.
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return c >= first_c_ ? RenderPipelineChannelMode::kInOut
+                         : RenderPipelineChannelMode::kIgnored;
+  }
+
+  const char* GetName() const override { return "ConvNoise"; }
+
+ private:
+  size_t first_c_;  // Index of the first noise channel.
+};
+
+std::unique_ptr<RenderPipelineStage> GetConvolveNoiseStage(
+    size_t noise_c_start) {  // Per-target factory.
+  return jxl::make_unique<ConvolveNoiseStage>(noise_c_start);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(GetAddNoiseStage);
+HWY_EXPORT(GetConvolveNoiseStage);
+// Public entry point: forwards to the per-target GetAddNoiseStage.
+std::unique_ptr<RenderPipelineStage> GetAddNoiseStage(
+    const NoiseParams& noise_params, const ColorCorrelationMap& cmap,
+    size_t noise_c_start) {
+  return HWY_DYNAMIC_DISPATCH(GetAddNoiseStage)(noise_params, cmap,
+                                                noise_c_start);
+}
+
+std::unique_ptr<RenderPipelineStage> GetConvolveNoiseStage(
+    size_t noise_c_start) {  // Public entry point; dispatches per target.
+  return HWY_DYNAMIC_DISPATCH(GetConvolveNoiseStage)(noise_c_start);
+}
+
+}  // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_noise.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_noise.h
new file mode 100644
index 0000000000..bd7797f991
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_noise.h
@@ -0,0 +1,32 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_NOISE_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_NOISE_H_
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/dec_noise.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+// Adds noise to color channels. Keeps references to `noise_params` and `cmap`.
+std::unique_ptr<RenderPipelineStage> GetAddNoiseStage(
+    const NoiseParams& noise_params, const ColorCorrelationMap& cmap,
+    size_t noise_c_start);  // Index of the first of the 3 noise channels.
+
+// Applies a 5x5 subtract-box-filter convolution to the noise input channels.
+std::unique_ptr<RenderPipelineStage> GetConvolveNoiseStage(
+    size_t noise_c_start);  // Index of the first of the 3 noise channels.
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_STAGE_NOISE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_patches.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_patches.cc
new file mode 100644
index 0000000000..527be03839
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_patches.cc
@@ -0,0 +1,48 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_patches.h"
+
+namespace jxl {
+namespace {
+class PatchDictionaryStage : public RenderPipelineStage {
+ public:
+  PatchDictionaryStage(const PatchDictionary* patches, size_t num_channels)
+      : RenderPipelineStage(RenderPipelineStage::Settings()),
+        patches_(*patches),
+        num_channels_(num_channels) {}
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    PROFILER_ZONE("RenderPatches");
+    JXL_ASSERT(xpos == 0 || xpos >= xextra);
+    const size_t left = xpos == 0 ? 0 : xextra;  // No left border at x == 0.
+    std::vector<float*> rows(num_channels_);
+    for (size_t c = 0; c < rows.size(); c++) {
+      rows[c] = GetInputRow(input_rows, c, 0) - left;
+    }
+    patches_.AddOneRow(rows.data(), ypos, xpos - left, xsize + xextra + left);
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    if (c >= num_channels_) return RenderPipelineChannelMode::kIgnored;
+    return RenderPipelineChannelMode::kInPlace;
+  }
+
+  const char* GetName() const override { return "Patches"; }
+
+ private:
+  const PatchDictionary& patches_;
+  const size_t num_channels_;
+};
+}  // namespace
+
+std::unique_ptr<RenderPipelineStage> GetPatchesStage(
+    const PatchDictionary* patches, size_t num_channels) {
+  // The stage dereferences `patches` and keeps a reference to it.
+  return jxl::make_unique<PatchDictionaryStage>(patches, num_channels);
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_patches.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_patches.h
new file mode 100644
index 0000000000..b35abdc2eb
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_patches.h
@@ -0,0 +1,22 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_PATCHES_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_PATCHES_H_
+
+#include <utility>
+
+#include "lib/jxl/patch_dictionary_internal.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+// Draws patches if applicable.
+std::unique_ptr<RenderPipelineStage> GetPatchesStage(
+    const PatchDictionary* patches, size_t num_channels);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_STAGE_PATCHES_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_splines.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_splines.cc
new file mode 100644
index 0000000000..d97d97e5f2
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_splines.cc
@@ -0,0 +1,63 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_splines.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_splines.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+class SplineStage : public RenderPipelineStage {
+ public:
+  explicit SplineStage(const Splines* splines)
+      : RenderPipelineStage(RenderPipelineStage::Settings()),
+        splines_(*splines) {}
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    PROFILER_ZONE("RenderSplines");
+    // The Rect below covers a single row; xextra is not used here.
+    float* rows[3];
+    for (size_t c = 0; c < 3; c++) rows[c] = GetInputRow(input_rows, c, 0);
+    splines_.AddToRow(rows[0], rows[1], rows[2], Rect(xpos, ypos, xsize, 1));
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    if (c >= 3) return RenderPipelineChannelMode::kIgnored;
+    return RenderPipelineChannelMode::kInPlace;
+  }
+
+  const char* GetName() const override { return "Splines"; }
+
+ private:
+  const Splines& splines_;
+};
+
+std::unique_ptr<RenderPipelineStage> GetSplineStage(const Splines* splines) {
+  return jxl::make_unique<SplineStage>(splines);  // Stage derefs `splines`.
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(GetSplineStage);
+
+std::unique_ptr<RenderPipelineStage> GetSplineStage(const Splines* splines) {
+  return HWY_DYNAMIC_DISPATCH(GetSplineStage)(splines);  // Per-target variant.
+}
+
+}  // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_splines.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_splines.h
new file mode 100644
index 0000000000..363af393ec
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_splines.h
@@ -0,0 +1,21 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_SPLINES_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_SPLINES_H_
+
+#include <utility>
+
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+#include "lib/jxl/splines.h"
+
+namespace jxl {
+
+// Draws splines if applicable.
+std::unique_ptr<RenderPipelineStage> GetSplineStage(const Splines* splines);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_STAGE_SPLINES_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_spot.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_spot.cc
new file mode 100644
index 0000000000..d4f6152994
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_spot.cc
@@ -0,0 +1,52 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_spot.h"
+
+namespace jxl {
+class SpotColorStage : public RenderPipelineStage {
+ public:
+  explicit SpotColorStage(size_t spot_c, const float* spot_color)
+      : RenderPipelineStage(RenderPipelineStage::Settings()),
+        spot_c_(spot_c),
+        spot_color_(spot_color) {
+    JXL_ASSERT(spot_c_ >= 3);  // Channels 0..2 are the ones blended into.
+  }
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    // TODO(veluca): add SIMD.
+    PROFILER_ZONE("RenderSpotColors");
+    const float strength = spot_color_[3];  // Fourth entry scales the mix.
+    for (size_t c = 0; c < 3; c++) {
+      float* JXL_RESTRICT color = GetInputRow(input_rows, c, 0);
+      const float* JXL_RESTRICT spot = GetInputRow(input_rows, spot_c_, 0);
+      for (ssize_t x = -xextra; x < ssize_t(xsize + xextra); x++) {
+        const float mix = strength * spot[x];
+        color[x] = mix * spot_color_[c] + (1.0f - mix) * color[x];
+      }
+    }
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    if (c < 3) return RenderPipelineChannelMode::kInPlace;
+    if (c == spot_c_) return RenderPipelineChannelMode::kInput;
+    return RenderPipelineChannelMode::kIgnored;
+  }
+
+  const char* GetName() const override { return "Spot"; }
+
+ private:
+  size_t spot_c_;
+  const float* spot_color_;
+};
+
+std::unique_ptr<RenderPipelineStage> GetSpotColorStage(
+    size_t spot_c, const float* spot_color) {  // Pointer is kept, not copied.
+  return jxl::make_unique<SpotColorStage>(spot_c, spot_color);
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_spot.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_spot.h
new file mode 100644
index 0000000000..3e79c75823
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_spot.h
@@ -0,0 +1,21 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_SPOT_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_SPOT_H_
+
+#include <utility>
+
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+// Render the spot color channels.
+std::unique_ptr<RenderPipelineStage> GetSpotColorStage(size_t spot_c,
+                                                       const float* spot_color);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_STAGE_SPOT_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_to_linear.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_to_linear.cc
new file mode 100644
index 0000000000..9f5b2b73dc
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_to_linear.cc
@@ -0,0 +1,202 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_to_linear.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_to_linear.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/dec_tone_mapping-inl.h"
+#include "lib/jxl/sanitizers.h"
+#include "lib/jxl/transfer_functions-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+namespace {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::IfThenZeroElse;
+
+template <typename Op>
+struct PerChannelOp {
+  explicit PerChannelOp(Op op) : op(op) {}
+  template <typename D, typename T>
+  void Transform(D d, T* r, T* g, T* b) const {
+    // Apply the same single-channel transform to each channel, in place.
+    T* const channels[3] = {r, g, b};
+    for (T* channel : channels) *channel = op.Transform(d, *channel);
+  }
+
+  Op op;
+};
+template <typename Op>
+PerChannelOp<Op> MakePerChannelOp(Op&& op) {  // Wraps `op` in a PerChannelOp.
+  // NOTE(review): an lvalue argument would deduce Op as a reference type.
+  return PerChannelOp<Op>(std::forward<Op>(op));
+}
+
+struct OpLinear {  // Identity: returns the sample unchanged.
+  template <typename D, typename T>
+  T Transform(D d, const T& encoded) const {
+    return encoded;
+  }
+};
+
+struct OpRgb {  // Decodes each sample with TF_SRGB.
+  template <typename D, typename T>
+  T Transform(D d, const T& encoded) const {
+    return TF_SRGB().DisplayFromEncoded(encoded);
+  }
+};
+
+struct OpPq {  // Decodes each sample with TF_PQ.
+  template <typename D, typename T>
+  T Transform(D d, const T& encoded) const {
+    return TF_PQ().DisplayFromEncoded(d, encoded);
+  }
+};
+
+struct OpHlg {  // TF_HLG decode, then the OOTF built in the constructor.
+  explicit OpHlg(const float luminances[3], const float intensity_target)
+      : hlg_ootf_(HlgOOTF::FromSceneLight(
+            /*display_luminance=*/intensity_target, luminances)) {}
+
+  template <typename D, typename T>
+  void Transform(D d, T* r, T* g, T* b) const {
+    for (T* val : {r, g, b}) {  // TF_HLG is applied one lane at a time.
+      HWY_ALIGN float vals[MaxLanes(d)];
+      Store(*val, d, vals);  // Spill the vector to decode lanes as scalars.
+      for (size_t i = 0; i < Lanes(d); ++i) {
+        vals[i] = TF_HLG().DisplayFromEncoded(vals[i]);
+      }
+      *val = Load(d, vals);
+    }
+    hlg_ootf_.Apply(r, g, b);  // Operates on all three channels together.
+  }
+  HlgOOTF hlg_ootf_;
+};
+
+struct Op709 {  // Decodes each sample with TF_709.
+  template <typename D, typename T>
+  T Transform(D d, const T& encoded) const {
+    return TF_709().DisplayFromEncoded(d, encoded);
+  }
+};
+
+struct OpGamma {  // FastPowf(encoded, gamma); lanes <= 1e-5 yield 0.
+  const float gamma;
+  template <typename D, typename T>
+  T Transform(D d, const T& encoded) const {
+    return IfThenZeroElse(Le(encoded, Set(d, 1e-5f)),
+                          FastPowf(d, encoded, Set(d, gamma)));
+  }
+};
+
+struct OpInvalid {  // Placeholder op: Transform leaves the samples untouched.
+  template <typename D, typename T>
+  void Transform(D d, T* r, T* g, T* b) const {}
+};
+
+template <typename Op>
+class ToLinearStage : public RenderPipelineStage {
+ public:
+  explicit ToLinearStage(Op op)
+      : RenderPipelineStage(RenderPipelineStage::Settings()),
+        op_(std::move(op)) {}
+
+  explicit ToLinearStage()  // Builds a stage whose IsInitialized() fails.
+      : RenderPipelineStage(RenderPipelineStage::Settings()), valid_(false) {}
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    PROFILER_ZONE("ToLinear");
+
+    const HWY_FULL(float) d;
+    const size_t xsize_v = RoundUpTo(xsize, Lanes(d));  // Whole vectors.
+    float* JXL_RESTRICT row0 = GetInputRow(input_rows, 0, 0);
+    float* JXL_RESTRICT row1 = GetInputRow(input_rows, 1, 0);
+    float* JXL_RESTRICT row2 = GetInputRow(input_rows, 2, 0);
+    // All calculations are lane-wise, but some may still have
+    // value-dependent behaviour (e.g. NearestInt). Temporarily unpoison the
+    // tail of the last vector; it is poisoned again after the loop.
+    msan::UnpoisonMemory(row0 + xsize, sizeof(float) * (xsize_v - xsize));
+    msan::UnpoisonMemory(row1 + xsize, sizeof(float) * (xsize_v - xsize));
+    msan::UnpoisonMemory(row2 + xsize, sizeof(float) * (xsize_v - xsize));
+    for (ssize_t x = -xextra; x < (ssize_t)(xsize + xextra); x += Lanes(d)) {
+      auto r = LoadU(d, row0 + x);  // Unaligned: x starts at -xextra.
+      auto g = LoadU(d, row1 + x);
+      auto b = LoadU(d, row2 + x);
+      op_.Transform(d, &r, &g, &b);  // Updates all three vectors in place.
+      StoreU(r, d, row0 + x);
+      StoreU(g, d, row1 + x);
+      StoreU(b, d, row2 + x);
+    }
+    msan::PoisonMemory(row0 + xsize, sizeof(float) * (xsize_v - xsize));
+    msan::PoisonMemory(row1 + xsize, sizeof(float) * (xsize_v - xsize));
+    msan::PoisonMemory(row2 + xsize, sizeof(float) * (xsize_v - xsize));
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return c < 3 ? RenderPipelineChannelMode::kInPlace
+                 : RenderPipelineChannelMode::kIgnored;
+  }
+
+  const char* GetName() const override { return "ToLinear"; }
+
+ private:
+  Status IsInitialized() const override { return valid_; }
+
+  Op op_;
+  bool valid_ = true;  // Cleared only by the default constructor.
+};
+
+template <typename Op>
+std::unique_ptr<ToLinearStage<Op>> MakeToLinearStage(Op&& op) {
+  // NOTE(review): an lvalue argument would deduce Op as a reference type.
+  return jxl::make_unique<ToLinearStage<Op>>(std::forward<Op>(op));
+}
+
+std::unique_ptr<RenderPipelineStage> GetToLinearStage(
+    const OutputEncodingInfo& output_encoding_info) {
+  // Pick the op matching the transfer function of the output color encoding.
+  const auto& tf = output_encoding_info.color_encoding.tf;
+  if (tf.IsLinear()) return MakeToLinearStage(MakePerChannelOp(OpLinear()));
+  if (tf.IsSRGB()) return MakeToLinearStage(MakePerChannelOp(OpRgb()));
+  if (tf.IsPQ()) return MakeToLinearStage(MakePerChannelOp(OpPq()));
+  if (tf.IsHLG()) {
+    // OpHlg transforms all three channels itself, so it is not wrapped.
+    return MakeToLinearStage(OpHlg(output_encoding_info.luminances,
+                                   output_encoding_info.orig_intensity_target));
+  }
+  if (tf.Is709()) return MakeToLinearStage(MakePerChannelOp(Op709()));
+  // Gamma and DCI share OpGamma, parameterized by 1 / inverse_gamma.
+  if (tf.IsGamma() || tf.IsDCI()) {
+    return MakeToLinearStage(
+        MakePerChannelOp(OpGamma{1.f / output_encoding_info.inverse_gamma}));
+  }
+  // No supported transfer function: the stage's IsInitialized() will fail.
+  return jxl::make_unique<ToLinearStage<OpInvalid>>();
+}
+
+}  // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(GetToLinearStage);
+
+std::unique_ptr<RenderPipelineStage> GetToLinearStage(
+    const OutputEncodingInfo& output_encoding_info) {  // Per-target dispatch.
+  return HWY_DYNAMIC_DISPATCH(GetToLinearStage)(output_encoding_info);
+}
+
+}  // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_to_linear.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_to_linear.h
new file mode 100644
index 0000000000..ccee7b09f0
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_to_linear.h
@@ -0,0 +1,21 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_TO_LINEAR_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_TO_LINEAR_H_
+
+#include "lib/jxl/dec_xyb.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+// Converts the color channels from `output_encoding_info.color_encoding` to
+// linear.
+std::unique_ptr<RenderPipelineStage> GetToLinearStage(
+    const OutputEncodingInfo& output_encoding_info);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_STAGE_TO_LINEAR_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_tone_mapping.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_tone_mapping.cc
new file mode 100644
index 0000000000..7609534a5b
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_tone_mapping.cc
@@ -0,0 +1,151 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_tone_mapping.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_tone_mapping.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/dec_tone_mapping-inl.h"
+#include "lib/jxl/dec_xyb-inl.h"
+#include "lib/jxl/sanitizers.h"
+#include "lib/jxl/transfer_functions-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+class ToneMappingStage : public RenderPipelineStage {
+ public:
+  explicit ToneMappingStage(OutputEncodingInfo output_encoding_info)
+      : RenderPipelineStage(RenderPipelineStage::Settings()),
+        output_encoding_info_(std::move(output_encoding_info)) {
+    if (output_encoding_info_.desired_intensity_target ==
+        output_encoding_info_.orig_intensity_target) {
+      // No tone mapping requested.
+      return;
+    }
+    if (output_encoding_info_.orig_color_encoding.tf.IsPQ() &&
+        output_encoding_info_.desired_intensity_target <
+            output_encoding_info_.orig_intensity_target) {
+      tone_mapper_ = jxl::make_unique<ToneMapper>(
+          /*source_range=*/std::pair<float, float>(
+              0, output_encoding_info_.orig_intensity_target),
+          /*target_range=*/
+          std::pair<float, float>(
+              0, output_encoding_info_.desired_intensity_target),
+          output_encoding_info_.luminances);
+    } else if (output_encoding_info_.orig_color_encoding.tf.IsHLG() &&
+               !output_encoding_info_.color_encoding.tf.IsHLG()) {
+      hlg_ootf_ = jxl::make_unique<HlgOOTF>(
+          /*source_luminance=*/output_encoding_info_.orig_intensity_target,
+          /*target_luminance=*/output_encoding_info_.desired_intensity_target,
+          output_encoding_info_.luminances);
+    }
+
+    if (output_encoding_info_.color_encoding.tf.IsPQ() &&
+        (tone_mapper_ || hlg_ootf_)) {  // Otherwise both factors stay 1.
+      to_intensity_target_ =
+          10000.f / output_encoding_info_.orig_intensity_target;
+      from_desired_intensity_target_ =
+          output_encoding_info_.desired_intensity_target / 10000.f;
+    }
+  }
+
+  bool IsNeeded() const { return tone_mapper_ || hlg_ootf_; }
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    PROFILER_ZONE("ToneMapping");
+
+    if (!(tone_mapper_ || hlg_ootf_)) return;  // Nothing to apply.
+
+    const HWY_FULL(float) d;
+    const size_t xsize_v = RoundUpTo(xsize, Lanes(d));  // Whole vectors.
+    float* JXL_RESTRICT row0 = GetInputRow(input_rows, 0, 0);
+    float* JXL_RESTRICT row1 = GetInputRow(input_rows, 1, 0);
+    float* JXL_RESTRICT row2 = GetInputRow(input_rows, 2, 0);
+    // All calculations are lane-wise, but some may still have
+    // value-dependent behaviour (e.g. NearestInt). Temporarily unpoison the
+    // tail of the last vector; it is poisoned again after the loop.
+    msan::UnpoisonMemory(row0 + xsize, sizeof(float) * (xsize_v - xsize));
+    msan::UnpoisonMemory(row1 + xsize, sizeof(float) * (xsize_v - xsize));
+    msan::UnpoisonMemory(row2 + xsize, sizeof(float) * (xsize_v - xsize));
+    for (ssize_t x = -xextra; x < (ssize_t)(xsize + xextra); x += Lanes(d)) {
+      auto r = LoadU(d, row0 + x);
+      auto g = LoadU(d, row1 + x);
+      auto b = LoadU(d, row2 + x);
+      if (tone_mapper_ || hlg_ootf_) {  // Always true after the early return.
+        r = Mul(r, Set(d, to_intensity_target_));
+        g = Mul(g, Set(d, to_intensity_target_));
+        b = Mul(b, Set(d, to_intensity_target_));
+        if (tone_mapper_) {
+          tone_mapper_->ToneMap(&r, &g, &b);
+        } else {
+          JXL_ASSERT(hlg_ootf_);
+          hlg_ootf_->Apply(&r, &g, &b);
+        }
+        if (tone_mapper_ || hlg_ootf_->WarrantsGamutMapping()) {
+          GamutMap(&r, &g, &b, output_encoding_info_.luminances);
+        }
+        r = Mul(r, Set(d, from_desired_intensity_target_));
+        g = Mul(g, Set(d, from_desired_intensity_target_));
+        b = Mul(b, Set(d, from_desired_intensity_target_));
+      }
+      StoreU(r, d, row0 + x);
+      StoreU(g, d, row1 + x);
+      StoreU(b, d, row2 + x);
+    }
+    msan::PoisonMemory(row0 + xsize, sizeof(float) * (xsize_v - xsize));
+    msan::PoisonMemory(row1 + xsize, sizeof(float) * (xsize_v - xsize));
+    msan::PoisonMemory(row2 + xsize, sizeof(float) * (xsize_v - xsize));
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return c < 3 ? RenderPipelineChannelMode::kInPlace
+                 : RenderPipelineChannelMode::kIgnored;
+  }
+
+  const char* GetName() const override { return "ToneMapping"; }
+
+ private:
+  using ToneMapper = Rec2408ToneMapper<HWY_FULL(float)>;
+  OutputEncodingInfo output_encoding_info_;
+  std::unique_ptr<ToneMapper> tone_mapper_;
+  std::unique_ptr<HlgOOTF> hlg_ootf_;
+  // When the target colorspace is PQ, 1 represents 10000 nits instead of
+  // orig_intensity_target. These factors rescale the samples before and
+  // after tone mapping in that case; otherwise both remain 1.
+  float to_intensity_target_ = 1.f;
+  float from_desired_intensity_target_ = 1.f;
+};
+
+std::unique_ptr<RenderPipelineStage> GetToneMappingStage(
+    const OutputEncodingInfo& output_encoding_info) {
+  auto stage = jxl::make_unique<ToneMappingStage>(output_encoding_info);
+  if (stage->IsNeeded()) return stage;
+  return nullptr;  // Neither a tone mapper nor an HLG OOTF was configured.
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(GetToneMappingStage);
+
+std::unique_ptr<RenderPipelineStage> GetToneMappingStage(
+    const OutputEncodingInfo& output_encoding_info) {  // Per-target dispatch.
+  return HWY_DYNAMIC_DISPATCH(GetToneMappingStage)(output_encoding_info);
+}
+
+}  // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_tone_mapping.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_tone_mapping.h
new file mode 100644
index 0000000000..99824f8511
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_tone_mapping.h
@@ -0,0 +1,37 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_TONE_MAPPING_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_TONE_MAPPING_H_
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/dec_xyb.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+// Tone maps the image if appropriate. It must be in linear space and
+// `output_encoding_info.luminances` must contain the luminance for the
+// primaries of that space. It must also be encoded such that (1, 1, 1)
+// represents `output_encoding_info.orig_intensity_target` nits, unless
+// `output_encoding_info.color_encoding.tf.IsPQ()`, in which case (1, 1, 1) must
+// represent 10000 nits. This corresponds to what XYBStage outputs. After this
+// stage, (1, 1, 1) will represent
+// `output_encoding_info.desired_intensity_target` nits, except in the PQ
+// special case in which it remains 10000.
+//
+// If no tone mapping is necessary, this will return nullptr.
+std::unique_ptr<RenderPipelineStage> GetToneMappingStage(
+    const OutputEncodingInfo& output_encoding_info);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_STAGE_TONE_MAPPING_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_upsampling.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_upsampling.cc
new file mode 100644
index 0000000000..a75e259865
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_upsampling.cc
@@ -0,0 +1,187 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_upsampling.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_upsampling.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/sanitizers.h"
+#include "lib/jxl/simd_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Clamp;
+using hwy::HWY_NAMESPACE::Max;
+using hwy::HWY_NAMESPACE::Min;
+using hwy::HWY_NAMESPACE::MulAdd;
+
+class UpsamplingStage : public RenderPipelineStage {
+ public:
+  explicit UpsamplingStage(const CustomTransformData& ups_factors, size_t c,
+                           size_t shift)
+      : RenderPipelineStage(RenderPipelineStage::Settings::Symmetric(
+            /*shift=*/shift, /*border=*/2)),
+        c_(c) {
+    const float* weights = shift == 1   ? ups_factors.upsampling2_weights
+                           : shift == 2 ? ups_factors.upsampling4_weights
+                                        : ups_factors.upsampling8_weights;
+    size_t N = 1 << (shift - 1);  // Half the upsampling factor.
+    for (size_t i = 0; i < 5 * N; i++) {
+      for (size_t j = 0; j < 5 * N; j++) {
+        size_t y = std::min(i, j);  // (y, x) with y <= x: `weights` stores
+        size_t x = std::max(i, j);  // only the packed upper triangle.
+        kernel_[j / 5][i / 5][j % 5][i % 5] =
+            weights[5 * N * y - y * (y - 1) / 2 + x - y];
+      }
+    }
+  }
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    PROFILER_ZONE("Upsampling");
+    static HWY_FULL(float) df;
+    size_t shift = settings_.shift_x;
+    size_t N = 1 << shift;  // Upsampling factor: 2, 4 or 8.
+    const size_t xsize_v = RoundUpTo(xsize, Lanes(df));
+    for (ssize_t iy = -2; iy <= 2; iy++) {  // The five input rows read below.
+      msan::UnpoisonMemory(GetInputRow(input_rows, c_, iy) + xsize + 2,
+                           sizeof(float) * (xsize_v - xsize));
+    }
+    JXL_ASSERT(xextra == 0);
+    ssize_t x0 = 0;
+    ssize_t x1 = xsize;
+    if (N == 2) {
+      ProcessRowImpl<2>(input_rows, output_rows, x0, x1);
+    }
+    if (N == 4) {
+      ProcessRowImpl<4>(input_rows, output_rows, x0, x1);
+    }
+    if (N == 8) {
+      ProcessRowImpl<8>(input_rows, output_rows, x0, x1);
+    }
+    for (size_t oy = 0; oy < N; oy++) {  // Re-poison the output row tails.
+      float* dst_row = GetOutputRow(output_rows, c_, oy);
+      msan::PoisonMemory(dst_row + xsize * N,
+                         sizeof(float) * (xsize_v - xsize) * N);
+    }
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return c == c_ ? RenderPipelineChannelMode::kInOut
+                   : RenderPipelineChannelMode::kIgnored;
+  }
+
+  const char* GetName() const override { return "Upsample"; }
+
+ private:
+  template <size_t N>
+  JXL_INLINE float Kernel(size_t x, size_t y, ssize_t ix, ssize_t iy) const {
+    ix += 2;  // Callers pass tap offsets in [-2, 2]; shift to indices [0, 4].
+    iy += 2;
+    if (N == 2) {  // Odd output positions read the stored kernel mirrored.
+      return kernel_[0][0][y % 2 ? 4 - iy : iy][x % 2 ? 4 - ix : ix];
+    }
+    if (N == 4) {  // Second half of each axis mirrors the first half.
+      return kernel_[y % 4 < 2 ? y % 2 : 1 - y % 2]
+                    [x % 4 < 2 ? x % 2 : 1 - x % 2][y % 4 < 2 ? iy : 4 - iy]
+                    [x % 4 < 2 ? ix : 4 - ix];
+    }
+    if (N == 8) {  // Same mirroring with four stored kernels per axis.
+      return kernel_[y % 8 < 4 ? y % 4 : 3 - y % 4]
+                    [x % 8 < 4 ? x % 4 : 3 - x % 4][y % 8 < 4 ? iy : 4 - iy]
+                    [x % 8 < 4 ? ix : 4 - ix];
+    }
+    JXL_ABORT("Invalid upsample");
+  }
+
+  template <ssize_t N>
+  void ProcessRowImpl(const RowInfo& input_rows, const RowInfo& output_rows,
+                      ssize_t x0, ssize_t x1) const {
+    static HWY_FULL(float) df;
+    using V = hwy::HWY_NAMESPACE::Vec<HWY_FULL(float)>;
+    V ups0, ups1, ups2, ups3, ups4, ups5, ups6, ups7;
+    (void)ups2, (void)ups3, (void)ups4, (void)ups5, (void)ups6, (void)ups7;
+    V* ups[N];  // ups[ox] points at the vector for output sub-column ox.
+    if (N >= 2) {
+      ups[0] = &ups0;
+      ups[1] = &ups1;
+    }
+    if (N >= 4) {
+      ups[2] = &ups2;
+      ups[3] = &ups3;
+    }
+    if (N == 8) {
+      ups[4] = &ups4;
+      ups[5] = &ups5;
+      ups[6] = &ups6;
+      ups[7] = &ups7;
+    }
+    for (size_t oy = 0; oy < N; oy++) {  // One output row per sub-row oy.
+      float* dst_row = GetOutputRow(output_rows, c_, oy);
+      for (ssize_t x = x0; x < x1; x += Lanes(df)) {
+        for (size_t ox = 0; ox < N; ox++) {
+          auto result = Zero(df);
+          auto min = LoadU(df, GetInputRow(input_rows, c_, 0) + x);
+          auto max = min;
+          for (ssize_t iy = -2; iy <= 2; iy++) {  // 5x5 input neighborhood.
+            for (ssize_t ix = -2; ix <= 2; ix++) {
+              auto v = LoadU(df, GetInputRow(input_rows, c_, iy) + x + ix);
+              result = MulAdd(Set(df, Kernel<N>(ox, oy, ix, iy)), v, result);
+              min = Min(v, min);
+              max = Max(v, max);
+            }
+          }
+          // Avoid overshooting: clamp to the range of the 5x5 neighborhood.
+          *ups[ox] = Clamp(result, min, max);
+        }
+        if (N == 2) {
+          StoreInterleaved(df, ups0, ups1, dst_row + x * N);
+        }
+        if (N == 4) {
+          StoreInterleaved(df, ups0, ups1, ups2, ups3, dst_row + x * N);
+        }
+        if (N == 8) {
+          StoreInterleaved(df, ups0, ups1, ups2, ups3, ups4, ups5, ups6, ups7,
+                           dst_row + x * N);
+        }
+      }
+    }
+  }
+
+  size_t c_;  // Index of the single channel this stage upsamples.
+  float kernel_[4][4][5][5];  // [kernel y][kernel x][tap y][tap x].
+};
+
+std::unique_ptr<RenderPipelineStage> GetUpsamplingStage(
+    const CustomTransformData& ups_factors, size_t c, size_t shift) {
+  // The stage copies the weights it needs from `ups_factors` at construction.
+  return jxl::make_unique<UpsamplingStage>(ups_factors, c, shift);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(GetUpsamplingStage);
+
+std::unique_ptr<RenderPipelineStage> GetUpsamplingStage(
+    const CustomTransformData& ups_factors, size_t c, size_t shift) {
+  JXL_ASSERT(shift != 0);  // Only shifts 1..3 are accepted.
+  JXL_ASSERT(shift <= 3);
+  return HWY_DYNAMIC_DISPATCH(GetUpsamplingStage)(ups_factors, c, shift);
+}
+
+}  // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_upsampling.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_upsampling.h
new file mode 100644
index 0000000000..7d5defd23c
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_upsampling.h
@@ -0,0 +1,26 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_UPSAMPLING_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_UPSAMPLING_H_
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/image_metadata.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+// Upsamples the given channel by the given factor.
+std::unique_ptr<RenderPipelineStage> GetUpsamplingStage(
+    const CustomTransformData& ups_factors, size_t c, size_t shift);
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_STAGE_UPSAMPLING_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_write.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_write.cc
new file mode 100644
index 0000000000..902fc33b7e
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_write.cc
@@ -0,0 +1,601 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_write.h"
+
+#include "lib/jxl/alpha.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_cache.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/sanitizers.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_write.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Clamp;
+using hwy::HWY_NAMESPACE::Div;
+using hwy::HWY_NAMESPACE::Max;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::NearestInt;
+using hwy::HWY_NAMESPACE::Or;
+using hwy::HWY_NAMESPACE::Rebind;
+using hwy::HWY_NAMESPACE::ShiftLeftSame;
+using hwy::HWY_NAMESPACE::ShiftRightSame;
+
+class WriteToOutputStage : public RenderPipelineStage {
+ public:
+  WriteToOutputStage(const ImageOutput& main_output, size_t width,
+                     size_t height, bool has_alpha, bool unpremul_alpha,
+                     size_t alpha_c, Orientation undo_orientation,
+                     const std::vector<ImageOutput>& extra_output)
+      : RenderPipelineStage(RenderPipelineStage::Settings()),
+        width_(width),
+        height_(height),
+        main_(main_output),
+        num_color_(main_.num_channels_ < 3 ? 1 : 3),
+        want_alpha_(main_.num_channels_ == 2 || main_.num_channels_ == 4),
+        has_alpha_(has_alpha),
+        unpremul_alpha_(unpremul_alpha),
+        alpha_c_(alpha_c),
+        flip_x_(ShouldFlipX(undo_orientation)),
+        flip_y_(ShouldFlipY(undo_orientation)),
+        transpose_(ShouldTranspose(undo_orientation)),
+        opaque_alpha_(kMaxPixelsPerCall, 1.0f) {
+    for (size_t ec = 0; ec < extra_output.size(); ++ec) {
+      const ImageOutput& requested = extra_output[ec];
+      if (!requested.callback.IsPresent() && !requested.buffer) continue;
+      extra_channels_.emplace_back(requested);
+      // Extra channel `ec` is read from pipeline channel 3 + ec.
+      extra_channels_.back().channel_index_ = 3 + ec;
+    }
+  }
+
+  WriteToOutputStage(const WriteToOutputStage&) = delete;
+  WriteToOutputStage& operator=(const WriteToOutputStage&) = delete;
+  WriteToOutputStage(WriteToOutputStage&&) = delete;
+  WriteToOutputStage& operator=(WriteToOutputStage&&) = delete;
+
+  ~WriteToOutputStage() override {
+    if (main_.run_opaque_) {  // non-null only after a successful Init()
+      main_.pixel_callback_.destroy(main_.run_opaque_);
+    }
+    for (auto& extra : extra_channels_) {
+      if (extra.run_opaque_) {
+        extra.pixel_callback_.destroy(extra.run_opaque_);
+      }
+    }
+  }
+
+  // Converts the visible part of one pipeline row to the requested sample
+  // format and delivers it to the main output and to each extra output.
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    JXL_DASSERT(xextra == 0);
+    JXL_DASSERT(main_.run_opaque_ || main_.buffer_);
+    // Rows or columns that lie entirely outside the image are dropped.
+    if (ypos >= height_ || xpos >= width_) return;
+    if (flip_y_) ypos = height_ - 1u - ypos;
+    const size_t limit = std::min(xsize, width_ - xpos);
+    // The row is handled in chunks of at most kMaxPixelsPerCall pixels.
+    for (size_t x0 = 0; x0 < limit; x0 += kMaxPixelsPerCall) {
+      const size_t xstart = xpos + x0;
+      const size_t len = std::min<size_t>(kMaxPixelsPerCall, limit - x0);
+
+      const float* line_buffers[4];
+      for (size_t c = 0; c < num_color_; c++) {
+        line_buffers[c] = GetInputRow(input_rows, c, 0) + x0;
+      }
+      // Alpha slot: the decoded alpha row, or a constant row of 1.0f.
+      line_buffers[num_color_] =
+          has_alpha_ ? GetInputRow(input_rows, alpha_c_, 0) + x0
+                     : opaque_alpha_.data();
+      if (has_alpha_ && want_alpha_ && unpremul_alpha_) {
+        UnpremulAlpha(thread_id, len, line_buffers);
+      }
+      OutputBuffers(main_, thread_id, ypos, xstart, len, line_buffers);
+      // Extra channels are emitted one at a time through slot 0.
+      for (const auto& extra : extra_channels_) {
+        line_buffers[0] = GetInputRow(input_rows, extra.channel_index_, 0) + x0;
+        OutputBuffers(extra, thread_id, ypos, xstart, len, line_buffers);
+      }
+    }
+  }
+
+  // Declares which channels this stage reads: the color channels, the alpha
+  // channel when the image has one, and every requested extra channel.
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    bool is_read = c < num_color_ || (has_alpha_ && c == alpha_c_);
+    for (size_t i = 0; !is_read && i < extra_channels_.size(); ++i) {
+      is_read = (extra_channels_[i].channel_index_ == c);
+    }
+    // Every other channel is ignored by this stage.
+    return is_read ? RenderPipelineChannelMode::kInput
+                   : RenderPipelineChannelMode::kIgnored;
+  }
+
+  const char* GetName() const override { return "WritePixelCB"; }
+
+ private:
+  struct Output {  // one destination (callback or buffer) plus its format
+    explicit Output(const ImageOutput& image_out)
+        : pixel_callback_(image_out.callback),
+          buffer_(image_out.buffer),
+          buffer_size_(image_out.buffer_size),
+          stride_(image_out.stride),
+          num_channels_(image_out.format.num_channels),
+          swap_endianness_(SwapEndianness(image_out.format.endianness)),
+          data_type_(image_out.format.data_type),
+          bits_per_sample_(image_out.bits_per_sample) {}
+
+    Status PrepareForThreads(size_t num_threads) {
+      if (pixel_callback_.IsPresent()) {
+        run_opaque_ =
+            pixel_callback_.Init(num_threads, /*num_pixels=*/kMaxPixelsPerCall);
+        JXL_RETURN_IF_ERROR(run_opaque_ != nullptr);
+      } else {
+        JXL_RETURN_IF_ERROR(buffer_ != nullptr);  // no callback: need a buffer
+      }
+      return true;
+    }
+
+    PixelCallback pixel_callback_;
+    void* run_opaque_ = nullptr;  // callback state returned by Init()
+    void* buffer_ = nullptr;
+    size_t buffer_size_;
+    size_t stride_;  // byte distance between rows of buffer_
+    size_t num_channels_;
+    bool swap_endianness_;
+    JxlDataType data_type_;
+    size_t bits_per_sample_;
+    size_t channel_index_ = 0;  // used for extra_channels; unused for main_
+  };
+
+  Status PrepareForThreads(size_t num_threads) override {
+    JXL_RETURN_IF_ERROR(main_.PrepareForThreads(num_threads));
+    for (auto& extra : extra_channels_) {
+      JXL_RETURN_IF_ERROR(extra.PrepareForThreads(num_threads));
+    }
+    temp_out_.resize(num_threads);  // one interleaved output buffer per thread
+    for (CacheAlignedUniquePtr& temp : temp_out_) {
+      temp = AllocateArray(sizeof(float) * kMaxPixelsPerCall *
+                           main_.num_channels_);
+    }
+    if ((has_alpha_ && want_alpha_ && unpremul_alpha_) || flip_x_) {
+      temp_in_.resize(num_threads * main_.num_channels_);  // UnpremulAlpha/FlipX
+      for (CacheAlignedUniquePtr& temp : temp_in_) {
+        temp = AllocateArray(sizeof(float) * kMaxPixelsPerCall);  // one row
+      }
+    }
+    return true;
+  }
+  // True for orientations whose undo requires mirroring each row.
+  static bool ShouldFlipX(Orientation undo_orientation) {
+    const Orientation o = undo_orientation;
+    return o == Orientation::kFlipHorizontal || o == Orientation::kRotate180 ||
+           o == Orientation::kRotate270 || o == Orientation::kAntiTranspose;
+  }
+  // True for orientations whose undo requires reversing the row order.
+  static bool ShouldFlipY(Orientation undo_orientation) {
+    const Orientation o = undo_orientation;
+    return o == Orientation::kFlipVertical || o == Orientation::kRotate180 ||
+           o == Orientation::kRotate90 || o == Orientation::kAntiTranspose;
+  }
+  // True for orientations whose undo requires swapping the x and y axes.
+  static bool ShouldTranspose(Orientation undo_orientation) {
+    const Orientation o = undo_orientation;
+    return o == Orientation::kTranspose || o == Orientation::kRotate90 ||
+           o == Orientation::kRotate270 || o == Orientation::kAntiTranspose;
+  }
+
+  void UnpremulAlpha(size_t thread_id, size_t len,
+                     const float** line_buffers) const {
+    const HWY_FULL(float) d;
+    auto one = Set(d, 1.0f);
+    float* temp_in[4];  // scratch copies; the input rows are never written
+    for (size_t c = 0; c < main_.num_channels_; ++c) {
+      size_t tix = thread_id * main_.num_channels_ + c;  // this thread's row c
+      temp_in[c] = reinterpret_cast<float*>(temp_in_[tix].get());
+      memcpy(temp_in[c], line_buffers[c], sizeof(float) * len);
+    }
+    auto small_alpha = Set(d, kSmallAlpha);
+    for (size_t ix = 0; ix < len; ix += Lanes(d)) {
+      auto alpha = LoadU(d, temp_in[num_color_] + ix);
+      auto mul = Div(one, Max(small_alpha, alpha));  // 1 / max(alpha, small)
+      for (size_t c = 0; c < num_color_; ++c) {
+        auto val = LoadU(d, temp_in[c] + ix);
+        StoreU(Mul(val, mul), d, temp_in[c] + ix);
+      }
+    }
+    for (size_t c = 0; c < main_.num_channels_; ++c) {
+      line_buffers[c] = temp_in[c];  // caller continues with the copies
+    }
+  }
+
+  void OutputBuffers(const Output& out, size_t thread_id, size_t ypos,
+                     size_t xstart, size_t len, const float* input[4]) const {
+    if (flip_x_) {
+      FlipX(out, thread_id, len, &xstart, input);  // also mirrors xstart
+    }
+    if (out.data_type_ == JXL_TYPE_UINT8) {
+      uint8_t* JXL_RESTRICT temp =
+          reinterpret_cast<uint8_t*>(temp_out_[thread_id].get());
+      StoreUnsignedRow(out, input, len, temp);
+      WriteToOutput(out, thread_id, ypos, xstart, len, temp);
+    } else if (out.data_type_ == JXL_TYPE_UINT16 ||
+               out.data_type_ == JXL_TYPE_FLOAT16) {
+      uint16_t* JXL_RESTRICT temp =
+          reinterpret_cast<uint16_t*>(temp_out_[thread_id].get());
+      if (out.data_type_ == JXL_TYPE_UINT16) {
+        StoreUnsignedRow(out, input, len, temp);
+      } else {
+        StoreFloat16Row(out, input, len, temp);
+      }
+      if (out.swap_endianness_) {
+        const HWY_FULL(uint16_t) du;  // swap the two bytes of every sample
+        size_t output_len = len * out.num_channels_;
+        for (size_t j = 0; j < output_len; j += Lanes(du)) {
+          auto v = LoadU(du, temp + j);
+          auto vswap = Or(ShiftRightSame(v, 8), ShiftLeftSame(v, 8));
+          StoreU(vswap, du, temp + j);
+        }
+      }
+      WriteToOutput(out, thread_id, ypos, xstart, len, temp);
+    } else if (out.data_type_ == JXL_TYPE_FLOAT) {
+      float* JXL_RESTRICT temp =
+          reinterpret_cast<float*>(temp_out_[thread_id].get());
+      StoreFloatRow(out, input, len, temp);
+      if (out.swap_endianness_) {
+        size_t output_len = len * out.num_channels_;
+        for (size_t j = 0; j < output_len; ++j) {
+          temp[j] = BSwapFloat(temp[j]);
+        }
+      }
+      WriteToOutput(out, thread_id, ypos, xstart, len, temp);
+    }  // any other data type writes nothing
+  }
+
+  // Mirrors the chunk horizontally: copies each channel of `out` into this
+  // thread's scratch rows, reverses them, repoints line_buffers and xstart.
+  void FlipX(const Output& out, size_t thread_id, size_t len, size_t* xstart,
+             const float** line_buffers) const {
+    const size_t nc = out.num_channels_;
+    const size_t first_slot = thread_id * main_.num_channels_;
+    float* scratch[4];
+    for (size_t c = 0; c < nc; ++c) {
+      scratch[c] = reinterpret_cast<float*>(temp_in_[first_slot + c].get());
+      // Skip the copy when the row already lives in the scratch buffer.
+      if (scratch[c] == line_buffers[c]) continue;
+      memcpy(scratch[c], line_buffers[c], sizeof(float) * len);
+    }
+    for (size_t c = 0; c < nc; ++c) {
+      std::reverse(scratch[c], scratch[c] + len);
+    }
+    for (size_t c = 0; c < nc; ++c) {
+      line_buffers[c] = scratch[c];
+    }
+    // The chunk now starts at the mirrored column of the output row.
+    *xstart = width_ - *xstart - len;
+  }
+
+  template <typename T>  // called with uint8_t and uint16_t by OutputBuffers
+  void StoreUnsignedRow(const Output& out, const float* input[4], size_t len,
+                        T* output) const {
+    const HWY_FULL(float) d;
+    auto zero = Zero(d);
+    auto one = Set(d, 1.0f);
+    auto mul = Set(d, (1u << (out.bits_per_sample_)) - 1);  // 2^bits - 1
+    const Rebind<T, decltype(d)> du;
+    const size_t padding = RoundUpTo(len, Lanes(d)) - len;  // lanes past len
+    for (size_t c = 0; c < out.num_channels_; ++c) {
+      msan::UnpoisonMemory(input[c] + len, sizeof(input[c][0]) * padding);
+    }
+    if (out.num_channels_ == 1) {  // clamp to [0, 1], scale, round, narrow
+      for (size_t i = 0; i < len; i += Lanes(d)) {
+        auto v0 = Mul(Clamp(zero, LoadU(d, &input[0][i]), one), mul);
+        StoreU(DemoteTo(du, NearestInt(v0)), du, &output[i]);
+      }
+    } else if (out.num_channels_ == 2) {  // same, interleaving the channels
+      for (size_t i = 0; i < len; i += Lanes(d)) {
+        auto v0 = Mul(Clamp(zero, LoadU(d, &input[0][i]), one), mul);
+        auto v1 = Mul(Clamp(zero, LoadU(d, &input[1][i]), one), mul);
+        StoreInterleaved2(DemoteTo(du, NearestInt(v0)),
+                          DemoteTo(du, NearestInt(v1)), du, &output[2 * i]);
+      }
+    } else if (out.num_channels_ == 3) {
+      for (size_t i = 0; i < len; i += Lanes(d)) {
+        auto v0 = Mul(Clamp(zero, LoadU(d, &input[0][i]), one), mul);
+        auto v1 = Mul(Clamp(zero, LoadU(d, &input[1][i]), one), mul);
+        auto v2 = Mul(Clamp(zero, LoadU(d, &input[2][i]), one), mul);
+        StoreInterleaved3(DemoteTo(du, NearestInt(v0)),
+                          DemoteTo(du, NearestInt(v1)),
+                          DemoteTo(du, NearestInt(v2)), du, &output[3 * i]);
+      }
+    } else if (out.num_channels_ == 4) {
+      for (size_t i = 0; i < len; i += Lanes(d)) {
+        auto v0 = Mul(Clamp(zero, LoadU(d, &input[0][i]), one), mul);
+        auto v1 = Mul(Clamp(zero, LoadU(d, &input[1][i]), one), mul);
+        auto v2 = Mul(Clamp(zero, LoadU(d, &input[2][i]), one), mul);
+        auto v3 = Mul(Clamp(zero, LoadU(d, &input[3][i]), one), mul);
+        StoreInterleaved4(DemoteTo(du, NearestInt(v0)),
+                          DemoteTo(du, NearestInt(v1)),
+                          DemoteTo(du, NearestInt(v2)),
+                          DemoteTo(du, NearestInt(v3)), du, &output[4 * i]);
+      }
+    }
+    msan::PoisonMemory(output + out.num_channels_ * len,  // padded tail
+                       sizeof(output[0]) * out.num_channels_ * padding);
+  }
+
+  void StoreFloat16Row(const Output& out, const float* input[4], size_t len,
+                       uint16_t* output) const {
+    const HWY_FULL(float) d;
+    const Rebind<uint16_t, decltype(d)> du;
+    const Rebind<hwy::float16_t, decltype(d)> df16;
+    const size_t padding = RoundUpTo(len, Lanes(d)) - len;  // lanes past len
+    for (size_t c = 0; c < out.num_channels_; ++c) {
+      msan::UnpoisonMemory(input[c] + len, sizeof(input[c][0]) * padding);
+    }
+    if (out.num_channels_ == 1) {  // demote to float16, store its raw bits
+      for (size_t i = 0; i < len; i += Lanes(d)) {
+        auto v0 = LoadU(d, &input[0][i]);
+        StoreU(BitCast(du, DemoteTo(df16, v0)), du, &output[i]);
+      }
+    } else if (out.num_channels_ == 2) {  // same, interleaving the channels
+      for (size_t i = 0; i < len; i += Lanes(d)) {
+        auto v0 = LoadU(d, &input[0][i]);
+        auto v1 = LoadU(d, &input[1][i]);
+        StoreInterleaved2(BitCast(du, DemoteTo(df16, v0)),
+                          BitCast(du, DemoteTo(df16, v1)), du, &output[2 * i]);
+      }
+    } else if (out.num_channels_ == 3) {
+      for (size_t i = 0; i < len; i += Lanes(d)) {
+        auto v0 = LoadU(d, &input[0][i]);
+        auto v1 = LoadU(d, &input[1][i]);
+        auto v2 = LoadU(d, &input[2][i]);
+        StoreInterleaved3(BitCast(du, DemoteTo(df16, v0)),
+                          BitCast(du, DemoteTo(df16, v1)),
+                          BitCast(du, DemoteTo(df16, v2)), du, &output[3 * i]);
+      }
+    } else if (out.num_channels_ == 4) {
+      for (size_t i = 0; i < len; i += Lanes(d)) {
+        auto v0 = LoadU(d, &input[0][i]);
+        auto v1 = LoadU(d, &input[1][i]);
+        auto v2 = LoadU(d, &input[2][i]);
+        auto v3 = LoadU(d, &input[3][i]);
+        StoreInterleaved4(BitCast(du, DemoteTo(df16, v0)),
+                          BitCast(du, DemoteTo(df16, v1)),
+                          BitCast(du, DemoteTo(df16, v2)),
+                          BitCast(du, DemoteTo(df16, v3)), du, &output[4 * i]);
+      }
+    }
+    msan::PoisonMemory(output + out.num_channels_ * len,  // padded tail
+                       sizeof(output[0]) * out.num_channels_ * padding);
+  }
+
+  void StoreFloatRow(const Output& out, const float* input[4], size_t len,
+                     float* output) const {
+    const HWY_FULL(float) d;
+    if (out.num_channels_ == 1) {
+      memcpy(output, input[0], len * sizeof(output[0]));  // nothing to interleave
+    } else if (out.num_channels_ == 2) {
+      for (size_t i = 0; i < len; i += Lanes(d)) {
+        StoreInterleaved2(LoadU(d, &input[0][i]), LoadU(d, &input[1][i]), d,
+                          &output[2 * i]);
+      }
+    } else if (out.num_channels_ == 3) {
+      for (size_t i = 0; i < len; i += Lanes(d)) {
+        StoreInterleaved3(LoadU(d, &input[0][i]), LoadU(d, &input[1][i]),
+                          LoadU(d, &input[2][i]), d, &output[3 * i]);
+      }
+    } else {  // any other channel count is handled as four channels
+      for (size_t i = 0; i < len; i += Lanes(d)) {
+        StoreInterleaved4(LoadU(d, &input[0][i]), LoadU(d, &input[1][i]),
+                          LoadU(d, &input[2][i]), LoadU(d, &input[3][i]), d,
+                          &output[4 * i]);
+      }
+    }
+  }
+
+  template <typename T>  // T is the sample type already stored in `output`
+  void WriteToOutput(const Output& out, size_t thread_id, size_t ypos,
+                     size_t xstart, size_t len, T* output) const {
+    if (transpose_) {  // x and y swap roles; pixels are emitted one by one
+      // TODO(szabadka) Buffer 8x8 chunks and transpose with SIMD.
+      if (out.run_opaque_) {
+        for (size_t i = 0, j = 0; i < len; ++i, j += out.num_channels_) {
+          out.pixel_callback_.run(out.run_opaque_, thread_id, ypos, xstart + i,
+                                  1, output + j);
+        }
+      } else {
+        const size_t pixel_stride = out.num_channels_ * sizeof(T);  // bytes
+        const size_t offset = xstart * out.stride_ + ypos * pixel_stride;
+        for (size_t i = 0, j = 0; i < len; ++i, j += out.num_channels_) {
+          const size_t ix = offset + i * out.stride_;  // next output row
+          JXL_DASSERT(ix + pixel_stride <= out.buffer_size_);
+          memcpy(reinterpret_cast<uint8_t*>(out.buffer_) + ix, output + j,
+                 pixel_stride);
+        }
+      }
+    } else {  // contiguous run: a single callback call or a single memcpy
+      if (out.run_opaque_) {
+        out.pixel_callback_.run(out.run_opaque_, thread_id, xstart, ypos, len,
+                                output);
+      } else {
+        const size_t pixel_stride = out.num_channels_ * sizeof(T);  // bytes
+        const size_t offset = ypos * out.stride_ + xstart * pixel_stride;
+        JXL_DASSERT(offset + len * pixel_stride <= out.buffer_size_);
+        memcpy(reinterpret_cast<uint8_t*>(out.buffer_) + offset, output,
+               len * pixel_stride);
+      }
+    }
+  }
+
+  static constexpr size_t kMaxPixelsPerCall = 1024;
+  size_t width_;
+  size_t height_;
+  Output main_;  // color + alpha
+  size_t num_color_;
+  bool want_alpha_;
+  bool has_alpha_;
+  bool unpremul_alpha_;
+  size_t alpha_c_;
+  bool flip_x_;
+  bool flip_y_;
+  bool transpose_;
+  std::vector<Output> extra_channels_;
+  std::vector<float> opaque_alpha_;
+  std::vector<CacheAlignedUniquePtr> temp_in_;
+  std::vector<CacheAlignedUniquePtr> temp_out_;
+};
+
+constexpr size_t WriteToOutputStage::kMaxPixelsPerCall;
+
+std::unique_ptr<RenderPipelineStage> GetWriteToOutputStage(
+    const ImageOutput& main_output, size_t width, size_t height, bool has_alpha,
+    bool unpremul_alpha, size_t alpha_c, Orientation undo_orientation,
+    std::vector<ImageOutput>& extra_output) {
+  return jxl::make_unique<WriteToOutputStage>(
+      main_output, width, height, has_alpha, unpremul_alpha, alpha_c,
+      undo_orientation, extra_output);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace jxl {
+
+HWY_EXPORT(GetWriteToOutputStage);
+
+namespace {
+class WriteToImageBundleStage : public RenderPipelineStage {
+ public:
+  explicit WriteToImageBundleStage(ImageBundle* image_bundle,
+                                   ColorEncoding color_encoding)
+      : RenderPipelineStage(RenderPipelineStage::Settings()),
+        image_bundle_(image_bundle),
+        color_encoding_(std::move(color_encoding)) {}
+
+  void SetInputSizes(
+      const std::vector<std::pair<size_t, size_t>>& input_sizes) override {
+#if JXL_ENABLE_ASSERT
+    JXL_ASSERT(input_sizes.size() >= 3);
+    for (size_t c = 1; c < input_sizes.size(); c++) {
+      JXL_ASSERT(input_sizes[c].first == input_sizes[0].first);
+      JXL_ASSERT(input_sizes[c].second == input_sizes[0].second);
+    }
+#endif
+    // TODO(eustas): what should we do in the case of "want only ECs"?
+    image_bundle_->SetFromImage(  // replaces the color image of the bundle
+        Image3F(input_sizes[0].first, input_sizes[0].second), color_encoding_);
+    // TODO(veluca): consider not reallocating ECs if not needed.
+    image_bundle_->extra_channels().clear();
+    for (size_t c = 3; c < input_sizes.size(); c++) {  // inputs 3.. are ECs
+      image_bundle_->extra_channels().emplace_back(input_sizes[c].first,
+                                                   input_sizes[c].second);
+    }
+  }
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    for (size_t c = 0; c < 3; c++) {  // color planes, including the x border
+      memcpy(image_bundle_->color()->PlaneRow(c, ypos) + xpos - xextra,
+             GetInputRow(input_rows, c, 0) - xextra,
+             sizeof(float) * (xsize + 2 * xextra));
+    }
+    for (size_t ec = 0; ec < image_bundle_->extra_channels().size(); ec++) {
+      JXL_ASSERT(image_bundle_->extra_channels()[ec].xsize() >=
+                 xpos + xsize + xextra);
+      memcpy(image_bundle_->extra_channels()[ec].Row(ypos) + xpos - xextra,
+             GetInputRow(input_rows, 3 + ec, 0) - xextra,
+             sizeof(float) * (xsize + 2 * xextra));
+    }
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return RenderPipelineChannelMode::kInput;  // every channel is read
+  }
+
+  const char* GetName() const override { return "WriteIB"; }
+
+ private:
+  ImageBundle* image_bundle_;  // target bundle, reallocated in SetInputSizes
+  ColorEncoding color_encoding_;
+};
+
+class WriteToImage3FStage : public RenderPipelineStage {
+ public:
+  explicit WriteToImage3FStage(Image3F* image)
+      : RenderPipelineStage(RenderPipelineStage::Settings()), image_(image) {}
+
+  void SetInputSizes(
+      const std::vector<std::pair<size_t, size_t>>& input_sizes) override {
+#if JXL_ENABLE_ASSERT
+    JXL_ASSERT(input_sizes.size() >= 3);
+    for (size_t c = 1; c < 3; ++c) {
+      JXL_ASSERT(input_sizes[c].first == input_sizes[0].first);
+      JXL_ASSERT(input_sizes[c].second == input_sizes[0].second);
+    }
+#endif
+    *image_ = Image3F(input_sizes[0].first, input_sizes[0].second);  // realloc
+  }
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    for (size_t c = 0; c < 3; c++) {  // copy the row including its x border
+      memcpy(image_->PlaneRow(c, ypos) + xpos - xextra,
+             GetInputRow(input_rows, c, 0) - xextra,
+             sizeof(float) * (xsize + 2 * xextra));
+    }
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return c < 3 ? RenderPipelineChannelMode::kInput  // color channels only
+                 : RenderPipelineChannelMode::kIgnored;
+  }
+
+  const char* GetName() const override { return "WriteI3F"; }
+
+ private:
+  Image3F* image_;  // target image, reallocated in SetInputSizes
+};
+
+}  // namespace
+
+std::unique_ptr<RenderPipelineStage> GetWriteToImageBundleStage(
+    ImageBundle* image_bundle, ColorEncoding color_encoding) {
+  return jxl::make_unique<WriteToImageBundleStage>(image_bundle,
+                                                   std::move(color_encoding));
+}
+
+std::unique_ptr<RenderPipelineStage> GetWriteToImage3FStage(Image3F* image) {
+  return jxl::make_unique<WriteToImage3FStage>(image);
+}
+
+std::unique_ptr<RenderPipelineStage> GetWriteToOutputStage(
+    const ImageOutput& main_output, size_t width, size_t height, bool has_alpha,
+    bool unpremul_alpha, size_t alpha_c, Orientation undo_orientation,
+    std::vector<ImageOutput>& extra_output) {
+  return HWY_DYNAMIC_DISPATCH(GetWriteToOutputStage)(
+      main_output, width, height, has_alpha, unpremul_alpha, alpha_c,
+      undo_orientation, extra_output);
+}
+
+}  // namespace jxl
+
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_write.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_write.h
new file mode 100644
index 0000000000..c5f844ebe8
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_write.h
@@ -0,0 +1,31 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_WRITE_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_WRITE_H_
+
+#include <functional>
+
+#include "lib/jxl/dec_cache.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+std::unique_ptr<RenderPipelineStage> GetWriteToImageBundleStage(
+    ImageBundle* image_bundle, ColorEncoding color_encoding);
+
+// Gets a stage to write color channels to an Image3F.
+std::unique_ptr<RenderPipelineStage> GetWriteToImage3FStage(Image3F* image);
+
+// Gets a stage to write to a pixel callback or image buffer.
+std::unique_ptr<RenderPipelineStage> GetWriteToOutputStage(
+    const ImageOutput& main_output, size_t width, size_t height, bool has_alpha,
+    bool unpremul_alpha, size_t alpha_c, Orientation undo_orientation,
+    std::vector<ImageOutput>& extra_output);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_STAGE_WRITE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_xyb.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_xyb.cc
new file mode 100644
index 0000000000..15cfc75b18
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_xyb.cc
@@ -0,0 +1,176 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_xyb.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_xyb.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/dec_xyb-inl.h"
+#include "lib/jxl/opsin_params.h"
+#include "lib/jxl/sanitizers.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+class XYBStage : public RenderPipelineStage {
+ public:
+  explicit XYBStage(const OutputEncodingInfo& output_encoding_info)
+      : RenderPipelineStage(RenderPipelineStage::Settings()),
+        opsin_params_(output_encoding_info.opsin_params),
+        output_is_xyb_(output_encoding_info.color_encoding.GetColorSpace() ==
+                       ColorSpace::kXYB) {}
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    PROFILER_ZONE("UndoXYB");
+
+    const HWY_FULL(float) d;
+    JXL_ASSERT(xextra == 0);
+    const size_t xsize_v = RoundUpTo(xsize, Lanes(d));
+    float* JXL_RESTRICT row0 = GetInputRow(input_rows, 0, 0);
+    float* JXL_RESTRICT row1 = GetInputRow(input_rows, 1, 0);
+    float* JXL_RESTRICT row2 = GetInputRow(input_rows, 2, 0);
+    // All calculations are lane-wise, still some might require
+    // value-dependent behaviour (e.g. NearestInt). Temporarily unpoison the
+    // last vector tail.
+    msan::UnpoisonMemory(row0 + xsize, sizeof(float) * (xsize_v - xsize));
+    msan::UnpoisonMemory(row1 + xsize, sizeof(float) * (xsize_v - xsize));
+    msan::UnpoisonMemory(row2 + xsize, sizeof(float) * (xsize_v - xsize));
+    // TODO(eustas): when using frame origin, addresses might be unaligned;
+    //               making them aligned will avoid a performance penalty.
+    if (output_is_xyb_) {  // output stays XYB: apply offset and scale only
+      const auto scale_x = Set(d, kScaledXYBScale[0]);
+      const auto scale_y = Set(d, kScaledXYBScale[1]);
+      const auto scale_bmy = Set(d, kScaledXYBScale[2]);
+      const auto offset_x = Set(d, kScaledXYBOffset[0]);
+      const auto offset_y = Set(d, kScaledXYBOffset[1]);
+      const auto offset_bmy = Set(d, kScaledXYBOffset[2]);
+      for (ssize_t x = -xextra; x < (ssize_t)(xsize + xextra); x += Lanes(d)) {
+        const auto in_x = LoadU(d, row0 + x);
+        const auto in_y = LoadU(d, row1 + x);
+        const auto in_b = LoadU(d, row2 + x);
+        auto out_x = Mul(Add(in_x, offset_x), scale_x);
+        auto out_y = Mul(Add(in_y, offset_y), scale_y);
+        auto out_b = Mul(Add(Sub(in_b, in_y), offset_bmy), scale_bmy);  // B-Y
+        StoreU(out_x, d, row0 + x);
+        StoreU(out_y, d, row1 + x);
+        StoreU(out_b, d, row2 + x);
+      }
+    } else {  // convert XYB to RGB with opsin_params_
+      for (ssize_t x = -xextra; x < (ssize_t)(xsize + xextra); x += Lanes(d)) {
+        const auto in_opsin_x = LoadU(d, row0 + x);
+        const auto in_opsin_y = LoadU(d, row1 + x);
+        const auto in_opsin_b = LoadU(d, row2 + x);
+        auto r = Undefined(d);
+        auto g = Undefined(d);
+        auto b = Undefined(d);
+        XybToRgb(d, in_opsin_x, in_opsin_y, in_opsin_b, opsin_params_, &r, &g,
+                 &b);
+        StoreU(r, d, row0 + x);
+        StoreU(g, d, row1 + x);
+        StoreU(b, d, row2 + x);
+      }
+    }
+    msan::PoisonMemory(row0 + xsize, sizeof(float) * (xsize_v - xsize));
+    msan::PoisonMemory(row1 + xsize, sizeof(float) * (xsize_v - xsize));
+    msan::PoisonMemory(row2 + xsize, sizeof(float) * (xsize_v - xsize));
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return c < 3 ? RenderPipelineChannelMode::kInPlace  // rows are overwritten
+                 : RenderPipelineChannelMode::kIgnored;
+  }
+
+  const char* GetName() const override { return "XYB"; }
+
+ private:
+  const OpsinParams opsin_params_;
+  const bool output_is_xyb_;  // true when the output color space is kXYB
+};
+
+std::unique_ptr<RenderPipelineStage> GetXYBStage(
+    const OutputEncodingInfo& output_encoding_info) {
+  return jxl::make_unique<XYBStage>(output_encoding_info);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(GetXYBStage);
+
+std::unique_ptr<RenderPipelineStage> GetXYBStage(
+    const OutputEncodingInfo& output_encoding_info) {
+  return HWY_DYNAMIC_DISPATCH(GetXYBStage)(output_encoding_info);
+}
+
+namespace {
+// Converts XYB rows to 8-bit sRGB (optionally RGBA) directly into `rgb`.
+class FastXYBStage : public RenderPipelineStage {
+ public:
+  FastXYBStage(uint8_t* rgb, size_t stride, size_t width, size_t height,
+               bool rgba, bool has_alpha, size_t alpha_c)
+      : RenderPipelineStage(RenderPipelineStage::Settings()),
+        rgb_(rgb),
+        stride_(stride),
+        width_(width),
+        height_(height),
+        rgba_(rgba),
+        has_alpha_(has_alpha),
+        alpha_c_(alpha_c) {}
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    if (ypos >= height_ || xpos >= width_) return;  // also avoids size_t wrap
+    JXL_ASSERT(xextra == 0);
+    const float* xyba[4] = {
+        GetInputRow(input_rows, 0, 0), GetInputRow(input_rows, 1, 0),
+        GetInputRow(input_rows, 2, 0),
+        has_alpha_ ? GetInputRow(input_rows, alpha_c_, 0) : nullptr};
+    uint8_t* out_buf = rgb_ + stride_ * ypos + (rgba_ ? 4 : 3) * xpos;
+    FastXYBTosRGB8(xyba, out_buf, rgba_,
+                   xsize + xpos <= width_ ? xsize : width_ - xpos);
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return c < 3 || (has_alpha_ && c == alpha_c_)
+               ? RenderPipelineChannelMode::kInput
+               : RenderPipelineChannelMode::kIgnored;
+  }
+
+  const char* GetName() const override { return "FastXYB"; }
+
+ private:
+  uint8_t* rgb_;
+  size_t stride_;  // byte distance between output rows
+  size_t width_;
+  size_t height_;
+  bool rgba_;  // 4 output bytes per pixel instead of 3
+  bool has_alpha_;
+  size_t alpha_c_;
+};
+
+}  // namespace
+
+std::unique_ptr<RenderPipelineStage> GetFastXYBTosRGB8Stage(
+    uint8_t* rgb, size_t stride, size_t width, size_t height, bool rgba,
+    bool has_alpha, size_t alpha_c) {
+  JXL_ASSERT(HasFastXYBTosRGB8());  // asserts the fast path is available
+  return make_unique<FastXYBStage>(rgb, stride, width, height, rgba, has_alpha,
+                                   alpha_c);
+}
+
+}  // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_xyb.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_xyb.h
new file mode 100644
index 0000000000..7b06345c36
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_xyb.h
@@ -0,0 +1,26 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_XYB_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_XYB_H_
+#include <stdint.h>
+
+#include "lib/jxl/dec_xyb.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+// Converts the color channels from XYB to linear with appropriate primaries.
+std::unique_ptr<RenderPipelineStage> GetXYBStage(
+    const OutputEncodingInfo& output_encoding_info);
+
+// Gets a stage to convert with fixed point arithmetic from XYB to sRGB8 and
+// write to a uint8 buffer.
+std::unique_ptr<RenderPipelineStage> GetFastXYBTosRGB8Stage(
+    uint8_t* rgb, size_t stride, size_t width, size_t height, bool rgba,
+    bool has_alpha, size_t alpha_c);
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_STAGE_XYB_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_ycbcr.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_ycbcr.cc
new file mode 100644
index 0000000000..5cba4a7d41
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_ycbcr.cc
@@ -0,0 +1,85 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_ycbcr.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_ycbcr.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::MulAdd;
+
+// Converts the first three channels from YCbCr to RGB in place.
+class kYCbCrStage : public RenderPipelineStage {
+ public:
+  kYCbCrStage() : RenderPipelineStage(RenderPipelineStage::Settings()) {}
+
+  // On entry channel 0 is Cb, channel 1 is Y and channel 2 is Cr; the same
+  // rows are overwritten with R, G and B respectively.
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    PROFILER_ZONE("UndoYCbCr");
+    const HWY_FULL(float) d;
+    float* JXL_RESTRICT chan0 = GetInputRow(input_rows, 0, 0);  // Cb in, R out
+    float* JXL_RESTRICT chan1 = GetInputRow(input_rows, 1, 0);  // Y in, G out
+    float* JXL_RESTRICT chan2 = GetInputRow(input_rows, 2, 0);  // Cr in, B out
+
+    // Full-range BT.601 as defined by JFIF Clause 7:
+    // https://www.itu.int/rec/T-REC-T.871-201105-I/en
+    const auto luma_offset = Set(d, 128.0f / 255);
+    const auto r_from_cr = Set(d, 1.402f);
+    const auto g_from_cb = Set(d, -0.114f * 1.772f / 0.587f);
+    const auto g_from_cr = Set(d, -0.299f * 1.402f / 0.587f);
+    const auto b_from_cb = Set(d, 1.772f);
+
+    // TODO(eustas): when using frame origin, addresses might be unaligned;
+    //               making them aligned will avoid performance penalty.
+    for (size_t x = 0; x < xsize; x += Lanes(d)) {
+      const auto cb = LoadU(d, chan0 + x);
+      const auto luma = Add(LoadU(d, chan1 + x), luma_offset);
+      const auto cr = LoadU(d, chan2 + x);
+      StoreU(MulAdd(r_from_cr, cr, luma), d, chan0 + x);
+      StoreU(MulAdd(g_from_cr, cr, MulAdd(g_from_cb, cb, luma)), d, chan1 + x);
+      StoreU(MulAdd(b_from_cb, cb, luma), d, chan2 + x);
+    }
+  }
+
+  // Channels 0-2 are processed in place; all others are ignored.
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    if (c >= 3) return RenderPipelineChannelMode::kIgnored;
+    return RenderPipelineChannelMode::kInPlace;
+  }
+
+  const char* GetName() const override { return "YCbCr"; }
+};
+
+std::unique_ptr<RenderPipelineStage> GetYCbCrStage() {
+  return jxl::make_unique<kYCbCrStage>();  // stage from this HWY_NAMESPACE
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(GetYCbCrStage);
+
+std::unique_ptr<RenderPipelineStage> GetYCbCrStage() {
+  return HWY_DYNAMIC_DISPATCH(GetYCbCrStage)();  // per-target dispatch
+}
+
+}  // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_ycbcr.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_ycbcr.h
new file mode 100644
index 0000000000..9320c9723f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_ycbcr.h
@@ -0,0 +1,25 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_YCBCR_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_YCBCR_H_
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/dec_xyb.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+// Converts the color channels from YCbCr to RGB.
+std::unique_ptr<RenderPipelineStage> GetYCbCrStage();
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_STAGE_YCBCR_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/test_render_pipeline_stages.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/test_render_pipeline_stages.h
new file mode 100644
index 0000000000..789a52f8b2
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/test_render_pipeline_stages.h
@@ -0,0 +1,101 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+// Test-only stage: 2x horizontal upsampling with 0.25/0.75 blending weights.
+class UpsampleXSlowStage : public RenderPipelineStage {
+ public:
+  UpsampleXSlowStage()
+      : RenderPipelineStage(RenderPipelineStage::Settings::ShiftX(1, 1)) {}
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    for (size_t c = 0; c < input_rows.size(); c++) {
+      const float* row = GetInputRow(input_rows, c, 0);
+      float* row_out = GetOutputRow(output_rows, c, 0);
+      // Cast before negating: `-xextra` wraps as unsigned, and where size_t is
+      // 32 bits the wrapped value stays positive once widened to int64_t.
+      for (int64_t x = -static_cast<int64_t>(xextra);
+           x < static_cast<int64_t>(xsize + xextra); x++) {
+        const float xc = row[x];  // input x feeds outputs 2x and 2x + 1
+        row_out[2 * x + 0] = row[x - 1] * 0.25f + xc * 0.75f;
+        row_out[2 * x + 1] = xc * 0.75f + row[x + 1] * 0.25f;
+      }
+    }
+  }
+
+  const char* GetName() const override { return "TEST::UpsampleXSlowStage"; }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return RenderPipelineChannelMode::kInOut;
+  }
+};
+
+// Test-only stage: 2x vertical upsampling with 0.25/0.75 blending weights.
+class UpsampleYSlowStage : public RenderPipelineStage {
+ public:
+  UpsampleYSlowStage()
+      : RenderPipelineStage(RenderPipelineStage::Settings::ShiftY(1, 1)) {}
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    for (size_t c = 0; c < input_rows.size(); c++) {
+      const float* rowp = GetInputRow(input_rows, c, -1);
+      const float* rowc = GetInputRow(input_rows, c, 0);
+      const float* rown = GetInputRow(input_rows, c, 1);
+      float* row_out0 = GetOutputRow(output_rows, c, 0);
+      float* row_out1 = GetOutputRow(output_rows, c, 1);
+      // Cast before negating: `-xextra` wraps as unsigned, and where size_t is
+      // 32 bits the wrapped value stays positive once widened to int64_t.
+      for (int64_t x = -static_cast<int64_t>(xextra);
+           x < static_cast<int64_t>(xsize + xextra); x++) {
+        const float xc = rowc[x];  // current row feeds both output rows
+        row_out0[x] = rowp[x] * 0.25f + xc * 0.75f;
+        row_out1[x] = xc * 0.75f + rown[x] * 0.25f;
+      }
+    }
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return RenderPipelineChannelMode::kInOut;
+  }
+
+  const char* GetName() const override { return "TEST::UpsampleYSlowStage"; }
+};
+
+class Check0FinalStage : public RenderPipelineStage {  // test-only checker
+ public:
+  Check0FinalStage() : RenderPipelineStage(RenderPipelineStage::Settings()) {}
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    for (size_t c = 0; c < input_rows.size(); c++) {
+      for (size_t x = 0; x < xsize; x++) {  // each sample must be ~0
+        JXL_CHECK(fabsf(GetInputRow(input_rows, c, 0)[x]) < 1e-8);
+      }
+    }
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return RenderPipelineChannelMode::kInput;  // every channel is an input
+  }
+  const char* GetName() const override { return "TEST::Check0FinalStage"; }
+};
+
+}  // namespace jxl
-- 
cgit v1.2.3