Diffstat (limited to 'third_party/jpeg-xl/lib/jxl/render_pipeline/low_memory_render_pipeline.cc')
-rw-r--r--  third_party/jpeg-xl/lib/jxl/render_pipeline/low_memory_render_pipeline.cc  865
1 file changed, 865 insertions, 0 deletions
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/low_memory_render_pipeline.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/low_memory_render_pipeline.cc
new file mode 100644
index 0000000000..db60a458db
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/low_memory_render_pipeline.cc
@@ -0,0 +1,865 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/low_memory_render_pipeline.h"
+
+#include <algorithm>
+#include <queue>
+#include <tuple>
+
+#include "lib/jxl/base/arch_macros.h"
+#include "lib/jxl/image_ops.h"
+
+namespace jxl {
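+
+// Converts a size measured in color-channel coordinates into the coordinate
+// system of channel `c` at stage `stage`: scale up by base_color_shift_, then
+// divide by the channel's downsampling factor, rounding up. For example (with
+// hypothetical values), base_color_shift_ = 1 and a channel shift of (2, 2)
+// map a 16x16 input to ((16 << 1) + 3) >> 2 = 8 pixels per dimension.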
+std::pair<size_t, size_t>
+LowMemoryRenderPipeline::ColorDimensionsToChannelDimensions(
+ std::pair<size_t, size_t> in, size_t c, size_t stage) const {
+ std::pair<size_t, size_t> ret;
+ std::pair<size_t, size_t> shift = channel_shifts_[stage][c];
+ ret.first =
+ ((in.first << base_color_shift_) + (1 << shift.first) - 1) >> shift.first;
+ ret.second = ((in.second << base_color_shift_) + (1 << shift.second) - 1) >>
+ shift.second;
+ return ret;
+}
+
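+// Amount of border, in channel coordinates, that must be saved for channel
+// `c`: the group border converted to this channel's resolution, plus the
+// padding the first stage requires for that channel.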
+std::pair<size_t, size_t> LowMemoryRenderPipeline::BorderToStore(
+ size_t c) const {
+ auto ret = ColorDimensionsToChannelDimensions(group_border_, c, 0);
+ ret.first += padding_[0][c].first;
+ ret.second += padding_[0][c].second;
+ return ret;
+}
+
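+// Copies the edge strips of group `group_id` from `in` into the shared border
+// storage. For each channel, borders_horizontal_[c] holds the top strip of
+// group row gy (for gy > 0) at y offset (gy * 2 - 1) * bordery and its bottom
+// strip at (gy * 2) * bordery; borders_vertical_[c] stores left/right strips
+// symmetrically along x.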
+void LowMemoryRenderPipeline::SaveBorders(size_t group_id, size_t c,
+ const ImageF& in) {
+ size_t gy = group_id / frame_dimensions_.xsize_groups;
+ size_t gx = group_id % frame_dimensions_.xsize_groups;
+ size_t hshift = channel_shifts_[0][c].first;
+ size_t vshift = channel_shifts_[0][c].second;
+ size_t x0 = gx * GroupInputXSize(c);
+ size_t x1 = std::min((gx + 1) * GroupInputXSize(c),
+ DivCeil(frame_dimensions_.xsize_upsampled, 1 << hshift));
+ size_t y0 = gy * GroupInputYSize(c);
+ size_t y1 = std::min((gy + 1) * GroupInputYSize(c),
+ DivCeil(frame_dimensions_.ysize_upsampled, 1 << vshift));
+
+ auto borders = BorderToStore(c);
+ size_t borderx_write = borders.first;
+ size_t bordery_write = borders.second;
+
+ if (gy > 0) {
+ Rect from(group_data_x_border_, group_data_y_border_, x1 - x0,
+ bordery_write);
+ Rect to(x0, (gy * 2 - 1) * bordery_write, x1 - x0, bordery_write);
+ CopyImageTo(from, in, to, &borders_horizontal_[c]);
+ }
+ if (gy + 1 < frame_dimensions_.ysize_groups) {
+ Rect from(group_data_x_border_,
+ group_data_y_border_ + y1 - y0 - bordery_write, x1 - x0,
+ bordery_write);
+ Rect to(x0, (gy * 2) * bordery_write, x1 - x0, bordery_write);
+ CopyImageTo(from, in, to, &borders_horizontal_[c]);
+ }
+ if (gx > 0) {
+ Rect from(group_data_x_border_, group_data_y_border_, borderx_write,
+ y1 - y0);
+ Rect to((gx * 2 - 1) * borderx_write, y0, borderx_write, y1 - y0);
+ CopyImageTo(from, in, to, &borders_vertical_[c]);
+ }
+ if (gx + 1 < frame_dimensions_.xsize_groups) {
+ Rect from(group_data_x_border_ + x1 - x0 - borderx_write,
+ group_data_y_border_, borderx_write, y1 - y0);
+ Rect to((gx * 2) * borderx_write, y0, borderx_write, y1 - y0);
+ CopyImageTo(from, in, to, &borders_vertical_[c]);
+ }
+}
+
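+// Copies the strips that neighboring groups saved via SaveBorders into the
+// border area of `out`, so that rendering the rect `r` can read valid pixels
+// up to the first-stage padding outside the group's own data.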
+void LowMemoryRenderPipeline::LoadBorders(size_t group_id, size_t c,
+ const Rect& r, ImageF* out) {
+ size_t gy = group_id / frame_dimensions_.xsize_groups;
+ size_t gx = group_id % frame_dimensions_.xsize_groups;
+ size_t hshift = channel_shifts_[0][c].first;
+ size_t vshift = channel_shifts_[0][c].second;
+ // Coordinates of the group in the image.
+ size_t x0 = gx * GroupInputXSize(c);
+ size_t x1 = std::min((gx + 1) * GroupInputXSize(c),
+ DivCeil(frame_dimensions_.xsize_upsampled, 1 << hshift));
+ size_t y0 = gy * GroupInputYSize(c);
+ size_t y1 = std::min((gy + 1) * GroupInputYSize(c),
+ DivCeil(frame_dimensions_.ysize_upsampled, 1 << vshift));
+
+ size_t paddingx = padding_[0][c].first;
+ size_t paddingy = padding_[0][c].second;
+
+ auto borders = BorderToStore(c);
+ size_t borderx_write = borders.first;
+ size_t bordery_write = borders.second;
+
+ // Limits of the area to copy from, in image coordinates.
+ JXL_DASSERT(r.x0() == 0 || (r.x0() << base_color_shift_) >= paddingx);
+ size_t x0src = DivCeil(r.x0() << base_color_shift_, 1 << hshift);
+ if (x0src != 0) {
+ x0src -= paddingx;
+ }
+ // r may be such that r.x1 (namely x0() + xsize()) is within paddingx of the
+ // right side of the image, so we use min() here.
+ size_t x1src =
+ DivCeil((r.x0() + r.xsize()) << base_color_shift_, 1 << hshift);
+ x1src = std::min(x1src + paddingx,
+ DivCeil(frame_dimensions_.xsize_upsampled, 1 << hshift));
+
+ // Similar computation for y.
+ JXL_DASSERT(r.y0() == 0 || (r.y0() << base_color_shift_) >= paddingy);
+ size_t y0src = DivCeil(r.y0() << base_color_shift_, 1 << vshift);
+ if (y0src != 0) {
+ y0src -= paddingy;
+ }
+ size_t y1src =
+ DivCeil((r.y0() + r.ysize()) << base_color_shift_, 1 << vshift);
+ y1src = std::min(y1src + paddingy,
+ DivCeil(frame_dimensions_.ysize_upsampled, 1 << vshift));
+
+ // Copy other groups' borders from the border storage.
+ if (y0src < y0) {
+ JXL_DASSERT(gy > 0);
+ CopyImageTo(
+ Rect(x0src, (gy * 2 - 2) * bordery_write, x1src - x0src, bordery_write),
+ borders_horizontal_[c],
+ Rect(group_data_x_border_ + x0src - x0,
+ group_data_y_border_ - bordery_write, x1src - x0src,
+ bordery_write),
+ out);
+ }
+ if (y1src > y1) {
+    // When copying the bottom border we must not be in the bottom row of
+    // groups.
+ JXL_DASSERT(gy + 1 < frame_dimensions_.ysize_groups);
+ CopyImageTo(
+ Rect(x0src, (gy * 2 + 1) * bordery_write, x1src - x0src, bordery_write),
+ borders_horizontal_[c],
+ Rect(group_data_x_border_ + x0src - x0, group_data_y_border_ + y1 - y0,
+ x1src - x0src, bordery_write),
+ out);
+ }
+ if (x0src < x0) {
+ JXL_DASSERT(gx > 0);
+ CopyImageTo(
+ Rect((gx * 2 - 2) * borderx_write, y0src, borderx_write, y1src - y0src),
+ borders_vertical_[c],
+ Rect(group_data_x_border_ - borderx_write,
+ group_data_y_border_ + y0src - y0, borderx_write, y1src - y0src),
+ out);
+ }
+ if (x1src > x1) {
+    // When copying the right border we must not be in the rightmost column of
+    // groups.
+ JXL_DASSERT(gx + 1 < frame_dimensions_.xsize_groups);
+ CopyImageTo(
+ Rect((gx * 2 + 1) * borderx_write, y0src, borderx_write, y1src - y0src),
+ borders_vertical_[c],
+ Rect(group_data_x_border_ + x1 - x0, group_data_y_border_ + y0src - y0,
+ borderx_write, y1src - y0src),
+ out);
+ }
+}
+
+size_t LowMemoryRenderPipeline::GroupInputXSize(size_t c) const {
+ return (frame_dimensions_.group_dim << base_color_shift_) >>
+ channel_shifts_[0][c].first;
+}
+
+size_t LowMemoryRenderPipeline::GroupInputYSize(size_t c) const {
+ return (frame_dimensions_.group_dim << base_color_shift_) >>
+ channel_shifts_[0][c].second;
+}
+
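+// (Re)allocates the shared images that store saved group borders, sized to
+// cover every horizontal and vertical inter-group boundary at each channel's
+// resolution.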
+void LowMemoryRenderPipeline::EnsureBordersStorage() {
+ const auto& shifts = channel_shifts_[0];
+ if (borders_horizontal_.size() < shifts.size()) {
+ borders_horizontal_.resize(shifts.size());
+ borders_vertical_.resize(shifts.size());
+ }
+ for (size_t c = 0; c < shifts.size(); c++) {
+ auto borders = BorderToStore(c);
+ size_t borderx = borders.first;
+ size_t bordery = borders.second;
+ JXL_DASSERT(frame_dimensions_.xsize_groups > 0);
+ size_t num_xborders = (frame_dimensions_.xsize_groups - 1) * 2;
+ JXL_DASSERT(frame_dimensions_.ysize_groups > 0);
+ size_t num_yborders = (frame_dimensions_.ysize_groups - 1) * 2;
+ size_t downsampled_xsize =
+ DivCeil(frame_dimensions_.xsize_upsampled_padded, 1 << shifts[c].first);
+ size_t downsampled_ysize = DivCeil(frame_dimensions_.ysize_upsampled_padded,
+ 1 << shifts[c].second);
+ Rect horizontal = Rect(0, 0, downsampled_xsize, bordery * num_yborders);
+ if (!SameSize(horizontal, borders_horizontal_[c])) {
+ borders_horizontal_[c] = ImageF(horizontal.xsize(), horizontal.ysize());
+ }
+ Rect vertical = Rect(0, 0, borderx * num_xborders, downsampled_ysize);
+ if (!SameSize(vertical, borders_vertical_[c])) {
+ borders_vertical_[c] = ImageF(vertical.xsize(), vertical.ysize());
+ }
+ }
+}
+
+void LowMemoryRenderPipeline::Init() {
+ group_border_ = {0, 0};
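+  // base_color_shift_ is the log2 of the frame's total upsampling factor,
+  // i.e. the scaling between pipeline-input color coordinates and final image
+  // coordinates.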
+ base_color_shift_ = CeilLog2Nonzero(frame_dimensions_.xsize_upsampled_padded /
+ frame_dimensions_.xsize_padded);
+
+ const auto& shifts = channel_shifts_[0];
+
+  // Ensure that each channel has enough border pixels.
+ for (size_t c = 0; c < shifts.size(); c++) {
+ group_border_.first =
+ std::max(group_border_.first,
+ DivCeil(padding_[0][c].first << channel_shifts_[0][c].first,
+ 1 << base_color_shift_));
+ group_border_.second =
+ std::max(group_border_.second,
+ DivCeil(padding_[0][c].second << channel_shifts_[0][c].second,
+ 1 << base_color_shift_));
+ }
+
+ // Ensure that all channels have an integer number of border pixels in the
+ // input.
+ for (size_t c = 0; c < shifts.size(); c++) {
+ if (channel_shifts_[0][c].first >= base_color_shift_) {
+ group_border_.first =
+ RoundUpTo(group_border_.first,
+ 1 << (channel_shifts_[0][c].first - base_color_shift_));
+ }
+ if (channel_shifts_[0][c].second >= base_color_shift_) {
+ group_border_.second =
+ RoundUpTo(group_border_.second,
+ 1 << (channel_shifts_[0][c].second - base_color_shift_));
+ }
+ }
+ // Ensure that the X border on color channels is a multiple of kBlockDim or
+ // the vector size (required for EPF stages). Vectors on ARM NEON are never
+ // wider than 4 floats, so rounding to multiples of 4 is enough.
+#if JXL_ARCH_ARM
+ constexpr size_t kGroupXAlign = 4;
+#else
+ constexpr size_t kGroupXAlign = 16;
+#endif
+ group_border_.first = RoundUpTo(group_border_.first, kGroupXAlign);
+ // Allocate borders in group images that are just enough for storing the
+ // borders to be copied in, plus any rounding to ensure alignment.
+ std::pair<size_t, size_t> max_border = {0, 0};
+ for (size_t c = 0; c < shifts.size(); c++) {
+ max_border.first = std::max(BorderToStore(c).first, max_border.first);
+ max_border.second = std::max(BorderToStore(c).second, max_border.second);
+ }
+ group_data_x_border_ = RoundUpTo(max_border.first, kGroupXAlign);
+ group_data_y_border_ = max_border.second;
+
+ EnsureBordersStorage();
+ group_border_assigner_.Init(frame_dimensions_);
+
+ for (first_trailing_stage_ = stages_.size(); first_trailing_stage_ > 0;
+ first_trailing_stage_--) {
+ bool has_inout_c = false;
+ for (size_t c = 0; c < shifts.size(); c++) {
+ if (stages_[first_trailing_stage_ - 1]->GetChannelMode(c) ==
+ RenderPipelineChannelMode::kInOut) {
+ has_inout_c = true;
+ }
+ }
+ if (has_inout_c) {
+ break;
+ }
+ }
+
+ first_image_dim_stage_ = stages_.size();
+ for (size_t i = 0; i < stages_.size(); i++) {
+ std::vector<std::pair<size_t, size_t>> input_sizes(shifts.size());
+ for (size_t c = 0; c < shifts.size(); c++) {
+ input_sizes[c] =
+ std::make_pair(DivCeil(frame_dimensions_.xsize_upsampled,
+ 1 << channel_shifts_[i][c].first),
+ DivCeil(frame_dimensions_.ysize_upsampled,
+ 1 << channel_shifts_[i][c].second));
+ }
+ stages_[i]->SetInputSizes(input_sizes);
+ if (stages_[i]->SwitchToImageDimensions()) {
+ // We don't allow kInOut after switching to image dimensions.
+ JXL_ASSERT(i >= first_trailing_stage_);
+ first_image_dim_stage_ = i + 1;
+ stages_[i]->GetImageDimensions(&full_image_xsize_, &full_image_ysize_,
+ &frame_origin_);
+ break;
+ }
+ }
+ for (size_t i = first_image_dim_stage_; i < stages_.size(); i++) {
+ if (stages_[i]->SwitchToImageDimensions()) {
+ JXL_ABORT("Cannot switch to image dimensions multiple times");
+ }
+ std::vector<std::pair<size_t, size_t>> input_sizes(shifts.size());
+ for (size_t c = 0; c < shifts.size(); c++) {
+ input_sizes[c] = {full_image_xsize_, full_image_ysize_};
+ }
+ stages_[i]->SetInputSizes(input_sizes);
+ }
+
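+  // anyc_[i] records some channel that stage i does not ignore; it is used
+  // wherever a representative coordinate system for stage i is needed.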
+ anyc_.resize(stages_.size());
+ for (size_t i = 0; i < stages_.size(); i++) {
+ for (size_t c = 0; c < shifts.size(); c++) {
+ if (stages_[i]->GetChannelMode(c) !=
+ RenderPipelineChannelMode::kIgnored) {
+ anyc_[i] = c;
+ }
+ }
+ }
+
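+  // stage_input_for_channel_[i][c] is the index of the last kInOut stage for
+  // channel c before stage i, or -1 if there is none; that stage's output
+  // buffer is what stage i reads for channel c.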
+ stage_input_for_channel_ = std::vector<std::vector<int32_t>>(
+ stages_.size(), std::vector<int32_t>(shifts.size()));
+ for (size_t c = 0; c < shifts.size(); c++) {
+ int input = -1;
+ for (size_t i = 0; i < stages_.size(); i++) {
+ stage_input_for_channel_[i][c] = input;
+ if (stages_[i]->GetChannelMode(c) == RenderPipelineChannelMode::kInOut) {
+ input = i;
+ }
+ }
+ }
+
+ image_rect_.resize(stages_.size());
+ for (size_t i = 0; i < stages_.size(); i++) {
+ size_t x1 = DivCeil(frame_dimensions_.xsize_upsampled,
+ 1 << channel_shifts_[i][anyc_[i]].first);
+ size_t y1 = DivCeil(frame_dimensions_.ysize_upsampled,
+ 1 << channel_shifts_[i][anyc_[i]].second);
+ image_rect_[i] = Rect(0, 0, x1, y1);
+ }
+
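+  // Compute, for each stage, how much x/y context its output must provide so
+  // that all later stages can read the borders they need; the y amount is
+  // measured in "virtual" full-resolution rows (see RenderRect).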
+ virtual_ypadding_for_output_.resize(stages_.size());
+ xpadding_for_output_.resize(stages_.size());
+ for (size_t c = 0; c < shifts.size(); c++) {
+ int ypad = 0;
+ int xpad = 0;
+ for (size_t i = stages_.size(); i-- > 0;) {
+ if (stages_[i]->GetChannelMode(c) !=
+ RenderPipelineChannelMode::kIgnored) {
+ virtual_ypadding_for_output_[i] =
+ std::max(ypad, virtual_ypadding_for_output_[i]);
+ xpadding_for_output_[i] = std::max(xpad, xpadding_for_output_[i]);
+ }
+ if (stages_[i]->GetChannelMode(c) == RenderPipelineChannelMode::kInOut) {
+ ypad = (DivCeil(ypad, 1 << channel_shifts_[i][c].second) +
+ stages_[i]->settings_.border_y)
+ << channel_shifts_[i][c].second;
+ xpad = DivCeil(xpad, 1 << stages_[i]->settings_.shift_x) +
+ stages_[i]->settings_.border_x;
+ }
+ }
+ }
+}
+
+void LowMemoryRenderPipeline::PrepareForThreadsInternal(size_t num,
+ bool use_group_ids) {
+ const auto& shifts = channel_shifts_[0];
+
+ use_group_ids_ = use_group_ids;
+ size_t num_buffers = use_group_ids_ ? frame_dimensions_.num_groups : num;
+ for (size_t t = group_data_.size(); t < num_buffers; t++) {
+ group_data_.emplace_back();
+ group_data_[t].resize(shifts.size());
+ for (size_t c = 0; c < shifts.size(); c++) {
+ group_data_[t][c] = ImageF(GroupInputXSize(c) + group_data_x_border_ * 2,
+ GroupInputYSize(c) + group_data_y_border_ * 2);
+ }
+ }
+ // TODO(veluca): avoid reallocating buffers if not needed.
+ stage_data_.resize(num);
+ size_t upsampling = 1u << base_color_shift_;
+ size_t group_dim = frame_dimensions_.group_dim * upsampling;
+ size_t padding =
+ 2 * group_data_x_border_ * upsampling + // maximum size of a rect
+ 2 * kRenderPipelineXOffset; // extra padding for processing
+ size_t stage_buffer_xsize = group_dim + padding;
+ for (size_t t = 0; t < num; t++) {
+ stage_data_[t].resize(shifts.size());
+ for (size_t c = 0; c < shifts.size(); c++) {
+ stage_data_[t][c].resize(stages_.size());
+ size_t next_y_border = 0;
+ for (size_t i = stages_.size(); i-- > 0;) {
+ if (stages_[i]->GetChannelMode(c) ==
+ RenderPipelineChannelMode::kInOut) {
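+          // The output buffer of a kInOut stage acts as a ring buffer: it
+          // must hold the rows produced per input row (1 << shift_y) plus the
+          // context rows the next stage still needs (2 * next_y_border),
+          // rounded up to a power of two so that wrapping is a simple mask.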
+ size_t stage_buffer_ysize =
+ 2 * next_y_border + (1 << stages_[i]->settings_.shift_y);
+ stage_buffer_ysize = 1 << CeilLog2Nonzero(stage_buffer_ysize);
+ next_y_border = stages_[i]->settings_.border_y;
+ stage_data_[t][c][i] = ImageF(stage_buffer_xsize, stage_buffer_ysize);
+ }
+ }
+ }
+ }
+ if (first_image_dim_stage_ != stages_.size()) {
+ RectT<ssize_t> image_rect(0, 0, frame_dimensions_.xsize_upsampled,
+ frame_dimensions_.ysize_upsampled);
+ RectT<ssize_t> full_image_rect(0, 0, full_image_xsize_, full_image_ysize_);
+ image_rect = image_rect.Translate(frame_origin_.x0, frame_origin_.y0);
+ image_rect = image_rect.Intersection(full_image_rect);
+ if (image_rect.xsize() == 0 || image_rect.ysize() == 0) {
+ image_rect = RectT<ssize_t>(0, 0, 0, 0);
+ }
+ size_t left_padding = image_rect.x0();
+ size_t middle_padding = group_dim;
+ size_t right_padding = full_image_xsize_ - image_rect.x1();
+ size_t out_of_frame_xsize =
+ padding +
+ std::max(left_padding, std::max(middle_padding, right_padding));
+ out_of_frame_data_.resize(num);
+ for (size_t t = 0; t < num; t++) {
+ out_of_frame_data_[t] = ImageF(out_of_frame_xsize, shifts.size());
+ }
+ }
+}
+
+std::vector<std::pair<ImageF*, Rect>> LowMemoryRenderPipeline::PrepareBuffers(
+ size_t group_id, size_t thread_id) {
+ std::vector<std::pair<ImageF*, Rect>> ret(channel_shifts_[0].size());
+ const size_t gx = group_id % frame_dimensions_.xsize_groups;
+ const size_t gy = group_id / frame_dimensions_.xsize_groups;
+ for (size_t c = 0; c < channel_shifts_[0].size(); c++) {
+ ret[c].first = &group_data_[use_group_ids_ ? group_id : thread_id][c];
+ ret[c].second = Rect(group_data_x_border_, group_data_y_border_,
+ GroupInputXSize(c), GroupInputYSize(c),
+ DivCeil(frame_dimensions_.xsize_upsampled,
+ 1 << channel_shifts_[0][c].first) -
+ gx * GroupInputXSize(c) + group_data_x_border_,
+ DivCeil(frame_dimensions_.ysize_upsampled,
+ 1 << channel_shifts_[0][c].second) -
+ gy * GroupInputYSize(c) + group_data_y_border_);
+ }
+ return ret;
+}
+
+namespace {
+
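+// Returns the group-relative y coordinate to read in place of `y`, mirroring
+// at the top and bottom of the image. When the group starts at y == 0 the
+// image may be smaller than the border, so the general multi-step Mirror() is
+// needed; otherwise a single mirroring step is known to suffice.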
+JXL_INLINE int GetMirroredY(int y, ssize_t group_y0, ssize_t image_ysize) {
+ if (group_y0 == 0 && (y < 0 || y + group_y0 >= image_ysize)) {
+ return Mirror(y, image_ysize);
+ }
+ if (y + group_y0 >= image_ysize) {
+ // Here we know that the one mirroring step is sufficient.
+ return 2 * image_ysize - (y + group_y0) - 1 - group_y0;
+ }
+ return y;
+}
+
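+// Fills the left/right borders of `row` (offset by kRenderPipelineXOffset) by
+// mirroring, for groups that touch the corresponding edge of the image.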
+JXL_INLINE void ApplyXMirroring(float* row, ssize_t borderx, ssize_t group_x0,
+ ssize_t group_xsize, ssize_t image_xsize) {
+ if (image_xsize <= borderx) {
+ if (group_x0 == 0) {
+ for (ssize_t ix = 0; ix < borderx; ix++) {
+ row[kRenderPipelineXOffset - ix - 1] =
+ row[kRenderPipelineXOffset + Mirror(-ix - 1, image_xsize)];
+ }
+ }
+ if (group_xsize + borderx + group_x0 >= image_xsize) {
+ for (ssize_t ix = 0; ix < borderx; ix++) {
+ row[kRenderPipelineXOffset + image_xsize + ix - group_x0] =
+ row[kRenderPipelineXOffset + Mirror(image_xsize + ix, image_xsize) -
+ group_x0];
+ }
+ }
+ } else {
+ // Here we know that the one mirroring step is sufficient.
+ if (group_x0 == 0) {
+ for (ssize_t ix = 0; ix < borderx; ix++) {
+ row[kRenderPipelineXOffset - ix - 1] = row[kRenderPipelineXOffset + ix];
+ }
+ }
+ if (group_xsize + borderx + group_x0 >= image_xsize) {
+ for (ssize_t ix = 0; ix < borderx; ix++) {
+ row[kRenderPipelineXOffset + image_xsize - group_x0 + ix] =
+ row[kRenderPipelineXOffset + image_xsize - group_x0 - ix - 1];
+ }
+ }
+ }
+}
+
+// Information about where the *output* of each stage is stored.
+class Rows {
+ public:
+ Rows(const std::vector<std::unique_ptr<RenderPipelineStage>>& stages,
+ const Rect data_max_color_channel_rect, int group_data_x_border,
+ int group_data_y_border,
+ const std::vector<std::pair<size_t, size_t>>& group_data_shift,
+ size_t base_color_shift, std::vector<std::vector<ImageF>>& thread_data,
+ std::vector<ImageF>& input_data) {
+ size_t num_stages = stages.size();
+ size_t num_channels = input_data.size();
+
+ JXL_ASSERT(thread_data.size() == num_channels);
+ JXL_ASSERT(group_data_shift.size() == num_channels);
+
+#if JXL_ENABLE_ASSERT
+ for (const auto& td : thread_data) {
+ JXL_ASSERT(td.size() == num_stages);
+ }
+#endif
+
+ rows_.resize(num_stages + 1, std::vector<RowInfo>(num_channels));
+
+ for (size_t i = 0; i < num_stages; i++) {
+ for (size_t c = 0; c < input_data.size(); c++) {
+ if (stages[i]->GetChannelMode(c) == RenderPipelineChannelMode::kInOut) {
+ rows_[i + 1][c].ymod_minus_1 = thread_data[c][i].ysize() - 1;
+ rows_[i + 1][c].base_ptr = thread_data[c][i].Row(0);
+ rows_[i + 1][c].stride = thread_data[c][i].PixelsPerRow();
+ }
+ }
+ }
+
+ for (size_t c = 0; c < input_data.size(); c++) {
+ auto channel_group_data_rect =
+ data_max_color_channel_rect.As<ssize_t>()
+ .Translate(-group_data_x_border, -group_data_y_border)
+ .ShiftLeft(base_color_shift)
+ .CeilShiftRight(group_data_shift[c])
+ .Translate(group_data_x_border - ssize_t(kRenderPipelineXOffset),
+ group_data_y_border);
+ rows_[0][c].base_ptr = channel_group_data_rect.Row(&input_data[c], 0);
+ rows_[0][c].stride = input_data[c].PixelsPerRow();
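+      // -1 has all bits set, so y & ymod_minus_1 == y: the input image is
+      // indexed directly rather than used as a ring buffer.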
+ rows_[0][c].ymod_minus_1 = -1;
+ }
+ }
+
+ // Stage -1 refers to the input data; all other values must be nonnegative and
+ // refer to the data for the output of that stage.
+ JXL_INLINE float* GetBuffer(int stage, int y, size_t c) const {
+ JXL_DASSERT(stage >= -1);
+ const RowInfo& info = rows_[stage + 1][c];
+ return info.base_ptr + ssize_t(info.stride) * (y & info.ymod_minus_1);
+ }
+
+ private:
+ struct RowInfo {
+ // Pointer to beginning of the first row.
+ float* base_ptr;
+ // Modulo value for the y axis minus 1 (ymod is guaranteed to be a power of
+ // 2, which allows efficient mod computation by masking).
+ int ymod_minus_1;
+ // Number of floats per row.
+ size_t stride;
+ };
+ std::vector<std::vector<RowInfo>> rows_;
+};
+
+} // namespace
+
+void LowMemoryRenderPipeline::RenderRect(size_t thread_id,
+ std::vector<ImageF>& input_data,
+ Rect data_max_color_channel_rect,
+ Rect image_max_color_channel_rect) {
+ // For each stage, the rect corresponding to the image area currently being
+ // processed, in the coordinates of that stage (i.e. with the scaling factor
+ // that that stage has).
+ std::vector<Rect> group_rect;
+ group_rect.resize(stages_.size());
+ Rect image_area_rect =
+ image_max_color_channel_rect.ShiftLeft(base_color_shift_)
+ .Crop(frame_dimensions_.xsize_upsampled,
+ frame_dimensions_.ysize_upsampled);
+ for (size_t i = 0; i < stages_.size(); i++) {
+ group_rect[i] =
+ image_area_rect.CeilShiftRight(channel_shifts_[i][anyc_[i]]);
+ }
+
+ ssize_t frame_x0 =
+ first_image_dim_stage_ == stages_.size() ? 0 : frame_origin_.x0;
+ ssize_t frame_y0 =
+ first_image_dim_stage_ == stages_.size() ? 0 : frame_origin_.y0;
+ size_t full_image_xsize = first_image_dim_stage_ == stages_.size()
+ ? frame_dimensions_.xsize_upsampled
+ : full_image_xsize_;
+ size_t full_image_ysize = first_image_dim_stage_ == stages_.size()
+ ? frame_dimensions_.ysize_upsampled
+ : full_image_ysize_;
+
+ // Compute actual x-axis bounds for the current image area in the context of
+ // the full image this frame is part of. As the left boundary may be negative,
+ // we also create the x_pixels_skip value, defined as follows:
+ // - both x_pixels_skip and full_image_x0 are >= 0, and at least one is 0;
+ // - full_image_x0 - x_pixels_skip is the position of the current frame area
+ // in the full image.
+ ssize_t full_image_x0 = frame_x0 + image_area_rect.x0();
+ ssize_t x_pixels_skip = 0;
+ if (full_image_x0 < 0) {
+ x_pixels_skip = -full_image_x0;
+ full_image_x0 = 0;
+ }
+ ssize_t full_image_x1 = frame_x0 + image_area_rect.x1();
+ full_image_x1 = std::min<ssize_t>(full_image_x1, full_image_xsize);
+
+ // If the current image area is entirely outside of the visible image, there
+ // is no point in proceeding. Note: this uses the assumption that if there is
+ // a stage with observable effects (i.e. a kInput stage), it only appears
+ // after the stage that switches to image dimensions.
+ if (full_image_x1 <= full_image_x0) return;
+
+ // Data structures to hold information about input/output rows and their
+ // buffers.
+ Rows rows(stages_, data_max_color_channel_rect, group_data_x_border_,
+ group_data_y_border_, channel_shifts_[0], base_color_shift_,
+ stage_data_[thread_id], input_data);
+
+ std::vector<RenderPipelineStage::RowInfo> input_rows(first_trailing_stage_ +
+ 1);
+ for (size_t i = 0; i < first_trailing_stage_; i++) {
+ input_rows[i].resize(input_data.size());
+ }
+ input_rows[first_trailing_stage_].resize(input_data.size(),
+ std::vector<float*>(1));
+
+ // Maximum possible shift is 3.
+ RenderPipelineStage::RowInfo output_rows(input_data.size(),
+ std::vector<float*>(8));
+
+ // Fills in input_rows and output_rows for a given y value (relative to the
+ // start of the group, measured in actual pixels at the appropriate vertical
+ // scaling factor) and a given stage, applying mirroring if necessary. This
+ // function is somewhat inefficient for trailing kInOut or kInput stages,
+ // where just filling the input row once ought to be sufficient.
+ auto prepare_io_rows = [&](int y, size_t i) {
+ ssize_t bordery = stages_[i]->settings_.border_y;
+ size_t shifty = stages_[i]->settings_.shift_y;
+ auto make_row = [&](size_t c, ssize_t iy) {
+ size_t mirrored_y = GetMirroredY(y + iy - bordery, group_rect[i].y0(),
+ image_rect_[i].ysize());
+ input_rows[i][c][iy] =
+ rows.GetBuffer(stage_input_for_channel_[i][c], mirrored_y, c);
+ ApplyXMirroring(input_rows[i][c][iy], stages_[i]->settings_.border_x,
+ group_rect[i].x0(), group_rect[i].xsize(),
+ image_rect_[i].xsize());
+ };
+ for (size_t c = 0; c < input_data.size(); c++) {
+ RenderPipelineChannelMode mode = stages_[i]->GetChannelMode(c);
+ if (mode == RenderPipelineChannelMode::kIgnored) {
+ continue;
+ }
+ // If we already have rows from a previous iteration, we can just shift
+ // the rows by 1 and insert the new one.
+ if (input_rows[i][c].size() == 2 * size_t(bordery) + 1) {
+ for (ssize_t iy = 0; iy < 2 * bordery; iy++) {
+ input_rows[i][c][iy] = input_rows[i][c][iy + 1];
+ }
+ make_row(c, bordery * 2);
+ } else {
+ input_rows[i][c].resize(2 * bordery + 1);
+ for (ssize_t iy = 0; iy < 2 * bordery + 1; iy++) {
+ make_row(c, iy);
+ }
+ }
+
+ // If necessary, get the output buffers.
+ if (mode == RenderPipelineChannelMode::kInOut) {
+ for (size_t iy = 0; iy < (1u << shifty); iy++) {
+ output_rows[c][iy] = rows.GetBuffer(i, y * (1 << shifty) + iy, c);
+ }
+ }
+ }
+ };
+
+ // We pretend that every stage has a vertical shift of 0, i.e. it is as tall
+ // as the final image.
+ // We call each such row a "virtual" row, because it may or may not correspond
+ // to an actual row of the current processing stage; actual processing happens
+ // when vy % (1<<vshift) == 0.
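+  // For example, a stage whose input is vertically downsampled by 2 performs
+  // actual processing only on every other virtual row.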
+
+ int num_extra_rows = *std::max_element(virtual_ypadding_for_output_.begin(),
+ virtual_ypadding_for_output_.end());
+
+ for (int vy = -num_extra_rows;
+ vy < int(image_area_rect.ysize()) + num_extra_rows; vy++) {
+ for (size_t i = 0; i < first_trailing_stage_; i++) {
+ int stage_vy = vy - num_extra_rows + virtual_ypadding_for_output_[i];
+
+ if (stage_vy % (1 << channel_shifts_[i][anyc_[i]].second) != 0) {
+ continue;
+ }
+
+ if (stage_vy < -virtual_ypadding_for_output_[i]) {
+ continue;
+ }
+
+ int y = stage_vy >> channel_shifts_[i][anyc_[i]].second;
+
+ ssize_t image_y = ssize_t(group_rect[i].y0()) + y;
+ // Do not produce rows in out-of-bounds areas.
+ if (image_y < 0 || image_y >= ssize_t(image_rect_[i].ysize())) {
+ continue;
+ }
+
+ // Get the input/output rows and potentially apply mirroring to the input.
+ prepare_io_rows(y, i);
+
+ // Produce output rows.
+ stages_[i]->ProcessRow(input_rows[i], output_rows,
+ xpadding_for_output_[i], group_rect[i].xsize(),
+ group_rect[i].x0(), image_y, thread_id);
+ }
+
+ // Process trailing stages, i.e. the final set of non-kInOut stages; they
+ // all have the same input buffer and no need to use any mirroring.
+
+ int y = vy - num_extra_rows;
+
+ for (size_t c = 0; c < input_data.size(); c++) {
+ // Skip pixels that are not part of the actual final image area.
+ input_rows[first_trailing_stage_][c][0] =
+ rows.GetBuffer(stage_input_for_channel_[first_trailing_stage_][c], y,
+ c) +
+ x_pixels_skip;
+ }
+
+ // Check that we are not outside of the bounds for the current rendering
+ // rect. Not doing so might result in overwriting some rows that have been
+ // written (or will be written) by other threads.
+ if (y < 0 || y >= ssize_t(image_area_rect.ysize())) {
+ continue;
+ }
+
+ // Avoid running pipeline stages on pixels that are outside the full image
+ // area. As trailing stages have no borders, this is a free optimization
+ // (and may be necessary for correctness, as some stages assume coordinates
+ // are within bounds).
+ ssize_t full_image_y = frame_y0 + image_area_rect.y0() + y;
+ if (full_image_y < 0 || full_image_y >= ssize_t(full_image_ysize)) {
+ continue;
+ }
+
+ for (size_t i = first_trailing_stage_; i < stages_.size(); i++) {
+ // Before the first_image_dim_stage_, coordinates are relative to the
+ // current frame.
+ size_t x0 =
+ i < first_image_dim_stage_ ? full_image_x0 - frame_x0 : full_image_x0;
+ size_t y =
+ i < first_image_dim_stage_ ? full_image_y - frame_y0 : full_image_y;
+ stages_[i]->ProcessRow(input_rows[first_trailing_stage_], output_rows,
+ /*xextra=*/0, full_image_x1 - full_image_x0, x0, y,
+ thread_id);
+ }
+ }
+}
+
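+// Renders a rect that lies outside the current frame but inside the full
+// image: the last frame-coordinate stage produces padding pixels, which are
+// then run through the remaining image-coordinate stages.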
+void LowMemoryRenderPipeline::RenderPadding(size_t thread_id, Rect rect) {
+ if (rect.xsize() == 0) return;
+ size_t numc = channel_shifts_[0].size();
+ RenderPipelineStage::RowInfo input_rows(numc, std::vector<float*>(1));
+ RenderPipelineStage::RowInfo output_rows;
+
+ for (size_t c = 0; c < numc; c++) {
+ input_rows[c][0] = out_of_frame_data_[thread_id].Row(c);
+ }
+
+ for (size_t y = 0; y < rect.ysize(); y++) {
+ stages_[first_image_dim_stage_ - 1]->ProcessPaddingRow(
+ input_rows, rect.xsize(), rect.x0(), rect.y0() + y);
+ for (size_t i = first_image_dim_stage_; i < stages_.size(); i++) {
+ stages_[i]->ProcessRow(input_rows, output_rows,
+ /*xextra=*/0, rect.xsize(), rect.x0(),
+ rect.y0() + y, thread_id);
+ }
+ }
+}
+
+void LowMemoryRenderPipeline::ProcessBuffers(size_t group_id,
+ size_t thread_id) {
+ std::vector<ImageF>& input_data =
+ group_data_[use_group_ids_ ? group_id : thread_id];
+
+ // Copy the group borders to the border storage.
+ for (size_t c = 0; c < input_data.size(); c++) {
+ SaveBorders(group_id, c, input_data[c]);
+ }
+
+ size_t gy = group_id / frame_dimensions_.xsize_groups;
+ size_t gx = group_id % frame_dimensions_.xsize_groups;
+
+ if (first_image_dim_stage_ != stages_.size()) {
+ size_t group_dim = frame_dimensions_.group_dim << base_color_shift_;
+ RectT<ssize_t> group_rect(gx * group_dim, gy * group_dim, group_dim,
+ group_dim);
+ RectT<ssize_t> image_rect(0, 0, frame_dimensions_.xsize_upsampled,
+ frame_dimensions_.ysize_upsampled);
+ RectT<ssize_t> full_image_rect(0, 0, full_image_xsize_, full_image_ysize_);
+ group_rect = group_rect.Translate(frame_origin_.x0, frame_origin_.y0);
+ image_rect = image_rect.Translate(frame_origin_.x0, frame_origin_.y0);
+ image_rect = image_rect.Intersection(full_image_rect);
+ group_rect = group_rect.Intersection(image_rect);
+ size_t x0 = group_rect.x0();
+ size_t y0 = group_rect.y0();
+ size_t x1 = group_rect.x1();
+ size_t y1 = group_rect.y1();
+ JXL_DEBUG_V(6,
+ "Rendering padding for full image rect %s "
+ "outside group rect %s",
+ Description(full_image_rect).c_str(),
+ Description(group_rect).c_str());
+
+ if (group_id == 0 && (image_rect.xsize() == 0 || image_rect.ysize() == 0)) {
+ // If this frame does not intersect with the full image, we have to
+ // initialize the whole image area with RenderPadding.
+ RenderPadding(thread_id,
+ Rect(0, 0, full_image_xsize_, full_image_ysize_));
+ }
+
+ // Render padding for groups that intersect with the full image. The case
+ // where no groups intersect was handled above.
+ if (group_rect.xsize() > 0 && group_rect.ysize() > 0) {
+ if (gx == 0 && gy == 0) {
+ RenderPadding(thread_id, Rect(0, 0, x0, y0));
+ }
+ if (gy == 0) {
+ RenderPadding(thread_id, Rect(x0, 0, x1 - x0, y0));
+ }
+ if (gx == 0) {
+ RenderPadding(thread_id, Rect(0, y0, x0, y1 - y0));
+ }
+ if (gx == 0 && gy + 1 == frame_dimensions_.ysize_groups) {
+ RenderPadding(thread_id, Rect(0, y1, x0, full_image_ysize_ - y1));
+ }
+ if (gy + 1 == frame_dimensions_.ysize_groups) {
+ RenderPadding(thread_id, Rect(x0, y1, x1 - x0, full_image_ysize_ - y1));
+ }
+ if (gy == 0 && gx + 1 == frame_dimensions_.xsize_groups) {
+ RenderPadding(thread_id, Rect(x1, 0, full_image_xsize_ - x1, y0));
+ }
+ if (gx + 1 == frame_dimensions_.xsize_groups) {
+ RenderPadding(thread_id, Rect(x1, y0, full_image_xsize_ - x1, y1 - y0));
+ }
+ if (gy + 1 == frame_dimensions_.ysize_groups &&
+ gx + 1 == frame_dimensions_.xsize_groups) {
+ RenderPadding(thread_id, Rect(x1, y1, full_image_xsize_ - x1,
+ full_image_ysize_ - y1));
+ }
+ }
+ }
+
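+  // Once this group and the relevant neighbors have delivered their borders,
+  // the border assigner reports up to kMaxToFinalize rects whose pixels are
+  // now fully determined; render them.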
+ Rect ready_rects[GroupBorderAssigner::kMaxToFinalize];
+ size_t num_ready_rects = 0;
+ group_border_assigner_.GroupDone(group_id, group_border_.first,
+ group_border_.second, ready_rects,
+ &num_ready_rects);
+ for (size_t i = 0; i < num_ready_rects; i++) {
+ const Rect& image_max_color_channel_rect = ready_rects[i];
+ for (size_t c = 0; c < input_data.size(); c++) {
+ LoadBorders(group_id, c, image_max_color_channel_rect, &input_data[c]);
+ }
+ Rect data_max_color_channel_rect(
+ group_data_x_border_ + image_max_color_channel_rect.x0() -
+ gx * frame_dimensions_.group_dim,
+ group_data_y_border_ + image_max_color_channel_rect.y0() -
+ gy * frame_dimensions_.group_dim,
+ image_max_color_channel_rect.xsize(),
+ image_max_color_channel_rect.ysize());
+ RenderRect(thread_id, input_data, data_max_color_channel_rect,
+ image_max_color_channel_rect);
+ }
+}
+} // namespace jxl