Diffstat (limited to 'third_party/jpeg-xl/lib/jxl/render_pipeline/low_memory_render_pipeline.cc')
-rw-r--r-- | third_party/jpeg-xl/lib/jxl/render_pipeline/low_memory_render_pipeline.cc | 865 |
1 file changed, 865 insertions, 0 deletions
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/low_memory_render_pipeline.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/low_memory_render_pipeline.cc
new file mode 100644
index 0000000000..db60a458db
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/low_memory_render_pipeline.cc
@@ -0,0 +1,865 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/low_memory_render_pipeline.h"
+
+#include <algorithm>
+#include <queue>
+#include <tuple>
+
+#include "lib/jxl/base/arch_macros.h"
+#include "lib/jxl/image_ops.h"
+
+namespace jxl {
+std::pair<size_t, size_t>
+LowMemoryRenderPipeline::ColorDimensionsToChannelDimensions(
+    std::pair<size_t, size_t> in, size_t c, size_t stage) const {
+  std::pair<size_t, size_t> ret;
+  std::pair<size_t, size_t> shift = channel_shifts_[stage][c];
+  ret.first =
+      ((in.first << base_color_shift_) + (1 << shift.first) - 1) >> shift.first;
+  ret.second = ((in.second << base_color_shift_) + (1 << shift.second) - 1) >>
+               shift.second;
+  return ret;
+}
+
+std::pair<size_t, size_t> LowMemoryRenderPipeline::BorderToStore(
+    size_t c) const {
+  auto ret = ColorDimensionsToChannelDimensions(group_border_, c, 0);
+  ret.first += padding_[0][c].first;
+  ret.second += padding_[0][c].second;
+  return ret;
+}
+
+void LowMemoryRenderPipeline::SaveBorders(size_t group_id, size_t c,
+                                          const ImageF& in) {
+  size_t gy = group_id / frame_dimensions_.xsize_groups;
+  size_t gx = group_id % frame_dimensions_.xsize_groups;
+  size_t hshift = channel_shifts_[0][c].first;
+  size_t vshift = channel_shifts_[0][c].second;
+  size_t x0 = gx * GroupInputXSize(c);
+  size_t x1 = std::min((gx + 1) * GroupInputXSize(c),
+                       DivCeil(frame_dimensions_.xsize_upsampled, 1 << hshift));
+  size_t y0 = gy * GroupInputYSize(c);
+  size_t y1 = std::min((gy + 1) * GroupInputYSize(c),
+                       DivCeil(frame_dimensions_.ysize_upsampled, 1 << vshift));
+
+  auto borders = BorderToStore(c);
+  size_t borderx_write = borders.first;
+  size_t bordery_write = borders.second;
+
+  if (gy > 0) {
+    Rect from(group_data_x_border_, group_data_y_border_, x1 - x0,
+              bordery_write);
+    Rect to(x0, (gy * 2 - 1) * bordery_write, x1 - x0, bordery_write);
+    CopyImageTo(from, in, to, &borders_horizontal_[c]);
+  }
+  if (gy + 1 < frame_dimensions_.ysize_groups) {
+    Rect from(group_data_x_border_,
+              group_data_y_border_ + y1 - y0 - bordery_write, x1 - x0,
+              bordery_write);
+    Rect to(x0, (gy * 2) * bordery_write, x1 - x0, bordery_write);
+    CopyImageTo(from, in, to, &borders_horizontal_[c]);
+  }
+  if (gx > 0) {
+    Rect from(group_data_x_border_, group_data_y_border_, borderx_write,
+              y1 - y0);
+    Rect to((gx * 2 - 1) * borderx_write, y0, borderx_write, y1 - y0);
+    CopyImageTo(from, in, to, &borders_vertical_[c]);
+  }
+  if (gx + 1 < frame_dimensions_.xsize_groups) {
+    Rect from(group_data_x_border_ + x1 - x0 - borderx_write,
+              group_data_y_border_, borderx_write, y1 - y0);
+    Rect to((gx * 2) * borderx_write, y0, borderx_write, y1 - y0);
+    CopyImageTo(from, in, to, &borders_vertical_[c]);
+  }
+}
+
+void LowMemoryRenderPipeline::LoadBorders(size_t group_id, size_t c,
+                                          const Rect& r, ImageF* out) {
+  size_t gy = group_id / frame_dimensions_.xsize_groups;
+  size_t gx = group_id % frame_dimensions_.xsize_groups;
+  size_t hshift = channel_shifts_[0][c].first;
+  size_t vshift = channel_shifts_[0][c].second;
+  // Coordinates of the group in the image.
+  size_t x0 = gx * GroupInputXSize(c);
+  size_t x1 = std::min((gx + 1) * GroupInputXSize(c),
+                       DivCeil(frame_dimensions_.xsize_upsampled, 1 << hshift));
+  size_t y0 = gy * GroupInputYSize(c);
+  size_t y1 = std::min((gy + 1) * GroupInputYSize(c),
+                       DivCeil(frame_dimensions_.ysize_upsampled, 1 << vshift));
+
+  size_t paddingx = padding_[0][c].first;
+  size_t paddingy = padding_[0][c].second;
+
+  auto borders = BorderToStore(c);
+  size_t borderx_write = borders.first;
+  size_t bordery_write = borders.second;
+
+  // Limits of the area to copy from, in image coordinates.
+  JXL_DASSERT(r.x0() == 0 || (r.x0() << base_color_shift_) >= paddingx);
+  size_t x0src = DivCeil(r.x0() << base_color_shift_, 1 << hshift);
+  if (x0src != 0) {
+    x0src -= paddingx;
+  }
+  // r may be such that r.x1 (namely x0() + xsize()) is within paddingx of the
+  // right side of the image, so we use min() here.
+  size_t x1src =
+      DivCeil((r.x0() + r.xsize()) << base_color_shift_, 1 << hshift);
+  x1src = std::min(x1src + paddingx,
+                   DivCeil(frame_dimensions_.xsize_upsampled, 1 << hshift));
+
+  // Similar computation for y.
+  JXL_DASSERT(r.y0() == 0 || (r.y0() << base_color_shift_) >= paddingy);
+  size_t y0src = DivCeil(r.y0() << base_color_shift_, 1 << vshift);
+  if (y0src != 0) {
+    y0src -= paddingy;
+  }
+  size_t y1src =
+      DivCeil((r.y0() + r.ysize()) << base_color_shift_, 1 << vshift);
+  y1src = std::min(y1src + paddingy,
+                   DivCeil(frame_dimensions_.ysize_upsampled, 1 << vshift));
+
+  // Copy other groups' borders from the border storage.
+  if (y0src < y0) {
+    JXL_DASSERT(gy > 0);
+    CopyImageTo(
+        Rect(x0src, (gy * 2 - 2) * bordery_write, x1src - x0src, bordery_write),
+        borders_horizontal_[c],
+        Rect(group_data_x_border_ + x0src - x0,
+             group_data_y_border_ - bordery_write, x1src - x0src,
+             bordery_write),
+        out);
+  }
+  if (y1src > y1) {
+    // When copying the bottom border we must not be in the bottom row of
+    // groups.
+    JXL_DASSERT(gy + 1 < frame_dimensions_.ysize_groups);
+    CopyImageTo(
+        Rect(x0src, (gy * 2 + 1) * bordery_write, x1src - x0src, bordery_write),
+        borders_horizontal_[c],
+        Rect(group_data_x_border_ + x0src - x0, group_data_y_border_ + y1 - y0,
+             x1src - x0src, bordery_write),
+        out);
+  }
+  if (x0src < x0) {
+    JXL_DASSERT(gx > 0);
+    CopyImageTo(
+        Rect((gx * 2 - 2) * borderx_write, y0src, borderx_write, y1src - y0src),
+        borders_vertical_[c],
+        Rect(group_data_x_border_ - borderx_write,
+             group_data_y_border_ + y0src - y0, borderx_write, y1src - y0src),
+        out);
+  }
+  if (x1src > x1) {
+    // When copying the right border we must not be in the rightmost column of
+    // groups.
+    JXL_DASSERT(gx + 1 < frame_dimensions_.xsize_groups);
+    CopyImageTo(
+        Rect((gx * 2 + 1) * borderx_write, y0src, borderx_write, y1src - y0src),
+        borders_vertical_[c],
+        Rect(group_data_x_border_ + x1 - x0, group_data_y_border_ + y0src - y0,
+             borderx_write, y1src - y0src),
+        out);
+  }
+}
+
+size_t LowMemoryRenderPipeline::GroupInputXSize(size_t c) const {
+  return (frame_dimensions_.group_dim << base_color_shift_) >>
+         channel_shifts_[0][c].first;
+}
+
+size_t LowMemoryRenderPipeline::GroupInputYSize(size_t c) const {
+  return (frame_dimensions_.group_dim << base_color_shift_) >>
+         channel_shifts_[0][c].second;
+}
+
+void LowMemoryRenderPipeline::EnsureBordersStorage() {
+  const auto& shifts = channel_shifts_[0];
+  if (borders_horizontal_.size() < shifts.size()) {
+    borders_horizontal_.resize(shifts.size());
+    borders_vertical_.resize(shifts.size());
+  }
+  for (size_t c = 0; c < shifts.size(); c++) {
+    auto borders = BorderToStore(c);
+    size_t borderx = borders.first;
+    size_t bordery = borders.second;
+    JXL_DASSERT(frame_dimensions_.xsize_groups > 0);
+    size_t num_xborders = (frame_dimensions_.xsize_groups - 1) * 2;
+    JXL_DASSERT(frame_dimensions_.ysize_groups > 0);
+    size_t num_yborders = (frame_dimensions_.ysize_groups - 1) * 2;
+    size_t downsampled_xsize =
+        DivCeil(frame_dimensions_.xsize_upsampled_padded, 1 << shifts[c].first);
+    size_t downsampled_ysize = DivCeil(frame_dimensions_.ysize_upsampled_padded,
+                                       1 << shifts[c].second);
+    Rect horizontal = Rect(0, 0, downsampled_xsize, bordery * num_yborders);
+    if (!SameSize(horizontal, borders_horizontal_[c])) {
+      borders_horizontal_[c] = ImageF(horizontal.xsize(), horizontal.ysize());
+    }
+    Rect vertical = Rect(0, 0, borderx * num_xborders, downsampled_ysize);
+    if (!SameSize(vertical, borders_vertical_[c])) {
+      borders_vertical_[c] = ImageF(vertical.xsize(), vertical.ysize());
+    }
+  }
+}
+
+void LowMemoryRenderPipeline::Init() {
+  group_border_ = {0, 0};
+  base_color_shift_ = CeilLog2Nonzero(frame_dimensions_.xsize_upsampled_padded /
+                                      frame_dimensions_.xsize_padded);
+
+  const auto& shifts = channel_shifts_[0];
+
+  // Ensure that each channel has enough border pixels.
+  for (size_t c = 0; c < shifts.size(); c++) {
+    group_border_.first =
+        std::max(group_border_.first,
+                 DivCeil(padding_[0][c].first << channel_shifts_[0][c].first,
+                         1 << base_color_shift_));
+    group_border_.second =
+        std::max(group_border_.second,
+                 DivCeil(padding_[0][c].second << channel_shifts_[0][c].second,
+                         1 << base_color_shift_));
+  }
+
+  // Ensure that all channels have an integer number of border pixels in the
+  // input.
+  for (size_t c = 0; c < shifts.size(); c++) {
+    if (channel_shifts_[0][c].first >= base_color_shift_) {
+      group_border_.first =
+          RoundUpTo(group_border_.first,
+                    1 << (channel_shifts_[0][c].first - base_color_shift_));
+    }
+    if (channel_shifts_[0][c].second >= base_color_shift_) {
+      group_border_.second =
+          RoundUpTo(group_border_.second,
+                    1 << (channel_shifts_[0][c].second - base_color_shift_));
+    }
+  }
+  // Ensure that the X border on color channels is a multiple of kBlockDim or
+  // the vector size (required for EPF stages). Vectors on ARM NEON are never
+  // wider than 4 floats, so rounding to multiples of 4 is enough.
+#if JXL_ARCH_ARM
+  constexpr size_t kGroupXAlign = 4;
+#else
+  constexpr size_t kGroupXAlign = 16;
+#endif
+  group_border_.first = RoundUpTo(group_border_.first, kGroupXAlign);
+  // Allocate borders in group images that are just enough for storing the
+  // borders to be copied in, plus any rounding to ensure alignment.
+  std::pair<size_t, size_t> max_border = {0, 0};
+  for (size_t c = 0; c < shifts.size(); c++) {
+    max_border.first = std::max(BorderToStore(c).first, max_border.first);
+    max_border.second = std::max(BorderToStore(c).second, max_border.second);
+  }
+  group_data_x_border_ = RoundUpTo(max_border.first, kGroupXAlign);
+  group_data_y_border_ = max_border.second;
+
+  EnsureBordersStorage();
+  group_border_assigner_.Init(frame_dimensions_);
+
+  for (first_trailing_stage_ = stages_.size(); first_trailing_stage_ > 0;
+       first_trailing_stage_--) {
+    bool has_inout_c = false;
+    for (size_t c = 0; c < shifts.size(); c++) {
+      if (stages_[first_trailing_stage_ - 1]->GetChannelMode(c) ==
+          RenderPipelineChannelMode::kInOut) {
+        has_inout_c = true;
+      }
+    }
+    if (has_inout_c) {
+      break;
+    }
+  }
+
+  first_image_dim_stage_ = stages_.size();
+  for (size_t i = 0; i < stages_.size(); i++) {
+    std::vector<std::pair<size_t, size_t>> input_sizes(shifts.size());
+    for (size_t c = 0; c < shifts.size(); c++) {
+      input_sizes[c] =
+          std::make_pair(DivCeil(frame_dimensions_.xsize_upsampled,
+                                 1 << channel_shifts_[i][c].first),
+                         DivCeil(frame_dimensions_.ysize_upsampled,
+                                 1 << channel_shifts_[i][c].second));
+    }
+    stages_[i]->SetInputSizes(input_sizes);
+    if (stages_[i]->SwitchToImageDimensions()) {
+      // We don't allow kInOut after switching to image dimensions.
+      JXL_ASSERT(i >= first_trailing_stage_);
+      first_image_dim_stage_ = i + 1;
+      stages_[i]->GetImageDimensions(&full_image_xsize_, &full_image_ysize_,
+                                     &frame_origin_);
+      break;
+    }
+  }
+  for (size_t i = first_image_dim_stage_; i < stages_.size(); i++) {
+    if (stages_[i]->SwitchToImageDimensions()) {
+      JXL_ABORT("Cannot switch to image dimensions multiple times");
+    }
+    std::vector<std::pair<size_t, size_t>> input_sizes(shifts.size());
+    for (size_t c = 0; c < shifts.size(); c++) {
+      input_sizes[c] = {full_image_xsize_, full_image_ysize_};
+    }
+    stages_[i]->SetInputSizes(input_sizes);
+  }
+
+  anyc_.resize(stages_.size());
+  for (size_t i = 0; i < stages_.size(); i++) {
+    for (size_t c = 0; c < shifts.size(); c++) {
+      if (stages_[i]->GetChannelMode(c) !=
+          RenderPipelineChannelMode::kIgnored) {
+        anyc_[i] = c;
+      }
+    }
+  }
+
+  stage_input_for_channel_ = std::vector<std::vector<int32_t>>(
+      stages_.size(), std::vector<int32_t>(shifts.size()));
+  for (size_t c = 0; c < shifts.size(); c++) {
+    int input = -1;
+    for (size_t i = 0; i < stages_.size(); i++) {
+      stage_input_for_channel_[i][c] = input;
+      if (stages_[i]->GetChannelMode(c) == RenderPipelineChannelMode::kInOut) {
+        input = i;
+      }
+    }
+  }
+
+  image_rect_.resize(stages_.size());
+  for (size_t i = 0; i < stages_.size(); i++) {
+    size_t x1 = DivCeil(frame_dimensions_.xsize_upsampled,
+                        1 << channel_shifts_[i][anyc_[i]].first);
+    size_t y1 = DivCeil(frame_dimensions_.ysize_upsampled,
+                        1 << channel_shifts_[i][anyc_[i]].second);
+    image_rect_[i] = Rect(0, 0, x1, y1);
+  }
+
+  virtual_ypadding_for_output_.resize(stages_.size());
+  xpadding_for_output_.resize(stages_.size());
+  for (size_t c = 0; c < shifts.size(); c++) {
+    int ypad = 0;
+    int xpad = 0;
+    for (size_t i = stages_.size(); i-- > 0;) {
+      if (stages_[i]->GetChannelMode(c) !=
+          RenderPipelineChannelMode::kIgnored) {
+        virtual_ypadding_for_output_[i] =
+            std::max(ypad, virtual_ypadding_for_output_[i]);
+        xpadding_for_output_[i] = std::max(xpad, xpadding_for_output_[i]);
+      }
+      if (stages_[i]->GetChannelMode(c) == RenderPipelineChannelMode::kInOut) {
+        ypad = (DivCeil(ypad, 1 << channel_shifts_[i][c].second) +
+                stages_[i]->settings_.border_y)
+               << channel_shifts_[i][c].second;
+        xpad = DivCeil(xpad, 1 << stages_[i]->settings_.shift_x) +
+               stages_[i]->settings_.border_x;
+      }
+    }
+  }
+}
+
+void LowMemoryRenderPipeline::PrepareForThreadsInternal(size_t num,
+                                                        bool use_group_ids) {
+  const auto& shifts = channel_shifts_[0];
+
+  use_group_ids_ = use_group_ids;
+  size_t num_buffers = use_group_ids_ ? frame_dimensions_.num_groups : num;
+  for (size_t t = group_data_.size(); t < num_buffers; t++) {
+    group_data_.emplace_back();
+    group_data_[t].resize(shifts.size());
+    for (size_t c = 0; c < shifts.size(); c++) {
+      group_data_[t][c] = ImageF(GroupInputXSize(c) + group_data_x_border_ * 2,
+                                 GroupInputYSize(c) + group_data_y_border_ * 2);
+    }
+  }
+  // TODO(veluca): avoid reallocating buffers if not needed.
+  stage_data_.resize(num);
+  size_t upsampling = 1u << base_color_shift_;
+  size_t group_dim = frame_dimensions_.group_dim * upsampling;
+  size_t padding =
+      2 * group_data_x_border_ * upsampling +  // maximum size of a rect
+      2 * kRenderPipelineXOffset;              // extra padding for processing
+  size_t stage_buffer_xsize = group_dim + padding;
+  for (size_t t = 0; t < num; t++) {
+    stage_data_[t].resize(shifts.size());
+    for (size_t c = 0; c < shifts.size(); c++) {
+      stage_data_[t][c].resize(stages_.size());
+      size_t next_y_border = 0;
+      for (size_t i = stages_.size(); i-- > 0;) {
+        if (stages_[i]->GetChannelMode(c) ==
+            RenderPipelineChannelMode::kInOut) {
+          size_t stage_buffer_ysize =
+              2 * next_y_border + (1 << stages_[i]->settings_.shift_y);
+          stage_buffer_ysize = 1 << CeilLog2Nonzero(stage_buffer_ysize);
+          next_y_border = stages_[i]->settings_.border_y;
+          stage_data_[t][c][i] = ImageF(stage_buffer_xsize, stage_buffer_ysize);
+        }
+      }
+    }
+  }
+  if (first_image_dim_stage_ != stages_.size()) {
+    RectT<ssize_t> image_rect(0, 0, frame_dimensions_.xsize_upsampled,
+                              frame_dimensions_.ysize_upsampled);
+    RectT<ssize_t> full_image_rect(0, 0, full_image_xsize_, full_image_ysize_);
+    image_rect = image_rect.Translate(frame_origin_.x0, frame_origin_.y0);
+    image_rect = image_rect.Intersection(full_image_rect);
+    if (image_rect.xsize() == 0 || image_rect.ysize() == 0) {
+      image_rect = RectT<ssize_t>(0, 0, 0, 0);
+    }
+    size_t left_padding = image_rect.x0();
+    size_t middle_padding = group_dim;
+    size_t right_padding = full_image_xsize_ - image_rect.x1();
+    size_t out_of_frame_xsize =
+        padding +
+        std::max(left_padding, std::max(middle_padding, right_padding));
+    out_of_frame_data_.resize(num);
+    for (size_t t = 0; t < num; t++) {
+      out_of_frame_data_[t] = ImageF(out_of_frame_xsize, shifts.size());
+    }
+  }
+}
+
+std::vector<std::pair<ImageF*, Rect>> LowMemoryRenderPipeline::PrepareBuffers(
+    size_t group_id, size_t thread_id) {
+  std::vector<std::pair<ImageF*, Rect>> ret(channel_shifts_[0].size());
+  const size_t gx = group_id % frame_dimensions_.xsize_groups;
+  const size_t gy = group_id / frame_dimensions_.xsize_groups;
+  for (size_t c = 0; c < channel_shifts_[0].size(); c++) {
+    ret[c].first = &group_data_[use_group_ids_ ? group_id : thread_id][c];
+    ret[c].second = Rect(group_data_x_border_, group_data_y_border_,
+                         GroupInputXSize(c), GroupInputYSize(c),
+                         DivCeil(frame_dimensions_.xsize_upsampled,
+                                 1 << channel_shifts_[0][c].first) -
+                             gx * GroupInputXSize(c) + group_data_x_border_,
+                         DivCeil(frame_dimensions_.ysize_upsampled,
+                                 1 << channel_shifts_[0][c].second) -
+                             gy * GroupInputYSize(c) + group_data_y_border_);
+  }
+  return ret;
+}
+
+namespace {
+
+JXL_INLINE int GetMirroredY(int y, ssize_t group_y0, ssize_t image_ysize) {
+  if (group_y0 == 0 && (y < 0 || y + group_y0 >= image_ysize)) {
+    return Mirror(y, image_ysize);
+  }
+  if (y + group_y0 >= image_ysize) {
+    // Here we know that one mirroring step is sufficient.
+    return 2 * image_ysize - (y + group_y0) - 1 - group_y0;
+  }
+  return y;
+}
+
+JXL_INLINE void ApplyXMirroring(float* row, ssize_t borderx, ssize_t group_x0,
+                                ssize_t group_xsize, ssize_t image_xsize) {
+  if (image_xsize <= borderx) {
+    if (group_x0 == 0) {
+      for (ssize_t ix = 0; ix < borderx; ix++) {
+        row[kRenderPipelineXOffset - ix - 1] =
+            row[kRenderPipelineXOffset + Mirror(-ix - 1, image_xsize)];
+      }
+    }
+    if (group_xsize + borderx + group_x0 >= image_xsize) {
+      for (ssize_t ix = 0; ix < borderx; ix++) {
+        row[kRenderPipelineXOffset + image_xsize + ix - group_x0] =
+            row[kRenderPipelineXOffset + Mirror(image_xsize + ix, image_xsize) -
+                group_x0];
+      }
+    }
+  } else {
+    // Here we know that one mirroring step is sufficient.
+    if (group_x0 == 0) {
+      for (ssize_t ix = 0; ix < borderx; ix++) {
+        row[kRenderPipelineXOffset - ix - 1] = row[kRenderPipelineXOffset + ix];
+      }
+    }
+    if (group_xsize + borderx + group_x0 >= image_xsize) {
+      for (ssize_t ix = 0; ix < borderx; ix++) {
+        row[kRenderPipelineXOffset + image_xsize - group_x0 + ix] =
+            row[kRenderPipelineXOffset + image_xsize - group_x0 - ix - 1];
+      }
+    }
+  }
+}
+
+// Information about where the *output* of each stage is stored.
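+// rows_[0] holds the group's input buffers; rows_[i + 1] holds the output of
+// stage i, which is the input of stage i + 1. kInOut outputs live in rolling
+// buffers whose ysize is a power of two, so GetBuffer() can wrap y with a
+// single bitmask (y & ymod_minus_1) instead of a modulo; the input buffers
+// use ymod_minus_1 == -1, which leaves y unchanged.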
+class Rows {
+ public:
+  Rows(const std::vector<std::unique_ptr<RenderPipelineStage>>& stages,
+       const Rect data_max_color_channel_rect, int group_data_x_border,
+       int group_data_y_border,
+       const std::vector<std::pair<size_t, size_t>>& group_data_shift,
+       size_t base_color_shift, std::vector<std::vector<ImageF>>& thread_data,
+       std::vector<ImageF>& input_data) {
+    size_t num_stages = stages.size();
+    size_t num_channels = input_data.size();
+
+    JXL_ASSERT(thread_data.size() == num_channels);
+    JXL_ASSERT(group_data_shift.size() == num_channels);
+
+#if JXL_ENABLE_ASSERT
+    for (const auto& td : thread_data) {
+      JXL_ASSERT(td.size() == num_stages);
+    }
+#endif
+
+    rows_.resize(num_stages + 1, std::vector<RowInfo>(num_channels));
+
+    for (size_t i = 0; i < num_stages; i++) {
+      for (size_t c = 0; c < input_data.size(); c++) {
+        if (stages[i]->GetChannelMode(c) == RenderPipelineChannelMode::kInOut) {
+          rows_[i + 1][c].ymod_minus_1 = thread_data[c][i].ysize() - 1;
+          rows_[i + 1][c].base_ptr = thread_data[c][i].Row(0);
+          rows_[i + 1][c].stride = thread_data[c][i].PixelsPerRow();
+        }
+      }
+    }
+
+    for (size_t c = 0; c < input_data.size(); c++) {
+      auto channel_group_data_rect =
+          data_max_color_channel_rect.As<ssize_t>()
+              .Translate(-group_data_x_border, -group_data_y_border)
+              .ShiftLeft(base_color_shift)
+              .CeilShiftRight(group_data_shift[c])
+              .Translate(group_data_x_border - ssize_t(kRenderPipelineXOffset),
+                         group_data_y_border);
+      rows_[0][c].base_ptr = channel_group_data_rect.Row(&input_data[c], 0);
+      rows_[0][c].stride = input_data[c].PixelsPerRow();
+      rows_[0][c].ymod_minus_1 = -1;
+    }
+  }
+
+  // Stage -1 refers to the input data; all other values must be nonnegative
+  // and refer to the data for the output of that stage.
+  JXL_INLINE float* GetBuffer(int stage, int y, size_t c) const {
+    JXL_DASSERT(stage >= -1);
+    const RowInfo& info = rows_[stage + 1][c];
+    return info.base_ptr + ssize_t(info.stride) * (y & info.ymod_minus_1);
+  }
+
+ private:
+  struct RowInfo {
+    // Pointer to the beginning of the first row.
+    float* base_ptr;
+    // Modulo value for the y axis minus 1 (ymod is guaranteed to be a power
+    // of 2, which allows efficient mod computation by masking).
+    int ymod_minus_1;
+    // Number of floats per row.
+    size_t stride;
+  };
+  std::vector<std::vector<RowInfo>> rows_;
+};
+
+}  // namespace
+
+void LowMemoryRenderPipeline::RenderRect(size_t thread_id,
+                                         std::vector<ImageF>& input_data,
+                                         Rect data_max_color_channel_rect,
+                                         Rect image_max_color_channel_rect) {
+  // For each stage, the rect corresponding to the image area currently being
+  // processed, in the coordinates of that stage (i.e. with the scaling factor
+  // that that stage has).
+  std::vector<Rect> group_rect;
+  group_rect.resize(stages_.size());
+  Rect image_area_rect =
+      image_max_color_channel_rect.ShiftLeft(base_color_shift_)
+          .Crop(frame_dimensions_.xsize_upsampled,
+                frame_dimensions_.ysize_upsampled);
+  for (size_t i = 0; i < stages_.size(); i++) {
+    group_rect[i] =
+        image_area_rect.CeilShiftRight(channel_shifts_[i][anyc_[i]]);
+  }
+
+  ssize_t frame_x0 =
+      first_image_dim_stage_ == stages_.size() ? 0 : frame_origin_.x0;
+  ssize_t frame_y0 =
+      first_image_dim_stage_ == stages_.size() ? 0 : frame_origin_.y0;
+  size_t full_image_xsize = first_image_dim_stage_ == stages_.size()
+                                ? frame_dimensions_.xsize_upsampled
+                                : full_image_xsize_;
+  size_t full_image_ysize = first_image_dim_stage_ == stages_.size()
+                                ? frame_dimensions_.ysize_upsampled
+                                : full_image_ysize_;
+
+  // Compute actual x-axis bounds for the current image area in the context of
+  // the full image this frame is part of. As the left boundary may be
+  // negative, we also create the x_pixels_skip value, defined as follows:
+  // - both x_pixels_skip and full_image_x0 are >= 0, and at least one is 0;
+  // - full_image_x0 - x_pixels_skip is the position of the current frame area
+  //   in the full image.
+  ssize_t full_image_x0 = frame_x0 + image_area_rect.x0();
+  ssize_t x_pixels_skip = 0;
+  if (full_image_x0 < 0) {
+    x_pixels_skip = -full_image_x0;
+    full_image_x0 = 0;
+  }
+  ssize_t full_image_x1 = frame_x0 + image_area_rect.x1();
+  full_image_x1 = std::min<ssize_t>(full_image_x1, full_image_xsize);
+
+  // If the current image area is entirely outside of the visible image, there
+  // is no point in proceeding. Note: this uses the assumption that if there
+  // is a stage with observable effects (i.e. a kInput stage), it only appears
+  // after the stage that switches to image dimensions.
+  if (full_image_x1 <= full_image_x0) return;
+
+  // Data structures to hold information about input/output rows and their
+  // buffers.
+  Rows rows(stages_, data_max_color_channel_rect, group_data_x_border_,
+            group_data_y_border_, channel_shifts_[0], base_color_shift_,
+            stage_data_[thread_id], input_data);
+
+  std::vector<RenderPipelineStage::RowInfo> input_rows(first_trailing_stage_ +
+                                                       1);
+  for (size_t i = 0; i < first_trailing_stage_; i++) {
+    input_rows[i].resize(input_data.size());
+  }
+  input_rows[first_trailing_stage_].resize(input_data.size(),
+                                           std::vector<float*>(1));
+
+  // Maximum possible shift is 3.
+  RenderPipelineStage::RowInfo output_rows(input_data.size(),
+                                           std::vector<float*>(8));
+
+  // Fills in input_rows and output_rows for a given y value (relative to the
+  // start of the group, measured in actual pixels at the appropriate vertical
+  // scaling factor) and a given stage, applying mirroring if necessary. This
+  // function is somewhat inefficient for trailing kInOut or kInput stages,
+  // where just filling the input row once ought to be sufficient.
+  auto prepare_io_rows = [&](int y, size_t i) {
+    ssize_t bordery = stages_[i]->settings_.border_y;
+    size_t shifty = stages_[i]->settings_.shift_y;
+    auto make_row = [&](size_t c, ssize_t iy) {
+      size_t mirrored_y = GetMirroredY(y + iy - bordery, group_rect[i].y0(),
+                                       image_rect_[i].ysize());
+      input_rows[i][c][iy] =
+          rows.GetBuffer(stage_input_for_channel_[i][c], mirrored_y, c);
+      ApplyXMirroring(input_rows[i][c][iy], stages_[i]->settings_.border_x,
+                      group_rect[i].x0(), group_rect[i].xsize(),
+                      image_rect_[i].xsize());
+    };
+    for (size_t c = 0; c < input_data.size(); c++) {
+      RenderPipelineChannelMode mode = stages_[i]->GetChannelMode(c);
+      if (mode == RenderPipelineChannelMode::kIgnored) {
+        continue;
+      }
+      // If we already have rows from a previous iteration, we can just shift
+      // the rows by 1 and insert the new one.
+      if (input_rows[i][c].size() == 2 * size_t(bordery) + 1) {
+        for (ssize_t iy = 0; iy < 2 * bordery; iy++) {
+          input_rows[i][c][iy] = input_rows[i][c][iy + 1];
+        }
+        make_row(c, bordery * 2);
+      } else {
+        input_rows[i][c].resize(2 * bordery + 1);
+        for (ssize_t iy = 0; iy < 2 * bordery + 1; iy++) {
+          make_row(c, iy);
+        }
+      }
+
+      // If necessary, get the output buffers.
+      if (mode == RenderPipelineChannelMode::kInOut) {
+        for (size_t iy = 0; iy < (1u << shifty); iy++) {
+          output_rows[c][iy] = rows.GetBuffer(i, y * (1 << shifty) + iy, c);
+        }
+      }
+    }
+  };
+
+  // We pretend that every stage has a vertical shift of 0, i.e. it is as tall
+  // as the final image.
+  // We call each such row a "virtual" row, because it may or may not
+  // correspond to an actual row of the current processing stage; actual
+  // processing happens when vy % (1 << vshift) == 0.
+
+  int num_extra_rows = *std::max_element(virtual_ypadding_for_output_.begin(),
+                                         virtual_ypadding_for_output_.end());
+
+  for (int vy = -num_extra_rows;
+       vy < int(image_area_rect.ysize()) + num_extra_rows; vy++) {
+    for (size_t i = 0; i < first_trailing_stage_; i++) {
+      int stage_vy = vy - num_extra_rows + virtual_ypadding_for_output_[i];
+
+      if (stage_vy % (1 << channel_shifts_[i][anyc_[i]].second) != 0) {
+        continue;
+      }
+
+      if (stage_vy < -virtual_ypadding_for_output_[i]) {
+        continue;
+      }
+
+      int y = stage_vy >> channel_shifts_[i][anyc_[i]].second;
+
+      ssize_t image_y = ssize_t(group_rect[i].y0()) + y;
+      // Do not produce rows in out-of-bounds areas.
+      if (image_y < 0 || image_y >= ssize_t(image_rect_[i].ysize())) {
+        continue;
+      }
+
+      // Get the input/output rows and potentially apply mirroring to the
+      // input.
+      prepare_io_rows(y, i);
+
+      // Produce output rows.
+      stages_[i]->ProcessRow(input_rows[i], output_rows,
+                             xpadding_for_output_[i], group_rect[i].xsize(),
+                             group_rect[i].x0(), image_y, thread_id);
+    }
+
+    // Process trailing stages, i.e. the final set of non-kInOut stages; they
+    // all have the same input buffer and no need to use any mirroring.
+
+    int y = vy - num_extra_rows;
+
+    for (size_t c = 0; c < input_data.size(); c++) {
+      // Skip pixels that are not part of the actual final image area.
+      input_rows[first_trailing_stage_][c][0] =
+          rows.GetBuffer(stage_input_for_channel_[first_trailing_stage_][c], y,
+                         c) +
+          x_pixels_skip;
+    }
+
+    // Check that we are not outside of the bounds for the current rendering
+    // rect. Not doing so might result in overwriting some rows that have been
+    // written (or will be written) by other threads.
+    if (y < 0 || y >= ssize_t(image_area_rect.ysize())) {
+      continue;
+    }
+
+    // Avoid running pipeline stages on pixels that are outside the full image
+    // area. As trailing stages have no borders, this is a free optimization
+    // (and may be necessary for correctness, as some stages assume coordinates
+    // are within bounds).
+    ssize_t full_image_y = frame_y0 + image_area_rect.y0() + y;
+    if (full_image_y < 0 || full_image_y >= ssize_t(full_image_ysize)) {
+      continue;
+    }
+
+    for (size_t i = first_trailing_stage_; i < stages_.size(); i++) {
+      // Before the first_image_dim_stage_, coordinates are relative to the
+      // current frame.
+      size_t x0 =
+          i < first_image_dim_stage_ ? full_image_x0 - frame_x0 : full_image_x0;
+      size_t y = i < first_image_dim_stage_ ? full_image_y - frame_y0
+                                            : full_image_y;
+      stages_[i]->ProcessRow(input_rows[first_trailing_stage_], output_rows,
+                             /*xextra=*/0, full_image_x1 - full_image_x0, x0,
+                             y, thread_id);
+    }
+  }
+}
+
+void LowMemoryRenderPipeline::RenderPadding(size_t thread_id, Rect rect) {
+  if (rect.xsize() == 0) return;
+  size_t numc = channel_shifts_[0].size();
+  RenderPipelineStage::RowInfo input_rows(numc, std::vector<float*>(1));
+  RenderPipelineStage::RowInfo output_rows;
+
+  for (size_t c = 0; c < numc; c++) {
+    input_rows[c][0] = out_of_frame_data_[thread_id].Row(c);
+  }
+
+  for (size_t y = 0; y < rect.ysize(); y++) {
+    stages_[first_image_dim_stage_ - 1]->ProcessPaddingRow(
+        input_rows, rect.xsize(), rect.x0(), rect.y0() + y);
+    for (size_t i = first_image_dim_stage_; i < stages_.size(); i++) {
+      stages_[i]->ProcessRow(input_rows, output_rows,
+                             /*xextra=*/0, rect.xsize(), rect.x0(),
+                             rect.y0() + y, thread_id);
+    }
+  }
+}
+
+void LowMemoryRenderPipeline::ProcessBuffers(size_t group_id,
+                                             size_t thread_id) {
+  std::vector<ImageF>& input_data =
+      group_data_[use_group_ids_ ? group_id : thread_id];
+
+  // Copy the group borders to the border storage.
+  for (size_t c = 0; c < input_data.size(); c++) {
+    SaveBorders(group_id, c, input_data[c]);
+  }
+
+  size_t gy = group_id / frame_dimensions_.xsize_groups;
+  size_t gx = group_id % frame_dimensions_.xsize_groups;
+
+  if (first_image_dim_stage_ != stages_.size()) {
+    size_t group_dim = frame_dimensions_.group_dim << base_color_shift_;
+    RectT<ssize_t> group_rect(gx * group_dim, gy * group_dim, group_dim,
+                              group_dim);
+    RectT<ssize_t> image_rect(0, 0, frame_dimensions_.xsize_upsampled,
+                              frame_dimensions_.ysize_upsampled);
+    RectT<ssize_t> full_image_rect(0, 0, full_image_xsize_, full_image_ysize_);
+    group_rect = group_rect.Translate(frame_origin_.x0, frame_origin_.y0);
+    image_rect = image_rect.Translate(frame_origin_.x0, frame_origin_.y0);
+    image_rect = image_rect.Intersection(full_image_rect);
+    group_rect = group_rect.Intersection(image_rect);
+    size_t x0 = group_rect.x0();
+    size_t y0 = group_rect.y0();
+    size_t x1 = group_rect.x1();
+    size_t y1 = group_rect.y1();
+    JXL_DEBUG_V(6,
+                "Rendering padding for full image rect %s "
+                "outside group rect %s",
+                Description(full_image_rect).c_str(),
+                Description(group_rect).c_str());
+
+    if (group_id == 0 &&
+        (image_rect.xsize() == 0 || image_rect.ysize() == 0)) {
+      // If this frame does not intersect with the full image, we have to
+      // initialize the whole image area with RenderPadding.
+      RenderPadding(thread_id,
+                    Rect(0, 0, full_image_xsize_, full_image_ysize_));
+    }
+
+    // Render padding for groups that intersect with the full image. The case
+    // where no groups intersect was handled above.
+    if (group_rect.xsize() > 0 && group_rect.ysize() > 0) {
+      if (gx == 0 && gy == 0) {
+        RenderPadding(thread_id, Rect(0, 0, x0, y0));
+      }
+      if (gy == 0) {
+        RenderPadding(thread_id, Rect(x0, 0, x1 - x0, y0));
+      }
+      if (gx == 0) {
+        RenderPadding(thread_id, Rect(0, y0, x0, y1 - y0));
+      }
+      if (gx == 0 && gy + 1 == frame_dimensions_.ysize_groups) {
+        RenderPadding(thread_id, Rect(0, y1, x0, full_image_ysize_ - y1));
+      }
+      if (gy + 1 == frame_dimensions_.ysize_groups) {
+        RenderPadding(thread_id, Rect(x0, y1, x1 - x0, full_image_ysize_ - y1));
+      }
+      if (gy == 0 && gx + 1 == frame_dimensions_.xsize_groups) {
+        RenderPadding(thread_id, Rect(x1, 0, full_image_xsize_ - x1, y0));
+      }
+      if (gx + 1 == frame_dimensions_.xsize_groups) {
+        RenderPadding(thread_id, Rect(x1, y0, full_image_xsize_ - x1, y1 - y0));
+      }
+      if (gy + 1 == frame_dimensions_.ysize_groups &&
+          gx + 1 == frame_dimensions_.xsize_groups) {
+        RenderPadding(thread_id, Rect(x1, y1, full_image_xsize_ - x1,
+                                      full_image_ysize_ - y1));
+      }
+    }
+  }
+
+  Rect ready_rects[GroupBorderAssigner::kMaxToFinalize];
+  size_t num_ready_rects = 0;
+  group_border_assigner_.GroupDone(group_id, group_border_.first,
+                                   group_border_.second, ready_rects,
+                                   &num_ready_rects);
+  for (size_t i = 0; i < num_ready_rects; i++) {
+    const Rect& image_max_color_channel_rect = ready_rects[i];
+    for (size_t c = 0; c < input_data.size(); c++) {
+      LoadBorders(group_id, c, image_max_color_channel_rect, &input_data[c]);
+    }
+    Rect data_max_color_channel_rect(
+        group_data_x_border_ + image_max_color_channel_rect.x0() -
+            gx * frame_dimensions_.group_dim,
+        group_data_y_border_ + image_max_color_channel_rect.y0() -
+            gy * frame_dimensions_.group_dim,
+        image_max_color_channel_rect.xsize(),
+        image_max_color_channel_rect.ysize());
+    RenderRect(thread_id, input_data, data_max_color_channel_rect,
+               image_max_color_channel_rect);
+  }
+}
+}  // namespace jxl
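The rolling row buffers above depend on each kInOut stage buffer having a
power-of-two number of rows: Rows::GetBuffer indexes with y & ymod_minus_1
rather than a modulo, so row y aliases row y % ysize. A minimal standalone
sketch of that addressing scheme (RollingRows is illustrative only, not part
of the libjxl API):

  #include <cassert>
  #include <cstddef>
  #include <vector>

  // Minimal model of a stage output buffer: a rolling window of rows whose
  // row count is a power of two, so "y mod ysize" is a single bitmask.
  struct RollingRows {
    std::vector<float> data;
    size_t stride;     // floats per row
    int ymod_minus_1;  // ysize - 1, valid because ysize is a power of two

    RollingRows(size_t xsize, size_t ysize)
        : data(xsize * ysize), stride(xsize), ymod_minus_1(int(ysize) - 1) {
      assert((ysize & (ysize - 1)) == 0 && "ysize must be a power of two");
    }

    // Same addressing as Rows::GetBuffer: y & (ysize - 1) == y % ysize.
    float* Row(int y) { return data.data() + stride * (y & ymod_minus_1); }
  };

  int main() {
    RollingRows rows(/*xsize=*/8, /*ysize=*/4);
    // Row 4 reuses the storage of row 0: only the last 4 rows stay alive, so
    // consumers must read a row before later stages overwrite it.
    assert(rows.Row(4) == rows.Row(0));
    assert(rows.Row(7) == rows.Row(3));
    return 0;
  }

This is why PrepareForThreadsInternal rounds each stage buffer's ysize up with
CeilLog2Nonzero: the mask trick only works when ysize is a power of two.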