// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "lib/jxl/render_pipeline/low_memory_render_pipeline.h"

#include <algorithm>
#include <utility>

#include "lib/jxl/base/arch_macros.h"
#include "lib/jxl/image_ops.h"

namespace jxl {

std::pair<size_t, size_t>
LowMemoryRenderPipeline::ColorDimensionsToChannelDimensions(
    std::pair<size_t, size_t> in, size_t c, size_t stage) const {
  std::pair<size_t, size_t> ret;
  std::pair<size_t, size_t> shift = channel_shifts_[stage][c];
  ret.first =
      ((in.first << base_color_shift_) + (1 << shift.first) - 1) >> shift.first;
  ret.second = ((in.second << base_color_shift_) + (1 << shift.second) - 1) >>
               shift.second;
  return ret;
}

std::pair<size_t, size_t> LowMemoryRenderPipeline::BorderToStore(
    size_t c) const {
  auto ret = ColorDimensionsToChannelDimensions(group_border_, c, 0);
  ret.first += padding_[0][c].first;
  ret.second += padding_[0][c].second;
  return ret;
}

void LowMemoryRenderPipeline::SaveBorders(size_t group_id, size_t c,
                                          const ImageF& in) {
  size_t gy = group_id / frame_dimensions_.xsize_groups;
  size_t gx = group_id % frame_dimensions_.xsize_groups;
  size_t hshift = channel_shifts_[0][c].first;
  size_t vshift = channel_shifts_[0][c].second;
  size_t x0 = gx * GroupInputXSize(c);
  size_t x1 = std::min((gx + 1) * GroupInputXSize(c),
                       DivCeil(frame_dimensions_.xsize_upsampled, 1 << hshift));
  size_t y0 = gy * GroupInputYSize(c);
  size_t y1 = std::min((gy + 1) * GroupInputYSize(c),
                       DivCeil(frame_dimensions_.ysize_upsampled, 1 << vshift));

  auto borders = BorderToStore(c);
  size_t borderx_write = borders.first;
  size_t bordery_write = borders.second;

  if (gy > 0) {
    Rect from(group_data_x_border_, group_data_y_border_, x1 - x0,
              bordery_write);
    Rect to(x0, (gy * 2 - 1) * bordery_write, x1 - x0, bordery_write);
    CopyImageTo(from, in, to, &borders_horizontal_[c]);
  }
  if (gy + 1 < frame_dimensions_.ysize_groups) {
    Rect from(group_data_x_border_,
              group_data_y_border_ + y1 - y0 - bordery_write, x1 - x0,
              bordery_write);
    Rect to(x0, (gy * 2) * bordery_write, x1 - x0, bordery_write);
    CopyImageTo(from, in, to, &borders_horizontal_[c]);
  }
  if (gx > 0) {
    Rect from(group_data_x_border_, group_data_y_border_, borderx_write,
              y1 - y0);
    Rect to((gx * 2 - 1) * borderx_write, y0, borderx_write, y1 - y0);
    CopyImageTo(from, in, to, &borders_vertical_[c]);
  }
  if (gx + 1 < frame_dimensions_.xsize_groups) {
    Rect from(group_data_x_border_ + x1 - x0 - borderx_write,
              group_data_y_border_, borderx_write, y1 - y0);
    Rect to((gx * 2) * borderx_write, y0, borderx_write, y1 - y0);
    CopyImageTo(from, in, to, &borders_vertical_[c]);
  }
}
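
// Layout of the border storage (as deduced from SaveBorders/LoadBorders):
// for each channel c, borders_horizontal_[c] holds horizontal strips of
// bordery_write rows, two per group row: strip 2 * gy - 1 is the top strip
// of group row gy, strip 2 * gy is its bottom strip. The first group row
// stores no top strip and the last no bottom strip, giving
// (ysize_groups - 1) * 2 strips in total; borders_vertical_[c] mirrors this
// layout along x.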

void LowMemoryRenderPipeline::LoadBorders(size_t group_id, size_t c,
                                          const Rect& r, ImageF* out) {
  size_t gy = group_id / frame_dimensions_.xsize_groups;
  size_t gx = group_id % frame_dimensions_.xsize_groups;
  size_t hshift = channel_shifts_[0][c].first;
  size_t vshift = channel_shifts_[0][c].second;
  // Coordinates of the group in the image.
  size_t x0 = gx * GroupInputXSize(c);
  size_t x1 = std::min((gx + 1) * GroupInputXSize(c),
                       DivCeil(frame_dimensions_.xsize_upsampled, 1 << hshift));
  size_t y0 = gy * GroupInputYSize(c);
  size_t y1 = std::min((gy + 1) * GroupInputYSize(c),
                       DivCeil(frame_dimensions_.ysize_upsampled, 1 << vshift));

  size_t paddingx = padding_[0][c].first;
  size_t paddingy = padding_[0][c].second;

  auto borders = BorderToStore(c);
  size_t borderx_write = borders.first;
  size_t bordery_write = borders.second;

  // Limits of the area to copy from, in image coordinates.
  JXL_DASSERT(r.x0() == 0 || (r.x0() << base_color_shift_) >= paddingx);
  size_t x0src = DivCeil(r.x0() << base_color_shift_, 1 << hshift);
  if (x0src != 0) {
    x0src -= paddingx;
  }
  // r may be such that r.x1 (namely x0() + xsize()) is within paddingx of the
  // right side of the image, so we use min() here.
  size_t x1src =
      DivCeil((r.x0() + r.xsize()) << base_color_shift_, 1 << hshift);
  x1src = std::min(x1src + paddingx,
                   DivCeil(frame_dimensions_.xsize_upsampled, 1 << hshift));

  // Similar computation for y.
  JXL_DASSERT(r.y0() == 0 || (r.y0() << base_color_shift_) >= paddingy);
  size_t y0src = DivCeil(r.y0() << base_color_shift_, 1 << vshift);
  if (y0src != 0) {
    y0src -= paddingy;
  }
  size_t y1src =
      DivCeil((r.y0() + r.ysize()) << base_color_shift_, 1 << vshift);
  y1src = std::min(y1src + paddingy,
                   DivCeil(frame_dimensions_.ysize_upsampled, 1 << vshift));

  // Copy other groups' borders from the border storage.
  if (y0src < y0) {
    JXL_DASSERT(gy > 0);
    CopyImageTo(
        Rect(x0src, (gy * 2 - 2) * bordery_write, x1src - x0src,
             bordery_write),
        borders_horizontal_[c],
        Rect(group_data_x_border_ + x0src - x0,
             group_data_y_border_ - bordery_write, x1src - x0src,
             bordery_write),
        out);
  }
  if (y1src > y1) {
    // When copying the bottom border we must not be on the bottom groups.
    JXL_DASSERT(gy + 1 < frame_dimensions_.ysize_groups);
    CopyImageTo(
        Rect(x0src, (gy * 2 + 1) * bordery_write, x1src - x0src,
             bordery_write),
        borders_horizontal_[c],
        Rect(group_data_x_border_ + x0src - x0, group_data_y_border_ + y1 - y0,
             x1src - x0src, bordery_write),
        out);
  }
  if (x0src < x0) {
    JXL_DASSERT(gx > 0);
    CopyImageTo(
        Rect((gx * 2 - 2) * borderx_write, y0src, borderx_write,
             y1src - y0src),
        borders_vertical_[c],
        Rect(group_data_x_border_ - borderx_write,
             group_data_y_border_ + y0src - y0, borderx_write, y1src - y0src),
        out);
  }
  if (x1src > x1) {
    // When copying the right border we must not be on the rightmost groups.
    JXL_DASSERT(gx + 1 < frame_dimensions_.xsize_groups);
    CopyImageTo(
        Rect((gx * 2 + 1) * borderx_write, y0src, borderx_write,
             y1src - y0src),
        borders_vertical_[c],
        Rect(group_data_x_border_ + x1 - x0, group_data_y_border_ + y0src - y0,
             borderx_write, y1src - y0src),
        out);
  }
}
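
// Worked example for the source bounds in LoadBorders (hypothetical
// numbers): with base_color_shift_ == 0, hshift == 1 and paddingx == 2, a
// rect with r.x0() == 256 gives x0src = DivCeil(256, 2) - 2 = 126, i.e. the
// copy starts paddingx pixels before the downsampled rect, so stages with
// horizontal borders see valid neighboring pixels.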

size_t LowMemoryRenderPipeline::GroupInputXSize(size_t c) const {
  return (frame_dimensions_.group_dim << base_color_shift_) >>
         channel_shifts_[0][c].first;
}

size_t LowMemoryRenderPipeline::GroupInputYSize(size_t c) const {
  return (frame_dimensions_.group_dim << base_color_shift_) >>
         channel_shifts_[0][c].second;
}

void LowMemoryRenderPipeline::EnsureBordersStorage() {
  const auto& shifts = channel_shifts_[0];
  if (borders_horizontal_.size() < shifts.size()) {
    borders_horizontal_.resize(shifts.size());
    borders_vertical_.resize(shifts.size());
  }
  for (size_t c = 0; c < shifts.size(); c++) {
    auto borders = BorderToStore(c);
    size_t borderx = borders.first;
    size_t bordery = borders.second;
    JXL_DASSERT(frame_dimensions_.xsize_groups > 0);
    size_t num_xborders = (frame_dimensions_.xsize_groups - 1) * 2;
    JXL_DASSERT(frame_dimensions_.ysize_groups > 0);
    size_t num_yborders = (frame_dimensions_.ysize_groups - 1) * 2;
    size_t downsampled_xsize =
        DivCeil(frame_dimensions_.xsize_upsampled_padded, 1 << shifts[c].first);
    size_t downsampled_ysize = DivCeil(
        frame_dimensions_.ysize_upsampled_padded, 1 << shifts[c].second);
    Rect horizontal = Rect(0, 0, downsampled_xsize, bordery * num_yborders);
    if (!SameSize(horizontal, borders_horizontal_[c])) {
      borders_horizontal_[c] = ImageF(horizontal.xsize(), horizontal.ysize());
    }
    Rect vertical = Rect(0, 0, borderx * num_xborders, downsampled_ysize);
    if (!SameSize(vertical, borders_vertical_[c])) {
      borders_vertical_[c] = ImageF(vertical.xsize(), vertical.ysize());
    }
  }
}

void LowMemoryRenderPipeline::Init() {
  group_border_ = {0, 0};
  base_color_shift_ = CeilLog2Nonzero(
      frame_dimensions_.xsize_upsampled_padded / frame_dimensions_.xsize_padded);

  const auto& shifts = channel_shifts_[0];

  // Ensure that each channel has enough border pixels.
  for (size_t c = 0; c < shifts.size(); c++) {
    group_border_.first =
        std::max(group_border_.first,
                 DivCeil(padding_[0][c].first << channel_shifts_[0][c].first,
                         1 << base_color_shift_));
    group_border_.second =
        std::max(group_border_.second,
                 DivCeil(padding_[0][c].second << channel_shifts_[0][c].second,
                         1 << base_color_shift_));
  }

  // Ensure that all channels have an integer number of border pixels in the
  // input.
  for (size_t c = 0; c < shifts.size(); c++) {
    if (channel_shifts_[0][c].first >= base_color_shift_) {
      group_border_.first =
          RoundUpTo(group_border_.first,
                    1 << (channel_shifts_[0][c].first - base_color_shift_));
    }
    if (channel_shifts_[0][c].second >= base_color_shift_) {
      group_border_.second =
          RoundUpTo(group_border_.second,
                    1 << (channel_shifts_[0][c].second - base_color_shift_));
    }
  }
  // Ensure that the X border on color channels is a multiple of kBlockDim or
  // the vector size (required for EPF stages). Vectors on ARM NEON are never
  // wider than 4 floats, so rounding to multiples of 4 is enough.
#if JXL_ARCH_ARM
  constexpr size_t kGroupXAlign = 4;
#else
  constexpr size_t kGroupXAlign = 16;
#endif
  group_border_.first = RoundUpTo(group_border_.first, kGroupXAlign);
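  // Worked example (hypothetical numbers): if some channel needs
  // padding_[0][c].first == 2 border pixels at channel shift 0 while
  // base_color_shift_ == 1, the first loop above yields
  // DivCeil(2 << 0, 1 << 1) = 1 border pixel in color coordinates, and the
  // kGroupXAlign rounding then brings group_border_.first up to 16
  // (4 on ARM).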
  // Allocate borders in group images that are just enough for storing the
  // borders to be copied in, plus any rounding to ensure alignment.
  std::pair<size_t, size_t> max_border = {0, 0};
  for (size_t c = 0; c < shifts.size(); c++) {
    max_border.first = std::max(BorderToStore(c).first, max_border.first);
    max_border.second = std::max(BorderToStore(c).second, max_border.second);
  }
  group_data_x_border_ = RoundUpTo(max_border.first, kGroupXAlign);
  group_data_y_border_ = max_border.second;

  EnsureBordersStorage();
  group_border_assigner_.Init(frame_dimensions_);

  for (first_trailing_stage_ = stages_.size(); first_trailing_stage_ > 0;
       first_trailing_stage_--) {
    bool has_inout_c = false;
    for (size_t c = 0; c < shifts.size(); c++) {
      if (stages_[first_trailing_stage_ - 1]->GetChannelMode(c) ==
          RenderPipelineChannelMode::kInOut) {
        has_inout_c = true;
      }
    }
    if (has_inout_c) {
      break;
    }
  }

  first_image_dim_stage_ = stages_.size();
  for (size_t i = 0; i < stages_.size(); i++) {
    std::vector<std::pair<size_t, size_t>> input_sizes(shifts.size());
    for (size_t c = 0; c < shifts.size(); c++) {
      input_sizes[c] =
          std::make_pair(DivCeil(frame_dimensions_.xsize_upsampled,
                                 1 << channel_shifts_[i][c].first),
                         DivCeil(frame_dimensions_.ysize_upsampled,
                                 1 << channel_shifts_[i][c].second));
    }
    stages_[i]->SetInputSizes(input_sizes);
    if (stages_[i]->SwitchToImageDimensions()) {
      // We don't allow kInOut after switching to image dimensions.
      JXL_ASSERT(i >= first_trailing_stage_);
      first_image_dim_stage_ = i + 1;
      stages_[i]->GetImageDimensions(&full_image_xsize_, &full_image_ysize_,
                                     &frame_origin_);
      break;
    }
  }
  for (size_t i = first_image_dim_stage_; i < stages_.size(); i++) {
    if (stages_[i]->SwitchToImageDimensions()) {
      JXL_ABORT("Cannot switch to image dimensions multiple times");
    }
    std::vector<std::pair<size_t, size_t>> input_sizes(shifts.size());
    for (size_t c = 0; c < shifts.size(); c++) {
      input_sizes[c] = {full_image_xsize_, full_image_ysize_};
    }
    stages_[i]->SetInputSizes(input_sizes);
  }

  anyc_.resize(stages_.size());
  for (size_t i = 0; i < stages_.size(); i++) {
    for (size_t c = 0; c < shifts.size(); c++) {
      if (stages_[i]->GetChannelMode(c) !=
          RenderPipelineChannelMode::kIgnored) {
        anyc_[i] = c;
      }
    }
  }

  stage_input_for_channel_ = std::vector<std::vector<int32_t>>(
      stages_.size(), std::vector<int32_t>(shifts.size()));
  for (size_t c = 0; c < shifts.size(); c++) {
    int input = -1;
    for (size_t i = 0; i < stages_.size(); i++) {
      stage_input_for_channel_[i][c] = input;
      if (stages_[i]->GetChannelMode(c) == RenderPipelineChannelMode::kInOut) {
        input = i;
      }
    }
  }

  image_rect_.resize(stages_.size());
  for (size_t i = 0; i < stages_.size(); i++) {
    size_t x1 = DivCeil(frame_dimensions_.xsize_upsampled,
                        1 << channel_shifts_[i][anyc_[i]].first);
    size_t y1 = DivCeil(frame_dimensions_.ysize_upsampled,
                        1 << channel_shifts_[i][anyc_[i]].second);
    image_rect_[i] = Rect(0, 0, x1, y1);
  }

  virtual_ypadding_for_output_.resize(stages_.size());
  xpadding_for_output_.resize(stages_.size());
  for (size_t c = 0; c < shifts.size(); c++) {
    int ypad = 0;
    int xpad = 0;
    for (size_t i = stages_.size(); i-- > 0;) {
      if (stages_[i]->GetChannelMode(c) !=
          RenderPipelineChannelMode::kIgnored) {
        virtual_ypadding_for_output_[i] =
            std::max(ypad, virtual_ypadding_for_output_[i]);
        xpadding_for_output_[i] = std::max(xpad, xpadding_for_output_[i]);
      }
      if (stages_[i]->GetChannelMode(c) == RenderPipelineChannelMode::kInOut) {
        ypad = (DivCeil(ypad, 1 << channel_shifts_[i][c].second) +
                stages_[i]->settings_.border_y)
               << channel_shifts_[i][c].second;
        xpad = DivCeil(xpad, 1 << stages_[i]->settings_.shift_x) +
               stages_[i]->settings_.border_x;
      }
    }
  }
}
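
// Note on the padding propagation at the end of Init(): walking the stages
// backwards, a kInOut stage with border_y b and channel shift
// s = channel_shifts_[i][c].second turns a downstream requirement of ypad
// virtual rows into (DivCeil(ypad, 1 << s) + b) << s rows of its own input.
// For example (hypothetical numbers), s == 1 and b == 1 with ypad == 4
// gives (DivCeil(4, 2) + 1) << 1 = 6.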

void LowMemoryRenderPipeline::PrepareForThreadsInternal(size_t num,
                                                        bool use_group_ids) {
  const auto& shifts = channel_shifts_[0];

  use_group_ids_ = use_group_ids;
  size_t num_buffers = use_group_ids_ ? frame_dimensions_.num_groups : num;
  for (size_t t = group_data_.size(); t < num_buffers; t++) {
    group_data_.emplace_back();
    group_data_[t].resize(shifts.size());
    for (size_t c = 0; c < shifts.size(); c++) {
      group_data_[t][c] = ImageF(GroupInputXSize(c) + group_data_x_border_ * 2,
                                 GroupInputYSize(c) + group_data_y_border_ * 2);
    }
  }
  // TODO(veluca): avoid reallocating buffers if not needed.
  stage_data_.resize(num);
  size_t upsampling = 1u << base_color_shift_;
  size_t group_dim = frame_dimensions_.group_dim * upsampling;
  size_t padding =
      2 * group_data_x_border_ * upsampling +  // maximum size of a rect
      2 * kRenderPipelineXOffset;              // extra padding for processing
  size_t stage_buffer_xsize = group_dim + padding;
  for (size_t t = 0; t < num; t++) {
    stage_data_[t].resize(shifts.size());
    for (size_t c = 0; c < shifts.size(); c++) {
      stage_data_[t][c].resize(stages_.size());
      size_t next_y_border = 0;
      for (size_t i = stages_.size(); i-- > 0;) {
        if (stages_[i]->GetChannelMode(c) ==
            RenderPipelineChannelMode::kInOut) {
          size_t stage_buffer_ysize =
              2 * next_y_border + (1 << stages_[i]->settings_.shift_y);
          stage_buffer_ysize = 1 << CeilLog2Nonzero(stage_buffer_ysize);
          next_y_border = stages_[i]->settings_.border_y;
          stage_data_[t][c][i] = ImageF(stage_buffer_xsize, stage_buffer_ysize);
        }
      }
    }
  }
  if (first_image_dim_stage_ != stages_.size()) {
    RectT<ssize_t> image_rect(0, 0, frame_dimensions_.xsize_upsampled,
                              frame_dimensions_.ysize_upsampled);
    RectT<ssize_t> full_image_rect(0, 0, full_image_xsize_, full_image_ysize_);
    image_rect = image_rect.Translate(frame_origin_.x0, frame_origin_.y0);
    image_rect = image_rect.Intersection(full_image_rect);
    if (image_rect.xsize() == 0 || image_rect.ysize() == 0) {
      image_rect = RectT<ssize_t>(0, 0, 0, 0);
    }
    size_t left_padding = image_rect.x0();
    size_t middle_padding = group_dim;
    size_t right_padding = full_image_xsize_ - image_rect.x1();
    size_t out_of_frame_xsize =
        padding +
        std::max(left_padding, std::max(middle_padding, right_padding));
    out_of_frame_data_.resize(num);
    for (size_t t = 0; t < num; t++) {
      out_of_frame_data_[t] = ImageF(out_of_frame_xsize, shifts.size());
    }
  }
}

std::vector<std::pair<ImageF*, Rect>> LowMemoryRenderPipeline::PrepareBuffers(
    size_t group_id, size_t thread_id) {
  std::vector<std::pair<ImageF*, Rect>> ret(channel_shifts_[0].size());
  const size_t gx = group_id % frame_dimensions_.xsize_groups;
  const size_t gy = group_id / frame_dimensions_.xsize_groups;
  for (size_t c = 0; c < channel_shifts_[0].size(); c++) {
    ret[c].first = &group_data_[use_group_ids_ ? group_id : thread_id][c];
    ret[c].second = Rect(group_data_x_border_, group_data_y_border_,
                         GroupInputXSize(c), GroupInputYSize(c),
                         DivCeil(frame_dimensions_.xsize_upsampled,
                                 1 << channel_shifts_[0][c].first) -
                             gx * GroupInputXSize(c) + group_data_x_border_,
                         DivCeil(frame_dimensions_.ysize_upsampled,
                                 1 << channel_shifts_[0][c].second) -
                             gy * GroupInputYSize(c) + group_data_y_border_);
  }
  return ret;
}

namespace {

JXL_INLINE int GetMirroredY(int y, ssize_t group_y0, ssize_t image_ysize) {
  if (group_y0 == 0 && (y < 0 || y + group_y0 >= image_ysize)) {
    return Mirror(y, image_ysize);
  }
  if (y + group_y0 >= image_ysize) {
    // Here we know that one mirroring step is sufficient.
    return 2 * image_ysize - (y + group_y0) - 1 - group_y0;
  }
  return y;
}
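
// Mirroring illustration: Mirror(-1, size) == 0 and
// Mirror(size, size) == size - 1, i.e. reflection does not repeat the edge
// pixel. GetMirroredY falls back to Mirror() only when several reflections
// may be needed (a group starting at the image top whose border is taller
// than the image); otherwise the single closed-form reflection above
// suffices.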

JXL_INLINE void ApplyXMirroring(float* row, ssize_t borderx, ssize_t group_x0,
                                ssize_t group_xsize, ssize_t image_xsize) {
  if (image_xsize <= borderx) {
    if (group_x0 == 0) {
      for (ssize_t ix = 0; ix < borderx; ix++) {
        row[kRenderPipelineXOffset - ix - 1] =
            row[kRenderPipelineXOffset + Mirror(-ix - 1, image_xsize)];
      }
    }
    if (group_xsize + borderx + group_x0 >= image_xsize) {
      for (ssize_t ix = 0; ix < borderx; ix++) {
        row[kRenderPipelineXOffset + image_xsize + ix - group_x0] =
            row[kRenderPipelineXOffset + Mirror(image_xsize + ix, image_xsize) -
                group_x0];
      }
    }
  } else {
    // Here we know that one mirroring step is sufficient.
    if (group_x0 == 0) {
      for (ssize_t ix = 0; ix < borderx; ix++) {
        row[kRenderPipelineXOffset - ix - 1] = row[kRenderPipelineXOffset + ix];
      }
    }
    if (group_xsize + borderx + group_x0 >= image_xsize) {
      for (ssize_t ix = 0; ix < borderx; ix++) {
        row[kRenderPipelineXOffset + image_xsize - group_x0 + ix] =
            row[kRenderPipelineXOffset + image_xsize - group_x0 - ix - 1];
      }
    }
  }
}

// Information about where the *output* of each stage is stored.
class Rows {
 public:
  Rows(const std::vector<std::unique_ptr<RenderPipelineStage>>& stages,
       const Rect data_max_color_channel_rect, int group_data_x_border,
       int group_data_y_border,
       const std::vector<std::pair<size_t, size_t>>& group_data_shift,
       size_t base_color_shift, std::vector<std::vector<ImageF>>& thread_data,
       std::vector<ImageF>& input_data) {
    size_t num_stages = stages.size();
    size_t num_channels = input_data.size();

    JXL_ASSERT(thread_data.size() == num_channels);
    JXL_ASSERT(group_data_shift.size() == num_channels);

#if JXL_ENABLE_ASSERT
    for (const auto& td : thread_data) {
      JXL_ASSERT(td.size() == num_stages);
    }
#endif

    rows_.resize(num_stages + 1, std::vector<RowInfo>(num_channels));

    for (size_t i = 0; i < num_stages; i++) {
      for (size_t c = 0; c < input_data.size(); c++) {
        if (stages[i]->GetChannelMode(c) ==
            RenderPipelineChannelMode::kInOut) {
          rows_[i + 1][c].ymod_minus_1 = thread_data[c][i].ysize() - 1;
          rows_[i + 1][c].base_ptr = thread_data[c][i].Row(0);
          rows_[i + 1][c].stride = thread_data[c][i].PixelsPerRow();
        }
      }
    }

    for (size_t c = 0; c < input_data.size(); c++) {
      auto channel_group_data_rect =
          data_max_color_channel_rect.As<ssize_t>()
              .Translate(-group_data_x_border, -group_data_y_border)
              .ShiftLeft(base_color_shift)
              .CeilShiftRight(group_data_shift[c])
              .Translate(group_data_x_border - ssize_t(kRenderPipelineXOffset),
                         group_data_y_border);
      rows_[0][c].base_ptr = channel_group_data_rect.Row(&input_data[c], 0);
      rows_[0][c].stride = input_data[c].PixelsPerRow();
      rows_[0][c].ymod_minus_1 = -1;
    }
  }

  // Stage -1 refers to the input data; all other values must be nonnegative
  // and refer to the data for the output of that stage.
  JXL_INLINE float* GetBuffer(int stage, int y, size_t c) const {
    JXL_DASSERT(stage >= -1);
    const RowInfo& info = rows_[stage + 1][c];
    return info.base_ptr + ssize_t(info.stride) * (y & info.ymod_minus_1);
  }

 private:
  struct RowInfo {
    // Pointer to beginning of the first row.
    float* base_ptr;
    // Modulo value for the y axis minus 1 (ymod is guaranteed to be a power
    // of 2, which allows efficient mod computation by masking).
    int ymod_minus_1;
    // Number of floats per row.
    size_t stride;
  };
  std::vector<std::vector<RowInfo>> rows_;
};

}  // namespace
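
// Note on Rows::GetBuffer: each kInOut stage output lives in a ring buffer
// whose height is a power of two (see PrepareForThreadsInternal), so
// `y & ymod_minus_1` computes y modulo the buffer height without a
// division. The pipeline input (stage -1) sets ymod_minus_1 to -1, an
// all-ones mask, so its rows are addressed directly.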

void LowMemoryRenderPipeline::RenderRect(size_t thread_id,
                                         std::vector<ImageF>& input_data,
                                         Rect data_max_color_channel_rect,
                                         Rect image_max_color_channel_rect) {
  // For each stage, the rect corresponding to the image area currently being
  // processed, in the coordinates of that stage (i.e. with the scaling factor
  // that that stage has).
  std::vector<Rect> group_rect;
  group_rect.resize(stages_.size());
  Rect image_area_rect =
      image_max_color_channel_rect.ShiftLeft(base_color_shift_)
          .Crop(frame_dimensions_.xsize_upsampled,
                frame_dimensions_.ysize_upsampled);
  for (size_t i = 0; i < stages_.size(); i++) {
    group_rect[i] =
        image_area_rect.CeilShiftRight(channel_shifts_[i][anyc_[i]]);
  }

  ssize_t frame_x0 =
      first_image_dim_stage_ == stages_.size() ? 0 : frame_origin_.x0;
  ssize_t frame_y0 =
      first_image_dim_stage_ == stages_.size() ? 0 : frame_origin_.y0;
  size_t full_image_xsize = first_image_dim_stage_ == stages_.size()
                                ? frame_dimensions_.xsize_upsampled
                                : full_image_xsize_;
  size_t full_image_ysize = first_image_dim_stage_ == stages_.size()
                                ? frame_dimensions_.ysize_upsampled
                                : full_image_ysize_;

  // Compute actual x-axis bounds for the current image area in the context of
  // the full image this frame is part of. As the left boundary may be
  // negative, we also create the x_pixels_skip value, defined as follows:
  // - both x_pixels_skip and full_image_x0 are >= 0, and at least one is 0;
  // - full_image_x0 - x_pixels_skip is the position of the current frame area
  //   in the full image.
  ssize_t full_image_x0 = frame_x0 + image_area_rect.x0();
  ssize_t x_pixels_skip = 0;
  if (full_image_x0 < 0) {
    x_pixels_skip = -full_image_x0;
    full_image_x0 = 0;
  }
  ssize_t full_image_x1 = frame_x0 + image_area_rect.x1();
  full_image_x1 = std::min<ssize_t>(full_image_x1, full_image_xsize);
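  // Worked example (hypothetical numbers): a frame placed at
  // frame_origin_.x0 == -5 with image_area_rect.x0() == 0 yields
  // full_image_x0 == 0 and x_pixels_skip == 5: the leftmost 5 pixels of each
  // rendered row fall outside the visible image, so the trailing stages
  // below start reading 5 pixels into the row.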
  // If the current image area is entirely outside of the visible image, there
  // is no point in proceeding. Note: this uses the assumption that if there
  // is a stage with observable effects (i.e. a kInput stage), it only appears
  // after the stage that switches to image dimensions.
  if (full_image_x1 <= full_image_x0) return;

  // Data structures to hold information about input/output rows and their
  // buffers.
  Rows rows(stages_, data_max_color_channel_rect, group_data_x_border_,
            group_data_y_border_, channel_shifts_[0], base_color_shift_,
            stage_data_[thread_id], input_data);

  std::vector<RenderPipelineStage::RowInfo> input_rows(first_trailing_stage_ +
                                                       1);
  for (size_t i = 0; i < first_trailing_stage_; i++) {
    input_rows[i].resize(input_data.size());
  }
  input_rows[first_trailing_stage_].resize(input_data.size(),
                                           std::vector<float*>(1));

  // Maximum possible shift is 3.
  RenderPipelineStage::RowInfo output_rows(input_data.size(),
                                           std::vector<float*>(8));

  // Fills in input_rows and output_rows for a given y value (relative to the
  // start of the group, measured in actual pixels at the appropriate vertical
  // scaling factor) and a given stage, applying mirroring if necessary. This
  // function is somewhat inefficient for trailing kInOut or kInput stages,
  // where just filling the input row once ought to be sufficient.
  auto prepare_io_rows = [&](int y, size_t i) {
    ssize_t bordery = stages_[i]->settings_.border_y;
    size_t shifty = stages_[i]->settings_.shift_y;
    auto make_row = [&](size_t c, ssize_t iy) {
      size_t mirrored_y = GetMirroredY(y + iy - bordery, group_rect[i].y0(),
                                       image_rect_[i].ysize());
      input_rows[i][c][iy] =
          rows.GetBuffer(stage_input_for_channel_[i][c], mirrored_y, c);
      ApplyXMirroring(input_rows[i][c][iy], stages_[i]->settings_.border_x,
                      group_rect[i].x0(), group_rect[i].xsize(),
                      image_rect_[i].xsize());
    };
    for (size_t c = 0; c < input_data.size(); c++) {
      RenderPipelineChannelMode mode = stages_[i]->GetChannelMode(c);
      if (mode == RenderPipelineChannelMode::kIgnored) {
        continue;
      }
      // If we already have rows from a previous iteration, we can just shift
      // the rows by 1 and insert the new one.
      if (input_rows[i][c].size() == 2 * size_t(bordery) + 1) {
        for (ssize_t iy = 0; iy < 2 * bordery; iy++) {
          input_rows[i][c][iy] = input_rows[i][c][iy + 1];
        }
        make_row(c, bordery * 2);
      } else {
        input_rows[i][c].resize(2 * bordery + 1);
        for (ssize_t iy = 0; iy < 2 * bordery + 1; iy++) {
          make_row(c, iy);
        }
      }

      // If necessary, get the output buffers.
      if (mode == RenderPipelineChannelMode::kInOut) {
        for (size_t iy = 0; iy < (1u << shifty); iy++) {
          output_rows[c][iy] = rows.GetBuffer(i, y * (1 << shifty) + iy, c);
        }
      }
    }
  };

  // We pretend that every stage has a vertical shift of 0, i.e. it is as tall
  // as the final image.
  // We call each such row a "virtual" row, because it may or may not
  // correspond to an actual row of the current processing stage; actual
  // processing happens when vy % (1 << vshift) == 0.

  int num_extra_rows = *std::max_element(virtual_ypadding_for_output_.begin(),
                                         virtual_ypadding_for_output_.end());

  for (int vy = -num_extra_rows;
       vy < int(image_area_rect.ysize()) + num_extra_rows; vy++) {
    for (size_t i = 0; i < first_trailing_stage_; i++) {
      int stage_vy = vy - num_extra_rows + virtual_ypadding_for_output_[i];

      if (stage_vy % (1 << channel_shifts_[i][anyc_[i]].second) != 0) {
        continue;
      }

      if (stage_vy < -virtual_ypadding_for_output_[i]) {
        continue;
      }

      int y = stage_vy >> channel_shifts_[i][anyc_[i]].second;

      ssize_t image_y = ssize_t(group_rect[i].y0()) + y;
      // Do not produce rows in out-of-bounds areas.
      if (image_y < 0 || image_y >= ssize_t(image_rect_[i].ysize())) {
        continue;
      }

      // Get the input/output rows and potentially apply mirroring to the
      // input.
      prepare_io_rows(y, i);

      // Produce output rows.
      stages_[i]->ProcessRow(input_rows[i], output_rows,
                             xpadding_for_output_[i], group_rect[i].xsize(),
                             group_rect[i].x0(), image_y, thread_id);
    }

    // Process trailing stages, i.e. the final set of non-kInOut stages; they
    // all have the same input buffer and no need to use any mirroring.
    int y = vy - num_extra_rows;
    for (size_t c = 0; c < input_data.size(); c++) {
      // Skip pixels that are not part of the actual final image area.
      input_rows[first_trailing_stage_][c][0] =
          rows.GetBuffer(stage_input_for_channel_[first_trailing_stage_][c], y,
                         c) +
          x_pixels_skip;
    }

    // Check that we are not outside of the bounds for the current rendering
    // rect. Not doing so might result in overwriting some rows that have been
    // written (or will be written) by other threads.
    if (y < 0 || y >= ssize_t(image_area_rect.ysize())) {
      continue;
    }

    // Avoid running pipeline stages on pixels that are outside the full image
    // area. As trailing stages have no borders, this is a free optimization
    // (and may be necessary for correctness, as some stages assume
    // coordinates are within bounds).
    ssize_t full_image_y = frame_y0 + image_area_rect.y0() + y;
    if (full_image_y < 0 || full_image_y >= ssize_t(full_image_ysize)) {
      continue;
    }

    for (size_t i = first_trailing_stage_; i < stages_.size(); i++) {
      // Before the first_image_dim_stage_, coordinates are relative to the
      // current frame.
      size_t x0 =
          i < first_image_dim_stage_ ? full_image_x0 - frame_x0 : full_image_x0;
      size_t y =
          i < first_image_dim_stage_ ? full_image_y - frame_y0 : full_image_y;
      stages_[i]->ProcessRow(input_rows[first_trailing_stage_], output_rows,
                             /*xextra=*/0, full_image_x1 - full_image_x0, x0, y,
                             thread_id);
    }
  }
}
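
// RenderPadding (below) feeds the trailing stages for areas of the full
// image that no group of this frame covers: the stage that switched to
// image dimensions produces one row of padding values in the
// out_of_frame_data_ scratch rows via ProcessPaddingRow, and the remaining
// stages then consume it as a regular row.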
void LowMemoryRenderPipeline::RenderPadding(size_t thread_id, Rect rect) {
  if (rect.xsize() == 0) return;
  size_t numc = channel_shifts_[0].size();
  RenderPipelineStage::RowInfo input_rows(numc, std::vector<float*>(1));
  RenderPipelineStage::RowInfo output_rows;

  for (size_t c = 0; c < numc; c++) {
    input_rows[c][0] = out_of_frame_data_[thread_id].Row(c);
  }

  for (size_t y = 0; y < rect.ysize(); y++) {
    stages_[first_image_dim_stage_ - 1]->ProcessPaddingRow(
        input_rows, rect.xsize(), rect.x0(), rect.y0() + y);
    for (size_t i = first_image_dim_stage_; i < stages_.size(); i++) {
      stages_[i]->ProcessRow(input_rows, output_rows, /*xextra=*/0,
                             rect.xsize(), rect.x0(), rect.y0() + y,
                             thread_id);
    }
  }
}
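
// ProcessBuffers (below) is the per-group entry point: it saves this
// group's borders to the shared border storage, renders any out-of-frame
// padding around the group, and then renders every rect that
// GroupBorderAssigner reports as finalizable, i.e. rects whose neighboring
// groups' borders are all available.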
void LowMemoryRenderPipeline::ProcessBuffers(size_t group_id,
                                             size_t thread_id) {
  std::vector<ImageF>& input_data =
      group_data_[use_group_ids_ ? group_id : thread_id];

  // Copy the group borders to the border storage.
  for (size_t c = 0; c < input_data.size(); c++) {
    SaveBorders(group_id, c, input_data[c]);
  }

  size_t gy = group_id / frame_dimensions_.xsize_groups;
  size_t gx = group_id % frame_dimensions_.xsize_groups;

  if (first_image_dim_stage_ != stages_.size()) {
    size_t group_dim = frame_dimensions_.group_dim << base_color_shift_;
    RectT<ssize_t> group_rect(gx * group_dim, gy * group_dim, group_dim,
                              group_dim);
    RectT<ssize_t> image_rect(0, 0, frame_dimensions_.xsize_upsampled,
                              frame_dimensions_.ysize_upsampled);
    RectT<ssize_t> full_image_rect(0, 0, full_image_xsize_, full_image_ysize_);
    group_rect = group_rect.Translate(frame_origin_.x0, frame_origin_.y0);
    image_rect = image_rect.Translate(frame_origin_.x0, frame_origin_.y0);
    image_rect = image_rect.Intersection(full_image_rect);
    group_rect = group_rect.Intersection(image_rect);

    size_t x0 = group_rect.x0();
    size_t y0 = group_rect.y0();
    size_t x1 = group_rect.x1();
    size_t y1 = group_rect.y1();
    JXL_DEBUG_V(6,
                "Rendering padding for full image rect %s "
                "outside group rect %s",
                Description(full_image_rect).c_str(),
                Description(group_rect).c_str());

    if (group_id == 0 &&
        (image_rect.xsize() == 0 || image_rect.ysize() == 0)) {
      // If this frame does not intersect with the full image, we have to
      // initialize the whole image area with RenderPadding.
      RenderPadding(thread_id,
                    Rect(0, 0, full_image_xsize_, full_image_ysize_));
    }

    // Render padding for groups that intersect with the full image. The case
    // where no groups intersect was handled above.
    if (group_rect.xsize() > 0 && group_rect.ysize() > 0) {
      if (gx == 0 && gy == 0) {
        RenderPadding(thread_id, Rect(0, 0, x0, y0));
      }
      if (gy == 0) {
        RenderPadding(thread_id, Rect(x0, 0, x1 - x0, y0));
      }
      if (gx == 0) {
        RenderPadding(thread_id, Rect(0, y0, x0, y1 - y0));
      }
      if (gx == 0 && gy + 1 == frame_dimensions_.ysize_groups) {
        RenderPadding(thread_id, Rect(0, y1, x0, full_image_ysize_ - y1));
      }
      if (gy + 1 == frame_dimensions_.ysize_groups) {
        RenderPadding(thread_id,
                      Rect(x0, y1, x1 - x0, full_image_ysize_ - y1));
      }
      if (gy == 0 && gx + 1 == frame_dimensions_.xsize_groups) {
        RenderPadding(thread_id, Rect(x1, 0, full_image_xsize_ - x1, y0));
      }
      if (gx + 1 == frame_dimensions_.xsize_groups) {
        RenderPadding(thread_id,
                      Rect(x1, y0, full_image_xsize_ - x1, y1 - y0));
      }
      if (gy + 1 == frame_dimensions_.ysize_groups &&
          gx + 1 == frame_dimensions_.xsize_groups) {
        RenderPadding(thread_id, Rect(x1, y1, full_image_xsize_ - x1,
                                      full_image_ysize_ - y1));
      }
    }
  }

  Rect ready_rects[GroupBorderAssigner::kMaxToFinalize];
  size_t num_ready_rects = 0;
  group_border_assigner_.GroupDone(group_id, group_border_.first,
                                   group_border_.second, ready_rects,
                                   &num_ready_rects);
  for (size_t i = 0; i < num_ready_rects; i++) {
    const Rect& image_max_color_channel_rect = ready_rects[i];
    for (size_t c = 0; c < input_data.size(); c++) {
      LoadBorders(group_id, c, image_max_color_channel_rect, &input_data[c]);
    }
    Rect data_max_color_channel_rect(
        group_data_x_border_ + image_max_color_channel_rect.x0() -
            gx * frame_dimensions_.group_dim,
        group_data_y_border_ + image_max_color_channel_rect.y0() -
            gy * frame_dimensions_.group_dim,
        image_max_color_channel_rect.xsize(),
        image_max_color_channel_rect.ysize());
    RenderRect(thread_id, input_data, data_max_color_channel_rect,
               image_max_color_channel_rect);
  }
}

}  // namespace jxl