author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-19 00:47:55 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-19 00:47:55 +0000
commit     26a029d407be480d791972afb5975cf62c9360a6
tree       f435a8308119effd964b339f76abb83a57c29483  /third_party/jpeg-xl/lib/jxl/render_pipeline
parent     Initial commit.
Adding upstream version 124.0.1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/jpeg-xl/lib/jxl/render_pipeline')
41 files changed, 6108 insertions, 0 deletions
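The diff below adds the render pipeline sources themselves. As a rough orientation before the code, a caller builds a pipeline from stages and then feeds it one group of input at a time; the following minimal sketch mirrors the usage pattern in render_pipeline_test.cc further down. The test-only stage types (UpsampleXSlowStage, UpsampleYSlowStage, Check0FinalStage) and the helper function name are assumptions taken from that test file, not part of the public decoder API, and error handling is reduced to early returns.

// Minimal usage sketch, assuming the test-only stages from
// test_render_pipeline_stages.h; real decoding wires in the actual
// decoder stages instead of these.
#include "lib/jxl/base/common.h"
#include "lib/jxl/frame_dimensions.h"
#include "lib/jxl/image_ops.h"
#include "lib/jxl/render_pipeline/render_pipeline.h"
#include "lib/jxl/render_pipeline/test_render_pipeline_stages.h"

namespace jxl {
void RunPipelineSketch() {
  RenderPipeline::Builder builder(/*num_c=*/1);
  builder.AddStage(jxl::make_unique<UpsampleXSlowStage>());
  builder.AddStage(jxl::make_unique<UpsampleYSlowStage>());
  // The last stage must not have kInOut channels.
  builder.AddStage(jxl::make_unique<Check0FinalStage>());

  FrameDimensions frame_dimensions;
  frame_dimensions.Set(/*xsize=*/1024, /*ysize=*/1024, /*group_size_shift=*/0,
                       /*max_hshift=*/0, /*max_vshift=*/0,
                       /*modular_mode=*/false, /*upsampling=*/1);
  // Finalize() consumes the builder; without UseSimpleImplementation() it
  // returns the LowMemoryRenderPipeline added by this commit.
  std::unique_ptr<RenderPipeline> pipeline =
      std::move(builder).Finalize(frame_dimensions);
  if (!pipeline->PrepareForThreads(/*num=*/1, /*use_group_ids=*/false)) return;

  // Feed every group; Done() hands the filled buffers back to the pipeline,
  // which renders the group (plus any borders that became ready).
  for (size_t group = 0; group < frame_dimensions.num_groups; group++) {
    RenderPipelineInput input = pipeline->GetInputBuffers(group, /*thread_id=*/0);
    FillPlane(0.0f, input.GetBuffer(0).first, input.GetBuffer(0).second);
    input.Done();
  }
}
}  // namespace jxl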
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/low_memory_render_pipeline.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/low_memory_render_pipeline.cc new file mode 100644 index 0000000000..9aefdd007d --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/low_memory_render_pipeline.cc @@ -0,0 +1,864 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/render_pipeline/low_memory_render_pipeline.h" + +#include <algorithm> +#include <queue> +#include <tuple> + +#include "lib/jxl/base/arch_macros.h" +#include "lib/jxl/image_ops.h" + +namespace jxl { +std::pair<size_t, size_t> +LowMemoryRenderPipeline::ColorDimensionsToChannelDimensions( + std::pair<size_t, size_t> in, size_t c, size_t stage) const { + std::pair<size_t, size_t> ret; + std::pair<size_t, size_t> shift = channel_shifts_[stage][c]; + ret.first = + ((in.first << base_color_shift_) + (1 << shift.first) - 1) >> shift.first; + ret.second = ((in.second << base_color_shift_) + (1 << shift.second) - 1) >> + shift.second; + return ret; +} + +std::pair<size_t, size_t> LowMemoryRenderPipeline::BorderToStore( + size_t c) const { + auto ret = ColorDimensionsToChannelDimensions(group_border_, c, 0); + ret.first += padding_[0][c].first; + ret.second += padding_[0][c].second; + return ret; +} + +void LowMemoryRenderPipeline::SaveBorders(size_t group_id, size_t c, + const ImageF& in) { + size_t gy = group_id / frame_dimensions_.xsize_groups; + size_t gx = group_id % frame_dimensions_.xsize_groups; + size_t hshift = channel_shifts_[0][c].first; + size_t vshift = channel_shifts_[0][c].second; + size_t x0 = gx * GroupInputXSize(c); + size_t x1 = std::min((gx + 1) * GroupInputXSize(c), + DivCeil(frame_dimensions_.xsize_upsampled, 1 << hshift)); + size_t y0 = gy * GroupInputYSize(c); + size_t y1 = std::min((gy + 1) * GroupInputYSize(c), + DivCeil(frame_dimensions_.ysize_upsampled, 1 << vshift)); + + auto borders = BorderToStore(c); + size_t borderx_write = borders.first; + size_t bordery_write = borders.second; + + if (gy > 0) { + Rect from(group_data_x_border_, group_data_y_border_, x1 - x0, + bordery_write); + Rect to(x0, (gy * 2 - 1) * bordery_write, x1 - x0, bordery_write); + CopyImageTo(from, in, to, &borders_horizontal_[c]); + } + if (gy + 1 < frame_dimensions_.ysize_groups) { + Rect from(group_data_x_border_, + group_data_y_border_ + y1 - y0 - bordery_write, x1 - x0, + bordery_write); + Rect to(x0, (gy * 2) * bordery_write, x1 - x0, bordery_write); + CopyImageTo(from, in, to, &borders_horizontal_[c]); + } + if (gx > 0) { + Rect from(group_data_x_border_, group_data_y_border_, borderx_write, + y1 - y0); + Rect to((gx * 2 - 1) * borderx_write, y0, borderx_write, y1 - y0); + CopyImageTo(from, in, to, &borders_vertical_[c]); + } + if (gx + 1 < frame_dimensions_.xsize_groups) { + Rect from(group_data_x_border_ + x1 - x0 - borderx_write, + group_data_y_border_, borderx_write, y1 - y0); + Rect to((gx * 2) * borderx_write, y0, borderx_write, y1 - y0); + CopyImageTo(from, in, to, &borders_vertical_[c]); + } +} + +void LowMemoryRenderPipeline::LoadBorders(size_t group_id, size_t c, + const Rect& r, ImageF* out) { + size_t gy = group_id / frame_dimensions_.xsize_groups; + size_t gx = group_id % frame_dimensions_.xsize_groups; + size_t hshift = channel_shifts_[0][c].first; + size_t vshift = channel_shifts_[0][c].second; + // Coordinates of the group in the image. 
+ size_t x0 = gx * GroupInputXSize(c); + size_t x1 = std::min((gx + 1) * GroupInputXSize(c), + DivCeil(frame_dimensions_.xsize_upsampled, 1 << hshift)); + size_t y0 = gy * GroupInputYSize(c); + size_t y1 = std::min((gy + 1) * GroupInputYSize(c), + DivCeil(frame_dimensions_.ysize_upsampled, 1 << vshift)); + + size_t paddingx = padding_[0][c].first; + size_t paddingy = padding_[0][c].second; + + auto borders = BorderToStore(c); + size_t borderx_write = borders.first; + size_t bordery_write = borders.second; + + // Limits of the area to copy from, in image coordinates. + JXL_DASSERT(r.x0() == 0 || (r.x0() << base_color_shift_) >= paddingx); + size_t x0src = DivCeil(r.x0() << base_color_shift_, 1 << hshift); + if (x0src != 0) { + x0src -= paddingx; + } + // r may be such that r.x1 (namely x0() + xsize()) is within paddingx of the + // right side of the image, so we use min() here. + size_t x1src = + DivCeil((r.x0() + r.xsize()) << base_color_shift_, 1 << hshift); + x1src = std::min(x1src + paddingx, + DivCeil(frame_dimensions_.xsize_upsampled, 1 << hshift)); + + // Similar computation for y. + JXL_DASSERT(r.y0() == 0 || (r.y0() << base_color_shift_) >= paddingy); + size_t y0src = DivCeil(r.y0() << base_color_shift_, 1 << vshift); + if (y0src != 0) { + y0src -= paddingy; + } + size_t y1src = + DivCeil((r.y0() + r.ysize()) << base_color_shift_, 1 << vshift); + y1src = std::min(y1src + paddingy, + DivCeil(frame_dimensions_.ysize_upsampled, 1 << vshift)); + + // Copy other groups' borders from the border storage. + if (y0src < y0) { + JXL_DASSERT(gy > 0); + CopyImageTo( + Rect(x0src, (gy * 2 - 2) * bordery_write, x1src - x0src, bordery_write), + borders_horizontal_[c], + Rect(group_data_x_border_ + x0src - x0, + group_data_y_border_ - bordery_write, x1src - x0src, + bordery_write), + out); + } + if (y1src > y1) { + // When copying the bottom border we must not be on the bottom groups. + JXL_DASSERT(gy + 1 < frame_dimensions_.ysize_groups); + CopyImageTo( + Rect(x0src, (gy * 2 + 1) * bordery_write, x1src - x0src, bordery_write), + borders_horizontal_[c], + Rect(group_data_x_border_ + x0src - x0, group_data_y_border_ + y1 - y0, + x1src - x0src, bordery_write), + out); + } + if (x0src < x0) { + JXL_DASSERT(gx > 0); + CopyImageTo( + Rect((gx * 2 - 2) * borderx_write, y0src, borderx_write, y1src - y0src), + borders_vertical_[c], + Rect(group_data_x_border_ - borderx_write, + group_data_y_border_ + y0src - y0, borderx_write, y1src - y0src), + out); + } + if (x1src > x1) { + // When copying the right border we must not be on the rightmost groups. 
+ JXL_DASSERT(gx + 1 < frame_dimensions_.xsize_groups); + CopyImageTo( + Rect((gx * 2 + 1) * borderx_write, y0src, borderx_write, y1src - y0src), + borders_vertical_[c], + Rect(group_data_x_border_ + x1 - x0, group_data_y_border_ + y0src - y0, + borderx_write, y1src - y0src), + out); + } +} + +size_t LowMemoryRenderPipeline::GroupInputXSize(size_t c) const { + return (frame_dimensions_.group_dim << base_color_shift_) >> + channel_shifts_[0][c].first; +} + +size_t LowMemoryRenderPipeline::GroupInputYSize(size_t c) const { + return (frame_dimensions_.group_dim << base_color_shift_) >> + channel_shifts_[0][c].second; +} + +void LowMemoryRenderPipeline::EnsureBordersStorage() { + const auto& shifts = channel_shifts_[0]; + if (borders_horizontal_.size() < shifts.size()) { + borders_horizontal_.resize(shifts.size()); + borders_vertical_.resize(shifts.size()); + } + for (size_t c = 0; c < shifts.size(); c++) { + auto borders = BorderToStore(c); + size_t borderx = borders.first; + size_t bordery = borders.second; + JXL_DASSERT(frame_dimensions_.xsize_groups > 0); + size_t num_xborders = (frame_dimensions_.xsize_groups - 1) * 2; + JXL_DASSERT(frame_dimensions_.ysize_groups > 0); + size_t num_yborders = (frame_dimensions_.ysize_groups - 1) * 2; + size_t downsampled_xsize = + DivCeil(frame_dimensions_.xsize_upsampled_padded, 1 << shifts[c].first); + size_t downsampled_ysize = DivCeil(frame_dimensions_.ysize_upsampled_padded, + 1 << shifts[c].second); + Rect horizontal = Rect(0, 0, downsampled_xsize, bordery * num_yborders); + if (!SameSize(horizontal, borders_horizontal_[c])) { + borders_horizontal_[c] = ImageF(horizontal.xsize(), horizontal.ysize()); + } + Rect vertical = Rect(0, 0, borderx * num_xborders, downsampled_ysize); + if (!SameSize(vertical, borders_vertical_[c])) { + borders_vertical_[c] = ImageF(vertical.xsize(), vertical.ysize()); + } + } +} + +void LowMemoryRenderPipeline::Init() { + group_border_ = {0, 0}; + base_color_shift_ = CeilLog2Nonzero(frame_dimensions_.xsize_upsampled_padded / + frame_dimensions_.xsize_padded); + + const auto& shifts = channel_shifts_[0]; + + // Ensure that each channel has enough many border pixels. + for (size_t c = 0; c < shifts.size(); c++) { + group_border_.first = + std::max(group_border_.first, + DivCeil(padding_[0][c].first << channel_shifts_[0][c].first, + 1 << base_color_shift_)); + group_border_.second = + std::max(group_border_.second, + DivCeil(padding_[0][c].second << channel_shifts_[0][c].second, + 1 << base_color_shift_)); + } + + // Ensure that all channels have an integer number of border pixels in the + // input. + for (size_t c = 0; c < shifts.size(); c++) { + if (channel_shifts_[0][c].first >= base_color_shift_) { + group_border_.first = + RoundUpTo(group_border_.first, + 1 << (channel_shifts_[0][c].first - base_color_shift_)); + } + if (channel_shifts_[0][c].second >= base_color_shift_) { + group_border_.second = + RoundUpTo(group_border_.second, + 1 << (channel_shifts_[0][c].second - base_color_shift_)); + } + } + // Ensure that the X border on color channels is a multiple of kBlockDim or + // the vector size (required for EPF stages). Vectors on ARM NEON are never + // wider than 4 floats, so rounding to multiples of 4 is enough. 
+#if JXL_ARCH_ARM + constexpr size_t kGroupXAlign = 4; +#else + constexpr size_t kGroupXAlign = 16; +#endif + group_border_.first = RoundUpTo(group_border_.first, kGroupXAlign); + // Allocate borders in group images that are just enough for storing the + // borders to be copied in, plus any rounding to ensure alignment. + std::pair<size_t, size_t> max_border = {0, 0}; + for (size_t c = 0; c < shifts.size(); c++) { + max_border.first = std::max(BorderToStore(c).first, max_border.first); + max_border.second = std::max(BorderToStore(c).second, max_border.second); + } + group_data_x_border_ = RoundUpTo(max_border.first, kGroupXAlign); + group_data_y_border_ = max_border.second; + + EnsureBordersStorage(); + group_border_assigner_.Init(frame_dimensions_); + + for (first_trailing_stage_ = stages_.size(); first_trailing_stage_ > 0; + first_trailing_stage_--) { + bool has_inout_c = false; + for (size_t c = 0; c < shifts.size(); c++) { + if (stages_[first_trailing_stage_ - 1]->GetChannelMode(c) == + RenderPipelineChannelMode::kInOut) { + has_inout_c = true; + } + } + if (has_inout_c) { + break; + } + } + + first_image_dim_stage_ = stages_.size(); + for (size_t i = 0; i < stages_.size(); i++) { + std::vector<std::pair<size_t, size_t>> input_sizes(shifts.size()); + for (size_t c = 0; c < shifts.size(); c++) { + input_sizes[c] = + std::make_pair(DivCeil(frame_dimensions_.xsize_upsampled, + 1 << channel_shifts_[i][c].first), + DivCeil(frame_dimensions_.ysize_upsampled, + 1 << channel_shifts_[i][c].second)); + } + stages_[i]->SetInputSizes(input_sizes); + if (stages_[i]->SwitchToImageDimensions()) { + // We don't allow kInOut after switching to image dimensions. + JXL_ASSERT(i >= first_trailing_stage_); + first_image_dim_stage_ = i + 1; + stages_[i]->GetImageDimensions(&full_image_xsize_, &full_image_ysize_, + &frame_origin_); + break; + } + } + for (size_t i = first_image_dim_stage_; i < stages_.size(); i++) { + if (stages_[i]->SwitchToImageDimensions()) { + JXL_UNREACHABLE("Cannot switch to image dimensions multiple times"); + } + std::vector<std::pair<size_t, size_t>> input_sizes(shifts.size()); + for (size_t c = 0; c < shifts.size(); c++) { + input_sizes[c] = {full_image_xsize_, full_image_ysize_}; + } + stages_[i]->SetInputSizes(input_sizes); + } + + anyc_.resize(stages_.size()); + for (size_t i = 0; i < stages_.size(); i++) { + for (size_t c = 0; c < shifts.size(); c++) { + if (stages_[i]->GetChannelMode(c) != + RenderPipelineChannelMode::kIgnored) { + anyc_[i] = c; + } + } + } + + stage_input_for_channel_ = std::vector<std::vector<int32_t>>( + stages_.size(), std::vector<int32_t>(shifts.size())); + for (size_t c = 0; c < shifts.size(); c++) { + int input = -1; + for (size_t i = 0; i < stages_.size(); i++) { + stage_input_for_channel_[i][c] = input; + if (stages_[i]->GetChannelMode(c) == RenderPipelineChannelMode::kInOut) { + input = i; + } + } + } + + image_rect_.resize(stages_.size()); + for (size_t i = 0; i < stages_.size(); i++) { + size_t x1 = DivCeil(frame_dimensions_.xsize_upsampled, + 1 << channel_shifts_[i][anyc_[i]].first); + size_t y1 = DivCeil(frame_dimensions_.ysize_upsampled, + 1 << channel_shifts_[i][anyc_[i]].second); + image_rect_[i] = Rect(0, 0, x1, y1); + } + + virtual_ypadding_for_output_.resize(stages_.size()); + xpadding_for_output_.resize(stages_.size()); + for (size_t c = 0; c < shifts.size(); c++) { + int ypad = 0; + int xpad = 0; + for (size_t i = stages_.size(); i-- > 0;) { + if (stages_[i]->GetChannelMode(c) != + RenderPipelineChannelMode::kIgnored) { + 
virtual_ypadding_for_output_[i] = + std::max(ypad, virtual_ypadding_for_output_[i]); + xpadding_for_output_[i] = std::max(xpad, xpadding_for_output_[i]); + } + if (stages_[i]->GetChannelMode(c) == RenderPipelineChannelMode::kInOut) { + ypad = (DivCeil(ypad, 1 << channel_shifts_[i][c].second) + + stages_[i]->settings_.border_y) + << channel_shifts_[i][c].second; + xpad = DivCeil(xpad, 1 << stages_[i]->settings_.shift_x) + + stages_[i]->settings_.border_x; + } + } + } +} + +void LowMemoryRenderPipeline::PrepareForThreadsInternal(size_t num, + bool use_group_ids) { + const auto& shifts = channel_shifts_[0]; + use_group_ids_ = use_group_ids; + size_t num_buffers = use_group_ids_ ? frame_dimensions_.num_groups : num; + for (size_t t = group_data_.size(); t < num_buffers; t++) { + group_data_.emplace_back(); + group_data_[t].resize(shifts.size()); + for (size_t c = 0; c < shifts.size(); c++) { + group_data_[t][c] = ImageF(GroupInputXSize(c) + group_data_x_border_ * 2, + GroupInputYSize(c) + group_data_y_border_ * 2); + } + } + // TODO(veluca): avoid reallocating buffers if not needed. + stage_data_.resize(num); + size_t upsampling = 1u << base_color_shift_; + size_t group_dim = frame_dimensions_.group_dim * upsampling; + size_t padding = + 2 * group_data_x_border_ * upsampling + // maximum size of a rect + 2 * kRenderPipelineXOffset; // extra padding for processing + size_t stage_buffer_xsize = group_dim + padding; + for (size_t t = 0; t < num; t++) { + stage_data_[t].resize(shifts.size()); + for (size_t c = 0; c < shifts.size(); c++) { + stage_data_[t][c].resize(stages_.size()); + size_t next_y_border = 0; + for (size_t i = stages_.size(); i-- > 0;) { + if (stages_[i]->GetChannelMode(c) == + RenderPipelineChannelMode::kInOut) { + size_t stage_buffer_ysize = + 2 * next_y_border + (1 << stages_[i]->settings_.shift_y); + stage_buffer_ysize = 1 << CeilLog2Nonzero(stage_buffer_ysize); + next_y_border = stages_[i]->settings_.border_y; + stage_data_[t][c][i] = ImageF(stage_buffer_xsize, stage_buffer_ysize); + } + } + } + } + if (first_image_dim_stage_ != stages_.size()) { + RectT<ssize_t> image_rect(0, 0, frame_dimensions_.xsize_upsampled, + frame_dimensions_.ysize_upsampled); + RectT<ssize_t> full_image_rect(0, 0, full_image_xsize_, full_image_ysize_); + image_rect = image_rect.Translate(frame_origin_.x0, frame_origin_.y0); + image_rect = image_rect.Intersection(full_image_rect); + if (image_rect.xsize() == 0 || image_rect.ysize() == 0) { + image_rect = RectT<ssize_t>(0, 0, 0, 0); + } + size_t left_padding = image_rect.x0(); + size_t middle_padding = group_dim; + size_t right_padding = full_image_xsize_ - image_rect.x1(); + size_t out_of_frame_xsize = + padding + + std::max(left_padding, std::max(middle_padding, right_padding)); + out_of_frame_data_.resize(num); + for (size_t t = 0; t < num; t++) { + out_of_frame_data_[t] = ImageF(out_of_frame_xsize, shifts.size()); + } + } +} + +std::vector<std::pair<ImageF*, Rect>> LowMemoryRenderPipeline::PrepareBuffers( + size_t group_id, size_t thread_id) { + std::vector<std::pair<ImageF*, Rect>> ret(channel_shifts_[0].size()); + const size_t gx = group_id % frame_dimensions_.xsize_groups; + const size_t gy = group_id / frame_dimensions_.xsize_groups; + for (size_t c = 0; c < channel_shifts_[0].size(); c++) { + ret[c].first = &group_data_[use_group_ids_ ? 
group_id : thread_id][c]; + ret[c].second = Rect(group_data_x_border_, group_data_y_border_, + GroupInputXSize(c), GroupInputYSize(c), + DivCeil(frame_dimensions_.xsize_upsampled, + 1 << channel_shifts_[0][c].first) - + gx * GroupInputXSize(c) + group_data_x_border_, + DivCeil(frame_dimensions_.ysize_upsampled, + 1 << channel_shifts_[0][c].second) - + gy * GroupInputYSize(c) + group_data_y_border_); + } + return ret; +} + +namespace { + +JXL_INLINE int GetMirroredY(int y, ssize_t group_y0, ssize_t image_ysize) { + if (group_y0 == 0 && (y < 0 || y + group_y0 >= image_ysize)) { + return Mirror(y, image_ysize); + } + if (y + group_y0 >= image_ysize) { + // Here we know that the one mirroring step is sufficient. + return 2 * image_ysize - (y + group_y0) - 1 - group_y0; + } + return y; +} + +JXL_INLINE void ApplyXMirroring(float* row, ssize_t borderx, ssize_t group_x0, + ssize_t group_xsize, ssize_t image_xsize) { + if (image_xsize <= borderx) { + if (group_x0 == 0) { + for (ssize_t ix = 0; ix < borderx; ix++) { + row[kRenderPipelineXOffset - ix - 1] = + row[kRenderPipelineXOffset + Mirror(-ix - 1, image_xsize)]; + } + } + if (group_xsize + borderx + group_x0 >= image_xsize) { + for (ssize_t ix = 0; ix < borderx; ix++) { + row[kRenderPipelineXOffset + image_xsize + ix - group_x0] = + row[kRenderPipelineXOffset + Mirror(image_xsize + ix, image_xsize) - + group_x0]; + } + } + } else { + // Here we know that the one mirroring step is sufficient. + if (group_x0 == 0) { + for (ssize_t ix = 0; ix < borderx; ix++) { + row[kRenderPipelineXOffset - ix - 1] = row[kRenderPipelineXOffset + ix]; + } + } + if (group_xsize + borderx + group_x0 >= image_xsize) { + for (ssize_t ix = 0; ix < borderx; ix++) { + row[kRenderPipelineXOffset + image_xsize - group_x0 + ix] = + row[kRenderPipelineXOffset + image_xsize - group_x0 - ix - 1]; + } + } + } +} + +// Information about where the *output* of each stage is stored. 
+class Rows { + public: + Rows(const std::vector<std::unique_ptr<RenderPipelineStage>>& stages, + const Rect data_max_color_channel_rect, int group_data_x_border, + int group_data_y_border, + const std::vector<std::pair<size_t, size_t>>& group_data_shift, + size_t base_color_shift, std::vector<std::vector<ImageF>>& thread_data, + std::vector<ImageF>& input_data) { + size_t num_stages = stages.size(); + size_t num_channels = input_data.size(); + + JXL_ASSERT(thread_data.size() == num_channels); + JXL_ASSERT(group_data_shift.size() == num_channels); + +#if JXL_ENABLE_ASSERT + for (const auto& td : thread_data) { + JXL_ASSERT(td.size() == num_stages); + } +#endif + + rows_.resize(num_stages + 1, std::vector<RowInfo>(num_channels)); + + for (size_t i = 0; i < num_stages; i++) { + for (size_t c = 0; c < input_data.size(); c++) { + if (stages[i]->GetChannelMode(c) == RenderPipelineChannelMode::kInOut) { + rows_[i + 1][c].ymod_minus_1 = thread_data[c][i].ysize() - 1; + rows_[i + 1][c].base_ptr = thread_data[c][i].Row(0); + rows_[i + 1][c].stride = thread_data[c][i].PixelsPerRow(); + } + } + } + + for (size_t c = 0; c < input_data.size(); c++) { + auto channel_group_data_rect = + data_max_color_channel_rect.As<ssize_t>() + .Translate(-group_data_x_border, -group_data_y_border) + .ShiftLeft(base_color_shift) + .CeilShiftRight(group_data_shift[c]) + .Translate(group_data_x_border - ssize_t(kRenderPipelineXOffset), + group_data_y_border); + rows_[0][c].base_ptr = channel_group_data_rect.Row(&input_data[c], 0); + rows_[0][c].stride = input_data[c].PixelsPerRow(); + rows_[0][c].ymod_minus_1 = -1; + } + } + + // Stage -1 refers to the input data; all other values must be nonnegative and + // refer to the data for the output of that stage. + JXL_INLINE float* GetBuffer(int stage, int y, size_t c) const { + JXL_DASSERT(stage >= -1); + const RowInfo& info = rows_[stage + 1][c]; + return info.base_ptr + ssize_t(info.stride) * (y & info.ymod_minus_1); + } + + private: + struct RowInfo { + // Pointer to beginning of the first row. + float* base_ptr; + // Modulo value for the y axis minus 1 (ymod is guaranteed to be a power of + // 2, which allows efficient mod computation by masking). + int ymod_minus_1; + // Number of floats per row. + size_t stride; + }; + std::vector<std::vector<RowInfo>> rows_; +}; + +} // namespace + +void LowMemoryRenderPipeline::RenderRect(size_t thread_id, + std::vector<ImageF>& input_data, + Rect data_max_color_channel_rect, + Rect image_max_color_channel_rect) { + // For each stage, the rect corresponding to the image area currently being + // processed, in the coordinates of that stage (i.e. with the scaling factor + // that that stage has). + std::vector<Rect> group_rect; + group_rect.resize(stages_.size()); + Rect image_area_rect = + image_max_color_channel_rect.ShiftLeft(base_color_shift_) + .Crop(frame_dimensions_.xsize_upsampled, + frame_dimensions_.ysize_upsampled); + for (size_t i = 0; i < stages_.size(); i++) { + group_rect[i] = + image_area_rect.CeilShiftRight(channel_shifts_[i][anyc_[i]]); + } + + ssize_t frame_x0 = + first_image_dim_stage_ == stages_.size() ? 0 : frame_origin_.x0; + ssize_t frame_y0 = + first_image_dim_stage_ == stages_.size() ? 0 : frame_origin_.y0; + size_t full_image_xsize = first_image_dim_stage_ == stages_.size() + ? frame_dimensions_.xsize_upsampled + : full_image_xsize_; + size_t full_image_ysize = first_image_dim_stage_ == stages_.size() + ? 
frame_dimensions_.ysize_upsampled + : full_image_ysize_; + + // Compute actual x-axis bounds for the current image area in the context of + // the full image this frame is part of. As the left boundary may be negative, + // we also create the x_pixels_skip value, defined as follows: + // - both x_pixels_skip and full_image_x0 are >= 0, and at least one is 0; + // - full_image_x0 - x_pixels_skip is the position of the current frame area + // in the full image. + ssize_t full_image_x0 = frame_x0 + image_area_rect.x0(); + ssize_t x_pixels_skip = 0; + if (full_image_x0 < 0) { + x_pixels_skip = -full_image_x0; + full_image_x0 = 0; + } + ssize_t full_image_x1 = frame_x0 + image_area_rect.x1(); + full_image_x1 = std::min<ssize_t>(full_image_x1, full_image_xsize); + + // If the current image area is entirely outside of the visible image, there + // is no point in proceeding. Note: this uses the assumption that if there is + // a stage with observable effects (i.e. a kInput stage), it only appears + // after the stage that switches to image dimensions. + if (full_image_x1 <= full_image_x0) return; + + // Data structures to hold information about input/output rows and their + // buffers. + Rows rows(stages_, data_max_color_channel_rect, group_data_x_border_, + group_data_y_border_, channel_shifts_[0], base_color_shift_, + stage_data_[thread_id], input_data); + + std::vector<RenderPipelineStage::RowInfo> input_rows(first_trailing_stage_ + + 1); + for (size_t i = 0; i < first_trailing_stage_; i++) { + input_rows[i].resize(input_data.size()); + } + input_rows[first_trailing_stage_].resize(input_data.size(), + std::vector<float*>(1)); + + // Maximum possible shift is 3. + RenderPipelineStage::RowInfo output_rows(input_data.size(), + std::vector<float*>(8)); + + // Fills in input_rows and output_rows for a given y value (relative to the + // start of the group, measured in actual pixels at the appropriate vertical + // scaling factor) and a given stage, applying mirroring if necessary. This + // function is somewhat inefficient for trailing kInOut or kInput stages, + // where just filling the input row once ought to be sufficient. + auto prepare_io_rows = [&](int y, size_t i) { + ssize_t bordery = stages_[i]->settings_.border_y; + size_t shifty = stages_[i]->settings_.shift_y; + auto make_row = [&](size_t c, ssize_t iy) { + size_t mirrored_y = GetMirroredY(y + iy - bordery, group_rect[i].y0(), + image_rect_[i].ysize()); + input_rows[i][c][iy] = + rows.GetBuffer(stage_input_for_channel_[i][c], mirrored_y, c); + ApplyXMirroring(input_rows[i][c][iy], stages_[i]->settings_.border_x, + group_rect[i].x0(), group_rect[i].xsize(), + image_rect_[i].xsize()); + }; + for (size_t c = 0; c < input_data.size(); c++) { + RenderPipelineChannelMode mode = stages_[i]->GetChannelMode(c); + if (mode == RenderPipelineChannelMode::kIgnored) { + continue; + } + // If we already have rows from a previous iteration, we can just shift + // the rows by 1 and insert the new one. + if (input_rows[i][c].size() == 2 * size_t(bordery) + 1) { + for (ssize_t iy = 0; iy < 2 * bordery; iy++) { + input_rows[i][c][iy] = input_rows[i][c][iy + 1]; + } + make_row(c, bordery * 2); + } else { + input_rows[i][c].resize(2 * bordery + 1); + for (ssize_t iy = 0; iy < 2 * bordery + 1; iy++) { + make_row(c, iy); + } + } + + // If necessary, get the output buffers. 
+ if (mode == RenderPipelineChannelMode::kInOut) { + for (size_t iy = 0; iy < (1u << shifty); iy++) { + output_rows[c][iy] = rows.GetBuffer(i, y * (1 << shifty) + iy, c); + } + } + } + }; + + // We pretend that every stage has a vertical shift of 0, i.e. it is as tall + // as the final image. + // We call each such row a "virtual" row, because it may or may not correspond + // to an actual row of the current processing stage; actual processing happens + // when vy % (1<<vshift) == 0. + + int num_extra_rows = *std::max_element(virtual_ypadding_for_output_.begin(), + virtual_ypadding_for_output_.end()); + + for (int vy = -num_extra_rows; + vy < int(image_area_rect.ysize()) + num_extra_rows; vy++) { + for (size_t i = 0; i < first_trailing_stage_; i++) { + int stage_vy = vy - num_extra_rows + virtual_ypadding_for_output_[i]; + + if (stage_vy % (1 << channel_shifts_[i][anyc_[i]].second) != 0) { + continue; + } + + if (stage_vy < -virtual_ypadding_for_output_[i]) { + continue; + } + + int y = stage_vy >> channel_shifts_[i][anyc_[i]].second; + + ssize_t image_y = ssize_t(group_rect[i].y0()) + y; + // Do not produce rows in out-of-bounds areas. + if (image_y < 0 || image_y >= ssize_t(image_rect_[i].ysize())) { + continue; + } + + // Get the input/output rows and potentially apply mirroring to the input. + prepare_io_rows(y, i); + + // Produce output rows. + stages_[i]->ProcessRow(input_rows[i], output_rows, + xpadding_for_output_[i], group_rect[i].xsize(), + group_rect[i].x0(), image_y, thread_id); + } + + // Process trailing stages, i.e. the final set of non-kInOut stages; they + // all have the same input buffer and no need to use any mirroring. + + int y = vy - num_extra_rows; + + for (size_t c = 0; c < input_data.size(); c++) { + // Skip pixels that are not part of the actual final image area. + input_rows[first_trailing_stage_][c][0] = + rows.GetBuffer(stage_input_for_channel_[first_trailing_stage_][c], y, + c) + + x_pixels_skip; + } + + // Check that we are not outside of the bounds for the current rendering + // rect. Not doing so might result in overwriting some rows that have been + // written (or will be written) by other threads. + if (y < 0 || y >= ssize_t(image_area_rect.ysize())) { + continue; + } + + // Avoid running pipeline stages on pixels that are outside the full image + // area. As trailing stages have no borders, this is a free optimization + // (and may be necessary for correctness, as some stages assume coordinates + // are within bounds). + ssize_t full_image_y = frame_y0 + image_area_rect.y0() + y; + if (full_image_y < 0 || full_image_y >= ssize_t(full_image_ysize)) { + continue; + } + + for (size_t i = first_trailing_stage_; i < stages_.size(); i++) { + // Before the first_image_dim_stage_, coordinates are relative to the + // current frame. + size_t x0 = + i < first_image_dim_stage_ ? full_image_x0 - frame_x0 : full_image_x0; + size_t y = + i < first_image_dim_stage_ ? 
full_image_y - frame_y0 : full_image_y; + stages_[i]->ProcessRow(input_rows[first_trailing_stage_], output_rows, + /*xextra=*/0, full_image_x1 - full_image_x0, x0, y, + thread_id); + } + } +} + +void LowMemoryRenderPipeline::RenderPadding(size_t thread_id, Rect rect) { + if (rect.xsize() == 0) return; + size_t numc = channel_shifts_[0].size(); + RenderPipelineStage::RowInfo input_rows(numc, std::vector<float*>(1)); + RenderPipelineStage::RowInfo output_rows; + + for (size_t c = 0; c < numc; c++) { + input_rows[c][0] = out_of_frame_data_[thread_id].Row(c); + } + + for (size_t y = 0; y < rect.ysize(); y++) { + stages_[first_image_dim_stage_ - 1]->ProcessPaddingRow( + input_rows, rect.xsize(), rect.x0(), rect.y0() + y); + for (size_t i = first_image_dim_stage_; i < stages_.size(); i++) { + stages_[i]->ProcessRow(input_rows, output_rows, + /*xextra=*/0, rect.xsize(), rect.x0(), + rect.y0() + y, thread_id); + } + } +} + +void LowMemoryRenderPipeline::ProcessBuffers(size_t group_id, + size_t thread_id) { + std::vector<ImageF>& input_data = + group_data_[use_group_ids_ ? group_id : thread_id]; + + // Copy the group borders to the border storage. + for (size_t c = 0; c < input_data.size(); c++) { + SaveBorders(group_id, c, input_data[c]); + } + + size_t gy = group_id / frame_dimensions_.xsize_groups; + size_t gx = group_id % frame_dimensions_.xsize_groups; + + if (first_image_dim_stage_ != stages_.size()) { + size_t group_dim = frame_dimensions_.group_dim << base_color_shift_; + RectT<ssize_t> group_rect(gx * group_dim, gy * group_dim, group_dim, + group_dim); + RectT<ssize_t> image_rect(0, 0, frame_dimensions_.xsize_upsampled, + frame_dimensions_.ysize_upsampled); + RectT<ssize_t> full_image_rect(0, 0, full_image_xsize_, full_image_ysize_); + group_rect = group_rect.Translate(frame_origin_.x0, frame_origin_.y0); + image_rect = image_rect.Translate(frame_origin_.x0, frame_origin_.y0); + image_rect = image_rect.Intersection(full_image_rect); + group_rect = group_rect.Intersection(image_rect); + size_t x0 = group_rect.x0(); + size_t y0 = group_rect.y0(); + size_t x1 = group_rect.x1(); + size_t y1 = group_rect.y1(); + JXL_DEBUG_V(6, + "Rendering padding for full image rect %s " + "outside group rect %s", + Description(full_image_rect).c_str(), + Description(group_rect).c_str()); + + if (group_id == 0 && (image_rect.xsize() == 0 || image_rect.ysize() == 0)) { + // If this frame does not intersect with the full image, we have to + // initialize the whole image area with RenderPadding. + RenderPadding(thread_id, + Rect(0, 0, full_image_xsize_, full_image_ysize_)); + } + + // Render padding for groups that intersect with the full image. The case + // where no groups intersect was handled above. 
+ if (group_rect.xsize() > 0 && group_rect.ysize() > 0) { + if (gx == 0 && gy == 0) { + RenderPadding(thread_id, Rect(0, 0, x0, y0)); + } + if (gy == 0) { + RenderPadding(thread_id, Rect(x0, 0, x1 - x0, y0)); + } + if (gx == 0) { + RenderPadding(thread_id, Rect(0, y0, x0, y1 - y0)); + } + if (gx == 0 && gy + 1 == frame_dimensions_.ysize_groups) { + RenderPadding(thread_id, Rect(0, y1, x0, full_image_ysize_ - y1)); + } + if (gy + 1 == frame_dimensions_.ysize_groups) { + RenderPadding(thread_id, Rect(x0, y1, x1 - x0, full_image_ysize_ - y1)); + } + if (gy == 0 && gx + 1 == frame_dimensions_.xsize_groups) { + RenderPadding(thread_id, Rect(x1, 0, full_image_xsize_ - x1, y0)); + } + if (gx + 1 == frame_dimensions_.xsize_groups) { + RenderPadding(thread_id, Rect(x1, y0, full_image_xsize_ - x1, y1 - y0)); + } + if (gy + 1 == frame_dimensions_.ysize_groups && + gx + 1 == frame_dimensions_.xsize_groups) { + RenderPadding(thread_id, Rect(x1, y1, full_image_xsize_ - x1, + full_image_ysize_ - y1)); + } + } + } + + Rect ready_rects[GroupBorderAssigner::kMaxToFinalize]; + size_t num_ready_rects = 0; + group_border_assigner_.GroupDone(group_id, group_border_.first, + group_border_.second, ready_rects, + &num_ready_rects); + for (size_t i = 0; i < num_ready_rects; i++) { + const Rect& image_max_color_channel_rect = ready_rects[i]; + for (size_t c = 0; c < input_data.size(); c++) { + LoadBorders(group_id, c, image_max_color_channel_rect, &input_data[c]); + } + Rect data_max_color_channel_rect( + group_data_x_border_ + image_max_color_channel_rect.x0() - + gx * frame_dimensions_.group_dim, + group_data_y_border_ + image_max_color_channel_rect.y0() - + gy * frame_dimensions_.group_dim, + image_max_color_channel_rect.xsize(), + image_max_color_channel_rect.ysize()); + RenderRect(thread_id, input_data, data_max_color_channel_rect, + image_max_color_channel_rect); + } +} +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/low_memory_render_pipeline.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/low_memory_render_pipeline.h new file mode 100644 index 0000000000..b386f7c078 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/low_memory_render_pipeline.h @@ -0,0 +1,111 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_RENDER_PIPELINE_LOW_MEMORY_RENDER_PIPELINE_H_ +#define LIB_JXL_RENDER_PIPELINE_LOW_MEMORY_RENDER_PIPELINE_H_ + +#include <stdint.h> + +#include "lib/jxl/dec_group_border.h" +#include "lib/jxl/render_pipeline/render_pipeline.h" + +namespace jxl { + +// A multithreaded, low-memory rendering pipeline that only allocates a minimal +// amount of buffers. 
+class LowMemoryRenderPipeline final : public RenderPipeline { + private: + std::vector<std::pair<ImageF*, Rect>> PrepareBuffers( + size_t group_id, size_t thread_id) override; + + void PrepareForThreadsInternal(size_t num, bool use_group_ids) override; + + void ProcessBuffers(size_t group_id, size_t thread_id) override; + + void ClearDone(size_t i) override { group_border_assigner_.ClearDone(i); } + + void Init() override; + + void EnsureBordersStorage(); + size_t GroupInputXSize(size_t c) const; + size_t GroupInputYSize(size_t c) const; + void RenderRect(size_t thread_id, std::vector<ImageF>& input_data, + Rect data_max_color_channel_rect, + Rect image_max_color_channel_rect); + void RenderPadding(size_t thread_id, Rect rect); + + void SaveBorders(size_t group_id, size_t c, const ImageF& in); + void LoadBorders(size_t group_id, size_t c, const Rect& r, ImageF* out); + + std::pair<size_t, size_t> ColorDimensionsToChannelDimensions( + std::pair<size_t, size_t> in, size_t c, size_t stage) const; + + std::pair<size_t, size_t> BorderToStore(size_t c) const; + + bool use_group_ids_; + + // Storage for borders between groups. Borders of adjacent groups are stacked + // together, e.g. bottom border of current group is followed by top border + // of next group. + std::vector<ImageF> borders_horizontal_; + std::vector<ImageF> borders_vertical_; + + // Manages the status of borders. + GroupBorderAssigner group_border_assigner_; + + // Size (in color-channel-pixels) of the border around each group that might + // be assigned to that group. + std::pair<size_t, size_t> group_border_; + // base_color_shift_ defines the size of groups in terms of final image + // pixels. + size_t base_color_shift_; + + // Buffer for decoded pixel data for a group, indexed by [thread][channel] or + // [group][channel] depending on `use_group_ids_`. + std::vector<std::vector<ImageF>> group_data_; + + // Borders for storing group data. + size_t group_data_x_border_; + size_t group_data_y_border_; + + // Buffers for intermediate rows for the various stages, indexed by + // [thread][channel][stage]. + std::vector<std::vector<std::vector<ImageF>>> stage_data_; + + // Buffers for out-of-frame data, indexed by [thread]; every row is a + // different channel. + std::vector<ImageF> out_of_frame_data_; + + // For each stage, a non-kIgnored channel. + std::vector<int32_t> anyc_; + + // Size of the image at each stage. + std::vector<Rect> image_rect_; + + // For each stage, for each channel, keep track of the kInOut stage that + // produced the input to that stage (which corresponds to the buffer index + // containing the data). -1 if data comes from the original input. + std::vector<std::vector<int32_t>> stage_input_for_channel_; + + // Number of (virtual) extra rows that must be processed at each stage + // to produce sufficient output for future stages. + std::vector<int> virtual_ypadding_for_output_; + + // Same thing for columns, except these are real columns and not virtual ones. + std::vector<int> xpadding_for_output_; + + // First stage that doesn't have any kInOut channel. + size_t first_trailing_stage_; + + // Origin and size of the frame after switching to image dimensions. 
+ FrameOrigin frame_origin_; + size_t full_image_xsize_; + size_t full_image_ysize_; + size_t first_image_dim_stage_; +}; + +} // namespace jxl + +#endif // LIB_JXL_RENDER_PIPELINE_LOW_MEMORY_RENDER_PIPELINE_H_ diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline.cc new file mode 100644 index 0000000000..68b6ef613f --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline.cc @@ -0,0 +1,132 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/render_pipeline/render_pipeline.h" + +#include <algorithm> + +#include "lib/jxl/render_pipeline/low_memory_render_pipeline.h" +#include "lib/jxl/render_pipeline/simple_render_pipeline.h" +#include "lib/jxl/sanitizers.h" + +namespace jxl { + +void RenderPipeline::Builder::AddStage( + std::unique_ptr<RenderPipelineStage> stage) { + stages_.push_back(std::move(stage)); +} + +std::unique_ptr<RenderPipeline> RenderPipeline::Builder::Finalize( + FrameDimensions frame_dimensions) && { +#if JXL_ENABLE_ASSERT + // Check that the last stage is not an kInOut stage for any channel, and that + // there is at least one stage. + JXL_ASSERT(!stages_.empty()); + for (size_t c = 0; c < num_c_; c++) { + JXL_ASSERT(stages_.back()->GetChannelMode(c) != + RenderPipelineChannelMode::kInOut); + } +#endif + + std::unique_ptr<RenderPipeline> res; + if (use_simple_implementation_) { + res = jxl::make_unique<SimpleRenderPipeline>(); + } else { + res = jxl::make_unique<LowMemoryRenderPipeline>(); + } + + res->padding_.resize(stages_.size()); + for (size_t i = stages_.size(); i-- > 0;) { + const auto& stage = stages_[i]; + res->padding_[i].resize(num_c_); + if (i + 1 == stages_.size()) { + continue; + } + for (size_t c = 0; c < num_c_; c++) { + if (stage->GetChannelMode(c) == RenderPipelineChannelMode::kInOut) { + res->padding_[i][c].first = DivCeil(res->padding_[i + 1][c].first, + 1 << stage->settings_.shift_x) + + stage->settings_.border_x; + res->padding_[i][c].second = DivCeil(res->padding_[i + 1][c].second, + 1 << stage->settings_.shift_y) + + stage->settings_.border_y; + } else { + res->padding_[i][c] = res->padding_[i + 1][c]; + } + } + } + + res->frame_dimensions_ = frame_dimensions; + res->group_completed_passes_.resize(frame_dimensions.num_groups); + res->channel_shifts_.resize(stages_.size()); + res->channel_shifts_[0].resize(num_c_); + for (size_t i = 1; i < stages_.size(); i++) { + auto& stage = stages_[i - 1]; + for (size_t c = 0; c < num_c_; c++) { + if (stage->GetChannelMode(c) == RenderPipelineChannelMode::kInOut) { + res->channel_shifts_[0][c].first += stage->settings_.shift_x; + res->channel_shifts_[0][c].second += stage->settings_.shift_y; + } + } + } + for (size_t i = 1; i < stages_.size(); i++) { + auto& stage = stages_[i - 1]; + res->channel_shifts_[i].resize(num_c_); + for (size_t c = 0; c < num_c_; c++) { + if (stage->GetChannelMode(c) == RenderPipelineChannelMode::kInOut) { + res->channel_shifts_[i][c].first = + res->channel_shifts_[i - 1][c].first - stage->settings_.shift_x; + res->channel_shifts_[i][c].second = + res->channel_shifts_[i - 1][c].second - stage->settings_.shift_y; + } else { + res->channel_shifts_[i][c].first = res->channel_shifts_[i - 1][c].first; + res->channel_shifts_[i][c].second = + res->channel_shifts_[i - 1][c].second; + } + } + } + res->stages_ = std::move(stages_); + res->Init(); 
+ return res; +} + +RenderPipelineInput RenderPipeline::GetInputBuffers(size_t group_id, + size_t thread_id) { + RenderPipelineInput ret; + JXL_DASSERT(group_id < group_completed_passes_.size()); + ret.group_id_ = group_id; + ret.thread_id_ = thread_id; + ret.pipeline_ = this; + ret.buffers_ = PrepareBuffers(group_id, thread_id); + return ret; +} + +void RenderPipeline::InputReady( + size_t group_id, size_t thread_id, + const std::vector<std::pair<ImageF*, Rect>>& buffers) { + JXL_DASSERT(group_id < group_completed_passes_.size()); + group_completed_passes_[group_id]++; + for (size_t i = 0; i < buffers.size(); ++i) { + (void)i; + JXL_CHECK_PLANE_INITIALIZED(*buffers[i].first, buffers[i].second, i); + } + + ProcessBuffers(group_id, thread_id); +} + +Status RenderPipeline::PrepareForThreads(size_t num, bool use_group_ids) { + for (const auto& stage : stages_) { + JXL_RETURN_IF_ERROR(stage->PrepareForThreads(num)); + } + PrepareForThreadsInternal(num, use_group_ids); + return true; +} + +void RenderPipelineInput::Done() { + JXL_ASSERT(pipeline_); + pipeline_->InputReady(group_id_, thread_id_, buffers_); +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline.h new file mode 100644 index 0000000000..bf3ad4975e --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline.h @@ -0,0 +1,139 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_RENDER_PIPELINE_RENDER_PIPELINE_H_ +#define LIB_JXL_RENDER_PIPELINE_RENDER_PIPELINE_H_ + +#include <stdint.h> + +#include "lib/jxl/image.h" +#include "lib/jxl/render_pipeline/render_pipeline_stage.h" + +namespace jxl { + +// Interface to provide input to the rendering pipeline. When this object is +// destroyed, all the data in the provided ImageF's Rects must have been +// initialized. +class RenderPipelineInput { + public: + RenderPipelineInput(const RenderPipelineInput&) = delete; + RenderPipelineInput(RenderPipelineInput&& other) noexcept { + *this = std::move(other); + } + RenderPipelineInput& operator=(RenderPipelineInput&& other) noexcept { + pipeline_ = other.pipeline_; + group_id_ = other.group_id_; + thread_id_ = other.thread_id_; + buffers_ = std::move(other.buffers_); + other.pipeline_ = nullptr; + return *this; + } + + RenderPipelineInput() = default; + void Done(); + + const std::pair<ImageF*, Rect>& GetBuffer(size_t c) const { + JXL_ASSERT(c < buffers_.size()); + return buffers_[c]; + } + + private: + RenderPipeline* pipeline_ = nullptr; + size_t group_id_; + size_t thread_id_; + std::vector<std::pair<ImageF*, Rect>> buffers_; + friend class RenderPipeline; +}; + +class RenderPipeline { + public: + class Builder { + public: + explicit Builder(size_t num_c) : num_c_(num_c) { JXL_ASSERT(num_c > 0); } + + // Adds a stage to the pipeline. Must be called at least once; the last + // added stage cannot have kInOut channels. + void AddStage(std::unique_ptr<RenderPipelineStage> stage); + + // Enables using the simple (i.e. non-memory-efficient) implementation of + // the pipeline. + void UseSimpleImplementation() { use_simple_implementation_ = true; } + + // Finalizes setup of the pipeline. Shifts for all channels should be 0 at + // this point. 
+ std::unique_ptr<RenderPipeline> Finalize( + FrameDimensions frame_dimensions) &&; + + private: + std::vector<std::unique_ptr<RenderPipelineStage>> stages_; + size_t num_c_; + bool use_simple_implementation_ = false; + }; + + friend class Builder; + + virtual ~RenderPipeline() = default; + + Status IsInitialized() const { + for (const auto& stage : stages_) { + JXL_RETURN_IF_ERROR(stage->IsInitialized()); + } + return true; + } + + // Allocates storage to run with `num` threads. If `use_group_ids` is true, + // storage is allocated for each group, not each thread. The behaviour is + // undefined if calling this function multiple times with a different value + // for `use_group_ids`. + Status PrepareForThreads(size_t num, bool use_group_ids); + + // Retrieves a buffer where input data should be stored by the callee. When + // input has been provided for all buffers, the pipeline will complete its + // processing. This method may be called multiple times concurrently from + // different threads, provided that a different `thread_id` is given. + RenderPipelineInput GetInputBuffers(size_t group_id, size_t thread_id); + + size_t PassesWithAllInput() const { + return *std::min_element(group_completed_passes_.begin(), + group_completed_passes_.end()); + } + + virtual void ClearDone(size_t i) {} + + protected: + std::vector<std::unique_ptr<RenderPipelineStage>> stages_; + // Shifts for every channel at the input of each stage. + std::vector<std::vector<std::pair<size_t, size_t>>> channel_shifts_; + + // Amount of (cumulative) padding required by each stage and channel, in + // either direction. + std::vector<std::vector<std::pair<size_t, size_t>>> padding_; + + FrameDimensions frame_dimensions_; + + std::vector<uint8_t> group_completed_passes_; + + friend class RenderPipelineInput; + + private: + void InputReady(size_t group_id, size_t thread_id, + const std::vector<std::pair<ImageF*, Rect>>& buffers); + + virtual std::vector<std::pair<ImageF*, Rect>> PrepareBuffers( + size_t group_id, size_t thread_id) = 0; + + virtual void ProcessBuffers(size_t group_id, size_t thread_id) = 0; + + // Note that this method may be called multiple times with different (or + // equal) `num`. + virtual void PrepareForThreadsInternal(size_t num, bool use_group_ids) = 0; + + // Called once frame dimensions and stages are known. + virtual void Init() {} +}; + +} // namespace jxl + +#endif // LIB_JXL_RENDER_PIPELINE_RENDER_PIPELINE_H_ diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline_stage.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline_stage.h new file mode 100644 index 0000000000..d1a0074161 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline_stage.h @@ -0,0 +1,171 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_RENDER_PIPELINE_RENDER_PIPELINE_STAGE_H_ +#define LIB_JXL_RENDER_PIPELINE_RENDER_PIPELINE_STAGE_H_ + +#include <stdint.h> + +#include "lib/jxl/base/arch_macros.h" +#include "lib/jxl/frame_header.h" + +namespace jxl { + +// The first pixel in the input to RenderPipelineStage will be located at +// this position. Pixels before this position may be accessed as padding. +// This should be at least the RoundUpTo(maximum padding / 2, maximum vector +// size) times 2: this is realized when using Gaborish + EPF + upsampling + +// chroma subsampling. 
+#if JXL_ARCH_ARM +constexpr size_t kRenderPipelineXOffset = 16; +#else +constexpr size_t kRenderPipelineXOffset = 32; +#endif + +enum class RenderPipelineChannelMode { + // This channel is not modified by this stage. + kIgnored = 0, + // This channel is modified in-place. + kInPlace = 1, + // This channel is modified and written to a new buffer. + kInOut = 2, + // This channel is only read. These are the only stages that are assumed to + // have observable effects, i.e. calls to ProcessRow for other stages may be + // omitted if it can be shown they can't affect any kInput stage ProcessRow + // call that happens inside image boundaries. + kInput = 3, +}; + +class RenderPipeline; + +class RenderPipelineStage { + protected: + using Row = float*; + using ChannelRows = std::vector<Row>; + + public: + using RowInfo = std::vector<ChannelRows>; + struct Settings { + // Amount of padding required in the various directions by all channels + // that have kInOut mode. + size_t border_x = 0; + size_t border_y = 0; + + // Log2 of the number of columns/rows of output that this stage will produce + // for every input row for kInOut channels. + size_t shift_x = 0; + size_t shift_y = 0; + + static Settings ShiftX(size_t shift, size_t border) { + Settings settings; + settings.border_x = border; + settings.shift_x = shift; + return settings; + } + + static Settings ShiftY(size_t shift, size_t border) { + Settings settings; + settings.border_y = border; + settings.shift_y = shift; + return settings; + } + + static Settings Symmetric(size_t shift, size_t border) { + Settings settings; + settings.border_x = settings.border_y = border; + settings.shift_x = settings.shift_y = shift; + return settings; + } + + static Settings SymmetricBorderOnly(size_t border) { + return Symmetric(0, border); + } + }; + + virtual ~RenderPipelineStage() = default; + + // Processes one row of input, producing the appropriate number of rows of + // output. Input/output rows can be obtained by calls to + // `GetInputRow`/`GetOutputRow`. `xsize+2*xextra` represents the total number + // of pixels to be processed in the input row, where the first pixel is at + // position `kRenderPipelineXOffset-xextra`. All pixels in the + // `[kRenderPipelineXOffset-xextra-border_x, + // kRenderPipelineXOffset+xsize+xextra+border_x)` range are initialized and + // accessible. `xpos` and `ypos` represent the position of the first + // (non-extra, i.e. in position kRenderPipelineXOffset) pixel in the center + // row of the input in the full image. `xpos` is a multiple of + // `GroupBorderAssigner::kPaddingXRound`. If `settings_.temp_buffer_size` is + // nonzero, `temp` will point to an HWY-aligned buffer of at least that number + // of floats; concurrent calls will have different buffers. + virtual void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, + size_t xextra, size_t xsize, size_t xpos, size_t ypos, + size_t thread_id) const = 0; + + // How each channel will be processed. Channels are numbered starting from + // color channels (always 3) and followed by all other channels. + virtual RenderPipelineChannelMode GetChannelMode(size_t c) const = 0; + + protected: + explicit RenderPipelineStage(Settings settings) : settings_(settings) {} + + virtual Status IsInitialized() const { return true; } + + // Informs the stage about the total size of each channel. Few stages will + // actually need to use this information. 
+ virtual void SetInputSizes( + const std::vector<std::pair<size_t, size_t>>& input_sizes) {} + + virtual Status PrepareForThreads(size_t num_threads) { return true; } + + // Returns a pointer to the input row of channel `c` with offset `y`. + // `y` must be in [-settings_.border_y, settings_.border_y]. `c` must be such + // that `GetChannelMode(c) != kIgnored`. The returned pointer points to the + // offset-ed row (i.e. kRenderPipelineXOffset has been applied). + float* GetInputRow(const RowInfo& input_rows, size_t c, int offset) const { + JXL_DASSERT(GetChannelMode(c) != RenderPipelineChannelMode::kIgnored); + JXL_DASSERT(-offset <= static_cast<int>(settings_.border_y)); + JXL_DASSERT(offset <= static_cast<int>(settings_.border_y)); + return input_rows[c][settings_.border_y + offset] + kRenderPipelineXOffset; + } + // Similar to `GetInputRow`, but can only be used if `GetChannelMode(c) == + // kInOut`. Offset must be less than `1<<settings_.shift_y`.. The returned + // pointer points to the offset-ed row (i.e. kRenderPipelineXOffset has been + // applied). + float* GetOutputRow(const RowInfo& output_rows, size_t c, + size_t offset) const { + JXL_DASSERT(GetChannelMode(c) == RenderPipelineChannelMode::kInOut); + JXL_DASSERT(offset <= 1ul << settings_.shift_y); + return output_rows[c][offset] + kRenderPipelineXOffset; + } + + // Indicates whether, from this stage on, the pipeline will operate on an + // image- rather than frame-sized buffer. Only one stage in the pipeline + // should return true, and it should implement ProcessPaddingRow below too. + // It is assumed that, if there is a SwitchToImageDimensions() == true stage, + // all kInput stages appear after it. + virtual bool SwitchToImageDimensions() const { return false; } + + // If SwitchToImageDimensions returns true, then this should set xsize and + // ysize to the image size, and frame_origin to the location of the frame + // within the image. Otherwise, this is not called at all. + virtual void GetImageDimensions(size_t* xsize, size_t* ysize, + FrameOrigin* frame_origin) const {} + + // Produces the appropriate output data outside of the frame dimensions. xpos + // and ypos are now relative to the full image. + virtual void ProcessPaddingRow(const RowInfo& output_rows, size_t xsize, + size_t xpos, size_t ypos) const {} + + virtual const char* GetName() const = 0; + + Settings settings_; + friend class RenderPipeline; + friend class SimpleRenderPipeline; + friend class LowMemoryRenderPipeline; +}; + +} // namespace jxl + +#endif // LIB_JXL_RENDER_PIPELINE_RENDER_PIPELINE_STAGE_H_ diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline_test.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline_test.cc new file mode 100644 index 0000000000..51b9f273f8 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline_test.cc @@ -0,0 +1,579 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "lib/jxl/render_pipeline/render_pipeline.h" + +#include <jxl/cms.h> + +#include <algorithm> +#include <cctype> +#include <cstdint> +#include <cstdio> +#include <ostream> +#include <sstream> +#include <string> +#include <utility> +#include <vector> + +#include "lib/extras/codec.h" +#include "lib/jxl/base/common.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/override.h" +#include "lib/jxl/base/span.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/common.h" // JXL_HIGH_PRECISION, JPEGXL_ENABLE_TRANSCODE_JPEG +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/dec_cache.h" +#include "lib/jxl/dec_frame.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/fake_parallel_runner_testonly.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/frame_dimensions.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/headers.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_metadata.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/image_test_utils.h" +#include "lib/jxl/jpeg/enc_jpeg_data.h" +#include "lib/jxl/render_pipeline/test_render_pipeline_stages.h" +#include "lib/jxl/splines.h" +#include "lib/jxl/test_utils.h" +#include "lib/jxl/testing.h" + +namespace jxl { +namespace { + +Status DecodeFile(const Span<const uint8_t> file, bool use_slow_pipeline, + CodecInOut* io, ThreadPool* pool) { + Status ret = true; + { + BitReader reader(file); + BitReaderScopedCloser reader_closer(&reader, &ret); + JXL_RETURN_IF_ERROR(reader.ReadFixedBits<16>() == 0x0AFF); + JXL_RETURN_IF_ERROR(ReadSizeHeader(&reader, &io->metadata.size)); + JXL_RETURN_IF_ERROR(ReadImageMetadata(&reader, &io->metadata.m)); + io->metadata.transform_data.nonserialized_xyb_encoded = + io->metadata.m.xyb_encoded; + JXL_RETURN_IF_ERROR(Bundle::Read(&reader, &io->metadata.transform_data)); + if (io->metadata.m.color_encoding.WantICC()) { + std::vector<uint8_t> icc; + JXL_RETURN_IF_ERROR(test::ReadICC(&reader, &icc)); + JXL_RETURN_IF_ERROR(io->metadata.m.color_encoding.SetICC( + std::move(icc), JxlGetDefaultCms())); + } + PassesDecoderState dec_state; + JXL_RETURN_IF_ERROR( + dec_state.output_encoding_info.SetFromMetadata(io->metadata)); + JXL_RETURN_IF_ERROR(reader.JumpToByteBoundary()); + io->frames.clear(); + FrameHeader frame_header(&io->metadata); + do { + io->frames.emplace_back(&io->metadata.m); + // Skip frames that are not displayed. + do { + size_t frame_start = reader.TotalBitsConsumed() / kBitsPerByte; + size_t size_left = file.size() - frame_start; + JXL_RETURN_IF_ERROR(DecodeFrame(&dec_state, pool, + file.data() + frame_start, size_left, + &frame_header, &io->frames.back(), + io->metadata, use_slow_pipeline)); + reader.SkipBits(io->frames.back().decoded_bytes() * kBitsPerByte); + } while (frame_header.frame_type != FrameType::kRegularFrame && + frame_header.frame_type != FrameType::kSkipProgressive); + } while (!frame_header.is_last); + + if (io->frames.empty()) return JXL_FAILURE("Not enough data."); + + if (reader.TotalBitsConsumed() != file.size() * kBitsPerByte) { + return JXL_FAILURE("Reader position not at EOF."); + } + if (!reader.AllReadsWithinBounds()) { + return JXL_FAILURE("Reader out of bounds read."); + } + io->CheckMetadata(); + // reader is closed here. 
+ } + return ret; +} + +TEST(RenderPipelineTest, Build) { + RenderPipeline::Builder builder(/*num_c=*/1); + builder.AddStage(jxl::make_unique<UpsampleXSlowStage>()); + builder.AddStage(jxl::make_unique<UpsampleYSlowStage>()); + builder.AddStage(jxl::make_unique<Check0FinalStage>()); + builder.UseSimpleImplementation(); + FrameDimensions frame_dimensions; + frame_dimensions.Set(/*xsize=*/1024, /*ysize=*/1024, /*group_size_shift=*/0, + /*max_hshift=*/0, /*max_vshift=*/0, + /*modular_mode=*/false, /*upsampling=*/1); + std::move(builder).Finalize(frame_dimensions); +} + +TEST(RenderPipelineTest, CallAllGroups) { + RenderPipeline::Builder builder(/*num_c=*/1); + builder.AddStage(jxl::make_unique<UpsampleXSlowStage>()); + builder.AddStage(jxl::make_unique<UpsampleYSlowStage>()); + builder.AddStage(jxl::make_unique<Check0FinalStage>()); + builder.UseSimpleImplementation(); + FrameDimensions frame_dimensions; + frame_dimensions.Set(/*xsize=*/1024, /*ysize=*/1024, /*group_size_shift=*/0, + /*max_hshift=*/0, /*max_vshift=*/0, + /*modular_mode=*/false, /*upsampling=*/1); + auto pipeline = std::move(builder).Finalize(frame_dimensions); + ASSERT_TRUE(pipeline->PrepareForThreads(1, /*use_group_ids=*/false)); + + for (size_t i = 0; i < frame_dimensions.num_groups; i++) { + auto input_buffers = pipeline->GetInputBuffers(i, 0); + FillPlane(0.0f, input_buffers.GetBuffer(0).first, + input_buffers.GetBuffer(0).second); + input_buffers.Done(); + } + + EXPECT_EQ(pipeline->PassesWithAllInput(), 1); +} + +TEST(RenderPipelineTest, BuildFast) { + RenderPipeline::Builder builder(/*num_c=*/1); + builder.AddStage(jxl::make_unique<UpsampleXSlowStage>()); + builder.AddStage(jxl::make_unique<UpsampleYSlowStage>()); + builder.AddStage(jxl::make_unique<Check0FinalStage>()); + FrameDimensions frame_dimensions; + frame_dimensions.Set(/*xsize=*/1024, /*ysize=*/1024, /*group_size_shift=*/0, + /*max_hshift=*/0, /*max_vshift=*/0, + /*modular_mode=*/false, /*upsampling=*/1); + std::move(builder).Finalize(frame_dimensions); +} + +TEST(RenderPipelineTest, CallAllGroupsFast) { + RenderPipeline::Builder builder(/*num_c=*/1); + builder.AddStage(jxl::make_unique<UpsampleXSlowStage>()); + builder.AddStage(jxl::make_unique<UpsampleYSlowStage>()); + builder.AddStage(jxl::make_unique<Check0FinalStage>()); + builder.UseSimpleImplementation(); + FrameDimensions frame_dimensions; + frame_dimensions.Set(/*xsize=*/1024, /*ysize=*/1024, /*group_size_shift=*/0, + /*max_hshift=*/0, /*max_vshift=*/0, + /*modular_mode=*/false, /*upsampling=*/1); + auto pipeline = std::move(builder).Finalize(frame_dimensions); + ASSERT_TRUE(pipeline->PrepareForThreads(1, /*use_group_ids=*/false)); + + for (size_t i = 0; i < frame_dimensions.num_groups; i++) { + auto input_buffers = pipeline->GetInputBuffers(i, 0); + FillPlane(0.0f, input_buffers.GetBuffer(0).first, + input_buffers.GetBuffer(0).second); + input_buffers.Done(); + } + + EXPECT_EQ(pipeline->PassesWithAllInput(), 1); +} + +struct RenderPipelineTestInputSettings { + // Input image. + std::string input_path; + size_t xsize, ysize; + bool jpeg_transcode = false; + // Encoding settings. + CompressParams cparams; + // Short name for the encoder settings. 
+ std::string cparams_descr; + + bool add_spot_color = false; + + Splines splines; +}; + +class RenderPipelineTestParam + : public ::testing::TestWithParam<RenderPipelineTestInputSettings> {}; + +TEST_P(RenderPipelineTestParam, PipelineTest) { + RenderPipelineTestInputSettings config = GetParam(); + + // Use a parallel runner that randomly shuffles tasks to detect possible + // border handling bugs. + FakeParallelRunner fake_pool(/*order_seed=*/123, /*num_threads=*/8); + ThreadPool pool(&JxlFakeParallelRunner, &fake_pool); + const std::vector<uint8_t> orig = jxl::test::ReadTestData(config.input_path); + + CodecInOut io; + if (config.jpeg_transcode) { + ASSERT_TRUE(jpeg::DecodeImageJPG(Bytes(orig), &io)); + } else { + ASSERT_TRUE(SetFromBytes(Bytes(orig), &io, &pool)); + } + io.ShrinkTo(config.xsize, config.ysize); + + if (config.add_spot_color) { + jxl::ImageF spot(config.xsize, config.ysize); + jxl::ZeroFillImage(&spot); + + for (size_t y = 0; y < config.ysize; y++) { + float* JXL_RESTRICT row = spot.Row(y); + for (size_t x = 0; x < config.xsize; x++) { + row[x] = ((x ^ y) & 255) * (1.f / 255.f); + } + } + ExtraChannelInfo info; + info.bit_depth.bits_per_sample = 8; + info.dim_shift = 0; + info.type = jxl::ExtraChannel::kSpotColor; + info.spot_color[0] = 0.5f; + info.spot_color[1] = 0.2f; + info.spot_color[2] = 1.f; + info.spot_color[3] = 0.5f; + + io.metadata.m.extra_channel_info.push_back(info); + std::vector<jxl::ImageF> ec; + ec.push_back(std::move(spot)); + io.frames[0].SetExtraChannels(std::move(ec)); + } + + std::vector<uint8_t> compressed; + + config.cparams.custom_splines = config.splines; + ASSERT_TRUE(test::EncodeFile(config.cparams, &io, &compressed, &pool)); + + CodecInOut io_default; + ASSERT_TRUE(DecodeFile(Bytes(compressed), + /*use_slow_pipeline=*/false, &io_default, &pool)); + CodecInOut io_slow_pipeline; + ASSERT_TRUE(DecodeFile(Bytes(compressed), + /*use_slow_pipeline=*/true, &io_slow_pipeline, &pool)); + + ASSERT_EQ(io_default.frames.size(), io_slow_pipeline.frames.size()); + for (size_t i = 0; i < io_default.frames.size(); i++) { +#if JXL_HIGH_PRECISION + constexpr float kMaxError = 5e-5; +#else + constexpr float kMaxError = 5e-4; +#endif + Image3F def = std::move(*io_default.frames[i].color()); + Image3F pip = std::move(*io_slow_pipeline.frames[i].color()); + JXL_ASSERT_OK(VerifyRelativeError(pip, def, kMaxError, kMaxError, _)); + for (size_t ec = 0; ec < io_default.frames[i].extra_channels().size(); + ec++) { + JXL_ASSERT_OK(VerifyRelativeError( + io_slow_pipeline.frames[i].extra_channels()[ec], + io_default.frames[i].extra_channels()[ec], kMaxError, kMaxError, _)); + } + } +} + +Splines CreateTestSplines() { + const ColorCorrelationMap cmap; + std::vector<Spline::Point> control_points{{9, 54}, {118, 159}, {97, 3}, + {10, 40}, {150, 25}, {120, 300}}; + const Spline spline{ + control_points, + /*color_dct=*/ + {{0.03125f, 0.00625f, 0.003125f}, {1.f, 0.321875f}, {1.f, 0.24375f}}, + /*sigma_dct=*/{0.3125f, 0.f, 0.f, 0.0625f}}; + std::vector<Spline> spline_data = {spline}; + std::vector<QuantizedSpline> quantized_splines; + std::vector<Spline::Point> starting_points; + for (const Spline& spline : spline_data) { + quantized_splines.emplace_back(spline, /*quantization_adjustment=*/0, + cmap.YtoXRatio(0), cmap.YtoBRatio(0)); + starting_points.push_back(spline.control_points.front()); + } + return Splines(/*quantization_adjustment=*/0, std::move(quantized_splines), + std::move(starting_points)); +} + +std::vector<RenderPipelineTestInputSettings> GeneratePipelineTests() { + 
std::vector<RenderPipelineTestInputSettings> all_tests; + + std::pair<size_t, size_t> sizes[] = { + {3, 8}, {128, 128}, {256, 256}, {258, 258}, {533, 401}, {777, 777}, + }; + + for (auto size : sizes) { + RenderPipelineTestInputSettings settings; + settings.input_path = "jxl/flower/flower.png"; + settings.xsize = size.first; + settings.ysize = size.second; + + // Base settings. + settings.cparams.butteraugli_distance = 1.0; + settings.cparams.patches = Override::kOff; + settings.cparams.dots = Override::kOff; + settings.cparams.gaborish = Override::kOff; + settings.cparams.epf = 0; + settings.cparams.color_transform = ColorTransform::kXYB; + + { + auto s = settings; + s.cparams_descr = "NoGabNoEpfNoPatches"; + all_tests.push_back(s); + } + + { + auto s = settings; + s.cparams.color_transform = ColorTransform::kNone; + s.cparams_descr = "NoGabNoEpfNoPatchesNoXYB"; + all_tests.push_back(s); + } + + { + auto s = settings; + s.cparams.gaborish = Override::kOn; + s.cparams_descr = "GabNoEpfNoPatches"; + all_tests.push_back(s); + } + + { + auto s = settings; + s.cparams.epf = 1; + s.cparams_descr = "NoGabEpf1NoPatches"; + all_tests.push_back(s); + } + + { + auto s = settings; + s.cparams.epf = 2; + s.cparams_descr = "NoGabEpf2NoPatches"; + all_tests.push_back(s); + } + + { + auto s = settings; + s.cparams.epf = 3; + s.cparams_descr = "NoGabEpf3NoPatches"; + all_tests.push_back(s); + } + + { + auto s = settings; + s.cparams.gaborish = Override::kOn; + s.cparams.epf = 3; + s.cparams_descr = "GabEpf3NoPatches"; + all_tests.push_back(s); + } + + { + auto s = settings; + s.cparams_descr = "Splines"; + s.splines = CreateTestSplines(); + all_tests.push_back(s); + } + + for (size_t ups : {2, 4, 8}) { + { + auto s = settings; + s.cparams.resampling = ups; + s.cparams_descr = "Ups" + std::to_string(ups); + all_tests.push_back(s); + } + { + auto s = settings; + s.cparams.resampling = ups; + s.cparams.epf = 1; + s.cparams_descr = "Ups" + std::to_string(ups) + "EPF1"; + all_tests.push_back(s); + } + { + auto s = settings; + s.cparams.resampling = ups; + s.cparams.gaborish = Override::kOn; + s.cparams.epf = 1; + s.cparams_descr = "Ups" + std::to_string(ups) + "GabEPF1"; + all_tests.push_back(s); + } + } + + { + auto s = settings; + s.cparams_descr = "Noise"; + s.cparams.photon_noise_iso = 3200; + all_tests.push_back(s); + } + + { + auto s = settings; + s.cparams_descr = "NoiseUps"; + s.cparams.photon_noise_iso = 3200; + s.cparams.resampling = 2; + all_tests.push_back(s); + } + + { + auto s = settings; + s.cparams_descr = "ModularLossless"; + s.cparams.modular_mode = true; + s.cparams.butteraugli_distance = 0; + all_tests.push_back(s); + } + + { + auto s = settings; + s.cparams_descr = "ProgressiveDC"; + s.cparams.progressive_dc = 1; + all_tests.push_back(s); + } + + { + auto s = settings; + s.cparams_descr = "ModularLossy"; + s.cparams.modular_mode = true; + s.cparams.butteraugli_distance = 1.f; + all_tests.push_back(s); + } + + { + auto s = settings; + s.input_path = "jxl/flower/flower_alpha.png"; + s.cparams_descr = "AlphaVarDCT"; + all_tests.push_back(s); + } + + { + auto s = settings; + s.input_path = "jxl/flower/flower_alpha.png"; + s.cparams_descr = "AlphaVarDCTUpsamplingEPF"; + s.cparams.epf = 1; + s.cparams.ec_resampling = 2; + all_tests.push_back(s); + } + + { + auto s = settings; + s.cparams.modular_mode = true; + s.cparams.butteraugli_distance = 0; + s.input_path = "jxl/flower/flower_alpha.png"; + s.cparams_descr = "AlphaLossless"; + all_tests.push_back(s); + } + + { + auto s = settings; + 
s.input_path = "jxl/flower/flower_alpha.png"; + s.cparams_descr = "AlphaDownsample"; + s.cparams.ec_resampling = 2; + all_tests.push_back(s); + } + + { + auto s = settings; + s.cparams_descr = "SpotColor"; + s.add_spot_color = true; + all_tests.push_back(s); + } + } + +#if JPEGXL_ENABLE_TRANSCODE_JPEG + for (const char* input : {"jxl/flower/flower.png.im_q85_444.jpg", + "jxl/flower/flower.png.im_q85_420.jpg", + "jxl/flower/flower.png.im_q85_422.jpg", + "jxl/flower/flower.png.im_q85_440.jpg"}) { + RenderPipelineTestInputSettings settings; + settings.input_path = input; + settings.jpeg_transcode = true; + settings.xsize = 2268; + settings.ysize = 1512; + settings.cparams_descr = "Default"; + all_tests.push_back(settings); + } + +#endif + + { + RenderPipelineTestInputSettings settings; + settings.input_path = "jxl/grayscale_patches.png"; + settings.xsize = 1011; + settings.ysize = 277; + settings.cparams_descr = "Patches"; + all_tests.push_back(settings); + } + + { + RenderPipelineTestInputSettings settings; + settings.input_path = "jxl/grayscale_patches.png"; + settings.xsize = 1011; + settings.ysize = 277; + settings.cparams.photon_noise_iso = 1000; + settings.cparams_descr = "PatchesAndNoise"; + all_tests.push_back(settings); + } + + { + RenderPipelineTestInputSettings settings; + settings.input_path = "jxl/grayscale_patches.png"; + settings.xsize = 1011; + settings.ysize = 277; + settings.cparams.resampling = 2; + settings.cparams_descr = "PatchesAndUps2"; + all_tests.push_back(settings); + } + + return all_tests; +} + +std::ostream& operator<<(std::ostream& os, + const RenderPipelineTestInputSettings& c) { + std::string filename; + size_t pos = c.input_path.find_last_of('/'); + if (pos == std::string::npos) { + filename = c.input_path; + } else { + filename = c.input_path.substr(pos + 1); + } + std::replace_if( + filename.begin(), filename.end(), [](char c) { return !isalnum(c); }, + '_'); + os << filename << "_" << (c.jpeg_transcode ? 
"JPEG_" : "") << c.xsize << "x" + << c.ysize << "_" << c.cparams_descr; + return os; +} + +std::string PipelineTestDescription( + const testing::TestParamInfo<RenderPipelineTestParam::ParamType>& info) { + std::stringstream name; + name << info.param; + return name.str(); +} + +JXL_GTEST_INSTANTIATE_TEST_SUITE_P(RenderPipelineTest, RenderPipelineTestParam, + testing::ValuesIn(GeneratePipelineTests()), + PipelineTestDescription); + +TEST(RenderPipelineDecodingTest, Animation) { + FakeParallelRunner fake_pool(/*order_seed=*/123, /*num_threads=*/8); + ThreadPool pool(&JxlFakeParallelRunner, &fake_pool); + + std::vector<uint8_t> compressed = + jxl::test::ReadTestData("jxl/blending/cropped_traffic_light.jxl"); + + CodecInOut io_default; + ASSERT_TRUE(DecodeFile(Bytes(compressed), + /*use_slow_pipeline=*/false, &io_default, &pool)); + CodecInOut io_slow_pipeline; + ASSERT_TRUE(DecodeFile(Bytes(compressed), + /*use_slow_pipeline=*/true, &io_slow_pipeline, &pool)); + + ASSERT_EQ(io_default.frames.size(), io_slow_pipeline.frames.size()); + for (size_t i = 0; i < io_default.frames.size(); i++) { +#if JXL_HIGH_PRECISION + constexpr float kMaxError = 1e-5; +#else + constexpr float kMaxError = 1e-4; +#endif + + Image3F fast_pipeline = std::move(*io_default.frames[i].color()); + Image3F slow_pipeline = std::move(*io_slow_pipeline.frames[i].color()); + JXL_ASSERT_OK(VerifyRelativeError(slow_pipeline, fast_pipeline, kMaxError, + kMaxError, _)) + for (size_t ec = 0; ec < io_default.frames[i].extra_channels().size(); + ec++) { + JXL_ASSERT_OK(VerifyRelativeError( + io_slow_pipeline.frames[i].extra_channels()[ec], + io_default.frames[i].extra_channels()[ec], kMaxError, kMaxError, _)); + } + } +} + +} // namespace +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/simple_render_pipeline.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/simple_render_pipeline.cc new file mode 100644 index 0000000000..4495288860 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/simple_render_pipeline.cc @@ -0,0 +1,266 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "lib/jxl/render_pipeline/simple_render_pipeline.h" + +#include <hwy/base.h> + +#include "lib/jxl/image_ops.h" +#include "lib/jxl/render_pipeline/render_pipeline_stage.h" +#include "lib/jxl/sanitizers.h" + +namespace jxl { + +void SimpleRenderPipeline::PrepareForThreadsInternal(size_t num, + bool use_group_ids) { + if (!channel_data_.empty()) { + return; + } + auto ch_size = [](size_t frame_size, size_t shift) { + return DivCeil(frame_size, 1 << shift) + kRenderPipelineXOffset * 2; + }; + for (size_t c = 0; c < channel_shifts_[0].size(); c++) { + channel_data_.push_back(ImageF( + ch_size(frame_dimensions_.xsize_upsampled, channel_shifts_[0][c].first), + ch_size(frame_dimensions_.ysize_upsampled, + channel_shifts_[0][c].second))); + msan::PoisonImage(channel_data_.back()); + } +} + +Rect SimpleRenderPipeline::MakeChannelRect(size_t group_id, size_t channel) { + size_t base_color_shift = + CeilLog2Nonzero(frame_dimensions_.xsize_upsampled_padded / + frame_dimensions_.xsize_padded); + + const size_t gx = group_id % frame_dimensions_.xsize_groups; + const size_t gy = group_id / frame_dimensions_.xsize_groups; + size_t xgroupdim = (frame_dimensions_.group_dim << base_color_shift) >> + channel_shifts_[0][channel].first; + size_t ygroupdim = (frame_dimensions_.group_dim << base_color_shift) >> + channel_shifts_[0][channel].second; + return Rect( + kRenderPipelineXOffset + gx * xgroupdim, + kRenderPipelineXOffset + gy * ygroupdim, xgroupdim, ygroupdim, + kRenderPipelineXOffset + DivCeil(frame_dimensions_.xsize_upsampled, + 1 << channel_shifts_[0][channel].first), + kRenderPipelineXOffset + + DivCeil(frame_dimensions_.ysize_upsampled, + 1 << channel_shifts_[0][channel].second)); +} + +std::vector<std::pair<ImageF*, Rect>> SimpleRenderPipeline::PrepareBuffers( + size_t group_id, size_t thread_id) { + std::vector<std::pair<ImageF*, Rect>> ret; + for (size_t c = 0; c < channel_data_.size(); c++) { + ret.emplace_back(&channel_data_[c], MakeChannelRect(group_id, c)); + } + return ret; +} + +void SimpleRenderPipeline::ProcessBuffers(size_t group_id, size_t thread_id) { + for (size_t c = 0; c < channel_data_.size(); c++) { + Rect r = MakeChannelRect(group_id, c); + (void)r; + JXL_CHECK_PLANE_INITIALIZED(channel_data_[c], r, c); + } + + if (PassesWithAllInput() <= processed_passes_) return; + processed_passes_++; + + for (size_t stage_id = 0; stage_id < stages_.size(); stage_id++) { + const auto& stage = stages_[stage_id]; + // Prepare buffers for kInOut channels. + std::vector<ImageF> new_channels(channel_data_.size()); + std::vector<ImageF*> output_channels(channel_data_.size()); + + std::vector<std::pair<size_t, size_t>> input_sizes(channel_data_.size()); + for (size_t c = 0; c < channel_data_.size(); c++) { + input_sizes[c] = + std::make_pair(channel_data_[c].xsize() - kRenderPipelineXOffset * 2, + channel_data_[c].ysize() - kRenderPipelineXOffset * 2); + } + + for (size_t c = 0; c < channel_data_.size(); c++) { + if (stage->GetChannelMode(c) != RenderPipelineChannelMode::kInOut) { + continue; + } + // Ensure that the newly allocated channels are large enough to avoid + // problems with padding. 
+ new_channels[c] = + ImageF(frame_dimensions_.xsize_upsampled_padded + + kRenderPipelineXOffset * 2 + hwy::kMaxVectorSize * 8, + frame_dimensions_.ysize_upsampled_padded + + kRenderPipelineXOffset * 2); + new_channels[c].ShrinkTo( + (input_sizes[c].first << stage->settings_.shift_x) + + kRenderPipelineXOffset * 2, + (input_sizes[c].second << stage->settings_.shift_y) + + kRenderPipelineXOffset * 2); + output_channels[c] = &new_channels[c]; + } + + auto get_row = [&](size_t c, int64_t y) { + return channel_data_[c].Row(kRenderPipelineXOffset + y) + + kRenderPipelineXOffset; + }; + + // Add mirrored pixes to all kInOut channels. + for (size_t c = 0; c < channel_data_.size(); c++) { + if (stage->GetChannelMode(c) != RenderPipelineChannelMode::kInOut) { + continue; + } + // Horizontal mirroring. + for (size_t y = 0; y < input_sizes[c].second; y++) { + float* row = get_row(c, y); + for (size_t ix = 0; ix < stage->settings_.border_x; ix++) { + *(row - ix - 1) = row[Mirror(-ssize_t(ix) - 1, input_sizes[c].first)]; + } + for (size_t ix = 0; ix < stage->settings_.border_x; ix++) { + *(row + ix + input_sizes[c].first) = + row[Mirror(ix + input_sizes[c].first, input_sizes[c].first)]; + } + } + // Vertical mirroring. + for (int y = 0; y < static_cast<int>(stage->settings_.border_y); y++) { + memcpy(get_row(c, -y - 1) - stage->settings_.border_x, + get_row(c, Mirror(-ssize_t(y) - 1, input_sizes[c].second)) - + stage->settings_.border_x, + sizeof(float) * + (input_sizes[c].first + 2 * stage->settings_.border_x)); + } + for (int y = 0; y < static_cast<int>(stage->settings_.border_y); y++) { + memcpy( + get_row(c, input_sizes[c].second + y) - stage->settings_.border_x, + get_row(c, + Mirror(input_sizes[c].second + y, input_sizes[c].second)) - + stage->settings_.border_x, + sizeof(float) * + (input_sizes[c].first + 2 * stage->settings_.border_x)); + } + } + + size_t ysize = 0; + size_t xsize = 0; + for (size_t c = 0; c < channel_data_.size(); c++) { + if (stage->GetChannelMode(c) == RenderPipelineChannelMode::kIgnored) { + continue; + } + ysize = std::max(input_sizes[c].second, ysize); + xsize = std::max(input_sizes[c].first, xsize); + } + + JXL_ASSERT(ysize != 0); + JXL_ASSERT(xsize != 0); + + RenderPipelineStage::RowInfo input_rows(channel_data_.size()); + RenderPipelineStage::RowInfo output_rows(channel_data_.size()); + + // Run the pipeline. + { + stage->SetInputSizes(input_sizes); + int border_y = stage->settings_.border_y; + for (size_t y = 0; y < ysize; y++) { + // Prepare input rows. + for (size_t c = 0; c < channel_data_.size(); c++) { + if (stage->GetChannelMode(c) == RenderPipelineChannelMode::kIgnored) { + continue; + } + input_rows[c].resize(2 * border_y + 1); + for (int iy = -border_y; iy <= border_y; iy++) { + input_rows[c][iy + border_y] = + channel_data_[c].Row(y + kRenderPipelineXOffset + iy); + } + } + // Prepare output rows. + for (size_t c = 0; c < channel_data_.size(); c++) { + if (!output_channels[c]) continue; + output_rows[c].resize(1 << stage->settings_.shift_y); + for (size_t iy = 0; iy < output_rows[c].size(); iy++) { + output_rows[c][iy] = output_channels[c]->Row( + (y << stage->settings_.shift_y) + iy + kRenderPipelineXOffset); + } + } + stage->ProcessRow(input_rows, output_rows, /*xextra=*/0, xsize, + /*xpos=*/0, y, thread_id); + } + } + + // Move new channels to current channels. 
+ for (size_t c = 0; c < channel_data_.size(); c++) { + if (stage->GetChannelMode(c) != RenderPipelineChannelMode::kInOut) { + continue; + } + channel_data_[c] = std::move(new_channels[c]); + } + for (size_t c = 0; c < channel_data_.size(); c++) { + size_t next_stage = std::min(stage_id + 1, channel_shifts_.size() - 1); + size_t xsize = DivCeil(frame_dimensions_.xsize_upsampled, + 1 << channel_shifts_[next_stage][c].first); + size_t ysize = DivCeil(frame_dimensions_.ysize_upsampled, + 1 << channel_shifts_[next_stage][c].second); + channel_data_[c].ShrinkTo(xsize + 2 * kRenderPipelineXOffset, + ysize + 2 * kRenderPipelineXOffset); + JXL_CHECK_PLANE_INITIALIZED( + channel_data_[c], + Rect(kRenderPipelineXOffset, kRenderPipelineXOffset, xsize, ysize), + c); + } + + if (stage->SwitchToImageDimensions()) { + size_t image_xsize, image_ysize; + FrameOrigin frame_origin; + stage->GetImageDimensions(&image_xsize, &image_ysize, &frame_origin); + frame_dimensions_.Set(image_xsize, image_ysize, 0, 0, 0, false, 1); + std::vector<ImageF> old_channels = std::move(channel_data_); + channel_data_.clear(); + channel_data_.reserve(old_channels.size()); + for (size_t c = 0; c < old_channels.size(); c++) { + channel_data_.emplace_back(2 * kRenderPipelineXOffset + image_xsize, + 2 * kRenderPipelineXOffset + image_ysize); + } + for (size_t y = 0; y < image_ysize; ++y) { + for (size_t c = 0; c < channel_data_.size(); c++) { + output_rows[c].resize(1); + output_rows[c][0] = channel_data_[c].Row(kRenderPipelineXOffset + y); + } + // TODO(sboukortt): consider doing this only on the parts of the + // background that won't be occluded. + stage->ProcessPaddingRow(output_rows, image_xsize, 0, y); + } + ssize_t x0 = frame_origin.x0; + ssize_t y0 = frame_origin.y0; + size_t x0_fg = 0; + size_t y0_fg = 0; + if (x0 < 0) { + xsize += x0; + x0_fg -= x0; + x0 = 0; + } + if (x0 + xsize > image_xsize) { + xsize = image_xsize - x0; + } + if (y0 < 0) { + ysize += y0; + y0_fg -= x0; + y0 = 0; + } + if (y0 + ysize > image_ysize) { + ysize = image_ysize - y0; + } + const Rect rect_fg_relative_to_image = + Rect(x0, y0, xsize, ysize) + .Translate(kRenderPipelineXOffset, kRenderPipelineXOffset); + const Rect rect_fg = + Rect(x0_fg, y0_fg, xsize, ysize) + .Translate(kRenderPipelineXOffset, kRenderPipelineXOffset); + for (size_t c = 0; c < channel_data_.size(); c++) { + CopyImageTo(rect_fg, old_channels[c], rect_fg_relative_to_image, + &channel_data_[c]); + } + } + } +} +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/simple_render_pipeline.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/simple_render_pipeline.h new file mode 100644 index 0000000000..10f4505912 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/simple_render_pipeline.h @@ -0,0 +1,37 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_RENDER_PIPELINE_SIMPLE_RENDER_PIPELINE_H_ +#define LIB_JXL_RENDER_PIPELINE_SIMPLE_RENDER_PIPELINE_H_ + +#include <stdint.h> + +#include "lib/jxl/render_pipeline/render_pipeline.h" + +namespace jxl { + +// A RenderPipeline that is "obviously correct"; it may use potentially large +// amounts of memory and be slow. It is intended to be used mostly for testing +// purposes. 
+class SimpleRenderPipeline : public RenderPipeline { + std::vector<std::pair<ImageF*, Rect>> PrepareBuffers( + size_t group_id, size_t thread_id) override; + + void ProcessBuffers(size_t group_id, size_t thread_id) override; + + void PrepareForThreadsInternal(size_t num, bool use_group_ids) override; + + // Full frame buffers. Both X and Y dimensions are padded by + // kRenderPipelineXOffset. + std::vector<ImageF> channel_data_; + size_t processed_passes_ = 0; + + private: + Rect MakeChannelRect(size_t group_id, size_t channel); +}; + +} // namespace jxl + +#endif // LIB_JXL_RENDER_PIPELINE_SIMPLE_RENDER_PIPELINE_H_ diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_blending.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_blending.cc new file mode 100644 index 0000000000..b68105f4c9 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_blending.cc @@ -0,0 +1,250 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/render_pipeline/stage_blending.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_blending.cc" +#include <hwy/foreach_target.h> +#include <hwy/highway.h> + +#include "lib/jxl/base/printf_macros.h" +#include "lib/jxl/blending.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +class BlendingStage : public RenderPipelineStage { + public: + explicit BlendingStage(const FrameHeader& frame_header, + const PassesDecoderState* dec_state, + const ColorEncoding& frame_color_encoding) + : RenderPipelineStage(RenderPipelineStage::Settings()), + frame_header_(frame_header), + state_(*dec_state->shared) { + image_xsize_ = frame_header_.nonserialized_metadata->xsize(); + image_ysize_ = frame_header_.nonserialized_metadata->ysize(); + extra_channel_info_ = + &frame_header_.nonserialized_metadata->m.extra_channel_info; + info_ = frame_header_.blending_info; + const std::vector<BlendingInfo>& ec_info = + frame_header_.extra_channel_blending_info; + const ImageBundle& bg = state_.reference_frames[info_.source].frame; + bg_ = &bg; + if (bg.xsize() == 0 || bg.ysize() == 0) { + zeroes_.resize(image_xsize_, 0.f); + } else if (state_.reference_frames[info_.source].ib_is_in_xyb) { + initialized_ = JXL_FAILURE( + "Trying to blend XYB reference frame %i and non-XYB frame", + info_.source); + return; + } else if (std::any_of(ec_info.begin(), ec_info.end(), + [this](const BlendingInfo& info) { + const ImageBundle& bg = + state_.reference_frames[info.source].frame; + return bg.xsize() == 0 || bg.ysize() == 0; + })) { + zeroes_.resize(image_xsize_, 0.f); + } + + auto verify_bg_size = [&](const ImageBundle& bg) -> Status { + if (bg.xsize() != 0 && bg.ysize() != 0 && + (bg.xsize() < image_xsize_ || bg.ysize() < image_ysize_ || + bg.origin.x0 != 0 || bg.origin.y0 != 0)) { + return JXL_FAILURE("Trying to use a %" PRIuS "x%" PRIuS + " crop as a background", + bg.xsize(), bg.ysize()); + } + return true; + }; + + Status ok = verify_bg_size(bg); + for (const auto& info : ec_info) { + const ImageBundle& bg = state_.reference_frames[info.source].frame; + if (!!ok) ok = verify_bg_size(bg); + } + if (!ok) { + initialized_ = ok; + return; + } + + if (state_.metadata->m.xyb_encoded) { + if (!dec_state->output_encoding_info.color_encoding_is_original) { + initialized_ = JXL_FAILURE("Blending in unsupported color space"); + return; + } + } + + blending_info_.resize(ec_info.size() 
+ 1); + auto make_blending = [&](const BlendingInfo& info, PatchBlending* pb) { + pb->alpha_channel = info.alpha_channel; + pb->clamp = info.clamp; + switch (info.mode) { + case BlendMode::kReplace: { + pb->mode = PatchBlendMode::kReplace; + break; + } + case BlendMode::kAdd: { + pb->mode = PatchBlendMode::kAdd; + break; + } + case BlendMode::kMul: { + pb->mode = PatchBlendMode::kMul; + break; + } + case BlendMode::kBlend: { + pb->mode = PatchBlendMode::kBlendAbove; + break; + } + case BlendMode::kAlphaWeightedAdd: { + pb->mode = PatchBlendMode::kAlphaWeightedAddAbove; + break; + } + default: { + JXL_UNREACHABLE( + "Invalid blend mode"); // should have failed to decode + } + } + }; + make_blending(info_, &blending_info_[0]); + for (size_t i = 0; i < ec_info.size(); i++) { + make_blending(ec_info[i], &blending_info_[1 + i]); + } + } + + Status IsInitialized() const override { return initialized_; } + + void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, + size_t xextra, size_t xsize, size_t xpos, size_t ypos, + size_t thread_id) const final { + JXL_ASSERT(initialized_); + const FrameOrigin& frame_origin = frame_header_.frame_origin; + ssize_t bg_xpos = frame_origin.x0 + static_cast<ssize_t>(xpos); + ssize_t bg_ypos = frame_origin.y0 + static_cast<ssize_t>(ypos); + int offset = 0; + if (bg_xpos + static_cast<ssize_t>(xsize) <= 0 || + frame_origin.x0 >= static_cast<ssize_t>(image_xsize_) || bg_ypos < 0 || + bg_ypos >= static_cast<ssize_t>(image_ysize_)) { + return; + } + if (bg_xpos < 0) { + offset -= bg_xpos; + xsize += bg_xpos; + bg_xpos = 0; + } + if (bg_xpos + xsize > image_xsize_) { + xsize = + std::max<ssize_t>(0, static_cast<ssize_t>(image_xsize_) - bg_xpos); + } + std::vector<const float*> bg_row_ptrs_(input_rows.size()); + std::vector<float*> fg_row_ptrs_(input_rows.size()); + size_t num_c = std::min(input_rows.size(), extra_channel_info_->size() + 3); + for (size_t c = 0; c < num_c; ++c) { + fg_row_ptrs_[c] = GetInputRow(input_rows, c, 0) + offset; + if (c < 3) { + bg_row_ptrs_[c] = bg_->xsize() != 0 && bg_->ysize() != 0 + ? bg_->color().ConstPlaneRow(c, bg_ypos) + bg_xpos + : zeroes_.data(); + } else { + const ImageBundle& ec_bg = + state_ + .reference_frames + [frame_header_.extra_channel_blending_info[c - 3].source] + .frame; + bg_row_ptrs_[c] = + ec_bg.xsize() != 0 && ec_bg.ysize() != 0 + ? 
ec_bg.extra_channels()[c - 3].ConstRow(bg_ypos) + bg_xpos + : zeroes_.data(); + } + } + PerformBlending(bg_row_ptrs_.data(), fg_row_ptrs_.data(), + fg_row_ptrs_.data(), 0, xsize, blending_info_[0], + blending_info_.data() + 1, *extra_channel_info_); + } + + RenderPipelineChannelMode GetChannelMode(size_t c) const final { + return RenderPipelineChannelMode::kInPlace; + } + + bool SwitchToImageDimensions() const override { return true; } + + void GetImageDimensions(size_t* xsize, size_t* ysize, + FrameOrigin* frame_origin) const override { + *xsize = image_xsize_; + *ysize = image_ysize_; + *frame_origin = frame_header_.frame_origin; + } + + void ProcessPaddingRow(const RowInfo& output_rows, size_t xsize, size_t xpos, + size_t ypos) const override { + if (bg_->xsize() == 0 || bg_->ysize() == 0) { + for (size_t c = 0; c < 3; ++c) { + memset(GetInputRow(output_rows, c, 0), 0, xsize * sizeof(float)); + } + } else { + for (size_t c = 0; c < 3; ++c) { + memcpy(GetInputRow(output_rows, c, 0), + bg_->color().ConstPlaneRow(c, ypos) + xpos, + xsize * sizeof(float)); + } + } + for (size_t ec = 0; ec < extra_channel_info_->size(); ++ec) { + const ImageBundle& ec_bg = + state_ + .reference_frames[frame_header_.extra_channel_blending_info[ec] + .source] + .frame; + if (ec_bg.xsize() == 0 || ec_bg.ysize() == 0) { + memset(GetInputRow(output_rows, 3 + ec, 0), 0, xsize * sizeof(float)); + } else { + memcpy(GetInputRow(output_rows, 3 + ec, 0), + ec_bg.extra_channels()[ec].ConstRow(ypos) + xpos, + xsize * sizeof(float)); + } + } + } + + const char* GetName() const override { return "Blending"; } + + private: + const FrameHeader& frame_header_; + const PassesSharedState& state_; + BlendingInfo info_; + const ImageBundle* bg_; + Status initialized_ = true; + size_t image_xsize_; + size_t image_ysize_; + std::vector<PatchBlending> blending_info_; + const std::vector<ExtraChannelInfo>* extra_channel_info_; + std::vector<float> zeroes_; +}; + +std::unique_ptr<RenderPipelineStage> GetBlendingStage( + const FrameHeader& frame_header, const PassesDecoderState* dec_state, + const ColorEncoding& frame_color_encoding) { + return jxl::make_unique<BlendingStage>(frame_header, dec_state, + frame_color_encoding); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +HWY_EXPORT(GetBlendingStage); + +std::unique_ptr<RenderPipelineStage> GetBlendingStage( + const FrameHeader& frame_header, const PassesDecoderState* dec_state, + const ColorEncoding& frame_color_encoding) { + return HWY_DYNAMIC_DISPATCH(GetBlendingStage)(frame_header, dec_state, + frame_color_encoding); +} + +} // namespace jxl +#endif diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_blending.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_blending.h new file mode 100644 index 0000000000..aedc8c2e99 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_blending.h @@ -0,0 +1,25 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_BLENDING_H_ +#define LIB_JXL_RENDER_PIPELINE_STAGE_BLENDING_H_ + +#include <memory> + +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/dec_cache.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/render_pipeline/render_pipeline_stage.h" + +namespace jxl { + +// Applies blending if applicable. 
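For intuition, the kBlendAbove-style modes mapped above are built on the usual non-premultiplied "source over" composite. The scalar sketch below (with invented names) covers only that one case; the real PerformBlending call additionally handles clamping, premultiplied alpha, the selected alpha channel, and the extra channels.

// Illustrative sketch, not part of the upstream sources.
// Composites one non-premultiplied RGBA foreground sample over one background
// sample ("source over").
void BlendSampleOver(const float* bg_rgba, const float* fg_rgba,
                     float* out_rgba) {
  const float fa = fg_rgba[3];
  const float ba = bg_rgba[3];
  const float new_a = fa + ba * (1.0f - fa);
  for (int c = 0; c < 3; ++c) {
    const float num = fg_rgba[c] * fa + bg_rgba[c] * ba * (1.0f - fa);
    // Un-premultiply by the resulting alpha; fall back to 0 for fully
    // transparent output.
    out_rgba[c] = new_a > 0.0f ? num / new_a : 0.0f;
  }
  out_rgba[3] = new_a;
}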
+std::unique_ptr<RenderPipelineStage> GetBlendingStage( + const FrameHeader& frame_header, const PassesDecoderState* dec_state, + const ColorEncoding& frame_color_encoding); + +} // namespace jxl + +#endif // LIB_JXL_RENDER_PIPELINE_STAGE_BLENDING_H_ diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_chroma_upsampling.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_chroma_upsampling.cc new file mode 100644 index 0000000000..936fbd3a44 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_chroma_upsampling.cc @@ -0,0 +1,127 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/render_pipeline/stage_chroma_upsampling.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_chroma_upsampling.cc" +#include <hwy/foreach_target.h> +#include <hwy/highway.h> + +#include "lib/jxl/simd_util-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Mul; +using hwy::HWY_NAMESPACE::MulAdd; + +class HorizontalChromaUpsamplingStage : public RenderPipelineStage { + public: + explicit HorizontalChromaUpsamplingStage(size_t channel) + : RenderPipelineStage(RenderPipelineStage::Settings::ShiftX( + /*shift=*/1, /*border=*/1)), + c_(channel) {} + + void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, + size_t xextra, size_t xsize, size_t xpos, size_t ypos, + size_t thread_id) const final { + HWY_FULL(float) df; + xextra = RoundUpTo(xextra, Lanes(df)); + auto threefour = Set(df, 0.75f); + auto onefour = Set(df, 0.25f); + const float* row_in = GetInputRow(input_rows, c_, 0); + float* row_out = GetOutputRow(output_rows, c_, 0); + for (ssize_t x = -xextra; x < static_cast<ssize_t>(xsize + xextra); + x += Lanes(df)) { + auto current = Mul(LoadU(df, row_in + x), threefour); + auto prev = LoadU(df, row_in + x - 1); + auto next = LoadU(df, row_in + x + 1); + auto left = MulAdd(onefour, prev, current); + auto right = MulAdd(onefour, next, current); + StoreInterleaved(df, left, right, row_out + x * 2); + } + } + + RenderPipelineChannelMode GetChannelMode(size_t c) const final { + return c == c_ ? 
RenderPipelineChannelMode::kInOut + : RenderPipelineChannelMode::kIgnored; + } + + const char* GetName() const override { return "HChromaUps"; } + + private: + size_t c_; +}; + +class VerticalChromaUpsamplingStage : public RenderPipelineStage { + public: + explicit VerticalChromaUpsamplingStage(size_t channel) + : RenderPipelineStage(RenderPipelineStage::Settings::ShiftY( + /*shift=*/1, /*border=*/1)), + c_(channel) {} + + void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, + size_t xextra, size_t xsize, size_t xpos, size_t ypos, + size_t thread_id) const final { + HWY_FULL(float) df; + xextra = RoundUpTo(xextra, Lanes(df)); + auto threefour = Set(df, 0.75f); + auto onefour = Set(df, 0.25f); + const float* row_top = GetInputRow(input_rows, c_, -1); + const float* row_mid = GetInputRow(input_rows, c_, 0); + const float* row_bot = GetInputRow(input_rows, c_, 1); + float* row_out0 = GetOutputRow(output_rows, c_, 0); + float* row_out1 = GetOutputRow(output_rows, c_, 1); + for (ssize_t x = -xextra; x < static_cast<ssize_t>(xsize + xextra); + x += Lanes(df)) { + auto it = LoadU(df, row_top + x); + auto im = LoadU(df, row_mid + x); + auto ib = LoadU(df, row_bot + x); + auto im_scaled = Mul(im, threefour); + Store(MulAdd(it, onefour, im_scaled), df, row_out0 + x); + Store(MulAdd(ib, onefour, im_scaled), df, row_out1 + x); + } + } + + RenderPipelineChannelMode GetChannelMode(size_t c) const final { + return c == c_ ? RenderPipelineChannelMode::kInOut + : RenderPipelineChannelMode::kIgnored; + } + + const char* GetName() const override { return "VChromaUps"; } + + private: + size_t c_; +}; + +std::unique_ptr<RenderPipelineStage> GetChromaUpsamplingStage(size_t channel, + bool horizontal) { + if (horizontal) { + return jxl::make_unique<HorizontalChromaUpsamplingStage>(channel); + } else { + return jxl::make_unique<VerticalChromaUpsamplingStage>(channel); + } +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +HWY_EXPORT(GetChromaUpsamplingStage); + +std::unique_ptr<RenderPipelineStage> GetChromaUpsamplingStage(size_t channel, + bool horizontal) { + return HWY_DYNAMIC_DISPATCH(GetChromaUpsamplingStage)(channel, horizontal); +} + +} // namespace jxl +#endif diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_chroma_upsampling.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_chroma_upsampling.h new file mode 100644 index 0000000000..b4d0cbdfd3 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_chroma_upsampling.h @@ -0,0 +1,26 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_CHROMA_UPSAMPLING_H_ +#define LIB_JXL_RENDER_PIPELINE_STAGE_CHROMA_UPSAMPLING_H_ +#include <math.h> +#include <stdint.h> + +#include <algorithm> +#include <utility> +#include <vector> + +#include "lib/jxl/loop_filter.h" +#include "lib/jxl/render_pipeline/render_pipeline_stage.h" + +namespace jxl { + +// Applies simple upsampling, either horizontal or vertical, to the given +// channel. 
+std::unique_ptr<RenderPipelineStage> GetChromaUpsamplingStage(size_t channel, + bool horizontal); +} // namespace jxl + +#endif // LIB_JXL_RENDER_PIPELINE_STAGE_CHROMA_UPSAMPLING_H_ diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_cms.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_cms.cc new file mode 100644 index 0000000000..2465146b47 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_cms.cc @@ -0,0 +1,134 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/render_pipeline/stage_cms.h" + +#include <memory> + +#include "jxl/cms_interface.h" +#include "jxl/color_encoding.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_xyb.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_cms.cc" +#include <hwy/foreach_target.h> +#include <hwy/highway.h> + +#include "lib/jxl/dec_xyb-inl.h" +#include "lib/jxl/sanitizers.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +class CmsStage : public RenderPipelineStage { + public: + explicit CmsStage(OutputEncodingInfo output_encoding_info) + : RenderPipelineStage(RenderPipelineStage::Settings()), + output_encoding_info_(std::move(output_encoding_info)) { + c_src_ = output_encoding_info_.linear_color_encoding; + } + + bool IsNeeded() const { + const size_t channels_src = (c_src_.IsCMYK() ? 4 : c_src_.Channels()); + const size_t channels_dst = output_encoding_info_.color_encoding.Channels(); + const bool not_mixing_color_and_grey = + (channels_src == channels_dst || + (channels_src == 4 && channels_dst == 3)); + return (output_encoding_info_.cms_set) && + !c_src_.SameColorEncoding(output_encoding_info_.color_encoding) && + not_mixing_color_and_grey; + } + + void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, + size_t xextra, size_t xsize, size_t xpos, size_t ypos, + size_t thread_id) const final { + JXL_ASSERT(xsize == xsize_); + // TODO(firsching): handle grey case seperately + // interleave + float* JXL_RESTRICT row0 = GetInputRow(input_rows, 0, 0); + float* JXL_RESTRICT row1 = GetInputRow(input_rows, 1, 0); + float* JXL_RESTRICT row2 = GetInputRow(input_rows, 2, 0); + float* mutable_buf_src = color_space_transform->BufSrc(thread_id); + + for (size_t x = 0; x < xsize; x++) { + mutable_buf_src[3 * x + 0] = row0[x]; + mutable_buf_src[3 * x + 1] = row1[x]; + mutable_buf_src[3 * x + 2] = row2[x]; + } + const float* buf_src = mutable_buf_src; + float* JXL_RESTRICT buf_dst = color_space_transform->BufDst(thread_id); + if (!color_space_transform->Run(thread_id, buf_src, buf_dst)) { + // TODO(firsching): somehow mark failing here? + return; + } + // de-interleave + for (size_t x = 0; x < xsize; x++) { + row0[x] = buf_dst[3 * x + 0]; + row1[x] = buf_dst[3 * x + 1]; + row2[x] = buf_dst[3 * x + 2]; + } + } + RenderPipelineChannelMode GetChannelMode(size_t c) const final { + return c < 3 ? 
RenderPipelineChannelMode::kInPlace + : RenderPipelineChannelMode::kIgnored; + } + + const char* GetName() const override { return "Cms"; } + + private: + OutputEncodingInfo output_encoding_info_; + size_t xsize_; + std::unique_ptr<jxl::ColorSpaceTransform> color_space_transform; + ColorEncoding c_src_; + + void SetInputSizes( + const std::vector<std::pair<size_t, size_t>>& input_sizes) override { +#if JXL_ENABLE_ASSERT + JXL_ASSERT(input_sizes.size() >= 3); + for (size_t c = 1; c < input_sizes.size(); c++) { + JXL_ASSERT(input_sizes[c].first == input_sizes[0].first); + JXL_ASSERT(input_sizes[c].second == input_sizes[0].second); + } +#endif + xsize_ = input_sizes[0].first; + } + + Status PrepareForThreads(size_t num_threads) override { + color_space_transform = jxl::make_unique<jxl::ColorSpaceTransform>( + output_encoding_info_.color_management_system); + JXL_RETURN_IF_ERROR(color_space_transform->Init( + c_src_, output_encoding_info_.color_encoding, + output_encoding_info_.desired_intensity_target, xsize_, num_threads)); + return true; + } +}; + +std::unique_ptr<RenderPipelineStage> GetCmsStage( + const OutputEncodingInfo& output_encoding_info) { + auto stage = jxl::make_unique<CmsStage>(output_encoding_info); + if (!stage->IsNeeded()) return nullptr; + return stage; +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +HWY_EXPORT(GetCmsStage); + +std::unique_ptr<RenderPipelineStage> GetCmsStage( + const OutputEncodingInfo& output_encoding_info) { + return HWY_DYNAMIC_DISPATCH(GetCmsStage)(output_encoding_info); +} + +} // namespace jxl +#endif diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_cms.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_cms.h new file mode 100644 index 0000000000..23277ae6f7 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_cms.h @@ -0,0 +1,21 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_CMS_H_ +#define LIB_JXL_RENDER_PIPELINE_STAGE_CMS_H_ + +#include <memory> + +#include "lib/jxl/dec_xyb.h" +#include "lib/jxl/render_pipeline/render_pipeline_stage.h" + +namespace jxl { + +std::unique_ptr<RenderPipelineStage> GetCmsStage( + const OutputEncodingInfo& output_encoding_info); + +} // namespace jxl + +#endif // LIB_JXL_RENDER_PIPELINE_STAGE_CMS_H_ diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_epf.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_epf.cc new file mode 100644 index 0000000000..5d1a379ede --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_epf.cc @@ -0,0 +1,526 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
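The CMS stage above converts a row by interleaving the three planar rows into a packed buffer, running the color transform on it, and de-interleaving the result back in place. Here is a standalone sketch of that repacking pattern, with a caller-supplied callback standing in for ColorSpaceTransform::Run; all names are illustrative.

// Illustrative sketch, not part of the upstream sources.
#include <cstddef>
#include <vector>

void TransformRowInterleaved(float* r, float* g, float* b, size_t xsize,
                             void (*run_transform)(const float* in, float* out,
                                                   size_t num_pixels)) {
  std::vector<float> src(3 * xsize);
  std::vector<float> dst(3 * xsize);
  // Interleave the planar samples as RGBRGB... for the transform.
  for (size_t x = 0; x < xsize; ++x) {
    src[3 * x + 0] = r[x];
    src[3 * x + 1] = g[x];
    src[3 * x + 2] = b[x];
  }
  run_transform(src.data(), dst.data(), xsize);
  // De-interleave the transformed samples back into the planar rows.
  for (size_t x = 0; x < xsize; ++x) {
    r[x] = dst[3 * x + 0];
    g[x] = dst[3 * x + 1];
    b[x] = dst[3 * x + 2];
  }
}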
+ +#include "lib/jxl/render_pipeline/stage_epf.h" + +#include "lib/jxl/base/common.h" +#include "lib/jxl/common.h" // JXL_HIGH_PRECISION +#include "lib/jxl/epf.h" +#include "lib/jxl/sanitizers.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_epf.cc" +#include <hwy/foreach_target.h> +#include <hwy/highway.h> + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { +// TODO(veluca): In principle, vectors could be not capped, if we want to deal +// with having two different sigma values in a single vector. +using DF = HWY_CAPPED(float, 8); + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::AbsDiff; +using hwy::HWY_NAMESPACE::Add; +using hwy::HWY_NAMESPACE::Div; +using hwy::HWY_NAMESPACE::Mul; +using hwy::HWY_NAMESPACE::MulAdd; +using hwy::HWY_NAMESPACE::Vec; +using hwy::HWY_NAMESPACE::VFromD; +using hwy::HWY_NAMESPACE::ZeroIfNegative; + +JXL_INLINE Vec<DF> Weight(Vec<DF> sad, Vec<DF> inv_sigma, Vec<DF> thres) { + auto v = MulAdd(sad, inv_sigma, Set(DF(), 1.0f)); + return ZeroIfNegative(v); +} + +// 5x5 plus-shaped kernel with 5 SADs per pixel (3x3 plus-shaped). So this makes +// this filter a 7x7 filter. +class EPF0Stage : public RenderPipelineStage { + public: + EPF0Stage(const LoopFilter& lf, const ImageF& sigma) + : RenderPipelineStage(RenderPipelineStage::Settings::Symmetric( + /*shift=*/0, /*border=*/3)), + lf_(lf), + sigma_(&sigma) {} + + template <bool aligned> + JXL_INLINE void AddPixel(int row, float* JXL_RESTRICT rows[3][7], ssize_t x, + Vec<DF> sad, Vec<DF> inv_sigma, + Vec<DF>* JXL_RESTRICT X, Vec<DF>* JXL_RESTRICT Y, + Vec<DF>* JXL_RESTRICT B, + Vec<DF>* JXL_RESTRICT w) const { + auto cx = aligned ? Load(DF(), rows[0][3 + row] + x) + : LoadU(DF(), rows[0][3 + row] + x); + auto cy = aligned ? Load(DF(), rows[1][3 + row] + x) + : LoadU(DF(), rows[1][3 + row] + x); + auto cb = aligned ? Load(DF(), rows[2][3 + row] + x) + : LoadU(DF(), rows[2][3 + row] + x); + + auto weight = Weight(sad, inv_sigma, Set(DF(), lf_.epf_pass1_zeroflush)); + *w = Add(*w, weight); + *X = MulAdd(weight, cx, *X); + *Y = MulAdd(weight, cy, *Y); + *B = MulAdd(weight, cb, *B); + } + + void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, + size_t xextra, size_t xsize, size_t xpos, size_t ypos, + size_t thread_id) const final { + DF df; + + using V = decltype(Zero(df)); + V t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, tA, tB; + V* sads[12] = {&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7, &t8, &t9, &tA, &tB}; + + xextra = RoundUpTo(xextra, Lanes(df)); + const float* JXL_RESTRICT row_sigma = + sigma_->Row(ypos / kBlockDim + kSigmaPadding); + + float sm = lf_.epf_pass0_sigma_scale * 1.65; + float bsm = sm * lf_.epf_border_sad_mul; + + HWY_ALIGN float sad_mul_center[kBlockDim] = {bsm, sm, sm, sm, + sm, sm, sm, bsm}; + HWY_ALIGN float sad_mul_border[kBlockDim] = {bsm, bsm, bsm, bsm, + bsm, bsm, bsm, bsm}; + float* JXL_RESTRICT rows[3][7]; + for (size_t c = 0; c < 3; c++) { + for (int i = 0; i < 7; i++) { + rows[c][i] = GetInputRow(input_rows, c, i - 3); + } + } + + const float* sad_mul = + (ypos % kBlockDim == 0 || ypos % kBlockDim == kBlockDim - 1) + ? 
sad_mul_border + : sad_mul_center; + + for (ssize_t x = -xextra; x < static_cast<ssize_t>(xsize + xextra); + x += Lanes(df)) { + size_t bx = (x + xpos + kSigmaPadding * kBlockDim) / kBlockDim; + size_t ix = (x + xpos) % kBlockDim; + + if (row_sigma[bx] < kMinSigma) { + for (size_t c = 0; c < 3; c++) { + auto px = Load(df, rows[c][3 + 0] + x); + StoreU(px, df, GetOutputRow(output_rows, c, 0) + x); + } + continue; + } + + const auto sm = Load(df, sad_mul + ix); + const auto inv_sigma = Mul(Set(df, row_sigma[bx]), sm); + + for (size_t i = 0; i < 12; i++) *sads[i] = Zero(df); + constexpr std::array<int, 2> sads_off[12] = { + {{-2, 0}}, {{-1, -1}}, {{-1, 0}}, {{-1, 1}}, {{0, -2}}, {{0, -1}}, + {{0, 1}}, {{0, 2}}, {{1, -1}}, {{1, 0}}, {{1, 1}}, {{2, 0}}, + }; + + // compute sads + // TODO(veluca): consider unrolling and optimizing this. + for (size_t c = 0; c < 3; c++) { + auto scale = Set(df, lf_.epf_channel_scale[c]); + for (size_t i = 0; i < 12; i++) { + auto sad = Zero(df); + constexpr std::array<int, 2> plus_off[] = { + {{0, 0}}, {{-1, 0}}, {{0, -1}}, {{1, 0}}, {{0, 1}}}; + for (size_t j = 0; j < 5; j++) { + const auto r11 = + LoadU(df, rows[c][3 + plus_off[j][0]] + x + plus_off[j][1]); + const auto c11 = + LoadU(df, rows[c][3 + sads_off[i][0] + plus_off[j][0]] + x + + sads_off[i][1] + plus_off[j][1]); + sad = Add(sad, AbsDiff(r11, c11)); + } + *sads[i] = MulAdd(sad, scale, *sads[i]); + } + } + const auto x_cc = Load(df, rows[0][3 + 0] + x); + const auto y_cc = Load(df, rows[1][3 + 0] + x); + const auto b_cc = Load(df, rows[2][3 + 0] + x); + + auto w = Set(df, 1); + auto X = x_cc; + auto Y = y_cc; + auto B = b_cc; + + for (size_t i = 0; i < 12; i++) { + AddPixel</*aligned=*/false>(/*row=*/sads_off[i][0], rows, + x + sads_off[i][1], *sads[i], inv_sigma, &X, + &Y, &B, &w); + } +#if JXL_HIGH_PRECISION + auto inv_w = Div(Set(df, 1.0f), w); +#else + auto inv_w = ApproximateReciprocal(w); +#endif + StoreU(Mul(X, inv_w), df, GetOutputRow(output_rows, 0, 0) + x); + StoreU(Mul(Y, inv_w), df, GetOutputRow(output_rows, 1, 0) + x); + StoreU(Mul(B, inv_w), df, GetOutputRow(output_rows, 2, 0) + x); + } + } + + RenderPipelineChannelMode GetChannelMode(size_t c) const final { + return c < 3 ? RenderPipelineChannelMode::kInOut + : RenderPipelineChannelMode::kIgnored; + } + + const char* GetName() const override { return "EPF0"; } + + private: + LoopFilter lf_; + const ImageF* sigma_; +}; + +// 3x3 plus-shaped kernel with 5 SADs per pixel (also 3x3 plus-shaped). So this +// makes this filter a 5x5 filter. +class EPF1Stage : public RenderPipelineStage { + public: + EPF1Stage(const LoopFilter& lf, const ImageF& sigma) + : RenderPipelineStage(RenderPipelineStage::Settings::Symmetric( + /*shift=*/0, /*border=*/2)), + lf_(lf), + sigma_(&sigma) {} + + template <bool aligned> + JXL_INLINE void AddPixel(int row, float* JXL_RESTRICT rows[3][5], ssize_t x, + Vec<DF> sad, Vec<DF> inv_sigma, + Vec<DF>* JXL_RESTRICT X, Vec<DF>* JXL_RESTRICT Y, + Vec<DF>* JXL_RESTRICT B, + Vec<DF>* JXL_RESTRICT w) const { + auto cx = aligned ? Load(DF(), rows[0][2 + row] + x) + : LoadU(DF(), rows[0][2 + row] + x); + auto cy = aligned ? Load(DF(), rows[1][2 + row] + x) + : LoadU(DF(), rows[1][2 + row] + x); + auto cb = aligned ? 
Load(DF(), rows[2][2 + row] + x) + : LoadU(DF(), rows[2][2 + row] + x); + + auto weight = Weight(sad, inv_sigma, Set(DF(), lf_.epf_pass1_zeroflush)); + *w = Add(*w, weight); + *X = MulAdd(weight, cx, *X); + *Y = MulAdd(weight, cy, *Y); + *B = MulAdd(weight, cb, *B); + } + + void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, + size_t xextra, size_t xsize, size_t xpos, size_t ypos, + size_t thread_id) const final { + DF df; + xextra = RoundUpTo(xextra, Lanes(df)); + const float* JXL_RESTRICT row_sigma = + sigma_->Row(ypos / kBlockDim + kSigmaPadding); + + float sm = 1.65f; + float bsm = sm * lf_.epf_border_sad_mul; + + HWY_ALIGN float sad_mul_center[kBlockDim] = {bsm, sm, sm, sm, + sm, sm, sm, bsm}; + HWY_ALIGN float sad_mul_border[kBlockDim] = {bsm, bsm, bsm, bsm, + bsm, bsm, bsm, bsm}; + + float* JXL_RESTRICT rows[3][5]; + for (size_t c = 0; c < 3; c++) { + for (int i = 0; i < 5; i++) { + rows[c][i] = GetInputRow(input_rows, c, i - 2); + } + } + + const float* sad_mul = + (ypos % kBlockDim == 0 || ypos % kBlockDim == kBlockDim - 1) + ? sad_mul_border + : sad_mul_center; + + for (ssize_t x = -xextra; x < static_cast<ssize_t>(xsize + xextra); + x += Lanes(df)) { + size_t bx = (x + xpos + kSigmaPadding * kBlockDim) / kBlockDim; + size_t ix = (x + xpos) % kBlockDim; + + if (row_sigma[bx] < kMinSigma) { + for (size_t c = 0; c < 3; c++) { + auto px = Load(df, rows[c][2 + 0] + x); + Store(px, df, GetOutputRow(output_rows, c, 0) + x); + } + continue; + } + + const auto sm = Load(df, sad_mul + ix); + const auto inv_sigma = Mul(Set(df, row_sigma[bx]), sm); + auto sad0 = Zero(df); + auto sad1 = Zero(df); + auto sad2 = Zero(df); + auto sad3 = Zero(df); + + // compute sads + for (size_t c = 0; c < 3; c++) { + // center px = 22, px above = 21 + auto t = Undefined(df); + + const auto p20 = Load(df, rows[c][2 + -2] + x); + const auto p21 = Load(df, rows[c][2 + -1] + x); + auto sad0c = AbsDiff(p20, p21); // SAD 2, 1 + + const auto p11 = LoadU(df, rows[c][2 + -1] + x - 1); + auto sad1c = AbsDiff(p11, p21); // SAD 1, 2 + + const auto p31 = LoadU(df, rows[c][2 + -1] + x + 1); + auto sad2c = AbsDiff(p31, p21); // SAD 3, 2 + + const auto p02 = LoadU(df, rows[c][2 + 0] + x - 2); + const auto p12 = LoadU(df, rows[c][2 + 0] + x - 1); + sad1c = Add(sad1c, AbsDiff(p02, p12)); // SAD 1, 2 + sad0c = Add(sad0c, AbsDiff(p11, p12)); // SAD 2, 1 + + const auto p22 = LoadU(df, rows[c][2 + 0] + x); + t = AbsDiff(p12, p22); + sad1c = Add(sad1c, t); // SAD 1, 2 + sad2c = Add(sad2c, t); // SAD 3, 2 + t = AbsDiff(p22, p21); + auto sad3c = t; // SAD 2, 3 + sad0c = Add(sad0c, t); // SAD 2, 1 + + const auto p32 = LoadU(df, rows[c][2 + 0] + x + 1); + sad0c = Add(sad0c, AbsDiff(p31, p32)); // SAD 2, 1 + t = AbsDiff(p22, p32); + sad1c = Add(sad1c, t); // SAD 1, 2 + sad2c = Add(sad2c, t); // SAD 3, 2 + + const auto p42 = LoadU(df, rows[c][2 + 0] + x + 2); + sad2c = Add(sad2c, AbsDiff(p42, p32)); // SAD 3, 2 + + const auto p13 = LoadU(df, rows[c][2 + 1] + x - 1); + sad3c = Add(sad3c, AbsDiff(p13, p12)); // SAD 2, 3 + + const auto p23 = Load(df, rows[c][2 + 1] + x); + t = AbsDiff(p22, p23); + sad0c = Add(sad0c, t); // SAD 2, 1 + sad3c = Add(sad3c, t); // SAD 2, 3 + sad1c = Add(sad1c, AbsDiff(p13, p23)); // SAD 1, 2 + + const auto p33 = LoadU(df, rows[c][2 + 1] + x + 1); + sad2c = Add(sad2c, AbsDiff(p33, p23)); // SAD 3, 2 + sad3c = Add(sad3c, AbsDiff(p33, p32)); // SAD 2, 3 + + const auto p24 = Load(df, rows[c][2 + 2] + x); + sad3c = Add(sad3c, AbsDiff(p24, p23)); // SAD 2, 3 + + auto scale = Set(df, 
lf_.epf_channel_scale[c]); + sad0 = MulAdd(sad0c, scale, sad0); + sad1 = MulAdd(sad1c, scale, sad1); + sad2 = MulAdd(sad2c, scale, sad2); + sad3 = MulAdd(sad3c, scale, sad3); + } + const auto x_cc = Load(df, rows[0][2 + 0] + x); + const auto y_cc = Load(df, rows[1][2 + 0] + x); + const auto b_cc = Load(df, rows[2][2 + 0] + x); + + auto w = Set(df, 1); + auto X = x_cc; + auto Y = y_cc; + auto B = b_cc; + + // Top row + AddPixel</*aligned=*/true>(/*row=*/-1, rows, x, sad0, inv_sigma, &X, &Y, + &B, &w); + // Center + AddPixel</*aligned=*/false>(/*row=*/0, rows, x - 1, sad1, inv_sigma, &X, + &Y, &B, &w); + AddPixel</*aligned=*/false>(/*row=*/0, rows, x + 1, sad2, inv_sigma, &X, + &Y, &B, &w); + // Bottom + AddPixel</*aligned=*/true>(/*row=*/1, rows, x, sad3, inv_sigma, &X, &Y, + &B, &w); +#if JXL_HIGH_PRECISION + auto inv_w = Div(Set(df, 1.0f), w); +#else + auto inv_w = ApproximateReciprocal(w); +#endif + Store(Mul(X, inv_w), df, GetOutputRow(output_rows, 0, 0) + x); + Store(Mul(Y, inv_w), df, GetOutputRow(output_rows, 1, 0) + x); + Store(Mul(B, inv_w), df, GetOutputRow(output_rows, 2, 0) + x); + } + } + + RenderPipelineChannelMode GetChannelMode(size_t c) const final { + return c < 3 ? RenderPipelineChannelMode::kInOut + : RenderPipelineChannelMode::kIgnored; + } + + const char* GetName() const override { return "EPF1"; } + + private: + LoopFilter lf_; + const ImageF* sigma_; +}; + +// 3x3 plus-shaped kernel with 1 SAD per pixel. So this makes this filter a 3x3 +// filter. +class EPF2Stage : public RenderPipelineStage { + public: + EPF2Stage(const LoopFilter& lf, const ImageF& sigma) + : RenderPipelineStage(RenderPipelineStage::Settings::Symmetric( + /*shift=*/0, /*border=*/1)), + lf_(lf), + sigma_(&sigma) {} + + template <bool aligned> + JXL_INLINE void AddPixel(int row, float* JXL_RESTRICT rows[3][3], ssize_t x, + Vec<DF> rx, Vec<DF> ry, Vec<DF> rb, + Vec<DF> inv_sigma, Vec<DF>* JXL_RESTRICT X, + Vec<DF>* JXL_RESTRICT Y, Vec<DF>* JXL_RESTRICT B, + Vec<DF>* JXL_RESTRICT w) const { + auto cx = aligned ? Load(DF(), rows[0][1 + row] + x) + : LoadU(DF(), rows[0][1 + row] + x); + auto cy = aligned ? Load(DF(), rows[1][1 + row] + x) + : LoadU(DF(), rows[1][1 + row] + x); + auto cb = aligned ? Load(DF(), rows[2][1 + row] + x) + : LoadU(DF(), rows[2][1 + row] + x); + + auto sad = Mul(AbsDiff(cx, rx), Set(DF(), lf_.epf_channel_scale[0])); + sad = MulAdd(AbsDiff(cy, ry), Set(DF(), lf_.epf_channel_scale[1]), sad); + sad = MulAdd(AbsDiff(cb, rb), Set(DF(), lf_.epf_channel_scale[2]), sad); + + auto weight = Weight(sad, inv_sigma, Set(DF(), lf_.epf_pass2_zeroflush)); + + *w = Add(*w, weight); + *X = MulAdd(weight, cx, *X); + *Y = MulAdd(weight, cy, *Y); + *B = MulAdd(weight, cb, *B); + } + + void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, + size_t xextra, size_t xsize, size_t xpos, size_t ypos, + size_t thread_id) const final { + DF df; + xextra = RoundUpTo(xextra, Lanes(df)); + const float* JXL_RESTRICT row_sigma = + sigma_->Row(ypos / kBlockDim + kSigmaPadding); + + float sm = lf_.epf_pass2_sigma_scale * 1.65; + float bsm = sm * lf_.epf_border_sad_mul; + + HWY_ALIGN float sad_mul_center[kBlockDim] = {bsm, sm, sm, sm, + sm, sm, sm, bsm}; + HWY_ALIGN float sad_mul_border[kBlockDim] = {bsm, bsm, bsm, bsm, + bsm, bsm, bsm, bsm}; + + float* JXL_RESTRICT rows[3][3]; + for (size_t c = 0; c < 3; c++) { + for (int i = 0; i < 3; i++) { + rows[c][i] = GetInputRow(input_rows, c, i - 1); + } + } + + const float* sad_mul = + (ypos % kBlockDim == 0 || ypos % kBlockDim == kBlockDim - 1) + ? 
sad_mul_border + : sad_mul_center; + + for (ssize_t x = -xextra; x < static_cast<ssize_t>(xsize + xextra); + x += Lanes(df)) { + size_t bx = (x + xpos + kSigmaPadding * kBlockDim) / kBlockDim; + size_t ix = (x + xpos) % kBlockDim; + + if (row_sigma[bx] < kMinSigma) { + for (size_t c = 0; c < 3; c++) { + auto px = Load(df, rows[c][1 + 0] + x); + Store(px, df, GetOutputRow(output_rows, c, 0) + x); + } + continue; + } + + const auto sm = Load(df, sad_mul + ix); + const auto inv_sigma = Mul(Set(df, row_sigma[bx]), sm); + + const auto x_cc = Load(df, rows[0][1 + 0] + x); + const auto y_cc = Load(df, rows[1][1 + 0] + x); + const auto b_cc = Load(df, rows[2][1 + 0] + x); + + auto w = Set(df, 1); + auto X = x_cc; + auto Y = y_cc; + auto B = b_cc; + + // Top row + AddPixel</*aligned=*/true>(/*row=*/-1, rows, x, x_cc, y_cc, b_cc, + inv_sigma, &X, &Y, &B, &w); + // Center + AddPixel</*aligned=*/false>(/*row=*/0, rows, x - 1, x_cc, y_cc, b_cc, + inv_sigma, &X, &Y, &B, &w); + AddPixel</*aligned=*/false>(/*row=*/0, rows, x + 1, x_cc, y_cc, b_cc, + inv_sigma, &X, &Y, &B, &w); + // Bottom + AddPixel</*aligned=*/true>(/*row=*/1, rows, x, x_cc, y_cc, b_cc, + inv_sigma, &X, &Y, &B, &w); +#if JXL_HIGH_PRECISION + auto inv_w = Div(Set(df, 1.0f), w); +#else + auto inv_w = ApproximateReciprocal(w); +#endif + Store(Mul(X, inv_w), df, GetOutputRow(output_rows, 0, 0) + x); + Store(Mul(Y, inv_w), df, GetOutputRow(output_rows, 1, 0) + x); + Store(Mul(B, inv_w), df, GetOutputRow(output_rows, 2, 0) + x); + } + } + + RenderPipelineChannelMode GetChannelMode(size_t c) const final { + return c < 3 ? RenderPipelineChannelMode::kInOut + : RenderPipelineChannelMode::kIgnored; + } + + const char* GetName() const override { return "EPF2"; } + + private: + LoopFilter lf_; + const ImageF* sigma_; +}; + +std::unique_ptr<RenderPipelineStage> GetEPFStage0(const LoopFilter& lf, + const ImageF& sigma) { + return jxl::make_unique<EPF0Stage>(lf, sigma); +} + +std::unique_ptr<RenderPipelineStage> GetEPFStage1(const LoopFilter& lf, + const ImageF& sigma) { + return jxl::make_unique<EPF1Stage>(lf, sigma); +} + +std::unique_ptr<RenderPipelineStage> GetEPFStage2(const LoopFilter& lf, + const ImageF& sigma) { + return jxl::make_unique<EPF2Stage>(lf, sigma); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +HWY_EXPORT(GetEPFStage0); +HWY_EXPORT(GetEPFStage1); +HWY_EXPORT(GetEPFStage2); + +std::unique_ptr<RenderPipelineStage> GetEPFStage(const LoopFilter& lf, + const ImageF& sigma, + size_t epf_stage) { + JXL_ASSERT(lf.epf_iters != 0); + switch (epf_stage) { + case 0: + return HWY_DYNAMIC_DISPATCH(GetEPFStage0)(lf, sigma); + case 1: + return HWY_DYNAMIC_DISPATCH(GetEPFStage1)(lf, sigma); + case 2: + return HWY_DYNAMIC_DISPATCH(GetEPFStage2)(lf, sigma); + default: + JXL_UNREACHABLE("Invalid EPF stage"); + } +} + +} // namespace jxl +#endif diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_epf.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_epf.h new file mode 100644 index 0000000000..c9d0d0c785 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_epf.h @@ -0,0 +1,31 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_EPF_H_ +#define LIB_JXL_RENDER_PIPELINE_STAGE_EPF_H_ +#include <math.h> +#include <stdint.h> +#include <stdio.h> + +#include <algorithm> +#include <utility> +#include <vector> + +#include "lib/jxl/image.h" +#include "lib/jxl/loop_filter.h" +#include "lib/jxl/render_pipeline/render_pipeline_stage.h" + +namespace jxl { + +// Applies the `epf_stage`-th EPF step with the given settings and `sigma`. +// `sigma` will be accessed with an offset of (kSigmaPadding, kSigmaPadding), +// and should have (kSigmaBorder, kSigmaBorder) mirrored sigma values available +// around the main image. See also filters.(h|cc) +std::unique_ptr<RenderPipelineStage> GetEPFStage(const LoopFilter& lf, + const ImageF& sigma, + size_t epf_stage); +} // namespace jxl + +#endif // LIB_JXL_RENDER_PIPELINE_STAGE_EPF_H_ diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_from_linear.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_from_linear.cc new file mode 100644 index 0000000000..6b1f646cd5 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_from_linear.cc @@ -0,0 +1,194 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/render_pipeline/stage_from_linear.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_from_linear.cc" +#include <hwy/foreach_target.h> +#include <hwy/highway.h> + +#include "lib/jxl/cms/tone_mapping-inl.h" +#include "lib/jxl/cms/transfer_functions-inl.h" +#include "lib/jxl/common.h" // JXL_HIGH_PRECISION +#include "lib/jxl/sanitizers.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { +namespace { + +// These templates are not found via ADL. 
+using hwy::HWY_NAMESPACE::IfThenZeroElse; + +template <typename Op> +struct PerChannelOp { + explicit PerChannelOp(Op op) : op(op) {} + template <typename D, typename T> + void Transform(D d, T* r, T* g, T* b) const { + *r = op.Transform(d, *r); + *g = op.Transform(d, *g); + *b = op.Transform(d, *b); + } + + Op op; +}; +template <typename Op> +PerChannelOp<Op> MakePerChannelOp(Op&& op) { + return PerChannelOp<Op>(std::forward<Op>(op)); +} + +struct OpLinear { + template <typename D, typename T> + T Transform(D d, const T& linear) const { + return linear; + } +}; + +struct OpRgb { + template <typename D, typename T> + T Transform(D d, const T& linear) const { +#if JXL_HIGH_PRECISION + return TF_SRGB().EncodedFromDisplay(d, linear); +#else + return FastLinearToSRGB(d, linear); +#endif + } +}; + +struct OpPq { + explicit OpPq(const float intensity_target) : tf_pq_(intensity_target) {} + template <typename D, typename T> + T Transform(D d, const T& linear) const { + return tf_pq_.EncodedFromDisplay(d, linear); + } + TF_PQ tf_pq_; +}; + +struct OpHlg { + explicit OpHlg(const float luminances[3], const float intensity_target) + : hlg_ootf_(HlgOOTF::ToSceneLight(/*display_luminance=*/intensity_target, + luminances)) {} + + template <typename D, typename T> + void Transform(D d, T* r, T* g, T* b) const { + hlg_ootf_.Apply(r, g, b); + *r = TF_HLG().EncodedFromDisplay(d, *r); + *g = TF_HLG().EncodedFromDisplay(d, *g); + *b = TF_HLG().EncodedFromDisplay(d, *b); + } + HlgOOTF hlg_ootf_; +}; + +struct Op709 { + template <typename D, typename T> + T Transform(D d, const T& linear) const { + return TF_709().EncodedFromDisplay(d, linear); + } +}; + +struct OpGamma { + const float inverse_gamma; + template <typename D, typename T> + T Transform(D d, const T& linear) const { + return IfThenZeroElse(Le(linear, Set(d, 1e-5f)), + FastPowf(d, linear, Set(d, inverse_gamma))); + } +}; + +template <typename Op> +class FromLinearStage : public RenderPipelineStage { + public: + explicit FromLinearStage(Op op) + : RenderPipelineStage(RenderPipelineStage::Settings()), + op_(std::move(op)) {} + + void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, + size_t xextra, size_t xsize, size_t xpos, size_t ypos, + size_t thread_id) const final { + const HWY_FULL(float) d; + const size_t xsize_v = RoundUpTo(xsize, Lanes(d)); + float* JXL_RESTRICT row0 = GetInputRow(input_rows, 0, 0); + float* JXL_RESTRICT row1 = GetInputRow(input_rows, 1, 0); + float* JXL_RESTRICT row2 = GetInputRow(input_rows, 2, 0); + // All calculations are lane-wise, still some might require + // value-dependent behaviour (e.g. NearestInt). Temporary unpoison last + // vector tail. + msan::UnpoisonMemory(row0 + xsize, sizeof(float) * (xsize_v - xsize)); + msan::UnpoisonMemory(row1 + xsize, sizeof(float) * (xsize_v - xsize)); + msan::UnpoisonMemory(row2 + xsize, sizeof(float) * (xsize_v - xsize)); + for (ssize_t x = -xextra; x < (ssize_t)(xsize + xextra); x += Lanes(d)) { + auto r = LoadU(d, row0 + x); + auto g = LoadU(d, row1 + x); + auto b = LoadU(d, row2 + x); + op_.Transform(d, &r, &g, &b); + StoreU(r, d, row0 + x); + StoreU(g, d, row1 + x); + StoreU(b, d, row2 + x); + } + msan::PoisonMemory(row0 + xsize, sizeof(float) * (xsize_v - xsize)); + msan::PoisonMemory(row1 + xsize, sizeof(float) * (xsize_v - xsize)); + msan::PoisonMemory(row2 + xsize, sizeof(float) * (xsize_v - xsize)); + } + + RenderPipelineChannelMode GetChannelMode(size_t c) const final { + return c < 3 ? 
RenderPipelineChannelMode::kInPlace + : RenderPipelineChannelMode::kIgnored; + } + + const char* GetName() const override { return "FromLinear"; } + + private: + Op op_; +}; + +template <typename Op> +std::unique_ptr<FromLinearStage<Op>> MakeFromLinearStage(Op&& op) { + return jxl::make_unique<FromLinearStage<Op>>(std::forward<Op>(op)); +} + +std::unique_ptr<RenderPipelineStage> GetFromLinearStage( + const OutputEncodingInfo& output_encoding_info) { + const auto& tf = output_encoding_info.color_encoding.Tf(); + if (tf.IsLinear()) { + return MakeFromLinearStage(MakePerChannelOp(OpLinear())); + } else if (tf.IsSRGB()) { + return MakeFromLinearStage(MakePerChannelOp(OpRgb())); + } else if (tf.IsPQ()) { + return MakeFromLinearStage( + MakePerChannelOp(OpPq(output_encoding_info.orig_intensity_target))); + } else if (tf.IsHLG()) { + return MakeFromLinearStage( + OpHlg(output_encoding_info.luminances, + output_encoding_info.desired_intensity_target)); + } else if (tf.Is709()) { + return MakeFromLinearStage(MakePerChannelOp(Op709())); + } else if (tf.have_gamma || tf.IsDCI()) { + return MakeFromLinearStage( + MakePerChannelOp(OpGamma{output_encoding_info.inverse_gamma})); + } else { + // This is a programming error. + JXL_UNREACHABLE("Invalid target encoding"); + } +} + +} // namespace +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +HWY_EXPORT(GetFromLinearStage); + +std::unique_ptr<RenderPipelineStage> GetFromLinearStage( + const OutputEncodingInfo& output_encoding_info) { + return HWY_DYNAMIC_DISPATCH(GetFromLinearStage)(output_encoding_info); +} + +} // namespace jxl +#endif diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_from_linear.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_from_linear.h new file mode 100644 index 0000000000..548ab50b8c --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_from_linear.h @@ -0,0 +1,20 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_FROM_LINEAR_H_ +#define LIB_JXL_RENDER_PIPELINE_STAGE_FROM_LINEAR_H_ + +#include "lib/jxl/dec_xyb.h" +#include "lib/jxl/render_pipeline/render_pipeline_stage.h" + +namespace jxl { + +// Converts the color channels from linear to the specified output encoding. +std::unique_ptr<RenderPipelineStage> GetFromLinearStage( + const OutputEncodingInfo& output_encoding_info); + +} // namespace jxl + +#endif // LIB_JXL_RENDER_PIPELINE_STAGE_FROM_LINEAR_H_ diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_gaborish.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_gaborish.cc new file mode 100644 index 0000000000..0917db3f9a --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_gaborish.cc @@ -0,0 +1,120 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/render_pipeline/stage_gaborish.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_gaborish.cc" +#include <hwy/foreach_target.h> +#include <hwy/highway.h> + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// These templates are not found via ADL. 
+using hwy::HWY_NAMESPACE::Add; +using hwy::HWY_NAMESPACE::Mul; +using hwy::HWY_NAMESPACE::MulAdd; + +class GaborishStage : public RenderPipelineStage { + public: + explicit GaborishStage(const LoopFilter& lf) + : RenderPipelineStage(RenderPipelineStage::Settings::Symmetric( + /*shift=*/0, /*border=*/1)) { + weights_[0] = 1; + weights_[1] = lf.gab_x_weight1; + weights_[2] = lf.gab_x_weight2; + weights_[3] = 1; + weights_[4] = lf.gab_y_weight1; + weights_[5] = lf.gab_y_weight2; + weights_[6] = 1; + weights_[7] = lf.gab_b_weight1; + weights_[8] = lf.gab_b_weight2; + // Normalize + for (size_t c = 0; c < 3; c++) { + const float div = + weights_[3 * c] + 4 * (weights_[3 * c + 1] + weights_[3 * c + 2]); + const float mul = 1.0f / div; + weights_[3 * c] *= mul; + weights_[3 * c + 1] *= mul; + weights_[3 * c + 2] *= mul; + } + } + + void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, + size_t xextra, size_t xsize, size_t xpos, size_t ypos, + size_t thread_id) const final { + const HWY_FULL(float) d; + for (size_t c = 0; c < 3; c++) { + float* JXL_RESTRICT row_t = GetInputRow(input_rows, c, -1); + float* JXL_RESTRICT row_m = GetInputRow(input_rows, c, 0); + float* JXL_RESTRICT row_b = GetInputRow(input_rows, c, 1); + float* JXL_RESTRICT row_out = GetOutputRow(output_rows, c, 0); + const auto w0 = Set(d, weights_[3 * c + 0]); + const auto w1 = Set(d, weights_[3 * c + 1]); + const auto w2 = Set(d, weights_[3 * c + 2]); +// Group data need only be aligned to a block; for >=512 bit vectors, this may +// result in unaligned loads. +#if HWY_CAP_GE512 +#define LoadMaybeU LoadU +#else +#define LoadMaybeU Load +#endif + // Since GetInputRow(input_rows, c, {-1, 0, 1}) is aligned, rounding + // xextra up to Lanes(d) doesn't access anything problematic. + for (ssize_t x = -RoundUpTo(xextra, Lanes(d)); + x < (ssize_t)(xsize + xextra); x += Lanes(d)) { + const auto t = LoadMaybeU(d, row_t + x); + const auto tl = LoadU(d, row_t + x - 1); + const auto tr = LoadU(d, row_t + x + 1); + const auto m = LoadMaybeU(d, row_m + x); + const auto l = LoadU(d, row_m + x - 1); + const auto r = LoadU(d, row_m + x + 1); + const auto b = LoadMaybeU(d, row_b + x); + const auto bl = LoadU(d, row_b + x - 1); + const auto br = LoadU(d, row_b + x + 1); + const auto sum0 = m; + const auto sum1 = Add(Add(l, r), Add(t, b)); + const auto sum2 = Add(Add(tl, tr), Add(bl, br)); + auto pixels = MulAdd(sum2, w2, MulAdd(sum1, w1, Mul(sum0, w0))); + Store(pixels, d, row_out + x); + } + } + } +#undef LoadMaybeU + + RenderPipelineChannelMode GetChannelMode(size_t c) const final { + return c < 3 ? 
RenderPipelineChannelMode::kInOut + : RenderPipelineChannelMode::kIgnored; + } + + const char* GetName() const override { return "Gab"; } + + private: + float weights_[9]; +}; + +std::unique_ptr<RenderPipelineStage> GetGaborishStage(const LoopFilter& lf) { + return jxl::make_unique<GaborishStage>(lf); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +HWY_EXPORT(GetGaborishStage); + +std::unique_ptr<RenderPipelineStage> GetGaborishStage(const LoopFilter& lf) { + JXL_ASSERT(lf.gab == 1); + return HWY_DYNAMIC_DISPATCH(GetGaborishStage)(lf); +} + +} // namespace jxl +#endif diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_gaborish.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_gaborish.h new file mode 100644 index 0000000000..55166e3ed8 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_gaborish.h @@ -0,0 +1,24 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_GABORISH_H_ +#define LIB_JXL_RENDER_PIPELINE_STAGE_GABORISH_H_ +#include <math.h> +#include <stdint.h> + +#include <algorithm> +#include <utility> +#include <vector> + +#include "lib/jxl/loop_filter.h" +#include "lib/jxl/render_pipeline/render_pipeline_stage.h" + +namespace jxl { + +// Applies decoder-side Gaborish with the given settings. `lf.gab` must be 1. +std::unique_ptr<RenderPipelineStage> GetGaborishStage(const LoopFilter& lf); +} // namespace jxl + +#endif // LIB_JXL_RENDER_PIPELINE_STAGE_GABORISH_H_ diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_noise.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_noise.cc new file mode 100644 index 0000000000..5cf8a6ed51 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_noise.cc @@ -0,0 +1,316 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/render_pipeline/stage_noise.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_noise.cc" +#include <hwy/foreach_target.h> +#include <hwy/highway.h> + +#include "lib/jxl/sanitizers.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// These templates are not found via ADL. 
+using hwy::HWY_NAMESPACE::Add; +using hwy::HWY_NAMESPACE::And; +using hwy::HWY_NAMESPACE::Floor; +using hwy::HWY_NAMESPACE::Ge; +using hwy::HWY_NAMESPACE::IfThenElse; +using hwy::HWY_NAMESPACE::Max; +using hwy::HWY_NAMESPACE::Min; +using hwy::HWY_NAMESPACE::Mul; +using hwy::HWY_NAMESPACE::MulAdd; +using hwy::HWY_NAMESPACE::Or; +using hwy::HWY_NAMESPACE::Sub; +using hwy::HWY_NAMESPACE::TableLookupBytes; +using hwy::HWY_NAMESPACE::Vec; +using hwy::HWY_NAMESPACE::ZeroIfNegative; + +using D = HWY_CAPPED(float, kBlockDim); +using DI = hwy::HWY_NAMESPACE::Rebind<int32_t, D>; +using DI8 = hwy::HWY_NAMESPACE::Repartition<uint8_t, D>; + +// [0, max_value] +template <class D, class V> +static HWY_INLINE V Clamp0ToMax(D d, const V x, const V max_value) { + const auto clamped = Min(x, max_value); + return ZeroIfNegative(clamped); +} + +// x is in [0+delta, 1+delta], delta ~= 0.06 +template <class StrengthEval> +typename StrengthEval::V NoiseStrength(const StrengthEval& eval, + const typename StrengthEval::V x) { + return Clamp0ToMax(D(), eval(x), Set(D(), 1.0f)); +} + +// TODO(veluca): SIMD-fy. +class StrengthEvalLut { + public: + using V = Vec<D>; + + explicit StrengthEvalLut(const NoiseParams& noise_params) +#if HWY_TARGET == HWY_SCALAR + : noise_params_(noise_params) +#endif + { +#if HWY_TARGET != HWY_SCALAR + uint32_t lut[8]; + memcpy(lut, noise_params.lut, sizeof(lut)); + for (size_t i = 0; i < 8; i++) { + low16_lut[2 * i] = (lut[i] >> 0) & 0xFF; + low16_lut[2 * i + 1] = (lut[i] >> 8) & 0xFF; + high16_lut[2 * i] = (lut[i] >> 16) & 0xFF; + high16_lut[2 * i + 1] = (lut[i] >> 24) & 0xFF; + } +#endif + } + + V operator()(const V vx) const { + constexpr size_t kScale = NoiseParams::kNumNoisePoints - 2; + auto scaled_vx = Max(Zero(D()), Mul(vx, Set(D(), kScale))); + auto floor_x = Floor(scaled_vx); + auto frac_x = Sub(scaled_vx, floor_x); + floor_x = IfThenElse(Ge(scaled_vx, Set(D(), kScale + 1)), Set(D(), kScale), + floor_x); + frac_x = + IfThenElse(Ge(scaled_vx, Set(D(), kScale + 1)), Set(D(), 1), frac_x); + auto floor_x_int = ConvertTo(DI(), floor_x); +#if HWY_TARGET == HWY_SCALAR + auto low = Set(D(), noise_params_.lut[floor_x_int.raw]); + auto hi = Set(D(), noise_params_.lut[floor_x_int.raw + 1]); +#else + // Set each lane's bytes to {0, 0, 2x+1, 2x}. + auto floorx_indices_low = + Add(Mul(floor_x_int, Set(DI(), 0x0202)), Set(DI(), 0x0100)); + // Set each lane's bytes to {2x+1, 2x, 0, 0}. + auto floorx_indices_hi = + Add(Mul(floor_x_int, Set(DI(), 0x02020000)), Set(DI(), 0x01000000)); + // load LUT + auto low16 = BitCast(DI(), LoadDup128(DI8(), low16_lut)); + auto lowm = Set(DI(), 0xFFFF); + auto hi16 = BitCast(DI(), LoadDup128(DI8(), high16_lut)); + auto him = Set(DI(), 0xFFFF0000); + // low = noise_params.lut[floor_x] + auto low = + BitCast(D(), Or(And(TableLookupBytes(low16, floorx_indices_low), lowm), + And(TableLookupBytes(hi16, floorx_indices_hi), him))); + // hi = noise_params.lut[floor_x+1] + floorx_indices_low = Add(floorx_indices_low, Set(DI(), 0x0202)); + floorx_indices_hi = Add(floorx_indices_hi, Set(DI(), 0x02020000)); + auto hi = + BitCast(D(), Or(And(TableLookupBytes(low16, floorx_indices_low), lowm), + And(TableLookupBytes(hi16, floorx_indices_hi), him))); +#endif + return MulAdd(Sub(hi, low), frac_x, low); + } + + private: +#if HWY_TARGET != HWY_SCALAR + // noise_params.lut transformed into two 16-bit lookup tables. 
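+  // Each 32-bit entry is split into its low and high 16-bit halves so that
+  // operator() can gather both halves with TableLookupBytes and recombine them.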
+ HWY_ALIGN uint8_t high16_lut[16]; + HWY_ALIGN uint8_t low16_lut[16]; +#else + const NoiseParams& noise_params_; +#endif +}; + +template <class D> +void AddNoiseToRGB(const D d, const Vec<D> rnd_noise_r, + const Vec<D> rnd_noise_g, const Vec<D> rnd_noise_cor, + const Vec<D> noise_strength_g, const Vec<D> noise_strength_r, + float ytox, float ytob, float* JXL_RESTRICT out_x, + float* JXL_RESTRICT out_y, float* JXL_RESTRICT out_b) { + const auto kRGCorr = Set(d, 0.9921875f); // 127/128 + const auto kRGNCorr = Set(d, 0.0078125f); // 1/128 + + const auto red_noise = + Mul(noise_strength_r, + MulAdd(kRGNCorr, rnd_noise_r, Mul(kRGCorr, rnd_noise_cor))); + const auto green_noise = + Mul(noise_strength_g, + MulAdd(kRGNCorr, rnd_noise_g, Mul(kRGCorr, rnd_noise_cor))); + + auto vx = LoadU(d, out_x); + auto vy = LoadU(d, out_y); + auto vb = LoadU(d, out_b); + + const auto rg_noise = Add(red_noise, green_noise); + vx = Add(MulAdd(Set(d, ytox), rg_noise, Sub(red_noise, green_noise)), vx); + vy = Add(vy, rg_noise); + vb = MulAdd(Set(d, ytob), rg_noise, vb); + + StoreU(vx, d, out_x); + StoreU(vy, d, out_y); + StoreU(vb, d, out_b); +} + +class AddNoiseStage : public RenderPipelineStage { + public: + AddNoiseStage(const NoiseParams& noise_params, + const ColorCorrelationMap& cmap, size_t first_c) + : RenderPipelineStage(RenderPipelineStage::Settings::Symmetric( + /*shift=*/0, /*border=*/0)), + noise_params_(noise_params), + cmap_(cmap), + first_c_(first_c) {} + + void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, + size_t xextra, size_t xsize, size_t xpos, size_t ypos, + size_t thread_id) const final { + if (!noise_params_.HasAny()) return; + const StrengthEvalLut noise_model(noise_params_); + D d; + const auto half = Set(d, 0.5f); + + // With the prior subtract-random Laplacian approximation, rnd_* ranges were + // about [-1.5, 1.6]; Laplacian3 about doubles this to [-3.6, 3.6], so the + // normalizer is half of what it was before (0.5). + const auto norm_const = Set(d, 0.22f); + + float ytox = cmap_.YtoXRatio(0); + float ytob = cmap_.YtoBRatio(0); + + const size_t xsize_v = RoundUpTo(xsize, Lanes(d)); + + float* JXL_RESTRICT row_x = GetInputRow(input_rows, 0, 0); + float* JXL_RESTRICT row_y = GetInputRow(input_rows, 1, 0); + float* JXL_RESTRICT row_b = GetInputRow(input_rows, 2, 0); + const float* JXL_RESTRICT row_rnd_r = + GetInputRow(input_rows, first_c_ + 0, 0); + const float* JXL_RESTRICT row_rnd_g = + GetInputRow(input_rows, first_c_ + 1, 0); + const float* JXL_RESTRICT row_rnd_c = + GetInputRow(input_rows, first_c_ + 2, 0); + // Needed by the calls to Floor() in StrengthEvalLut. Only arithmetic and + // shuffles are otherwise done on the data, so this is safe. 
+ msan::UnpoisonMemory(row_x + xsize, (xsize_v - xsize) * sizeof(float)); + msan::UnpoisonMemory(row_y + xsize, (xsize_v - xsize) * sizeof(float)); + for (size_t x = 0; x < xsize_v; x += Lanes(d)) { + const auto vx = LoadU(d, row_x + x); + const auto vy = LoadU(d, row_y + x); + const auto in_g = Sub(vy, vx); + const auto in_r = Add(vy, vx); + const auto noise_strength_g = NoiseStrength(noise_model, Mul(in_g, half)); + const auto noise_strength_r = NoiseStrength(noise_model, Mul(in_r, half)); + const auto addit_rnd_noise_red = Mul(LoadU(d, row_rnd_r + x), norm_const); + const auto addit_rnd_noise_green = + Mul(LoadU(d, row_rnd_g + x), norm_const); + const auto addit_rnd_noise_correlated = + Mul(LoadU(d, row_rnd_c + x), norm_const); + AddNoiseToRGB(D(), addit_rnd_noise_red, addit_rnd_noise_green, + addit_rnd_noise_correlated, noise_strength_g, + noise_strength_r, ytox, ytob, row_x + x, row_y + x, + row_b + x); + } + msan::PoisonMemory(row_x + xsize, (xsize_v - xsize) * sizeof(float)); + msan::PoisonMemory(row_y + xsize, (xsize_v - xsize) * sizeof(float)); + msan::PoisonMemory(row_b + xsize, (xsize_v - xsize) * sizeof(float)); + } + + RenderPipelineChannelMode GetChannelMode(size_t c) const final { + return c >= first_c_ ? RenderPipelineChannelMode::kInput + : c < 3 ? RenderPipelineChannelMode::kInPlace + : RenderPipelineChannelMode::kIgnored; + } + + const char* GetName() const override { return "AddNoise"; } + + private: + const NoiseParams& noise_params_; + const ColorCorrelationMap& cmap_; + size_t first_c_; +}; + +std::unique_ptr<RenderPipelineStage> GetAddNoiseStage( + const NoiseParams& noise_params, const ColorCorrelationMap& cmap, + size_t noise_c_start) { + return jxl::make_unique<AddNoiseStage>(noise_params, cmap, noise_c_start); +} + +class ConvolveNoiseStage : public RenderPipelineStage { + public: + explicit ConvolveNoiseStage(size_t first_c) + : RenderPipelineStage(RenderPipelineStage::Settings::Symmetric( + /*shift=*/0, /*border=*/2)), + first_c_(first_c) {} + + void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, + size_t xextra, size_t xsize, size_t xpos, size_t ypos, + size_t thread_id) const final { + const HWY_FULL(float) d; + for (size_t c = first_c_; c < first_c_ + 3; c++) { + float* JXL_RESTRICT rows[5]; + for (size_t i = 0; i < 5; i++) { + rows[i] = GetInputRow(input_rows, c, i - 2); + } + float* JXL_RESTRICT row_out = GetOutputRow(output_rows, c, 0); + for (ssize_t x = -RoundUpTo(xextra, Lanes(d)); + x < (ssize_t)(xsize + xextra); x += Lanes(d)) { + const auto p00 = LoadU(d, rows[2] + x); + auto others = Zero(d); + // TODO(eustas): sum loaded values to reduce the calculation chain + for (ssize_t i = -2; i <= 2; i++) { + others = Add(others, LoadU(d, rows[0] + x + i)); + others = Add(others, LoadU(d, rows[1] + x + i)); + others = Add(others, LoadU(d, rows[3] + x + i)); + others = Add(others, LoadU(d, rows[4] + x + i)); + } + others = Add(others, LoadU(d, rows[2] + x - 2)); + others = Add(others, LoadU(d, rows[2] + x - 1)); + others = Add(others, LoadU(d, rows[2] + x + 1)); + others = Add(others, LoadU(d, rows[2] + x + 2)); + // 4 * (1 - box kernel) + auto pixels = MulAdd(others, Set(d, 0.16), Mul(p00, Set(d, -3.84))); + StoreU(pixels, d, row_out + x); + } + } + } + + RenderPipelineChannelMode GetChannelMode(size_t c) const final { + return c >= first_c_ ? 
RenderPipelineChannelMode::kInOut + : RenderPipelineChannelMode::kIgnored; + } + + const char* GetName() const override { return "ConvNoise"; } + + private: + size_t first_c_; +}; + +std::unique_ptr<RenderPipelineStage> GetConvolveNoiseStage( + size_t noise_c_start) { + return jxl::make_unique<ConvolveNoiseStage>(noise_c_start); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +HWY_EXPORT(GetAddNoiseStage); +HWY_EXPORT(GetConvolveNoiseStage); + +std::unique_ptr<RenderPipelineStage> GetAddNoiseStage( + const NoiseParams& noise_params, const ColorCorrelationMap& cmap, + size_t noise_c_start) { + return HWY_DYNAMIC_DISPATCH(GetAddNoiseStage)(noise_params, cmap, + noise_c_start); +} + +std::unique_ptr<RenderPipelineStage> GetConvolveNoiseStage( + size_t noise_c_start) { + return HWY_DYNAMIC_DISPATCH(GetConvolveNoiseStage)(noise_c_start); +} + +} // namespace jxl +#endif diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_noise.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_noise.h new file mode 100644 index 0000000000..bd7797f991 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_noise.h @@ -0,0 +1,32 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_NOISE_H_ +#define LIB_JXL_RENDER_PIPELINE_STAGE_NOISE_H_ +#include <math.h> +#include <stdint.h> +#include <stdio.h> + +#include <algorithm> +#include <utility> +#include <vector> + +#include "lib/jxl/dec_noise.h" +#include "lib/jxl/render_pipeline/render_pipeline_stage.h" + +namespace jxl { + +// Adds noise to color channels. +std::unique_ptr<RenderPipelineStage> GetAddNoiseStage( + const NoiseParams& noise_params, const ColorCorrelationMap& cmap, + size_t noise_c_start); + +// Applies a 5x5 subtract-box-filter convolution to the noise input channels. +std::unique_ptr<RenderPipelineStage> GetConvolveNoiseStage( + size_t noise_c_start); + +} // namespace jxl + +#endif // LIB_JXL_RENDER_PIPELINE_STAGE_NOISE_H_ diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_patches.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_patches.cc new file mode 100644 index 0000000000..c5a75b09f7 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_patches.cc @@ -0,0 +1,47 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/render_pipeline/stage_patches.h" + +namespace jxl { +namespace { +class PatchDictionaryStage : public RenderPipelineStage { + public: + PatchDictionaryStage(const PatchDictionary* patches, size_t num_channels) + : RenderPipelineStage(RenderPipelineStage::Settings()), + patches_(*patches), + num_channels_(num_channels) {} + + void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, + size_t xextra, size_t xsize, size_t xpos, size_t ypos, + size_t thread_id) const final { + JXL_ASSERT(xpos == 0 || xpos >= xextra); + size_t x0 = xpos ? 
xpos - xextra : 0; + std::vector<float*> row_ptrs(num_channels_); + for (size_t i = 0; i < num_channels_; i++) { + row_ptrs[i] = GetInputRow(input_rows, i, 0) + x0 - xpos; + } + patches_.AddOneRow(row_ptrs.data(), ypos, x0, xsize + xextra + xpos - x0); + } + + RenderPipelineChannelMode GetChannelMode(size_t c) const final { + return c < num_channels_ ? RenderPipelineChannelMode::kInPlace + : RenderPipelineChannelMode::kIgnored; + } + + const char* GetName() const override { return "Patches"; } + + private: + const PatchDictionary& patches_; + const size_t num_channels_; +}; +} // namespace + +std::unique_ptr<RenderPipelineStage> GetPatchesStage( + const PatchDictionary* patches, size_t num_channels) { + return jxl::make_unique<PatchDictionaryStage>(patches, num_channels); +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_patches.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_patches.h new file mode 100644 index 0000000000..b35abdc2eb --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_patches.h @@ -0,0 +1,22 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_PATCHES_H_ +#define LIB_JXL_RENDER_PIPELINE_STAGE_PATCHES_H_ + +#include <utility> + +#include "lib/jxl/patch_dictionary_internal.h" +#include "lib/jxl/render_pipeline/render_pipeline_stage.h" + +namespace jxl { + +// Draws patches if applicable. +std::unique_ptr<RenderPipelineStage> GetPatchesStage( + const PatchDictionary* patches, size_t num_channels); + +} // namespace jxl + +#endif // LIB_JXL_RENDER_PIPELINE_STAGE_PATCHES_H_ diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_splines.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_splines.cc new file mode 100644 index 0000000000..4a0529ce2c --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_splines.cc @@ -0,0 +1,62 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/render_pipeline/stage_splines.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_splines.cc" +#include <hwy/foreach_target.h> +#include <hwy/highway.h> + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +class SplineStage : public RenderPipelineStage { + public: + explicit SplineStage(const Splines* splines) + : RenderPipelineStage(RenderPipelineStage::Settings()), + splines_(*splines) {} + + void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, + size_t xextra, size_t xsize, size_t xpos, size_t ypos, + size_t thread_id) const final { + float* row_x = GetInputRow(input_rows, 0, 0); + float* row_y = GetInputRow(input_rows, 1, 0); + float* row_b = GetInputRow(input_rows, 2, 0); + splines_.AddToRow(row_x, row_y, row_b, Rect(xpos, ypos, xsize, 1)); + } + + RenderPipelineChannelMode GetChannelMode(size_t c) const final { + return c < 3 ? 
RenderPipelineChannelMode::kInPlace + : RenderPipelineChannelMode::kIgnored; + } + + const char* GetName() const override { return "Splines"; } + + private: + const Splines& splines_; +}; + +std::unique_ptr<RenderPipelineStage> GetSplineStage(const Splines* splines) { + return jxl::make_unique<SplineStage>(splines); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +HWY_EXPORT(GetSplineStage); + +std::unique_ptr<RenderPipelineStage> GetSplineStage(const Splines* splines) { + return HWY_DYNAMIC_DISPATCH(GetSplineStage)(splines); +} + +} // namespace jxl +#endif diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_splines.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_splines.h new file mode 100644 index 0000000000..363af393ec --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_splines.h @@ -0,0 +1,21 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_SPLINES_H_ +#define LIB_JXL_RENDER_PIPELINE_STAGE_SPLINES_H_ + +#include <utility> + +#include "lib/jxl/render_pipeline/render_pipeline_stage.h" +#include "lib/jxl/splines.h" + +namespace jxl { + +// Draws splines if applicable. +std::unique_ptr<RenderPipelineStage> GetSplineStage(const Splines* splines); + +} // namespace jxl + +#endif // LIB_JXL_RENDER_PIPELINE_STAGE_SPLINES_H_ diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_spot.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_spot.cc new file mode 100644 index 0000000000..a43cb4e1ab --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_spot.cc @@ -0,0 +1,51 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/render_pipeline/stage_spot.h" + +namespace jxl { +class SpotColorStage : public RenderPipelineStage { + public: + explicit SpotColorStage(size_t spot_c, const float* spot_color) + : RenderPipelineStage(RenderPipelineStage::Settings()), + spot_c_(spot_c), + spot_color_(spot_color) { + JXL_ASSERT(spot_c_ >= 3); + } + + void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, + size_t xextra, size_t xsize, size_t xpos, size_t ypos, + size_t thread_id) const final { + // TODO(veluca): add SIMD. + float scale = spot_color_[3]; + for (size_t c = 0; c < 3; c++) { + float* JXL_RESTRICT p = GetInputRow(input_rows, c, 0); + const float* JXL_RESTRICT s = GetInputRow(input_rows, spot_c_, 0); + for (ssize_t x = -xextra; x < ssize_t(xsize + xextra); x++) { + float mix = scale * s[x]; + p[x] = mix * spot_color_[c] + (1.0f - mix) * p[x]; + } + } + } + + RenderPipelineChannelMode GetChannelMode(size_t c) const final { + return c < 3 ? RenderPipelineChannelMode::kInPlace + : c == spot_c_ ? 
RenderPipelineChannelMode::kInput + : RenderPipelineChannelMode::kIgnored; + } + + const char* GetName() const override { return "Spot"; } + + private: + size_t spot_c_; + const float* spot_color_; +}; + +std::unique_ptr<RenderPipelineStage> GetSpotColorStage( + size_t spot_c, const float* spot_color) { + return jxl::make_unique<SpotColorStage>(spot_c, spot_color); +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_spot.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_spot.h new file mode 100644 index 0000000000..3e79c75823 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_spot.h @@ -0,0 +1,21 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_SPOT_H_ +#define LIB_JXL_RENDER_PIPELINE_STAGE_SPOT_H_ + +#include <utility> + +#include "lib/jxl/render_pipeline/render_pipeline_stage.h" + +namespace jxl { + +// Render the spot color channels. +std::unique_ptr<RenderPipelineStage> GetSpotColorStage(size_t spot_c, + const float* spot_color); + +} // namespace jxl + +#endif // LIB_JXL_RENDER_PIPELINE_STAGE_SPOT_H_ diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_to_linear.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_to_linear.cc new file mode 100644 index 0000000000..85eca2f039 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_to_linear.cc @@ -0,0 +1,203 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/render_pipeline/stage_to_linear.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_to_linear.cc" +#include <hwy/foreach_target.h> +#include <hwy/highway.h> + +#include "lib/jxl/cms/tone_mapping-inl.h" +#include "lib/jxl/cms/transfer_functions-inl.h" +#include "lib/jxl/sanitizers.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { +namespace { + +// These templates are not found via ADL. 
+using hwy::HWY_NAMESPACE::IfThenZeroElse; + +template <typename Op> +struct PerChannelOp { + explicit PerChannelOp(Op op) : op(op) {} + template <typename D, typename T> + void Transform(D d, T* r, T* g, T* b) const { + *r = op.Transform(d, *r); + *g = op.Transform(d, *g); + *b = op.Transform(d, *b); + } + + Op op; +}; +template <typename Op> +PerChannelOp<Op> MakePerChannelOp(Op&& op) { + return PerChannelOp<Op>(std::forward<Op>(op)); +} + +struct OpLinear { + template <typename D, typename T> + T Transform(D d, const T& encoded) const { + return encoded; + } +}; + +struct OpRgb { + template <typename D, typename T> + T Transform(D d, const T& encoded) const { + return TF_SRGB().DisplayFromEncoded(encoded); + } +}; + +struct OpPq { + explicit OpPq(const float intensity_target) : tf_pq_(intensity_target) {} + template <typename D, typename T> + T Transform(D d, const T& encoded) const { + return tf_pq_.DisplayFromEncoded(d, encoded); + } + TF_PQ tf_pq_; +}; + +struct OpHlg { + explicit OpHlg(const float luminances[3], const float intensity_target) + : hlg_ootf_(HlgOOTF::FromSceneLight( + /*display_luminance=*/intensity_target, luminances)) {} + + template <typename D, typename T> + void Transform(D d, T* r, T* g, T* b) const { + for (T* val : {r, g, b}) { + HWY_ALIGN float vals[MaxLanes(d)]; + Store(*val, d, vals); + for (size_t i = 0; i < Lanes(d); ++i) { + vals[i] = TF_HLG_Base::DisplayFromEncoded(vals[i]); + } + *val = Load(d, vals); + } + hlg_ootf_.Apply(r, g, b); + } + HlgOOTF hlg_ootf_; +}; + +struct Op709 { + template <typename D, typename T> + T Transform(D d, const T& encoded) const { + return TF_709().DisplayFromEncoded(d, encoded); + } +}; + +struct OpGamma { + const float gamma; + template <typename D, typename T> + T Transform(D d, const T& encoded) const { + return IfThenZeroElse(Le(encoded, Set(d, 1e-5f)), + FastPowf(d, encoded, Set(d, gamma))); + } +}; + +struct OpInvalid { + template <typename D, typename T> + void Transform(D d, T* r, T* g, T* b) const {} +}; + +template <typename Op> +class ToLinearStage : public RenderPipelineStage { + public: + explicit ToLinearStage(Op op) + : RenderPipelineStage(RenderPipelineStage::Settings()), + op_(std::move(op)) {} + + explicit ToLinearStage() + : RenderPipelineStage(RenderPipelineStage::Settings()), valid_(false) {} + + void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, + size_t xextra, size_t xsize, size_t xpos, size_t ypos, + size_t thread_id) const final { + const HWY_FULL(float) d; + const size_t xsize_v = RoundUpTo(xsize, Lanes(d)); + float* JXL_RESTRICT row0 = GetInputRow(input_rows, 0, 0); + float* JXL_RESTRICT row1 = GetInputRow(input_rows, 1, 0); + float* JXL_RESTRICT row2 = GetInputRow(input_rows, 2, 0); + // All calculations are lane-wise, still some might require + // value-dependent behaviour (e.g. NearestInt). Temporary unpoison last + // vector tail. 
+ msan::UnpoisonMemory(row0 + xsize, sizeof(float) * (xsize_v - xsize)); + msan::UnpoisonMemory(row1 + xsize, sizeof(float) * (xsize_v - xsize)); + msan::UnpoisonMemory(row2 + xsize, sizeof(float) * (xsize_v - xsize)); + for (ssize_t x = -xextra; x < (ssize_t)(xsize + xextra); x += Lanes(d)) { + auto r = LoadU(d, row0 + x); + auto g = LoadU(d, row1 + x); + auto b = LoadU(d, row2 + x); + op_.Transform(d, &r, &g, &b); + StoreU(r, d, row0 + x); + StoreU(g, d, row1 + x); + StoreU(b, d, row2 + x); + } + msan::PoisonMemory(row0 + xsize, sizeof(float) * (xsize_v - xsize)); + msan::PoisonMemory(row1 + xsize, sizeof(float) * (xsize_v - xsize)); + msan::PoisonMemory(row2 + xsize, sizeof(float) * (xsize_v - xsize)); + } + + RenderPipelineChannelMode GetChannelMode(size_t c) const final { + return c < 3 ? RenderPipelineChannelMode::kInPlace + : RenderPipelineChannelMode::kIgnored; + } + + const char* GetName() const override { return "ToLinear"; } + + private: + Status IsInitialized() const override { return valid_; } + + Op op_; + bool valid_ = true; +}; + +template <typename Op> +std::unique_ptr<ToLinearStage<Op>> MakeToLinearStage(Op&& op) { + return jxl::make_unique<ToLinearStage<Op>>(std::forward<Op>(op)); +} + +std::unique_ptr<RenderPipelineStage> GetToLinearStage( + const OutputEncodingInfo& output_encoding_info) { + const auto& tf = output_encoding_info.color_encoding.Tf(); + if (tf.IsLinear()) { + return MakeToLinearStage(MakePerChannelOp(OpLinear())); + } else if (tf.IsSRGB()) { + return MakeToLinearStage(MakePerChannelOp(OpRgb())); + } else if (tf.IsPQ()) { + return MakeToLinearStage( + MakePerChannelOp(OpPq(output_encoding_info.orig_intensity_target))); + } else if (tf.IsHLG()) { + return MakeToLinearStage(OpHlg(output_encoding_info.luminances, + output_encoding_info.orig_intensity_target)); + } else if (tf.Is709()) { + return MakeToLinearStage(MakePerChannelOp(Op709())); + } else if (tf.have_gamma || tf.IsDCI()) { + return MakeToLinearStage( + MakePerChannelOp(OpGamma{1.f / output_encoding_info.inverse_gamma})); + } else { + return jxl::make_unique<ToLinearStage<OpInvalid>>(); + } +} + +} // namespace +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +HWY_EXPORT(GetToLinearStage); + +std::unique_ptr<RenderPipelineStage> GetToLinearStage( + const OutputEncodingInfo& output_encoding_info) { + return HWY_DYNAMIC_DISPATCH(GetToLinearStage)(output_encoding_info); +} + +} // namespace jxl +#endif diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_to_linear.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_to_linear.h new file mode 100644 index 0000000000..ccee7b09f0 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_to_linear.h @@ -0,0 +1,21 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_TO_LINEAR_H_ +#define LIB_JXL_RENDER_PIPELINE_STAGE_TO_LINEAR_H_ + +#include "lib/jxl/dec_xyb.h" +#include "lib/jxl/render_pipeline/render_pipeline_stage.h" + +namespace jxl { + +// Converts the color channels from `output_encoding_info.color_encoding` to +// linear. 
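+// This is the counterpart of GetFromLinearStage (stage_from_linear.h), which
+// converts linear samples to the output encoding.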
+std::unique_ptr<RenderPipelineStage> GetToLinearStage( + const OutputEncodingInfo& output_encoding_info); + +} // namespace jxl + +#endif // LIB_JXL_RENDER_PIPELINE_STAGE_TO_LINEAR_H_ diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_tone_mapping.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_tone_mapping.cc new file mode 100644 index 0000000000..2a272e15dc --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_tone_mapping.cc @@ -0,0 +1,147 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/render_pipeline/stage_tone_mapping.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_tone_mapping.cc" +#include <hwy/foreach_target.h> +#include <hwy/highway.h> + +#include "lib/jxl/cms/tone_mapping-inl.h" +#include "lib/jxl/dec_xyb-inl.h" +#include "lib/jxl/sanitizers.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +class ToneMappingStage : public RenderPipelineStage { + public: + explicit ToneMappingStage(OutputEncodingInfo output_encoding_info) + : RenderPipelineStage(RenderPipelineStage::Settings()), + output_encoding_info_(std::move(output_encoding_info)) { + if (output_encoding_info_.desired_intensity_target == + output_encoding_info_.orig_intensity_target) { + // No tone mapping requested. + return; + } + const auto& orig_tf = output_encoding_info_.orig_color_encoding.Tf(); + const auto& dest_tf = output_encoding_info_.color_encoding.Tf(); + if (orig_tf.IsPQ() && output_encoding_info_.desired_intensity_target < + output_encoding_info_.orig_intensity_target) { + tone_mapper_ = jxl::make_unique<ToneMapper>( + /*source_range=*/std::pair<float, float>( + 0, output_encoding_info_.orig_intensity_target), + /*target_range=*/ + std::pair<float, float>( + 0, output_encoding_info_.desired_intensity_target), + output_encoding_info_.luminances); + } else if (orig_tf.IsHLG() && !dest_tf.IsHLG()) { + hlg_ootf_ = jxl::make_unique<HlgOOTF>( + /*source_luminance=*/output_encoding_info_.orig_intensity_target, + /*target_luminance=*/output_encoding_info_.desired_intensity_target, + output_encoding_info_.luminances); + } + + if (dest_tf.IsPQ() && (tone_mapper_ || hlg_ootf_)) { + to_intensity_target_ = + 10000.f / output_encoding_info_.orig_intensity_target; + from_desired_intensity_target_ = + output_encoding_info_.desired_intensity_target / 10000.f; + } + } + + bool IsNeeded() const { return tone_mapper_ || hlg_ootf_; } + + void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, + size_t xextra, size_t xsize, size_t xpos, size_t ypos, + size_t thread_id) const final { + if (!(tone_mapper_ || hlg_ootf_)) return; + + const HWY_FULL(float) d; + const size_t xsize_v = RoundUpTo(xsize, Lanes(d)); + float* JXL_RESTRICT row0 = GetInputRow(input_rows, 0, 0); + float* JXL_RESTRICT row1 = GetInputRow(input_rows, 1, 0); + float* JXL_RESTRICT row2 = GetInputRow(input_rows, 2, 0); + // All calculations are lane-wise, still some might require + // value-dependent behaviour (e.g. NearestInt). Temporary unpoison last + // vector tail. 
+ msan::UnpoisonMemory(row0 + xsize, sizeof(float) * (xsize_v - xsize)); + msan::UnpoisonMemory(row1 + xsize, sizeof(float) * (xsize_v - xsize)); + msan::UnpoisonMemory(row2 + xsize, sizeof(float) * (xsize_v - xsize)); + for (ssize_t x = -xextra; x < (ssize_t)(xsize + xextra); x += Lanes(d)) { + auto r = LoadU(d, row0 + x); + auto g = LoadU(d, row1 + x); + auto b = LoadU(d, row2 + x); + if (tone_mapper_ || hlg_ootf_) { + r = Mul(r, Set(d, to_intensity_target_)); + g = Mul(g, Set(d, to_intensity_target_)); + b = Mul(b, Set(d, to_intensity_target_)); + if (tone_mapper_) { + tone_mapper_->ToneMap(&r, &g, &b); + } else { + JXL_ASSERT(hlg_ootf_); + hlg_ootf_->Apply(&r, &g, &b); + } + if (tone_mapper_ || hlg_ootf_->WarrantsGamutMapping()) { + GamutMap(&r, &g, &b, output_encoding_info_.luminances); + } + r = Mul(r, Set(d, from_desired_intensity_target_)); + g = Mul(g, Set(d, from_desired_intensity_target_)); + b = Mul(b, Set(d, from_desired_intensity_target_)); + } + StoreU(r, d, row0 + x); + StoreU(g, d, row1 + x); + StoreU(b, d, row2 + x); + } + msan::PoisonMemory(row0 + xsize, sizeof(float) * (xsize_v - xsize)); + msan::PoisonMemory(row1 + xsize, sizeof(float) * (xsize_v - xsize)); + msan::PoisonMemory(row2 + xsize, sizeof(float) * (xsize_v - xsize)); + } + + RenderPipelineChannelMode GetChannelMode(size_t c) const final { + return c < 3 ? RenderPipelineChannelMode::kInPlace + : RenderPipelineChannelMode::kIgnored; + } + + const char* GetName() const override { return "ToneMapping"; } + + private: + using ToneMapper = Rec2408ToneMapper<HWY_FULL(float)>; + OutputEncodingInfo output_encoding_info_; + std::unique_ptr<ToneMapper> tone_mapper_; + std::unique_ptr<HlgOOTF> hlg_ootf_; + // When the target colorspace is PQ, 1 represents 10000 nits instead of + // orig_intensity_target. This temporarily changes this if the tone mappers + // require it. + float to_intensity_target_ = 1.f; + float from_desired_intensity_target_ = 1.f; +}; + +std::unique_ptr<RenderPipelineStage> GetToneMappingStage( + const OutputEncodingInfo& output_encoding_info) { + auto stage = jxl::make_unique<ToneMappingStage>(output_encoding_info); + if (!stage->IsNeeded()) return nullptr; + return stage; +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +HWY_EXPORT(GetToneMappingStage); + +std::unique_ptr<RenderPipelineStage> GetToneMappingStage( + const OutputEncodingInfo& output_encoding_info) { + return HWY_DYNAMIC_DISPATCH(GetToneMappingStage)(output_encoding_info); +} + +} // namespace jxl +#endif diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_tone_mapping.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_tone_mapping.h new file mode 100644 index 0000000000..57eb9a9abf --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_tone_mapping.h @@ -0,0 +1,36 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_TONE_MAPPING_H_ +#define LIB_JXL_RENDER_PIPELINE_STAGE_TONE_MAPPING_H_ +#include <math.h> +#include <stdint.h> + +#include <algorithm> +#include <utility> +#include <vector> + +#include "lib/jxl/dec_xyb.h" +#include "lib/jxl/render_pipeline/render_pipeline_stage.h" + +namespace jxl { + +// Tone maps the image if appropriate. 
It must be in linear space and +// `output_encoding_info.luminances` must contain the luminance for the +// primaries of that space. It must also be encoded such that (1, 1, 1) +// represents `output_encoding_info.orig_intensity_target` nits, unless +// `output_encoding_info.color_encoding.tf.IsPQ()`, in which case (1, 1, 1) must +// represent 10000 nits. This corresponds to what XYBStage outputs. After this +// stage, (1, 1, 1) will represent +// `output_encoding_info.desired_intensity_target` nits, except in the PQ +// special case in which it remains 10000. +// +// If no tone mapping is necessary, this will return nullptr. +std::unique_ptr<RenderPipelineStage> GetToneMappingStage( + const OutputEncodingInfo& output_encoding_info); + +} // namespace jxl + +#endif // LIB_JXL_RENDER_PIPELINE_STAGE_TONE_MAPPING_H_ diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_upsampling.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_upsampling.cc new file mode 100644 index 0000000000..ade37d59a6 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_upsampling.cc @@ -0,0 +1,192 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/render_pipeline/stage_upsampling.h" + +#include "lib/jxl/base/status.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_upsampling.cc" +#include <hwy/foreach_target.h> +#include <hwy/highway.h> + +#include "lib/jxl/sanitizers.h" +#include "lib/jxl/simd_util-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Clamp; +using hwy::HWY_NAMESPACE::Max; +using hwy::HWY_NAMESPACE::Min; +using hwy::HWY_NAMESPACE::MulAdd; + +class UpsamplingStage : public RenderPipelineStage { + public: + explicit UpsamplingStage(const CustomTransformData& ups_factors, size_t c, + size_t shift) + : RenderPipelineStage(RenderPipelineStage::Settings::Symmetric( + /*shift=*/shift, /*border=*/2)), + c_(c) { + const float* weights = shift == 1 ? ups_factors.upsampling2_weights + : shift == 2 ? 
ups_factors.upsampling4_weights + : ups_factors.upsampling8_weights; + size_t N = 1 << (shift - 1); + for (size_t i = 0; i < 5 * N; i++) { + for (size_t j = 0; j < 5 * N; j++) { + size_t y = std::min(i, j); + size_t x = std::max(i, j); + kernel_[j / 5][i / 5][j % 5][i % 5] = + weights[5 * N * y - y * (y - 1) / 2 + x - y]; + } + } + } + + void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, + size_t xextra, size_t xsize, size_t xpos, size_t ypos, + size_t thread_id) const final { + static HWY_FULL(float) df; + size_t shift = settings_.shift_x; + size_t N = 1 << shift; + const size_t xsize_v = RoundUpTo(xsize, Lanes(df)); + for (ssize_t iy = -2; iy <= 2; iy++) { + msan::UnpoisonMemory(GetInputRow(input_rows, c_, iy) + xsize + 2, + sizeof(float) * (xsize_v - xsize)); + } + JXL_ASSERT(xextra == 0); + ssize_t x0 = 0; + ssize_t x1 = xsize; + if (N == 2) { + ProcessRowImpl<2>(input_rows, output_rows, x0, x1); + } + if (N == 4) { + ProcessRowImpl<4>(input_rows, output_rows, x0, x1); + } + if (N == 8) { + ProcessRowImpl<8>(input_rows, output_rows, x0, x1); + } + for (size_t oy = 0; oy < N; oy++) { + float* dst_row = GetOutputRow(output_rows, c_, oy); + msan::PoisonMemory(dst_row + xsize * N, + sizeof(float) * (xsize_v - xsize) * N); + } + } + + RenderPipelineChannelMode GetChannelMode(size_t c) const final { + return c == c_ ? RenderPipelineChannelMode::kInOut + : RenderPipelineChannelMode::kIgnored; + } + + const char* GetName() const override { return "Upsample"; } + + private: + template <size_t N> + JXL_INLINE float Kernel(size_t x, size_t y, ssize_t ix, ssize_t iy) const { + ix += 2; + iy += 2; + if (N == 2) { + return kernel_[0][0][y % 2 ? 4 - iy : iy][x % 2 ? 4 - ix : ix]; + } + if (N == 4) { + return kernel_[y % 4 < 2 ? y % 2 : 1 - y % 2] + [x % 4 < 2 ? x % 2 : 1 - x % 2][y % 4 < 2 ? iy : 4 - iy] + [x % 4 < 2 ? ix : 4 - ix]; + } + if (N == 8) { + return kernel_[y % 8 < 4 ? y % 4 : 3 - y % 4] + [x % 8 < 4 ? x % 4 : 3 - x % 4][y % 8 < 4 ? iy : 4 - iy] + [x % 8 < 4 ? ix : 4 - ix]; + } + JXL_UNREACHABLE("Invalid upsample"); + } + + template <ssize_t N> + void ProcessRowImpl(const RowInfo& input_rows, const RowInfo& output_rows, + ssize_t x0, ssize_t x1) const { + static HWY_FULL(float) df; + using V = hwy::HWY_NAMESPACE::Vec<HWY_FULL(float)>; + V ups0, ups1, ups2, ups3, ups4, ups5, ups6, ups7; + (void)ups2, (void)ups3, (void)ups4, (void)ups5, (void)ups6, (void)ups7; + // Once we have C++17 available, change this back to `V* ups[N]` and + // initialize using `if constexpr` below. + V* ups[8] = {}; + static_assert(N == 2 || N == 4 || N == 8, "N must be 2, 4, or 8"); + if (N >= 2) { + ups[0] = &ups0; + ups[1] = &ups1; + } + if (N >= 4) { + ups[2] = &ups2; + ups[3] = &ups3; + } + if (N == 8) { + ups[4] = &ups4; + ups[5] = &ups5; + ups[6] = &ups6; + ups[7] = &ups7; + } + + for (size_t oy = 0; oy < N; oy++) { + float* dst_row = GetOutputRow(output_rows, c_, oy); + for (ssize_t x = x0; x < x1; x += Lanes(df)) { + for (size_t ox = 0; ox < N; ox++) { + auto result = Zero(df); + auto min = LoadU(df, GetInputRow(input_rows, c_, 0) + x); + auto max = min; + for (ssize_t iy = -2; iy <= 2; iy++) { + for (ssize_t ix = -2; ix <= 2; ix++) { + auto v = LoadU(df, GetInputRow(input_rows, c_, iy) + x + ix); + result = MulAdd(Set(df, Kernel<N>(ox, oy, ix, iy)), v, result); + min = Min(v, min); + max = Max(v, max); + } + } + // Avoid overshooting. 
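+          // Clamping to the min/max of the 5x5 input window keeps the
+          // interpolated sample within the local value range.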
+ *ups[ox] = Clamp(result, min, max); + } + if (N == 2) { + StoreInterleaved(df, ups0, ups1, dst_row + x * N); + } + if (N == 4) { + StoreInterleaved(df, ups0, ups1, ups2, ups3, dst_row + x * N); + } + if (N == 8) { + StoreInterleaved(df, ups0, ups1, ups2, ups3, ups4, ups5, ups6, ups7, + dst_row + x * N); + } + } + } + } + + size_t c_; + float kernel_[4][4][5][5]; +}; + +std::unique_ptr<RenderPipelineStage> GetUpsamplingStage( + const CustomTransformData& ups_factors, size_t c, size_t shift) { + return jxl::make_unique<UpsamplingStage>(ups_factors, c, shift); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +HWY_EXPORT(GetUpsamplingStage); + +std::unique_ptr<RenderPipelineStage> GetUpsamplingStage( + const CustomTransformData& ups_factors, size_t c, size_t shift) { + JXL_ASSERT(shift != 0); + JXL_ASSERT(shift <= 3); + return HWY_DYNAMIC_DISPATCH(GetUpsamplingStage)(ups_factors, c, shift); +} + +} // namespace jxl +#endif diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_upsampling.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_upsampling.h new file mode 100644 index 0000000000..7d5defd23c --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_upsampling.h @@ -0,0 +1,26 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_UPSAMPLING_H_ +#define LIB_JXL_RENDER_PIPELINE_STAGE_UPSAMPLING_H_ +#include <math.h> +#include <stdint.h> +#include <stdio.h> + +#include <algorithm> +#include <utility> +#include <vector> + +#include "lib/jxl/image_metadata.h" +#include "lib/jxl/render_pipeline/render_pipeline_stage.h" + +namespace jxl { + +// Upsamples the given channel by the given factor. +std::unique_ptr<RenderPipelineStage> GetUpsamplingStage( + const CustomTransformData& ups_factors, size_t c, size_t shift); +} // namespace jxl + +#endif // LIB_JXL_RENDER_PIPELINE_STAGE_UPSAMPLING_H_ diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_write.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_write.cc new file mode 100644 index 0000000000..847972acc8 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_write.cc @@ -0,0 +1,671 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/render_pipeline/stage_write.h" + +#include <type_traits> + +#include "lib/jxl/alpha.h" +#include "lib/jxl/base/common.h" +#include "lib/jxl/dec_cache.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/sanitizers.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_write.cc" +#include <hwy/foreach_target.h> +#include <hwy/highway.h> + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// These templates are not found via ADL. 
+using hwy::HWY_NAMESPACE::Add; +using hwy::HWY_NAMESPACE::Clamp; +using hwy::HWY_NAMESPACE::Div; +using hwy::HWY_NAMESPACE::Max; +using hwy::HWY_NAMESPACE::Mul; +using hwy::HWY_NAMESPACE::NearestInt; +using hwy::HWY_NAMESPACE::Or; +using hwy::HWY_NAMESPACE::Rebind; +using hwy::HWY_NAMESPACE::ShiftLeftSame; +using hwy::HWY_NAMESPACE::ShiftRightSame; +using hwy::HWY_NAMESPACE::VFromD; + +// 8x8 ordered dithering pattern from +// https://en.wikipedia.org/wiki/Ordered_dithering +// scaled to have an average of 0 and be fully contained in (-0.5, 0.5). +// Matrix is duplicated in width to avoid inconsistencies or out-of-bound-reads +// if doing unaligned operations. +const float kDither[(2 * 8) * 8] = { + -0.4921875, 0.0078125, -0.3671875, 0.1328125, // + -0.4609375, 0.0390625, -0.3359375, 0.1640625, // + -0.4921875, 0.0078125, -0.3671875, 0.1328125, // + -0.4609375, 0.0390625, -0.3359375, 0.1640625, // + // + 0.2578125, -0.2421875, 0.3828125, -0.1171875, // + 0.2890625, -0.2109375, 0.4140625, -0.0859375, // + 0.2578125, -0.2421875, 0.3828125, -0.1171875, // + 0.2890625, -0.2109375, 0.4140625, -0.0859375, // + // + -0.3046875, 0.1953125, -0.4296875, 0.0703125, // + -0.2734375, 0.2265625, -0.3984375, 0.1015625, // + -0.3046875, 0.1953125, -0.4296875, 0.0703125, // + -0.2734375, 0.2265625, -0.3984375, 0.1015625, // + // + 0.4453125, -0.0546875, 0.3203125, -0.1796875, // + 0.4765625, -0.0234375, 0.3515625, -0.1484375, // + 0.4453125, -0.0546875, 0.3203125, -0.1796875, // + 0.4765625, -0.0234375, 0.3515625, -0.1484375, // + // + -0.4453125, 0.0546875, -0.3203125, 0.1796875, // + -0.4765625, 0.0234375, -0.3515625, 0.1484375, // + -0.4453125, 0.0546875, -0.3203125, 0.1796875, // + -0.4765625, 0.0234375, -0.3515625, 0.1484375, // + // + 0.3046875, -0.1953125, 0.4296875, -0.0703125, // + 0.2734375, -0.2265625, 0.3984375, -0.1015625, // + 0.3046875, -0.1953125, 0.4296875, -0.0703125, // + 0.2734375, -0.2265625, 0.3984375, -0.1015625, // + // + -0.2578125, 0.2421875, -0.3828125, 0.1171875, // + -0.2890625, 0.2109375, -0.4140625, 0.0859375, // + -0.2578125, 0.2421875, -0.3828125, 0.1171875, // + -0.2890625, 0.2109375, -0.4140625, 0.0859375, // + // + 0.4921875, -0.0078125, 0.3671875, -0.1328125, // + 0.4609375, -0.0390625, 0.3359375, -0.1640625, // + 0.4921875, -0.0078125, 0.3671875, -0.1328125, // + 0.4609375, -0.0390625, 0.3359375, -0.1640625, // +}; + +using DF = HWY_FULL(float); + +// Converts `v` to an appropriate value for the given unsigned type. +// If the unsigned type is an 8-bit type, performs ordered dithering. 
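+// For example, with T = uint8_t and bits_per_sample = 8 the caller passes
+// mul = 255: the pixel at (x0, y0) = (1, 0) picks kDither[0 * 16 + 1] =
+// 0.0078125, so an input of 0.5 is stored as round(0.5 * 255 + 0.0078125) =
+// 128, while the pixel at (0, 0) (dither -0.4921875) is stored as 127.
+// (Illustrative values only; the computation below is done per vector lane.)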
+template <typename T> +VFromD<Rebind<T, DF>> MakeUnsigned(VFromD<DF> v, size_t x0, size_t y0, + VFromD<DF> mul) { + static_assert(std::is_unsigned<T>::value, "T must be an unsigned type"); + using DU = Rebind<T, DF>; + v = Mul(v, mul); + // TODO(veluca): if constexpr with C++17 + if (sizeof(T) == 1) { + size_t pos = (y0 % 8) * (2 * 8) + (x0 % 8); +#if HWY_TARGET != HWY_SCALAR + auto dither = LoadDup128(DF(), kDither + pos); +#else + auto dither = LoadU(DF(), kDither + pos); +#endif + v = Add(v, dither); + } + v = Clamp(Zero(DF()), v, mul); + return DemoteTo(DU(), NearestInt(v)); +} + +class WriteToOutputStage : public RenderPipelineStage { + public: + WriteToOutputStage(const ImageOutput& main_output, size_t width, + size_t height, bool has_alpha, bool unpremul_alpha, + size_t alpha_c, Orientation undo_orientation, + const std::vector<ImageOutput>& extra_output) + : RenderPipelineStage(RenderPipelineStage::Settings()), + width_(width), + height_(height), + main_(main_output), + num_color_(main_.num_channels_ < 3 ? 1 : 3), + want_alpha_(main_.num_channels_ == 2 || main_.num_channels_ == 4), + has_alpha_(has_alpha), + unpremul_alpha_(unpremul_alpha), + alpha_c_(alpha_c), + flip_x_(ShouldFlipX(undo_orientation)), + flip_y_(ShouldFlipY(undo_orientation)), + transpose_(ShouldTranspose(undo_orientation)), + opaque_alpha_(kMaxPixelsPerCall, 1.0f) { + for (size_t ec = 0; ec < extra_output.size(); ++ec) { + if (extra_output[ec].callback.IsPresent() || extra_output[ec].buffer) { + Output extra(extra_output[ec]); + extra.channel_index_ = 3 + ec; + extra_channels_.push_back(extra); + } + } + } + + WriteToOutputStage(const WriteToOutputStage&) = delete; + WriteToOutputStage& operator=(const WriteToOutputStage&) = delete; + WriteToOutputStage(WriteToOutputStage&&) = delete; + WriteToOutputStage& operator=(WriteToOutputStage&&) = delete; + + ~WriteToOutputStage() override { + if (main_.run_opaque_) { + main_.pixel_callback_.destroy(main_.run_opaque_); + } + for (auto& extra : extra_channels_) { + if (extra.run_opaque_) { + extra.pixel_callback_.destroy(extra.run_opaque_); + } + } + } + + void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, + size_t xextra, size_t xsize, size_t xpos, size_t ypos, + size_t thread_id) const final { + JXL_DASSERT(xextra == 0); + JXL_DASSERT(main_.run_opaque_ || main_.buffer_); + if (ypos >= height_) return; + if (xpos >= width_) return; + if (flip_y_) { + ypos = height_ - 1u - ypos; + } + size_t limit = std::min(xsize, width_ - xpos); + for (size_t x0 = 0; x0 < limit; x0 += kMaxPixelsPerCall) { + size_t xstart = xpos + x0; + size_t len = std::min<size_t>(kMaxPixelsPerCall, limit - x0); + + const float* line_buffers[4]; + for (size_t c = 0; c < num_color_; c++) { + line_buffers[c] = GetInputRow(input_rows, c, 0) + x0; + } + if (has_alpha_) { + line_buffers[num_color_] = GetInputRow(input_rows, alpha_c_, 0) + x0; + } else { + // opaque_alpha_ is a way to set all values to 1.0f. 
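+        // This constant, fully opaque row means that when the requested
+        // output format includes an alpha channel the image does not have,
+        // the packing code below still has a valid alpha pointer to read.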
+ line_buffers[num_color_] = opaque_alpha_.data(); + } + if (has_alpha_ && want_alpha_ && unpremul_alpha_) { + UnpremulAlpha(thread_id, len, line_buffers); + } + OutputBuffers(main_, thread_id, ypos, xstart, len, line_buffers); + for (const auto& extra : extra_channels_) { + line_buffers[0] = GetInputRow(input_rows, extra.channel_index_, 0) + x0; + OutputBuffers(extra, thread_id, ypos, xstart, len, line_buffers); + } + } + } + + RenderPipelineChannelMode GetChannelMode(size_t c) const final { + if (c < num_color_ || (has_alpha_ && c == alpha_c_)) { + return RenderPipelineChannelMode::kInput; + } + for (const auto& extra : extra_channels_) { + if (c == extra.channel_index_) { + return RenderPipelineChannelMode::kInput; + } + } + return RenderPipelineChannelMode::kIgnored; + } + + const char* GetName() const override { return "WritePixelCB"; } + + private: + struct Output { + Output(const ImageOutput& image_out) + : pixel_callback_(image_out.callback), + buffer_(image_out.buffer), + buffer_size_(image_out.buffer_size), + stride_(image_out.stride), + num_channels_(image_out.format.num_channels), + swap_endianness_(SwapEndianness(image_out.format.endianness)), + data_type_(image_out.format.data_type), + bits_per_sample_(image_out.bits_per_sample) {} + + Status PrepareForThreads(size_t num_threads) { + if (pixel_callback_.IsPresent()) { + run_opaque_ = + pixel_callback_.Init(num_threads, /*num_pixels=*/kMaxPixelsPerCall); + JXL_RETURN_IF_ERROR(run_opaque_ != nullptr); + } else { + JXL_RETURN_IF_ERROR(buffer_ != nullptr); + } + return true; + } + + PixelCallback pixel_callback_; + void* run_opaque_ = nullptr; + void* buffer_ = nullptr; + size_t buffer_size_; + size_t stride_; + size_t num_channels_; + bool swap_endianness_; + JxlDataType data_type_; + size_t bits_per_sample_; + size_t channel_index_; // used for extra_channels + }; + + Status PrepareForThreads(size_t num_threads) override { + JXL_RETURN_IF_ERROR(main_.PrepareForThreads(num_threads)); + for (auto& extra : extra_channels_) { + JXL_RETURN_IF_ERROR(extra.PrepareForThreads(num_threads)); + } + temp_out_.resize(num_threads); + for (CacheAlignedUniquePtr& temp : temp_out_) { + temp = AllocateArray(sizeof(float) * kMaxPixelsPerCall * + main_.num_channels_); + } + if ((has_alpha_ && want_alpha_ && unpremul_alpha_) || flip_x_) { + temp_in_.resize(num_threads * main_.num_channels_); + for (CacheAlignedUniquePtr& temp : temp_in_) { + temp = AllocateArray(sizeof(float) * kMaxPixelsPerCall); + } + } + return true; + } + static bool ShouldFlipX(Orientation undo_orientation) { + return (undo_orientation == Orientation::kFlipHorizontal || + undo_orientation == Orientation::kRotate180 || + undo_orientation == Orientation::kRotate270 || + undo_orientation == Orientation::kAntiTranspose); + } + static bool ShouldFlipY(Orientation undo_orientation) { + return (undo_orientation == Orientation::kFlipVertical || + undo_orientation == Orientation::kRotate180 || + undo_orientation == Orientation::kRotate90 || + undo_orientation == Orientation::kAntiTranspose); + } + static bool ShouldTranspose(Orientation undo_orientation) { + return (undo_orientation == Orientation::kTranspose || + undo_orientation == Orientation::kRotate90 || + undo_orientation == Orientation::kRotate270 || + undo_orientation == Orientation::kAntiTranspose); + } + + void UnpremulAlpha(size_t thread_id, size_t len, + const float** line_buffers) const { + const HWY_FULL(float) d; + auto one = Set(d, 1.0f); + float* temp_in[4]; + for (size_t c = 0; c < main_.num_channels_; ++c) { + size_t 
tix = thread_id * main_.num_channels_ + c; + temp_in[c] = reinterpret_cast<float*>(temp_in_[tix].get()); + memcpy(temp_in[c], line_buffers[c], sizeof(float) * len); + } + auto small_alpha = Set(d, kSmallAlpha); + for (size_t ix = 0; ix < len; ix += Lanes(d)) { + auto alpha = LoadU(d, temp_in[num_color_] + ix); + auto mul = Div(one, Max(small_alpha, alpha)); + for (size_t c = 0; c < num_color_; ++c) { + auto val = LoadU(d, temp_in[c] + ix); + StoreU(Mul(val, mul), d, temp_in[c] + ix); + } + } + for (size_t c = 0; c < main_.num_channels_; ++c) { + line_buffers[c] = temp_in[c]; + } + } + + void OutputBuffers(const Output& out, size_t thread_id, size_t ypos, + size_t xstart, size_t len, const float* input[4]) const { + if (flip_x_) { + FlipX(out, thread_id, len, &xstart, input); + } + if (out.data_type_ == JXL_TYPE_UINT8) { + uint8_t* JXL_RESTRICT temp = + reinterpret_cast<uint8_t*>(temp_out_[thread_id].get()); + StoreUnsignedRow(out, input, len, temp, xstart, ypos); + WriteToOutput(out, thread_id, ypos, xstart, len, temp); + } else if (out.data_type_ == JXL_TYPE_UINT16 || + out.data_type_ == JXL_TYPE_FLOAT16) { + uint16_t* JXL_RESTRICT temp = + reinterpret_cast<uint16_t*>(temp_out_[thread_id].get()); + if (out.data_type_ == JXL_TYPE_UINT16) { + StoreUnsignedRow(out, input, len, temp, xstart, ypos); + } else { + StoreFloat16Row(out, input, len, temp); + } + if (out.swap_endianness_) { + const HWY_FULL(uint16_t) du; + size_t output_len = len * out.num_channels_; + for (size_t j = 0; j < output_len; j += Lanes(du)) { + auto v = LoadU(du, temp + j); + auto vswap = Or(ShiftRightSame(v, 8), ShiftLeftSame(v, 8)); + StoreU(vswap, du, temp + j); + } + } + WriteToOutput(out, thread_id, ypos, xstart, len, temp); + } else if (out.data_type_ == JXL_TYPE_FLOAT) { + float* JXL_RESTRICT temp = + reinterpret_cast<float*>(temp_out_[thread_id].get()); + StoreFloatRow(out, input, len, temp); + if (out.swap_endianness_) { + size_t output_len = len * out.num_channels_; + for (size_t j = 0; j < output_len; ++j) { + temp[j] = BSwapFloat(temp[j]); + } + } + WriteToOutput(out, thread_id, ypos, xstart, len, temp); + } + } + + void FlipX(const Output& out, size_t thread_id, size_t len, size_t* xstart, + const float** line_buffers) const { + float* temp_in[4]; + for (size_t c = 0; c < out.num_channels_; ++c) { + size_t tix = thread_id * main_.num_channels_ + c; + temp_in[c] = reinterpret_cast<float*>(temp_in_[tix].get()); + if (temp_in[c] != line_buffers[c]) { + memcpy(temp_in[c], line_buffers[c], sizeof(float) * len); + } + } + size_t last = (len - 1u); + size_t num = (len / 2); + for (size_t i = 0; i < num; ++i) { + for (size_t c = 0; c < out.num_channels_; ++c) { + std::swap(temp_in[c][i], temp_in[c][last - i]); + } + } + for (size_t c = 0; c < out.num_channels_; ++c) { + line_buffers[c] = temp_in[c]; + } + *xstart = width_ - *xstart - len; + } + + template <typename T> + void StoreUnsignedRow(const Output& out, const float* input[4], size_t len, + T* output, size_t xstart, size_t ypos) const { + const HWY_FULL(float) d; + auto mul = Set(d, (1u << (out.bits_per_sample_)) - 1); + const Rebind<T, decltype(d)> du; + const size_t padding = RoundUpTo(len, Lanes(d)) - len; + for (size_t c = 0; c < out.num_channels_; ++c) { + msan::UnpoisonMemory(input[c] + len, sizeof(input[c][0]) * padding); + } + if (out.num_channels_ == 1) { + for (size_t i = 0; i < len; i += Lanes(d)) { + StoreU(MakeUnsigned<T>(LoadU(d, &input[0][i]), xstart + i, ypos, mul), + du, &output[i]); + } + } else if (out.num_channels_ == 2) { + for (size_t i = 
0; i < len; i += Lanes(d)) { + StoreInterleaved2( + MakeUnsigned<T>(LoadU(d, &input[0][i]), xstart + i, ypos, mul), + MakeUnsigned<T>(LoadU(d, &input[1][i]), xstart + i, ypos, mul), du, + &output[2 * i]); + } + } else if (out.num_channels_ == 3) { + for (size_t i = 0; i < len; i += Lanes(d)) { + StoreInterleaved3( + MakeUnsigned<T>(LoadU(d, &input[0][i]), xstart + i, ypos, mul), + MakeUnsigned<T>(LoadU(d, &input[1][i]), xstart + i, ypos, mul), + MakeUnsigned<T>(LoadU(d, &input[2][i]), xstart + i, ypos, mul), du, + &output[3 * i]); + } + } else if (out.num_channels_ == 4) { + for (size_t i = 0; i < len; i += Lanes(d)) { + StoreInterleaved4( + MakeUnsigned<T>(LoadU(d, &input[0][i]), xstart + i, ypos, mul), + MakeUnsigned<T>(LoadU(d, &input[1][i]), xstart + i, ypos, mul), + MakeUnsigned<T>(LoadU(d, &input[2][i]), xstart + i, ypos, mul), + MakeUnsigned<T>(LoadU(d, &input[3][i]), xstart + i, ypos, mul), du, + &output[4 * i]); + } + } + msan::PoisonMemory(output + out.num_channels_ * len, + sizeof(output[0]) * out.num_channels_ * padding); + } + + void StoreFloat16Row(const Output& out, const float* input[4], size_t len, + uint16_t* output) const { + const HWY_FULL(float) d; + const Rebind<uint16_t, decltype(d)> du; + const Rebind<hwy::float16_t, decltype(d)> df16; + const size_t padding = RoundUpTo(len, Lanes(d)) - len; + for (size_t c = 0; c < out.num_channels_; ++c) { + msan::UnpoisonMemory(input[c] + len, sizeof(input[c][0]) * padding); + } + if (out.num_channels_ == 1) { + for (size_t i = 0; i < len; i += Lanes(d)) { + auto v0 = LoadU(d, &input[0][i]); + StoreU(BitCast(du, DemoteTo(df16, v0)), du, &output[i]); + } + } else if (out.num_channels_ == 2) { + for (size_t i = 0; i < len; i += Lanes(d)) { + auto v0 = LoadU(d, &input[0][i]); + auto v1 = LoadU(d, &input[1][i]); + StoreInterleaved2(BitCast(du, DemoteTo(df16, v0)), + BitCast(du, DemoteTo(df16, v1)), du, &output[2 * i]); + } + } else if (out.num_channels_ == 3) { + for (size_t i = 0; i < len; i += Lanes(d)) { + auto v0 = LoadU(d, &input[0][i]); + auto v1 = LoadU(d, &input[1][i]); + auto v2 = LoadU(d, &input[2][i]); + StoreInterleaved3(BitCast(du, DemoteTo(df16, v0)), + BitCast(du, DemoteTo(df16, v1)), + BitCast(du, DemoteTo(df16, v2)), du, &output[3 * i]); + } + } else if (out.num_channels_ == 4) { + for (size_t i = 0; i < len; i += Lanes(d)) { + auto v0 = LoadU(d, &input[0][i]); + auto v1 = LoadU(d, &input[1][i]); + auto v2 = LoadU(d, &input[2][i]); + auto v3 = LoadU(d, &input[3][i]); + StoreInterleaved4(BitCast(du, DemoteTo(df16, v0)), + BitCast(du, DemoteTo(df16, v1)), + BitCast(du, DemoteTo(df16, v2)), + BitCast(du, DemoteTo(df16, v3)), du, &output[4 * i]); + } + } + msan::PoisonMemory(output + out.num_channels_ * len, + sizeof(output[0]) * out.num_channels_ * padding); + } + + void StoreFloatRow(const Output& out, const float* input[4], size_t len, + float* output) const { + const HWY_FULL(float) d; + if (out.num_channels_ == 1) { + memcpy(output, input[0], len * sizeof(output[0])); + } else if (out.num_channels_ == 2) { + for (size_t i = 0; i < len; i += Lanes(d)) { + StoreInterleaved2(LoadU(d, &input[0][i]), LoadU(d, &input[1][i]), d, + &output[2 * i]); + } + } else if (out.num_channels_ == 3) { + for (size_t i = 0; i < len; i += Lanes(d)) { + StoreInterleaved3(LoadU(d, &input[0][i]), LoadU(d, &input[1][i]), + LoadU(d, &input[2][i]), d, &output[3 * i]); + } + } else { + for (size_t i = 0; i < len; i += Lanes(d)) { + StoreInterleaved4(LoadU(d, &input[0][i]), LoadU(d, &input[1][i]), + LoadU(d, &input[2][i]), LoadU(d, 
&input[3][i]), d, + &output[4 * i]); + } + } + } + + template <typename T> + void WriteToOutput(const Output& out, size_t thread_id, size_t ypos, + size_t xstart, size_t len, T* output) const { + if (transpose_) { + // TODO(szabadka) Buffer 8x8 chunks and transpose with SIMD. + if (out.run_opaque_) { + for (size_t i = 0, j = 0; i < len; ++i, j += out.num_channels_) { + out.pixel_callback_.run(out.run_opaque_, thread_id, ypos, xstart + i, + 1, output + j); + } + } else { + const size_t pixel_stride = out.num_channels_ * sizeof(T); + const size_t offset = xstart * out.stride_ + ypos * pixel_stride; + for (size_t i = 0, j = 0; i < len; ++i, j += out.num_channels_) { + const size_t ix = offset + i * out.stride_; + JXL_DASSERT(ix + pixel_stride <= out.buffer_size_); + memcpy(reinterpret_cast<uint8_t*>(out.buffer_) + ix, output + j, + pixel_stride); + } + } + } else { + if (out.run_opaque_) { + out.pixel_callback_.run(out.run_opaque_, thread_id, xstart, ypos, len, + output); + } else { + const size_t pixel_stride = out.num_channels_ * sizeof(T); + const size_t offset = ypos * out.stride_ + xstart * pixel_stride; + JXL_DASSERT(offset + len * pixel_stride <= out.buffer_size_); + memcpy(reinterpret_cast<uint8_t*>(out.buffer_) + offset, output, + len * pixel_stride); + } + } + } + + static constexpr size_t kMaxPixelsPerCall = 1024; + size_t width_; + size_t height_; + Output main_; // color + alpha + size_t num_color_; + bool want_alpha_; + bool has_alpha_; + bool unpremul_alpha_; + size_t alpha_c_; + bool flip_x_; + bool flip_y_; + bool transpose_; + std::vector<Output> extra_channels_; + std::vector<float> opaque_alpha_; + std::vector<CacheAlignedUniquePtr> temp_in_; + std::vector<CacheAlignedUniquePtr> temp_out_; +}; + +constexpr size_t WriteToOutputStage::kMaxPixelsPerCall; + +std::unique_ptr<RenderPipelineStage> GetWriteToOutputStage( + const ImageOutput& main_output, size_t width, size_t height, bool has_alpha, + bool unpremul_alpha, size_t alpha_c, Orientation undo_orientation, + std::vector<ImageOutput>& extra_output) { + return jxl::make_unique<WriteToOutputStage>( + main_output, width, height, has_alpha, unpremul_alpha, alpha_c, + undo_orientation, extra_output); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace jxl { + +HWY_EXPORT(GetWriteToOutputStage); + +namespace { +class WriteToImageBundleStage : public RenderPipelineStage { + public: + explicit WriteToImageBundleStage(ImageBundle* image_bundle, + ColorEncoding color_encoding) + : RenderPipelineStage(RenderPipelineStage::Settings()), + image_bundle_(image_bundle), + color_encoding_(std::move(color_encoding)) {} + + void SetInputSizes( + const std::vector<std::pair<size_t, size_t>>& input_sizes) override { +#if JXL_ENABLE_ASSERT + JXL_ASSERT(input_sizes.size() >= 3); + for (size_t c = 1; c < input_sizes.size(); c++) { + JXL_ASSERT(input_sizes[c].first == input_sizes[0].first); + JXL_ASSERT(input_sizes[c].second == input_sizes[0].second); + } +#endif + // TODO(eustas): what should we do in the case of "want only ECs"? + image_bundle_->SetFromImage( + Image3F(input_sizes[0].first, input_sizes[0].second), color_encoding_); + // TODO(veluca): consider not reallocating ECs if not needed. 
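+    // For now the extra-channel images are simply dropped and re-created at
+    // the sizes reported by the pipeline; ProcessRow below copies one row per
+    // extra channel into them.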
+ image_bundle_->extra_channels().clear(); + for (size_t c = 3; c < input_sizes.size(); c++) { + image_bundle_->extra_channels().emplace_back(input_sizes[c].first, + input_sizes[c].second); + } + } + + void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, + size_t xextra, size_t xsize, size_t xpos, size_t ypos, + size_t thread_id) const final { + for (size_t c = 0; c < 3; c++) { + memcpy(image_bundle_->color()->PlaneRow(c, ypos) + xpos - xextra, + GetInputRow(input_rows, c, 0) - xextra, + sizeof(float) * (xsize + 2 * xextra)); + } + for (size_t ec = 0; ec < image_bundle_->extra_channels().size(); ec++) { + JXL_ASSERT(image_bundle_->extra_channels()[ec].xsize() >= + xpos + xsize + xextra); + memcpy(image_bundle_->extra_channels()[ec].Row(ypos) + xpos - xextra, + GetInputRow(input_rows, 3 + ec, 0) - xextra, + sizeof(float) * (xsize + 2 * xextra)); + } + } + + RenderPipelineChannelMode GetChannelMode(size_t c) const final { + return RenderPipelineChannelMode::kInput; + } + + const char* GetName() const override { return "WriteIB"; } + + private: + ImageBundle* image_bundle_; + ColorEncoding color_encoding_; +}; + +class WriteToImage3FStage : public RenderPipelineStage { + public: + explicit WriteToImage3FStage(Image3F* image) + : RenderPipelineStage(RenderPipelineStage::Settings()), image_(image) {} + + void SetInputSizes( + const std::vector<std::pair<size_t, size_t>>& input_sizes) override { +#if JXL_ENABLE_ASSERT + JXL_ASSERT(input_sizes.size() >= 3); + for (size_t c = 1; c < 3; ++c) { + JXL_ASSERT(input_sizes[c].first == input_sizes[0].first); + JXL_ASSERT(input_sizes[c].second == input_sizes[0].second); + } +#endif + *image_ = Image3F(input_sizes[0].first, input_sizes[0].second); + } + + void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, + size_t xextra, size_t xsize, size_t xpos, size_t ypos, + size_t thread_id) const final { + for (size_t c = 0; c < 3; c++) { + memcpy(image_->PlaneRow(c, ypos) + xpos - xextra, + GetInputRow(input_rows, c, 0) - xextra, + sizeof(float) * (xsize + 2 * xextra)); + } + } + + RenderPipelineChannelMode GetChannelMode(size_t c) const final { + return c < 3 ? RenderPipelineChannelMode::kInput + : RenderPipelineChannelMode::kIgnored; + } + + const char* GetName() const override { return "WriteI3F"; } + + private: + Image3F* image_; +}; + +} // namespace + +std::unique_ptr<RenderPipelineStage> GetWriteToImageBundleStage( + ImageBundle* image_bundle, ColorEncoding color_encoding) { + return jxl::make_unique<WriteToImageBundleStage>(image_bundle, + std::move(color_encoding)); +} + +std::unique_ptr<RenderPipelineStage> GetWriteToImage3FStage(Image3F* image) { + return jxl::make_unique<WriteToImage3FStage>(image); +} + +std::unique_ptr<RenderPipelineStage> GetWriteToOutputStage( + const ImageOutput& main_output, size_t width, size_t height, bool has_alpha, + bool unpremul_alpha, size_t alpha_c, Orientation undo_orientation, + std::vector<ImageOutput>& extra_output) { + return HWY_DYNAMIC_DISPATCH(GetWriteToOutputStage)( + main_output, width, height, has_alpha, unpremul_alpha, alpha_c, + undo_orientation, extra_output); +} + +} // namespace jxl + +#endif diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_write.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_write.h new file mode 100644 index 0000000000..c5f844ebe8 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_write.h @@ -0,0 +1,31 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. 
+// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_WRITE_H_ +#define LIB_JXL_RENDER_PIPELINE_STAGE_WRITE_H_ + +#include <functional> + +#include "lib/jxl/dec_cache.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/render_pipeline/render_pipeline_stage.h" + +namespace jxl { + +std::unique_ptr<RenderPipelineStage> GetWriteToImageBundleStage( + ImageBundle* image_bundle, ColorEncoding color_encoding); + +// Gets a stage to write color channels to an Image3F. +std::unique_ptr<RenderPipelineStage> GetWriteToImage3FStage(Image3F* image); + +// Gets a stage to write to a pixel callback or image buffer. +std::unique_ptr<RenderPipelineStage> GetWriteToOutputStage( + const ImageOutput& main_output, size_t width, size_t height, bool has_alpha, + bool unpremul_alpha, size_t alpha_c, Orientation undo_orientation, + std::vector<ImageOutput>& extra_output); + +} // namespace jxl + +#endif // LIB_JXL_RENDER_PIPELINE_STAGE_WRITE_H_ diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_xyb.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_xyb.cc new file mode 100644 index 0000000000..56e86e6095 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_xyb.cc @@ -0,0 +1,178 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/render_pipeline/stage_xyb.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_xyb.cc" +#include <hwy/foreach_target.h> +#include <hwy/highway.h> + +#include "lib/jxl/base/common.h" +#include "lib/jxl/cms/opsin_params.h" +#include "lib/jxl/common.h" // JXL_HIGH_PRECISION +#include "lib/jxl/dec_xyb-inl.h" +#include "lib/jxl/sanitizers.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +class XYBStage : public RenderPipelineStage { + public: + explicit XYBStage(const OutputEncodingInfo& output_encoding_info) + : RenderPipelineStage(RenderPipelineStage::Settings()), + opsin_params_(output_encoding_info.opsin_params), + output_is_xyb_(output_encoding_info.color_encoding.GetColorSpace() == + ColorSpace::kXYB) {} + + void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, + size_t xextra, size_t xsize, size_t xpos, size_t ypos, + size_t thread_id) const final { + const HWY_FULL(float) d; + JXL_ASSERT(xextra == 0); + const size_t xsize_v = RoundUpTo(xsize, Lanes(d)); + float* JXL_RESTRICT row0 = GetInputRow(input_rows, 0, 0); + float* JXL_RESTRICT row1 = GetInputRow(input_rows, 1, 0); + float* JXL_RESTRICT row2 = GetInputRow(input_rows, 2, 0); + // All calculations are lane-wise, still some might require + // value-dependent behaviour (e.g. NearestInt). Temporary unpoison last + // vector tail. + msan::UnpoisonMemory(row0 + xsize, sizeof(float) * (xsize_v - xsize)); + msan::UnpoisonMemory(row1 + xsize, sizeof(float) * (xsize_v - xsize)); + msan::UnpoisonMemory(row2 + xsize, sizeof(float) * (xsize_v - xsize)); + // TODO(eustas): when using frame origin, addresses might be unaligned; + // making them aligned will void performance penalty. 
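+    // When the output color space is itself XYB, no inverse transform is
+    // needed: each channel is only shifted and rescaled to its nominal range
+    // (with the third channel stored as B - Y). Otherwise the full
+    // XYB -> linear RGB conversion (XybToRgb) is applied lane by lane.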
+ if (output_is_xyb_) { + const auto scale_x = Set(d, jxl::cms::kScaledXYBScale[0]); + const auto scale_y = Set(d, jxl::cms::kScaledXYBScale[1]); + const auto scale_bmy = Set(d, jxl::cms::kScaledXYBScale[2]); + const auto offset_x = Set(d, jxl::cms::kScaledXYBOffset[0]); + const auto offset_y = Set(d, jxl::cms::kScaledXYBOffset[1]); + const auto offset_bmy = Set(d, jxl::cms::kScaledXYBOffset[2]); + for (ssize_t x = -xextra; x < (ssize_t)(xsize + xextra); x += Lanes(d)) { + const auto in_x = LoadU(d, row0 + x); + const auto in_y = LoadU(d, row1 + x); + const auto in_b = LoadU(d, row2 + x); + auto out_x = Mul(Add(in_x, offset_x), scale_x); + auto out_y = Mul(Add(in_y, offset_y), scale_y); + auto out_b = Mul(Add(Sub(in_b, in_y), offset_bmy), scale_bmy); + StoreU(out_x, d, row0 + x); + StoreU(out_y, d, row1 + x); + StoreU(out_b, d, row2 + x); + } + } else { + for (ssize_t x = -xextra; x < (ssize_t)(xsize + xextra); x += Lanes(d)) { + const auto in_opsin_x = LoadU(d, row0 + x); + const auto in_opsin_y = LoadU(d, row1 + x); + const auto in_opsin_b = LoadU(d, row2 + x); + auto r = Undefined(d); + auto g = Undefined(d); + auto b = Undefined(d); + XybToRgb(d, in_opsin_x, in_opsin_y, in_opsin_b, opsin_params_, &r, &g, + &b); + StoreU(r, d, row0 + x); + StoreU(g, d, row1 + x); + StoreU(b, d, row2 + x); + } + } + msan::PoisonMemory(row0 + xsize, sizeof(float) * (xsize_v - xsize)); + msan::PoisonMemory(row1 + xsize, sizeof(float) * (xsize_v - xsize)); + msan::PoisonMemory(row2 + xsize, sizeof(float) * (xsize_v - xsize)); + } + + RenderPipelineChannelMode GetChannelMode(size_t c) const final { + return c < 3 ? RenderPipelineChannelMode::kInPlace + : RenderPipelineChannelMode::kIgnored; + } + + const char* GetName() const override { return "XYB"; } + + private: + const OpsinParams opsin_params_; + const bool output_is_xyb_; +}; + +std::unique_ptr<RenderPipelineStage> GetXYBStage( + const OutputEncodingInfo& output_encoding_info) { + return jxl::make_unique<XYBStage>(output_encoding_info); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +HWY_EXPORT(GetXYBStage); + +std::unique_ptr<RenderPipelineStage> GetXYBStage( + const OutputEncodingInfo& output_encoding_info) { + return HWY_DYNAMIC_DISPATCH(GetXYBStage)(output_encoding_info); +} + +#if !JXL_HIGH_PRECISION +namespace { +class FastXYBStage : public RenderPipelineStage { + public: + FastXYBStage(uint8_t* rgb, size_t stride, size_t width, size_t height, + bool rgba, bool has_alpha, size_t alpha_c) + : RenderPipelineStage(RenderPipelineStage::Settings()), + rgb_(rgb), + stride_(stride), + width_(width), + height_(height), + rgba_(rgba), + has_alpha_(has_alpha), + alpha_c_(alpha_c) {} + + void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, + size_t xextra, size_t xsize, size_t xpos, size_t ypos, + size_t thread_id) const final { + if (ypos >= height_) return; + JXL_ASSERT(xextra == 0); + const float* xyba[4] = { + GetInputRow(input_rows, 0, 0), GetInputRow(input_rows, 1, 0), + GetInputRow(input_rows, 2, 0), + has_alpha_ ? GetInputRow(input_rows, alpha_c_, 0) : nullptr}; + uint8_t* out_buf = rgb_ + stride_ * ypos + (rgba_ ? 4 : 3) * xpos; + FastXYBTosRGB8(xyba, out_buf, rgba_, + xsize + xpos <= width_ ? xsize : width_ - xpos); + } + + RenderPipelineChannelMode GetChannelMode(size_t c) const final { + return c < 3 || (has_alpha_ && c == alpha_c_) + ? 
RenderPipelineChannelMode::kInput + : RenderPipelineChannelMode::kIgnored; + } + + const char* GetName() const override { return "FastXYB"; } + + private: + uint8_t* rgb_; + size_t stride_; + size_t width_; + size_t height_; + bool rgba_; + bool has_alpha_; + size_t alpha_c_; + std::vector<float> opaque_alpha_; +}; + +} // namespace + +std::unique_ptr<RenderPipelineStage> GetFastXYBTosRGB8Stage( + uint8_t* rgb, size_t stride, size_t width, size_t height, bool rgba, + bool has_alpha, size_t alpha_c) { + JXL_ASSERT(HasFastXYBTosRGB8()); + return make_unique<FastXYBStage>(rgb, stride, width, height, rgba, has_alpha, + alpha_c); +} +#endif + +} // namespace jxl +#endif diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_xyb.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_xyb.h new file mode 100644 index 0000000000..7b06345c36 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_xyb.h @@ -0,0 +1,26 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_XYB_H_ +#define LIB_JXL_RENDER_PIPELINE_STAGE_XYB_H_ +#include <stdint.h> + +#include "lib/jxl/dec_xyb.h" +#include "lib/jxl/render_pipeline/render_pipeline_stage.h" + +namespace jxl { + +// Converts the color channels from XYB to linear with appropriate primaries. +std::unique_ptr<RenderPipelineStage> GetXYBStage( + const OutputEncodingInfo& output_encoding_info); + +// Gets a stage to convert with fixed point arithmetic from XYB to sRGB8 and +// write to a uint8 buffer. +std::unique_ptr<RenderPipelineStage> GetFastXYBTosRGB8Stage( + uint8_t* rgb, size_t stride, size_t width, size_t height, bool rgba, + bool has_alpha, size_t alpha_c); +} // namespace jxl + +#endif // LIB_JXL_RENDER_PIPELINE_STAGE_XYB_H_ diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_ycbcr.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_ycbcr.cc new file mode 100644 index 0000000000..30ad327221 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_ycbcr.cc @@ -0,0 +1,83 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/render_pipeline/stage_ycbcr.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_ycbcr.cc" +#include <hwy/foreach_target.h> +#include <hwy/highway.h> + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// These templates are not found via ADL. 
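+// The stage below applies the inverse of the full-range BT.601 (JFIF)
+// transform: R = Y + 1.402 * Cr, B = Y + 1.772 * Cb and
+// G = Y - (0.114 * 1.772 / 0.587) * Cb - (0.299 * 1.402 / 0.587) * Cr,
+// where the decoded channel 1 stores Y level-shifted by -128/255 (the stage
+// adds it back) and channels 0 and 2 store Cb and Cr centered at zero.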
+using hwy::HWY_NAMESPACE::Add; +using hwy::HWY_NAMESPACE::MulAdd; + +class kYCbCrStage : public RenderPipelineStage { + public: + kYCbCrStage() : RenderPipelineStage(RenderPipelineStage::Settings()) {} + + void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, + size_t xextra, size_t xsize, size_t xpos, size_t ypos, + size_t thread_id) const final { + const HWY_FULL(float) df; + + // Full-range BT.601 as defined by JFIF Clause 7: + // https://www.itu.int/rec/T-REC-T.871-201105-I/en + const auto c128 = Set(df, 128.0f / 255); + const auto crcr = Set(df, 1.402f); + const auto cgcb = Set(df, -0.114f * 1.772f / 0.587f); + const auto cgcr = Set(df, -0.299f * 1.402f / 0.587f); + const auto cbcb = Set(df, 1.772f); + + float* JXL_RESTRICT row0 = GetInputRow(input_rows, 0, 0); + float* JXL_RESTRICT row1 = GetInputRow(input_rows, 1, 0); + float* JXL_RESTRICT row2 = GetInputRow(input_rows, 2, 0); + // TODO(eustas): when using frame origin, addresses might be unaligned; + // making them aligned will void performance penalty. + for (size_t x = 0; x < xsize; x += Lanes(df)) { + const auto y_vec = Add(LoadU(df, row1 + x), c128); + const auto cb_vec = LoadU(df, row0 + x); + const auto cr_vec = LoadU(df, row2 + x); + const auto r_vec = MulAdd(crcr, cr_vec, y_vec); + const auto g_vec = MulAdd(cgcr, cr_vec, MulAdd(cgcb, cb_vec, y_vec)); + const auto b_vec = MulAdd(cbcb, cb_vec, y_vec); + StoreU(r_vec, df, row0 + x); + StoreU(g_vec, df, row1 + x); + StoreU(b_vec, df, row2 + x); + } + } + + RenderPipelineChannelMode GetChannelMode(size_t c) const final { + return c < 3 ? RenderPipelineChannelMode::kInPlace + : RenderPipelineChannelMode::kIgnored; + } + + const char* GetName() const override { return "YCbCr"; } +}; + +std::unique_ptr<RenderPipelineStage> GetYCbCrStage() { + return jxl::make_unique<kYCbCrStage>(); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +HWY_EXPORT(GetYCbCrStage); + +std::unique_ptr<RenderPipelineStage> GetYCbCrStage() { + return HWY_DYNAMIC_DISPATCH(GetYCbCrStage)(); +} + +} // namespace jxl +#endif diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_ycbcr.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_ycbcr.h new file mode 100644 index 0000000000..3e99af7a38 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_ycbcr.h @@ -0,0 +1,24 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_YCBCR_H_ +#define LIB_JXL_RENDER_PIPELINE_STAGE_YCBCR_H_ +#include <math.h> +#include <stdint.h> + +#include <algorithm> +#include <utility> +#include <vector> + +#include "lib/jxl/dec_xyb.h" +#include "lib/jxl/render_pipeline/render_pipeline_stage.h" + +namespace jxl { + +// Converts the color channels from YCbCr to RGB. +std::unique_ptr<RenderPipelineStage> GetYCbCrStage(); +} // namespace jxl + +#endif // LIB_JXL_RENDER_PIPELINE_STAGE_YCBCR_H_ diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/test_render_pipeline_stages.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/test_render_pipeline_stages.h new file mode 100644 index 0000000000..789a52f8b2 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/test_render_pipeline_stages.h @@ -0,0 +1,101 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. 
+// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include <math.h> +#include <stdint.h> +#include <stdio.h> + +#include <algorithm> +#include <utility> +#include <vector> + +#include "lib/jxl/render_pipeline/render_pipeline_stage.h" + +namespace jxl { + +class UpsampleXSlowStage : public RenderPipelineStage { + public: + UpsampleXSlowStage() + : RenderPipelineStage(RenderPipelineStage::Settings::ShiftX(1, 1)) {} + + void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, + size_t xextra, size_t xsize, size_t xpos, size_t ypos, + size_t thread_id) const final { + for (size_t c = 0; c < input_rows.size(); c++) { + const float* row = GetInputRow(input_rows, c, 0); + float* row_out = GetOutputRow(output_rows, c, 0); + for (int64_t x = -xextra; x < (int64_t)(xsize + xextra); x++) { + float xp = *(row + x - 1); + float xc = *(row + x); + float xn = *(row + x + 1); + float xout0 = xp * 0.25f + xc * 0.75f; + float xout1 = xc * 0.75f + xn * 0.25f; + *(row_out + 2 * x + 0) = xout0; + *(row_out + 2 * x + 1) = xout1; + } + } + } + + const char* GetName() const override { return "TEST::UpsampleXSlowStage"; } + + RenderPipelineChannelMode GetChannelMode(size_t c) const final { + return RenderPipelineChannelMode::kInOut; + } +}; + +class UpsampleYSlowStage : public RenderPipelineStage { + public: + UpsampleYSlowStage() + : RenderPipelineStage(RenderPipelineStage::Settings::ShiftY(1, 1)) {} + + void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, + size_t xextra, size_t xsize, size_t xpos, size_t ypos, + size_t thread_id) const final { + for (size_t c = 0; c < input_rows.size(); c++) { + const float* rowp = GetInputRow(input_rows, c, -1); + const float* rowc = GetInputRow(input_rows, c, 0); + const float* rown = GetInputRow(input_rows, c, 1); + float* row_out0 = GetOutputRow(output_rows, c, 0); + float* row_out1 = GetOutputRow(output_rows, c, 1); + for (int64_t x = -xextra; x < (int64_t)(xsize + xextra); x++) { + float xp = *(rowp + x); + float xc = *(rowc + x); + float xn = *(rown + x); + float yout0 = xp * 0.25f + xc * 0.75f; + float yout1 = xc * 0.75f + xn * 0.25f; + *(row_out0 + x) = yout0; + *(row_out1 + x) = yout1; + } + } + } + + RenderPipelineChannelMode GetChannelMode(size_t c) const final { + return RenderPipelineChannelMode::kInOut; + } + + const char* GetName() const override { return "TEST::UpsampleYSlowStage"; } +}; + +class Check0FinalStage : public RenderPipelineStage { + public: + Check0FinalStage() : RenderPipelineStage(RenderPipelineStage::Settings()) {} + + void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, + size_t xextra, size_t xsize, size_t xpos, size_t ypos, + size_t thread_id) const final { + for (size_t c = 0; c < input_rows.size(); c++) { + for (size_t x = 0; x < xsize; x++) { + JXL_CHECK(fabsf(GetInputRow(input_rows, c, 0)[x]) < 1e-8); + } + } + } + + RenderPipelineChannelMode GetChannelMode(size_t c) const final { + return RenderPipelineChannelMode::kInput; + } + const char* GetName() const override { return "TEST::Check0FinalStage"; } +}; + +} // namespace jxl |