diff options
Diffstat (limited to '')
-rw-r--r-- | third_party/jpeg-xl/lib/jpegli/downsample.cc | 356 |
1 files changed, 356 insertions, 0 deletions
diff --git a/third_party/jpeg-xl/lib/jpegli/downsample.cc b/third_party/jpeg-xl/lib/jpegli/downsample.cc new file mode 100644 index 0000000000..df2c156972 --- /dev/null +++ b/third_party/jpeg-xl/lib/jpegli/downsample.cc @@ -0,0 +1,356 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jpegli/downsample.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jpegli/downsample.cc" +#include <hwy/foreach_target.h> +#include <hwy/highway.h> + +#include "lib/jpegli/encode_internal.h" +#include "lib/jpegli/error.h" + +HWY_BEFORE_NAMESPACE(); +namespace jpegli { +namespace HWY_NAMESPACE { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Add; +using hwy::HWY_NAMESPACE::Mul; +using hwy::HWY_NAMESPACE::Vec; + +using D = HWY_CAPPED(float, 8); +constexpr D d; + +void DownsampleRow2x1(const float* row_in, size_t len, float* row_out) { + const size_t N = Lanes(d); + const size_t len_out = len / 2; + const auto mul = Set(d, 0.5f); + Vec<D> v0, v1; + for (size_t x = 0; x < len_out; x += N) { + LoadInterleaved2(d, row_in + 2 * x, v0, v1); + Store(Mul(mul, Add(v0, v1)), d, row_out + x); + } +} + +void DownsampleRow3x1(const float* row_in, size_t len, float* row_out) { + const size_t N = Lanes(d); + const size_t len_out = len / 3; + const auto mul = Set(d, 1.0f / 3); + Vec<D> v0, v1, v2; + for (size_t x = 0; x < len_out; x += N) { + LoadInterleaved3(d, row_in + 3 * x, v0, v1, v2); + Store(Mul(mul, Add(Add(v0, v1), v2)), d, row_out + x); + } +} + +void DownsampleRow4x1(const float* row_in, size_t len, float* row_out) { + const size_t N = Lanes(d); + const size_t len_out = len / 4; + const auto mul = Set(d, 0.25f); + Vec<D> v0, v1, v2, v3; + for (size_t x = 0; x < len_out; x += N) { + LoadInterleaved4(d, row_in + 4 * x, v0, v1, v2, v3); + Store(Mul(mul, Add(Add(v0, v1), Add(v2, v3))), d, row_out + x); + } +} + +void Downsample2x1(float* rows_in[MAX_SAMP_FACTOR], size_t len, + float* row_out) { + DownsampleRow2x1(rows_in[0], len, row_out); +} + +void Downsample3x1(float* rows_in[MAX_SAMP_FACTOR], size_t len, + float* row_out) { + DownsampleRow3x1(rows_in[0], len, row_out); +} + +void Downsample4x1(float* rows_in[MAX_SAMP_FACTOR], size_t len, + float* row_out) { + DownsampleRow4x1(rows_in[0], len, row_out); +} + +void Downsample1x2(float* rows_in[MAX_SAMP_FACTOR], size_t len, + float* row_out) { + const size_t N = Lanes(d); + const auto mul = Set(d, 0.5f); + float* row0 = rows_in[0]; + float* row1 = rows_in[1]; + for (size_t x = 0; x < len; x += N) { + Store(Mul(mul, Add(Load(d, row0 + x), Load(d, row1 + x))), d, row_out + x); + } +} + +void Downsample2x2(float* rows_in[MAX_SAMP_FACTOR], size_t len, + float* row_out) { + const size_t N = Lanes(d); + const size_t len_out = len / 2; + const auto mul = Set(d, 0.25f); + float* row0 = rows_in[0]; + float* row1 = rows_in[1]; + Vec<D> v0, v1, v2, v3; + for (size_t x = 0; x < len_out; x += N) { + LoadInterleaved2(d, row0 + 2 * x, v0, v1); + LoadInterleaved2(d, row1 + 2 * x, v2, v3); + Store(Mul(mul, Add(Add(v0, v1), Add(v2, v3))), d, row_out + x); + } +} + +void Downsample3x2(float* rows_in[MAX_SAMP_FACTOR], size_t len, + float* row_out) { + DownsampleRow3x1(rows_in[0], len, rows_in[0]); + DownsampleRow3x1(rows_in[1], len, rows_in[1]); + Downsample1x2(rows_in, len / 3, row_out); +} + +void Downsample4x2(float* rows_in[MAX_SAMP_FACTOR], size_t len, + float* row_out) { + DownsampleRow4x1(rows_in[0], len, rows_in[0]); + DownsampleRow4x1(rows_in[1], len, rows_in[1]); + Downsample1x2(rows_in, len / 4, row_out); +} + +void Downsample1x3(float* rows_in[MAX_SAMP_FACTOR], size_t len, + float* row_out) { + const size_t N = Lanes(d); + const auto mul = Set(d, 1.0f / 3); + float* row0 = rows_in[0]; + float* row1 = rows_in[1]; + float* row2 = rows_in[2]; + for (size_t x = 0; x < len; x += N) { + const auto in0 = Load(d, row0 + x); + const auto in1 = Load(d, row1 + x); + const auto in2 = Load(d, row2 + x); + Store(Mul(mul, Add(Add(in0, in1), in2)), d, row_out + x); + } +} + +void Downsample2x3(float* rows_in[MAX_SAMP_FACTOR], size_t len, + float* row_out) { + DownsampleRow2x1(rows_in[0], len, rows_in[0]); + DownsampleRow2x1(rows_in[1], len, rows_in[1]); + DownsampleRow2x1(rows_in[2], len, rows_in[2]); + Downsample1x3(rows_in, len / 2, row_out); +} + +void Downsample3x3(float* rows_in[MAX_SAMP_FACTOR], size_t len, + float* row_out) { + DownsampleRow3x1(rows_in[0], len, rows_in[0]); + DownsampleRow3x1(rows_in[1], len, rows_in[1]); + DownsampleRow3x1(rows_in[2], len, rows_in[2]); + Downsample1x3(rows_in, len / 3, row_out); +} + +void Downsample4x3(float* rows_in[MAX_SAMP_FACTOR], size_t len, + float* row_out) { + DownsampleRow4x1(rows_in[0], len, rows_in[0]); + DownsampleRow4x1(rows_in[1], len, rows_in[1]); + DownsampleRow4x1(rows_in[2], len, rows_in[2]); + Downsample1x3(rows_in, len / 4, row_out); +} + +void Downsample1x4(float* rows_in[MAX_SAMP_FACTOR], size_t len, + float* row_out) { + const size_t N = Lanes(d); + const auto mul = Set(d, 0.25f); + float* row0 = rows_in[0]; + float* row1 = rows_in[1]; + float* row2 = rows_in[2]; + float* row3 = rows_in[3]; + for (size_t x = 0; x < len; x += N) { + const auto in0 = Load(d, row0 + x); + const auto in1 = Load(d, row1 + x); + const auto in2 = Load(d, row2 + x); + const auto in3 = Load(d, row3 + x); + Store(Mul(mul, Add(Add(in0, in1), Add(in2, in3))), d, row_out + x); + } +} + +void Downsample2x4(float* rows_in[MAX_SAMP_FACTOR], size_t len, + float* row_out) { + DownsampleRow2x1(rows_in[0], len, rows_in[0]); + DownsampleRow2x1(rows_in[1], len, rows_in[1]); + DownsampleRow2x1(rows_in[2], len, rows_in[2]); + DownsampleRow2x1(rows_in[3], len, rows_in[3]); + Downsample1x4(rows_in, len / 2, row_out); +} + +void Downsample3x4(float* rows_in[MAX_SAMP_FACTOR], size_t len, + float* row_out) { + DownsampleRow3x1(rows_in[0], len, rows_in[0]); + DownsampleRow3x1(rows_in[1], len, rows_in[1]); + DownsampleRow3x1(rows_in[2], len, rows_in[2]); + DownsampleRow3x1(rows_in[3], len, rows_in[3]); + Downsample1x4(rows_in, len / 3, row_out); +} + +void Downsample4x4(float* rows_in[MAX_SAMP_FACTOR], size_t len, + float* row_out) { + DownsampleRow4x1(rows_in[0], len, rows_in[0]); + DownsampleRow4x1(rows_in[1], len, rows_in[1]); + DownsampleRow4x1(rows_in[2], len, rows_in[2]); + DownsampleRow4x1(rows_in[3], len, rows_in[3]); + Downsample1x4(rows_in, len / 4, row_out); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jpegli +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jpegli { + +HWY_EXPORT(Downsample1x2); +HWY_EXPORT(Downsample1x3); +HWY_EXPORT(Downsample1x4); +HWY_EXPORT(Downsample2x1); +HWY_EXPORT(Downsample2x2); +HWY_EXPORT(Downsample2x3); +HWY_EXPORT(Downsample2x4); +HWY_EXPORT(Downsample3x1); +HWY_EXPORT(Downsample3x2); +HWY_EXPORT(Downsample3x3); +HWY_EXPORT(Downsample3x4); +HWY_EXPORT(Downsample4x1); +HWY_EXPORT(Downsample4x2); +HWY_EXPORT(Downsample4x3); +HWY_EXPORT(Downsample4x4); + +void NullDownsample(float* rows_in[MAX_SAMP_FACTOR], size_t len, + float* row_out) {} + +void ChooseDownsampleMethods(j_compress_ptr cinfo) { + jpeg_comp_master* m = cinfo->master; + for (int c = 0; c < cinfo->num_components; c++) { + m->downsample_method[c] = nullptr; + jpeg_component_info* comp = &cinfo->comp_info[c]; + const int h_factor = cinfo->max_h_samp_factor / comp->h_samp_factor; + const int v_factor = cinfo->max_v_samp_factor / comp->v_samp_factor; + if (v_factor == 1) { + if (h_factor == 1) { + m->downsample_method[c] = NullDownsample; + } else if (h_factor == 2) { + m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample2x1); + } else if (h_factor == 3) { + m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample3x1); + } else if (h_factor == 4) { + m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample4x1); + } + } else if (v_factor == 2) { + if (h_factor == 1) { + m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample1x2); + } else if (h_factor == 2) { + m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample2x2); + } else if (h_factor == 3) { + m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample3x2); + } else if (h_factor == 4) { + m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample4x2); + } + } else if (v_factor == 3) { + if (h_factor == 1) { + m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample1x2); + } else if (h_factor == 2) { + m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample2x2); + } else if (h_factor == 3) { + m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample3x2); + } else if (h_factor == 4) { + m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample4x2); + } + } else if (v_factor == 4) { + if (h_factor == 1) { + m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample1x4); + } else if (h_factor == 2) { + m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample2x4); + } else if (h_factor == 3) { + m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample3x4); + } else if (h_factor == 4) { + m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample4x4); + } + } + if (m->downsample_method[c] == nullptr) { + JPEGLI_ERROR("Unsupported downsampling ratio %dx%d", h_factor, v_factor); + } + } +} + +void DownsampleInputBuffer(j_compress_ptr cinfo) { + if (cinfo->max_h_samp_factor == 1 && cinfo->max_v_samp_factor == 1) { + return; + } + jpeg_comp_master* m = cinfo->master; + const size_t iMCU_height = DCTSIZE * cinfo->max_v_samp_factor; + const size_t y0 = m->next_iMCU_row * iMCU_height; + const size_t y1 = y0 + iMCU_height; + const size_t xsize_padded = m->xsize_blocks * DCTSIZE; + for (int c = 0; c < cinfo->num_components; c++) { + jpeg_component_info* comp = &cinfo->comp_info[c]; + const int h_factor = cinfo->max_h_samp_factor / comp->h_samp_factor; + const int v_factor = cinfo->max_v_samp_factor / comp->v_samp_factor; + if (h_factor == 1 && v_factor == 1) { + continue; + } + auto& input = *m->smooth_input[c]; + auto& output = *m->raw_data[c]; + const size_t yout0 = y0 / v_factor; + float* rows_in[MAX_SAMP_FACTOR]; + for (size_t yin = y0, yout = yout0; yin < y1; yin += v_factor, ++yout) { + for (int iy = 0; iy < v_factor; ++iy) { + rows_in[iy] = input.Row(yin + iy); + } + float* row_out = output.Row(yout); + (*m->downsample_method[c])(rows_in, xsize_padded, row_out); + } + } +} + +void ApplyInputSmoothing(j_compress_ptr cinfo) { + if (!cinfo->smoothing_factor) { + return; + } + jpeg_comp_master* m = cinfo->master; + const float kW1 = cinfo->smoothing_factor / 1024.0; + const float kW0 = 1.0f - 8.0f * kW1; + const size_t iMCU_height = DCTSIZE * cinfo->max_v_samp_factor; + const ssize_t y0 = m->next_iMCU_row * iMCU_height; + const ssize_t y1 = y0 + iMCU_height; + const ssize_t xsize_padded = m->xsize_blocks * DCTSIZE; + for (int c = 0; c < cinfo->num_components; c++) { + auto& input = m->input_buffer[c]; + auto& output = *m->smooth_input[c]; + if (m->next_iMCU_row == 0) { + input.CopyRow(-1, 0, 1); + } + if (m->next_iMCU_row + 1 == cinfo->total_iMCU_rows) { + size_t last_row = m->ysize_blocks * DCTSIZE - 1; + input.CopyRow(last_row + 1, last_row, 1); + } + // TODO(szabadka) SIMDify this. + for (ssize_t y = y0; y < y1; ++y) { + const float* row_t = input.Row(y - 1); + const float* row_m = input.Row(y); + const float* row_b = input.Row(y + 1); + float* row_out = output.Row(y); + for (ssize_t x = 0; x < xsize_padded; ++x) { + float val_tl = row_t[x - 1]; + float val_tm = row_t[x]; + float val_tr = row_t[x + 1]; + float val_ml = row_m[x - 1]; + float val_mm = row_m[x]; + float val_mr = row_m[x + 1]; + float val_bl = row_b[x - 1]; + float val_bm = row_b[x]; + float val_br = row_b[x + 1]; + float val1 = (val_tl + val_tm + val_tr + val_ml + val_mr + val_bl + + val_bm + val_br); + row_out[x] = val_mm * kW0 + val1 * kW1; + } + } + } +} + +} // namespace jpegli +#endif // HWY_ONCE |