summaryrefslogtreecommitdiffstats
path: root/third_party/jpeg-xl/lib/jxl/convolve-inl.h
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 17:32:43 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 17:32:43 +0000
commit6bf0a5cb5034a7e684dcc3500e841785237ce2dd (patch)
treea68f146d7fa01f0134297619fbe7e33db084e0aa /third_party/jpeg-xl/lib/jxl/convolve-inl.h
parentInitial commit. (diff)
downloadthunderbird-6bf0a5cb5034a7e684dcc3500e841785237ce2dd.tar.xz
thunderbird-6bf0a5cb5034a7e684dcc3500e841785237ce2dd.zip
Adding upstream version 1:115.7.0.upstream/1%115.7.0upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/jpeg-xl/lib/jxl/convolve-inl.h')
-rw-r--r--third_party/jpeg-xl/lib/jxl/convolve-inl.h297
1 files changed, 297 insertions, 0 deletions
diff --git a/third_party/jpeg-xl/lib/jxl/convolve-inl.h b/third_party/jpeg-xl/lib/jxl/convolve-inl.h
new file mode 100644
index 0000000000..054c9c6f0d
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/convolve-inl.h
@@ -0,0 +1,297 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#if defined(LIB_JXL_CONVOLVE_INL_H_) == defined(HWY_TARGET_TOGGLE)
+#ifdef LIB_JXL_CONVOLVE_INL_H_
+#undef LIB_JXL_CONVOLVE_INL_H_
+#else
+#define LIB_JXL_CONVOLVE_INL_H_
+#endif
+
+#include <hwy/highway.h>
+
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/image_ops.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+namespace {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Broadcast;
+#if HWY_TARGET != HWY_SCALAR
+using hwy::HWY_NAMESPACE::CombineShiftRightBytes;
+#endif
+using hwy::HWY_NAMESPACE::TableLookupLanes;
+using hwy::HWY_NAMESPACE::Vec;
+
+// Synthesizes left/right neighbors from a vector of center pixels.
+class Neighbors {
+ public:
+ using D = HWY_CAPPED(float, 16);
+ using V = Vec<D>;
+
+ // Returns l[i] == c[Mirror(i - 1)].
+ HWY_INLINE HWY_MAYBE_UNUSED static V FirstL1(const V c) {
+#if HWY_CAP_GE256
+ const D d;
+ HWY_ALIGN constexpr int32_t lanes[16] = {0, 0, 1, 2, 3, 4, 5, 6,
+ 7, 8, 9, 10, 11, 12, 13, 14};
+ const auto indices = SetTableIndices(d, lanes);
+ // c = PONM'LKJI
+ return TableLookupLanes(c, indices); // ONML'KJII
+#elif HWY_TARGET == HWY_SCALAR
+ return c; // Same (the first mirrored value is the last valid one)
+#else // 128 bit
+ // c = LKJI
+#if HWY_TARGET <= (1 << HWY_HIGHEST_TARGET_BIT_X86)
+ return V{_mm_shuffle_ps(c.raw, c.raw, _MM_SHUFFLE(2, 1, 0, 0))}; // KJII
+#else
+ const D d;
+ // TODO(deymo): Figure out if this can be optimized using a single vsri
+ // instruction to convert LKJI to KJII.
+ HWY_ALIGN constexpr int lanes[4] = {0, 0, 1, 2}; // KJII
+ const auto indices = SetTableIndices(d, lanes);
+ return TableLookupLanes(c, indices);
+#endif
+#endif
+ }
+
+ // Returns l[i] == c[Mirror(i - 2)].
+ HWY_INLINE HWY_MAYBE_UNUSED static V FirstL2(const V c) {
+#if HWY_CAP_GE256
+ const D d;
+ HWY_ALIGN constexpr int32_t lanes[16] = {1, 0, 0, 1, 2, 3, 4, 5,
+ 6, 7, 8, 9, 10, 11, 12, 13};
+ const auto indices = SetTableIndices(d, lanes);
+ // c = PONM'LKJI
+ return TableLookupLanes(c, indices); // NMLK'JIIJ
+#elif HWY_TARGET == HWY_SCALAR
+ const D d;
+ JXL_ASSERT(false); // unsupported, avoid calling this.
+ return Zero(d);
+#else // 128 bit
+ // c = LKJI
+#if HWY_TARGET <= (1 << HWY_HIGHEST_TARGET_BIT_X86)
+ return V{_mm_shuffle_ps(c.raw, c.raw, _MM_SHUFFLE(1, 0, 0, 1))}; // JIIJ
+#else
+ const D d;
+ HWY_ALIGN constexpr int lanes[4] = {1, 0, 0, 1}; // JIIJ
+ const auto indices = SetTableIndices(d, lanes);
+ return TableLookupLanes(c, indices);
+#endif
+#endif
+ }
+
+ // Returns l[i] == c[Mirror(i - 3)].
+ HWY_INLINE HWY_MAYBE_UNUSED static V FirstL3(const V c) {
+#if HWY_CAP_GE256
+ const D d;
+ HWY_ALIGN constexpr int32_t lanes[16] = {2, 1, 0, 0, 1, 2, 3, 4,
+ 5, 6, 7, 8, 9, 10, 11, 12};
+ const auto indices = SetTableIndices(d, lanes);
+ // c = PONM'LKJI
+ return TableLookupLanes(c, indices); // MLKJ'IIJK
+#elif HWY_TARGET == HWY_SCALAR
+ const D d;
+ JXL_ASSERT(false); // unsupported, avoid calling this.
+ return Zero(d);
+#else // 128 bit
+ // c = LKJI
+#if HWY_TARGET <= (1 << HWY_HIGHEST_TARGET_BIT_X86)
+ return V{_mm_shuffle_ps(c.raw, c.raw, _MM_SHUFFLE(0, 0, 1, 2))}; // IIJK
+#else
+ const D d;
+ HWY_ALIGN constexpr int lanes[4] = {2, 1, 0, 0}; // IIJK
+ const auto indices = SetTableIndices(d, lanes);
+ return TableLookupLanes(c, indices);
+#endif
+#endif
+ }
+};
+
+#if HWY_TARGET != HWY_SCALAR
+
+// Returns indices for SetTableIndices such that TableLookupLanes on the
+// rightmost unaligned vector (rightmost sample in its most-significant lane)
+// returns the mirrored values, with the mirror outside the last valid sample.
+static inline const int32_t* MirrorLanes(const size_t mod) {
+ const HWY_CAPPED(float, 16) d;
+ constexpr size_t kN = MaxLanes(d);
+
+ // For mod = `image width mod 16` 0..15:
+ // last full vec mirrored (mem order) loadedVec mirrorVec idxVec
+ // 0123456789abcdef| fedcba9876543210 fed..210 012..def 012..def
+ // 0123456789abcdef|0 0fedcba98765432 0fe..321 234..f00 123..eff
+ // 0123456789abcdef|01 10fedcba987654 10f..432 456..110 234..ffe
+ // 0123456789abcdef|012 210fedcba9876 210..543 67..2210 34..ffed
+ // 0123456789abcdef|0123 3210fedcba98 321..654 8..33210 4..ffedc
+ // 0123456789abcdef|01234 43210fedcba
+ // 0123456789abcdef|012345 543210fedc
+ // 0123456789abcdef|0123456 6543210fe
+ // 0123456789abcdef|01234567 76543210
+ // 0123456789abcdef|012345678 8765432
+ // 0123456789abcdef|0123456789 987654
+ // 0123456789abcdef|0123456789A A9876
+ // 0123456789abcdef|0123456789AB BA98
+ // 0123456789abcdef|0123456789ABC CBA
+ // 0123456789abcdef|0123456789ABCD DC
+ // 0123456789abcdef|0123456789ABCDE E EDC..10f EED..210 ffe..321
+#if HWY_CAP_GE512
+ HWY_ALIGN static constexpr int32_t idx_lanes[2 * kN - 1] = {
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, //
+ 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
+#elif HWY_CAP_GE256
+ HWY_ALIGN static constexpr int32_t idx_lanes[2 * kN - 1] = {
+ 1, 2, 3, 4, 5, 6, 7, 7, //
+ 6, 5, 4, 3, 2, 1, 0};
+#else // 128-bit
+ HWY_ALIGN static constexpr int32_t idx_lanes[2 * kN - 1] = {1, 2, 3, 3, //
+ 2, 1, 0};
+#endif
+ return idx_lanes + kN - 1 - mod;
+}
+
+#endif // HWY_TARGET != HWY_SCALAR
+
+// Single entry point for convolution.
+// "Strategy" (Direct*/Separable*) decides kernel size and how to evaluate it.
+template <class Strategy>
+class ConvolveT {
+ static constexpr int64_t kRadius = Strategy::kRadius;
+ using Simd = HWY_CAPPED(float, 16);
+
+ public:
+ static size_t MinWidth() {
+#if HWY_TARGET == HWY_SCALAR
+ // First/Last use mirrored loads of up to +/- kRadius.
+ return 2 * kRadius;
+#else
+ return Lanes(Simd()) + kRadius;
+#endif
+ }
+
+ // "Image" is ImageF or Image3F.
+ template <class Image, class Weights>
+ static void Run(const Image& in, const Rect& rect, const Weights& weights,
+ ThreadPool* pool, Image* out) {
+ PROFILER_ZONE("ConvolveT::Run");
+ JXL_CHECK(SameSize(rect, *out));
+ JXL_CHECK(rect.xsize() >= MinWidth());
+
+ static_assert(int64_t(kRadius) <= 3,
+ "Must handle [0, kRadius) and >= kRadius");
+ switch (rect.xsize() % Lanes(Simd())) {
+ case 0:
+ return RunRows<0>(in, rect, weights, pool, out);
+ case 1:
+ return RunRows<1>(in, rect, weights, pool, out);
+ case 2:
+ return RunRows<2>(in, rect, weights, pool, out);
+ default:
+ return RunRows<3>(in, rect, weights, pool, out);
+ }
+ }
+
+ private:
+ template <size_t kSizeModN, class WrapRow, class Weights>
+ static JXL_INLINE void RunRow(const float* JXL_RESTRICT in,
+ const size_t xsize, const int64_t stride,
+ const WrapRow& wrap_row, const Weights& weights,
+ float* JXL_RESTRICT out) {
+ Strategy::template ConvolveRow<kSizeModN>(in, xsize, stride, wrap_row,
+ weights, out);
+ }
+
+ template <size_t kSizeModN, class Weights>
+ static JXL_INLINE void RunBorderRows(const ImageF& in, const Rect& rect,
+ const int64_t ybegin, const int64_t yend,
+ const Weights& weights, ImageF* out) {
+ const int64_t stride = in.PixelsPerRow();
+ const WrapRowMirror wrap_row(in, rect.ysize());
+ for (int64_t y = ybegin; y < yend; ++y) {
+ RunRow<kSizeModN>(rect.ConstRow(in, y), rect.xsize(), stride, wrap_row,
+ weights, out->Row(y));
+ }
+ }
+
+ // Image3F.
+ template <size_t kSizeModN, class Weights>
+ static JXL_INLINE void RunBorderRows(const Image3F& in, const Rect& rect,
+ const int64_t ybegin, const int64_t yend,
+ const Weights& weights, Image3F* out) {
+ const int64_t stride = in.PixelsPerRow();
+ for (int64_t y = ybegin; y < yend; ++y) {
+ for (size_t c = 0; c < 3; ++c) {
+ const WrapRowMirror wrap_row(in.Plane(c), rect.ysize());
+ RunRow<kSizeModN>(rect.ConstPlaneRow(in, c, y), rect.xsize(), stride,
+ wrap_row, weights, out->PlaneRow(c, y));
+ }
+ }
+ }
+
+ template <size_t kSizeModN, class Weights>
+ static JXL_INLINE void RunInteriorRows(const ImageF& in, const Rect& rect,
+ const int64_t ybegin,
+ const int64_t yend,
+ const Weights& weights,
+ ThreadPool* pool, ImageF* out) {
+ const int64_t stride = in.PixelsPerRow();
+ JXL_CHECK(RunOnPool(
+ pool, ybegin, yend, ThreadPool::NoInit,
+ [&](const uint32_t y, size_t /*thread*/) HWY_ATTR {
+ RunRow<kSizeModN>(rect.ConstRow(in, y), rect.xsize(), stride,
+ WrapRowUnchanged(), weights, out->Row(y));
+ },
+ "Convolve"));
+ }
+
+ // Image3F.
+ template <size_t kSizeModN, class Weights>
+ static JXL_INLINE void RunInteriorRows(const Image3F& in, const Rect& rect,
+ const int64_t ybegin,
+ const int64_t yend,
+ const Weights& weights,
+ ThreadPool* pool, Image3F* out) {
+ const int64_t stride = in.PixelsPerRow();
+ JXL_CHECK(RunOnPool(
+ pool, ybegin, yend, ThreadPool::NoInit,
+ [&](const uint32_t y, size_t /*thread*/) HWY_ATTR {
+ for (size_t c = 0; c < 3; ++c) {
+ RunRow<kSizeModN>(rect.ConstPlaneRow(in, c, y), rect.xsize(),
+ stride, WrapRowUnchanged(), weights,
+ out->PlaneRow(c, y));
+ }
+ },
+ "Convolve3"));
+ }
+
+ template <size_t kSizeModN, class Image, class Weights>
+ static JXL_INLINE void RunRows(const Image& in, const Rect& rect,
+ const Weights& weights, ThreadPool* pool,
+ Image* out) {
+ const int64_t ysize = rect.ysize();
+ RunBorderRows<kSizeModN>(in, rect, 0, std::min(int64_t(kRadius), ysize),
+ weights, out);
+ if (ysize > 2 * int64_t(kRadius)) {
+ RunInteriorRows<kSizeModN>(in, rect, int64_t(kRadius),
+ ysize - int64_t(kRadius), weights, pool, out);
+ }
+ if (ysize > int64_t(kRadius)) {
+ RunBorderRows<kSizeModN>(in, rect, ysize - int64_t(kRadius), ysize,
+ weights, out);
+ }
+ }
+};
+
+} // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#endif // LIB_JXL_CONVOLVE_INL_H_