1 files changed, 297 insertions, 0 deletions
diff --git a/third_party/jpeg-xl/lib/jxl/convolve-inl.h b/third_party/jpeg-xl/lib/jxl/convolve-inl.h
new file mode 100644
index 0000000000..054c9c6f0d
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/convolve-inl.h
@@ -0,0 +1,297 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#if defined(LIB_JXL_CONVOLVE_INL_H_) == defined(HWY_TARGET_TOGGLE)
+#ifdef LIB_JXL_CONVOLVE_INL_H_
+#undef LIB_JXL_CONVOLVE_INL_H_
+#else
+#define LIB_JXL_CONVOLVE_INL_H_
+#endif
+
+#include <hwy/highway.h>
+
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/image_ops.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+namespace {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Broadcast;
+#if HWY_TARGET != HWY_SCALAR
+using hwy::HWY_NAMESPACE::CombineShiftRightBytes;
+#endif
+using hwy::HWY_NAMESPACE::TableLookupLanes;
+using hwy::HWY_NAMESPACE::Vec;
+
+// Synthesizes left/right neighbors from a vector of center pixels.
+class Neighbors {
+ public:
+  using D = HWY_CAPPED(float, 16);
+  using V = Vec<D>;
+
+  // Returns l[i] == c[Mirror(i - 1)].
+  HWY_INLINE HWY_MAYBE_UNUSED static V FirstL1(const V c) {
+#if HWY_CAP_GE256
+    const D d;
+    HWY_ALIGN constexpr int32_t lanes[16] = {0, 0, 1, 2,  3,  4,  5,  6,
+                                             7, 8, 9, 10, 11, 12, 13, 14};
+    const auto indices = SetTableIndices(d, lanes);
+    // c = PONM'LKJI
+    return TableLookupLanes(c, indices);  // ONML'KJII
+#elif HWY_TARGET == HWY_SCALAR
+    return c;  // Same (the first mirrored value is the last valid one)
+#else  // 128 bit
+    // c = LKJI
+#if HWY_TARGET <= (1 << HWY_HIGHEST_TARGET_BIT_X86)
+    return V{_mm_shuffle_ps(c.raw, c.raw, _MM_SHUFFLE(2, 1, 0, 0))};  // KJII
+#else
+    const D d;
+    // TODO(deymo): Figure out if this can be optimized using a single vsri
+    // instruction to convert LKJI to KJII.
+    HWY_ALIGN constexpr int lanes[4] = {0, 0, 1, 2};  // KJII
+    const auto indices = SetTableIndices(d, lanes);
+    return TableLookupLanes(c, indices);
+#endif
+#endif
+  }
+
+  // Returns l[i] == c[Mirror(i - 2)].
+  HWY_INLINE HWY_MAYBE_UNUSED static V FirstL2(const V c) {
+#if HWY_CAP_GE256
+    const D d;
+    HWY_ALIGN constexpr int32_t lanes[16] = {1, 0, 0, 1, 2,  3,  4,  5,
+                                             6, 7, 8, 9, 10, 11, 12, 13};
+    const auto indices = SetTableIndices(d, lanes);
+    // c = PONM'LKJI
+    return TableLookupLanes(c, indices);  // NMLK'JIIJ
+#elif HWY_TARGET == HWY_SCALAR
+    const D d;
+    JXL_ASSERT(false);  // unsupported, avoid calling this.
+    return Zero(d);
+#else  // 128 bit
+    // c = LKJI
+#if HWY_TARGET <= (1 << HWY_HIGHEST_TARGET_BIT_X86)
+    return V{_mm_shuffle_ps(c.raw, c.raw, _MM_SHUFFLE(1, 0, 0, 1))};  // JIIJ
+#else
+    const D d;
+    HWY_ALIGN constexpr int lanes[4] = {1, 0, 0, 1};  // JIIJ
+    const auto indices = SetTableIndices(d, lanes);
+    return TableLookupLanes(c, indices);
+#endif
+#endif
+  }
+
+  // Returns l[i] == c[Mirror(i - 3)].
+  HWY_INLINE HWY_MAYBE_UNUSED static V FirstL3(const V c) {
+#if HWY_CAP_GE256
+    const D d;
+    HWY_ALIGN constexpr int32_t lanes[16] = {2, 1, 0, 0, 1, 2,  3,  4,
+                                             5, 6, 7, 8, 9, 10, 11, 12};
+    const auto indices = SetTableIndices(d, lanes);
+    // c = PONM'LKJI
+    return TableLookupLanes(c, indices);  // MLKJ'IIJK
+#elif HWY_TARGET == HWY_SCALAR
+    const D d;
+    JXL_ASSERT(false);  // unsupported, avoid calling this.
+    return Zero(d);
+#else  // 128 bit
+    // c = LKJI
+#if HWY_TARGET <= (1 << HWY_HIGHEST_TARGET_BIT_X86)
+    return V{_mm_shuffle_ps(c.raw, c.raw, _MM_SHUFFLE(0, 0, 1, 2))};  // IIJK
+#else
+    const D d;
+    HWY_ALIGN constexpr int lanes[4] = {2, 1, 0, 0};  // IIJK
+    const auto indices = SetTableIndices(d, lanes);
+    return TableLookupLanes(c, indices);
+#endif
+#endif
+  }
+};
+
+#if HWY_TARGET != HWY_SCALAR
+
+// Returns indices for SetTableIndices such that TableLookupLanes on the
+// rightmost unaligned vector (rightmost sample in its most-significant lane)
+// returns the mirrored values, with the mirror outside the last valid sample.
+static inline const int32_t* MirrorLanes(const size_t mod) {
+  const HWY_CAPPED(float, 16) d;
+  constexpr size_t kN = MaxLanes(d);
+
+  // For mod = `image width mod 16` 0..15:
+  // last full vec     mirrored (mem order)  loadedVec  mirrorVec  idxVec
+  // 0123456789abcdef| fedcba9876543210      fed..210   012..def   012..def
+  // 0123456789abcdef|0 0fedcba98765432      0fe..321   234..f00   123..eff
+  // 0123456789abcdef|01 10fedcba987654      10f..432   456..110   234..ffe
+  // 0123456789abcdef|012 210fedcba9876      210..543   67..2210   34..ffed
+  // 0123456789abcdef|0123 3210fedcba98      321..654   8..33210   4..ffedc
+  // 0123456789abcdef|01234 43210fedcba
+  // 0123456789abcdef|012345 543210fedc
+  // 0123456789abcdef|0123456 6543210fe
+  // 0123456789abcdef|01234567 76543210
+  // 0123456789abcdef|012345678 8765432
+  // 0123456789abcdef|0123456789 987654
+  // 0123456789abcdef|0123456789A A9876
+  // 0123456789abcdef|0123456789AB BA98
+  // 0123456789abcdef|0123456789ABC CBA
+  // 0123456789abcdef|0123456789ABCD DC
+  // 0123456789abcdef|0123456789ABCDE E      EDC..10f   EED..210   ffe..321
+#if HWY_CAP_GE512
+  HWY_ALIGN static constexpr int32_t idx_lanes[2 * kN - 1] = {
+      1,  2,  3,  4,  5,  6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15,  //
+      14, 13, 12, 11, 10, 9, 8, 7, 6, 5,  4,  3,  2,  1,  0};
+#elif HWY_CAP_GE256
+  HWY_ALIGN static constexpr int32_t idx_lanes[2 * kN - 1] = {
+      1, 2, 3, 4, 5, 6, 7, 7,  //
+      6, 5, 4, 3, 2, 1, 0};
+#else  // 128-bit
+  HWY_ALIGN static constexpr int32_t idx_lanes[2 * kN - 1] = {1, 2, 3, 3,  //
+                                                              2, 1, 0};
+#endif
+  return idx_lanes + kN - 1 - mod;
+}
+
+#endif  // HWY_TARGET != HWY_SCALAR
+
+// Single entry point for convolution.
+// "Strategy" (Direct*/Separable*) decides kernel size and how to evaluate it.
+template <class Strategy>
+class ConvolveT {
+  static constexpr int64_t kRadius = Strategy::kRadius;
+  using Simd = HWY_CAPPED(float, 16);
+
+ public:
+  static size_t MinWidth() {
+#if HWY_TARGET == HWY_SCALAR
+    // First/Last use mirrored loads of up to +/- kRadius.
+    return 2 * kRadius;
+#else
+    return Lanes(Simd()) + kRadius;
+#endif
+  }
+
+  // "Image" is ImageF or Image3F.
+  template <class Image, class Weights>
+  static void Run(const Image& in, const Rect& rect, const Weights& weights,
+                  ThreadPool* pool, Image* out) {
+    PROFILER_ZONE("ConvolveT::Run");
+    JXL_CHECK(SameSize(rect, *out));
+    JXL_CHECK(rect.xsize() >= MinWidth());
+
+    static_assert(int64_t(kRadius) <= 3,
+                  "Must handle [0, kRadius) and >= kRadius");
+    switch (rect.xsize() % Lanes(Simd())) {
+      case 0:
+        return RunRows<0>(in, rect, weights, pool, out);
+      case 1:
+        return RunRows<1>(in, rect, weights, pool, out);
+      case 2:
+        return RunRows<2>(in, rect, weights, pool, out);
+      default:
+        return RunRows<3>(in, rect, weights, pool, out);
+    }
+  }
+
+ private:
+  template <size_t kSizeModN, class WrapRow, class Weights>
+  static JXL_INLINE void RunRow(const float* JXL_RESTRICT in,
+                                const size_t xsize, const int64_t stride,
+                                const WrapRow& wrap_row, const Weights& weights,
+                                float* JXL_RESTRICT out) {
+    Strategy::template ConvolveRow<kSizeModN>(in, xsize, stride, wrap_row,
+                                              weights, out);
+  }
+
+  template <size_t kSizeModN, class Weights>
+  static JXL_INLINE void RunBorderRows(const ImageF& in, const Rect& rect,
+                                       const int64_t ybegin, const int64_t yend,
+                                       const Weights& weights, ImageF* out) {
+    const int64_t stride = in.PixelsPerRow();
+    const WrapRowMirror wrap_row(in, rect.ysize());
+    for (int64_t y = ybegin; y < yend; ++y) {
+      RunRow<kSizeModN>(rect.ConstRow(in, y), rect.xsize(), stride, wrap_row,
+                        weights, out->Row(y));
+    }
+  }
+
+  // Image3F.
+  template <size_t kSizeModN, class Weights>
+  static JXL_INLINE void RunBorderRows(const Image3F& in, const Rect& rect,
+                                       const int64_t ybegin, const int64_t yend,
+                                       const Weights& weights, Image3F* out) {
+    const int64_t stride = in.PixelsPerRow();
+    for (int64_t y = ybegin; y < yend; ++y) {
+      for (size_t c = 0; c < 3; ++c) {
+        const WrapRowMirror wrap_row(in.Plane(c), rect.ysize());
+        RunRow<kSizeModN>(rect.ConstPlaneRow(in, c, y), rect.xsize(), stride,
+                          wrap_row, weights, out->PlaneRow(c, y));
+      }
+    }
+  }
+
+  template <size_t kSizeModN, class Weights>
+  static JXL_INLINE void RunInteriorRows(const ImageF& in, const Rect& rect,
+                                         const int64_t ybegin,
+                                         const int64_t yend,
+                                         const Weights& weights,
+                                         ThreadPool* pool, ImageF* out) {
+    const int64_t stride = in.PixelsPerRow();
+    JXL_CHECK(RunOnPool(
+        pool, ybegin, yend, ThreadPool::NoInit,
+        [&](const uint32_t y, size_t /*thread*/) HWY_ATTR {
+          RunRow<kSizeModN>(rect.ConstRow(in, y), rect.xsize(), stride,
+                            WrapRowUnchanged(), weights, out->Row(y));
+        },
+        "Convolve"));
+  }
+
+  // Image3F.
+  template <size_t kSizeModN, class Weights>
+  static JXL_INLINE void RunInteriorRows(const Image3F& in, const Rect& rect,
+                                         const int64_t ybegin,
+                                         const int64_t yend,
+                                         const Weights& weights,
+                                         ThreadPool* pool, Image3F* out) {
+    const int64_t stride = in.PixelsPerRow();
+    JXL_CHECK(RunOnPool(
+        pool, ybegin, yend, ThreadPool::NoInit,
+        [&](const uint32_t y, size_t /*thread*/) HWY_ATTR {
+          for (size_t c = 0; c < 3; ++c) {
+            RunRow<kSizeModN>(rect.ConstPlaneRow(in, c, y), rect.xsize(),
+                              stride, WrapRowUnchanged(), weights,
+                              out->PlaneRow(c, y));
+          }
+        },
+        "Convolve3"));
+  }
+
+  template <size_t kSizeModN, class Image, class Weights>
+  static JXL_INLINE void RunRows(const Image& in, const Rect& rect,
+                                 const Weights& weights, ThreadPool* pool,
+                                 Image* out) {
+    const int64_t ysize = rect.ysize();
+    RunBorderRows<kSizeModN>(in, rect, 0, std::min(int64_t(kRadius), ysize),
+                             weights, out);
+    if (ysize > 2 * int64_t(kRadius)) {
+      RunInteriorRows<kSizeModN>(in, rect, int64_t(kRadius),
+                                 ysize - int64_t(kRadius), weights, pool, out);
+    }
+    if (ysize > int64_t(kRadius)) {
+      RunBorderRows<kSizeModN>(in, rect, ysize - int64_t(kRadius), ysize,
+                               weights, out);
+    }
+  }
+};
+
+}  // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#endif  // LIB_JXL_CONVOLVE_INL_H_