author    Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 19:33:14 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 19:33:14 +0000
commit    36d22d82aa202bb199967e9512281e9a53db42c9 (patch)
tree      105e8c98ddea1c1e4784a60a5a6410fa416be2de /gfx/2d/SwizzleNEON.cpp
parent    Initial commit. (diff)
Adding upstream version 115.7.0esr.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'gfx/2d/SwizzleNEON.cpp')
-rw-r--r--  gfx/2d/SwizzleNEON.cpp | 451
1 file changed, 451 insertions(+), 0 deletions(-)
diff --git a/gfx/2d/SwizzleNEON.cpp b/gfx/2d/SwizzleNEON.cpp
new file mode 100644
index 0000000000..887e93d632
--- /dev/null
+++ b/gfx/2d/SwizzleNEON.cpp
@@ -0,0 +1,451 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim: set ts=8 sts=2 et sw=2 tw=80: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "Swizzle.h"
+
+#include <arm_neon.h>
+
+namespace mozilla {
+namespace gfx {
+
+// Load 1-3 pixels into a 4 pixel vector.
+static MOZ_ALWAYS_INLINE uint16x8_t LoadRemainder_NEON(const uint8_t* aSrc,
+ size_t aLength) {
+ const uint32_t* src32 = reinterpret_cast<const uint32_t*>(aSrc);
+ uint32x4_t dst32;
+ if (aLength >= 2) {
+ // Load first 2 pixels
+ dst32 = vcombine_u32(vld1_u32(src32), vdup_n_u32(0));
+ // Load third pixel
+ if (aLength >= 3) {
+ dst32 = vld1q_lane_u32(src32 + 2, dst32, 2);
+ }
+ } else {
+ // Load single pixel
+ dst32 = vld1q_lane_u32(src32, vdupq_n_u32(0), 0);
+ }
+ return vreinterpretq_u16_u32(dst32);
+}
+
+// Store 1-3 pixels from a vector into memory without writing past them.
+static MOZ_ALWAYS_INLINE void StoreRemainder_NEON(uint8_t* aDst, size_t aLength,
+ const uint16x8_t& aSrc) {
+ uint32_t* dst32 = reinterpret_cast<uint32_t*>(aDst);
+ uint32x4_t src32 = vreinterpretq_u32_u16(aSrc);
+ if (aLength >= 2) {
+ // Store first 2 pixels
+ vst1_u32(dst32, vget_low_u32(src32));
+ // Store third pixel
+ if (aLength >= 3) {
+ vst1q_lane_u32(dst32 + 2, src32, 2);
+ }
+ } else {
+ // Store single pixel
+ vst1q_lane_u32(dst32, src32, 0);
+ }
+}
+
+// Premultiply vector of 4 pixels using splayed math.
+template <bool aSwapRB, bool aOpaqueAlpha>
+static MOZ_ALWAYS_INLINE uint16x8_t
+PremultiplyVector_NEON(const uint16x8_t& aSrc) {
+ // Isolate R and B with mask.
+ const uint16x8_t mask = vdupq_n_u16(0x00FF);
+ uint16x8_t rb = vandq_u16(aSrc, mask);
+ // Swap R and B if necessary.
+ if (aSwapRB) {
+ rb = vrev32q_u16(rb);
+ }
+ // Isolate G and A by shifting down to bottom of word.
+ uint16x8_t ga = vshrq_n_u16(aSrc, 8);
+
+ // Duplicate alphas to get vector of A1 A1 A2 A2 A3 A3 A4 A4
+ uint16x8_t alphas = vtrnq_u16(ga, ga).val[1];
+
+ // rb = rb*a + 255; rb += rb >> 8;
+ rb = vmlaq_u16(mask, rb, alphas);
+ rb = vsraq_n_u16(rb, rb, 8);
+
+ // If format is not opaque, force A to 255 so that A*alpha/255 = alpha
+ if (!aOpaqueAlpha) {
+ ga = vorrq_u16(ga, vreinterpretq_u16_u32(vdupq_n_u32(0x00FF0000)));
+ }
+ // ga = ga*a + 255; ga += ga >> 8;
+ ga = vmlaq_u16(mask, ga, alphas);
+ ga = vsraq_n_u16(ga, ga, 8);
+ // If format is opaque, force output A to be 255.
+ if (aOpaqueAlpha) {
+ ga = vorrq_u16(ga, vreinterpretq_u16_u32(vdupq_n_u32(0xFF000000)));
+ }
+
+ // Combine back to final pixel with (rb >> 8) | (ga & 0xFF00FF00)
+ return vsriq_n_u16(ga, rb, 8);
+}
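+
+// A minimal scalar sketch of the divide-by-255 trick used above; the helper
+// name is illustrative only and nothing in this file calls it, so it is kept
+// out of the build like the disabled specializations further below.
+#if 0
+static uint8_t PremultiplyComponentScalar(uint8_t aX, uint8_t aA) {
+  // t = x*a + 255; t += t >> 8; t >>= 8 computes ceil(x*a / 255) exactly for
+  // all 8-bit inputs, matching the vmlaq/vsraq/vsriq sequence above.
+  uint32_t t = uint32_t(aX) * aA + 255;
+  // e.g. x = 0xFF, a = 0x80: t = 0x807F; t + (t >> 8) = 0x80FF; >> 8 = 0x80.
+  return uint8_t((t + (t >> 8)) >> 8);
+}
+#endif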
+
+template <bool aSwapRB, bool aOpaqueAlpha>
+static MOZ_ALWAYS_INLINE void PremultiplyChunk_NEON(const uint8_t*& aSrc,
+ uint8_t*& aDst,
+ int32_t aAlignedRow,
+ int32_t aRemainder) {
+ // Process all 4-pixel chunks as one vector.
+ for (const uint8_t* end = aSrc + aAlignedRow; aSrc < end;) {
+ uint16x8_t px = vld1q_u16(reinterpret_cast<const uint16_t*>(aSrc));
+ px = PremultiplyVector_NEON<aSwapRB, aOpaqueAlpha>(px);
+ vst1q_u16(reinterpret_cast<uint16_t*>(aDst), px);
+ aSrc += 4 * 4;
+ aDst += 4 * 4;
+ }
+
+ // Handle any 1-3 remaining pixels.
+ if (aRemainder) {
+ uint16x8_t px = LoadRemainder_NEON(aSrc, aRemainder);
+ px = PremultiplyVector_NEON<aSwapRB, aOpaqueAlpha>(px);
+ StoreRemainder_NEON(aDst, aRemainder, px);
+ }
+}
+
+template <bool aSwapRB, bool aOpaqueAlpha>
+void PremultiplyRow_NEON(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength) {
+ int32_t alignedRow = 4 * (aLength & ~3);
+ int32_t remainder = aLength & 3;
+ PremultiplyChunk_NEON<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow,
+ remainder);
+}
+
+template <bool aSwapRB, bool aOpaqueAlpha>
+void Premultiply_NEON(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
+ int32_t aDstGap, IntSize aSize) {
+ int32_t alignedRow = 4 * (aSize.width & ~3);
+ int32_t remainder = aSize.width & 3;
+ // Fold remainder into stride gap.
+ aSrcGap += 4 * remainder;
+ aDstGap += 4 * remainder;
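+  // PremultiplyChunk_NEON only advances the pointers past the aligned pixels;
+  // the remainder pixels are processed in place, so the gaps skip those
+  // 4 * remainder bytes instead (e.g. width = 7: the loop advances 16 bytes
+  // and the gap covers the remaining 12 plus the original gap).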
+
+ for (int32_t height = aSize.height; height > 0; height--) {
+ PremultiplyChunk_NEON<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow,
+ remainder);
+ aSrc += aSrcGap;
+ aDst += aDstGap;
+ }
+}
+
+// Force instantiation of premultiply variants here.
+template void PremultiplyRow_NEON<false, false>(const uint8_t*, uint8_t*,
+ int32_t);
+template void PremultiplyRow_NEON<false, true>(const uint8_t*, uint8_t*,
+ int32_t);
+template void PremultiplyRow_NEON<true, false>(const uint8_t*, uint8_t*,
+ int32_t);
+template void PremultiplyRow_NEON<true, true>(const uint8_t*, uint8_t*,
+ int32_t);
+template void Premultiply_NEON<false, false>(const uint8_t*, int32_t, uint8_t*,
+ int32_t, IntSize);
+template void Premultiply_NEON<false, true>(const uint8_t*, int32_t, uint8_t*,
+ int32_t, IntSize);
+template void Premultiply_NEON<true, false>(const uint8_t*, int32_t, uint8_t*,
+ int32_t, IntSize);
+template void Premultiply_NEON<true, true>(const uint8_t*, int32_t, uint8_t*,
+ int32_t, IntSize);
+
+// This generates a table of fixed-point reciprocals representing 1/alpha
+// similar to the fallback implementation. However, the reciprocal must
+// ultimately be multiplied as an unsigned 9 bit upper part and a signed
+// 15 bit lower part to cheaply multiply. Thus, the lower 15 bits of the
+// reciprocal are masked off and stored in the low word. The upper 9 bits
+// are masked and shifted to fit into the high word. These then get
+// independently multiplied with the color component and recombined to
+// provide the full reciprocal multiply.
+#define UNPREMULQ_NEON(x) \
+ ((((0xFF00FFU / (x)) & 0xFF8000U) << 1) | ((0xFF00FFU / (x)) & 0x7FFFU))
+#define UNPREMULQ_NEON_2(x) UNPREMULQ_NEON(x), UNPREMULQ_NEON((x) + 1)
+#define UNPREMULQ_NEON_4(x) UNPREMULQ_NEON_2(x), UNPREMULQ_NEON_2((x) + 2)
+#define UNPREMULQ_NEON_8(x) UNPREMULQ_NEON_4(x), UNPREMULQ_NEON_4((x) + 4)
+#define UNPREMULQ_NEON_16(x) UNPREMULQ_NEON_8(x), UNPREMULQ_NEON_8((x) + 8)
+#define UNPREMULQ_NEON_32(x) UNPREMULQ_NEON_16(x), UNPREMULQ_NEON_16((x) + 16)
+static const uint32_t sUnpremultiplyTable_NEON[256] = {0,
+ UNPREMULQ_NEON(1),
+ UNPREMULQ_NEON_2(2),
+ UNPREMULQ_NEON_4(4),
+ UNPREMULQ_NEON_8(8),
+ UNPREMULQ_NEON_16(16),
+ UNPREMULQ_NEON_32(32),
+ UNPREMULQ_NEON_32(64),
+ UNPREMULQ_NEON_32(96),
+ UNPREMULQ_NEON_32(128),
+ UNPREMULQ_NEON_32(160),
+ UNPREMULQ_NEON_32(192),
+ UNPREMULQ_NEON_32(224)};
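+
+// For example, UNPREMULQ_NEON(255) = 0xFF00FF / 255 = 0x10001: the high 9
+// bits give (0x10000 << 1) = 0x20000 and the low 15 bits give 0x0001, so the
+// table entry is 0x00020001, i.e. hi = 2 and lo = 1, and the reciprocal
+// multiply below maps a component c to (c*2 + ((c*1*2) >> 16)) / 2 = c, as
+// expected for alpha = 255.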
+
+// Unpremultiply a vector of 4 pixels using splayed math and a reciprocal table
+// that avoids doing any actual division.
+template <bool aSwapRB>
+static MOZ_ALWAYS_INLINE uint16x8_t
+UnpremultiplyVector_NEON(const uint16x8_t& aSrc) {
+ // Isolate R and B with mask.
+ uint16x8_t rb = vandq_u16(aSrc, vdupq_n_u16(0x00FF));
+ // Swap R and B if necessary.
+ if (aSwapRB) {
+ rb = vrev32q_u16(rb);
+ }
+
+ // Isolate G and A by shifting down to bottom of word.
+ uint16x8_t ga = vshrq_n_u16(aSrc, 8);
+ // Extract the alphas for the 4 pixels from the now isolated words.
+ int a1 = vgetq_lane_u16(ga, 1);
+ int a2 = vgetq_lane_u16(ga, 3);
+ int a3 = vgetq_lane_u16(ga, 5);
+ int a4 = vgetq_lane_u16(ga, 7);
+
+ // First load all of the interleaved low and high portions of the reciprocals
+  // and combine them into a single vector as lo1 hi1 lo2 hi2 lo3 hi3 lo4 hi4
+ uint16x8_t q1234 = vreinterpretq_u16_u32(vld1q_lane_u32(
+ &sUnpremultiplyTable_NEON[a4],
+ vld1q_lane_u32(
+ &sUnpremultiplyTable_NEON[a3],
+ vld1q_lane_u32(
+ &sUnpremultiplyTable_NEON[a2],
+ vld1q_lane_u32(&sUnpremultiplyTable_NEON[a1], vdupq_n_u32(0), 0),
+ 1),
+ 2),
+ 3));
+ // Transpose the interleaved low/high portions so that we produce
+ // two separate duplicated vectors for the low and high portions respectively:
+ // lo1 lo1 lo2 lo2 lo3 lo3 lo4 lo4 and hi1 hi1 hi2 hi2 hi3 hi3 hi4 hi4
+ uint16x8x2_t q1234lohi = vtrnq_u16(q1234, q1234);
+
+ // VQDMULH is a signed multiply that doubles (*2) the result, then takes the
+ // high word. To work around the signedness and the doubling, the low
+ // portion of the reciprocal only stores the lower 15 bits, which fits in a
+  // signed 16 bit integer. The high 9 bit portion is effectively also doubled
+  // as a side-effect of being shifted for storage. Thus the output scale
+ // of doing a normal multiply by the high portion and the VQDMULH by the low
+ // portion are both doubled and can be safely added together. The resulting
+ // sum just needs to be halved (via VHADD) to thus cancel out the doubling.
+ // All this combines to produce a reciprocal multiply of the form:
+ // rb = ((rb * hi) + ((rb * lo * 2) >> 16)) / 2
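+  // For example, alpha = 128 has the table entry 0x00037E01 (hi = 3,
+  // lo = 0x7E01), so a premultiplied component of 64 unpremultiplies to
+  // (64*3 + ((64*0x7E01*2) >> 16)) / 2 = (192 + 63) / 2 = 127 ~= 64*255/128.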
+ rb = vhaddq_u16(
+ vmulq_u16(rb, q1234lohi.val[1]),
+ vreinterpretq_u16_s16(vqdmulhq_s16(
+ vreinterpretq_s16_u16(rb), vreinterpretq_s16_u16(q1234lohi.val[0]))));
+
+ // ga = ((ga * hi) + ((ga * lo * 2) >> 16)) / 2
+ ga = vhaddq_u16(
+ vmulq_u16(ga, q1234lohi.val[1]),
+ vreinterpretq_u16_s16(vqdmulhq_s16(
+ vreinterpretq_s16_u16(ga), vreinterpretq_s16_u16(q1234lohi.val[0]))));
+
+ // Combine to the final pixel with ((rb | (ga << 8)) & ~0xFF000000) | (aSrc &
+ // 0xFF000000), which inserts back in the original alpha value unchanged.
+ return vbslq_u16(vreinterpretq_u16_u32(vdupq_n_u32(0xFF000000)), aSrc,
+ vsliq_n_u16(rb, ga, 8));
+}
+
+template <bool aSwapRB>
+static MOZ_ALWAYS_INLINE void UnpremultiplyChunk_NEON(const uint8_t*& aSrc,
+ uint8_t*& aDst,
+ int32_t aAlignedRow,
+ int32_t aRemainder) {
+ // Process all 4-pixel chunks as one vector.
+ for (const uint8_t* end = aSrc + aAlignedRow; aSrc < end;) {
+ uint16x8_t px = vld1q_u16(reinterpret_cast<const uint16_t*>(aSrc));
+ px = UnpremultiplyVector_NEON<aSwapRB>(px);
+ vst1q_u16(reinterpret_cast<uint16_t*>(aDst), px);
+ aSrc += 4 * 4;
+ aDst += 4 * 4;
+ }
+
+ // Handle any 1-3 remaining pixels.
+ if (aRemainder) {
+ uint16x8_t px = LoadRemainder_NEON(aSrc, aRemainder);
+ px = UnpremultiplyVector_NEON<aSwapRB>(px);
+ StoreRemainder_NEON(aDst, aRemainder, px);
+ }
+}
+
+template <bool aSwapRB>
+void UnpremultiplyRow_NEON(const uint8_t* aSrc, uint8_t* aDst,
+ int32_t aLength) {
+ int32_t alignedRow = 4 * (aLength & ~3);
+ int32_t remainder = aLength & 3;
+ UnpremultiplyChunk_NEON<aSwapRB>(aSrc, aDst, alignedRow, remainder);
+}
+
+template <bool aSwapRB>
+void Unpremultiply_NEON(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
+ int32_t aDstGap, IntSize aSize) {
+ int32_t alignedRow = 4 * (aSize.width & ~3);
+ int32_t remainder = aSize.width & 3;
+ // Fold remainder into stride gap.
+ aSrcGap += 4 * remainder;
+ aDstGap += 4 * remainder;
+
+ for (int32_t height = aSize.height; height > 0; height--) {
+ UnpremultiplyChunk_NEON<aSwapRB>(aSrc, aDst, alignedRow, remainder);
+ aSrc += aSrcGap;
+ aDst += aDstGap;
+ }
+}
+
+// Force instantiation of unpremultiply variants here.
+template void UnpremultiplyRow_NEON<false>(const uint8_t*, uint8_t*, int32_t);
+template void UnpremultiplyRow_NEON<true>(const uint8_t*, uint8_t*, int32_t);
+template void Unpremultiply_NEON<false>(const uint8_t*, int32_t, uint8_t*,
+ int32_t, IntSize);
+template void Unpremultiply_NEON<true>(const uint8_t*, int32_t, uint8_t*,
+ int32_t, IntSize);
+
+// Swizzle a vector of 4 pixels, optionally swapping R and B and forcing
+// the alpha channel opaque.
+template <bool aSwapRB, bool aOpaqueAlpha>
+static MOZ_ALWAYS_INLINE uint16x8_t SwizzleVector_NEON(const uint16x8_t& aSrc) {
+ // Swap R and B, then add to G and A (forced to 255):
+ // (((src>>16) | (src << 16)) & 0x00FF00FF) |
+ // ((src | 0xFF000000) & ~0x00FF00FF)
+ return vbslq_u16(
+ vdupq_n_u16(0x00FF), vrev32q_u16(aSrc),
+ aOpaqueAlpha
+ ? vorrq_u16(aSrc, vreinterpretq_u16_u32(vdupq_n_u32(0xFF000000)))
+ : aSrc);
+}
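+
+// For example, each 0xAARRGGBB pixel becomes 0xAABBGGRR when aSwapRB is set,
+// with the alpha byte additionally forced to 0xFF when aOpaqueAlpha is set.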
+
+#if 0
+// These specializations currently do not profile faster than the generic versions,
+// so disable them for now.
+
+// Optimized implementations for when there is no R and B swap.
+template<>
+static MOZ_ALWAYS_INLINE uint16x8_t
+SwizzleVector_NEON<false, true>(const uint16x8_t& aSrc)
+{
+ // Force alpha to 255.
+ return vorrq_u16(aSrc, vreinterpretq_u16_u32(vdupq_n_u32(0xFF000000)));
+}
+
+template<>
+static MOZ_ALWAYS_INLINE uint16x8_t
+SwizzleVector_NEON<false, false>(const uint16x8_t& aSrc)
+{
+ return aSrc;
+}
+#endif
+
+template <bool aSwapRB, bool aOpaqueAlpha>
+static MOZ_ALWAYS_INLINE void SwizzleChunk_NEON(const uint8_t*& aSrc,
+ uint8_t*& aDst,
+ int32_t aAlignedRow,
+ int32_t aRemainder) {
+ // Process all 4-pixel chunks as one vector.
+ for (const uint8_t* end = aSrc + aAlignedRow; aSrc < end;) {
+ uint16x8_t px = vld1q_u16(reinterpret_cast<const uint16_t*>(aSrc));
+ px = SwizzleVector_NEON<aSwapRB, aOpaqueAlpha>(px);
+ vst1q_u16(reinterpret_cast<uint16_t*>(aDst), px);
+ aSrc += 4 * 4;
+ aDst += 4 * 4;
+ }
+
+ // Handle any 1-3 remaining pixels.
+ if (aRemainder) {
+ uint16x8_t px = LoadRemainder_NEON(aSrc, aRemainder);
+ px = SwizzleVector_NEON<aSwapRB, aOpaqueAlpha>(px);
+ StoreRemainder_NEON(aDst, aRemainder, px);
+ }
+}
+
+template <bool aSwapRB, bool aOpaqueAlpha>
+void SwizzleRow_NEON(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength) {
+ int32_t alignedRow = 4 * (aLength & ~3);
+ int32_t remainder = aLength & 3;
+ SwizzleChunk_NEON<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow, remainder);
+}
+
+template <bool aSwapRB, bool aOpaqueAlpha>
+void Swizzle_NEON(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
+ int32_t aDstGap, IntSize aSize) {
+ int32_t alignedRow = 4 * (aSize.width & ~3);
+ int32_t remainder = aSize.width & 3;
+ // Fold remainder into stride gap.
+ aSrcGap += 4 * remainder;
+ aDstGap += 4 * remainder;
+
+ for (int32_t height = aSize.height; height > 0; height--) {
+ SwizzleChunk_NEON<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow, remainder);
+ aSrc += aSrcGap;
+ aDst += aDstGap;
+ }
+}
+
+// Force instantiation of swizzle variants here.
+template void SwizzleRow_NEON<true, false>(const uint8_t*, uint8_t*, int32_t);
+template void SwizzleRow_NEON<true, true>(const uint8_t*, uint8_t*, int32_t);
+template void Swizzle_NEON<true, false>(const uint8_t*, int32_t, uint8_t*,
+ int32_t, IntSize);
+template void Swizzle_NEON<true, true>(const uint8_t*, int32_t, uint8_t*,
+ int32_t, IntSize);
+
+template <bool aSwapRB>
+void UnpackRowRGB24(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength);
+
+template <bool aSwapRB>
+void UnpackRowRGB24_NEON(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength) {
+ // Because this implementation will read an additional 4 bytes of data that
+ // is ignored and masked over, we cannot use the accelerated version for the
+ // last 1-5 pixels (3-15 bytes remaining) to guarantee we don't access memory
+ // outside the buffer (we read in 16 byte chunks).
+ if (aLength < 6) {
+ UnpackRowRGB24<aSwapRB>(aSrc, aDst, aLength);
+ return;
+ }
+
+ // Because we are expanding, we can only process the data back to front in
+ // case we are performing this in place.
+ int32_t alignedRow = (aLength - 2) & ~3;
+ int32_t remainder = aLength - alignedRow;
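+  // remainder is always 2-5 pixels, which guarantees that the last 16-byte
+  // vector load (ending at alignedRow * 3 + 4 bytes) stays within the row's
+  // aLength * 3 bytes; e.g. aLength = 7 -> alignedRow = 4, remainder = 3.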
+
+ const uint8_t* src = aSrc + alignedRow * 3;
+ uint8_t* dst = aDst + alignedRow * 4;
+
+ // Handle 2-5 remaining pixels.
+ UnpackRowRGB24<aSwapRB>(src, dst, remainder);
+
+ uint8x8_t masklo;
+ uint8x8_t maskhi;
+ if (aSwapRB) {
+ static const uint8_t masklo_data[] = {2, 1, 0, 0, 5, 4, 3, 0};
+ static const uint8_t maskhi_data[] = {4, 3, 2, 0, 7, 6, 5, 0};
+ masklo = vld1_u8(masklo_data);
+ maskhi = vld1_u8(maskhi_data);
+ } else {
+ static const uint8_t masklo_data[] = {0, 1, 2, 0, 3, 4, 5, 0};
+ static const uint8_t maskhi_data[] = {2, 3, 4, 0, 5, 6, 7, 0};
+ masklo = vld1_u8(masklo_data);
+ maskhi = vld1_u8(maskhi_data);
+ }
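+  // Each mask entry is a source byte index for one output byte (vtbl1_u8);
+  // the 0 entries are don't-care slots for the alpha bytes, which get OR'd
+  // to 0xFF below. e.g. without swapping, bytes R0 G0 B0 R1 G1 B1 become
+  // R0 G0 B0 _ R1 G1 B1 _ in the low half.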
+
+ uint8x16_t alpha = vreinterpretq_u8_u32(vdupq_n_u32(0xFF000000));
+
+ // Process all 4-pixel chunks as one vector.
+ src -= 4 * 3;
+ dst -= 4 * 4;
+ while (src >= aSrc) {
+ uint8x16_t px = vld1q_u8(src);
+ // G2R2B1G1 R1B0G0R0 -> X1R1G1B1 X0R0G0B0
+ uint8x8_t pxlo = vtbl1_u8(vget_low_u8(px), masklo);
+ // B3G3R3B2 G2R2B1G1 -> X3R3G3B3 X2R2G2B2
+ uint8x8_t pxhi =
+ vtbl1_u8(vext_u8(vget_low_u8(px), vget_high_u8(px), 4), maskhi);
+ px = vcombine_u8(pxlo, pxhi);
+ px = vorrq_u8(px, alpha);
+ vst1q_u8(dst, px);
+ src -= 4 * 3;
+ dst -= 4 * 4;
+ }
+}
+
+// Force instantiation of unpack variants here.
+template void UnpackRowRGB24_NEON<false>(const uint8_t*, uint8_t*, int32_t);
+template void UnpackRowRGB24_NEON<true>(const uint8_t*, uint8_t*, int32_t);
+
+} // namespace gfx
+} // namespace mozilla