summary | refs | log | tree | commit | diff | stats
path: root/gfx/thebes/gfxAlphaRecoverySSE2.cpp
diff options
context:
space:
mode:
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 19:33:14 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 19:33:14 +0000
commit: 36d22d82aa202bb199967e9512281e9a53db42c9 (patch)
tree: 105e8c98ddea1c1e4784a60a5a6410fa416be2de /gfx/thebes/gfxAlphaRecoverySSE2.cpp
parent: Initial commit. (diff)
download: firefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.tar.xz
firefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.zip
Adding upstream version 115.7.0esr. [upstream/115.7.0esr]
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'gfx/thebes/gfxAlphaRecoverySSE2.cpp')
-rw-r--r-- gfx/thebes/gfxAlphaRecoverySSE2.cpp 234
1 files changed, 234 insertions, 0 deletions
diff --git a/gfx/thebes/gfxAlphaRecoverySSE2.cpp b/gfx/thebes/gfxAlphaRecoverySSE2.cpp
new file mode 100644
index 0000000000..d64cb18bad
--- /dev/null
+++ b/gfx/thebes/gfxAlphaRecoverySSE2.cpp
@@ -0,0 +1,234 @@
+/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*-
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "gfxAlphaRecovery.h"
+#include "gfxImageSurface.h"
+#include "nsDebug.h"
+#include <emmintrin.h>
+
+// This file should only be compiled on x86 and x64 systems. Additionally,
+// you'll need to compile it with -msse2 if you're using GCC on x86.
+
+// 16-byte-aligned constants that RecoverAlphaSSE2 loads with _mm_load_si128,
+// which requires an aligned address. Each array holds the same 32-bit
+// per-pixel mask four times, i.e. one copy for every pixel that fits in an
+// XMM register:
+//   greenMaski selects the 0x0000ff00 byte of each pixel,
+//   alphaMaski selects the 0xff000000 byte of each pixel.
+// alignas(16) is standard C++11, so no per-compiler alignment syntax is
+// needed, and the arrays are const because they are only ever read.
+alignas(16) static const uint32_t greenMaski[4] = {0x0000ff00, 0x0000ff00,
+                                                   0x0000ff00, 0x0000ff00};
+alignas(16) static const uint32_t alphaMaski[4] = {0xff000000, 0xff000000,
+                                                   0xff000000, 0xff000000};
+
+// SSE2 implementation of alpha recovery.
+//
+// Walks |blackSurf| and |whiteSurf| row by row and overwrites every pixel of
+// |blackSurf| in place. Single pixels go through RecoverPixel(); 16-byte
+// aligned runs of 8 or 4 pixels go through the equivalent packed SSE2
+// sequence.
+//
+// Returns false, before this function writes any pixel, when
+//   - the two surfaces differ in size,
+//   - either format is neither A8R8G8B8_UINT32 nor X8R8G8B8_UINT32, or
+//   - the two buffers cannot be kept at the same 16-byte alignment, i.e.
+//     their start addresses or their strides differ modulo 16.
+// Returns true once all rows are processed and |blackSurf| is marked dirty.
+bool gfxAlphaRecovery::RecoverAlphaSSE2(gfxImageSurface* blackSurf,
+                                        const gfxImageSurface* whiteSurf) {
+  mozilla::gfx::IntSize size = blackSurf->GetSize();
+
+  if (size != whiteSurf->GetSize() ||
+      (blackSurf->Format() != mozilla::gfx::SurfaceFormat::A8R8G8B8_UINT32 &&
+       blackSurf->Format() != mozilla::gfx::SurfaceFormat::X8R8G8B8_UINT32) ||
+      (whiteSurf->Format() != mozilla::gfx::SurfaceFormat::A8R8G8B8_UINT32 &&
+       whiteSurf->Format() != mozilla::gfx::SurfaceFormat::X8R8G8B8_UINT32)) {
+    return false;
+  }
+
+  blackSurf->Flush();
+  whiteSurf->Flush();
+
+  unsigned char* blackData = blackSurf->Data();
+  unsigned char* whiteData = whiteSurf->Data();
+
+  // The strides are read once; nothing below changes the surface objects,
+  // only the pixel bytes behind the raw pointers.
+  const auto blackStride = blackSurf->Stride();
+  const auto whiteStride = whiteSurf->Stride();
+
+  if ((NS_PTR_TO_UINT32(blackData) & 0xf) !=
+          (NS_PTR_TO_UINT32(whiteData) & 0xf) ||
+      ((blackStride - whiteStride) & 0xf)) {
+    // Cannot keep these in alignment.
+    return false;
+  }
+
+  const __m128i greenMask =
+      _mm_load_si128(reinterpret_cast<const __m128i*>(greenMaski));
+  const __m128i alphaMask =
+      _mm_load_si128(reinterpret_cast<const __m128i*>(alphaMaski));
+
+  for (int32_t i = 0; i < size.height; ++i) {
+    int32_t j = 0;
+    // Loop single pixels until blackData is 16-byte aligned, as required by
+    // the aligned loads and stores below. whiteData has the same offset
+    // modulo 16 (checked above), so it becomes aligned at the same time.
+    while ((NS_PTR_TO_UINT32(blackData) & 0xf) && j < size.width) {
+      *reinterpret_cast<uint32_t*>(blackData) =
+          RecoverPixel(*reinterpret_cast<uint32_t*>(blackData),
+                       *reinterpret_cast<uint32_t*>(whiteData));
+      blackData += 4;
+      whiteData += 4;
+      j++;
+    }
+    // This extra loop allows the compiler to do some more clever register
+    // management and makes it about 5% faster than with only the 4 pixel
+    // at a time loop. The bound is inclusive so that a remainder of exactly
+    // 8 pixels is still handled here; the 32 bytes read never pass the end
+    // of the row because j + 8 <= width.
+    for (; j <= size.width - 8; j += 8) {
+      __m128i black1 = _mm_load_si128(reinterpret_cast<__m128i*>(blackData));
+      __m128i white1 = _mm_load_si128(reinterpret_cast<__m128i*>(whiteData));
+      __m128i black2 =
+          _mm_load_si128(reinterpret_cast<__m128i*>(blackData + 16));
+      __m128i white2 =
+          _mm_load_si128(reinterpret_cast<__m128i*>(whiteData + 16));
+
+      // Execute the same instructions as described in RecoverPixel, only
+      // using an SSE2 packed saturated subtract.
+      white1 = _mm_subs_epu8(white1, black1);
+      white2 = _mm_subs_epu8(white2, black2);
+      white1 = _mm_subs_epu8(greenMask, white1);
+      white2 = _mm_subs_epu8(greenMask, white2);
+      // Producing the final black pixel in an XMM register and storing
+      // that is actually faster than doing a masked store since that
+      // does an unaligned store. We have the black pixel in a register
+      // anyway.
+      black1 = _mm_andnot_si128(alphaMask, black1);
+      black2 = _mm_andnot_si128(alphaMask, black2);
+      // Shifting the whole register left by two bytes moves each pixel's
+      // 0x0000ff00 byte into the 0xff000000 position; the AND with
+      // alphaMask then keeps only that byte.
+      white1 = _mm_slli_si128(white1, 2);
+      white2 = _mm_slli_si128(white2, 2);
+      white1 = _mm_and_si128(alphaMask, white1);
+      white2 = _mm_and_si128(alphaMask, white2);
+      black1 = _mm_or_si128(white1, black1);
+      black2 = _mm_or_si128(white2, black2);
+
+      _mm_store_si128(reinterpret_cast<__m128i*>(blackData), black1);
+      _mm_store_si128(reinterpret_cast<__m128i*>(blackData + 16), black2);
+      blackData += 32;
+      whiteData += 32;
+    }
+    // Same sequence, four pixels (one XMM register) at a time. Inclusive
+    // bound for the same reason as above.
+    for (; j <= size.width - 4; j += 4) {
+      __m128i black = _mm_load_si128(reinterpret_cast<__m128i*>(blackData));
+      __m128i white = _mm_load_si128(reinterpret_cast<__m128i*>(whiteData));
+
+      white = _mm_subs_epu8(white, black);
+      white = _mm_subs_epu8(greenMask, white);
+      black = _mm_andnot_si128(alphaMask, black);
+      white = _mm_slli_si128(white, 2);
+      white = _mm_and_si128(alphaMask, white);
+      black = _mm_or_si128(white, black);
+      _mm_store_si128(reinterpret_cast<__m128i*>(blackData), black);
+      blackData += 16;
+      whiteData += 16;
+    }
+    // Loop single pixels until we're done.
+    while (j < size.width) {
+      *reinterpret_cast<uint32_t*>(blackData) =
+          RecoverPixel(*reinterpret_cast<uint32_t*>(blackData),
+                       *reinterpret_cast<uint32_t*>(whiteData));
+      blackData += 4;
+      whiteData += 4;
+      j++;
+    }
+    // Skip from the end of the processed pixels to the start of the next
+    // row.
+    blackData += blackStride - j * 4;
+    whiteData += whiteStride - j * 4;
+  }
+
+  blackSurf->MarkDirty();
+
+  return true;
+}
+
+// Returns the byte offset |aX + aStride * aY| reduced modulo
+// 2^|aAlignToLog2|, i.e. how far that offset lies past the previous
+// 2^|aAlignToLog2|-byte boundary. With the default |aY| and |aStride| this
+// is simply |aX| modulo the alignment.
+//
+// The sum is formed in unsigned arithmetic: only its low bits are kept, and
+// unsigned wrap-around is well defined whereas signed overflow is not. For
+// inputs that do not overflow the result is the same as with signed math.
+// |aAlignToLog2| is expected to be in the range [0, 31].
+static int32_t ByteAlignment(int32_t aAlignToLog2, int32_t aX, int32_t aY = 0,
+                             int32_t aStride = 1) {
+  const uint32_t mask = (uint32_t{1} << aAlignToLog2) - 1;
+  const uint32_t offset =
+      static_cast<uint32_t>(aX) +
+      static_cast<uint32_t>(aStride) * static_cast<uint32_t>(aY);
+  return static_cast<int32_t>(offset & mask);
+}
+
+// Grows |aRect| into a rectangle of |aSurface| that is aligned well enough
+// for the SIMD alpha-recovery fast path. Returns |aRect| unchanged when the
+// surface stride is not width * 4 or when no aligned superset is found.
+//
+// Background: a subimage of the surface (size <sw,sh>) is about to be
+// created for alpha recovery. The rect <x,y, w,h> /must/ be redrawn, but it
+// may not be SIMD-aligned, so we look for a superset <x',y', w',h'> that
+// is. "Aligned" means
+//
+//   BPP * (x' + y' * sw) \cong 0         (mod ALIGN)
+//   BPP * w'             \cong BPP * sw  (mod ALIGN)
+//
+// (assuming the pixel at surface <0,0> is itself ALIGN'd). The result must
+// stay inside the surface, and the extra pixels redrawn purely for
+// alignment should be few, so we additionally want
+//
+//   minimize <x',y', w',h'>
+//   0 <= x' <= x
+//   0 <= y' <= y
+//   w <= w' <= sw
+//   h <= h' <= sh
+//
+// In general that is a messy integer non-linear programming problem, but
+// ALIGN/BPP is a very small constant, so brute force is viable. The search
+// below finds a solution whenever one exists, though not necessarily the
+// minimal one. For SSE2, ALIGN/BPP = 4, giving at most 64 iterations; an
+// already-aligned rectangle (probably the common case) needs just one.
+//
+// Is it worth it? Alpha recovery costs work proportional to w*h (assuming
+// it is not memory bound), while this analysis can add O(w+h) extra work
+// with small constants. In return SIMD recovery is expected to shave off a
+// factor of ALIGN/BPP, and as w*h diverges from w+h the win approaches that
+// factor. Only the w*h >> w+h case really matters; everything else should
+// be fast enough even with the overhead. (If repainting the expanded rect
+// is expensive, SIMD recovery won't help and this code shouldn't be
+// called.)
+/*static*/ mozilla::gfx::IntRect gfxAlphaRecovery::AlignRectForSubimageRecovery(
+    const mozilla::gfx::IntRect& aRect, gfxImageSurface* aSurface) {
+  NS_ASSERTION(
+      mozilla::gfx::SurfaceFormat::A8R8G8B8_UINT32 == aSurface->Format(),
+      "Thebes grew support for non-ARGB32 COLOR_ALPHA?");
+  static const int32_t kByteAlignLog2 = GoodAlignmentLog2();
+  static const int32_t bpp = 4;
+  static const int32_t pixPerAlign = (1 << kByteAlignLog2) / bpp;
+
+  const mozilla::gfx::IntSize surfaceSize = aSurface->GetSize();
+  const int32_t surfaceWidth = surfaceSize.width;
+  const int32_t stride = bpp * surfaceWidth;
+  if (aSurface->Stride() != stride) {
+    NS_WARNING("Unexpected stride, falling back on slow alpha recovery");
+    return aRect;
+  }
+
+  const int32_t left = aRect.X();
+  const int32_t top = aRect.Y();
+  const int32_t width = aRect.Width();
+  const int32_t height = aRect.Height();
+  const int32_t right = left + width;
+  const int32_t strideAlign = ByteAlignment(kByteAlignLog2, stride);
+
+  // The two outer loops keep the rightmost (|right|) and bottommost pixels
+  // of |aRect| where they are, so whatever is returned is a superset of the
+  // original rect. They hunt for an aligned top-left pixel by pushing the
+  // origin up by |growUp| and left by |growLeft| pixels.
+  for (int32_t growUp = 0; growUp < pixPerAlign && top - growUp >= 0;
+       ++growUp) {
+    for (int32_t growLeft = 0;
+         growLeft < pixPerAlign && left - growLeft >= 0; ++growLeft) {
+      const int32_t originMisalign = ByteAlignment(
+          kByteAlignLog2, bpp * (left - growLeft), top - growUp, stride);
+      if (originMisalign != 0) {
+        continue;
+      }
+
+      // The top-left pixel is aligned; now look for an aligned row length
+      // by pushing the right edge rightward by |growRight| pixels.
+      for (int32_t growRight = 0;
+           growRight < pixPerAlign && right + growRight <= surfaceWidth;
+           ++growRight) {
+        const int32_t candidateWidth = width + growRight + growLeft;
+        if (ByteAlignment(kByteAlignLog2, bpp * candidateWidth) !=
+            strideAlign) {
+          continue;
+        }
+
+        const mozilla::gfx::IntRect solution(left - growLeft, top - growUp,
+                                             candidateWidth, height + growUp);
+        MOZ_ASSERT(mozilla::gfx::IntRect(0, 0, surfaceWidth, surfaceSize.height)
+                       .Contains(solution),
+                   "'Solution' extends outside surface bounds!");
+        return solution;
+      }
+    }
+  }
+
+  // Didn't find a solution.
+  return aRect;
+}