diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 19:33:14 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 19:33:14 +0000 |
commit | 36d22d82aa202bb199967e9512281e9a53db42c9 (patch) | |
tree | 105e8c98ddea1c1e4784a60a5a6410fa416be2de /gfx/thebes/gfxAlphaRecoverySSE2.cpp | |
parent | Initial commit. (diff) | |
download | firefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.tar.xz firefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.zip |
Adding upstream version 115.7.0esr.upstream/115.7.0esr
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'gfx/thebes/gfxAlphaRecoverySSE2.cpp')
-rw-r--r-- | gfx/thebes/gfxAlphaRecoverySSE2.cpp | 234 |
1 files changed, 234 insertions, 0 deletions
diff --git a/gfx/thebes/gfxAlphaRecoverySSE2.cpp b/gfx/thebes/gfxAlphaRecoverySSE2.cpp new file mode 100644 index 0000000000..d64cb18bad --- /dev/null +++ b/gfx/thebes/gfxAlphaRecoverySSE2.cpp @@ -0,0 +1,234 @@ +/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*- + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "gfxAlphaRecovery.h" +#include "gfxImageSurface.h" +#include "nsDebug.h" +#include <emmintrin.h> + +// This file should only be compiled on x86 and x64 systems. Additionally, +// you'll need to compile it with -msse2 if you're using GCC on x86. + +#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_AMD64)) +__declspec(align(16)) static uint32_t greenMaski[] = {0x0000ff00, 0x0000ff00, + 0x0000ff00, 0x0000ff00}; +__declspec(align(16)) static uint32_t alphaMaski[] = {0xff000000, 0xff000000, + 0xff000000, 0xff000000}; +#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) +static uint32_t greenMaski[] __attribute__((aligned(16))) = { + 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00}; +static uint32_t alphaMaski[] __attribute__((aligned(16))) = { + 0xff000000, 0xff000000, 0xff000000, 0xff000000}; +#elif defined(__SUNPRO_CC) && (defined(__i386) || defined(__x86_64__)) +# pragma align 16(greenMaski, alphaMaski) +static uint32_t greenMaski[] = {0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00}; +static uint32_t alphaMaski[] = {0xff000000, 0xff000000, 0xff000000, 0xff000000}; +#endif + +bool gfxAlphaRecovery::RecoverAlphaSSE2(gfxImageSurface* blackSurf, + const gfxImageSurface* whiteSurf) { + mozilla::gfx::IntSize size = blackSurf->GetSize(); + + if (size != whiteSurf->GetSize() || + (blackSurf->Format() != mozilla::gfx::SurfaceFormat::A8R8G8B8_UINT32 && + blackSurf->Format() != mozilla::gfx::SurfaceFormat::X8R8G8B8_UINT32) || + (whiteSurf->Format() != mozilla::gfx::SurfaceFormat::A8R8G8B8_UINT32 && + whiteSurf->Format() != mozilla::gfx::SurfaceFormat::X8R8G8B8_UINT32)) + return false; + + blackSurf->Flush(); + whiteSurf->Flush(); + + unsigned char* blackData = blackSurf->Data(); + unsigned char* whiteData = whiteSurf->Data(); + + if ((NS_PTR_TO_UINT32(blackData) & 0xf) != + (NS_PTR_TO_UINT32(whiteData) & 0xf) || + (blackSurf->Stride() - whiteSurf->Stride()) & 0xf) { + // Cannot keep these in alignment. + return false; + } + + __m128i greenMask = _mm_load_si128((__m128i*)greenMaski); + __m128i alphaMask = _mm_load_si128((__m128i*)alphaMaski); + + for (int32_t i = 0; i < size.height; ++i) { + int32_t j = 0; + // Loop single pixels until at 4 byte alignment. + while (NS_PTR_TO_UINT32(blackData) & 0xf && j < size.width) { + *((uint32_t*)blackData) = + RecoverPixel(*reinterpret_cast<uint32_t*>(blackData), + *reinterpret_cast<uint32_t*>(whiteData)); + blackData += 4; + whiteData += 4; + j++; + } + // This extra loop allows the compiler to do some more clever registry + // management and makes it about 5% faster than with only the 4 pixel + // at a time loop. + for (; j < size.width - 8; j += 8) { + __m128i black1 = _mm_load_si128((__m128i*)blackData); + __m128i white1 = _mm_load_si128((__m128i*)whiteData); + __m128i black2 = _mm_load_si128((__m128i*)(blackData + 16)); + __m128i white2 = _mm_load_si128((__m128i*)(whiteData + 16)); + + // Execute the same instructions as described in RecoverPixel, only + // using an SSE2 packed saturated subtract. + white1 = _mm_subs_epu8(white1, black1); + white2 = _mm_subs_epu8(white2, black2); + white1 = _mm_subs_epu8(greenMask, white1); + white2 = _mm_subs_epu8(greenMask, white2); + // Producing the final black pixel in an XMM register and storing + // that is actually faster than doing a masked store since that + // does an unaligned storage. We have the black pixel in a register + // anyway. + black1 = _mm_andnot_si128(alphaMask, black1); + black2 = _mm_andnot_si128(alphaMask, black2); + white1 = _mm_slli_si128(white1, 2); + white2 = _mm_slli_si128(white2, 2); + white1 = _mm_and_si128(alphaMask, white1); + white2 = _mm_and_si128(alphaMask, white2); + black1 = _mm_or_si128(white1, black1); + black2 = _mm_or_si128(white2, black2); + + _mm_store_si128((__m128i*)blackData, black1); + _mm_store_si128((__m128i*)(blackData + 16), black2); + blackData += 32; + whiteData += 32; + } + for (; j < size.width - 4; j += 4) { + __m128i black = _mm_load_si128((__m128i*)blackData); + __m128i white = _mm_load_si128((__m128i*)whiteData); + + white = _mm_subs_epu8(white, black); + white = _mm_subs_epu8(greenMask, white); + black = _mm_andnot_si128(alphaMask, black); + white = _mm_slli_si128(white, 2); + white = _mm_and_si128(alphaMask, white); + black = _mm_or_si128(white, black); + _mm_store_si128((__m128i*)blackData, black); + blackData += 16; + whiteData += 16; + } + // Loop single pixels until we're done. + while (j < size.width) { + *((uint32_t*)blackData) = + RecoverPixel(*reinterpret_cast<uint32_t*>(blackData), + *reinterpret_cast<uint32_t*>(whiteData)); + blackData += 4; + whiteData += 4; + j++; + } + blackData += blackSurf->Stride() - j * 4; + whiteData += whiteSurf->Stride() - j * 4; + } + + blackSurf->MarkDirty(); + + return true; +} + +static int32_t ByteAlignment(int32_t aAlignToLog2, int32_t aX, int32_t aY = 0, + int32_t aStride = 1) { + return (aX + aStride * aY) & ((1 << aAlignToLog2) - 1); +} + +/*static*/ mozilla::gfx::IntRect gfxAlphaRecovery::AlignRectForSubimageRecovery( + const mozilla::gfx::IntRect& aRect, gfxImageSurface* aSurface) { + NS_ASSERTION( + mozilla::gfx::SurfaceFormat::A8R8G8B8_UINT32 == aSurface->Format(), + "Thebes grew support for non-ARGB32 COLOR_ALPHA?"); + static const int32_t kByteAlignLog2 = GoodAlignmentLog2(); + static const int32_t bpp = 4; + static const int32_t pixPerAlign = (1 << kByteAlignLog2) / bpp; + // + // We're going to create a subimage of the surface with size + // <sw,sh> for alpha recovery, and want a SIMD fast-path. The + // rect <x,y, w,h> /needs/ to be redrawn, but it might not be + // properly aligned for SIMD. So we want to find a rect <x',y', + // w',h'> that's a superset of what needs to be redrawn but is + // properly aligned. Proper alignment is + // + // BPP * (x' + y' * sw) \cong 0 (mod ALIGN) + // BPP * w' \cong BPP * sw (mod ALIGN) + // + // (We assume the pixel at surface <0,0> is already ALIGN'd.) + // That rect (obviously) has to fit within the surface bounds, and + // we should also minimize the extra pixels redrawn only for + // alignment's sake. So we also want + // + // minimize <x',y', w',h'> + // 0 <= x' <= x + // 0 <= y' <= y + // w <= w' <= sw + // h <= h' <= sh + // + // This is a messy integer non-linear programming problem, except + // ... we can assume that ALIGN/BPP is a very small constant. So, + // brute force is viable. The algorithm below will find a + // solution if one exists, but isn't guaranteed to find the + // minimum solution. (For SSE2, ALIGN/BPP = 4, so it'll do at + // most 64 iterations below). In what's likely the common case, + // an already-aligned rectangle, it only needs 1 iteration. + // + // Is this alignment worth doing? Recovering alpha will take work + // proportional to w*h (assuming alpha recovery computation isn't + // memory bound). This analysis can lead to O(w+h) extra work + // (with small constants). In exchange, we expect to shave off a + // ALIGN/BPP constant by using SIMD-ized alpha recovery. So as + // w*h diverges from w+h, the win factor approaches ALIGN/BPP. We + // only really care about the w*h >> w+h case anyway; others + // should be fast enough even with the overhead. (Unless the cost + // of repainting the expanded rect is high, but in that case + // SIMD-ized alpha recovery won't make a difference so this code + // shouldn't be called.) + // + mozilla::gfx::IntSize surfaceSize = aSurface->GetSize(); + const int32_t stride = bpp * surfaceSize.width; + if (stride != aSurface->Stride()) { + NS_WARNING("Unexpected stride, falling back on slow alpha recovery"); + return aRect; + } + + const int32_t x = aRect.X(), y = aRect.Y(), w = aRect.Width(), + h = aRect.Height(); + const int32_t r = x + w; + const int32_t sw = surfaceSize.width; + const int32_t strideAlign = ByteAlignment(kByteAlignLog2, stride); + + // The outer two loops below keep the rightmost (|r| above) and + // bottommost pixels in |aRect| fixed wrt <x,y>, to ensure that we + // return only a superset of the original rect. These loops + // search for an aligned top-left pixel by trying to expand <x,y> + // left and up by <dx,dy> pixels, respectively. + // + // Then if a properly-aligned top-left pixel is found, the + // innermost loop tries to find an aligned stride by moving the + // rightmost pixel rightward by dr. + int32_t dx, dy, dr; + for (dy = 0; (dy < pixPerAlign) && (y - dy >= 0); ++dy) { + for (dx = 0; (dx < pixPerAlign) && (x - dx >= 0); ++dx) { + if (0 != ByteAlignment(kByteAlignLog2, bpp * (x - dx), y - dy, stride)) { + continue; + } + for (dr = 0; (dr < pixPerAlign) && (r + dr <= sw); ++dr) { + if (strideAlign == ByteAlignment(kByteAlignLog2, bpp * (w + dr + dx))) { + goto FOUND_SOLUTION; + } + } + } + } + + // Didn't find a solution. + return aRect; + +FOUND_SOLUTION: + mozilla::gfx::IntRect solution = + mozilla::gfx::IntRect(x - dx, y - dy, w + dr + dx, h + dy); + MOZ_ASSERT( + mozilla::gfx::IntRect(0, 0, sw, surfaceSize.height).Contains(solution), + "'Solution' extends outside surface bounds!"); + return solution; +} |