diff options
Diffstat (limited to 'gfx/thebes/gfxAlphaRecoveryGeneric.h')
-rw-r--r-- | gfx/thebes/gfxAlphaRecoveryGeneric.h | 129 |
1 files changed, 129 insertions, 0 deletions
diff --git a/gfx/thebes/gfxAlphaRecoveryGeneric.h b/gfx/thebes/gfxAlphaRecoveryGeneric.h new file mode 100644 index 0000000000..84db0fea0e --- /dev/null +++ b/gfx/thebes/gfxAlphaRecoveryGeneric.h @@ -0,0 +1,129 @@ +/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*- + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +#ifndef _GFXALPHARECOVERY_GENERIC_H_ +#define _GFXALPHARECOVERY_GENERIC_H_ + +#include "gfxAlphaRecovery.h" +#include "gfxImageSurface.h" +#include "nsDebug.h" +#include <xsimd/xsimd.hpp> + +template <typename Arch> +bool gfxAlphaRecovery::RecoverAlphaGeneric(gfxImageSurface* blackSurf, + const gfxImageSurface* whiteSurf) { + mozilla::gfx::IntSize size = blackSurf->GetSize(); + + if (size != whiteSurf->GetSize() || + (blackSurf->Format() != mozilla::gfx::SurfaceFormat::A8R8G8B8_UINT32 && + blackSurf->Format() != mozilla::gfx::SurfaceFormat::X8R8G8B8_UINT32) || + (whiteSurf->Format() != mozilla::gfx::SurfaceFormat::A8R8G8B8_UINT32 && + whiteSurf->Format() != mozilla::gfx::SurfaceFormat::X8R8G8B8_UINT32)) + return false; + + blackSurf->Flush(); + whiteSurf->Flush(); + + unsigned char* blackData = blackSurf->Data(); + unsigned char* whiteData = whiteSurf->Data(); + + if ((NS_PTR_TO_UINT32(blackData) & 0xf) != + (NS_PTR_TO_UINT32(whiteData) & 0xf) || + (blackSurf->Stride() - whiteSurf->Stride()) & 0xf) { + // Cannot keep these in alignment. + return false; + } + + alignas(Arch::alignment()) static const uint8_t greenMaski[] = { + 0x00, 0xff, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00, + 0x00, 0xff, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00, + }; + alignas(Arch::alignment()) static const uint8_t alphaMaski[] = { + 0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff, + 0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff, + }; + + using batch_type = xsimd::batch<uint8_t, Arch>; + constexpr size_t batch_size = batch_type::size; + static_assert(batch_size == 16); + + batch_type greenMask = batch_type::load_aligned(greenMaski); + batch_type alphaMask = batch_type::load_aligned(alphaMaski); + + for (int32_t i = 0; i < size.height; ++i) { + int32_t j = 0; + // Loop single pixels until at 4 byte alignment. + while (NS_PTR_TO_UINT32(blackData) & 0xf && j < size.width) { + *((uint32_t*)blackData) = + RecoverPixel(*reinterpret_cast<uint32_t*>(blackData), + *reinterpret_cast<uint32_t*>(whiteData)); + blackData += 4; + whiteData += 4; + j++; + } + // This extra loop allows the compiler to do some more clever registry + // management and makes it about 5% faster than with only the 4 pixel + // at a time loop. + for (; j < size.width - 8; j += 8) { + auto black1 = batch_type::load_aligned(blackData); + auto white1 = batch_type::load_aligned(whiteData); + auto black2 = batch_type::load_aligned(blackData + batch_size); + auto white2 = batch_type::load_aligned(whiteData + batch_size); + + // Execute the same instructions as described in RecoverPixel, only + // using an SSE2 packed saturated subtract. + white1 = xsimd::ssub(white1, black1); + white2 = xsimd::ssub(white2, black2); + white1 = xsimd::ssub(greenMask, white1); + white2 = xsimd::ssub(greenMask, white2); + // Producing the final black pixel in an XMM register and storing + // that is actually faster than doing a masked store since that + // does an unaligned storage. We have the black pixel in a register + // anyway. + black1 = xsimd::bitwise_andnot(black1, alphaMask); + black2 = xsimd::bitwise_andnot(black2, alphaMask); + white1 = xsimd::slide_left<2>(white1); + white2 = xsimd::slide_left<2>(white2); + white1 &= alphaMask; + white2 &= alphaMask; + black1 |= white1; + black2 |= white2; + + black1.store_aligned(blackData); + black2.store_aligned(blackData + batch_size); + blackData += 2 * batch_size; + whiteData += 2 * batch_size; + } + for (; j < size.width - 4; j += 4) { + auto black = batch_type::load_aligned(blackData); + auto white = batch_type::load_aligned(whiteData); + + white = xsimd::ssub(white, black); + white = xsimd::ssub(greenMask, white); + black = xsimd::bitwise_andnot(black, alphaMask); + white = xsimd::slide_left<2>(white); + white &= alphaMask; + black |= white; + black.store_aligned(blackData); + blackData += batch_size; + whiteData += batch_size; + } + // Loop single pixels until we're done. + while (j < size.width) { + *((uint32_t*)blackData) = + RecoverPixel(*reinterpret_cast<uint32_t*>(blackData), + *reinterpret_cast<uint32_t*>(whiteData)); + blackData += 4; + whiteData += 4; + j++; + } + blackData += blackSurf->Stride() - j * 4; + whiteData += whiteSurf->Stride() - j * 4; + } + + blackSurf->MarkDirty(); + + return true; +} +#endif |