diff options
Diffstat (limited to 'gfx/ycbcr')
-rw-r--r-- | gfx/ycbcr/LICENSE | 27 | ||||
-rw-r--r-- | gfx/ycbcr/README | 8 | ||||
-rw-r--r-- | gfx/ycbcr/YCbCrUtils.cpp | 387 | ||||
-rw-r--r-- | gfx/ycbcr/YCbCrUtils.h | 49 | ||||
-rw-r--r-- | gfx/ycbcr/chromium_types.h | 50 | ||||
-rw-r--r-- | gfx/ycbcr/moz.build | 66 | ||||
-rw-r--r-- | gfx/ycbcr/scale_yuv_argb.cpp | 1132 | ||||
-rw-r--r-- | gfx/ycbcr/scale_yuv_argb.h | 39 | ||||
-rw-r--r-- | gfx/ycbcr/ycbcr_to_rgb565.cpp | 672 | ||||
-rw-r--r-- | gfx/ycbcr/ycbcr_to_rgb565.h | 72 | ||||
-rw-r--r-- | gfx/ycbcr/yuv_convert.cpp | 577 | ||||
-rw-r--r-- | gfx/ycbcr/yuv_convert.h | 123 | ||||
-rw-r--r-- | gfx/ycbcr/yuv_convert_arm.cpp | 232 | ||||
-rw-r--r-- | gfx/ycbcr/yuv_convert_mmx.cpp | 45 | ||||
-rw-r--r-- | gfx/ycbcr/yuv_convert_sse2.cpp | 47 | ||||
-rw-r--r-- | gfx/ycbcr/yuv_row.h | 154 | ||||
-rw-r--r-- | gfx/ycbcr/yuv_row_arm.s | 304 | ||||
-rw-r--r-- | gfx/ycbcr/yuv_row_c.cpp | 133 | ||||
-rw-r--r-- | gfx/ycbcr/yuv_row_other.cpp | 34 | ||||
-rw-r--r-- | gfx/ycbcr/yuv_row_posix.cpp | 914 | ||||
-rw-r--r-- | gfx/ycbcr/yuv_row_table.cpp | 233 | ||||
-rw-r--r-- | gfx/ycbcr/yuv_row_win.cpp | 506 | ||||
-rw-r--r-- | gfx/ycbcr/yuv_row_win64.cpp | 205 |
23 files changed, 6009 insertions, 0 deletions
diff --git a/gfx/ycbcr/LICENSE b/gfx/ycbcr/LICENSE new file mode 100644 index 0000000000..8dc35041de --- /dev/null +++ b/gfx/ycbcr/LICENSE @@ -0,0 +1,27 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/gfx/ycbcr/README b/gfx/ycbcr/README new file mode 100644 index 0000000000..8910a2a2b2 --- /dev/null +++ b/gfx/ycbcr/README @@ -0,0 +1,8 @@ +This color conversion code is from the Chromium open source project available here: + +http://code.google.com/chromium/ + +The code comes from svn revision 63840 on 2010-10-26. + +It has been superseded upstream by libyuv (which is spawned off it). Bug 791941 is about +trying to replace this code with libyuv. diff --git a/gfx/ycbcr/YCbCrUtils.cpp b/gfx/ycbcr/YCbCrUtils.cpp new file mode 100644 index 0000000000..b2b5a4f293 --- /dev/null +++ b/gfx/ycbcr/YCbCrUtils.cpp @@ -0,0 +1,387 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 4 -*- + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "mozilla/EndianUtils.h" +#include "gfx2DGlue.h" +#include "mozilla/gfx/Swizzle.h" + +#include "YCbCrUtils.h" +#include "yuv_convert.h" +#include "ycbcr_to_rgb565.h" +#include "libyuv.h" + +namespace mozilla { +namespace gfx { + +// clang-format off + +static YUVType GetYUVType(const layers::PlanarYCbCrData& aData) { + switch (aData.mChromaSubsampling) { + case ChromaSubsampling::FULL: + return aData.mCbCrStride > 0 ? YV24 : Y8; + case ChromaSubsampling::HALF_WIDTH: + return YV16; + case ChromaSubsampling::HALF_WIDTH_AND_HEIGHT: + return YV12; + } + MOZ_CRASH("Unknown chroma subsampling"); +} + +void +GetYCbCrToRGBDestFormatAndSize(const layers::PlanarYCbCrData& aData, + SurfaceFormat& aSuggestedFormat, + IntSize& aSuggestedSize) +{ + YUVType yuvtype = GetYUVType(aData); + + // 'prescale' is true if the scaling is to be done as part of the + // YCbCr to RGB conversion rather than on the RGB data when rendered. 
+ bool prescale = aSuggestedSize.width > 0 && aSuggestedSize.height > 0 && + aSuggestedSize != aData.mPictureRect.Size(); + + if (aSuggestedFormat == SurfaceFormat::R5G6B5_UINT16) { +#if defined(HAVE_YCBCR_TO_RGB565) + if (prescale && + !IsScaleYCbCrToRGB565Fast(aData.mPictureRect.x, + aData.mPictureRect.y, + aData.mPictureRect.width, + aData.mPictureRect.height, + aSuggestedSize.width, + aSuggestedSize.height, + yuvtype, + FILTER_BILINEAR) && + IsConvertYCbCrToRGB565Fast(aData.mPictureRect.x, + aData.mPictureRect.y, + aData.mPictureRect.width, + aData.mPictureRect.height, + yuvtype)) { + prescale = false; + } +#else + // yuv2rgb16 function not available + aSuggestedFormat = SurfaceFormat::B8G8R8X8; +#endif + } + else if (aSuggestedFormat != SurfaceFormat::B8G8R8X8) { + // No other formats are currently supported. + aSuggestedFormat = SurfaceFormat::B8G8R8X8; + } + if (aSuggestedFormat == SurfaceFormat::B8G8R8X8) { + /* ScaleYCbCrToRGB32 does not support a picture offset, nor 4:4:4 data. + See bugs 639415 and 640073. */ + if (aData.mPictureRect.TopLeft() != IntPoint(0, 0) || yuvtype == YV24) + prescale = false; + } + if (!prescale) { + aSuggestedSize = aData.mPictureRect.Size(); + } +} + +static inline void +ConvertYCbCr16to8Line(uint8_t* aDst, + int aStride, + const uint16_t* aSrc, + int aStride16, + int aWidth, + int aHeight, + int aBitDepth) +{ + // Scale factors passed to libyuv::Convert16To8Plane; the values come from the comment on libyuv's Convert16To8Row_C: + int scale; + switch (aBitDepth) { + case 10: + scale = 16384; + break; + case 12: + scale = 4096; + break; + case 16: + scale = 256; + break; + default: + MOZ_ASSERT_UNREACHABLE("invalid bit depth value"); + return; + } + + libyuv::Convert16To8Plane(aSrc, aStride16, aDst, aStride, scale, aWidth, aHeight); +} + +void +ConvertYCbCrToRGBInternal(const layers::PlanarYCbCrData& aData, + const SurfaceFormat& aDestFormat, + const IntSize& aDestSize, + unsigned char* aDestBuffer, + int32_t aStride) +{ + // ConvertYCbCrToRGB et al. 
assume the chroma planes are rounded up if the + // luma plane is odd sized. Monochrome images have 0-sized CbCr planes + YUVType yuvtype = GetYUVType(aData); + + // Used if converting to 8 bits YUV. + UniquePtr<uint8_t[]> yChannel; + UniquePtr<uint8_t[]> cbChannel; + UniquePtr<uint8_t[]> crChannel; + layers::PlanarYCbCrData dstData; + const layers::PlanarYCbCrData& srcData = + aData.mColorDepth == ColorDepth::COLOR_8 ? aData : dstData; + + if (aData.mColorDepth != ColorDepth::COLOR_8) { + // Convert to 8 bits data first. + dstData.mPictureRect = aData.mPictureRect; + // We align the destination stride to 32 bytes, so that libyuv can use + // SSE optimised code. + auto ySize = aData.YDataSize(); + auto cbcrSize = aData.CbCrDataSize(); + dstData.mYStride = (ySize.width + 31) & ~31; + dstData.mCbCrStride = (cbcrSize.width + 31) & ~31; + dstData.mYUVColorSpace = aData.mYUVColorSpace; + dstData.mColorDepth = ColorDepth::COLOR_8; + dstData.mColorRange = aData.mColorRange; + dstData.mChromaSubsampling = aData.mChromaSubsampling; + + size_t yMemorySize = GetAlignedStride<1>(dstData.mYStride, ySize.height); + size_t cbcrMemorySize = + GetAlignedStride<1>(dstData.mCbCrStride, cbcrSize.height); + if (yMemorySize == 0) { + MOZ_DIAGNOSTIC_ASSERT(cbcrMemorySize == 0, "CbCr without Y makes no sense"); + return; + } + yChannel = MakeUnique<uint8_t[]>(yMemorySize); + + dstData.mYChannel = yChannel.get(); + + int bitDepth = BitDepthForColorDepth(aData.mColorDepth); + + ConvertYCbCr16to8Line(dstData.mYChannel, + dstData.mYStride, + reinterpret_cast<uint16_t*>(aData.mYChannel), + aData.mYStride / 2, + ySize.width, + ySize.height, + bitDepth); + + if (cbcrMemorySize) { + cbChannel = MakeUnique<uint8_t[]>(cbcrMemorySize); + crChannel = MakeUnique<uint8_t[]>(cbcrMemorySize); + + dstData.mCbChannel = cbChannel.get(); + dstData.mCrChannel = crChannel.get(); + + ConvertYCbCr16to8Line(dstData.mCbChannel, + dstData.mCbCrStride, + reinterpret_cast<uint16_t*>(aData.mCbChannel), + 
aData.mCbCrStride / 2, + cbcrSize.width, + cbcrSize.height, + bitDepth); + + ConvertYCbCr16to8Line(dstData.mCrChannel, + dstData.mCbCrStride, + reinterpret_cast<uint16_t*>(aData.mCrChannel), + aData.mCbCrStride / 2, + cbcrSize.width, + cbcrSize.height, + bitDepth); + } + } + + // Convert from YCbCr to RGB now, scaling the image if needed. + if (aDestSize != srcData.mPictureRect.Size()) { +#if defined(HAVE_YCBCR_TO_RGB565) + if (aDestFormat == SurfaceFormat::R5G6B5_UINT16) { + ScaleYCbCrToRGB565(srcData.mYChannel, + srcData.mCbChannel, + srcData.mCrChannel, + aDestBuffer, + srcData.mPictureRect.x, + srcData.mPictureRect.y, + srcData.mPictureRect.width, + srcData.mPictureRect.height, + aDestSize.width, + aDestSize.height, + srcData.mYStride, + srcData.mCbCrStride, + aStride, + yuvtype, + FILTER_BILINEAR); + } else +#endif + ScaleYCbCrToRGB32(srcData.mYChannel, // + srcData.mCbChannel, + srcData.mCrChannel, + aDestBuffer, + srcData.mPictureRect.width, + srcData.mPictureRect.height, + aDestSize.width, + aDestSize.height, + srcData.mYStride, + srcData.mCbCrStride, + aStride, + yuvtype, + srcData.mYUVColorSpace, + FILTER_BILINEAR); + } else { // no prescale +#if defined(HAVE_YCBCR_TO_RGB565) + if (aDestFormat == SurfaceFormat::R5G6B5_UINT16) { + ConvertYCbCrToRGB565(srcData.mYChannel, + srcData.mCbChannel, + srcData.mCrChannel, + aDestBuffer, + srcData.mPictureRect.x, + srcData.mPictureRect.y, + srcData.mPictureRect.width, + srcData.mPictureRect.height, + srcData.mYStride, + srcData.mCbCrStride, + aStride, + yuvtype); + } else // aDestFormat != SurfaceFormat::R5G6B5_UINT16 +#endif + ConvertYCbCrToRGB32(srcData.mYChannel, // + srcData.mCbChannel, + srcData.mCrChannel, + aDestBuffer, + srcData.mPictureRect.x, + srcData.mPictureRect.y, + srcData.mPictureRect.width, + srcData.mPictureRect.height, + srcData.mYStride, + srcData.mCbCrStride, + aStride, + yuvtype, + srcData.mYUVColorSpace, + srcData.mColorRange); + } +} + +void ConvertYCbCrToRGB(const layers::PlanarYCbCrData& 
aData, + const SurfaceFormat& aDestFormat, + const IntSize& aDestSize, unsigned char* aDestBuffer, + int32_t aStride) { + ConvertYCbCrToRGBInternal(aData, aDestFormat, aDestSize, aDestBuffer, + aStride); +#if MOZ_BIG_ENDIAN() + // libyuv makes endian-correct result, which needs to be swapped to BGRX + if (aDestFormat != SurfaceFormat::R5G6B5_UINT16) { + gfx::SwizzleData(aDestBuffer, aStride, gfx::SurfaceFormat::X8R8G8B8, + aDestBuffer, aStride, gfx::SurfaceFormat::B8G8R8X8, + aDestSize); + } +#endif +} + +void FillAlphaToRGBA(const uint8_t* aAlpha, const int32_t aAlphaStride, + uint8_t* aBuffer, const int32_t aWidth, + const int32_t aHeight, const gfx::SurfaceFormat& aFormat) { + MOZ_ASSERT(aAlphaStride >= aWidth); + MOZ_ASSERT(aFormat == + SurfaceFormat::B8G8R8A8); // required for SurfaceFormatBit::OS_A + + const int bpp = BytesPerPixel(aFormat); + const size_t rgbaStride = aWidth * bpp; + const uint8_t* src = aAlpha; + for (int32_t h = 0; h < aHeight; ++h) { + size_t offset = static_cast<size_t>(SurfaceFormatBit::OS_A) / 8; + for (int32_t w = 0; w < aWidth; ++w) { + aBuffer[offset] = src[w]; + offset += bpp; + } + src += aAlphaStride; + aBuffer += rgbaStride; + } +} + +void ConvertYCbCrAToARGB(const layers::PlanarYCbCrData& aYCbCr, + const layers::PlanarAlphaData& aAlpha, + const SurfaceFormat& aDestFormat, + const IntSize& aDestSize, unsigned char* aDestBuffer, + int32_t aStride, PremultFunc premultiplyAlphaOp) { + // libyuv makes endian-correct result, so the format needs to be B8G8R8A8. + MOZ_ASSERT(aDestFormat == SurfaceFormat::B8G8R8A8); + MOZ_ASSERT(aAlpha.mSize == aYCbCr.YDataSize()); + + // libyuv has libyuv::I420AlphaToARGB, but lacks support for 422 and 444. + // Until that's added, we'll rely on our own code to handle this more + // generally, rather than have a special case and more redundant code. 
+ + UniquePtr<uint8_t[]> alphaChannel; + int32_t alphaStride8bpp = 0; + uint8_t* alphaChannel8bpp = nullptr; + + // This function converts non-8-bpc images to 8-bpc. (Bug 1682322) + ConvertYCbCrToRGBInternal(aYCbCr, aDestFormat, aDestSize, aDestBuffer, + aStride); + + if (aYCbCr.mColorDepth != ColorDepth::COLOR_8) { + // These two lines are borrowed from ConvertYCbCrToRGBInternal, since + // there's not a very elegant way of sharing the logic that I can see + alphaStride8bpp = (aAlpha.mSize.width + 31) & ~31; + size_t alphaSize = + GetAlignedStride<1>(alphaStride8bpp, aAlpha.mSize.height); + + alphaChannel = MakeUnique<uint8_t[]>(alphaSize); + + ConvertYCbCr16to8Line(alphaChannel.get(), alphaStride8bpp, + reinterpret_cast<uint16_t*>(aAlpha.mChannel), + aYCbCr.mYStride / 2, aAlpha.mSize.width, + aAlpha.mSize.height, + BitDepthForColorDepth(aYCbCr.mColorDepth)); + + alphaChannel8bpp = alphaChannel.get(); + } else { + alphaStride8bpp = aYCbCr.mYStride; + alphaChannel8bpp = aAlpha.mChannel; + } + + MOZ_ASSERT(alphaStride8bpp != 0); + MOZ_ASSERT(alphaChannel8bpp); + + FillAlphaToRGBA(alphaChannel8bpp, alphaStride8bpp, aDestBuffer, + aYCbCr.mPictureRect.width, aYCbCr.mPictureRect.height, aDestFormat); + + if (premultiplyAlphaOp) { + DebugOnly<int> err = + premultiplyAlphaOp(aDestBuffer, aStride, aDestBuffer, aStride, + aYCbCr.mPictureRect.width, aYCbCr.mPictureRect.height); + MOZ_ASSERT(!err); + } + +#if MOZ_BIG_ENDIAN() + // libyuv makes endian-correct result, which needs to be swapped to BGRA + gfx::SwizzleData(aDestBuffer, aStride, gfx::SurfaceFormat::A8R8G8B8, + aDestBuffer, aStride, gfx::SurfaceFormat::B8G8R8A8, + aYCbCr.mPictureRect.Size()); +#endif +} + +void +ConvertI420AlphaToARGB(const uint8_t* aSrcY, + const uint8_t* aSrcU, + const uint8_t* aSrcV, + const uint8_t* aSrcA, + int aSrcStrideYA, int aSrcStrideUV, + uint8_t* aDstARGB, int aDstStrideARGB, + int aWidth, int aHeight) { + + ConvertI420AlphaToARGB32(aSrcY, + aSrcU, + aSrcV, + aSrcA, + aDstARGB, + aWidth, 
+ aHeight, + aSrcStrideYA, + aSrcStrideUV, + aDstStrideARGB); +#if MOZ_BIG_ENDIAN() + // libyuv makes endian-correct result, which needs to be swapped to BGRA + gfx::SwizzleData(aDstARGB, aDstStrideARGB, gfx::SurfaceFormat::A8R8G8B8, + aDstARGB, aDstStrideARGB, gfx::SurfaceFormat::B8G8R8A8, + IntSize(aWidth, aHeight)); +#endif +} + +} // namespace gfx +} // namespace mozilla diff --git a/gfx/ycbcr/YCbCrUtils.h b/gfx/ycbcr/YCbCrUtils.h new file mode 100644 index 0000000000..b63e4dabe9 --- /dev/null +++ b/gfx/ycbcr/YCbCrUtils.h @@ -0,0 +1,49 @@ +/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*- + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef Y_CB_CR_UTILS_H_ +#define Y_CB_CR_UTILS_H_ + +#include "mozilla/gfx/Types.h" +#include "ImageContainer.h" + +namespace mozilla { +namespace gfx { + +void +GetYCbCrToRGBDestFormatAndSize(const layers::PlanarYCbCrData& aData, + SurfaceFormat& aSuggestedFormat, + IntSize& aSuggestedSize); + +void +ConvertYCbCrToRGB(const layers::PlanarYCbCrData& aData, + const SurfaceFormat& aDestFormat, + const IntSize& aDestSize, + unsigned char* aDestBuffer, + int32_t aStride); + +using PremultFunc = int (*)(const uint8_t* src_argb, int src_stride_argb, + uint8_t* dst_argb, int dst_stride_argb, int width, + int height); + +void ConvertYCbCrAToARGB(const layers::PlanarYCbCrData& aYCbCr, + const layers::PlanarAlphaData& aAlpha, + const SurfaceFormat& aDestFormat, + const IntSize& aDestSize, + unsigned char* aDestBuffer, + int32_t aStride, PremultFunc premultiplyAlphaOp); + +void +ConvertI420AlphaToARGB(const uint8_t* aSrcY, + const uint8_t* aSrcU, + const uint8_t* aSrcV, + const uint8_t* aSrcA, + int aSrcStrideYA, int aSrcStrideUV, + uint8_t* aDstARGB, int aDstStrideARGB, + int aWidth, int aHeight); +} // namespace gfx +} // namespace mozilla + +#endif 
/* Y_CB_CR_UTILS_H_ */ diff --git a/gfx/ycbcr/chromium_types.h b/gfx/ycbcr/chromium_types.h new file mode 100644 index 0000000000..13f92975b5 --- /dev/null +++ b/gfx/ycbcr/chromium_types.h @@ -0,0 +1,50 @@ +/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*- + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +#ifndef GFX_CHROMIUMTYPES_H +#define GFX_CHROMIUMTYPES_H + +#include <stdint.h> + +#include "libyuv/basic_types.h" + +// From Chromium build_config.h: +// Processor architecture detection. For more info on what's defined, see: +// http://msdn.microsoft.com/en-us/library/b0084kay.aspx +// http://www.agner.org/optimize/calling_conventions.pdf +// or with gcc, run: "echo | gcc -E -dM -" +#if defined(_M_X64) || defined(__x86_64__) +#define ARCH_CPU_X86_FAMILY 1 +#define ARCH_CPU_X86_64 1 +#define ARCH_CPU_64_BITS 1 +#elif defined(_M_IX86) || defined(__i386__) || defined(__i386) +#define ARCH_CPU_X86_FAMILY 1 +#define ARCH_CPU_X86_32 1 +#define ARCH_CPU_X86 1 +#define ARCH_CPU_32_BITS 1 +#elif defined(__ARMEL__) +#define ARCH_CPU_ARM_FAMILY 1 +#define ARCH_CPU_ARMEL 1 +#define ARCH_CPU_32_BITS 1 +#elif defined(__ppc__) || defined(__powerpc) || defined(__PPC__) +#define ARCH_CPU_PPC_FAMILY 1 +#define ARCH_CPU_PPC 1 +#define ARCH_CPU_32_BITS 1 +#elif defined(__sparcv9) // Tested before __sparc, which 64-bit SPARC toolchains also define. +#define ARCH_CPU_SPARC_FAMILY 1 +#define ARCH_CPU_SPARC 1 +#define ARCH_CPU_64_BITS 1 +#elif defined(__sparc) +#define ARCH_CPU_SPARC_FAMILY 1 +#define ARCH_CPU_SPARC 1 +#define ARCH_CPU_32_BITS 1 +#elif defined(__aarch64__) || defined(_M_ARM64) +#define ARCH_CPU_AARCH64_FAMILY 1 +#define ARCH_CPU_AARCH64 1 +#define ARCH_CPU_64_BITS 1 +#else +#warning Please add support for your architecture in chromium_types.h +#endif + +#endif // GFX_CHROMIUMTYPES_H diff --git a/gfx/ycbcr/moz.build b/gfx/ycbcr/moz.build new file mode 
100644 index 0000000000..c643fbaf40 --- /dev/null +++ b/gfx/ycbcr/moz.build @@ -0,0 +1,66 @@ +# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*- +# vim: set filetype=python: +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +EXPORTS += [ + 'YCbCrUtils.h', +] + +UNIFIED_SOURCES += [ + 'scale_yuv_argb.cpp', + 'ycbcr_to_rgb565.cpp', + 'YCbCrUtils.cpp', + 'yuv_convert.cpp', + 'yuv_row_c.cpp', + 'yuv_row_table.cpp', +] + +if CONFIG['INTEL_ARCHITECTURE']: + # These files use MMX and SSE2 intrinsics, so they need special compile flags + # on some compilers. + SOURCES += ['yuv_convert_sse2.cpp'] + SOURCES['yuv_convert_sse2.cpp'].flags += CONFIG['SSE2_FLAGS'] + + # MSVC doesn't support MMX when targeting AMD64. + if CONFIG['CC_TYPE'] == 'clang-cl': + if CONFIG['CPU_ARCH'] == 'x86': + SOURCES += [ + 'yuv_convert_mmx.cpp', + ] + else: + SOURCES += ['yuv_convert_mmx.cpp'] + SOURCES['yuv_convert_mmx.cpp'].flags += CONFIG['MMX_FLAGS'] + +if CONFIG['CC_TYPE'] == 'clang-cl': + if CONFIG['CPU_ARCH'] == 'x86_64' or \ + (CONFIG['CPU_ARCH'] == 'x86' and CONFIG['CC_TYPE'] == 'clang-cl'): + SOURCES += [ + 'yuv_row_win64.cpp', + ] + else: + SOURCES += [ + 'yuv_row_win.cpp', + ] +elif CONFIG['OS_ARCH'] in ('Linux', 'SunOS', 'Darwin', 'DragonFly', + 'FreeBSD', 'NetBSD', 'OpenBSD'): + SOURCES += [ + 'yuv_row_posix.cpp', + ] +else: + SOURCES += [ + 'yuv_row_other.cpp', + ] + +if CONFIG['CPU_ARCH'] == 'arm' and CONFIG['HAVE_ARM_NEON']: + SOURCES += [ + 'yuv_row_arm.s', + ] + SOURCES += [ + 'yuv_convert_arm.cpp', + ] + +LOCAL_INCLUDES += ['/media/libyuv/libyuv/include'] + +FINAL_LIBRARY = 'xul' diff --git a/gfx/ycbcr/scale_yuv_argb.cpp b/gfx/ycbcr/scale_yuv_argb.cpp new file mode 100644 index 0000000000..2a103fb61e --- /dev/null +++ b/gfx/ycbcr/scale_yuv_argb.cpp @@ -0,0 +1,1132 @@ +/* + * Copyright 2011 The LibYuv Project Authors. 
All rights reserved. + * Copyright 2016 Mozilla Foundation + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/scale.h" + +#include <assert.h> +#include <string.h> + +#include "libyuv/convert_argb.h" +#include "libyuv/cpu_id.h" +#include "libyuv/row.h" +#include "libyuv/scale_row.h" +#include "libyuv/video_common.h" + +#include "mozilla/gfx/Types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// The YUV to RGB conversion and scaling functions were implemented by referencing +// scale_argb.cc +// +// libyuv already has ScaleYUVToARGBBilinearUp(), but its implementation is not +// complete yet. The implementations of the functions here are based on it. +// At first, ScaleYUVToARGBBilinearUp() was implemented by modifying +// libyuv's version. Then all the other functions were implemented similarly. +// +// The function relationships between yuv_convert.cpp and scale_argb.cc are +// as follows: +// - ScaleYUVToARGBDown2() <-- ScaleARGBDown2() +// - ScaleYUVToARGBDownEven() <-- ScaleARGBDownEven() +// - ScaleYUVToARGBBilinearDown() <-- ScaleARGBBilinearDown() +// - ScaleYUVToARGBBilinearUp() <-- ScaleARGBBilinearUp() and ScaleYUVToARGBBilinearUp() in libyuv +// - ScaleYUVToARGBSimple() <-- ScaleARGBSimple() +// - ScaleYUVToARGB() <-- ScaleARGB() // Removed some function calls for simplicity. +// - YUVToARGBScale() <-- ARGBScale() +// +// The calls to, and selection of, InterpolateRow() and ScaleARGBFilterCols() were +// kept as close to the original as possible. +// +// The following changes were made to each scaling function. +// +// -[1] Allocate YUV conversion buffer and use it as source buffer of scaling. 
+// Its usage is borrowed from the libyuv's ScaleYUVToARGBBilinearUp(). +// -[2] Conversion from YUV to RGB was abstracted as YUVBuferIter. +// It is for handling multiple yuv color formats. +// -[3] Modified scaling functions as to handle YUV conversion buffer and +// use YUVBuferIter. +// -[4] Color conversion function selections in YUVBuferIter were borrowed from +// I444ToARGBMatrix(), I422ToARGBMatrix() and I420ToARGBMatrix() + +typedef mozilla::gfx::YUVColorSpace YUVColorSpace; + +struct YUVBuferIter { + int src_width; + int src_height; + int src_stride_y; + int src_stride_u; + int src_stride_v; + const uint8_t* src_y; + const uint8_t* src_u; + const uint8_t* src_v; + + uint32_t src_fourcc; + const struct YuvConstants* yuvconstants; + int y_index; + const uint8_t* src_row_y; + const uint8_t* src_row_u; + const uint8_t* src_row_v; + + void (*YUVToARGBRow)(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); + void (*MoveTo)(YUVBuferIter& iter, int y_index); + void (*MoveToNextRow)(YUVBuferIter& iter); +}; + +void YUVBuferIter_InitI422(YUVBuferIter& iter) { + iter.YUVToARGBRow = I422ToARGBRow_C; +#if defined(HAS_I422TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + iter.YUVToARGBRow = I422ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(iter.src_width, 8)) { + iter.YUVToARGBRow = I422ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I422TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + iter.YUVToARGBRow = I422ToARGBRow_Any_AVX2; + if (IS_ALIGNED(iter.src_width, 16)) { + iter.YUVToARGBRow = I422ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + iter.YUVToARGBRow = I422ToARGBRow_Any_NEON; + if (IS_ALIGNED(iter.src_width, 8)) { + iter.YUVToARGBRow = I422ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I422TOARGBROW_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(iter.src_width, 4) && + IS_ALIGNED(iter.src_y, 4) && 
IS_ALIGNED(iter.src_stride_y, 4) && + IS_ALIGNED(iter.src_u, 2) && IS_ALIGNED(iter.src_stride_u, 2) && + IS_ALIGNED(iter.src_v, 2) && IS_ALIGNED(iter.src_stride_v, 2)) { + // Always satisfy IS_ALIGNED(argb_cnv_row, 4) && IS_ALIGNED(argb_cnv_rowstride, 4) + iter.YUVToARGBRow = I422ToARGBRow_DSPR2; + } +#endif +} + +void YUVBuferIter_InitI444(YUVBuferIter& iter) { + iter.YUVToARGBRow = I444ToARGBRow_C; +#if defined(HAS_I444TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + iter.YUVToARGBRow = I444ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(iter.src_width, 8)) { + iter.YUVToARGBRow = I444ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I444TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + iter.YUVToARGBRow = I444ToARGBRow_Any_AVX2; + if (IS_ALIGNED(iter.src_width, 16)) { + iter.YUVToARGBRow = I444ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I444TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + iter.YUVToARGBRow = I444ToARGBRow_Any_NEON; + if (IS_ALIGNED(iter.src_width, 8)) { + iter.YUVToARGBRow = I444ToARGBRow_NEON; + } + } +#endif +} + + +static void YUVBuferIter_MoveToForI444(YUVBuferIter& iter, int y_index) { + iter.y_index = y_index; + iter.src_row_y = iter.src_y + y_index * iter.src_stride_y; + iter.src_row_u = iter.src_u + y_index * iter.src_stride_u; + iter.src_row_v = iter.src_v + y_index * iter.src_stride_v; +} + +static void YUVBuferIter_MoveToNextRowForI444(YUVBuferIter& iter) { + iter.src_row_y += iter.src_stride_y; + iter.src_row_u += iter.src_stride_u; + iter.src_row_v += iter.src_stride_v; + iter.y_index++; +} + +static void YUVBuferIter_MoveToForI422(YUVBuferIter& iter, int y_index) { + iter.y_index = y_index; + iter.src_row_y = iter.src_y + y_index * iter.src_stride_y; + iter.src_row_u = iter.src_u + y_index * iter.src_stride_u; + iter.src_row_v = iter.src_v + y_index * iter.src_stride_v; +} + +static void YUVBuferIter_MoveToNextRowForI422(YUVBuferIter& iter) { + iter.src_row_y += iter.src_stride_y; + iter.src_row_u += iter.src_stride_u; + 
iter.src_row_v += iter.src_stride_v; + iter.y_index++; +} + +static void YUVBuferIter_MoveToForI420(YUVBuferIter& iter, int y_index) { + const int kYShift = 1; // Shift Y by 1 to convert Y plane to UV coordinate. + int uv_y_index = y_index >> kYShift; + + iter.y_index = y_index; + iter.src_row_y = iter.src_y + y_index * iter.src_stride_y; + iter.src_row_u = iter.src_u + uv_y_index * iter.src_stride_u; + iter.src_row_v = iter.src_v + uv_y_index * iter.src_stride_v; +} + +static void YUVBuferIter_MoveToNextRowForI420(YUVBuferIter& iter) { + iter.src_row_y += iter.src_stride_y; + if (iter.y_index & 1) { + iter.src_row_u += iter.src_stride_u; + iter.src_row_v += iter.src_stride_v; + } + iter.y_index++; +} + +static __inline void YUVBuferIter_ConvertToARGBRow(YUVBuferIter& iter, uint8_t* argb_row) { + iter.YUVToARGBRow(iter.src_row_y, iter.src_row_u, iter.src_row_v, argb_row, iter.yuvconstants, iter.src_width); +} + +void YUVBuferIter_Init(YUVBuferIter& iter, uint32_t src_fourcc, YUVColorSpace yuv_color_space) { + iter.src_fourcc = src_fourcc; + iter.y_index = 0; + iter.src_row_y = iter.src_y; + iter.src_row_u = iter.src_u; + iter.src_row_v = iter.src_v; + switch (yuv_color_space) { + case YUVColorSpace::BT2020: + iter.yuvconstants = &kYuv2020Constants; + break; + case YUVColorSpace::BT709: + iter.yuvconstants = &kYuvH709Constants; + break; + default: + iter.yuvconstants = &kYuvI601Constants; + } + + if (src_fourcc == FOURCC_I444) { + YUVBuferIter_InitI444(iter); + iter.MoveTo = YUVBuferIter_MoveToForI444; + iter.MoveToNextRow = YUVBuferIter_MoveToNextRowForI444; + } else if(src_fourcc == FOURCC_I422){ + YUVBuferIter_InitI422(iter); + iter.MoveTo = YUVBuferIter_MoveToForI422; + iter.MoveToNextRow = YUVBuferIter_MoveToNextRowForI422; + } else { + assert(src_fourcc == FOURCC_I420); // Should be FOURCC_I420 + YUVBuferIter_InitI422(iter); + iter.MoveTo = YUVBuferIter_MoveToForI420; + iter.MoveToNextRow = YUVBuferIter_MoveToNextRowForI420; + } +} + +// ScaleARGB ARGB, 1/2 
+// This is an optimized version for converting a YUV image to ARGB while
+// scaling it down to 1/2 of its original width.
+//
+// Source rows are converted to ARGB into a two-row scratch buffer
+// (argb_cnv_row) and reduced with the selected ScaleARGBRowDown2 kernel.
+//
+// x, dx, y, dy are 16.16 fixed point; dx must be exactly 2.0 and dy a
+// multiple of 2.0 (both asserted below). src_fourcc and yuv_color_space are
+// forwarded to YUVBuferIter_Init.
+static void ScaleYUVToARGBDown2(int src_width, int src_height,
+                                int dst_width, int dst_height,
+                                int src_stride_y,
+                                int src_stride_u,
+                                int src_stride_v,
+                                int dst_stride_argb,
+                                const uint8_t* src_y,
+                                const uint8_t* src_u,
+                                const uint8_t* src_v,
+                                uint8_t* dst_argb,
+                                int x, int dx, int y, int dy,
+                                enum FilterMode filtering,
+                                uint32_t src_fourcc,
+                                YUVColorSpace yuv_color_space) {
+  int j;
+
+  // Allocate 2 rows of ARGB for source conversion.
+  const int kRowSize = (src_width * 4 + 15) & ~15;
+  align_buffer_64(argb_cnv_row, kRowSize * 2);
+  uint8_t* argb_cnv_rowptr = argb_cnv_row;
+  int argb_cnv_rowstride = kRowSize;
+
+  YUVBuferIter iter;
+  iter.src_width = src_width;
+  iter.src_height = src_height;
+  iter.src_stride_y = src_stride_y;
+  iter.src_stride_u = src_stride_u;
+  iter.src_stride_v = src_stride_v;
+  iter.src_y = src_y;
+  iter.src_u = src_u;
+  iter.src_v = src_v;
+  YUVBuferIter_Init(iter, src_fourcc, yuv_color_space);
+
+  // Start from the portable C kernel; the SIMD sections below override it.
+  void (*ScaleARGBRowDown2)(const uint8_t* src_argb, ptrdiff_t src_stride,
+                            uint8_t* dst_argb, int dst_width) =
+    filtering == kFilterNone ? ScaleARGBRowDown2_C :
+        (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C :
+         ScaleARGBRowDown2Box_C);
+  assert(dx == 65536 * 2);  // Test scale factor of 2.
+  assert((dy & 0x1ffff) == 0);  // Test vertical scale is multiple of 2.
+  // Advance to odd row, even column.
+  int yi = y >> 16;
+  iter.MoveTo(iter, yi);
+  // Byte offset into the converted row (4 bytes per ARGB pixel).
+  ptrdiff_t x_offset;
+  if (filtering == kFilterBilinear) {
+    x_offset = (x >> 16) * 4;
+  } else {
+    x_offset = ((x >> 16) - 1) * 4;
+  }
+#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_SSE2 :
+        (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_SSE2 :
+         ScaleARGBRowDown2Box_Any_SSE2);
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 :
+          (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 :
+           ScaleARGBRowDown2Box_SSE2);
+    }
+  }
+
+#endif
+#if defined(HAS_SCALEARGBROWDOWN2_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_NEON :
+        (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_NEON :
+         ScaleARGBRowDown2Box_Any_NEON);
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_NEON :
+          (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_NEON :
+           ScaleARGBRowDown2Box_NEON);
+    }
+  }
+#endif
+
+  const int dyi = dy >> 16;
+  int lastyi = yi;
+  YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr);
+  // Prepare next row if necessary
+  if (filtering != kFilterLinear) {
+    if ((yi + dyi) < (src_height - 1)) {
+      iter.MoveTo(iter, yi + dyi);
+      YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr + argb_cnv_rowstride);
+    } else {
+      argb_cnv_rowstride = 0;
+    }
+  }
+
+  // Linear filtering only reads one row, so the second row is never used.
+  if (filtering == kFilterLinear) {
+    argb_cnv_rowstride = 0;
+  }
+  const int max_yi = src_height - 1;
+  const int max_yi_minus_dyi = max_yi - dyi;
+  for (j = 0; j < dst_height; ++j) {
+    if (yi != lastyi) {
+      if (yi > max_yi) {
+        yi = max_yi;
+      }
+      if (yi != lastyi) {
+        if (filtering == kFilterLinear) {
+          iter.MoveTo(iter, yi);
+          YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr);
+          lastyi = yi;
+        } else {
+          // Prepare current row
+          if (yi == iter.y_index) {
+            // The row was already converted as the previous "next" row:
+            // swap the two halves of the scratch buffer instead of
+            // converting it again.
+            argb_cnv_rowptr = argb_cnv_rowptr + argb_cnv_rowstride;
+            argb_cnv_rowstride = - argb_cnv_rowstride;
+          } else {
+            iter.MoveTo(iter, yi);
+            argb_cnv_rowptr = argb_cnv_row;
+            argb_cnv_rowstride = kRowSize;
+            YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr);
+          }
+          // Prepare next row if necessary
+          if (iter.y_index < max_yi) {
+            int next_yi = yi < max_yi_minus_dyi ? yi + dyi : max_yi;
+            iter.MoveTo(iter, next_yi);
+            YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr + argb_cnv_rowstride);
+          } else {
+            argb_cnv_rowstride = 0;
+          }
+          lastyi = yi;
+        }
+      }
+    }
+    ScaleARGBRowDown2(argb_cnv_rowptr + x_offset, argb_cnv_rowstride, dst_argb, dst_width);
+    dst_argb += dst_stride_argb;
+    yi += dyi;
+  }
+
+  free_aligned_buffer_64(argb_cnv_row);
+}
+
+// ScaleARGB ARGB Even
+// This is an optimized version for converting a YUV image to ARGB while
+// scaling it down by an even integer factor.
+//
+// Source rows are converted to ARGB into a two-row scratch buffer
+// (argb_cnv_row) and reduced with the selected ScaleARGBRowDownEven kernel,
+// which steps dx >> 16 source pixels per destination pixel.
+//
+// x, dx, y, dy are 16.16 fixed point. src_width and src_height must be even
+// (asserted below).
+static void ScaleYUVToARGBDownEven(int src_width, int src_height,
+                                   int dst_width, int dst_height,
+                                   int src_stride_y,
+                                   int src_stride_u,
+                                   int src_stride_v,
+                                   int dst_stride_argb,
+                                   const uint8_t* src_y,
+                                   const uint8_t* src_u,
+                                   const uint8_t* src_v,
+                                   uint8_t* dst_argb,
+                                   int x, int dx, int y, int dy,
+                                   enum FilterMode filtering,
+                                   uint32_t src_fourcc,
+                                   YUVColorSpace yuv_color_space) {
+  int j;
+  // Allocate 2 rows of ARGB for source conversion.
+  const int kRowSize = (src_width * 4 + 15) & ~15;
+  align_buffer_64(argb_cnv_row, kRowSize * 2);
+  uint8_t* argb_cnv_rowptr = argb_cnv_row;
+  int argb_cnv_rowstride = kRowSize;
+
+  int col_step = dx >> 16;
+  // Any non-zero filter mode selects the box kernel.
+  void (*ScaleARGBRowDownEven)(const uint8_t* src_argb, ptrdiff_t src_stride,
+                               int src_step, uint8_t* dst_argb, int dst_width) =
+      filtering ? ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C;
+  assert(IS_ALIGNED(src_width, 2));
+  assert(IS_ALIGNED(src_height, 2));
+  int yi = y >> 16;
+  const ptrdiff_t x_offset = (x >> 16) * 4;
+
+#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2 :
+        ScaleARGBRowDownEven_Any_SSE2;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 :
+          ScaleARGBRowDownEven_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_NEON :
+        ScaleARGBRowDownEven_Any_NEON;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON :
+          ScaleARGBRowDownEven_NEON;
+    }
+  }
+#endif
+
+  YUVBuferIter iter;
+  iter.src_width = src_width;
+  iter.src_height = src_height;
+  iter.src_stride_y = src_stride_y;
+  iter.src_stride_u = src_stride_u;
+  iter.src_stride_v = src_stride_v;
+  iter.src_y = src_y;
+  iter.src_u = src_u;
+  iter.src_v = src_v;
+  YUVBuferIter_Init(iter, src_fourcc, yuv_color_space);
+  // Position the iterator on the first sampled row, exactly as the other
+  // scalers in this file do. Without this the first output row was built
+  // from the iterator's initial row instead of row yi.
+  iter.MoveTo(iter, yi);
+
+  const int dyi = dy >> 16;
+  int lastyi = yi;
+  YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr);
+  // Prepare next row if necessary
+  if (filtering != kFilterLinear) {
+    if ((yi + dyi) < (src_height - 1)) {
+      iter.MoveTo(iter, yi + dyi);
+      YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr + argb_cnv_rowstride);
+    } else {
+      argb_cnv_rowstride = 0;
+    }
+  }
+
+  // Linear filtering only reads one row, so the second row is never used.
+  if (filtering == kFilterLinear) {
+    argb_cnv_rowstride = 0;
+  }
+  const int max_yi = src_height - 1;
+  const int max_yi_minus_dyi = max_yi - dyi;
+  for (j = 0; j < dst_height; ++j) {
+    if (yi != lastyi) {
+      if (yi > max_yi) {
+        yi = max_yi;
+      }
+      if (yi != lastyi) {
+        if (filtering == kFilterLinear) {
+          iter.MoveTo(iter, yi);
+          YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr);
+          lastyi = yi;
+        } else {
+          // Prepare current row
+          if (yi == iter.y_index) {
+            // Already converted as the previous "next" row: swap halves.
+            argb_cnv_rowptr = argb_cnv_rowptr + argb_cnv_rowstride;
+            argb_cnv_rowstride = - argb_cnv_rowstride;
+          } else {
+            iter.MoveTo(iter, yi);
+            argb_cnv_rowptr = argb_cnv_row;
+            argb_cnv_rowstride = kRowSize;
+            YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr);
+          }
+          // Prepare next row if necessary
+          if (iter.y_index < max_yi) {
+            int next_yi = yi < max_yi_minus_dyi ? yi + dyi : max_yi;
+            iter.MoveTo(iter, next_yi);
+            YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr + argb_cnv_rowstride);
+          } else {
+            argb_cnv_rowstride = 0;
+          }
+          lastyi = yi;
+        }
+      }
+    }
+    ScaleARGBRowDownEven(argb_cnv_rowptr + x_offset, argb_cnv_rowstride, col_step, dst_argb, dst_width);
+    dst_argb += dst_stride_argb;
+    yi += dyi;
+  }
+  free_aligned_buffer_64(argb_cnv_row);
+}
+
+// Scale YUV to ARGB down with bilinear interpolation.
+//
+// For every destination row the current source row and, unless filtering is
+// kFilterLinear, the row after it are converted to ARGB into a two-row
+// scratch buffer. InterpolateRow blends the two rows vertically into `row`,
+// and ScaleARGBFilterCols then resamples that row horizontally into dst_argb.
+//
+// x, dx, y, dy are 16.16 fixed point. Only the source columns between xl and
+// xr, the range actually touched by the horizontal filter, are interpolated.
+static void ScaleYUVToARGBBilinearDown(int src_width, int src_height,
+                                       int dst_width, int dst_height,
+                                       int src_stride_y,
+                                       int src_stride_u,
+                                       int src_stride_v,
+                                       int dst_stride_argb,
+                                       const uint8_t* src_y,
+                                       const uint8_t* src_u,
+                                       const uint8_t* src_v,
+                                       uint8_t* dst_argb,
+                                       int x, int dx, int y, int dy,
+                                       enum FilterMode filtering,
+                                       uint32_t src_fourcc,
+                                       YUVColorSpace yuv_color_space) {
+  int j;
+  void (*InterpolateRow)(uint8_t* dst_argb, const uint8_t* src_argb,
+      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+      InterpolateRow_C;
+  void (*ScaleARGBFilterCols)(uint8_t* dst_argb, const uint8_t* src_argb,
+      int dst_width, int x, int dx) =
+      (src_width >= 32768) ? ScaleARGBFilterCols64_C : ScaleARGBFilterCols_C;
+  // 64-bit so that x + (dst_width - 1) * dx cannot overflow.
+  int64_t xlast = x + (int64_t)(dst_width - 1) * dx;
+  int64_t xl = (dx >= 0) ? x : xlast;
+  int64_t xr = (dx >= 0) ? xlast : x;
+  int clip_src_width;
+  xl = (xl >> 16) & ~3;  // Left edge aligned.
+  xr = (xr >> 16) + 1;  // Right most pixel used.  Bilinear uses 2 pixels.
+  xr = (xr + 1 + 3) & ~3;  // 1 beyond 4 pixel aligned right most pixel.
+  if (xr > src_width) {
+    xr = src_width;
+  }
+  clip_src_width = (int)(xr - xl) * 4;  // Width aligned to 4.
+  const ptrdiff_t xl_offset = xl * 4;
+  // Make x relative to the clipped left edge.
+  x -= (int)(xl << 16);
+
+  // Allocate 2 rows of ARGB for source conversion.
+  const int kRowSize = (src_width * 4 + 15) & ~15;
+  align_buffer_64(argb_cnv_row, kRowSize * 2);
+  uint8_t* argb_cnv_rowptr = argb_cnv_row;
+  int argb_cnv_rowstride = kRowSize;
+
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(clip_src_width, 16)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_AVX2;
+    if (IS_ALIGNED(clip_src_width, 32)) {
+      InterpolateRow = InterpolateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(clip_src_width, 16)) {
+      InterpolateRow = InterpolateRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_DSPR2)
+  // The rows handed to InterpolateRow live in argb_cnv_row. This function
+  // has no src_argb, so testing that name here did not compile.
+  if (TestCpuFlag(kCpuHasDSPR2) &&
+      IS_ALIGNED(argb_cnv_row, 4) && IS_ALIGNED(argb_cnv_rowstride, 4)) {
+    InterpolateRow = InterpolateRow_Any_DSPR2;
+    if (IS_ALIGNED(clip_src_width, 4)) {
+      InterpolateRow = InterpolateRow_DSPR2;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
+  }
+#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
+    }
+  }
+#endif
+
+  int yi = y >> 16;
+
+  YUVBuferIter iter;
+  iter.src_width = src_width;
+  iter.src_height = src_height;
+  iter.src_stride_y = src_stride_y;
+  iter.src_stride_u = src_stride_u;
+  iter.src_stride_v = src_stride_v;
+  iter.src_y = src_y;
+  iter.src_u = src_u;
+  iter.src_v = src_v;
+  YUVBuferIter_Init(iter, src_fourcc, yuv_color_space);
+  iter.MoveTo(iter, yi);
+
+  // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
+  // Allocate a row of ARGB.
+  // NOTE: clip_src_width is already a byte count, so this reserves four
+  // times the number of bytes InterpolateRow is asked to produce below.
+  align_buffer_64(row, clip_src_width * 4);
+
+  int lastyi = yi;
+  YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr);
+  // Prepare next row if necessary
+  if (filtering != kFilterLinear) {
+    if ((yi + 1) < src_height) {
+      iter.MoveToNextRow(iter);
+      YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr + argb_cnv_rowstride);
+    } else {
+      argb_cnv_rowstride = 0;
+    }
+  }
+
+  const int max_y = (src_height - 1) << 16;
+  const int max_yi = src_height - 1;
+  for (j = 0; j < dst_height; ++j) {
+    yi = y >> 16;
+    if (yi != lastyi) {
+      // Clamp to the last source row.
+      if (y > max_y) {
+        y = max_y;
+        yi = y >> 16;
+      }
+      if (yi != lastyi) {
+        if (filtering == kFilterLinear) {
+          iter.MoveTo(iter, yi);
+          YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr);
+          lastyi = yi;
+        } else {
+          // Prepare current row
+          if (yi == iter.y_index) {
+            // Already converted as the previous "next" row: swap halves.
+            argb_cnv_rowptr = argb_cnv_rowptr + argb_cnv_rowstride;
+            argb_cnv_rowstride = - argb_cnv_rowstride;
+          } else {
+            iter.MoveTo(iter, yi);
+            argb_cnv_rowptr = argb_cnv_row;
+            argb_cnv_rowstride = kRowSize;
+            YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr);
+          }
+          // Prepare next row if necessary
+          if (iter.y_index < max_yi) {
+            iter.MoveToNextRow(iter);
+            YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr + argb_cnv_rowstride);
+          } else {
+            argb_cnv_rowstride = 0;
+          }
+          lastyi = yi;
+        }
+      }
+    }
+    if (filtering == kFilterLinear) {
+      ScaleARGBFilterCols(dst_argb, argb_cnv_rowptr + xl_offset, dst_width, x, dx);
+    } else {
+      // Top 8 bits of the fractional part of y select the vertical blend.
+      int yf = (y >> 8) & 255;
+      InterpolateRow(row, argb_cnv_rowptr + xl_offset, argb_cnv_rowstride, clip_src_width, yf);
+      ScaleARGBFilterCols(dst_argb, row, dst_width, x, dx);
+    }
+    dst_argb += dst_stride_argb;
+    y += dy;
+  }
+  free_aligned_buffer_64(row);
+  free_aligned_buffer_64(argb_cnv_row);
+}
+
+// Scale YUV to ARGB up with bilinear interpolation.
+//
+// Each needed source row is converted to ARGB (argb_cnv_row) and resampled
+// horizontally by ScaleARGBFilterCols into a two-row buffer (`row`) that is
+// dst_width pixels wide. InterpolateRow then blends the two buffered rows
+// vertically straight into dst_argb. rowptr addresses the row for yi and
+// rowptr + rowstride the row for yi + 1; rowstride is 0 when there is no
+// second row to blend with.
+//
+// x, dx, y, dy are 16.16 fixed point.
+static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
+                                     int dst_width, int dst_height,
+                                     int src_stride_y,
+                                     int src_stride_u,
+                                     int src_stride_v,
+                                     int dst_stride_argb,
+                                     const uint8_t* src_y,
+                                     const uint8_t* src_u,
+                                     const uint8_t* src_v,
+                                     uint8_t* dst_argb,
+                                     int x, int dx, int y, int dy,
+                                     enum FilterMode filtering,
+                                     uint32_t src_fourcc,
+                                     YUVColorSpace yuv_color_space) {
+  int j;
+  void (*InterpolateRow)(uint8_t* dst_argb, const uint8_t* src_argb,
+      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+      InterpolateRow_C;
+  void (*ScaleARGBFilterCols)(uint8_t* dst_argb, const uint8_t* src_argb,
+      int dst_width, int x, int dx) =
+      filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
+  const int max_y = (src_height - 1) << 16;
+
+  // Allocate 1 row of ARGB for source conversion.
+  align_buffer_64(argb_cnv_row, src_width * 4);
+
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(dst_width, 4)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_AVX2;
+    if (IS_ALIGNED(dst_width, 8)) {
+      InterpolateRow = InterpolateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(dst_width, 4)) {
+      InterpolateRow = InterpolateRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) &&
+      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+    InterpolateRow = InterpolateRow_DSPR2;
+  }
+#endif
+  if (src_width >= 32768) {
+    ScaleARGBFilterCols = filtering ?
+        ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
+  }
+#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
+  if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
+  }
+#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
+  if (filtering && TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBCOLS_SSE2)
+  if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
+    ScaleARGBFilterCols = ScaleARGBCols_SSE2;
+  }
+#endif
+#if defined(HAS_SCALEARGBCOLS_NEON)
+  if (!filtering && TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBFilterCols = ScaleARGBCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBFilterCols = ScaleARGBCols_NEON;
+    }
+  }
+#endif
+  if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
+    ScaleARGBFilterCols = ScaleARGBColsUp2_C;
+#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
+    }
+#endif
+  }
+
+  if (y > max_y) {
+    y = max_y;
+  }
+
+  int yi = y >> 16;
+
+  YUVBuferIter iter;
+  iter.src_width = src_width;
+  iter.src_height = src_height;
+  iter.src_stride_y = src_stride_y;
+  iter.src_stride_u = src_stride_u;
+  iter.src_stride_v = src_stride_v;
+  iter.src_y = src_y;
+  iter.src_u = src_u;
+  iter.src_v = src_v;
+  YUVBuferIter_Init(iter, src_fourcc, yuv_color_space);
+  iter.MoveTo(iter, yi);
+
+  // Allocate 2 rows of ARGB.
+  const int kRowSize = (dst_width * 4 + 15) & ~15;
+  align_buffer_64(row, kRowSize * 2);
+
+  uint8_t* rowptr = row;
+  int rowstride = kRowSize;
+  int lastyi = yi;
+
+  YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_row);
+  ScaleARGBFilterCols(rowptr, argb_cnv_row, dst_width, x, dx);
+
+  // Linear filtering never blends vertically, so only one row is kept.
+  if (filtering == kFilterLinear) {
+    rowstride = 0;
+  }
+  // Prepare next row if necessary
+  if (filtering != kFilterLinear) {
+    if ((yi + 1) < src_height) {
+      iter.MoveToNextRow(iter);
+      YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_row);
+      ScaleARGBFilterCols(rowptr + rowstride, argb_cnv_row, dst_width, x, dx);
+    } else {
+      rowstride = 0;
+    }
+  }
+
+  const int max_yi = src_height - 1;
+  for (j = 0; j < dst_height; ++j) {
+    yi = y >> 16;
+    if (yi != lastyi) {
+      // Clamp to the last source row.
+      if (y > max_y) {
+        y = max_y;
+        yi = y >> 16;
+      }
+      if (yi != lastyi) {
+        if (filtering == kFilterLinear) {
+          iter.MoveToNextRow(iter);
+          YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_row);
+          ScaleARGBFilterCols(rowptr, argb_cnv_row, dst_width, x, dx);
+        } else {
+          // Prepare next row if necessary
+          if (yi < max_yi) {
+            iter.MoveToNextRow(iter);
+            // The old "next" row becomes the current one; the old current
+            // slot is overwritten with the new "next" row.
+            rowptr += rowstride;
+            rowstride = -rowstride;
+            // TODO(fbarchard): Convert the clipped region of row.
+            YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_row);
+            ScaleARGBFilterCols(rowptr + rowstride, argb_cnv_row, dst_width, x, dx);
+          } else {
+            // yi is the last source row. It was already scaled into the
+            // "next" slot, so make that slot current before dropping the
+            // second row. Previously rowptr was left on row yi - 1, so the
+            // output was built from the row before the last one.
+            rowptr += rowstride;
+            rowstride = 0;
+          }
+        }
+        lastyi = yi;
+      }
+    }
+    if (filtering == kFilterLinear) {
+      InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0);
+    } else {
+      // Top 8 bits of the fractional part of y select the vertical blend.
+      int yf = (y >> 8) & 255;
+      InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);
+    }
+    dst_argb += dst_stride_argb;
+    y += dy;
+  }
+  free_aligned_buffer_64(row);
+  free_aligned_buffer_64(argb_cnv_row);
+}
+
+// Scale ARGB to/from any dimensions, without interpolation.
+// Fixed point math is used for performance: The upper 16 bits
+// of x and dx is the integer part of the source position and
+// the lower 16 bits are the fixed decimal part.
+
+// Converts YUV to ARGB and point-samples it to the destination size.
+// One source row at a time is converted into argb_cnv_row; it is converted
+// again only when the integer part of y selects a different source row.
+// x, dx, y, dy are 16.16 fixed point; src_fourcc and yuv_color_space are
+// forwarded to YUVBuferIter_Init.
+static void ScaleYUVToARGBSimple(int src_width, int src_height,
+                                 int dst_width, int dst_height,
+                                 int src_stride_y,
+                                 int src_stride_u,
+                                 int src_stride_v,
+                                 int dst_stride_argb,
+                                 const uint8_t* src_y,
+                                 const uint8_t* src_u,
+                                 const uint8_t* src_v,
+                                 uint8_t* dst_argb,
+                                 int x, int dx, int y, int dy,
+                                 uint32_t src_fourcc,
+                                 YUVColorSpace yuv_color_space) {
+  int j;
+  // Start from the portable C column scaler; the sections below override it.
+  void (*ScaleARGBCols)(uint8_t* dst_argb, const uint8_t* src_argb,
+      int dst_width, int x, int dx) =
+      (src_width >= 32768) ? ScaleARGBCols64_C : ScaleARGBCols_C;
+
+  // Allocate 1 row of ARGB for source conversion.
+  align_buffer_64(argb_cnv_row, src_width * 4);
+
+#if defined(HAS_SCALEARGBCOLS_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
+    ScaleARGBCols = ScaleARGBCols_SSE2;
+  }
+#endif
+#if defined(HAS_SCALEARGBCOLS_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBCols = ScaleARGBCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBCols = ScaleARGBCols_NEON;
+    }
+  }
+#endif
+  // Exact 2x horizontal upscale gets a dedicated kernel, which takes
+  // precedence over the selections above.
+  if (src_width * 2 == dst_width && x < 0x8000) {
+    ScaleARGBCols = ScaleARGBColsUp2_C;
+#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBCols = ScaleARGBColsUp2_SSE2;
+    }
+#endif
+  }
+
+  int yi = y >> 16;
+
+  YUVBuferIter iter;
+  iter.src_width = src_width;
+  iter.src_height = src_height;
+  iter.src_stride_y = src_stride_y;
+  iter.src_stride_u = src_stride_u;
+  iter.src_stride_v = src_stride_v;
+  iter.src_y = src_y;
+  iter.src_u = src_u;
+  iter.src_v = src_v;
+  YUVBuferIter_Init(iter, src_fourcc, yuv_color_space);
+  iter.MoveTo(iter, yi);
+
+  // Index of the source row currently held in argb_cnv_row.
+  int lasty = yi;
+  YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_row);
+
+  for (j = 0; j < dst_height; ++j) {
+    yi = y >> 16;
+    // Convert a new source row only when the sampled row changes.
+    if (yi != lasty) {
+      iter.MoveTo(iter, yi);
+      YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_row);
+      lasty = yi;
+    }
+    ScaleARGBCols(dst_argb, argb_cnv_row, dst_width, x, dx);
+    dst_argb += dst_stride_argb;
+    y += dy;
+  }
+  free_aligned_buffer_64(argb_cnv_row);
+}
+
+static 
void YUVToARGBCopy(const uint8_t* src_y, int src_stride_y,
+                          const uint8_t* src_u, int src_stride_u,
+                          const uint8_t* src_v, int src_stride_v,
+                          int src_width, int src_height,
+                          uint8_t* dst_argb, int dst_stride_argb,
+                          int dst_width, int dst_height,
+                          uint32_t src_fourcc,
+                          YUVColorSpace yuv_color_space)
+{
+  // Straight conversion without scaling: one converted source row is
+  // written per destination row, for dst_height rows. dst_width is unused.
+  YUVBuferIter iter;
+  iter.src_width = src_width;
+  iter.src_height = src_height;
+  iter.src_stride_y = src_stride_y;
+  iter.src_stride_u = src_stride_u;
+  iter.src_stride_v = src_stride_v;
+  iter.src_y = src_y;
+  iter.src_u = src_u;
+  iter.src_v = src_v;
+  YUVBuferIter_Init(iter, src_fourcc, yuv_color_space);
+
+  for (int j = 0; j < dst_height; ++j) {
+    YUVBuferIter_ConvertToARGBRow(iter, dst_argb);
+    iter.MoveToNextRow(iter);
+    dst_argb += dst_stride_argb;
+  }
+}
+
+// Picks the scaling routine for a YUV -> ARGB conversion.
+//
+// The filter mode is first simplified by ScaleFilterReduce, and ScaleSlope
+// produces the 16.16 fixed point start position (x, y) and step (dx, dy).
+// Integer steps are routed to the specialised paths (1/2, even factors,
+// straight copy). Otherwise a non-zero filter selects the bilinear up
+// (dy < 1.0) or down scaler, and kFilterNone the point-sampling scaler.
+static void ScaleYUVToARGB(const uint8_t* src_y, int src_stride_y,
+                           const uint8_t* src_u, int src_stride_u,
+                           const uint8_t* src_v, int src_stride_v,
+                           int src_width, int src_height,
+                           uint8_t* dst_argb, int dst_stride_argb,
+                           int dst_width, int dst_height,
+                           enum FilterMode filtering,
+                           uint32_t src_fourcc,
+                           YUVColorSpace yuv_color_space)
+{
+  // Initial source x/y coordinate and step values as 16.16 fixed point.
+  int x = 0;
+  int y = 0;
+  int dx = 0;
+  int dy = 0;
+  // ARGB does not support box filter yet, but allow the user to pass it.
+  // Simplify filtering when possible.
+  filtering = ScaleFilterReduce(src_width, src_height,
+                                dst_width, dst_height,
+                                filtering);
+  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
+             &x, &y, &dx, &dy);
+
+  // Special case for integer step values.
+  if (((dx | dy) & 0xffff) == 0) {
+    if (!dx || !dy) {  // 1 pixel wide and/or tall.
+      filtering = kFilterNone;
+    } else {
+      // Optimized even scale down. ie 2, 4, 6, 8, 10x.
+      if (!(dx & 0x10000) && !(dy & 0x10000)) {
+        if (dx == 0x20000) {
+          // Optimized 1/2 downsample.
+          ScaleYUVToARGBDown2(src_width, src_height,
+                              dst_width, dst_height,
+                              src_stride_y,
+                              src_stride_u,
+                              src_stride_v,
+                              dst_stride_argb,
+                              src_y,
+                              src_u,
+                              src_v,
+                              dst_argb,
+                              x, dx, y, dy,
+                              filtering,
+                              src_fourcc,
+                              yuv_color_space);
+          return;
+        }
+        ScaleYUVToARGBDownEven(src_width, src_height,
+                               dst_width, dst_height,
+                               src_stride_y,
+                               src_stride_u,
+                               src_stride_v,
+                               dst_stride_argb,
+                               src_y,
+                               src_u,
+                               src_v,
+                               dst_argb,
+                               x, dx, y, dy,
+                               filtering,
+                               src_fourcc,
+                               yuv_color_space);
+        return;
+      }
+      // Optimized odd scale down. ie 3, 5, 7, 9x.
+      if ((dx & 0x10000) && (dy & 0x10000)) {
+        filtering = kFilterNone;
+        if (dx == 0x10000 && dy == 0x10000) {
+          // Straight conversion and copy.
+          YUVToARGBCopy(src_y, src_stride_y,
+                        src_u, src_stride_u,
+                        src_v, src_stride_v,
+                        src_width, src_height,
+                        dst_argb, dst_stride_argb,
+                        dst_width, dst_height,
+                        src_fourcc,
+                        yuv_color_space);
+          return;
+        }
+      }
+    }
+  }
+  if (filtering && dy < 65536) {
+    ScaleYUVToARGBBilinearUp(src_width, src_height,
+                             dst_width, dst_height,
+                             src_stride_y,
+                             src_stride_u,
+                             src_stride_v,
+                             dst_stride_argb,
+                             src_y,
+                             src_u,
+                             src_v,
+                             dst_argb,
+                             x, dx, y, dy,
+                             filtering,
+                             src_fourcc,
+                             yuv_color_space);
+    return;
+  }
+  if (filtering) {
+    ScaleYUVToARGBBilinearDown(src_width, src_height,
+                               dst_width, dst_height,
+                               src_stride_y,
+                               src_stride_u,
+                               src_stride_v,
+                               dst_stride_argb,
+                               src_y,
+                               src_u,
+                               src_v,
+                               dst_argb,
+                               x, dx, y, dy,
+                               filtering,
+                               src_fourcc,
+                               yuv_color_space);
+    return;
+  }
+  ScaleYUVToARGBSimple(src_width, src_height,
+                       dst_width, dst_height,
+                       src_stride_y,
+                       src_stride_u,
+                       src_stride_v,
+                       dst_stride_argb,
+                       src_y,
+                       src_u,
+                       src_v,
+                       dst_argb,
+                       x, dx, y, dy,
+                       src_fourcc,
+                       yuv_color_space);
+}
+
+// Returns true for the source layouts the scaler accepts: I444, I422 and
+// I420.
+bool IsConvertSupported(uint32_t src_fourcc)
+{
+  if (src_fourcc == FOURCC_I444 ||
+      src_fourcc == FOURCC_I422 ||
+      src_fourcc == FOURCC_I420) {
+    return true;
+  }
+  return false;
+}
+
+// Converts a planar YUV image to ARGB and scales it to dst_width x
+// dst_height in one pass.
+//
+// Returns 0 on success and -1 when an argument is rejected: a null plane or
+// destination pointer, a non-positive source or destination dimension, or a
+// src_fourcc for which IsConvertSupported() is false.
+LIBYUV_API
+int YUVToARGBScale(const uint8_t* src_y, int src_stride_y,
+                   const uint8_t* src_u, int src_stride_u,
+                   const uint8_t* src_v, int src_stride_v,
+                   uint32_t src_fourcc,
+                   YUVColorSpace yuv_color_space,
+                   int src_width, int src_height,
+                   uint8_t* dst_argb, int dst_stride_argb,
+                   int dst_width, int dst_height,
+                   enum FilterMode filtering)
+{
+  // Negative source dimensions are rejected along with zero: the scalers in
+  // this file size their scratch rows from src_width and clamp row indices
+  // to src_height - 1, and none of them implements mirroring.
+  if (!src_y || !src_u || !src_v ||
+      src_width <= 0 || src_height <= 0 ||
+      !dst_argb || dst_width <= 0 || dst_height <= 0) {
+    return -1;
+  }
+  if (!IsConvertSupported(src_fourcc)) {
+    return -1;
+  }
+  ScaleYUVToARGB(src_y, src_stride_y,
+                 src_u, src_stride_u,
+                 src_v, src_stride_v,
+                 src_width, src_height,
+                 dst_argb, dst_stride_argb,
+                 dst_width, dst_height,
+                 filtering,
+                 src_fourcc,
+                 yuv_color_space);
+  return 0;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/gfx/ycbcr/scale_yuv_argb.h b/gfx/ycbcr/scale_yuv_argb.h
new file mode 100644
index 0000000000..1c4b2d16a2
--- /dev/null
+++ b/gfx/ycbcr/scale_yuv_argb.h
@@ -0,0 +1,39 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_SCALE_YUV_ARGB_H_  // NOLINT
+#define INCLUDE_LIBYUV_SCALE_YUV_ARGB_H_
+
+#include "libyuv/basic_types.h"
+#include "libyuv/scale.h"  // For FilterMode
+
+#include "mozilla/gfx/Types.h"  // For YUVColorSpace
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Converts a planar YUV image (src_y/src_u/src_v with their strides) to ARGB
+// and scales it from src_width x src_height to dst_width x dst_height.
+// src_fourcc names the source layout and filtering the scaling filter.
+// Returns 0 on success and -1 when an argument is rejected, for example a
+// null pointer, a zero-sized source or an empty destination.
+int YUVToARGBScale(const uint8_t* src_y, int src_stride_y,
+                   const uint8_t* src_u, int src_stride_u,
+                   const uint8_t* src_v, int src_stride_v,
+                   uint32_t src_fourcc,
+                   mozilla::gfx::YUVColorSpace yuv_color_space,
+                   int src_width, int src_height,
+                   uint8_t* dst_argb, int dst_stride_argb,
+                   int dst_width, int dst_height,
+                   enum FilterMode filtering);
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_SCALE_YUV_ARGB_H_  NOLINT
diff --git a/gfx/ycbcr/ycbcr_to_rgb565.cpp b/gfx/ycbcr/ycbcr_to_rgb565.cpp
new file mode 100644
index 0000000000..fe28245a9c
--- /dev/null
+++ b/gfx/ycbcr/ycbcr_to_rgb565.cpp
@@ -0,0 +1,672 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include <stdlib.h>
+#include <limits.h>
+#include "nsDebug.h"
+#include "ycbcr_to_rgb565.h"
+#include "nsAlgorithm.h"
+
+
+
+#ifdef HAVE_YCBCR_TO_RGB565
+
+namespace mozilla {
+
+namespace gfx {
+
+/*This contains all of the parameters that are needed to convert a row.
+  Passing them in a struct instead of as individual parameters saves the need
+   to continually push onto the stack the ones that are fixed for every row.*/
+struct yuv2rgb565_row_scale_bilinear_ctx{
+  /*Destination row; the row functions store output pixel x at rgb_row[x].*/
+  uint16_t *rgb_row;
+  /*Source rows for the three planes.*/
+  const uint8_t *y_row;
+  const uint8_t *u_row;
+  const uint8_t *v_row;
+  /*Vertical blend weight handed to bislerp() for the Y' plane.*/
+  int y_yweight;
+  /*Offset handed to bislerp() to reach the second Y' row.*/
+  int y_pitch;
+  /*Number of output pixels produced for the row.*/
+  int width;
+  /*Source x position of the first output pixel and the step per output
+     pixel: the row functions use >>16 as the sample index and the low 16
+     bits as the blend fraction.*/
+  int source_x0_q16;
+  int source_dx_q16;
+  /*Not used for 4:4:4, except with chroma-nearest.*/
+  int source_uv_xoffs_q16;
+  /*Not used for 4:4:4 or chroma-nearest.*/
+  int uv_pitch;
+  /*Not used for 4:2:2, 4:4:4, or chroma-nearest.*/
+  int uv_yweight;
+};
+
+
+
+/*This contains all of the parameters that are needed to convert a row.
+  Passing them in a struct instead of as individual parameters saves the need
+   to continually push onto the stack the ones that are fixed for every row.*/
+struct yuv2rgb565_row_scale_nearest_ctx{
+  /*Destination row; the row functions store output pixel x at rgb_row[x].*/
+  uint16_t *rgb_row;
+  /*Source rows for the three planes.*/
+  const uint8_t *y_row;
+  const uint8_t *u_row;
+  const uint8_t *v_row;
+  /*Number of output pixels produced for the row.*/
+  int width;
+  /*Source x position of the first output pixel and the step per output
+     pixel; the row functions use >>16 as the sample index.*/
+  int source_x0_q16;
+  int source_dx_q16;
+  /*Not used for 4:4:4.*/
+  int source_uv_xoffs_q16;
+};
+
+
+
+/*Signature shared by the bilinear row converters below.*/
+typedef void (*yuv2rgb565_row_scale_bilinear_func)(
+ const yuv2rgb565_row_scale_bilinear_ctx *ctx, int dither);
+
+/*Signature shared by the nearest-neighbour row converters below.*/
+typedef void (*yuv2rgb565_row_scale_nearest_func)(
+ const yuv2rgb565_row_scale_nearest_ctx *ctx, int dither);
+
+
+
+//TODO: fix NEON asm for iOS
+# if defined(MOZILLA_MAY_SUPPORT_NEON) && !defined(__APPLE__)
+
+/*Declared here, defined elsewhere with C linkage.*/
+extern "C" void ScaleYCbCr42xToRGB565_BilinearY_Row_NEON(
+ const yuv2rgb565_row_scale_bilinear_ctx *ctx, int dither);
+
+void __attribute((noinline)) yuv42x_to_rgb565_row_neon(uint16_t *dst,
+                                                       const uint8_t *y,
+                                                       const uint8_t *u,
+                                                       const uint8_t *v,
+                                                       int n,
+                                                       int oddflag);
+
+#endif
+
+
+
+/*Bilinear interpolation of a single value.
+  This uses the exact same formulas as the asm, even though it adds some extra
+   shifts that do nothing but reduce accuracy.
+  row points at the upper source row and row+pitch at the lower one; the
+   samples at source_x and source_x+1 are blended first vertically by yweight
+   and then horizontally by xweight, each step rounding with +128 before the
+   >>8.*/
+static int bislerp(const uint8_t *row,
+                   int pitch,
+                   int source_x,
+                   int xweight,
+                   int yweight) {
+  int a;
+  int b;
+  int c;
+  int d;
+  a = row[source_x];
+  b = row[source_x+1];
+  c = row[source_x+pitch];
+  d = row[source_x+pitch+1];
+  /*Vertical blend of the left and right columns.*/
+  a = ((a<<8)+(c-a)*yweight+128)>>8;
+  b = ((b<<8)+(d-b)*yweight+128)>>8;
+  /*Horizontal blend of the two results.*/
+  return ((a<<8)+(b-a)*xweight+128)>>8;
+}
+
+/*Convert a single pixel from Y'CbCr to RGB565.
+  This uses the exact same formulas as the asm, even though we could make the
+   constants a lot more accurate with 32-bit wide registers.
+  dither indexes DITHER_BIAS, so it must be in the range 0...3.*/
+static uint16_t yu2rgb565(int y, int u, int v, int dither) {
+  /*This combines the constant offset that needs to be added during the Y'CbCr
+     conversion with a rounding offset that depends on the dither parameter.*/
+  static const int DITHER_BIAS[4][3]={
+    {-14240,    8704,    -17696},
+    {-14240+128,8704+64, -17696+128},
+    {-14240+256,8704+128,-17696+256},
+    {-14240+384,8704+192,-17696+384}
+  };
+  int r;
+  int g;
+  int b;
+  /*Red and blue are clamped to 5 bits, green to 6.*/
+  r = clamped((74*y+102*v+DITHER_BIAS[dither][0])>>9, 0, 31);
+  g = clamped((74*y-25*u-52*v+DITHER_BIAS[dither][1])>>8, 0, 63);
+  b = clamped((74*y+129*u+DITHER_BIAS[dither][2])>>9, 0, 31);
+  return (uint16_t)(r<<11 | g<<5 | b);
+}
+
+/*Converts one row with bilinear filtering of all three planes.
+  Chroma is sampled at half the luma x position (>>17) and uses its own
+   pitch and vertical weight.*/
+static void ScaleYCbCr420ToRGB565_Bilinear_Row_C(
+ const yuv2rgb565_row_scale_bilinear_ctx *ctx, int dither){
+  int x;
+  int source_x_q16;
+  source_x_q16 = ctx->source_x0_q16;
+  for (x = 0; x < ctx->width; x++) {
+    int source_x;
+    int xweight;
+    int y;
+    int u;
+    int v;
+    xweight = ((source_x_q16&0xFFFF)+128)>>8;
+    source_x = source_x_q16>>16;
+    y = bislerp(ctx->y_row, ctx->y_pitch, source_x, xweight, ctx->y_yweight);
+    /*Chroma position: 17 fractional bits because of the 2:1 subsampling.*/
+    xweight = (((source_x_q16+ctx->source_uv_xoffs_q16)&0x1FFFF)+256)>>9;
+    source_x = (source_x_q16+ctx->source_uv_xoffs_q16)>>17;
+    source_x_q16 += ctx->source_dx_q16;
+    u = bislerp(ctx->u_row, ctx->uv_pitch, source_x, xweight, ctx->uv_yweight);
+    v = bislerp(ctx->v_row, ctx->uv_pitch, source_x, xweight, ctx->uv_yweight);
+    ctx->rgb_row[x] = yu2rgb565(y, u, v, dither);
+    /*Alternate between dither and 3-dither on successive pixels.*/
+    dither ^= 3;
+  }
+}
+
+/*Same as the 4:2:0 version above, except that chroma is blended vertically
+   with the luma weight (y_yweight) instead of uv_yweight.*/
+static void ScaleYCbCr422ToRGB565_Bilinear_Row_C(
+ const yuv2rgb565_row_scale_bilinear_ctx *ctx, int dither){
+  int x;
+  int source_x_q16;
+  source_x_q16 = ctx->source_x0_q16;
+  for (x = 0; x < ctx->width; x++) {
+    int source_x;
+    int xweight;
+    int y;
+    int u;
+    int v;
+    xweight = ((source_x_q16&0xFFFF)+128)>>8;
+    source_x = source_x_q16>>16;
+    y = bislerp(ctx->y_row, ctx->y_pitch, source_x, xweight, ctx->y_yweight);
+    xweight = (((source_x_q16+ctx->source_uv_xoffs_q16)&0x1FFFF)+256)>>9;
+    source_x = (source_x_q16+ctx->source_uv_xoffs_q16)>>17;
+    source_x_q16 += ctx->source_dx_q16;
+    u = bislerp(ctx->u_row, ctx->uv_pitch, source_x, xweight, ctx->y_yweight);
+    v = bislerp(ctx->v_row, ctx->uv_pitch, source_x, xweight, ctx->y_yweight);
+    ctx->rgb_row[x] = yu2rgb565(y, u, v, dither);
+    dither ^= 3;
+  }
+}
+
+/*Bilinear filtering of all three planes for 4:4:4: chroma shares the luma
+   sample position, pitch and weights.*/
+static void ScaleYCbCr444ToRGB565_Bilinear_Row_C(
+ const yuv2rgb565_row_scale_bilinear_ctx *ctx, int dither){
+  int x;
+  int source_x_q16;
+  source_x_q16 = ctx->source_x0_q16;
+  for (x = 0; x < ctx->width; x++) {
+    int source_x;
+    int xweight;
+    int y;
+    int u;
+    int v;
+    xweight = ((source_x_q16&0xFFFF)+128)>>8;
+    source_x = source_x_q16>>16;
+    source_x_q16 += ctx->source_dx_q16;
+    y = bislerp(ctx->y_row, ctx->y_pitch, source_x, xweight, ctx->y_yweight);
+    u = bislerp(ctx->u_row, ctx->y_pitch, source_x, xweight, ctx->y_yweight);
+    v = bislerp(ctx->v_row, ctx->y_pitch, source_x, xweight, ctx->y_yweight);
+    ctx->rgb_row[x] = yu2rgb565(y, u, v, dither);
+    dither ^= 3;
+  }
+}
+
+/*Bilinear filtering of luma only; chroma is read unfiltered from the
+   half-resolution position (>>17).*/
+static void ScaleYCbCr42xToRGB565_BilinearY_Row_C(
+ const yuv2rgb565_row_scale_bilinear_ctx *ctx, int dither){
+  int x;
+  int source_x_q16;
+  source_x_q16 = ctx->source_x0_q16;
+  for (x = 0; x < ctx->width; x++) {
+    int source_x;
+    int xweight;
+    int y;
+    int u;
+    int v;
+    xweight = ((source_x_q16&0xFFFF)+128)>>8;
+    source_x = source_x_q16>>16;
+    y = bislerp(ctx->y_row, ctx->y_pitch, source_x, xweight, ctx->y_yweight);
+    source_x = (source_x_q16+ctx->source_uv_xoffs_q16)>>17;
+    source_x_q16 += ctx->source_dx_q16;
+    u = ctx->u_row[source_x];
+    v = ctx->v_row[source_x];
+    ctx->rgb_row[x] = yu2rgb565(y, u, v, dither);
+    dither ^= 3;
+  }
+}
+
+/*Bilinear filtering of luma only; chroma is read unfiltered at full
+   resolution (>>16) after applying source_uv_xoffs_q16.*/
+static void ScaleYCbCr444ToRGB565_BilinearY_Row_C(
+ const yuv2rgb565_row_scale_bilinear_ctx *ctx, int dither){
+  int x;
+  int source_x_q16;
+  source_x_q16 = ctx->source_x0_q16;
+  for (x = 0; x < ctx->width; x++) {
+    int source_x;
+    int xweight;
+    int y;
+    int u;
+    int v;
+    xweight = ((source_x_q16&0xFFFF)+128)>>8;
+    source_x = source_x_q16>>16;
+    y = bislerp(ctx->y_row, ctx->y_pitch, source_x, xweight, ctx->y_yweight);
+    source_x = (source_x_q16+ctx->source_uv_xoffs_q16)>>16;
+    source_x_q16 += ctx->source_dx_q16;
+    u = ctx->u_row[source_x];
+    v = ctx->v_row[source_x];
+    ctx->rgb_row[x] = yu2rgb565(y, u, v, dither);
+    dither ^= 3;
+  }
+}
+
+/*Nearest-neighbour sampling of all three planes; chroma is read from the
+   half-resolution position (>>17).*/
+static void ScaleYCbCr42xToRGB565_Nearest_Row_C(
+ const yuv2rgb565_row_scale_nearest_ctx *ctx, int dither){
+  int y;
+  int u;
+  int v;
+  int x;
+  int source_x_q16;
+  int source_x;
+  source_x_q16 = ctx->source_x0_q16;
+  for (x = 0; x < ctx->width; x++) {
+    source_x = source_x_q16>>16;
+    y = ctx->y_row[source_x];
+    source_x = (source_x_q16+ctx->source_uv_xoffs_q16)>>17;
+    source_x_q16 += ctx->source_dx_q16;
+    u = ctx->u_row[source_x];
+    v = ctx->v_row[source_x];
+    ctx->rgb_row[x] = yu2rgb565(y, u, v, dither);
+    dither ^= 3;
+  }
+}
+
+/*Nearest-neighbour sampling for 4:4:4: all three planes are read at the same
+   index.*/
+static void ScaleYCbCr444ToRGB565_Nearest_Row_C(
+ const yuv2rgb565_row_scale_nearest_ctx *ctx, int dither){
+  int y;
+  int u;
+  int v;
+  int x;
+  int source_x_q16;
+  int source_x;
+  source_x_q16 = ctx->source_x0_q16;
+  for (x = 0; x < ctx->width; x++) {
+    source_x = source_x_q16>>16;
+    source_x_q16 += ctx->source_dx_q16;
+    y = ctx->y_row[source_x];
+    u = ctx->u_row[source_x];
+    v = ctx->v_row[source_x];
+    ctx->rgb_row[x] = yu2rgb565(y, u, v, dither);
+    dither ^= 3;
+  }
+}
+
+void 
ScaleYCbCrToRGB565(const uint8_t *y_buf, + const uint8_t *u_buf, + const uint8_t *v_buf, + uint8_t *rgb_buf, + int source_x0, + int source_y0, + int source_width, + int source_height, + int width, + int height, + int y_pitch, + int uv_pitch, + int rgb_pitch, + YUVType yuv_type, + ScaleFilter filter) { + int source_x0_q16; + int source_y0_q16; + int source_dx_q16; + int source_dy_q16; + int source_uv_xoffs_q16; + int source_uv_yoffs_q16; + int x_shift; + int y_shift; + int ymin; + int ymax; + int uvmin; + int uvmax; + int dither; + /*We don't support negative destination rectangles (just flip the source + instead), and for empty ones there's nothing to do.*/ + if (width <= 0 || height <= 0) + return; + /*These bounds are required to avoid 16.16 fixed-point overflow.*/ + NS_ASSERTION(source_x0 > (INT_MIN>>16) && source_x0 < (INT_MAX>>16), + "ScaleYCbCrToRGB565 source X offset out of bounds."); + NS_ASSERTION(source_x0+source_width > (INT_MIN>>16) + && source_x0+source_width < (INT_MAX>>16), + "ScaleYCbCrToRGB565 source width out of bounds."); + NS_ASSERTION(source_y0 > (INT_MIN>>16) && source_y0 < (INT_MAX>>16), + "ScaleYCbCrToRGB565 source Y offset out of bounds."); + NS_ASSERTION(source_y0+source_height > (INT_MIN>>16) + && source_y0+source_height < (INT_MAX>>16), + "ScaleYCbCrToRGB565 source height out of bounds."); + /*We require the same stride for Y' and Cb and Cr for 4:4:4 content.*/ + NS_ASSERTION(yuv_type != YV24 || y_pitch == uv_pitch, + "ScaleYCbCrToRGB565 luma stride differs from chroma for 4:4:4 content."); + /*We assume we can read outside the bounds of the input, because it makes + the code much simpler (and in practice is true: both Theora and VP8 return + padded reference frames). + In practice, we do not even _have_ the actual bounds of the source, as + we are passed a crop rectangle from it, and not the dimensions of the full + image. 
+ This assertion will not guarantee our out-of-bounds reads are safe, but it + should at least catch the simple case of passing in an unpadded buffer.*/ + NS_ASSERTION(abs(y_pitch) >= abs(source_width)+16, + "ScaleYCbCrToRGB565 source image unpadded?"); + /*The NEON code requires the pointers to be aligned to a 16-byte boundary at + the start of each row. + This should be true for all of our sources. + We could try to fix this up if it's not true by adjusting source_x0, but + that would require the mis-alignment to be the same for the U and V + planes.*/ + NS_ASSERTION((y_pitch&15) == 0 && (uv_pitch&15) == 0 && + ((y_buf-(uint8_t *)nullptr)&15) == 0 && + ((u_buf-(uint8_t *)nullptr)&15) == 0 && + ((v_buf-(uint8_t *)nullptr)&15) == 0, + "ScaleYCbCrToRGB565 source image unaligned"); + /*We take an area-based approach to pixel coverage to avoid shifting by small + amounts (or not so small, when up-scaling or down-scaling by a large + factor). + + An illustrative example: scaling 4:2:0 up by 2, using JPEG chroma cositing^. + + + = RGB destination locations + * = Y' source locations + - = Cb, Cr source locations + + + + + + + + + + + * * * * + + + + + + + + + + - - + + + + + + + + + + * * * * + + + + + + + + + + + + + + + + + + + + * * * * + + + + + + + + + + - - + + + + + + + + + + * * * * + + + + + + + + + + + So, the coordinates of the upper-left + (first destination site) should + be (-0.25,-0.25) in the source Y' coordinate system. + Similarly, the coordinates should be (-0.375,-0.375) in the source Cb, Cr + coordinate system. + Note that the origin and scale of these two coordinate systems is not the + same! 
+ + ^JPEG cositing is required for Theora; VP8 doesn't specify cositing rules, + but nearly all software converters in existence (at least those that are + open source, and many that are not) use JPEG cositing instead of MPEG.*/ + source_dx_q16 = (source_width<<16) / width; + source_x0_q16 = (source_x0<<16)+(source_dx_q16>>1)-0x8000; + source_dy_q16 = (source_height<<16) / height; + source_y0_q16 = (source_y0<<16)+(source_dy_q16>>1)-0x8000; + x_shift = (yuv_type != YV24); + y_shift = (yuv_type == YV12); + /*These two variables hold the difference between the origins of the Y' and + the Cb, Cr coordinate systems, using the scale of the Y' coordinate + system.*/ + source_uv_xoffs_q16 = -(x_shift<<15); + source_uv_yoffs_q16 = -(y_shift<<15); + /*Compute the range of source rows we'll actually use. + This doesn't guarantee we won't read outside this range.*/ + ymin = source_height >= 0 ? source_y0 : source_y0+source_height-1; + ymax = source_height >= 0 ? source_y0+source_height-1 : source_y0; + uvmin = ymin>>y_shift; + uvmax = ((ymax+1+y_shift)>>y_shift)-1; + /*Pick a dithering pattern. 
+ The "&3" at the end is just in case RAND_MAX is lying.*/ + dither = (rand()/(RAND_MAX>>2))&3; + /*Nearest-neighbor scaling.*/ + if (filter == FILTER_NONE) { + yuv2rgb565_row_scale_nearest_ctx ctx; + yuv2rgb565_row_scale_nearest_func scale_row; + int y; + /*Add rounding offsets once, in advance.*/ + source_x0_q16 += 0x8000; + source_y0_q16 += 0x8000; + source_uv_xoffs_q16 += (x_shift<<15); + source_uv_yoffs_q16 += (y_shift<<15); + /*The 42x row function serves both 4:2:0 and 4:2:2: each has half-width + chroma, and the vertical subsampling is applied in the loop below via + y_shift. + Only 4:4:4 may index chroma with the luma X coordinate, so select on + "not YV24" (as the bilinear path does) rather than on "is YV12".*/ + if (yuv_type != YV24) + scale_row = ScaleYCbCr42xToRGB565_Nearest_Row_C; + else + scale_row = ScaleYCbCr444ToRGB565_Nearest_Row_C; + ctx.width = width; + ctx.source_x0_q16 = source_x0_q16; + ctx.source_dx_q16 = source_dx_q16; + ctx.source_uv_xoffs_q16 = source_uv_xoffs_q16; + for (y=0; y<height; y++) { + int source_y; + ctx.rgb_row = (uint16_t *)(rgb_buf + y*rgb_pitch); + source_y = source_y0_q16>>16; + source_y = clamped(source_y, ymin, ymax); + ctx.y_row = y_buf + source_y*y_pitch; + source_y = (source_y0_q16+source_uv_yoffs_q16)>>(16+y_shift); + source_y = clamped(source_y, uvmin, uvmax); + source_y0_q16 += source_dy_q16; + ctx.u_row = u_buf + source_y*uv_pitch; + ctx.v_row = v_buf + source_y*uv_pitch; + (*scale_row)(&ctx, dither); + dither ^= 2; + } + } + /*Bilinear scaling.*/ + else { + yuv2rgb565_row_scale_bilinear_ctx ctx; + yuv2rgb565_row_scale_bilinear_func scale_row; + int uvxscale_min; + int uvxscale_max; + int uvyscale_min; + int uvyscale_max; + int y; + /*Check how close the chroma scaling is to unity. + If it's close enough, we can get away with nearest-neighbor chroma + sub-sampling, and only doing bilinear on luma. + If a given axis is subsampled, we use bounds on the luma step of + [0.67...2], which is equivalent to scaling chroma by [1...3]. + If it's not subsampled, we use bounds of [0.5...1.33], which is + equivalent to scaling chroma by [0.75...2]. 
+ The lower bound is chosen as a trade-off between speed and how terrible + nearest neighbor looks when upscaling.*/ +# define CHROMA_NEAREST_SUBSAMP_STEP_MIN 0xAAAA +# define CHROMA_NEAREST_NORMAL_STEP_MIN 0x8000 +# define CHROMA_NEAREST_SUBSAMP_STEP_MAX 0x20000 +# define CHROMA_NEAREST_NORMAL_STEP_MAX 0x15555 + uvxscale_min = yuv_type != YV24 ? + CHROMA_NEAREST_SUBSAMP_STEP_MIN : CHROMA_NEAREST_NORMAL_STEP_MIN; + uvxscale_max = yuv_type != YV24 ? + CHROMA_NEAREST_SUBSAMP_STEP_MAX : CHROMA_NEAREST_NORMAL_STEP_MAX; + uvyscale_min = yuv_type == YV12 ? + CHROMA_NEAREST_SUBSAMP_STEP_MIN : CHROMA_NEAREST_NORMAL_STEP_MIN; + uvyscale_max = yuv_type == YV12 ? + CHROMA_NEAREST_SUBSAMP_STEP_MAX : CHROMA_NEAREST_NORMAL_STEP_MAX; + if (uvxscale_min <= abs(source_dx_q16) + && abs(source_dx_q16) <= uvxscale_max + && uvyscale_min <= abs(source_dy_q16) + && abs(source_dy_q16) <= uvyscale_max) { + /*Add the rounding offsets now.*/ + source_uv_xoffs_q16 += 1<<(15+x_shift); + source_uv_yoffs_q16 += 1<<(15+y_shift); + if (yuv_type != YV24) { + scale_row = +//TODO: fix NEON asm for iOS +# if defined(MOZILLA_MAY_SUPPORT_NEON) && !defined(__APPLE__) + supports_neon() ? 
ScaleYCbCr42xToRGB565_BilinearY_Row_NEON : +# endif + ScaleYCbCr42xToRGB565_BilinearY_Row_C; + } + else + scale_row = ScaleYCbCr444ToRGB565_BilinearY_Row_C; + } + else { + if (yuv_type == YV12) + scale_row = ScaleYCbCr420ToRGB565_Bilinear_Row_C; + else if (yuv_type == YV16) + scale_row = ScaleYCbCr422ToRGB565_Bilinear_Row_C; + else + scale_row = ScaleYCbCr444ToRGB565_Bilinear_Row_C; + } + ctx.width = width; + ctx.y_pitch = y_pitch; + ctx.source_x0_q16 = source_x0_q16; + ctx.source_dx_q16 = source_dx_q16; + ctx.source_uv_xoffs_q16 = source_uv_xoffs_q16; + ctx.uv_pitch = uv_pitch; + for (y=0; y<height; y++) { + int source_y; + int yweight; + int uvweight; + ctx.rgb_row = (uint16_t *)(rgb_buf + y*rgb_pitch); + source_y = (source_y0_q16+128)>>16; + yweight = ((source_y0_q16+128)>>8)&0xFF; + if (source_y < ymin) { + source_y = ymin; + yweight = 0; + } + if (source_y > ymax) { + source_y = ymax; + yweight = 0; + } + ctx.y_row = y_buf + source_y*y_pitch; + source_y = source_y0_q16+source_uv_yoffs_q16+(128<<y_shift); + source_y0_q16 += source_dy_q16; + uvweight = source_y>>(8+y_shift)&0xFF; + source_y >>= 16+y_shift; + if (source_y < uvmin) { + source_y = uvmin; + uvweight = 0; + } + if (source_y > uvmax) { + source_y = uvmax; + uvweight = 0; + } + ctx.u_row = u_buf + source_y*uv_pitch; + ctx.v_row = v_buf + source_y*uv_pitch; + ctx.y_yweight = yweight; + ctx.uv_yweight = uvweight; + (*scale_row)(&ctx, dither); + dither ^= 2; + } + } +} + +bool IsScaleYCbCrToRGB565Fast(int source_x0, + int source_y0, + int source_width, + int source_height, + int width, + int height, + YUVType yuv_type, + ScaleFilter filter) +{ + // Very fast. 
+ if (width <= 0 || height <= 0) + return true; +# if defined(MOZILLA_MAY_SUPPORT_NEON) + if (filter != FILTER_NONE) { + int source_dx_q16; + int source_dy_q16; + int uvxscale_min; + int uvxscale_max; + int uvyscale_min; + int uvyscale_max; + source_dx_q16 = (source_width<<16) / width; + source_dy_q16 = (source_height<<16) / height; + uvxscale_min = yuv_type != YV24 ? + CHROMA_NEAREST_SUBSAMP_STEP_MIN : CHROMA_NEAREST_NORMAL_STEP_MIN; + uvxscale_max = yuv_type != YV24 ? + CHROMA_NEAREST_SUBSAMP_STEP_MAX : CHROMA_NEAREST_NORMAL_STEP_MAX; + uvyscale_min = yuv_type == YV12 ? + CHROMA_NEAREST_SUBSAMP_STEP_MIN : CHROMA_NEAREST_NORMAL_STEP_MIN; + uvyscale_max = yuv_type == YV12 ? + CHROMA_NEAREST_SUBSAMP_STEP_MAX : CHROMA_NEAREST_NORMAL_STEP_MAX; + if (uvxscale_min <= abs(source_dx_q16) + && abs(source_dx_q16) <= uvxscale_max + && uvyscale_min <= abs(source_dy_q16) + && abs(source_dy_q16) <= uvyscale_max) { + if (yuv_type != YV24) + return supports_neon(); + } + } +# endif + return false; +} + + + +void yuv_to_rgb565_row_c(uint16_t *dst, + const uint8_t *y, + const uint8_t *u, + const uint8_t *v, + int x_shift, + int pic_x, + int pic_width) +{ + int x; + for (x = 0; x < pic_width; x++) + { + dst[x] = yu2rgb565(y[pic_x+x], + u[(pic_x+x)>>x_shift], + v[(pic_x+x)>>x_shift], + 2); // Disable dithering for now. 
+ } +} + +void ConvertYCbCrToRGB565(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb_buf, + int pic_x, + int pic_y, + int pic_width, + int pic_height, + int y_pitch, + int uv_pitch, + int rgb_pitch, + YUVType yuv_type) +{ + int x_shift; + int y_shift; + x_shift = yuv_type != YV24; + y_shift = yuv_type == YV12; +//TODO: fix NEON asm for iOS +# if defined(MOZILLA_MAY_SUPPORT_NEON) && !defined(__APPLE__) + if (yuv_type != YV24 && supports_neon()) + { + for (int i = 0; i < pic_height; i++) { + int yoffs; + int uvoffs; + yoffs = y_pitch * (pic_y+i) + pic_x; + uvoffs = uv_pitch * ((pic_y+i)>>y_shift) + (pic_x>>x_shift); + yuv42x_to_rgb565_row_neon((uint16_t*)(rgb_buf + rgb_pitch * i), + y_buf + yoffs, + u_buf + uvoffs, + v_buf + uvoffs, + pic_width, + pic_x&x_shift); + } + } + else +# endif + { + for (int i = 0; i < pic_height; i++) { + int yoffs; + int uvoffs; + yoffs = y_pitch * (pic_y+i); + uvoffs = uv_pitch * ((pic_y+i)>>y_shift); + yuv_to_rgb565_row_c((uint16_t*)(rgb_buf + rgb_pitch * i), + y_buf + yoffs, + u_buf + uvoffs, + v_buf + uvoffs, + x_shift, + pic_x, + pic_width); + } + } +} + +bool IsConvertYCbCrToRGB565Fast(int pic_x, + int pic_y, + int pic_width, + int pic_height, + YUVType yuv_type) +{ +# if defined(MOZILLA_MAY_SUPPORT_NEON) + return (yuv_type != YV24 && supports_neon()); +# else + return false; +# endif +} + +} // namespace gfx + +} // namespace mozilla + +#endif // HAVE_YCBCR_TO_RGB565 diff --git a/gfx/ycbcr/ycbcr_to_rgb565.h b/gfx/ycbcr/ycbcr_to_rgb565.h new file mode 100644 index 0000000000..7793962b5c --- /dev/null +++ b/gfx/ycbcr/ycbcr_to_rgb565.h @@ -0,0 +1,72 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+#ifndef MEDIA_BASE_YCBCR_TO_RGB565_H_
+#define MEDIA_BASE_YCBCR_TO_RGB565_H_
+#include "yuv_convert.h"
+#include "mozilla/arm.h"
+
+// It's currently only worth including this if we have NEON support.
+// HAVE_YCBCR_TO_RGB565 gates every declaration below, so code that calls
+// them has to test the macro first.
+#if defined(__arm__) && defined(MOZILLA_MAY_SUPPORT_NEON)
+#define HAVE_YCBCR_TO_RGB565 1
+#endif
+
+namespace mozilla {
+
+namespace gfx {
+
+#ifdef HAVE_YCBCR_TO_RGB565
+// Convert a frame of YUV to 16 bit RGB565.
+//
+// Converts, without scaling, the pic_width x pic_height rectangle whose
+// top-left luma sample is (pic_x, pic_y). Output row i is written as 16-bit
+// pixels starting at rgbframe + i * rgbstride, so rgbstride is a byte
+// count; ystride and uvstride are the row-to-row distances of the luma and
+// chroma planes. yuv_type selects the chroma subsampling.
+// The NEON row function is used only when yuv_type is not YV24 and
+// supports_neon() is true (and it is compiled out on __APPLE__); otherwise
+// a C row function runs with a fixed dither value.
+void ConvertYCbCrToRGB565(const uint8_t* yplane,
+                          const uint8_t* uplane,
+                          const uint8_t* vplane,
+                          uint8_t* rgbframe,
+                          int pic_x,
+                          int pic_y,
+                          int pic_width,
+                          int pic_height,
+                          int ystride,
+                          int uvstride,
+                          int rgbstride,
+                          YUVType yuv_type);
+
+// Used to test if we have an accelerated version.
+// Returns true only when yuv_type is not YV24 and supports_neon() reports
+// NEON; the position and size arguments are not consulted.
+// NOTE(review): ConvertYCbCrToRGB565 compiles out its NEON path on
+// __APPLE__, but this predicate has no such check -- confirm whether it
+// should report false there.
+bool IsConvertYCbCrToRGB565Fast(int pic_x,
+                                int pic_y,
+                                int pic_width,
+                                int pic_height,
+                                YUVType yuv_type);
+
+// Scale a frame of YUV to 16 bit RGB565.
+//
+// Maps the source rectangle (source_x0, source_y0, source_width,
+// source_height) onto a width x height destination. Returns without writing
+// anything when width or height is <= 0; the implementation's comments say
+// to flip the source rather than pass a negative destination size.
+// filter == FILTER_NONE selects nearest-neighbour sampling; any other value
+// selects bilinear filtering. The output is dithered, with the starting
+// dither phase picked by rand() on each call.
+// The implementation asserts that: the source coordinates fit in 16.16
+// fixed point; ystride == uvstride for YV24; the source rows are padded
+// (|ystride| >= |source_width| + 16), since it may read outside the crop
+// rectangle; and both strides and all three plane pointers are 16-byte
+// aligned.
+void ScaleYCbCrToRGB565(const uint8_t *yplane,
+                        const uint8_t *uplane,
+                        const uint8_t *vplane,
+                        uint8_t *rgbframe,
+                        int source_x0,
+                        int source_y0,
+                        int source_width,
+                        int source_height,
+                        int width,
+                        int height,
+                        int ystride,
+                        int uvstride,
+                        int rgbstride,
+                        YUVType yuv_type,
+                        ScaleFilter filter);
+
+// Used to test if we have an accelerated version.
+// Returns true for an empty destination (width or height <= 0). Otherwise
+// returns supports_neon() when filter is not FILTER_NONE, yuv_type is not
+// YV24, and both scale steps are close enough to unity that chroma can be
+// sampled nearest-neighbour; returns false in every other case. source_x0
+// and source_y0 are not consulted.
+// NOTE(review): ScaleYCbCrToRGB565 excludes its NEON row function on
+// __APPLE__, but this predicate has no such check -- confirm whether it
+// should report false there.
+bool IsScaleYCbCrToRGB565Fast(int source_x0,
+                              int source_y0,
+                              int source_width,
+                              int source_height,
+                              int width,
+                              int height,
+                              YUVType yuv_type,
+                              ScaleFilter filter);
+#endif // HAVE_YCBCR_TO_RGB565
+
+} // namespace gfx
+
+} // namespace mozilla
+
+#endif // MEDIA_BASE_YCBCR_TO_RGB565_H_
diff --git a/gfx/ycbcr/yuv_convert.cpp b/gfx/ycbcr/yuv_convert.cpp
new file mode 100644
index 0000000000..f7d01a3ef8
--- /dev/null
+++ b/gfx/ycbcr/yuv_convert.cpp
@@ -0,0 +1,577 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+ +// This webpage shows layout of YV12 and other YUV formats +// http://www.fourcc.org/yuv.php +// The actual conversion is best described here +// http://en.wikipedia.org/wiki/YUV +// An article on optimizing YUV conversion using tables instead of multiplies +// http://lestourtereaux.free.fr/papers/data/yuvrgb.pdf +// +// YV12 is a full plane of Y and a half height, half width chroma planes +// YV16 is a full plane of Y and a full height, half width chroma planes +// YV24 is a full plane of Y and a full height, full width chroma planes +// Y8 is a full plane of Y and no chroma planes (i.e., monochrome) +// +// ARGB pixel format is output, which on little endian is stored as BGRA. +// The alpha is set to 255, allowing the application to use RGBA or RGB32. + +#include "yuv_convert.h" + +#include "mozilla/StaticPrefs_gfx.h" +#include "libyuv.h" +#include "scale_yuv_argb.h" +// Header for low level row functions. +#include "yuv_row.h" +#include "mozilla/SSE.h" +#include "mozilla/IntegerRange.h" + +namespace mozilla { + +namespace gfx { + +// 16.16 fixed point arithmetic +const int kFractionBits = 16; +const int kFractionMax = 1 << kFractionBits; +const int kFractionMask = ((1 << kFractionBits) - 1); + +// clang-format off + +libyuv::FourCC FourCCFromYUVType(YUVType aYUVType) { + switch (aYUVType) { + case YV24: return libyuv::FOURCC_I444; + case YV16: return libyuv::FOURCC_I422; + case YV12: return libyuv::FOURCC_I420; + case Y8: return libyuv::FOURCC_I400; + default: return libyuv::FOURCC_ANY; + } +} + +int GBRPlanarToARGB(const uint8_t* src_y, int y_pitch, + const uint8_t* src_u, int u_pitch, + const uint8_t* src_v, int v_pitch, + uint8_t* rgb_buf, int rgb_pitch, + int pic_width, int pic_height) { + // libyuv has no native conversion function for this + // fixme: replace with something less awful + for (const auto row : IntegerRange(pic_height)) { + for (const auto col : IntegerRange(pic_width)) { + rgb_buf[rgb_pitch * row + col * 4 + 0] = src_u[u_pitch * row + 
col]; + rgb_buf[rgb_pitch * row + col * 4 + 1] = src_y[y_pitch * row + col]; + rgb_buf[rgb_pitch * row + col * 4 + 2] = src_v[v_pitch * row + col]; + rgb_buf[rgb_pitch * row + col * 4 + 3] = 255; + } + } + return 0; +} + +// Convert a frame of YUV to 32 bit ARGB. +void ConvertYCbCrToRGB32(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, int pic_x, + int pic_y, int pic_width, int pic_height, int y_pitch, + int uv_pitch, int rgb_pitch, YUVType yuv_type, + YUVColorSpace yuv_color_space, + ColorRange color_range) { + // The deprecated function's conversion is accurate. + // The libyuv conversion gives up a little accuracy for performance. It + // calculates RGB from YUV dynamically so that it can use SIMD. Its + // conversion coefficients are signed bytes, but one of them needs the + // value 129, which libyuv cuts to 127. And + // only 6 bits are used for the fractional part during the calculation. + // + // The deprecated function is still fast on some old Intel chips. + // See Bug 1256475. + bool use_deprecated = StaticPrefs::gfx_ycbcr_accurate_conversion() || + (supports_mmx() && supports_sse() && !supports_sse3() && + yuv_color_space == YUVColorSpace::BT601 && + color_range == ColorRange::LIMITED); + // The deprecated function only supports BT601. + // See Bug 1210357. + if (yuv_color_space != YUVColorSpace::BT601) { + use_deprecated = false; + } + if (use_deprecated) { + ConvertYCbCrToRGB32_deprecated(y_buf, u_buf, v_buf, rgb_buf, pic_x, pic_y, + pic_width, pic_height, y_pitch, uv_pitch, + rgb_pitch, yuv_type); + return; + } + + decltype(libyuv::I420ToARGBMatrix)* fConvertYUVToARGB = nullptr; + const uint8_t* src_y = nullptr; + const uint8_t* src_u = nullptr; + const uint8_t* src_v = nullptr; + const libyuv::YuvConstants* yuv_constant = nullptr; + + switch (yuv_color_space) { + case YUVColorSpace::BT2020: + yuv_constant = color_range == ColorRange::LIMITED + ? 
&libyuv::kYuv2020Constants + : &libyuv::kYuvV2020Constants; + break; + case YUVColorSpace::BT709: + yuv_constant = color_range == ColorRange::LIMITED + ? &libyuv::kYuvH709Constants + : &libyuv::kYuvF709Constants; + break; + case YUVColorSpace::Identity: + MOZ_ASSERT(yuv_type == YV24, "Identity (aka RGB) with chroma subsampling is unsupported"); + if (yuv_type == YV24) { + break; + } + [[fallthrough]]; // Assuming BT601 for unsupported input is better than crashing + default: + MOZ_FALLTHROUGH_ASSERT("Unsupported YUVColorSpace"); + case YUVColorSpace::BT601: + yuv_constant = color_range == ColorRange::LIMITED + ? &libyuv::kYuvI601Constants + : &libyuv::kYuvJPEGConstants; + break; + } + + switch (yuv_type) { + case YV24: { + src_y = y_buf + y_pitch * pic_y + pic_x; + src_u = u_buf + uv_pitch * pic_y + pic_x; + src_v = v_buf + uv_pitch * pic_y + pic_x; + + if (yuv_color_space == YUVColorSpace::Identity) { + // Special case for RGB image + DebugOnly<int> err = + GBRPlanarToARGB(src_y, y_pitch, src_u, uv_pitch, src_v, uv_pitch, + rgb_buf, rgb_pitch, pic_width, pic_height); + MOZ_ASSERT(!err); + return; + } + + fConvertYUVToARGB = libyuv::I444ToARGBMatrix; + break; + } + case YV16: { + src_y = y_buf + y_pitch * pic_y + pic_x; + src_u = u_buf + uv_pitch * pic_y + pic_x / 2; + src_v = v_buf + uv_pitch * pic_y + pic_x / 2; + + fConvertYUVToARGB = libyuv::I422ToARGBMatrix; + break; + } + case YV12: { + src_y = y_buf + y_pitch * pic_y + pic_x; + // Chroma is subsampled 2x in both directions: halve the row index and + // the column index separately. Halving their sum instead would land + // half a chroma row off whenever pic_y is odd. + src_u = u_buf + uv_pitch * (pic_y / 2) + pic_x / 2; + src_v = v_buf + uv_pitch * (pic_y / 2) + pic_x / 2; + + fConvertYUVToARGB = libyuv::I420ToARGBMatrix; + break; + } + case Y8: { + src_y = y_buf + y_pitch * pic_y + pic_x; + MOZ_ASSERT(u_buf == nullptr); + MOZ_ASSERT(v_buf == nullptr); + + if (color_range == ColorRange::LIMITED) { + DebugOnly<int> err = + libyuv::I400ToARGB(src_y, y_pitch, rgb_buf, rgb_pitch, pic_width, + pic_height); + MOZ_ASSERT(!err); + } else { + DebugOnly<int> err = + libyuv::J400ToARGB(src_y, y_pitch, rgb_buf, 
rgb_pitch, pic_width, + pic_height); + MOZ_ASSERT(!err); + } + + return; + } + default: + MOZ_ASSERT_UNREACHABLE("Unsupported YUV type"); + } + + DebugOnly<int> err = + fConvertYUVToARGB(src_y, y_pitch, src_u, uv_pitch, src_v, uv_pitch, + rgb_buf, rgb_pitch, yuv_constant, pic_width, pic_height); + MOZ_ASSERT(!err); +} + +// Convert a frame of YUV to 32 bit ARGB. +void ConvertYCbCrToRGB32_deprecated(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb_buf, + int pic_x, + int pic_y, + int pic_width, + int pic_height, + int y_pitch, + int uv_pitch, + int rgb_pitch, + YUVType yuv_type) { + unsigned int y_shift = yuv_type == YV12 ? 1 : 0; + unsigned int x_shift = yuv_type == YV24 ? 0 : 1; + // Test for SSE because the optimized code uses movntq, which is not part of MMX. + bool has_sse = supports_mmx() && supports_sse(); + // There is no optimized YV24 SSE routine so we check for this and + // fall back to the C code. + has_sse &= yuv_type != YV24; + bool odd_pic_x = yuv_type != YV24 && pic_x % 2 != 0; + int x_width = odd_pic_x ? pic_width - 1 : pic_width; + + for (int y = pic_y; y < pic_height + pic_y; ++y) { + uint8_t* rgb_row = rgb_buf + (y - pic_y) * rgb_pitch; + const uint8_t* y_ptr = y_buf + y * y_pitch + pic_x; + const uint8_t* u_ptr = u_buf + (y >> y_shift) * uv_pitch + (pic_x >> x_shift); + const uint8_t* v_ptr = v_buf + (y >> y_shift) * uv_pitch + (pic_x >> x_shift); + + if (odd_pic_x) { + // Handle the single odd pixel manually and use the + // fast routines for the remaining. + FastConvertYUVToRGB32Row_C(y_ptr++, + u_ptr++, + v_ptr++, + rgb_row, + 1, + x_shift); + rgb_row += 4; + } + + if (has_sse) { + FastConvertYUVToRGB32Row(y_ptr, + u_ptr, + v_ptr, + rgb_row, + x_width); + } + else { + FastConvertYUVToRGB32Row_C(y_ptr, + u_ptr, + v_ptr, + rgb_row, + x_width, + x_shift); + } + } + + // MMX used for FastConvertYUVToRGB32Row requires emms instruction. 
+ if (has_sse) + EMMS(); +} + +// C version does 8 at a time to mimic MMX code +static void FilterRows_C(uint8_t* ybuf, const uint8_t* y0_ptr, const uint8_t* y1_ptr, + int source_width, int source_y_fraction) { + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + uint8_t* end = ybuf + source_width; + do { + ybuf[0] = (y0_ptr[0] * y0_fraction + y1_ptr[0] * y1_fraction) >> 8; + ybuf[1] = (y0_ptr[1] * y0_fraction + y1_ptr[1] * y1_fraction) >> 8; + ybuf[2] = (y0_ptr[2] * y0_fraction + y1_ptr[2] * y1_fraction) >> 8; + ybuf[3] = (y0_ptr[3] * y0_fraction + y1_ptr[3] * y1_fraction) >> 8; + ybuf[4] = (y0_ptr[4] * y0_fraction + y1_ptr[4] * y1_fraction) >> 8; + ybuf[5] = (y0_ptr[5] * y0_fraction + y1_ptr[5] * y1_fraction) >> 8; + ybuf[6] = (y0_ptr[6] * y0_fraction + y1_ptr[6] * y1_fraction) >> 8; + ybuf[7] = (y0_ptr[7] * y0_fraction + y1_ptr[7] * y1_fraction) >> 8; + y0_ptr += 8; + y1_ptr += 8; + ybuf += 8; + } while (ybuf < end); +} + +#ifdef MOZILLA_MAY_SUPPORT_MMX +void FilterRows_MMX(uint8_t* ybuf, const uint8_t* y0_ptr, const uint8_t* y1_ptr, + int source_width, int source_y_fraction); +#endif + +#ifdef MOZILLA_MAY_SUPPORT_SSE2 +void FilterRows_SSE2(uint8_t* ybuf, const uint8_t* y0_ptr, const uint8_t* y1_ptr, + int source_width, int source_y_fraction); +#endif + +static inline void FilterRows(uint8_t* ybuf, const uint8_t* y0_ptr, + const uint8_t* y1_ptr, int source_width, + int source_y_fraction) { +#ifdef MOZILLA_MAY_SUPPORT_SSE2 + if (mozilla::supports_sse2()) { + FilterRows_SSE2(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction); + return; + } +#endif + +#ifdef MOZILLA_MAY_SUPPORT_MMX + if (mozilla::supports_mmx()) { + FilterRows_MMX(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction); + return; + } +#endif + + FilterRows_C(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction); +} + + +// Scale a frame of YUV to 32 bit ARGB. 
+void ScaleYCbCrToRGB32(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb_buf, + int source_width, + int source_height, + int width, + int height, + int y_pitch, + int uv_pitch, + int rgb_pitch, + YUVType yuv_type, + YUVColorSpace yuv_color_space, + ScaleFilter filter) { + bool use_deprecated = + StaticPrefs::gfx_ycbcr_accurate_conversion() || +#if defined(XP_WIN) && defined(_M_X64) + // libyuv does not support SIMD scaling on win 64bit. See Bug 1295927. + supports_sse3() || +#endif + (supports_mmx() && supports_sse() && !supports_sse3()); + // The deprecated function only supports BT601. + // See Bug 1210357. + if (yuv_color_space != YUVColorSpace::BT601) { + use_deprecated = false; + } + if (use_deprecated) { + ScaleYCbCrToRGB32_deprecated(y_buf, u_buf, v_buf, + rgb_buf, + source_width, source_height, + width, height, + y_pitch, uv_pitch, + rgb_pitch, + yuv_type, + ROTATE_0, + filter); + return; + } + + DebugOnly<int> err = + libyuv::YUVToARGBScale(y_buf, y_pitch, + u_buf, uv_pitch, + v_buf, uv_pitch, + FourCCFromYUVType(yuv_type), + yuv_color_space, + source_width, source_height, + rgb_buf, rgb_pitch, + width, height, + libyuv::kFilterBilinear); + MOZ_ASSERT(!err); + return; +} + +// Scale a frame of YUV to 32 bit ARGB. +void ScaleYCbCrToRGB32_deprecated(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb_buf, + int source_width, + int source_height, + int width, + int height, + int y_pitch, + int uv_pitch, + int rgb_pitch, + YUVType yuv_type, + Rotate view_rotate, + ScaleFilter filter) { + bool has_mmx = supports_mmx(); + + // 4096 allows 3 buffers to fit in 12k. + // Helps performance on CPU with 16K L1 cache. + // Large enough for 3840x2160 and 30" displays which are 2560x1600. + const int kFilterBufferSize = 4096; + // Disable filtering if the screen is too big (to avoid buffer overflows). + // This should never happen to regular users: they don't have monitors + // wider than 4096 pixels. 
+ // TODO(fbarchard): Allow rotated videos to filter. + if (source_width > kFilterBufferSize || view_rotate) + filter = FILTER_NONE; + + unsigned int y_shift = yuv_type == YV12 ? 1 : 0; + // Diagram showing origin and direction of source sampling. + // ->0 4<- + // 7 3 + // + // 6 5 + // ->1 2<- + // Rotations that start at right side of image. + if ((view_rotate == ROTATE_180) || + (view_rotate == ROTATE_270) || + (view_rotate == MIRROR_ROTATE_0) || + (view_rotate == MIRROR_ROTATE_90)) { + y_buf += source_width - 1; + u_buf += source_width / 2 - 1; + v_buf += source_width / 2 - 1; + source_width = -source_width; + } + // Rotations that start at bottom of image. + if ((view_rotate == ROTATE_90) || + (view_rotate == ROTATE_180) || + (view_rotate == MIRROR_ROTATE_90) || + (view_rotate == MIRROR_ROTATE_180)) { + y_buf += (source_height - 1) * y_pitch; + u_buf += ((source_height >> y_shift) - 1) * uv_pitch; + v_buf += ((source_height >> y_shift) - 1) * uv_pitch; + source_height = -source_height; + } + + // Handle zero sized destination. + if (width == 0 || height == 0) + return; + int source_dx = source_width * kFractionMax / width; + int source_dy = source_height * kFractionMax / height; + int source_dx_uv = source_dx; + + if ((view_rotate == ROTATE_90) || + (view_rotate == ROTATE_270)) { + int tmp = height; + height = width; + width = tmp; + tmp = source_height; + source_height = source_width; + source_width = tmp; + int original_dx = source_dx; + int original_dy = source_dy; + source_dx = ((original_dy >> kFractionBits) * y_pitch) << kFractionBits; + source_dx_uv = ((original_dy >> kFractionBits) * uv_pitch) << kFractionBits; + source_dy = original_dx; + if (view_rotate == ROTATE_90) { + y_pitch = -1; + uv_pitch = -1; + source_height = -source_height; + } else { + y_pitch = 1; + uv_pitch = 1; + } + } + + // Need padding because FilterRows() will write 1 to 16 extra pixels + // after the end for SSE2 version. 
+ uint8_t yuvbuf[16 + kFilterBufferSize * 3 + 16]; + uint8_t* ybuf = + reinterpret_cast<uint8_t*>(reinterpret_cast<uintptr_t>(yuvbuf + 15) & ~15); + uint8_t* ubuf = ybuf + kFilterBufferSize; + uint8_t* vbuf = ubuf + kFilterBufferSize; + // TODO(fbarchard): Fixed point math is off by 1 on negatives. + int yscale_fixed = (source_height << kFractionBits) / height; + + // TODO(fbarchard): Split this into separate function for better efficiency. + for (int y = 0; y < height; ++y) { + uint8_t* dest_pixel = rgb_buf + y * rgb_pitch; + int source_y_subpixel = (y * yscale_fixed); + if (yscale_fixed >= (kFractionMax * 2)) { + source_y_subpixel += kFractionMax / 2; // For 1/2 or less, center filter. + } + int source_y = source_y_subpixel >> kFractionBits; + + const uint8_t* y0_ptr = y_buf + source_y * y_pitch; + const uint8_t* y1_ptr = y0_ptr + y_pitch; + + const uint8_t* u0_ptr = u_buf + (source_y >> y_shift) * uv_pitch; + const uint8_t* u1_ptr = u0_ptr + uv_pitch; + const uint8_t* v0_ptr = v_buf + (source_y >> y_shift) * uv_pitch; + const uint8_t* v1_ptr = v0_ptr + uv_pitch; + + // vertical scaler uses 16.8 fixed point + int source_y_fraction = (source_y_subpixel & kFractionMask) >> 8; + int source_uv_fraction = + ((source_y_subpixel >> y_shift) & kFractionMask) >> 8; + + const uint8_t* y_ptr = y0_ptr; + const uint8_t* u_ptr = u0_ptr; + const uint8_t* v_ptr = v0_ptr; + // Apply vertical filtering if necessary. + // TODO(fbarchard): Remove memcpy when not necessary. 
+ if (filter & mozilla::gfx::FILTER_BILINEAR_V) { + if (yscale_fixed != kFractionMax && + source_y_fraction && ((source_y + 1) < source_height)) { + FilterRows(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction); + } else { + memcpy(ybuf, y0_ptr, source_width); + } + y_ptr = ybuf; + ybuf[source_width] = ybuf[source_width-1]; + int uv_source_width = (source_width + 1) / 2; + if (yscale_fixed != kFractionMax && + source_uv_fraction && + (((source_y >> y_shift) + 1) < (source_height >> y_shift))) { + FilterRows(ubuf, u0_ptr, u1_ptr, uv_source_width, source_uv_fraction); + FilterRows(vbuf, v0_ptr, v1_ptr, uv_source_width, source_uv_fraction); + } else { + memcpy(ubuf, u0_ptr, uv_source_width); + memcpy(vbuf, v0_ptr, uv_source_width); + } + u_ptr = ubuf; + v_ptr = vbuf; + ubuf[uv_source_width] = ubuf[uv_source_width - 1]; + vbuf[uv_source_width] = vbuf[uv_source_width - 1]; + } + if (source_dx == kFractionMax) { // Not scaled + FastConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr, + dest_pixel, width); + } else if (filter & FILTER_BILINEAR_H) { + LinearScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr, + dest_pixel, width, source_dx); + } else { +// Specialized scalers and rotation. +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_MSC_VER) && defined(_M_IX86) && !defined(__clang__) + if(mozilla::supports_sse()) { + if (width == (source_width * 2)) { + DoubleYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr, + dest_pixel, width); + } else if ((source_dx & kFractionMask) == 0) { + // Scaling by integer scale factor. ie half. + ConvertYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr, + dest_pixel, width, + source_dx >> kFractionBits); + } else if (source_dx_uv == source_dx) { // Not rotated. 
+ ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr, + dest_pixel, width, source_dx); + } else { + RotateConvertYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr, + dest_pixel, width, + source_dx >> kFractionBits, + source_dx_uv >> kFractionBits); + } + } + else { + ScaleYUVToRGB32Row_C(y_ptr, u_ptr, v_ptr, + dest_pixel, width, source_dx); + } +#else + (void)source_dx_uv; + ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr, + dest_pixel, width, source_dx); +#endif + } + } + // MMX used for FastConvertYUVToRGB32Row and FilterRows requires emms. + if (has_mmx) + EMMS(); +} +void ConvertI420AlphaToARGB32(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* argb_buf, + int pic_width, + int pic_height, + int ya_pitch, + int uv_pitch, + int argb_pitch) { + + // The downstream graphics stack expects an attenuated input, hence why the + // attenuation parameter is set. + DebugOnly<int> err = libyuv::I420AlphaToARGB(y_buf, ya_pitch, + u_buf, uv_pitch, + v_buf, uv_pitch, + a_buf, ya_pitch, + argb_buf, argb_pitch, + pic_width, pic_height, 1); + MOZ_ASSERT(!err); +} + +} // namespace gfx +} // namespace mozilla diff --git a/gfx/ycbcr/yuv_convert.h b/gfx/ycbcr/yuv_convert.h new file mode 100644 index 0000000000..3368890819 --- /dev/null +++ b/gfx/ycbcr/yuv_convert.h @@ -0,0 +1,123 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// clang-format off + +#ifndef MEDIA_BASE_YUV_CONVERT_H_ +#define MEDIA_BASE_YUV_CONVERT_H_ + +#include "chromium_types.h" +#include "mozilla/gfx/Types.h" + +namespace mozilla { + +namespace gfx { + +// Type of YUV surface. +// The value of these enums matter as they are used to shift vertical indices. +enum YUVType { + YV12 = 0, // YV12 is half width and half height chroma channels. + YV16 = 1, // YV16 is half width and full height chroma channels. 
+ YV24 = 2, // YV24 is full width and full height chroma channels. + Y8 = 3 // Y8 is monochrome: no chroma channels. +}; + +// Mirror means flip the image horizontally, as in looking in a mirror. +// Rotate happens after mirroring. +enum Rotate { + ROTATE_0, // Rotation off. + ROTATE_90, // Rotate clockwise. + ROTATE_180, // Rotate upside down. + ROTATE_270, // Rotate counter clockwise. + MIRROR_ROTATE_0, // Mirror horizontally. + MIRROR_ROTATE_90, // Mirror then Rotate clockwise. + MIRROR_ROTATE_180, // Mirror vertically. + MIRROR_ROTATE_270 // Transpose. +}; + +// Filter affects how scaling looks. +enum ScaleFilter { + FILTER_NONE = 0, // No filter (point sampled). + FILTER_BILINEAR_H = 1, // Bilinear horizontal filter. + FILTER_BILINEAR_V = 2, // Bilinear vertical filter. + FILTER_BILINEAR = 3 // Bilinear filter. +}; + +// Convert a frame of YUV to 32 bit ARGB. +// Pass in YV16/YV12 depending on source format +void ConvertYCbCrToRGB32(const uint8_t* yplane, + const uint8_t* uplane, + const uint8_t* vplane, + uint8_t* rgbframe, + int pic_x, + int pic_y, + int pic_width, + int pic_height, + int ystride, + int uvstride, + int rgbstride, + YUVType yuv_type, + YUVColorSpace yuv_color_space, + ColorRange color_range); + +void ConvertYCbCrToRGB32_deprecated(const uint8_t* yplane, + const uint8_t* uplane, + const uint8_t* vplane, + uint8_t* rgbframe, + int pic_x, + int pic_y, + int pic_width, + int pic_height, + int ystride, + int uvstride, + int rgbstride, + YUVType yuv_type); + +// Scale a frame of YUV to 32 bit ARGB. +// Supports rotation and mirroring. 
+void ScaleYCbCrToRGB32(const uint8_t* yplane, + const uint8_t* uplane, + const uint8_t* vplane, + uint8_t* rgbframe, + int source_width, + int source_height, + int width, + int height, + int ystride, + int uvstride, + int rgbstride, + YUVType yuv_type, + YUVColorSpace yuv_color_space, + ScaleFilter filter); + +void ScaleYCbCrToRGB32_deprecated(const uint8_t* yplane, + const uint8_t* uplane, + const uint8_t* vplane, + uint8_t* rgbframe, + int source_width, + int source_height, + int width, + int height, + int ystride, + int uvstride, + int rgbstride, + YUVType yuv_type, + Rotate view_rotate, + ScaleFilter filter); + +void ConvertI420AlphaToARGB32(const uint8_t* yplane, + const uint8_t* uplane, + const uint8_t* vplane, + const uint8_t* aplane, + uint8_t* argbframe, + int pic_width, + int pic_height, + int yastride, + int uvstride, + int argbstride); + +} // namespace gfx +} // namespace mozilla + +#endif // MEDIA_BASE_YUV_CONVERT_H_ diff --git a/gfx/ycbcr/yuv_convert_arm.cpp b/gfx/ycbcr/yuv_convert_arm.cpp new file mode 100644 index 0000000000..7d58fa475d --- /dev/null +++ b/gfx/ycbcr/yuv_convert_arm.cpp @@ -0,0 +1,232 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +// contributor Siarhei Siamashka <siarhei.siamashka@gmail.com> + +#include "yuv_convert.h" +#include "ycbcr_to_rgb565.h" + + + +#ifdef HAVE_YCBCR_TO_RGB565 + +namespace mozilla { + +namespace gfx { + +# if defined(MOZILLA_MAY_SUPPORT_NEON) +# if defined(__clang__) +void __attribute((noinline)) +# else +void __attribute((noinline,optimize("-fomit-frame-pointer"))) +# endif + yuv42x_to_rgb565_row_neon(uint16_t *dst, + const uint8_t *y, + const uint8_t *u, + const uint8_t *v, + int n, + int oddflag) +{ + static __attribute__((aligned(16))) uint16_t acc_r[8] = { + 22840, 22840, 22840, 22840, 22840, 22840, 22840, 22840, + }; + static __attribute__((aligned(16))) uint16_t acc_g[8] = { + 17312, 17312, 17312, 17312, 17312, 17312, 17312, 17312, + }; + static __attribute__((aligned(16))) uint16_t acc_b[8] = { + 28832, 28832, 28832, 28832, 28832, 28832, 28832, 28832, + }; + /* + * Registers: + * q0, q1 : d0, d1, d2, d3 - are used for initial loading of YUV data + * q2 : d4, d5 - are used for storing converted RGB data + * q3 : d6, d7 - are used for temporary storage + * + * q4-q7 - reserved + * + * q8, q9 : d16, d17, d18, d19 - are used for expanded Y data + * q10 : d20, d21 + * q11 : d22, d23 + * q12 : d24, d25 + * q13 : d26, d27 + * q13, q14, q15 - various constants (#16, #149, #204, #50, #104, #154) + */ + asm volatile ( +".fpu neon\n" +/* Allow to build on targets not supporting neon, and force the object file + * target to avoid bumping the final binary target */ +".arch armv7-a\n" +".object_arch armv4t\n" +".macro convert_macroblock size\n" +/* load up to 16 source pixels */ + ".if \\size == 16\n" + "pld [%[y], #64]\n" + "pld [%[u], #64]\n" + "pld [%[v], #64]\n" + "vld1.8 {d1}, [%[y]]!\n" + "vld1.8 {d3}, [%[y]]!\n" + "vld1.8 {d0}, [%[u]]!\n" + "vld1.8 {d2}, [%[v]]!\n" + ".elseif \\size == 8\n" + "vld1.8 {d1}, [%[y]]!\n" + "vld1.8 {d0[0]}, [%[u]]!\n" + "vld1.8 {d0[1]}, [%[u]]!\n" + "vld1.8 {d0[2]}, [%[u]]!\n" + "vld1.8 {d0[3]}, [%[u]]!\n" + "vld1.8 {d2[0]}, [%[v]]!\n" 
+ "vld1.8 {d2[1]}, [%[v]]!\n" + "vld1.8 {d2[2]}, [%[v]]!\n" + "vld1.8 {d2[3]}, [%[v]]!\n" + ".elseif \\size == 4\n" + "vld1.8 {d1[0]}, [%[y]]!\n" + "vld1.8 {d1[1]}, [%[y]]!\n" + "vld1.8 {d1[2]}, [%[y]]!\n" + "vld1.8 {d1[3]}, [%[y]]!\n" + "vld1.8 {d0[0]}, [%[u]]!\n" + "vld1.8 {d0[1]}, [%[u]]!\n" + "vld1.8 {d2[0]}, [%[v]]!\n" + "vld1.8 {d2[1]}, [%[v]]!\n" + ".elseif \\size == 2\n" + "vld1.8 {d1[0]}, [%[y]]!\n" + "vld1.8 {d1[1]}, [%[y]]!\n" + "vld1.8 {d0[0]}, [%[u]]!\n" + "vld1.8 {d2[0]}, [%[v]]!\n" + ".elseif \\size == 1\n" + "vld1.8 {d1[0]}, [%[y]]!\n" + "vld1.8 {d0[0]}, [%[u]]!\n" + "vld1.8 {d2[0]}, [%[v]]!\n" + ".else\n" + ".error \"unsupported macroblock size\"\n" + ".endif\n" + + /* d1 - Y data (first 8 bytes) */ + /* d3 - Y data (next 8 bytes) */ + /* d0 - U data, d2 - V data */ + + /* split even and odd Y color components */ + "vuzp.8 d1, d3\n" /* d1 - evenY, d3 - oddY */ + /* clip upper and lower boundaries */ + "vqadd.u8 q0, q0, q4\n" + "vqadd.u8 q1, q1, q4\n" + "vqsub.u8 q0, q0, q5\n" + "vqsub.u8 q1, q1, q5\n" + + "vshr.u8 d4, d2, #1\n" /* d4 = V >> 1 */ + + "vmull.u8 q8, d1, d27\n" /* q8 = evenY * 149 */ + "vmull.u8 q9, d3, d27\n" /* q9 = oddY * 149 */ + + "vld1.16 {d20, d21}, [%[acc_r], :128]\n" /* q10 - initialize accumulator for red */ + "vsubw.u8 q10, q10, d4\n" /* red acc -= (V >> 1) */ + "vmlsl.u8 q10, d2, d28\n" /* red acc -= V * 204 */ + "vld1.16 {d22, d23}, [%[acc_g], :128]\n" /* q11 - initialize accumulator for green */ + "vmlsl.u8 q11, d2, d30\n" /* green acc -= V * 104 */ + "vmlsl.u8 q11, d0, d29\n" /* green acc -= U * 50 */ + "vld1.16 {d24, d25}, [%[acc_b], :128]\n" /* q12 - initialize accumulator for blue */ + "vmlsl.u8 q12, d0, d30\n" /* blue acc -= U * 104 */ + "vmlsl.u8 q12, d0, d31\n" /* blue acc -= U * 154 */ + + "vhsub.s16 q3, q8, q10\n" /* calculate even red components */ + "vhsub.s16 q10, q9, q10\n" /* calculate odd red components */ + "vqshrun.s16 d0, q3, #6\n" /* right shift, narrow and saturate even red components */ + "vqshrun.s16 
d3, q10, #6\n" /* right shift, narrow and saturate odd red components */ + + "vhadd.s16 q3, q8, q11\n" /* calculate even green components */ + "vhadd.s16 q11, q9, q11\n" /* calculate odd green components */ + "vqshrun.s16 d1, q3, #6\n" /* right shift, narrow and saturate even green components */ + "vqshrun.s16 d4, q11, #6\n" /* right shift, narrow and saturate odd green components */ + + "vhsub.s16 q3, q8, q12\n" /* calculate even blue components */ + "vhsub.s16 q12, q9, q12\n" /* calculate odd blue components */ + "vqshrun.s16 d2, q3, #6\n" /* right shift, narrow and saturate even blue components */ + "vqshrun.s16 d5, q12, #6\n" /* right shift, narrow and saturate odd blue components */ + + "vzip.8 d0, d3\n" /* join even and odd red components */ + "vzip.8 d1, d4\n" /* join even and odd green components */ + "vzip.8 d2, d5\n" /* join even and odd blue components */ + + "vshll.u8 q3, d0, #8\n\t" + "vshll.u8 q8, d1, #8\n\t" + "vshll.u8 q9, d2, #8\n\t" + "vsri.u16 q3, q8, #5\t\n" + "vsri.u16 q3, q9, #11\t\n" + /* store pixel data to memory */ + ".if \\size == 16\n" + " vst1.16 {d6, d7}, [%[dst]]!\n" + " vshll.u8 q3, d3, #8\n\t" + " vshll.u8 q8, d4, #8\n\t" + " vshll.u8 q9, d5, #8\n\t" + " vsri.u16 q3, q8, #5\t\n" + " vsri.u16 q3, q9, #11\t\n" + " vst1.16 {d6, d7}, [%[dst]]!\n" + ".elseif \\size == 8\n" + " vst1.16 {d6, d7}, [%[dst]]!\n" + ".elseif \\size == 4\n" + " vst1.16 {d6}, [%[dst]]!\n" + ".elseif \\size == 2\n" + " vst1.16 {d6[0]}, [%[dst]]!\n" + " vst1.16 {d6[1]}, [%[dst]]!\n" + ".elseif \\size == 1\n" + " vst1.16 {d6[0]}, [%[dst]]!\n" + ".endif\n" + ".endm\n" + + "vmov.u8 d8, #15\n" /* add this to U/V to saturate upper boundary */ + "vmov.u8 d9, #20\n" /* add this to Y to saturate upper boundary */ + "vmov.u8 d10, #31\n" /* sub this from U/V to saturate lower boundary */ + "vmov.u8 d11, #36\n" /* sub this from Y to saturate lower boundary */ + + "vmov.u8 d26, #16\n" + "vmov.u8 d27, #149\n" + "vmov.u8 d28, #204\n" + "vmov.u8 d29, #50\n" + "vmov.u8 d30, 
#104\n" + "vmov.u8 d31, #154\n" + + "cmp %[oddflag], #0\n" + "beq 1f\n" + "convert_macroblock 1\n" + "sub %[n], %[n], #1\n" + "1:\n" + "subs %[n], %[n], #16\n" + "blt 2f\n" + "1:\n" + "convert_macroblock 16\n" + "subs %[n], %[n], #16\n" + "bge 1b\n" + "2:\n" + "tst %[n], #8\n" + "beq 3f\n" + "convert_macroblock 8\n" + "3:\n" + "tst %[n], #4\n" + "beq 4f\n" + "convert_macroblock 4\n" + "4:\n" + "tst %[n], #2\n" + "beq 5f\n" + "convert_macroblock 2\n" + "5:\n" + "tst %[n], #1\n" + "beq 6f\n" + "convert_macroblock 1\n" + "6:\n" + ".purgem convert_macroblock\n" + : [y] "+&r" (y), [u] "+&r" (u), [v] "+&r" (v), [dst] "+&r" (dst), [n] "+&r" (n) + : [acc_r] "r" (&acc_r[0]), [acc_g] "r" (&acc_g[0]), [acc_b] "r" (&acc_b[0]), + [oddflag] "r" (oddflag) + : "cc", "memory", + "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", + "d8", "d9", "d10", "d11", /* "d12", "d13", "d14", "d15", */ + "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", + "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31" + ); +} +# endif // MOZILLA_MAY_SUPPORT_NEON + +} // namespace gfx + +} // namespace mozilla + +#endif // HAVE_YCBCR_TO_RGB565 diff --git a/gfx/ycbcr/yuv_convert_mmx.cpp b/gfx/ycbcr/yuv_convert_mmx.cpp new file mode 100644 index 0000000000..797b032f79 --- /dev/null +++ b/gfx/ycbcr/yuv_convert_mmx.cpp @@ -0,0 +1,45 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include <mmintrin.h> +#include "yuv_row.h" + +namespace mozilla { +namespace gfx { + +// FilterRows combines two rows of the image using linear interpolation. +// MMX version does 8 pixels at a time. 
+void FilterRows_MMX(uint8_t* ybuf, const uint8_t* y0_ptr, const uint8_t* y1_ptr, + int source_width, int source_y_fraction) { + __m64 zero = _mm_setzero_si64(); + __m64 y1_fraction = _mm_set1_pi16(source_y_fraction); + __m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction); + + const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr); + const __m64* y1_ptr64 = reinterpret_cast<const __m64*>(y1_ptr); + __m64* dest64 = reinterpret_cast<__m64*>(ybuf); + __m64* end64 = reinterpret_cast<__m64*>(ybuf + source_width); + + do { + __m64 y0 = *y0_ptr64++; + __m64 y1 = *y1_ptr64++; + __m64 y2 = _mm_unpackhi_pi8(y0, zero); + __m64 y3 = _mm_unpackhi_pi8(y1, zero); + y0 = _mm_unpacklo_pi8(y0, zero); + y1 = _mm_unpacklo_pi8(y1, zero); + y0 = _mm_mullo_pi16(y0, y0_fraction); + y1 = _mm_mullo_pi16(y1, y1_fraction); + y2 = _mm_mullo_pi16(y2, y0_fraction); + y3 = _mm_mullo_pi16(y3, y1_fraction); + y0 = _mm_add_pi16(y0, y1); + y2 = _mm_add_pi16(y2, y3); + y0 = _mm_srli_pi16(y0, 8); + y2 = _mm_srli_pi16(y2, 8); + y0 = _mm_packs_pu16(y0, y2); + *dest64++ = y0; + } while (dest64 < end64); +} + +} // namespace gfx +} // namespace mozilla diff --git a/gfx/ycbcr/yuv_convert_sse2.cpp b/gfx/ycbcr/yuv_convert_sse2.cpp new file mode 100644 index 0000000000..b5a84c908d --- /dev/null +++ b/gfx/ycbcr/yuv_convert_sse2.cpp @@ -0,0 +1,47 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include <emmintrin.h> +#include "yuv_row.h" + +namespace mozilla { +namespace gfx { + +// FilterRows combines two rows of the image using linear interpolation. +// SSE2 version does 16 pixels at a time. 
+void FilterRows_SSE2(uint8_t* ybuf, const uint8_t* y0_ptr, const uint8_t* y1_ptr, + int source_width, int source_y_fraction) { + __m128i zero = _mm_setzero_si128(); + __m128i y1_fraction = _mm_set1_epi16(source_y_fraction); + __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction); + + const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr); + const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr); + __m128i* dest128 = reinterpret_cast<__m128i*>(ybuf); + __m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width); + + do { + __m128i y0 = _mm_loadu_si128(y0_ptr128); + __m128i y1 = _mm_loadu_si128(y1_ptr128); + __m128i y2 = _mm_unpackhi_epi8(y0, zero); + __m128i y3 = _mm_unpackhi_epi8(y1, zero); + y0 = _mm_unpacklo_epi8(y0, zero); + y1 = _mm_unpacklo_epi8(y1, zero); + y0 = _mm_mullo_epi16(y0, y0_fraction); + y1 = _mm_mullo_epi16(y1, y1_fraction); + y2 = _mm_mullo_epi16(y2, y0_fraction); + y3 = _mm_mullo_epi16(y3, y1_fraction); + y0 = _mm_add_epi16(y0, y1); + y2 = _mm_add_epi16(y2, y3); + y0 = _mm_srli_epi16(y0, 8); + y2 = _mm_srli_epi16(y2, 8); + y0 = _mm_packus_epi16(y0, y2); + *dest128++ = y0; + ++y0_ptr128; + ++y1_ptr128; + } while (dest128 < end128); +} + +} // namespace gfx +} // namespace mozilla diff --git a/gfx/ycbcr/yuv_row.h b/gfx/ycbcr/yuv_row.h new file mode 100644 index 0000000000..95ad16fb71 --- /dev/null +++ b/gfx/ycbcr/yuv_row.h @@ -0,0 +1,154 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// yuv_row internal functions to handle YUV conversion and scaling to RGB. +// These functions are used from both yuv_convert.cc and yuv_scale.cc. + +// TODO(fbarchard): Write function that can handle rotation and scaling. + +#ifndef MEDIA_BASE_YUV_ROW_H_ +#define MEDIA_BASE_YUV_ROW_H_ + +#include "chromium_types.h" + +extern "C" { +// Can only do 1x. +// This is the second fastest of the scalers. 
+void FastConvertYUVToRGB32Row(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb_buf, + int width); + +void FastConvertYUVToRGB32Row_C(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb_buf, + int width, + unsigned int x_shift); + +void FastConvertYUVToRGB32Row(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb_buf, + int width); + +// Can do 1x, half size or any scale down by an integer amount. +// Step can be negative (mirroring, rotate 180). +// This is the third fastest of the scalers. +// Only defined on Windows x86-32. +void ConvertYUVToRGB32Row_SSE(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb_buf, + int width, + int step); + +// Rotate is like Convert, but applies different step to Y versus U and V. +// This allows rotation by 90 or 270, by stepping by stride. +// This is the forth fastest of the scalers. +// Only defined on Windows x86-32. +void RotateConvertYUVToRGB32Row_SSE(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb_buf, + int width, + int ystep, + int uvstep); + +// Doubler does 4 pixels at a time. Each pixel is replicated. +// This is the fastest of the scalers. +// Only defined on Windows x86-32. +void DoubleYUVToRGB32Row_SSE(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb_buf, + int width); + +// Handles arbitrary scaling up or down. +// Mirroring is supported, but not 90 or 270 degree rotation. +// Chroma is under sampled every 2 pixels for performance. 
+void ScaleYUVToRGB32Row(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb_buf, + int width, + int source_dx); + +void ScaleYUVToRGB32Row(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb_buf, + int width, + int source_dx); + +void ScaleYUVToRGB32Row_C(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb_buf, + int width, + int source_dx); + +// Handles arbitrary scaling up or down with bilinear filtering. +// Mirroring is supported, but not 90 or 270 degree rotation. +// Chroma is under sampled every 2 pixels for performance. +// This is the slowest of the scalers. +void LinearScaleYUVToRGB32Row(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb_buf, + int width, + int source_dx); + +void LinearScaleYUVToRGB32Row(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb_buf, + int width, + int source_dx); + +void LinearScaleYUVToRGB32Row_C(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb_buf, + int width, + int source_dx); + + +#if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__) +#if defined(VISUALC_HAS_AVX2) +#define SIMD_ALIGNED(var) __declspec(align(32)) var +#else +#define SIMD_ALIGNED(var) __declspec(align(16)) var +#endif +#elif defined(__GNUC__) || defined(__clang__) +// Caveat GCC 4.2 to 4.7 have a known issue using vectors with const. +#if defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2) +#define SIMD_ALIGNED(var) var __attribute__((aligned(32))) +#else +#define SIMD_ALIGNED(var) var __attribute__((aligned(16))) +#endif +#else +#define SIMD_ALIGNED(var) var +#endif + +extern SIMD_ALIGNED(const int16_t kCoefficientsRgbY[768][4]); + +// x64 uses MMX2 (SSE) so emms is not required. +// Warning C4799: function has no EMMS instruction. +// EMMS() is slow and should be called by the calling function once per image. 
+#if defined(ARCH_CPU_X86) && !defined(ARCH_CPU_X86_64) +#if defined(_MSC_VER) +#define EMMS() __asm emms +#pragma warning(disable: 4799) +#else +#define EMMS() asm("emms") +#endif +#else +#define EMMS() ((void)0) +#endif + +} // extern "C" + +#endif // MEDIA_BASE_YUV_ROW_H_ diff --git a/gfx/ycbcr/yuv_row_arm.s b/gfx/ycbcr/yuv_row_arm.s new file mode 100644 index 0000000000..6a6c81beeb --- /dev/null +++ b/gfx/ycbcr/yuv_row_arm.s @@ -0,0 +1,304 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + + .arch armv7-a + .fpu neon +/* Allow to build on targets not supporting neon, and force the object file + * target to avoid bumping the final binary target */ + .object_arch armv4t + .text + .align + + .balign 64 +YCbCr42xToRGB565_DITHER03_CONSTS_NEON: + .short -14240 + .short -14240+384 + .short 8672 + .short 8672+192 + .short -17696 + .short -17696+384 + .byte 102 + .byte 25 + .byte 52 + .byte 129 +YCbCr42xToRGB565_DITHER12_CONSTS_NEON: + .short -14240+128 + .short -14240+256 + .short 8672+64 + .short 8672+128 + .short -17696+128 + .short -17696+256 + .byte 102 + .byte 25 + .byte 52 + .byte 129 +YCbCr42xToRGB565_DITHER21_CONSTS_NEON: + .short -14240+256 + .short -14240+128 + .short 8672+128 + .short 8672+64 + .short -17696+256 + .short -17696+128 + .byte 102 + .byte 25 + .byte 52 + .byte 129 +YCbCr42xToRGB565_DITHER30_CONSTS_NEON: + .short -14240+384 + .short -14240 + .short 8672+192 + .short 8672 + .short -17696+384 + .short -17696 + .byte 102 + .byte 25 + .byte 52 + .byte 129 + +@ void ScaleYCbCr42xToRGB565_BilinearY_Row_NEON( +@ yuv2rgb565_row_scale_bilinear_ctx *ctx, int dither); +@ +@ ctx = { +@ uint16_t *rgb_row; /*r0*/ +@ const uint8_t *y_row; /*r1*/ +@ const uint8_t *u_row; /*r2*/ +@ const uint8_t *v_row; /*r3*/ +@ int y_yweight; /*r4*/ +@ int y_pitch; /*r5*/ +@ int width; /*r6*/ +@ int source_x0_q16; 
/*r7*/ +@ int source_dx_q16; /*r8*/ +@ int source_uv_xoffs_q16; /*r9*/ +@ }; + .global ScaleYCbCr42xToRGB565_BilinearY_Row_NEON + .type ScaleYCbCr42xToRGB565_BilinearY_Row_NEON, %function + .balign 64 + .fnstart +ScaleYCbCr42xToRGB565_BilinearY_Row_NEON: + STMFD r13!,{r4-r9,r14} @ 8 words. + ADR r14,YCbCr42xToRGB565_DITHER03_CONSTS_NEON + VPUSH {Q4-Q7} @ 16 words. + ADD r14,r14,r1, LSL #4 @ Select the dither table to use + LDMIA r0, {r0-r9} + @ Set up image index registers. + ADD r12,r8, r8 + VMOV.I32 D16,#0 @ Q8 = < 2| 2| 0| 0>*source_dx_q16 + VDUP.32 D17,r12 + ADD r12,r12,r12 + VTRN.32 D16,D17 @ Q2 = < 2| 0| 2| 0>*source_dx_q16 + VDUP.32 D19,r12 @ Q9 = < 4| 4| ?| ?>*source_dx_q16 + ADD r12,r12,r12 + VDUP.32 Q0, r7 @ Q0 = < 1| 1| 1| 1>*source_x0_q16 + VADD.I32 D17,D17,D19 @ Q8 = < 6| 4| 2| 0>*source_dx_q16 + CMP r8, #0 @ If source_dx_q16 is negative... + VDUP.32 Q9, r12 @ Q9 = < 8| 8| 8| 8>*source_dx_q16 + ADDLT r7, r7, r8, LSL #4 @ Make r7 point to the end of the block + VADD.I32 Q0, Q0, Q8 @ Q0 = < 6| 4| 2| 0>*source_dx_q16+source_x0_q16 + SUBLT r7, r7, r8 @ (i.e., the lowest address we'll use) + VADD.I32 Q1, Q0, Q9 @ Q1 = <14|12|10| 8>*source_dx_q16+source_x0_q16 + VDUP.I32 Q9, r8 @ Q8 = < 1| 1| 1| 1>*source_dx_q16 + VADD.I32 Q2, Q0, Q9 @ Q2 = < 7| 5| 3| 1>*source_dx_q16+source_x0_q16 + VADD.I32 Q3, Q1, Q9 @ Q3 = <15|13|11| 9>*source_dx_q16+source_x0_q16 + VLD1.64 {D30,D31},[r14,:128] @ Load some constants + VMOV.I8 D28,#52 + VMOV.I8 D29,#129 + @ The basic idea here is to do aligned loads of a block of data and then + @ index into it using VTBL to extract the data from the source X + @ coordinate corresponding to each destination pixel. + @ This is significantly less code and significantly fewer cycles than doing + @ a series of single-lane loads, but it means that the X step between + @ pixels must be limited to 2.0 or less, otherwise we couldn't guarantee + @ that we could read 8 pixels from a single aligned 32-byte block of data. 
+ @ Q0...Q3 contain the 16.16 fixed-point X coordinates of each pixel, + @ separated into even pixels and odd pixels to make extracting offsets and + @ weights easier. + @ We then pull out two bytes from the middle of each coordinate: the top + @ byte corresponds to the integer part of the X coordinate, and the bottom + @ byte corresponds to the weight to use for bilinear blending. + @ These are separated out into different registers with VTRN. + @ Then by subtracting the integer X coordinate of the first pixel in the + @ data block we loaded, we produce an index register suitable for use by + @ VTBL. +s42xbily_neon_loop: + @ Load the Y' data. + MOV r12,r7, ASR #16 + VRSHRN.S32 D16,Q0, #8 + AND r12,r12,#~15 @ Read 16-byte aligned blocks + VDUP.I8 D20,r12 + ADD r12,r1, r12 @ r12 = y_row+(source_x&~7) + VRSHRN.S32 D17,Q1, #8 + PLD [r12,#64] + VLD1.64 {D8, D9, D10,D11},[r12,:128],r5 @ Load Y' top row + ADD r14,r7, r8, LSL #3 + VRSHRN.S32 D18,Q2, #8 + MOV r14,r14,ASR #16 + VRSHRN.S32 D19,Q3, #8 + AND r14,r14,#~15 @ Read 16-byte aligned blocks + VLD1.64 {D12,D13,D14,D15},[r12,:128] @ Load Y' bottom row + PLD [r12,#64] + VDUP.I8 D21,r14 + ADD r14,r1, r14 @ r14 = y_row+(source_x&~7) + VMOV.I8 Q13,#1 + PLD [r14,#64] + VTRN.8 Q8, Q9 @ Q8 = <wFwEwDwCwBwAw9w8w7w6w5w4w3w2w1w0> + @ Q9 = <xFxExDxCxBxAx9x8x7x6x5x4x3x2x1x0> + VSUB.S8 Q9, Q9, Q10 @ Make offsets relative to the data we loaded. 
+ @ First 8 Y' pixels + VTBL.8 D20,{D8, D9, D10,D11},D18 @ Index top row at source_x + VTBL.8 D24,{D12,D13,D14,D15},D18 @ Index bottom row at source_x + VADD.S8 Q13,Q9, Q13 @ Add 1 to source_x + VTBL.8 D22,{D8, D9, D10,D11},D26 @ Index top row at source_x+1 + VTBL.8 D26,{D12,D13,D14,D15},D26 @ Index bottom row at source_x+1 + @ Next 8 Y' pixels + VLD1.64 {D8, D9, D10,D11},[r14,:128],r5 @ Load Y' top row + VLD1.64 {D12,D13,D14,D15},[r14,:128] @ Load Y' bottom row + PLD [r14,#64] + VTBL.8 D21,{D8, D9, D10,D11},D19 @ Index top row at source_x + VTBL.8 D25,{D12,D13,D14,D15},D19 @ Index bottom row at source_x + VTBL.8 D23,{D8, D9, D10,D11},D27 @ Index top row at source_x+1 + VTBL.8 D27,{D12,D13,D14,D15},D27 @ Index bottom row at source_x+1 + @ Blend Y'. + VDUP.I16 Q9, r4 @ Load the y weights. + VSUBL.U8 Q4, D24,D20 @ Q5:Q4 = c-a + VSUBL.U8 Q5, D25,D21 + VSUBL.U8 Q6, D26,D22 @ Q7:Q6 = d-b + VSUBL.U8 Q7, D27,D23 + VMUL.S16 Q4, Q4, Q9 @ Q5:Q4 = (c-a)*yweight + VMUL.S16 Q5, Q5, Q9 + VMUL.S16 Q6, Q6, Q9 @ Q7:Q6 = (d-b)*yweight + VMUL.S16 Q7, Q7, Q9 + VMOVL.U8 Q12,D16 @ Promote the x weights to 16 bits. + VMOVL.U8 Q13,D17 @ Sadly, there's no VMULW. + VRSHRN.S16 D8, Q4, #8 @ Q4 = (c-a)*yweight+128>>8 + VRSHRN.S16 D9, Q5, #8 + VRSHRN.S16 D12,Q6, #8 @ Q6 = (d-b)*yweight+128>>8 + VRSHRN.S16 D13,Q7, #8 + VADD.I8 Q10,Q10,Q4 @ Q10 = a+((c-a)*yweight+128>>8) + VADD.I8 Q11,Q11,Q6 @ Q11 = b+((d-b)*yweight+128>>8) + VSUBL.U8 Q4, D22,D20 @ Q5:Q4 = b-a + VSUBL.U8 Q5, D23,D21 + VMUL.S16 Q4, Q4, Q12 @ Q5:Q4 = (b-a)*xweight + VMUL.S16 Q5, Q5, Q13 + VRSHRN.S16 D8, Q4, #8 @ Q4 = (b-a)*xweight+128>>8 + ADD r12,r7, r9 + VRSHRN.S16 D9, Q5, #8 + MOV r12,r12,ASR #17 + VADD.I8 Q8, Q10,Q4 @ Q8 = a+((b-a)*xweight+128>>8) + @ Start extracting the chroma x coordinates, and load Cb and Cr. 
+ AND r12,r12,#~15 @ Read 16-byte aligned blocks + VDUP.I32 Q9, r9 @ Q9 = source_uv_xoffs_q16 x 4 + ADD r14,r2, r12 + VADD.I32 Q10,Q0, Q9 + VLD1.64 {D8, D9, D10,D11},[r14,:128] @ Load Cb + PLD [r14,#64] + VADD.I32 Q11,Q1, Q9 + ADD r14,r3, r12 + VADD.I32 Q12,Q2, Q9 + VLD1.64 {D12,D13,D14,D15},[r14,:128] @ Load Cr + PLD [r14,#64] + VADD.I32 Q13,Q3, Q9 + VRSHRN.S32 D20,Q10,#9 @ Q10 = <xEwExCwCxAwAx8w8x6w6x4w4x2w2x0w0> + VRSHRN.S32 D21,Q11,#9 + VDUP.I8 Q9, r12 + VRSHRN.S32 D22,Q12,#9 @ Q11 = <xFwFxDwDxBwBx9w9x7w7x5w5x3w3x1w1> + VRSHRN.S32 D23,Q13,#9 + @ We don't actually need the x weights, but we get them for free. + @ Free ALU slot + VTRN.8 Q10,Q11 @ Q10 = <wFwEwDwCwBwAw9w8w7w6w5w4w3w2w1w0> + @ Free ALU slot @ Q11 = <xFxExDxCxBxAx9x8x7x6x5x4x3x2x1x0> + VSUB.S8 Q11,Q11,Q9 @ Make offsets relative to the data we loaded. + VTBL.8 D18,{D8, D9, D10,D11},D22 @ Index Cb at source_x + VMOV.I8 D24,#74 + VTBL.8 D19,{D8, D9, D10,D11},D23 + VMOV.I8 D26,#102 + VTBL.8 D20,{D12,D13,D14,D15},D22 @ Index Cr at source_x + VMOV.I8 D27,#25 + VTBL.8 D21,{D12,D13,D14,D15},D23 + @ We now have Y' in Q8, Cb in Q9, and Cr in Q10 + @ We use VDUP to expand constants, because it's a permute instruction, so + @ it can dual issue on the A8. 
+ SUBS r6, r6, #16 @ width -= 16 + VMULL.U8 Q4, D16,D24 @ Q5:Q4 = Y'*74 + VDUP.32 Q6, D30[1] @ Q7:Q6 = bias_G + VMULL.U8 Q5, D17,D24 + VDUP.32 Q7, D30[1] + VMLSL.U8 Q6, D18,D27 @ Q7:Q6 = -25*Cb+bias_G + VDUP.32 Q11,D30[0] @ Q12:Q11 = bias_R + VMLSL.U8 Q7, D19,D27 + VDUP.32 Q12,D30[0] + VMLAL.U8 Q11,D20,D26 @ Q12:Q11 = 102*Cr+bias_R + VDUP.32 Q8, D31[0] @ Q13:Q8 = bias_B + VMLAL.U8 Q12,D21,D26 + VDUP.32 Q13,D31[0] + VMLAL.U8 Q8, D18,D29 @ Q13:Q8 = 129*Cb+bias_B + VMLAL.U8 Q13,D19,D29 + VMLSL.U8 Q6, D20,D28 @ Q7:Q6 = -25*Cb-52*Cr+bias_G + VMLSL.U8 Q7, D21,D28 + VADD.S16 Q11,Q4, Q11 @ Q12:Q11 = 74*Y'+102*Cr+bias_R + VADD.S16 Q12,Q5, Q12 + VQADD.S16 Q8, Q4, Q8 @ Q13:Q8 = 74*Y'+129*Cr+bias_B + VQADD.S16 Q13,Q5, Q13 + VADD.S16 Q6, Q4, Q6 @ Q7:Q6 = 74*Y'-25*Cb-52*Cr+bias_G + VADD.S16 Q7, Q5, Q7 + @ Push each value to the top of its word and saturate it. + VQSHLU.S16 Q11,Q11,#2 + VQSHLU.S16 Q12,Q12,#2 + VQSHLU.S16 Q6, Q6, #2 + VQSHLU.S16 Q7, Q7, #2 + VQSHLU.S16 Q8, Q8, #2 + VQSHLU.S16 Q13,Q13,#2 + @ Merge G and B into R. + VSRI.U16 Q11,Q6, #5 + VSRI.U16 Q12,Q7, #5 + VSRI.U16 Q11,Q8, #11 + MOV r14,r8, LSL #4 + VSRI.U16 Q12,Q13,#11 + BLT s42xbily_neon_tail + VDUP.I32 Q13,r14 + @ Store the result. + VST1.16 {D22,D23,D24,D25},[r0]! + BEQ s42xbily_neon_done + @ Advance the x coordinates. + VADD.I32 Q0, Q0, Q13 + VADD.I32 Q1, Q1, Q13 + ADD r7, r14 + VADD.I32 Q2, Q2, Q13 + VADD.I32 Q3, Q3, Q13 + B s42xbily_neon_loop +s42xbily_neon_tail: + @ We have between 1 and 15 pixels left to write. + @ -r6 == the number of pixels we need to skip writing. + @ Adjust r0 to point to the last one we need to write, because we're going + @ to write them in reverse order. + ADD r0, r0, r6, LSL #1 + MOV r14,#-2 + ADD r0, r0, #30 + @ Skip past the ones we don't need to write. 
+ SUB PC, PC, r6, LSL #2 + ORR r0, r0, r0 + VST1.16 {D25[3]},[r0,:16],r14 + VST1.16 {D25[2]},[r0,:16],r14 + VST1.16 {D25[1]},[r0,:16],r14 + VST1.16 {D25[0]},[r0,:16],r14 + VST1.16 {D24[3]},[r0,:16],r14 + VST1.16 {D24[2]},[r0,:16],r14 + VST1.16 {D24[1]},[r0,:16],r14 + VST1.16 {D24[0]},[r0,:16],r14 + VST1.16 {D23[3]},[r0,:16],r14 + VST1.16 {D23[2]},[r0,:16],r14 + VST1.16 {D23[1]},[r0,:16],r14 + VST1.16 {D23[0]},[r0,:16],r14 + VST1.16 {D22[3]},[r0,:16],r14 + VST1.16 {D22[2]},[r0,:16],r14 + VST1.16 {D22[1]},[r0,:16],r14 + VST1.16 {D22[0]},[r0,:16] +s42xbily_neon_done: + VPOP {Q4-Q7} @ 16 words. + LDMFD r13!,{r4-r9,PC} @ 8 words. + .fnend + .size ScaleYCbCr42xToRGB565_BilinearY_Row_NEON, .-ScaleYCbCr42xToRGB565_BilinearY_Row_NEON + +#if defined(__ELF__)&&defined(__linux__) + .section .note.GNU-stack,"",%progbits +#endif diff --git a/gfx/ycbcr/yuv_row_c.cpp b/gfx/ycbcr/yuv_row_c.cpp new file mode 100644 index 0000000000..b986451ec2 --- /dev/null +++ b/gfx/ycbcr/yuv_row_c.cpp @@ -0,0 +1,133 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "yuv_row.h" + +#define DCHECK(a) + +extern "C" { + +// C reference code that mimic the YUV assembly. +#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x))) +#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \ + (((x) + (y)) > 32767 ? 
32767 : ((x) + (y)))) + +static inline void YuvPixel(uint8_t y, + uint8_t u, + uint8_t v, + uint8_t* rgb_buf) { + + int b = kCoefficientsRgbY[256+u][0]; + int g = kCoefficientsRgbY[256+u][1]; + int r = kCoefficientsRgbY[256+u][2]; + int a = kCoefficientsRgbY[256+u][3]; + + b = paddsw(b, kCoefficientsRgbY[512+v][0]); + g = paddsw(g, kCoefficientsRgbY[512+v][1]); + r = paddsw(r, kCoefficientsRgbY[512+v][2]); + a = paddsw(a, kCoefficientsRgbY[512+v][3]); + + b = paddsw(b, kCoefficientsRgbY[y][0]); + g = paddsw(g, kCoefficientsRgbY[y][1]); + r = paddsw(r, kCoefficientsRgbY[y][2]); + a = paddsw(a, kCoefficientsRgbY[y][3]); + + b >>= 6; + g >>= 6; + r >>= 6; + a >>= 6; + + *reinterpret_cast<uint32_t*>(rgb_buf) = (packuswb(b)) | + (packuswb(g) << 8) | + (packuswb(r) << 16) | + (packuswb(a) << 24); +} + +void FastConvertYUVToRGB32Row_C(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb_buf, + int width, + unsigned int x_shift) { + for (int x = 0; x < width; x += 2) { + uint8_t u = u_buf[x >> x_shift]; + uint8_t v = v_buf[x >> x_shift]; + uint8_t y0 = y_buf[x]; + YuvPixel(y0, u, v, rgb_buf); + if ((x + 1) < width) { + uint8_t y1 = y_buf[x + 1]; + if (x_shift == 0) { + u = u_buf[x + 1]; + v = v_buf[x + 1]; + } + YuvPixel(y1, u, v, rgb_buf + 4); + } + rgb_buf += 8; // Advance 2 pixels. + } +} + +// 16.16 fixed point is used. A shift by 16 isolates the integer. +// A shift by 17 is used to further subsample the chrominence channels. +// & 0xffff isolates the fixed point fraction. >> 2 to get the upper 2 bits, +// for 1/65536 pixel accurate interpolation. 
+void ScaleYUVToRGB32Row_C(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb_buf, + int width, + int source_dx) { + int x = 0; + for (int i = 0; i < width; i += 2) { + int y = y_buf[x >> 16]; + int u = u_buf[(x >> 17)]; + int v = v_buf[(x >> 17)]; + YuvPixel(y, u, v, rgb_buf); + x += source_dx; + if ((i + 1) < width) { + y = y_buf[x >> 16]; + YuvPixel(y, u, v, rgb_buf+4); + x += source_dx; + } + rgb_buf += 8; + } +} + +void LinearScaleYUVToRGB32Row_C(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb_buf, + int width, + int source_dx) { + int x = 0; + if (source_dx >= 0x20000) { + x = 32768; + } + for (int i = 0; i < width; i += 2) { + int y0 = y_buf[x >> 16]; + int y1 = y_buf[(x >> 16) + 1]; + int u0 = u_buf[(x >> 17)]; + int u1 = u_buf[(x >> 17) + 1]; + int v0 = v_buf[(x >> 17)]; + int v1 = v_buf[(x >> 17) + 1]; + int y_frac = (x & 65535); + int uv_frac = ((x >> 1) & 65535); + int y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16; + int u = (uv_frac * u1 + (uv_frac ^ 65535) * u0) >> 16; + int v = (uv_frac * v1 + (uv_frac ^ 65535) * v0) >> 16; + YuvPixel(y, u, v, rgb_buf); + x += source_dx; + if ((i + 1) < width) { + y0 = y_buf[x >> 16]; + y1 = y_buf[(x >> 16) + 1]; + y_frac = (x & 65535); + y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16; + YuvPixel(y, u, v, rgb_buf+4); + x += source_dx; + } + rgb_buf += 8; + } +} + +} // extern "C" + diff --git a/gfx/ycbcr/yuv_row_other.cpp b/gfx/ycbcr/yuv_row_other.cpp new file mode 100644 index 0000000000..437f90476d --- /dev/null +++ b/gfx/ycbcr/yuv_row_other.cpp @@ -0,0 +1,34 @@ +// Copyright (c) 2009 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "yuv_row.h" + +extern "C" { +void FastConvertYUVToRGB32Row(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb_buf, + int width) { + FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1); +} + +void ScaleYUVToRGB32Row(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb_buf, + int width, + int source_dx) { + ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); +} + +void LinearScaleYUVToRGB32Row(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb_buf, + int width, + int source_dx) { + LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); +} + +} diff --git a/gfx/ycbcr/yuv_row_posix.cpp b/gfx/ycbcr/yuv_row_posix.cpp new file mode 100644 index 0000000000..c5e55abe4c --- /dev/null +++ b/gfx/ycbcr/yuv_row_posix.cpp @@ -0,0 +1,914 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "yuv_row.h" +#include "mozilla/SSE.h" + +#define DCHECK(a) + +extern "C" { + +#if defined(ARCH_CPU_X86_64) + +// We don't need CPUID guards here, since x86-64 implies SSE2. + +// AMD64 ABI uses register paremters. 
+void FastConvertYUVToRGB32Row(const uint8_t* y_buf, // rdi + const uint8_t* u_buf, // rsi + const uint8_t* v_buf, // rdx + uint8_t* rgb_buf, // rcx + int width) { // r8 + asm volatile( + "jmp 1f\n" +"0:" + "movzb (%[u_buf]),%%r10\n" + "add $0x1,%[u_buf]\n" + "movzb (%[v_buf]),%%r11\n" + "add $0x1,%[v_buf]\n" + "movq 2048(%[kCoefficientsRgbY],%%r10,8),%%xmm0\n" + "movzb (%[y_buf]),%%r10\n" + "movq 4096(%[kCoefficientsRgbY],%%r11,8),%%xmm1\n" + "movzb 0x1(%[y_buf]),%%r11\n" + "paddsw %%xmm1,%%xmm0\n" + "movq (%[kCoefficientsRgbY],%%r10,8),%%xmm2\n" + "add $0x2,%[y_buf]\n" + "movq (%[kCoefficientsRgbY],%%r11,8),%%xmm3\n" + "paddsw %%xmm0,%%xmm2\n" + "paddsw %%xmm0,%%xmm3\n" + "shufps $0x44,%%xmm3,%%xmm2\n" + "psraw $0x6,%%xmm2\n" + "packuswb %%xmm2,%%xmm2\n" + "movq %%xmm2,0x0(%[rgb_buf])\n" + "add $0x8,%[rgb_buf]\n" +"1:" + "sub $0x2,%[width]\n" + "jns 0b\n" + +"2:" + "add $0x1,%[width]\n" + "js 3f\n" + + "movzb (%[u_buf]),%%r10\n" + "movq 2048(%[kCoefficientsRgbY],%%r10,8),%%xmm0\n" + "movzb (%[v_buf]),%%r10\n" + "movq 4096(%[kCoefficientsRgbY],%%r10,8),%%xmm1\n" + "paddsw %%xmm1,%%xmm0\n" + "movzb (%[y_buf]),%%r10\n" + "movq (%[kCoefficientsRgbY],%%r10,8),%%xmm1\n" + "paddsw %%xmm0,%%xmm1\n" + "psraw $0x6,%%xmm1\n" + "packuswb %%xmm1,%%xmm1\n" + "movd %%xmm1,0x0(%[rgb_buf])\n" +"3:" + : [y_buf] "+r"(y_buf), + [u_buf] "+r"(u_buf), + [v_buf] "+r"(v_buf), + [rgb_buf] "+r"(rgb_buf), + [width] "+r"(width) + : [kCoefficientsRgbY] "r" (kCoefficientsRgbY) + : "cc", "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3" +); +} + +void ScaleYUVToRGB32Row(const uint8_t* y_buf, // rdi + const uint8_t* u_buf, // rsi + const uint8_t* v_buf, // rdx + uint8_t* rgb_buf, // rcx + int width, // r8 + int source_dx) { // r9 + asm volatile( + "xor %%r11,%%r11\n" + "sub $0x2,%[width]\n" + "js 1f\n" + +"0:" + "mov %%r11,%%r10\n" + "sar $0x11,%%r10\n" + "movzb (%[u_buf],%%r10,1),%%rax\n" + "movq 2048(%[kCoefficientsRgbY],%%rax,8),%%xmm0\n" + "movzb (%[v_buf],%%r10,1),%%rax\n" + "movq 
4096(%[kCoefficientsRgbY],%%rax,8),%%xmm1\n" + "lea (%%r11,%[source_dx]),%%r10\n" + "sar $0x10,%%r11\n" + "movzb (%[y_buf],%%r11,1),%%rax\n" + "paddsw %%xmm1,%%xmm0\n" + "movq (%[kCoefficientsRgbY],%%rax,8),%%xmm1\n" + "lea (%%r10,%[source_dx]),%%r11\n" + "sar $0x10,%%r10\n" + "movzb (%[y_buf],%%r10,1),%%rax\n" + "movq (%[kCoefficientsRgbY],%%rax,8),%%xmm2\n" + "paddsw %%xmm0,%%xmm1\n" + "paddsw %%xmm0,%%xmm2\n" + "shufps $0x44,%%xmm2,%%xmm1\n" + "psraw $0x6,%%xmm1\n" + "packuswb %%xmm1,%%xmm1\n" + "movq %%xmm1,0x0(%[rgb_buf])\n" + "add $0x8,%[rgb_buf]\n" + "sub $0x2,%[width]\n" + "jns 0b\n" + +"1:" + "add $0x1,%[width]\n" + "js 2f\n" + + "mov %%r11,%%r10\n" + "sar $0x11,%%r10\n" + "movzb (%[u_buf],%%r10,1),%%rax\n" + "movq 2048(%[kCoefficientsRgbY],%%rax,8),%%xmm0\n" + "movzb (%[v_buf],%%r10,1),%%rax\n" + "movq 4096(%[kCoefficientsRgbY],%%rax,8),%%xmm1\n" + "paddsw %%xmm1,%%xmm0\n" + "sar $0x10,%%r11\n" + "movzb (%[y_buf],%%r11,1),%%rax\n" + "movq (%[kCoefficientsRgbY],%%rax,8),%%xmm1\n" + "paddsw %%xmm0,%%xmm1\n" + "psraw $0x6,%%xmm1\n" + "packuswb %%xmm1,%%xmm1\n" + "movd %%xmm1,0x0(%[rgb_buf])\n" + +"2:" + : [rgb_buf] "+r"(rgb_buf), + [width] "+r"(width) + : [y_buf] "r"(y_buf), + [u_buf] "r"(u_buf), + [v_buf] "r"(v_buf), + [kCoefficientsRgbY] "r" (kCoefficientsRgbY), + [source_dx] "r"(static_cast<long>(source_dx)) + : "cc", "memory", "r10", "r11", "rax", "xmm0", "xmm1", "xmm2" +); +} + +void LinearScaleYUVToRGB32Row(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb_buf, + int width, + int source_dx) { + asm volatile( + "xor %%r11,%%r11\n" // x = 0 + "sub $0x2,%[width]\n" + "js 2f\n" + "cmp $0x20000,%[source_dx]\n" // if source_dx >= 2.0 + "jl 0f\n" + "mov $0x8000,%%r11\n" // x = 0.5 for 1/2 or less +"0:" + +"1:" + "mov %%r11,%%r10\n" + "sar $0x11,%%r10\n" + + "movzb (%[u_buf], %%r10, 1), %%r13 \n" + "movzb 1(%[u_buf], %%r10, 1), %%r14 \n" + "mov %%r11, %%rax \n" + "and $0x1fffe, %%rax \n" + "imul %%rax, %%r14 \n" + "xor 
$0x1fffe, %%rax \n" + "imul %%rax, %%r13 \n" + "add %%r14, %%r13 \n" + "shr $17, %%r13 \n" + "movq 2048(%[kCoefficientsRgbY],%%r13,8), %%xmm0\n" + + "movzb (%[v_buf], %%r10, 1), %%r13 \n" + "movzb 1(%[v_buf], %%r10, 1), %%r14 \n" + "mov %%r11, %%rax \n" + "and $0x1fffe, %%rax \n" + "imul %%rax, %%r14 \n" + "xor $0x1fffe, %%rax \n" + "imul %%rax, %%r13 \n" + "add %%r14, %%r13 \n" + "shr $17, %%r13 \n" + "movq 4096(%[kCoefficientsRgbY],%%r13,8), %%xmm1\n" + + "mov %%r11, %%rax \n" + "lea (%%r11,%[source_dx]),%%r10\n" + "sar $0x10,%%r11\n" + "paddsw %%xmm1,%%xmm0\n" + + "movzb (%[y_buf], %%r11, 1), %%r13 \n" + "movzb 1(%[y_buf], %%r11, 1), %%r14 \n" + "and $0xffff, %%rax \n" + "imul %%rax, %%r14 \n" + "xor $0xffff, %%rax \n" + "imul %%rax, %%r13 \n" + "add %%r14, %%r13 \n" + "shr $16, %%r13 \n" + "movq (%[kCoefficientsRgbY],%%r13,8),%%xmm1\n" + + "mov %%r10, %%rax \n" + "lea (%%r10,%[source_dx]),%%r11\n" + "sar $0x10,%%r10\n" + + "movzb (%[y_buf],%%r10,1), %%r13 \n" + "movzb 1(%[y_buf],%%r10,1), %%r14 \n" + "and $0xffff, %%rax \n" + "imul %%rax, %%r14 \n" + "xor $0xffff, %%rax \n" + "imul %%rax, %%r13 \n" + "add %%r14, %%r13 \n" + "shr $16, %%r13 \n" + "movq (%[kCoefficientsRgbY],%%r13,8),%%xmm2\n" + + "paddsw %%xmm0,%%xmm1\n" + "paddsw %%xmm0,%%xmm2\n" + "shufps $0x44,%%xmm2,%%xmm1\n" + "psraw $0x6,%%xmm1\n" + "packuswb %%xmm1,%%xmm1\n" + "movq %%xmm1,0x0(%[rgb_buf])\n" + "add $0x8,%[rgb_buf]\n" + "sub $0x2,%[width]\n" + "jns 1b\n" + +"2:" + "add $0x1,%[width]\n" + "js 3f\n" + + "mov %%r11,%%r10\n" + "sar $0x11,%%r10\n" + + "movzb (%[u_buf],%%r10,1), %%r13 \n" + "movq 2048(%[kCoefficientsRgbY],%%r13,8),%%xmm0\n" + + "movzb (%[v_buf],%%r10,1), %%r13 \n" + "movq 4096(%[kCoefficientsRgbY],%%r13,8),%%xmm1\n" + + "paddsw %%xmm1,%%xmm0\n" + "sar $0x10,%%r11\n" + + "movzb (%[y_buf],%%r11,1), %%r13 \n" + "movq (%[kCoefficientsRgbY],%%r13,8),%%xmm1\n" + + "paddsw %%xmm0,%%xmm1\n" + "psraw $0x6,%%xmm1\n" + "packuswb %%xmm1,%%xmm1\n" + "movd %%xmm1,0x0(%[rgb_buf])\n" + +"3:" + 
: [rgb_buf] "+r"(rgb_buf), + [width] "+r"(width) + : [y_buf] "r"(y_buf), + [u_buf] "r"(u_buf), + [v_buf] "r"(v_buf), + [kCoefficientsRgbY] "r" (kCoefficientsRgbY), + [source_dx] "r"(static_cast<long>(source_dx)) + : "cc", "memory", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2" +); +} + +#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && !defined(__PIC__) + +// PIC version is slower because less registers are available, so +// non-PIC is used on platforms where it is possible. +void FastConvertYUVToRGB32Row_SSE(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb_buf, + int width); + asm( + ".text\n" + ".global FastConvertYUVToRGB32Row_SSE\n" + ".type FastConvertYUVToRGB32Row_SSE, @function\n" +"FastConvertYUVToRGB32Row_SSE:\n" + "pusha\n" + "mov 0x24(%esp),%edx\n" + "mov 0x28(%esp),%edi\n" + "mov 0x2c(%esp),%esi\n" + "mov 0x30(%esp),%ebp\n" + "mov 0x34(%esp),%ecx\n" + "jmp 1f\n" + +"0:" + "movzbl (%edi),%eax\n" + "add $0x1,%edi\n" + "movzbl (%esi),%ebx\n" + "add $0x1,%esi\n" + "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" + "movzbl (%edx),%eax\n" + "paddsw kCoefficientsRgbY+4096(,%ebx,8),%mm0\n" + "movzbl 0x1(%edx),%ebx\n" + "movq kCoefficientsRgbY(,%eax,8),%mm1\n" + "add $0x2,%edx\n" + "movq kCoefficientsRgbY(,%ebx,8),%mm2\n" + "paddsw %mm0,%mm1\n" + "paddsw %mm0,%mm2\n" + "psraw $0x6,%mm1\n" + "psraw $0x6,%mm2\n" + "packuswb %mm2,%mm1\n" + "movntq %mm1,0x0(%ebp)\n" + "add $0x8,%ebp\n" +"1:" + "sub $0x2,%ecx\n" + "jns 0b\n" + + "and $0x1,%ecx\n" + "je 2f\n" + + "movzbl (%edi),%eax\n" + "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" + "movzbl (%esi),%eax\n" + "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n" + "movzbl (%edx),%eax\n" + "movq kCoefficientsRgbY(,%eax,8),%mm1\n" + "paddsw %mm0,%mm1\n" + "psraw $0x6,%mm1\n" + "packuswb %mm1,%mm1\n" + "movd %mm1,0x0(%ebp)\n" +"2:" + "popa\n" + "ret\n" +#if !defined(XP_MACOSX) + ".previous\n" +#endif +); + +void FastConvertYUVToRGB32Row(const uint8_t* y_buf, + 
const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb_buf, + int width) +{ + if (mozilla::supports_sse()) { + FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width); + return; + } + + FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1); +} + + +void ScaleYUVToRGB32Row_SSE(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb_buf, + int width, + int source_dx); + asm( + ".text\n" + ".global ScaleYUVToRGB32Row_SSE\n" + ".type ScaleYUVToRGB32Row_SSE, @function\n" +"ScaleYUVToRGB32Row_SSE:\n" + "pusha\n" + "mov 0x24(%esp),%edx\n" + "mov 0x28(%esp),%edi\n" + "mov 0x2c(%esp),%esi\n" + "mov 0x30(%esp),%ebp\n" + "mov 0x34(%esp),%ecx\n" + "xor %ebx,%ebx\n" + "jmp 1f\n" + +"0:" + "mov %ebx,%eax\n" + "sar $0x11,%eax\n" + "movzbl (%edi,%eax,1),%eax\n" + "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" + "mov %ebx,%eax\n" + "sar $0x11,%eax\n" + "movzbl (%esi,%eax,1),%eax\n" + "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n" + "mov %ebx,%eax\n" + "add 0x38(%esp),%ebx\n" + "sar $0x10,%eax\n" + "movzbl (%edx,%eax,1),%eax\n" + "movq kCoefficientsRgbY(,%eax,8),%mm1\n" + "mov %ebx,%eax\n" + "add 0x38(%esp),%ebx\n" + "sar $0x10,%eax\n" + "movzbl (%edx,%eax,1),%eax\n" + "movq kCoefficientsRgbY(,%eax,8),%mm2\n" + "paddsw %mm0,%mm1\n" + "paddsw %mm0,%mm2\n" + "psraw $0x6,%mm1\n" + "psraw $0x6,%mm2\n" + "packuswb %mm2,%mm1\n" + "movntq %mm1,0x0(%ebp)\n" + "add $0x8,%ebp\n" +"1:" + "sub $0x2,%ecx\n" + "jns 0b\n" + + "and $0x1,%ecx\n" + "je 2f\n" + + "mov %ebx,%eax\n" + "sar $0x11,%eax\n" + "movzbl (%edi,%eax,1),%eax\n" + "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" + "mov %ebx,%eax\n" + "sar $0x11,%eax\n" + "movzbl (%esi,%eax,1),%eax\n" + "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n" + "mov %ebx,%eax\n" + "sar $0x10,%eax\n" + "movzbl (%edx,%eax,1),%eax\n" + "movq kCoefficientsRgbY(,%eax,8),%mm1\n" + "paddsw %mm0,%mm1\n" + "psraw $0x6,%mm1\n" + "packuswb %mm1,%mm1\n" + "movd %mm1,0x0(%ebp)\n" + +"2:" + "popa\n" + "ret\n" +#if 
!defined(XP_MACOSX) + ".previous\n" +#endif +); + +void ScaleYUVToRGB32Row(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb_buf, + int width, + int source_dx) +{ + if (mozilla::supports_sse()) { + ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, + width, source_dx); + return; + } + + ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, + width, source_dx); +} + +void LinearScaleYUVToRGB32Row_SSE(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb_buf, + int width, + int source_dx); + asm( + ".text\n" + ".global LinearScaleYUVToRGB32Row_SSE\n" + ".type LinearScaleYUVToRGB32Row_SSE, @function\n" +"LinearScaleYUVToRGB32Row_SSE:\n" + "pusha\n" + "mov 0x24(%esp),%edx\n" + "mov 0x28(%esp),%edi\n" + "mov 0x30(%esp),%ebp\n" + + // source_width = width * source_dx + ebx + "mov 0x34(%esp), %ecx\n" + "imull 0x38(%esp), %ecx\n" + "mov %ecx, 0x34(%esp)\n" + + "mov 0x38(%esp), %ecx\n" + "xor %ebx,%ebx\n" // x = 0 + "cmp $0x20000,%ecx\n" // if source_dx >= 2.0 + "jl 1f\n" + "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less + "jmp 1f\n" + +"0:" + "mov %ebx,%eax\n" + "sar $0x11,%eax\n" + + "movzbl (%edi,%eax,1),%ecx\n" + "movzbl 1(%edi,%eax,1),%esi\n" + "mov %ebx,%eax\n" + "andl $0x1fffe, %eax \n" + "imul %eax, %esi \n" + "xorl $0x1fffe, %eax \n" + "imul %eax, %ecx \n" + "addl %esi, %ecx \n" + "shrl $17, %ecx \n" + "movq kCoefficientsRgbY+2048(,%ecx,8),%mm0\n" + + "mov 0x2c(%esp),%esi\n" + "mov %ebx,%eax\n" + "sar $0x11,%eax\n" + + "movzbl (%esi,%eax,1),%ecx\n" + "movzbl 1(%esi,%eax,1),%esi\n" + "mov %ebx,%eax\n" + "andl $0x1fffe, %eax \n" + "imul %eax, %esi \n" + "xorl $0x1fffe, %eax \n" + "imul %eax, %ecx \n" + "addl %esi, %ecx \n" + "shrl $17, %ecx \n" + "paddsw kCoefficientsRgbY+4096(,%ecx,8),%mm0\n" + + "mov %ebx,%eax\n" + "sar $0x10,%eax\n" + "movzbl (%edx,%eax,1),%ecx\n" + "movzbl 1(%edx,%eax,1),%esi\n" + "mov %ebx,%eax\n" + "add 0x38(%esp),%ebx\n" + "andl $0xffff, %eax \n" + "imul %eax, %esi \n" + "xorl 
$0xffff, %eax \n" + "imul %eax, %ecx \n" + "addl %esi, %ecx \n" + "shrl $16, %ecx \n" + "movq kCoefficientsRgbY(,%ecx,8),%mm1\n" + + "cmp 0x34(%esp), %ebx\n" + "jge 2f\n" + + "mov %ebx,%eax\n" + "sar $0x10,%eax\n" + "movzbl (%edx,%eax,1),%ecx\n" + "movzbl 1(%edx,%eax,1),%esi\n" + "mov %ebx,%eax\n" + "add 0x38(%esp),%ebx\n" + "andl $0xffff, %eax \n" + "imul %eax, %esi \n" + "xorl $0xffff, %eax \n" + "imul %eax, %ecx \n" + "addl %esi, %ecx \n" + "shrl $16, %ecx \n" + "movq kCoefficientsRgbY(,%ecx,8),%mm2\n" + + "paddsw %mm0,%mm1\n" + "paddsw %mm0,%mm2\n" + "psraw $0x6,%mm1\n" + "psraw $0x6,%mm2\n" + "packuswb %mm2,%mm1\n" + "movntq %mm1,0x0(%ebp)\n" + "add $0x8,%ebp\n" + +"1:" + "cmp 0x34(%esp), %ebx\n" + "jl 0b\n" + "popa\n" + "ret\n" + +"2:" + "paddsw %mm0, %mm1\n" + "psraw $6, %mm1\n" + "packuswb %mm1, %mm1\n" + "movd %mm1, (%ebp)\n" + "popa\n" + "ret\n" +#if !defined(XP_MACOSX) + ".previous\n" +#endif +); + +void LinearScaleYUVToRGB32Row(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb_buf, + int width, + int source_dx) +{ + if (mozilla::supports_sse()) { + LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, + width, source_dx); + return; + } + + LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, + width, source_dx); +} + +#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && defined(__PIC__) + +void PICConvertYUVToRGB32Row_SSE(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb_buf, + int width, + const int16_t *kCoefficientsRgbY); + + asm( + ".text\n" +#if defined(XP_MACOSX) +"_PICConvertYUVToRGB32Row_SSE:\n" +#else +"PICConvertYUVToRGB32Row_SSE:\n" +#endif + "pusha\n" + "mov 0x24(%esp),%edx\n" + "mov 0x28(%esp),%edi\n" + "mov 0x2c(%esp),%esi\n" + "mov 0x30(%esp),%ebp\n" + "mov 0x38(%esp),%ecx\n" + + "jmp 1f\n" + +"0:" + "movzbl (%edi),%eax\n" + "add $0x1,%edi\n" + "movzbl (%esi),%ebx\n" + "add $0x1,%esi\n" + "movq 2048(%ecx,%eax,8),%mm0\n" + "movzbl (%edx),%eax\n" 
+ "paddsw 4096(%ecx,%ebx,8),%mm0\n" + "movzbl 0x1(%edx),%ebx\n" + "movq 0(%ecx,%eax,8),%mm1\n" + "add $0x2,%edx\n" + "movq 0(%ecx,%ebx,8),%mm2\n" + "paddsw %mm0,%mm1\n" + "paddsw %mm0,%mm2\n" + "psraw $0x6,%mm1\n" + "psraw $0x6,%mm2\n" + "packuswb %mm2,%mm1\n" + "movntq %mm1,0x0(%ebp)\n" + "add $0x8,%ebp\n" +"1:" + "subl $0x2,0x34(%esp)\n" + "jns 0b\n" + + "andl $0x1,0x34(%esp)\n" + "je 2f\n" + + "movzbl (%edi),%eax\n" + "movq 2048(%ecx,%eax,8),%mm0\n" + "movzbl (%esi),%eax\n" + "paddsw 4096(%ecx,%eax,8),%mm0\n" + "movzbl (%edx),%eax\n" + "movq 0(%ecx,%eax,8),%mm1\n" + "paddsw %mm0,%mm1\n" + "psraw $0x6,%mm1\n" + "packuswb %mm1,%mm1\n" + "movd %mm1,0x0(%ebp)\n" +"2:" + "popa\n" + "ret\n" +#if !defined(XP_MACOSX) + ".previous\n" +#endif +); + +void FastConvertYUVToRGB32Row(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb_buf, + int width) +{ + if (mozilla::supports_sse()) { + PICConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, + &kCoefficientsRgbY[0][0]); + return; + } + + FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1); +} + +void PICScaleYUVToRGB32Row_SSE(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb_buf, + int width, + int source_dx, + const int16_t *kCoefficientsRgbY); + + asm( + ".text\n" +#if defined(XP_MACOSX) +"_PICScaleYUVToRGB32Row_SSE:\n" +#else +"PICScaleYUVToRGB32Row_SSE:\n" +#endif + "pusha\n" + "mov 0x24(%esp),%edx\n" + "mov 0x28(%esp),%edi\n" + "mov 0x2c(%esp),%esi\n" + "mov 0x30(%esp),%ebp\n" + "mov 0x3c(%esp),%ecx\n" + "xor %ebx,%ebx\n" + "jmp 1f\n" + +"0:" + "mov %ebx,%eax\n" + "sar $0x11,%eax\n" + "movzbl (%edi,%eax,1),%eax\n" + "movq 2048(%ecx,%eax,8),%mm0\n" + "mov %ebx,%eax\n" + "sar $0x11,%eax\n" + "movzbl (%esi,%eax,1),%eax\n" + "paddsw 4096(%ecx,%eax,8),%mm0\n" + "mov %ebx,%eax\n" + "add 0x38(%esp),%ebx\n" + "sar $0x10,%eax\n" + "movzbl (%edx,%eax,1),%eax\n" + "movq 0(%ecx,%eax,8),%mm1\n" + "mov %ebx,%eax\n" + "add 0x38(%esp),%ebx\n" + 
"sar $0x10,%eax\n" + "movzbl (%edx,%eax,1),%eax\n" + "movq 0(%ecx,%eax,8),%mm2\n" + "paddsw %mm0,%mm1\n" + "paddsw %mm0,%mm2\n" + "psraw $0x6,%mm1\n" + "psraw $0x6,%mm2\n" + "packuswb %mm2,%mm1\n" + "movntq %mm1,0x0(%ebp)\n" + "add $0x8,%ebp\n" +"1:" + "subl $0x2,0x34(%esp)\n" + "jns 0b\n" + + "andl $0x1,0x34(%esp)\n" + "je 2f\n" + + "mov %ebx,%eax\n" + "sar $0x11,%eax\n" + "movzbl (%edi,%eax,1),%eax\n" + "movq 2048(%ecx,%eax,8),%mm0\n" + "mov %ebx,%eax\n" + "sar $0x11,%eax\n" + "movzbl (%esi,%eax,1),%eax\n" + "paddsw 4096(%ecx,%eax,8),%mm0\n" + "mov %ebx,%eax\n" + "sar $0x10,%eax\n" + "movzbl (%edx,%eax,1),%eax\n" + "movq 0(%ecx,%eax,8),%mm1\n" + "paddsw %mm0,%mm1\n" + "psraw $0x6,%mm1\n" + "packuswb %mm1,%mm1\n" + "movd %mm1,0x0(%ebp)\n" + +"2:" + "popa\n" + "ret\n" +#if !defined(XP_MACOSX) + ".previous\n" +#endif +); + +void ScaleYUVToRGB32Row(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb_buf, + int width, + int source_dx) +{ + if (mozilla::supports_sse()) { + PICScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx, + &kCoefficientsRgbY[0][0]); + return; + } + + ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); +} + +void PICLinearScaleYUVToRGB32Row_SSE(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb_buf, + int width, + int source_dx, + const int16_t *kCoefficientsRgbY); + + asm( + ".text\n" +#if defined(XP_MACOSX) +"_PICLinearScaleYUVToRGB32Row_SSE:\n" +#else +"PICLinearScaleYUVToRGB32Row_SSE:\n" +#endif + "pusha\n" + "mov 0x24(%esp),%edx\n" + "mov 0x30(%esp),%ebp\n" + "mov 0x34(%esp),%ecx\n" + "mov 0x3c(%esp),%edi\n" + "xor %ebx,%ebx\n" + + // source_width = width * source_dx + ebx + "mov 0x34(%esp), %ecx\n" + "imull 0x38(%esp), %ecx\n" + "mov %ecx, 0x34(%esp)\n" + + "mov 0x38(%esp), %ecx\n" + "xor %ebx,%ebx\n" // x = 0 + "cmp $0x20000,%ecx\n" // if source_dx >= 2.0 + "jl 1f\n" + "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less + "jmp 1f\n" + +"0:" + 
"mov 0x28(%esp),%esi\n" + "mov %ebx,%eax\n" + "sar $0x11,%eax\n" + + "movzbl (%esi,%eax,1),%ecx\n" + "movzbl 1(%esi,%eax,1),%esi\n" + "mov %ebx,%eax\n" + "andl $0x1fffe, %eax \n" + "imul %eax, %esi \n" + "xorl $0x1fffe, %eax \n" + "imul %eax, %ecx \n" + "addl %esi, %ecx \n" + "shrl $17, %ecx \n" + "movq 2048(%edi,%ecx,8),%mm0\n" + + "mov 0x2c(%esp),%esi\n" + "mov %ebx,%eax\n" + "sar $0x11,%eax\n" + + "movzbl (%esi,%eax,1),%ecx\n" + "movzbl 1(%esi,%eax,1),%esi\n" + "mov %ebx,%eax\n" + "andl $0x1fffe, %eax \n" + "imul %eax, %esi \n" + "xorl $0x1fffe, %eax \n" + "imul %eax, %ecx \n" + "addl %esi, %ecx \n" + "shrl $17, %ecx \n" + "paddsw 4096(%edi,%ecx,8),%mm0\n" + + "mov %ebx,%eax\n" + "sar $0x10,%eax\n" + "movzbl (%edx,%eax,1),%ecx\n" + "movzbl 1(%edx,%eax,1),%esi\n" + "mov %ebx,%eax\n" + "add 0x38(%esp),%ebx\n" + "andl $0xffff, %eax \n" + "imul %eax, %esi \n" + "xorl $0xffff, %eax \n" + "imul %eax, %ecx \n" + "addl %esi, %ecx \n" + "shrl $16, %ecx \n" + "movq (%edi,%ecx,8),%mm1\n" + + "cmp 0x34(%esp), %ebx\n" + "jge 2f\n" + + "mov %ebx,%eax\n" + "sar $0x10,%eax\n" + "movzbl (%edx,%eax,1),%ecx\n" + "movzbl 1(%edx,%eax,1),%esi\n" + "mov %ebx,%eax\n" + "add 0x38(%esp),%ebx\n" + "andl $0xffff, %eax \n" + "imul %eax, %esi \n" + "xorl $0xffff, %eax \n" + "imul %eax, %ecx \n" + "addl %esi, %ecx \n" + "shrl $16, %ecx \n" + "movq (%edi,%ecx,8),%mm2\n" + + "paddsw %mm0,%mm1\n" + "paddsw %mm0,%mm2\n" + "psraw $0x6,%mm1\n" + "psraw $0x6,%mm2\n" + "packuswb %mm2,%mm1\n" + "movntq %mm1,0x0(%ebp)\n" + "add $0x8,%ebp\n" + +"1:" + "cmp %ebx, 0x34(%esp)\n" + "jg 0b\n" + "popa\n" + "ret\n" + +"2:" + "paddsw %mm0, %mm1\n" + "psraw $6, %mm1\n" + "packuswb %mm1, %mm1\n" + "movd %mm1, (%ebp)\n" + "popa\n" + "ret\n" +#if !defined(XP_MACOSX) + ".previous\n" +#endif +); + + +void LinearScaleYUVToRGB32Row(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb_buf, + int width, + int source_dx) +{ + if (mozilla::supports_sse()) { + 
PICLinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, + source_dx, &kCoefficientsRgbY[0][0]); + return; + } + + LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); +} +#else +void FastConvertYUVToRGB32Row(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb_buf, + int width) { + FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1); +} + +void ScaleYUVToRGB32Row(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb_buf, + int width, + int source_dx) { + ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); +} + +void LinearScaleYUVToRGB32Row(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb_buf, + int width, + int source_dx) { + LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); +} +#endif + +} diff --git a/gfx/ycbcr/yuv_row_table.cpp b/gfx/ycbcr/yuv_row_table.cpp new file mode 100644 index 0000000000..a916ffde57 --- /dev/null +++ b/gfx/ycbcr/yuv_row_table.cpp @@ -0,0 +1,233 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "yuv_row.h" + +extern "C" { + +#define RGBY(i) { \ + static_cast<int16_t>(1.164 * 64 * (i - 16) + 0.5), \ + static_cast<int16_t>(1.164 * 64 * (i - 16) + 0.5), \ + static_cast<int16_t>(1.164 * 64 * (i - 16) + 0.5), \ + 0 \ +} + +#define RGBU(i) { \ + static_cast<int16_t>(2.018 * 64 * (i - 128) + 0.5), \ + static_cast<int16_t>(-0.391 * 64 * (i - 128) + 0.5), \ + 0, \ + static_cast<int16_t>(256 * 64 - 1) \ +} + +#define RGBV(i) { \ + 0, \ + static_cast<int16_t>(-0.813 * 64 * (i - 128) + 0.5), \ + static_cast<int16_t>(1.596 * 64 * (i - 128) + 0.5), \ + 0 \ +} + +SIMD_ALIGNED(const int16_t kCoefficientsRgbY[256 * 3][4]) = { + RGBY(0x00), RGBY(0x01), RGBY(0x02), RGBY(0x03), + RGBY(0x04), RGBY(0x05), RGBY(0x06), RGBY(0x07), + RGBY(0x08), RGBY(0x09), RGBY(0x0A), RGBY(0x0B), + RGBY(0x0C), RGBY(0x0D), RGBY(0x0E), RGBY(0x0F), + RGBY(0x10), RGBY(0x11), RGBY(0x12), RGBY(0x13), + RGBY(0x14), RGBY(0x15), RGBY(0x16), RGBY(0x17), + RGBY(0x18), RGBY(0x19), RGBY(0x1A), RGBY(0x1B), + RGBY(0x1C), RGBY(0x1D), RGBY(0x1E), RGBY(0x1F), + RGBY(0x20), RGBY(0x21), RGBY(0x22), RGBY(0x23), + RGBY(0x24), RGBY(0x25), RGBY(0x26), RGBY(0x27), + RGBY(0x28), RGBY(0x29), RGBY(0x2A), RGBY(0x2B), + RGBY(0x2C), RGBY(0x2D), RGBY(0x2E), RGBY(0x2F), + RGBY(0x30), RGBY(0x31), RGBY(0x32), RGBY(0x33), + RGBY(0x34), RGBY(0x35), RGBY(0x36), RGBY(0x37), + RGBY(0x38), RGBY(0x39), RGBY(0x3A), RGBY(0x3B), + RGBY(0x3C), RGBY(0x3D), RGBY(0x3E), RGBY(0x3F), + RGBY(0x40), RGBY(0x41), RGBY(0x42), RGBY(0x43), + RGBY(0x44), RGBY(0x45), RGBY(0x46), RGBY(0x47), + RGBY(0x48), RGBY(0x49), RGBY(0x4A), RGBY(0x4B), + RGBY(0x4C), RGBY(0x4D), RGBY(0x4E), RGBY(0x4F), + RGBY(0x50), RGBY(0x51), RGBY(0x52), RGBY(0x53), + RGBY(0x54), RGBY(0x55), RGBY(0x56), RGBY(0x57), + RGBY(0x58), RGBY(0x59), RGBY(0x5A), RGBY(0x5B), + RGBY(0x5C), RGBY(0x5D), RGBY(0x5E), RGBY(0x5F), + RGBY(0x60), RGBY(0x61), RGBY(0x62), RGBY(0x63), + RGBY(0x64), RGBY(0x65), RGBY(0x66), RGBY(0x67), + RGBY(0x68), RGBY(0x69), RGBY(0x6A), RGBY(0x6B), + 
RGBY(0x6C), RGBY(0x6D), RGBY(0x6E), RGBY(0x6F), + RGBY(0x70), RGBY(0x71), RGBY(0x72), RGBY(0x73), + RGBY(0x74), RGBY(0x75), RGBY(0x76), RGBY(0x77), + RGBY(0x78), RGBY(0x79), RGBY(0x7A), RGBY(0x7B), + RGBY(0x7C), RGBY(0x7D), RGBY(0x7E), RGBY(0x7F), + RGBY(0x80), RGBY(0x81), RGBY(0x82), RGBY(0x83), + RGBY(0x84), RGBY(0x85), RGBY(0x86), RGBY(0x87), + RGBY(0x88), RGBY(0x89), RGBY(0x8A), RGBY(0x8B), + RGBY(0x8C), RGBY(0x8D), RGBY(0x8E), RGBY(0x8F), + RGBY(0x90), RGBY(0x91), RGBY(0x92), RGBY(0x93), + RGBY(0x94), RGBY(0x95), RGBY(0x96), RGBY(0x97), + RGBY(0x98), RGBY(0x99), RGBY(0x9A), RGBY(0x9B), + RGBY(0x9C), RGBY(0x9D), RGBY(0x9E), RGBY(0x9F), + RGBY(0xA0), RGBY(0xA1), RGBY(0xA2), RGBY(0xA3), + RGBY(0xA4), RGBY(0xA5), RGBY(0xA6), RGBY(0xA7), + RGBY(0xA8), RGBY(0xA9), RGBY(0xAA), RGBY(0xAB), + RGBY(0xAC), RGBY(0xAD), RGBY(0xAE), RGBY(0xAF), + RGBY(0xB0), RGBY(0xB1), RGBY(0xB2), RGBY(0xB3), + RGBY(0xB4), RGBY(0xB5), RGBY(0xB6), RGBY(0xB7), + RGBY(0xB8), RGBY(0xB9), RGBY(0xBA), RGBY(0xBB), + RGBY(0xBC), RGBY(0xBD), RGBY(0xBE), RGBY(0xBF), + RGBY(0xC0), RGBY(0xC1), RGBY(0xC2), RGBY(0xC3), + RGBY(0xC4), RGBY(0xC5), RGBY(0xC6), RGBY(0xC7), + RGBY(0xC8), RGBY(0xC9), RGBY(0xCA), RGBY(0xCB), + RGBY(0xCC), RGBY(0xCD), RGBY(0xCE), RGBY(0xCF), + RGBY(0xD0), RGBY(0xD1), RGBY(0xD2), RGBY(0xD3), + RGBY(0xD4), RGBY(0xD5), RGBY(0xD6), RGBY(0xD7), + RGBY(0xD8), RGBY(0xD9), RGBY(0xDA), RGBY(0xDB), + RGBY(0xDC), RGBY(0xDD), RGBY(0xDE), RGBY(0xDF), + RGBY(0xE0), RGBY(0xE1), RGBY(0xE2), RGBY(0xE3), + RGBY(0xE4), RGBY(0xE5), RGBY(0xE6), RGBY(0xE7), + RGBY(0xE8), RGBY(0xE9), RGBY(0xEA), RGBY(0xEB), + RGBY(0xEC), RGBY(0xED), RGBY(0xEE), RGBY(0xEF), + RGBY(0xF0), RGBY(0xF1), RGBY(0xF2), RGBY(0xF3), + RGBY(0xF4), RGBY(0xF5), RGBY(0xF6), RGBY(0xF7), + RGBY(0xF8), RGBY(0xF9), RGBY(0xFA), RGBY(0xFB), + RGBY(0xFC), RGBY(0xFD), RGBY(0xFE), RGBY(0xFF), + + // Chroma U table. 
+ RGBU(0x00), RGBU(0x01), RGBU(0x02), RGBU(0x03), + RGBU(0x04), RGBU(0x05), RGBU(0x06), RGBU(0x07), + RGBU(0x08), RGBU(0x09), RGBU(0x0A), RGBU(0x0B), + RGBU(0x0C), RGBU(0x0D), RGBU(0x0E), RGBU(0x0F), + RGBU(0x10), RGBU(0x11), RGBU(0x12), RGBU(0x13), + RGBU(0x14), RGBU(0x15), RGBU(0x16), RGBU(0x17), + RGBU(0x18), RGBU(0x19), RGBU(0x1A), RGBU(0x1B), + RGBU(0x1C), RGBU(0x1D), RGBU(0x1E), RGBU(0x1F), + RGBU(0x20), RGBU(0x21), RGBU(0x22), RGBU(0x23), + RGBU(0x24), RGBU(0x25), RGBU(0x26), RGBU(0x27), + RGBU(0x28), RGBU(0x29), RGBU(0x2A), RGBU(0x2B), + RGBU(0x2C), RGBU(0x2D), RGBU(0x2E), RGBU(0x2F), + RGBU(0x30), RGBU(0x31), RGBU(0x32), RGBU(0x33), + RGBU(0x34), RGBU(0x35), RGBU(0x36), RGBU(0x37), + RGBU(0x38), RGBU(0x39), RGBU(0x3A), RGBU(0x3B), + RGBU(0x3C), RGBU(0x3D), RGBU(0x3E), RGBU(0x3F), + RGBU(0x40), RGBU(0x41), RGBU(0x42), RGBU(0x43), + RGBU(0x44), RGBU(0x45), RGBU(0x46), RGBU(0x47), + RGBU(0x48), RGBU(0x49), RGBU(0x4A), RGBU(0x4B), + RGBU(0x4C), RGBU(0x4D), RGBU(0x4E), RGBU(0x4F), + RGBU(0x50), RGBU(0x51), RGBU(0x52), RGBU(0x53), + RGBU(0x54), RGBU(0x55), RGBU(0x56), RGBU(0x57), + RGBU(0x58), RGBU(0x59), RGBU(0x5A), RGBU(0x5B), + RGBU(0x5C), RGBU(0x5D), RGBU(0x5E), RGBU(0x5F), + RGBU(0x60), RGBU(0x61), RGBU(0x62), RGBU(0x63), + RGBU(0x64), RGBU(0x65), RGBU(0x66), RGBU(0x67), + RGBU(0x68), RGBU(0x69), RGBU(0x6A), RGBU(0x6B), + RGBU(0x6C), RGBU(0x6D), RGBU(0x6E), RGBU(0x6F), + RGBU(0x70), RGBU(0x71), RGBU(0x72), RGBU(0x73), + RGBU(0x74), RGBU(0x75), RGBU(0x76), RGBU(0x77), + RGBU(0x78), RGBU(0x79), RGBU(0x7A), RGBU(0x7B), + RGBU(0x7C), RGBU(0x7D), RGBU(0x7E), RGBU(0x7F), + RGBU(0x80), RGBU(0x81), RGBU(0x82), RGBU(0x83), + RGBU(0x84), RGBU(0x85), RGBU(0x86), RGBU(0x87), + RGBU(0x88), RGBU(0x89), RGBU(0x8A), RGBU(0x8B), + RGBU(0x8C), RGBU(0x8D), RGBU(0x8E), RGBU(0x8F), + RGBU(0x90), RGBU(0x91), RGBU(0x92), RGBU(0x93), + RGBU(0x94), RGBU(0x95), RGBU(0x96), RGBU(0x97), + RGBU(0x98), RGBU(0x99), RGBU(0x9A), RGBU(0x9B), + RGBU(0x9C), RGBU(0x9D), RGBU(0x9E), RGBU(0x9F), 
+ RGBU(0xA0), RGBU(0xA1), RGBU(0xA2), RGBU(0xA3), + RGBU(0xA4), RGBU(0xA5), RGBU(0xA6), RGBU(0xA7), + RGBU(0xA8), RGBU(0xA9), RGBU(0xAA), RGBU(0xAB), + RGBU(0xAC), RGBU(0xAD), RGBU(0xAE), RGBU(0xAF), + RGBU(0xB0), RGBU(0xB1), RGBU(0xB2), RGBU(0xB3), + RGBU(0xB4), RGBU(0xB5), RGBU(0xB6), RGBU(0xB7), + RGBU(0xB8), RGBU(0xB9), RGBU(0xBA), RGBU(0xBB), + RGBU(0xBC), RGBU(0xBD), RGBU(0xBE), RGBU(0xBF), + RGBU(0xC0), RGBU(0xC1), RGBU(0xC2), RGBU(0xC3), + RGBU(0xC4), RGBU(0xC5), RGBU(0xC6), RGBU(0xC7), + RGBU(0xC8), RGBU(0xC9), RGBU(0xCA), RGBU(0xCB), + RGBU(0xCC), RGBU(0xCD), RGBU(0xCE), RGBU(0xCF), + RGBU(0xD0), RGBU(0xD1), RGBU(0xD2), RGBU(0xD3), + RGBU(0xD4), RGBU(0xD5), RGBU(0xD6), RGBU(0xD7), + RGBU(0xD8), RGBU(0xD9), RGBU(0xDA), RGBU(0xDB), + RGBU(0xDC), RGBU(0xDD), RGBU(0xDE), RGBU(0xDF), + RGBU(0xE0), RGBU(0xE1), RGBU(0xE2), RGBU(0xE3), + RGBU(0xE4), RGBU(0xE5), RGBU(0xE6), RGBU(0xE7), + RGBU(0xE8), RGBU(0xE9), RGBU(0xEA), RGBU(0xEB), + RGBU(0xEC), RGBU(0xED), RGBU(0xEE), RGBU(0xEF), + RGBU(0xF0), RGBU(0xF1), RGBU(0xF2), RGBU(0xF3), + RGBU(0xF4), RGBU(0xF5), RGBU(0xF6), RGBU(0xF7), + RGBU(0xF8), RGBU(0xF9), RGBU(0xFA), RGBU(0xFB), + RGBU(0xFC), RGBU(0xFD), RGBU(0xFE), RGBU(0xFF), + + // Chroma V table. 
+ RGBV(0x00), RGBV(0x01), RGBV(0x02), RGBV(0x03), + RGBV(0x04), RGBV(0x05), RGBV(0x06), RGBV(0x07), + RGBV(0x08), RGBV(0x09), RGBV(0x0A), RGBV(0x0B), + RGBV(0x0C), RGBV(0x0D), RGBV(0x0E), RGBV(0x0F), + RGBV(0x10), RGBV(0x11), RGBV(0x12), RGBV(0x13), + RGBV(0x14), RGBV(0x15), RGBV(0x16), RGBV(0x17), + RGBV(0x18), RGBV(0x19), RGBV(0x1A), RGBV(0x1B), + RGBV(0x1C), RGBV(0x1D), RGBV(0x1E), RGBV(0x1F), + RGBV(0x20), RGBV(0x21), RGBV(0x22), RGBV(0x23), + RGBV(0x24), RGBV(0x25), RGBV(0x26), RGBV(0x27), + RGBV(0x28), RGBV(0x29), RGBV(0x2A), RGBV(0x2B), + RGBV(0x2C), RGBV(0x2D), RGBV(0x2E), RGBV(0x2F), + RGBV(0x30), RGBV(0x31), RGBV(0x32), RGBV(0x33), + RGBV(0x34), RGBV(0x35), RGBV(0x36), RGBV(0x37), + RGBV(0x38), RGBV(0x39), RGBV(0x3A), RGBV(0x3B), + RGBV(0x3C), RGBV(0x3D), RGBV(0x3E), RGBV(0x3F), + RGBV(0x40), RGBV(0x41), RGBV(0x42), RGBV(0x43), + RGBV(0x44), RGBV(0x45), RGBV(0x46), RGBV(0x47), + RGBV(0x48), RGBV(0x49), RGBV(0x4A), RGBV(0x4B), + RGBV(0x4C), RGBV(0x4D), RGBV(0x4E), RGBV(0x4F), + RGBV(0x50), RGBV(0x51), RGBV(0x52), RGBV(0x53), + RGBV(0x54), RGBV(0x55), RGBV(0x56), RGBV(0x57), + RGBV(0x58), RGBV(0x59), RGBV(0x5A), RGBV(0x5B), + RGBV(0x5C), RGBV(0x5D), RGBV(0x5E), RGBV(0x5F), + RGBV(0x60), RGBV(0x61), RGBV(0x62), RGBV(0x63), + RGBV(0x64), RGBV(0x65), RGBV(0x66), RGBV(0x67), + RGBV(0x68), RGBV(0x69), RGBV(0x6A), RGBV(0x6B), + RGBV(0x6C), RGBV(0x6D), RGBV(0x6E), RGBV(0x6F), + RGBV(0x70), RGBV(0x71), RGBV(0x72), RGBV(0x73), + RGBV(0x74), RGBV(0x75), RGBV(0x76), RGBV(0x77), + RGBV(0x78), RGBV(0x79), RGBV(0x7A), RGBV(0x7B), + RGBV(0x7C), RGBV(0x7D), RGBV(0x7E), RGBV(0x7F), + RGBV(0x80), RGBV(0x81), RGBV(0x82), RGBV(0x83), + RGBV(0x84), RGBV(0x85), RGBV(0x86), RGBV(0x87), + RGBV(0x88), RGBV(0x89), RGBV(0x8A), RGBV(0x8B), + RGBV(0x8C), RGBV(0x8D), RGBV(0x8E), RGBV(0x8F), + RGBV(0x90), RGBV(0x91), RGBV(0x92), RGBV(0x93), + RGBV(0x94), RGBV(0x95), RGBV(0x96), RGBV(0x97), + RGBV(0x98), RGBV(0x99), RGBV(0x9A), RGBV(0x9B), + RGBV(0x9C), RGBV(0x9D), RGBV(0x9E), RGBV(0x9F), 
+ RGBV(0xA0), RGBV(0xA1), RGBV(0xA2), RGBV(0xA3), + RGBV(0xA4), RGBV(0xA5), RGBV(0xA6), RGBV(0xA7), + RGBV(0xA8), RGBV(0xA9), RGBV(0xAA), RGBV(0xAB), + RGBV(0xAC), RGBV(0xAD), RGBV(0xAE), RGBV(0xAF), + RGBV(0xB0), RGBV(0xB1), RGBV(0xB2), RGBV(0xB3), + RGBV(0xB4), RGBV(0xB5), RGBV(0xB6), RGBV(0xB7), + RGBV(0xB8), RGBV(0xB9), RGBV(0xBA), RGBV(0xBB), + RGBV(0xBC), RGBV(0xBD), RGBV(0xBE), RGBV(0xBF), + RGBV(0xC0), RGBV(0xC1), RGBV(0xC2), RGBV(0xC3), + RGBV(0xC4), RGBV(0xC5), RGBV(0xC6), RGBV(0xC7), + RGBV(0xC8), RGBV(0xC9), RGBV(0xCA), RGBV(0xCB), + RGBV(0xCC), RGBV(0xCD), RGBV(0xCE), RGBV(0xCF), + RGBV(0xD0), RGBV(0xD1), RGBV(0xD2), RGBV(0xD3), + RGBV(0xD4), RGBV(0xD5), RGBV(0xD6), RGBV(0xD7), + RGBV(0xD8), RGBV(0xD9), RGBV(0xDA), RGBV(0xDB), + RGBV(0xDC), RGBV(0xDD), RGBV(0xDE), RGBV(0xDF), + RGBV(0xE0), RGBV(0xE1), RGBV(0xE2), RGBV(0xE3), + RGBV(0xE4), RGBV(0xE5), RGBV(0xE6), RGBV(0xE7), + RGBV(0xE8), RGBV(0xE9), RGBV(0xEA), RGBV(0xEB), + RGBV(0xEC), RGBV(0xED), RGBV(0xEE), RGBV(0xEF), + RGBV(0xF0), RGBV(0xF1), RGBV(0xF2), RGBV(0xF3), + RGBV(0xF4), RGBV(0xF5), RGBV(0xF6), RGBV(0xF7), + RGBV(0xF8), RGBV(0xF9), RGBV(0xFA), RGBV(0xFB), + RGBV(0xFC), RGBV(0xFD), RGBV(0xFE), RGBV(0xFF), +}; + +#undef RGBY +#undef RGBU +#undef RGBV + +} // extern "C" diff --git a/gfx/ycbcr/yuv_row_win.cpp b/gfx/ycbcr/yuv_row_win.cpp new file mode 100644 index 0000000000..c496b2d935 --- /dev/null +++ b/gfx/ycbcr/yuv_row_win.cpp @@ -0,0 +1,506 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "yuv_row.h" +#include "mozilla/SSE.h" + +#define kCoefficientsRgbU kCoefficientsRgbY + 2048 +#define kCoefficientsRgbV kCoefficientsRgbY + 4096 + +extern "C" { + +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86) +#if defined(__clang__) +// clang-cl has a bug where it doesn't mangle names in inline asm +// so let's do the mangling in the preprocessor (ugh) +// (but we still need to declare a dummy extern for the parser) +extern void* _kCoefficientsRgbY; +#define kCoefficientsRgbY _kCoefficientsRgbY +#endif + +__declspec(naked) +void FastConvertYUVToRGB32Row_SSE(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb_buf, + int width) { + __asm { + pushad + mov edx, [esp + 32 + 4] // Y + mov edi, [esp + 32 + 8] // U + mov esi, [esp + 32 + 12] // V + mov ebp, [esp + 32 + 16] // rgb + mov ecx, [esp + 32 + 20] // width + jmp convertend + + convertloop : + movzx eax, byte ptr [edi] + add edi, 1 + movzx ebx, byte ptr [esi] + add esi, 1 + movq mm0, [kCoefficientsRgbU + 8 * eax] + movzx eax, byte ptr [edx] + paddsw mm0, [kCoefficientsRgbV + 8 * ebx] + movzx ebx, byte ptr [edx + 1] + movq mm1, [kCoefficientsRgbY + 8 * eax] + add edx, 2 + movq mm2, [kCoefficientsRgbY + 8 * ebx] + paddsw mm1, mm0 + paddsw mm2, mm0 + psraw mm1, 6 + psraw mm2, 6 + packuswb mm1, mm2 + movntq [ebp], mm1 + add ebp, 8 + convertend : + sub ecx, 2 + jns convertloop + + and ecx, 1 // odd number of pixels? 
+ jz convertdone + + movzx eax, byte ptr [edi] + movq mm0, [kCoefficientsRgbU + 8 * eax] + movzx eax, byte ptr [esi] + paddsw mm0, [kCoefficientsRgbV + 8 * eax] + movzx eax, byte ptr [edx] + movq mm1, [kCoefficientsRgbY + 8 * eax] + paddsw mm1, mm0 + psraw mm1, 6 + packuswb mm1, mm1 + movd [ebp], mm1 + convertdone : + + popad + ret + } +} + +__declspec(naked) +void ConvertYUVToRGB32Row_SSE(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb_buf, + int width, + int step) { + __asm { + pushad + mov edx, [esp + 32 + 4] // Y + mov edi, [esp + 32 + 8] // U + mov esi, [esp + 32 + 12] // V + mov ebp, [esp + 32 + 16] // rgb + mov ecx, [esp + 32 + 20] // width + mov ebx, [esp + 32 + 24] // step + jmp wend + + wloop : + movzx eax, byte ptr [edi] + add edi, ebx + movq mm0, [kCoefficientsRgbU + 8 * eax] + movzx eax, byte ptr [esi] + add esi, ebx + paddsw mm0, [kCoefficientsRgbV + 8 * eax] + movzx eax, byte ptr [edx] + add edx, ebx + movq mm1, [kCoefficientsRgbY + 8 * eax] + movzx eax, byte ptr [edx] + add edx, ebx + movq mm2, [kCoefficientsRgbY + 8 * eax] + paddsw mm1, mm0 + paddsw mm2, mm0 + psraw mm1, 6 + psraw mm2, 6 + packuswb mm1, mm2 + movntq [ebp], mm1 + add ebp, 8 + wend : + sub ecx, 2 + jns wloop + + and ecx, 1 // odd number of pixels? 
+ jz wdone + + movzx eax, byte ptr [edi] + movq mm0, [kCoefficientsRgbU + 8 * eax] + movzx eax, byte ptr [esi] + paddsw mm0, [kCoefficientsRgbV + 8 * eax] + movzx eax, byte ptr [edx] + movq mm1, [kCoefficientsRgbY + 8 * eax] + paddsw mm1, mm0 + psraw mm1, 6 + packuswb mm1, mm1 + movd [ebp], mm1 + wdone : + + popad + ret + } +} + +__declspec(naked) +void RotateConvertYUVToRGB32Row_SSE(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb_buf, + int width, + int ystep, + int uvstep) { + __asm { + pushad + mov edx, [esp + 32 + 4] // Y + mov edi, [esp + 32 + 8] // U + mov esi, [esp + 32 + 12] // V + mov ebp, [esp + 32 + 16] // rgb + mov ecx, [esp + 32 + 20] // width + jmp wend + + wloop : + movzx eax, byte ptr [edi] + mov ebx, [esp + 32 + 28] // uvstep + add edi, ebx + movq mm0, [kCoefficientsRgbU + 8 * eax] + movzx eax, byte ptr [esi] + add esi, ebx + paddsw mm0, [kCoefficientsRgbV + 8 * eax] + movzx eax, byte ptr [edx] + mov ebx, [esp + 32 + 24] // ystep + add edx, ebx + movq mm1, [kCoefficientsRgbY + 8 * eax] + movzx eax, byte ptr [edx] + add edx, ebx + movq mm2, [kCoefficientsRgbY + 8 * eax] + paddsw mm1, mm0 + paddsw mm2, mm0 + psraw mm1, 6 + psraw mm2, 6 + packuswb mm1, mm2 + movntq [ebp], mm1 + add ebp, 8 + wend : + sub ecx, 2 + jns wloop + + and ecx, 1 // odd number of pixels? 
+ jz wdone + + movzx eax, byte ptr [edi] + movq mm0, [kCoefficientsRgbU + 8 * eax] + movzx eax, byte ptr [esi] + paddsw mm0, [kCoefficientsRgbV + 8 * eax] + movzx eax, byte ptr [edx] + movq mm1, [kCoefficientsRgbY + 8 * eax] + paddsw mm1, mm0 + psraw mm1, 6 + packuswb mm1, mm1 + movd [ebp], mm1 + wdone : + + popad + ret + } +} + +__declspec(naked) +void DoubleYUVToRGB32Row_SSE(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb_buf, + int width) { + __asm { + pushad + mov edx, [esp + 32 + 4] // Y + mov edi, [esp + 32 + 8] // U + mov esi, [esp + 32 + 12] // V + mov ebp, [esp + 32 + 16] // rgb + mov ecx, [esp + 32 + 20] // width + jmp wend + + wloop : + movzx eax, byte ptr [edi] + add edi, 1 + movzx ebx, byte ptr [esi] + add esi, 1 + movq mm0, [kCoefficientsRgbU + 8 * eax] + movzx eax, byte ptr [edx] + paddsw mm0, [kCoefficientsRgbV + 8 * ebx] + movq mm1, [kCoefficientsRgbY + 8 * eax] + paddsw mm1, mm0 + psraw mm1, 6 + packuswb mm1, mm1 + punpckldq mm1, mm1 + movntq [ebp], mm1 + + movzx ebx, byte ptr [edx + 1] + add edx, 2 + paddsw mm0, [kCoefficientsRgbY + 8 * ebx] + psraw mm0, 6 + packuswb mm0, mm0 + punpckldq mm0, mm0 + movntq [ebp+8], mm0 + add ebp, 16 + wend : + sub ecx, 4 + jns wloop + + add ecx, 4 + jz wdone + + movzx eax, byte ptr [edi] + movq mm0, [kCoefficientsRgbU + 8 * eax] + movzx eax, byte ptr [esi] + paddsw mm0, [kCoefficientsRgbV + 8 * eax] + movzx eax, byte ptr [edx] + movq mm1, [kCoefficientsRgbY + 8 * eax] + paddsw mm1, mm0 + psraw mm1, 6 + packuswb mm1, mm1 + jmp wend1 + + wloop1 : + movd [ebp], mm1 + add ebp, 4 + wend1 : + sub ecx, 1 + jns wloop1 + wdone : + popad + ret + } +} + +// This version does general purpose scaling by any amount, up or down. +// The only thing it cannot do is rotation by 90 or 270. +// For performance the chroma is under-sampled, reducing cost of a 3x +// 1080p scale from 8.4 ms to 5.4 ms. 
+__declspec(naked) +void ScaleYUVToRGB32Row_SSE(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb_buf, + int width, + int source_dx) { + __asm { + pushad + mov edx, [esp + 32 + 4] // Y + mov edi, [esp + 32 + 8] // U + mov esi, [esp + 32 + 12] // V + mov ebp, [esp + 32 + 16] // rgb + mov ecx, [esp + 32 + 20] // width + xor ebx, ebx // x + jmp scaleend + + scaleloop : + mov eax, ebx + sar eax, 17 + movzx eax, byte ptr [edi + eax] + movq mm0, [kCoefficientsRgbU + 8 * eax] + mov eax, ebx + sar eax, 17 + movzx eax, byte ptr [esi + eax] + paddsw mm0, [kCoefficientsRgbV + 8 * eax] + mov eax, ebx + add ebx, [esp + 32 + 24] // x += source_dx + sar eax, 16 + movzx eax, byte ptr [edx + eax] + movq mm1, [kCoefficientsRgbY + 8 * eax] + mov eax, ebx + add ebx, [esp + 32 + 24] // x += source_dx + sar eax, 16 + movzx eax, byte ptr [edx + eax] + movq mm2, [kCoefficientsRgbY + 8 * eax] + paddsw mm1, mm0 + paddsw mm2, mm0 + psraw mm1, 6 + psraw mm2, 6 + packuswb mm1, mm2 + movntq [ebp], mm1 + add ebp, 8 + scaleend : + sub ecx, 2 + jns scaleloop + + and ecx, 1 // odd number of pixels? 
+ jz scaledone + + mov eax, ebx + sar eax, 17 + movzx eax, byte ptr [edi + eax] + movq mm0, [kCoefficientsRgbU + 8 * eax] + mov eax, ebx + sar eax, 17 + movzx eax, byte ptr [esi + eax] + paddsw mm0, [kCoefficientsRgbV + 8 * eax] + mov eax, ebx + sar eax, 16 + movzx eax, byte ptr [edx + eax] + movq mm1, [kCoefficientsRgbY + 8 * eax] + paddsw mm1, mm0 + psraw mm1, 6 + packuswb mm1, mm1 + movd [ebp], mm1 + + scaledone : + popad + ret + } +} + +__declspec(naked) +void LinearScaleYUVToRGB32Row_SSE(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb_buf, + int width, + int source_dx) { + __asm { + pushad + mov edx, [esp + 32 + 4] // Y + mov edi, [esp + 32 + 8] // U + // [esp + 32 + 12] // V + mov ebp, [esp + 32 + 16] // rgb + mov ecx, [esp + 32 + 20] // width + imul ecx, [esp + 32 + 24] // source_dx + mov [esp + 32 + 20], ecx // source_width = width * source_dx + mov ecx, [esp + 32 + 24] // source_dx + xor ebx, ebx // x = 0 + cmp ecx, 0x20000 + jl lscaleend + mov ebx, 0x8000 // x = 0.5 for 1/2 or less + jmp lscaleend +lscaleloop: + mov eax, ebx + sar eax, 0x11 + + movzx ecx, byte ptr [edi + eax] + movzx esi, byte ptr [edi + eax + 1] + mov eax, ebx + and eax, 0x1fffe + imul esi, eax + xor eax, 0x1fffe + imul ecx, eax + add ecx, esi + shr ecx, 17 + movq mm0, [kCoefficientsRgbU + 8 * ecx] + + mov esi, [esp + 32 + 12] + mov eax, ebx + sar eax, 0x11 + + movzx ecx, byte ptr [esi + eax] + movzx esi, byte ptr [esi + eax + 1] + mov eax, ebx + and eax, 0x1fffe + imul esi, eax + xor eax, 0x1fffe + imul ecx, eax + add ecx, esi + shr ecx, 17 + paddsw mm0, [kCoefficientsRgbV + 8 * ecx] + + mov eax, ebx + sar eax, 0x10 + movzx ecx, byte ptr [edx + eax] + movzx esi, byte ptr [1 + edx + eax] + mov eax, ebx + add ebx, [esp + 32 + 24] + and eax, 0xffff + imul esi, eax + xor eax, 0xffff + imul ecx, eax + add ecx, esi + shr ecx, 16 + movq mm1, [kCoefficientsRgbY + 8 * ecx] + + cmp ebx, [esp + 32 + 20] + jge lscalelastpixel + + mov eax, ebx + sar eax, 0x10 + 
movzx ecx, byte ptr [edx + eax] + movzx esi, byte ptr [edx + eax + 1] + mov eax, ebx + add ebx, [esp + 32 + 24] + and eax, 0xffff + imul esi, eax + xor eax, 0xffff + imul ecx, eax + add ecx, esi + shr ecx, 16 + movq mm2, [kCoefficientsRgbY + 8 * ecx] + + paddsw mm1, mm0 + paddsw mm2, mm0 + psraw mm1, 0x6 + psraw mm2, 0x6 + packuswb mm1, mm2 + movntq [ebp], mm1 + add ebp, 0x8 + +lscaleend: + cmp ebx, [esp + 32 + 20] + jl lscaleloop + popad + ret + +lscalelastpixel: + paddsw mm1, mm0 + psraw mm1, 6 + packuswb mm1, mm1 + movd [ebp], mm1 + popad + ret + }; +} +#endif // if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86) + +void FastConvertYUVToRGB32Row(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb_buf, + int width) { +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86) + if (mozilla::supports_sse()) { + FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width); + return; + } +#endif + + FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1); +} + +void ScaleYUVToRGB32Row(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb_buf, + int width, + int source_dx) { + +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86) + if (mozilla::supports_sse()) { + ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); + return; + } +#endif + + ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); +} + +void LinearScaleYUVToRGB32Row(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb_buf, + int width, + int source_dx) { +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86) + if (mozilla::supports_sse()) { + LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, + source_dx); + return; + } +#endif + + LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); +} + +} // extern "C" diff --git a/gfx/ycbcr/yuv_row_win64.cpp b/gfx/ycbcr/yuv_row_win64.cpp new file mode 100644 index 
0000000000..17b542449b --- /dev/null +++ b/gfx/ycbcr/yuv_row_win64.cpp @@ -0,0 +1,205 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "yuv_row.h" + +extern "C" { + +// x64 compiler doesn't support MMX and inline assembler. Use SSE2 intrinsics. + +#define kCoefficientsRgbU (reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 2048) +#define kCoefficientsRgbV (reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 4096) + +#include <emmintrin.h> + +static void FastConvertYUVToRGB32Row_SSE2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb_buf, + int width) { + __m128i xmm0, xmmY1, xmmY2; + __m128 xmmY; + + while (width >= 2) { + xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * *u_buf++)), + _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * *v_buf++))); + + xmmY1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 8 * *y_buf++)); + xmmY1 = _mm_adds_epi16(xmmY1, xmm0); + + xmmY2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 8 * *y_buf++)); + xmmY2 = _mm_adds_epi16(xmmY2, xmm0); + + xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2), + 0x44); + xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6); + xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); + + _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1); + rgb_buf += 8; + width -= 2; + } + + if (width) { + xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * *u_buf)), + _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * *v_buf))); + xmmY1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 8 * *y_buf)); + xmmY1 = _mm_adds_epi16(xmmY1, xmm0); 
+ xmmY1 = _mm_srai_epi16(xmmY1, 6); + xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); + *reinterpret_cast<uint32_t*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1); + } +} + +static void ScaleYUVToRGB32Row_SSE2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb_buf, + int width, + int source_dx) { + __m128i xmm0, xmmY1, xmmY2; + __m128 xmmY; + uint8_t u, v, y; + int x = 0; + + while (width >= 2) { + u = u_buf[x >> 17]; + v = v_buf[x >> 17]; + y = y_buf[x >> 16]; + x += source_dx; + + xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * u)), + _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * v))); + xmmY1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 8 * y)); + xmmY1 = _mm_adds_epi16(xmmY1, xmm0); + + y = y_buf[x >> 16]; + x += source_dx; + + xmmY2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 8 * y)); + xmmY2 = _mm_adds_epi16(xmmY2, xmm0); + + xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2), + 0x44); + xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6); + xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); + + _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1); + rgb_buf += 8; + width -= 2; + } + + if (width) { + u = u_buf[x >> 17]; + v = v_buf[x >> 17]; + y = y_buf[x >> 16]; + + xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * u)), + _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * v))); + xmmY1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 8 * y)); + xmmY1 = _mm_adds_epi16(xmmY1, xmm0); + xmmY1 = _mm_srai_epi16(xmmY1, 6); + xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); + *reinterpret_cast<uint32_t*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1); + } +} + +static void LinearScaleYUVToRGB32Row_SSE2(const uint8_t* y_buf, + const uint8_t* 
u_buf, + const uint8_t* v_buf, + uint8_t* rgb_buf, + int width, + int source_dx) { + __m128i xmm0, xmmY1, xmmY2; + __m128 xmmY; + uint8_t u0, u1, v0, v1, y0, y1; + uint32_t uv_frac, y_frac, u, v, y; + int x = 0; + + if (source_dx >= 0x20000) { + x = 32768; + } + + while(width >= 2) { + u0 = u_buf[x >> 17]; + u1 = u_buf[(x >> 17) + 1]; + v0 = v_buf[x >> 17]; + v1 = v_buf[(x >> 17) + 1]; + y0 = y_buf[x >> 16]; + y1 = y_buf[(x >> 16) + 1]; + uv_frac = (x & 0x1fffe); + y_frac = (x & 0xffff); + u = (uv_frac * u1 + (uv_frac ^ 0x1fffe) * u0) >> 17; + v = (uv_frac * v1 + (uv_frac ^ 0x1fffe) * v0) >> 17; + y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16; + x += source_dx; + + xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * u)), + _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * v))); + xmmY1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 8 * y)); + xmmY1 = _mm_adds_epi16(xmmY1, xmm0); + + y0 = y_buf[x >> 16]; + y1 = y_buf[(x >> 16) + 1]; + y_frac = (x & 0xffff); + y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16; + x += source_dx; + + xmmY2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 8 * y)); + xmmY2 = _mm_adds_epi16(xmmY2, xmm0); + + xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2), + 0x44); + xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6); + xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); + + _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1); + rgb_buf += 8; + width -= 2; + } + + if (width) { + u = u_buf[x >> 17]; + v = v_buf[x >> 17]; + y = y_buf[x >> 16]; + + xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * u)), + _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * v))); + xmmY1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 8 * 
y)); + + xmmY1 = _mm_adds_epi16(xmmY1, xmm0); + xmmY1 = _mm_srai_epi16(xmmY1, 6); + xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); + *reinterpret_cast<uint32_t*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1); + } +} + +void FastConvertYUVToRGB32Row(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb_buf, + int width) { + FastConvertYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width); +} + +void ScaleYUVToRGB32Row(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb_buf, + int width, + int source_dx) { + ScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); +} + +void LinearScaleYUVToRGB32Row(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb_buf, + int width, + int source_dx) { + LinearScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width, + source_dx); +} + +} // extern "C" |