summaryrefslogtreecommitdiffstats
path: root/gfx/ycbcr
diff options
context:
space:
mode:
Diffstat (limited to 'gfx/ycbcr')
-rw-r--r--gfx/ycbcr/LICENSE27
-rw-r--r--gfx/ycbcr/README8
-rw-r--r--gfx/ycbcr/YCbCrUtils.cpp387
-rw-r--r--gfx/ycbcr/YCbCrUtils.h49
-rw-r--r--gfx/ycbcr/chromium_types.h50
-rw-r--r--gfx/ycbcr/moz.build66
-rw-r--r--gfx/ycbcr/scale_yuv_argb.cpp1132
-rw-r--r--gfx/ycbcr/scale_yuv_argb.h39
-rw-r--r--gfx/ycbcr/ycbcr_to_rgb565.cpp672
-rw-r--r--gfx/ycbcr/ycbcr_to_rgb565.h72
-rw-r--r--gfx/ycbcr/yuv_convert.cpp577
-rw-r--r--gfx/ycbcr/yuv_convert.h123
-rw-r--r--gfx/ycbcr/yuv_convert_arm.cpp232
-rw-r--r--gfx/ycbcr/yuv_convert_mmx.cpp45
-rw-r--r--gfx/ycbcr/yuv_convert_sse2.cpp47
-rw-r--r--gfx/ycbcr/yuv_row.h154
-rw-r--r--gfx/ycbcr/yuv_row_arm.s304
-rw-r--r--gfx/ycbcr/yuv_row_c.cpp133
-rw-r--r--gfx/ycbcr/yuv_row_other.cpp34
-rw-r--r--gfx/ycbcr/yuv_row_posix.cpp914
-rw-r--r--gfx/ycbcr/yuv_row_table.cpp233
-rw-r--r--gfx/ycbcr/yuv_row_win.cpp506
-rw-r--r--gfx/ycbcr/yuv_row_win64.cpp205
23 files changed, 6009 insertions, 0 deletions
diff --git a/gfx/ycbcr/LICENSE b/gfx/ycbcr/LICENSE
new file mode 100644
index 0000000000..8dc35041de
--- /dev/null
+++ b/gfx/ycbcr/LICENSE
@@ -0,0 +1,27 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/gfx/ycbcr/README b/gfx/ycbcr/README
new file mode 100644
index 0000000000..8910a2a2b2
--- /dev/null
+++ b/gfx/ycbcr/README
@@ -0,0 +1,8 @@
+This color conversion code is from the Chromium open source project available here:
+
+http://code.google.com/chromium/
+
+The code comes from svn revision 63840 on 2010-10-26.
+
+It has been superseded upstream by libyuv (which was spun off from it). Bug 791941 is about
+trying to replace this code with libyuv.
diff --git a/gfx/ycbcr/YCbCrUtils.cpp b/gfx/ycbcr/YCbCrUtils.cpp
new file mode 100644
index 0000000000..b2b5a4f293
--- /dev/null
+++ b/gfx/ycbcr/YCbCrUtils.cpp
@@ -0,0 +1,387 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 4 -*-
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "mozilla/EndianUtils.h"
+#include "gfx2DGlue.h"
+#include "mozilla/gfx/Swizzle.h"
+
+#include "YCbCrUtils.h"
+#include "yuv_convert.h"
+#include "ycbcr_to_rgb565.h"
+#include "libyuv.h"
+
+namespace mozilla {
+namespace gfx {
+
+// clang-format off
+
+// Maps the chroma subsampling of |aData| onto the YUVType understood by the
+// conversion routines. Full-resolution chroma with a non-positive CbCr stride
+// is reported as Y8; any unrecognised subsampling value crashes.
+static YUVType GetYUVType(const layers::PlanarYCbCrData& aData) {
+  const auto subsampling = aData.mChromaSubsampling;
+  if (subsampling == ChromaSubsampling::FULL) {
+    if (aData.mCbCrStride > 0) {
+      return YV24;
+    }
+    return Y8;
+  }
+  if (subsampling == ChromaSubsampling::HALF_WIDTH) {
+    return YV16;
+  }
+  if (subsampling == ChromaSubsampling::HALF_WIDTH_AND_HEIGHT) {
+    return YV12;
+  }
+  MOZ_CRASH("Unknown chroma subsampling");
+}
+
+// Adjusts the caller's suggested destination format and size to something the
+// converters in this file can produce for |aData|.
+//
+// aSuggestedFormat (in/out): left as R5G6B5_UINT16 only when
+//   HAVE_YCBCR_TO_RGB565 is defined; every other outcome is B8G8R8X8.
+// aSuggestedSize (in/out): kept only when scaling during conversion
+//   ("prescale") remains enabled; otherwise replaced by the picture size.
+void
+GetYCbCrToRGBDestFormatAndSize(const layers::PlanarYCbCrData& aData,
+ SurfaceFormat& aSuggestedFormat,
+ IntSize& aSuggestedSize)
+{
+ YUVType yuvtype = GetYUVType(aData);
+
+ // 'prescale' is true if the scaling is to be done as part of the
+ // YCbCr to RGB conversion rather than on the RGB data when rendered.
+ bool prescale = aSuggestedSize.width > 0 && aSuggestedSize.height > 0 &&
+ aSuggestedSize != aData.mPictureRect.Size();
+
+ if (aSuggestedFormat == SurfaceFormat::R5G6B5_UINT16) {
+#if defined(HAVE_YCBCR_TO_RGB565)
+ // Give up prescaling when the scaling path is not reported as fast but the
+ // plain conversion path is.
+ if (prescale &&
+ !IsScaleYCbCrToRGB565Fast(aData.mPictureRect.x,
+ aData.mPictureRect.y,
+ aData.mPictureRect.width,
+ aData.mPictureRect.height,
+ aSuggestedSize.width,
+ aSuggestedSize.height,
+ yuvtype,
+ FILTER_BILINEAR) &&
+ IsConvertYCbCrToRGB565Fast(aData.mPictureRect.x,
+ aData.mPictureRect.y,
+ aData.mPictureRect.width,
+ aData.mPictureRect.height,
+ yuvtype)) {
+ prescale = false;
+ }
+#else
+ // yuv2rgb16 function not available
+ aSuggestedFormat = SurfaceFormat::B8G8R8X8;
+#endif
+ }
+ else if (aSuggestedFormat != SurfaceFormat::B8G8R8X8) {
+ // No other formats are currently supported.
+ aSuggestedFormat = SurfaceFormat::B8G8R8X8;
+ }
+ // Not an else-branch: the RGB565 fallback above may have just switched the
+ // format to B8G8R8X8, so this check has to run afterwards.
+ if (aSuggestedFormat == SurfaceFormat::B8G8R8X8) {
+ /* ScaleYCbCrToRGB32 does not support a picture offset, nor 4:4:4 data.
+ See bugs 639415 and 640073. */
+ if (aData.mPictureRect.TopLeft() != IntPoint(0, 0) || yuvtype == YV24)
+ prescale = false;
+ }
+ if (!prescale) {
+ aSuggestedSize = aData.mPictureRect.Size();
+ }
+}
+
+// Narrows a plane of 16-bit samples to 8-bit samples via
+// libyuv::Convert16To8Plane.
+//
+// aDst/aStride describe the 8-bit destination plane; aSrc/aStride16 describe
+// the 16-bit source plane (callers in this file pass the source byte stride
+// divided by two). aBitDepth must be 10, 12 or 16; any other value asserts and
+// leaves the destination untouched.
+static inline void
+ConvertYCbCr16to8Line(uint8_t* aDst,
+                      int aStride,
+                      const uint16_t* aSrc,
+                      int aStride16,
+                      int aWidth,
+                      int aHeight,
+                      int aBitDepth)
+{
+  // These scale factors come from the comment on libyuv's Convert16To8Row_C.
+  int factor = 0;
+  if (aBitDepth == 10) {
+    factor = 16384;
+  } else if (aBitDepth == 12) {
+    factor = 4096;
+  } else if (aBitDepth == 16) {
+    factor = 256;
+  } else {
+    MOZ_ASSERT_UNREACHABLE("invalid bit depth value");
+    return;
+  }
+
+  libyuv::Convert16To8Plane(aSrc, aStride16, aDst, aStride, factor, aWidth, aHeight);
+}
+
+// Converts |aData| to RGB in |aDestBuffer| (row pitch |aStride|).
+//
+// Input that is not 8 bits per sample is first narrowed into temporary 8-bit
+// planes. The data is then run through the scaling converters when aDestSize
+// differs from the picture size, or through the plain converters otherwise.
+// No big-endian swizzle is performed here; the public wrappers in this file
+// do that after calling this function.
+void
+ConvertYCbCrToRGBInternal(const layers::PlanarYCbCrData& aData,
+ const SurfaceFormat& aDestFormat,
+ const IntSize& aDestSize,
+ unsigned char* aDestBuffer,
+ int32_t aStride)
+{
+ // ConvertYCbCrToRGB et al. assume the chroma planes are rounded up if the
+ // luma plane is odd sized. Monochrome images have 0-sized CbCr planes
+ YUVType yuvtype = GetYUVType(aData);
+
+ // Used if converting to 8 bits YUV.
+ UniquePtr<uint8_t[]> yChannel;
+ UniquePtr<uint8_t[]> cbChannel;
+ UniquePtr<uint8_t[]> crChannel;
+ layers::PlanarYCbCrData dstData;
+ // srcData is what the converters below read: the input itself when it is
+ // already 8-bit, otherwise the locally filled dstData.
+ const layers::PlanarYCbCrData& srcData =
+ aData.mColorDepth == ColorDepth::COLOR_8 ? aData : dstData;
+
+ if (aData.mColorDepth != ColorDepth::COLOR_8) {
+ // Convert to 8 bits data first.
+ dstData.mPictureRect = aData.mPictureRect;
+ // We align the destination stride to 32 bytes, so that libyuv can use
+ // SSE optimised code.
+ auto ySize = aData.YDataSize();
+ auto cbcrSize = aData.CbCrDataSize();
+ dstData.mYStride = (ySize.width + 31) & ~31;
+ dstData.mCbCrStride = (cbcrSize.width + 31) & ~31;
+ dstData.mYUVColorSpace = aData.mYUVColorSpace;
+ dstData.mColorDepth = ColorDepth::COLOR_8;
+ dstData.mColorRange = aData.mColorRange;
+ dstData.mChromaSubsampling = aData.mChromaSubsampling;
+
+ size_t yMemorySize = GetAlignedStride<1>(dstData.mYStride, ySize.height);
+ size_t cbcrMemorySize =
+ GetAlignedStride<1>(dstData.mCbCrStride, cbcrSize.height);
+ // A zero luma size means there is nothing to convert; aDestBuffer is left
+ // untouched in that case.
+ if (yMemorySize == 0) {
+ MOZ_DIAGNOSTIC_ASSERT(cbcrMemorySize == 0, "CbCr without Y makes no sense");
+ return;
+ }
+ yChannel = MakeUnique<uint8_t[]>(yMemorySize);
+
+ dstData.mYChannel = yChannel.get();
+
+ int bitDepth = BitDepthForColorDepth(aData.mColorDepth);
+
+ // The source strides are halved before being handed to the 16-bit converter.
+ ConvertYCbCr16to8Line(dstData.mYChannel,
+ dstData.mYStride,
+ reinterpret_cast<uint16_t*>(aData.mYChannel),
+ aData.mYStride / 2,
+ ySize.width,
+ ySize.height,
+ bitDepth);
+
+ // Chroma planes are only allocated and converted when they are non-empty.
+ if (cbcrMemorySize) {
+ cbChannel = MakeUnique<uint8_t[]>(cbcrMemorySize);
+ crChannel = MakeUnique<uint8_t[]>(cbcrMemorySize);
+
+ dstData.mCbChannel = cbChannel.get();
+ dstData.mCrChannel = crChannel.get();
+
+ ConvertYCbCr16to8Line(dstData.mCbChannel,
+ dstData.mCbCrStride,
+ reinterpret_cast<uint16_t*>(aData.mCbChannel),
+ aData.mCbCrStride / 2,
+ cbcrSize.width,
+ cbcrSize.height,
+ bitDepth);
+
+ ConvertYCbCr16to8Line(dstData.mCrChannel,
+ dstData.mCbCrStride,
+ reinterpret_cast<uint16_t*>(aData.mCrChannel),
+ aData.mCbCrStride / 2,
+ cbcrSize.width,
+ cbcrSize.height,
+ bitDepth);
+ }
+ }
+
+ // Convert from YCbCr to RGB now, scaling the image if needed.
+ if (aDestSize != srcData.mPictureRect.Size()) {
+#if defined(HAVE_YCBCR_TO_RGB565)
+ if (aDestFormat == SurfaceFormat::R5G6B5_UINT16) {
+ ScaleYCbCrToRGB565(srcData.mYChannel,
+ srcData.mCbChannel,
+ srcData.mCrChannel,
+ aDestBuffer,
+ srcData.mPictureRect.x,
+ srcData.mPictureRect.y,
+ srcData.mPictureRect.width,
+ srcData.mPictureRect.height,
+ aDestSize.width,
+ aDestSize.height,
+ srcData.mYStride,
+ srcData.mCbCrStride,
+ aStride,
+ yuvtype,
+ FILTER_BILINEAR);
+ } else
+#endif
+ // With HAVE_YCBCR_TO_RGB565 defined this call is the else-branch of the
+ // RGB565 test above; without it the call is unconditional. Note that the
+ // picture x/y offset and the color range are not passed to this call.
+ ScaleYCbCrToRGB32(srcData.mYChannel, //
+ srcData.mCbChannel,
+ srcData.mCrChannel,
+ aDestBuffer,
+ srcData.mPictureRect.width,
+ srcData.mPictureRect.height,
+ aDestSize.width,
+ aDestSize.height,
+ srcData.mYStride,
+ srcData.mCbCrStride,
+ aStride,
+ yuvtype,
+ srcData.mYUVColorSpace,
+ FILTER_BILINEAR);
+ } else { // no prescale
+#if defined(HAVE_YCBCR_TO_RGB565)
+ if (aDestFormat == SurfaceFormat::R5G6B5_UINT16) {
+ ConvertYCbCrToRGB565(srcData.mYChannel,
+ srcData.mCbChannel,
+ srcData.mCrChannel,
+ aDestBuffer,
+ srcData.mPictureRect.x,
+ srcData.mPictureRect.y,
+ srcData.mPictureRect.width,
+ srcData.mPictureRect.height,
+ srcData.mYStride,
+ srcData.mCbCrStride,
+ aStride,
+ yuvtype);
+ } else // aDestFormat != SurfaceFormat::R5G6B5_UINT16
+#endif
+ // Same preprocessor-split if/else arrangement as in the scaling branch.
+ ConvertYCbCrToRGB32(srcData.mYChannel, //
+ srcData.mCbChannel,
+ srcData.mCrChannel,
+ aDestBuffer,
+ srcData.mPictureRect.x,
+ srcData.mPictureRect.y,
+ srcData.mPictureRect.width,
+ srcData.mPictureRect.height,
+ srcData.mYStride,
+ srcData.mCbCrStride,
+ aStride,
+ yuvtype,
+ srcData.mYUVColorSpace,
+ srcData.mColorRange);
+ }
+}
+
+// Public entry point for YCbCr -> RGB conversion. Delegates the conversion to
+// ConvertYCbCrToRGBInternal and, on big-endian builds, reorders 32-bit output
+// in place from X8R8G8B8 to B8G8R8X8. RGB565 output is never reordered.
+void ConvertYCbCrToRGB(const layers::PlanarYCbCrData& aData,
+                       const SurfaceFormat& aDestFormat,
+                       const IntSize& aDestSize, unsigned char* aDestBuffer,
+                       int32_t aStride) {
+  ConvertYCbCrToRGBInternal(aData, aDestFormat, aDestSize, aDestBuffer,
+                            aStride);
+#if MOZ_BIG_ENDIAN()
+  if (aDestFormat == SurfaceFormat::R5G6B5_UINT16) {
+    return;
+  }
+  // libyuv makes endian-correct result, which needs to be swapped to BGRX
+  gfx::SwizzleData(aDestBuffer, aStride, gfx::SurfaceFormat::X8R8G8B8,
+                   aDestBuffer, aStride, gfx::SurfaceFormat::B8G8R8X8,
+                   aDestSize);
+#endif
+}
+
+// Copies an 8-bit alpha plane into the alpha byte of every pixel of a packed
+// B8G8R8A8 buffer.
+//
+// aAlpha/aAlphaStride: source alpha plane and its row pitch in bytes.
+// aBuffer: destination pixels; rows are taken to be exactly
+//   aWidth * BytesPerPixel(aFormat) bytes apart (no row padding).
+// aWidth/aHeight: number of pixels per row and number of rows to fill.
+void FillAlphaToRGBA(const uint8_t* aAlpha, const int32_t aAlphaStride,
+                     uint8_t* aBuffer, const int32_t aWidth,
+                     const int32_t aHeight, const gfx::SurfaceFormat& aFormat) {
+  MOZ_ASSERT(aAlphaStride >= aWidth);
+  MOZ_ASSERT(aFormat ==
+             SurfaceFormat::B8G8R8A8);  // required for SurfaceFormatBit::OS_A
+
+  const int pixelBytes = BytesPerPixel(aFormat);
+  const size_t destRowBytes = aWidth * pixelBytes;
+  // Byte position of the alpha component inside one pixel.
+  const size_t alphaByte = static_cast<size_t>(SurfaceFormatBit::OS_A) / 8;
+
+  const uint8_t* alphaRow = aAlpha;
+  uint8_t* destRow = aBuffer;
+  for (int32_t row = 0; row < aHeight; ++row) {
+    size_t destIndex = alphaByte;
+    for (int32_t col = 0; col < aWidth; ++col) {
+      destRow[destIndex] = alphaRow[col];
+      destIndex += pixelBytes;
+    }
+    alphaRow += aAlphaStride;
+    destRow += destRowBytes;
+  }
+}
+
+// Converts planar YCbCr plus a separate alpha plane into B8G8R8A8 pixels.
+//
+// aYCbCr: colour planes; non-8-bit input is narrowed to 8 bits internally.
+// aAlpha: alpha plane; its size must equal aYCbCr.YDataSize(). Its row pitch
+//   is taken from aYCbCr.mYStride.
+// aDestFormat: must be SurfaceFormat::B8G8R8A8.
+// aDestSize: forwarded to the colour conversion.
+//   NOTE(review): alpha is written at picture-rect size, so this presumably
+//   expects aDestSize to equal the picture size - confirm with callers.
+// aDestBuffer/aStride: destination pixels and their row pitch in bytes.
+// premultiplyAlphaOp: optional in-place premultiply step run after alpha has
+//   been written; may be null.
+void ConvertYCbCrAToARGB(const layers::PlanarYCbCrData& aYCbCr,
+                         const layers::PlanarAlphaData& aAlpha,
+                         const SurfaceFormat& aDestFormat,
+                         const IntSize& aDestSize, unsigned char* aDestBuffer,
+                         int32_t aStride, PremultFunc premultiplyAlphaOp) {
+  // libyuv makes endian-correct result, so the format needs to be B8G8R8A8.
+  MOZ_ASSERT(aDestFormat == SurfaceFormat::B8G8R8A8);
+  MOZ_ASSERT(aAlpha.mSize == aYCbCr.YDataSize());
+
+  // libyuv has libyuv::I420AlphaToARGB, but lacks support for 422 and 444.
+  // Until that's added, we'll rely on our own code to handle this more
+  // generally, rather than have a special case and more redundant code.
+
+  UniquePtr<uint8_t[]> alphaChannel;
+  int32_t alphaStride8bpp = 0;
+  uint8_t* alphaChannel8bpp = nullptr;
+
+  // This function converts non-8-bpc images to 8-bpc. (Bug 1682322)
+  ConvertYCbCrToRGBInternal(aYCbCr, aDestFormat, aDestSize, aDestBuffer,
+                            aStride);
+
+  if (aYCbCr.mColorDepth != ColorDepth::COLOR_8) {
+    // These two lines are borrowed from ConvertYCbCrToRGBInternal, since
+    // there's not a very elegant way of sharing the logic that I can see
+    alphaStride8bpp = (aAlpha.mSize.width + 31) & ~31;
+    size_t alphaSize =
+        GetAlignedStride<1>(alphaStride8bpp, aAlpha.mSize.height);
+
+    alphaChannel = MakeUnique<uint8_t[]>(alphaSize);
+
+    ConvertYCbCr16to8Line(alphaChannel.get(), alphaStride8bpp,
+                          reinterpret_cast<uint16_t*>(aAlpha.mChannel),
+                          aYCbCr.mYStride / 2, aAlpha.mSize.width,
+                          aAlpha.mSize.height,
+                          BitDepthForColorDepth(aYCbCr.mColorDepth));
+
+    alphaChannel8bpp = alphaChannel.get();
+  } else {
+    alphaStride8bpp = aYCbCr.mYStride;
+    alphaChannel8bpp = aAlpha.mChannel;
+  }
+
+  MOZ_ASSERT(alphaStride8bpp != 0);
+  MOZ_ASSERT(alphaChannel8bpp);
+
+  // FillAlphaToRGBA treats the destination as tightly packed
+  // (width * bytes-per-pixel per row), whereas every other step operating on
+  // aDestBuffer uses aStride. Fill one row per call and advance the
+  // destination by aStride so alpha lands in the right pixels even when the
+  // destination rows are padded.
+  const int32_t pictureWidth = aYCbCr.mPictureRect.width;
+  const int32_t pictureHeight = aYCbCr.mPictureRect.height;
+  const uint8_t* alphaRow = alphaChannel8bpp;
+  uint8_t* destRow = aDestBuffer;
+  for (int32_t row = 0; row < pictureHeight; ++row) {
+    FillAlphaToRGBA(alphaRow, alphaStride8bpp, destRow, pictureWidth, 1,
+                    aDestFormat);
+    alphaRow += alphaStride8bpp;
+    destRow += aStride;
+  }
+
+  if (premultiplyAlphaOp) {
+    DebugOnly<int> err =
+        premultiplyAlphaOp(aDestBuffer, aStride, aDestBuffer, aStride,
+                           aYCbCr.mPictureRect.width, aYCbCr.mPictureRect.height);
+    MOZ_ASSERT(!err);
+  }
+
+#if MOZ_BIG_ENDIAN()
+  // libyuv makes endian-correct result, which needs to be swapped to BGRA
+  gfx::SwizzleData(aDestBuffer, aStride, gfx::SurfaceFormat::A8R8G8B8,
+                   aDestBuffer, aStride, gfx::SurfaceFormat::B8G8R8A8,
+                   aYCbCr.mPictureRect.Size());
+#endif
+}
+
+// Converts I420 planes plus an alpha plane to ARGB by forwarding to
+// ConvertI420AlphaToARGB32; on big-endian builds the result is then reordered
+// in place from A8R8G8B8 to B8G8R8A8.
+//
+// aSrcStrideYA is passed as the single stride for both the Y and the alpha
+// plane, aSrcStrideUV as the single stride for both chroma planes.
+void
+ConvertI420AlphaToARGB(const uint8_t* aSrcY,
+ const uint8_t* aSrcU,
+ const uint8_t* aSrcV,
+ const uint8_t* aSrcA,
+ int aSrcStrideYA, int aSrcStrideUV,
+ uint8_t* aDstARGB, int aDstStrideARGB,
+ int aWidth, int aHeight) {
+
+ // Note the argument order: destination, then dimensions, then the strides.
+ ConvertI420AlphaToARGB32(aSrcY,
+ aSrcU,
+ aSrcV,
+ aSrcA,
+ aDstARGB,
+ aWidth,
+ aHeight,
+ aSrcStrideYA,
+ aSrcStrideUV,
+ aDstStrideARGB);
+#if MOZ_BIG_ENDIAN()
+ // libyuv makes endian-correct result, which needs to be swapped to BGRA
+ gfx::SwizzleData(aDstARGB, aDstStrideARGB, gfx::SurfaceFormat::A8R8G8B8,
+ aDstARGB, aDstStrideARGB, gfx::SurfaceFormat::B8G8R8A8,
+ IntSize(aWidth, aHeight));
+#endif
+}
+
+} // namespace gfx
+} // namespace mozilla
diff --git a/gfx/ycbcr/YCbCrUtils.h b/gfx/ycbcr/YCbCrUtils.h
new file mode 100644
index 0000000000..b63e4dabe9
--- /dev/null
+++ b/gfx/ycbcr/YCbCrUtils.h
@@ -0,0 +1,49 @@
+/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*-
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef Y_CB_CR_UTILS_H_
+#define Y_CB_CR_UTILS_H_
+
+#include "mozilla/gfx/Types.h"
+#include "ImageContainer.h"
+
+namespace mozilla {
+namespace gfx {
+
+// Adjusts aSuggestedFormat and aSuggestedSize (both in/out) to a destination
+// format and size that the conversion functions can produce for aData.
+void
+GetYCbCrToRGBDestFormatAndSize(const layers::PlanarYCbCrData& aData,
+ SurfaceFormat& aSuggestedFormat,
+ IntSize& aSuggestedSize);
+
+// Converts aData to RGB pixels of aDestFormat in aDestBuffer, whose rows are
+// aStride bytes apart. The image is scaled during conversion when aDestSize
+// differs from the picture size.
+void
+ConvertYCbCrToRGB(const layers::PlanarYCbCrData& aData,
+ const SurfaceFormat& aDestFormat,
+ const IntSize& aDestSize,
+ unsigned char* aDestBuffer,
+ int32_t aStride);
+
+// Signature of the optional premultiply step used by ConvertYCbCrAToARGB.
+// A non-zero return value is treated as an error (asserted in debug builds).
+using PremultFunc = int (*)(const uint8_t* src_argb, int src_stride_argb,
+ uint8_t* dst_argb, int dst_stride_argb, int width,
+ int height);
+
+// Converts aYCbCr to B8G8R8A8 pixels, writes aAlpha into the alpha component
+// and, if premultiplyAlphaOp is non-null, runs it in place on the result.
+void ConvertYCbCrAToARGB(const layers::PlanarYCbCrData& aYCbCr,
+ const layers::PlanarAlphaData& aAlpha,
+ const SurfaceFormat& aDestFormat,
+ const IntSize& aDestSize,
+ unsigned char* aDestBuffer,
+ int32_t aStride, PremultFunc premultiplyAlphaOp);
+
+// Converts I420 planes plus an alpha plane to ARGB. aSrcStrideYA is the
+// stride shared by the Y and alpha planes, aSrcStrideUV the stride shared by
+// the U and V planes.
+void
+ConvertI420AlphaToARGB(const uint8_t* aSrcY,
+ const uint8_t* aSrcU,
+ const uint8_t* aSrcV,
+ const uint8_t* aSrcA,
+ int aSrcStrideYA, int aSrcStrideUV,
+ uint8_t* aDstARGB, int aDstStrideARGB,
+ int aWidth, int aHeight);
+} // namespace gfx
+} // namespace mozilla
+
+#endif /* Y_CB_CR_UTILS_H_ */
diff --git a/gfx/ycbcr/chromium_types.h b/gfx/ycbcr/chromium_types.h
new file mode 100644
index 0000000000..13f92975b5
--- /dev/null
+++ b/gfx/ycbcr/chromium_types.h
@@ -0,0 +1,50 @@
+/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*-
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+#ifndef GFX_CHROMIUMTYPES_H
+#define GFX_CHROMIUMTYPES_H
+
+#include <stdint.h>
+
+#include "libyuv/basic_types.h"
+
+// From Chromium build_config.h:
+// Processor architecture detection. For more info on what's defined, see:
+// http://msdn.microsoft.com/en-us/library/b0084kay.aspx
+// http://www.agner.org/optimize/calling_conventions.pdf
+// or with gcc, run: "echo | gcc -E -dM -"
+#if defined(_M_X64) || defined(__x86_64__)
+#define ARCH_CPU_X86_FAMILY 1
+#define ARCH_CPU_X86_64 1
+#define ARCH_CPU_64_BITS 1
+#elif defined(_M_IX86) || defined(__i386__) || defined(__i386)
+#define ARCH_CPU_X86_FAMILY 1
+#define ARCH_CPU_X86_32 1
+#define ARCH_CPU_X86 1
+#define ARCH_CPU_32_BITS 1
+#elif defined(__ARMEL__)
+#define ARCH_CPU_ARM_FAMILY 1
+#define ARCH_CPU_ARMEL 1
+#define ARCH_CPU_32_BITS 1
+#elif defined(__ppc__) || defined(__powerpc) || defined(__PPC__)
+#define ARCH_CPU_PPC_FAMILY 1
+#define ARCH_CPU_PPC 1
+#define ARCH_CPU_32_BITS 1
+#elif defined(__sparcv9)
+// The 64-bit SPARC test must come before the generic __sparc test: toolchains
+// that define __sparcv9 typically define __sparc as well, so testing __sparc
+// first would classify 64-bit SPARC as 32-bit and leave this branch dead.
+#define ARCH_CPU_SPARC_FAMILY 1
+#define ARCH_CPU_SPARC 1
+#define ARCH_CPU_64_BITS 1
+#elif defined(__sparc)
+#define ARCH_CPU_SPARC_FAMILY 1
+#define ARCH_CPU_SPARC 1
+#define ARCH_CPU_32_BITS 1
+#elif defined(__aarch64__) || defined(_M_ARM64)
+#define ARCH_CPU_AARCH64_FAMILY 1
+#define ARCH_CPU_AARCH64 1
+#define ARCH_CPU_64_BITS 1
+#else
+#warning Please add support for your architecture in chromium_types.h
+#endif
+
+#endif // GFX_CHROMIUMTYPES_H
diff --git a/gfx/ycbcr/moz.build b/gfx/ycbcr/moz.build
new file mode 100644
index 0000000000..c643fbaf40
--- /dev/null
+++ b/gfx/ycbcr/moz.build
@@ -0,0 +1,66 @@
+# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
+# vim: set filetype=python:
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+EXPORTS += [
+ 'YCbCrUtils.h',
+]
+
+UNIFIED_SOURCES += [
+ 'scale_yuv_argb.cpp',
+ 'ycbcr_to_rgb565.cpp',
+ 'YCbCrUtils.cpp',
+ 'yuv_convert.cpp',
+ 'yuv_row_c.cpp',
+ 'yuv_row_table.cpp',
+]
+
+# x86/x86_64 only: add the SSE2 (and, where supported, MMX) converters, each
+# compiled with its own instruction-set flags.
+if CONFIG['INTEL_ARCHITECTURE']:
+ # These files use MMX and SSE2 intrinsics, so they need special compile flags
+ # on some compilers.
+ SOURCES += ['yuv_convert_sse2.cpp']
+ SOURCES['yuv_convert_sse2.cpp'].flags += CONFIG['SSE2_FLAGS']
+
+ # MSVC doesn't support MMX when targeting AMD64.
+ if CONFIG['CC_TYPE'] == 'clang-cl':
+ if CONFIG['CPU_ARCH'] == 'x86':
+ SOURCES += [
+ 'yuv_convert_mmx.cpp',
+ ]
+ else:
+ SOURCES += ['yuv_convert_mmx.cpp']
+ SOURCES['yuv_convert_mmx.cpp'].flags += CONFIG['MMX_FLAGS']
+
+# Pick the platform-specific row-conversion implementation.
+# NOTE(review): the CC_TYPE == 'clang-cl' test inside the parentheses below
+# repeats the enclosing condition, so it is always true there.
+if CONFIG['CC_TYPE'] == 'clang-cl':
+ if CONFIG['CPU_ARCH'] == 'x86_64' or \
+ (CONFIG['CPU_ARCH'] == 'x86' and CONFIG['CC_TYPE'] == 'clang-cl'):
+ SOURCES += [
+ 'yuv_row_win64.cpp',
+ ]
+ else:
+ SOURCES += [
+ 'yuv_row_win.cpp',
+ ]
+elif CONFIG['OS_ARCH'] in ('Linux', 'SunOS', 'Darwin', 'DragonFly',
+ 'FreeBSD', 'NetBSD', 'OpenBSD'):
+ SOURCES += [
+ 'yuv_row_posix.cpp',
+ ]
+else:
+ SOURCES += [
+ 'yuv_row_other.cpp',
+ ]
+
+# 32-bit ARM with NEON: add the assembly row routines and their C++ glue.
+if CONFIG['CPU_ARCH'] == 'arm' and CONFIG['HAVE_ARM_NEON']:
+ SOURCES += [
+ 'yuv_row_arm.s',
+ ]
+ SOURCES += [
+ 'yuv_convert_arm.cpp',
+ ]
+
+LOCAL_INCLUDES += ['/media/libyuv/libyuv/include']
+
+FINAL_LIBRARY = 'xul'
diff --git a/gfx/ycbcr/scale_yuv_argb.cpp b/gfx/ycbcr/scale_yuv_argb.cpp
new file mode 100644
index 0000000000..2a103fb61e
--- /dev/null
+++ b/gfx/ycbcr/scale_yuv_argb.cpp
@@ -0,0 +1,1132 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ * Copyright 2016 Mozilla Foundation
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "libyuv/convert_argb.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+#include "libyuv/video_common.h"
+
+#include "mozilla/gfx/Types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// The YUV to RGB conversion and scaling functions were implemented with
+// reference to scale_argb.cc.
+//
+// libyuv already has ScaleYUVToARGBBilinearUp(), but its implementation is not
+// complete yet. The implementations of the functions here are based on it.
+// First, ScaleYUVToARGBBilinearUp() was implemented by modifying libyuv's
+// version. Then all the other functions were implemented similarly.
+//
+// The relationship between the functions in yuv_convert.cpp and those in
+// scale_argb.cc is as follows:
+// - ScaleYUVToARGBDown2() <-- ScaleARGBDown2()
+// - ScaleYUVToARGBDownEven() <-- ScaleARGBDownEven()
+// - ScaleYUVToARGBBilinearDown() <-- ScaleARGBBilinearDown()
+// - ScaleYUVToARGBBilinearUp() <-- ScaleARGBBilinearUp() and ScaleYUVToARGBBilinearUp() in libyuv
+// - ScaleYUVToARGBSimple() <-- ScaleARGBSimple()
+// - ScaleYUVToARGB() <-- ScaleARGB() // Removed some function calls for simplicity.
+// - YUVToARGBScale() <-- ARGBScale()
+//
+// The calls to, and the selection of, InterpolateRow() and
+// ScaleARGBFilterCols() were kept as close to the originals as possible.
+//
+// The following changes were made to each scaling function.
+//
+// -[1] Allocate a YUV conversion buffer and use it as the source buffer for scaling.
+// Its usage is borrowed from libyuv's ScaleYUVToARGBBilinearUp().
+// -[2] Conversion from YUV to RGB was abstracted as YUVBuferIter,
+// in order to handle multiple YUV color formats.
+// -[3] Modified the scaling functions to handle the YUV conversion buffer and
+// to use YUVBuferIter.
+// -[4] The selection of color conversion functions in YUVBuferIter was borrowed from
+// I444ToARGBMatrix(), I422ToARGBMatrix() and I420ToARGBMatrix()
+
+typedef mozilla::gfx::YUVColorSpace YUVColorSpace;
+
+// State for walking a planar YUV image row by row and converting individual
+// rows to ARGB. The caller fills in the src_* description fields and then
+// calls YUVBuferIter_Init(), which sets every remaining member.
+struct YUVBuferIter {
+ // Source image description; set by the caller before YUVBuferIter_Init().
+ int src_width;
+ int src_height;
+ int src_stride_y;
+ int src_stride_u;
+ int src_stride_v;
+ const uint8_t* src_y;
+ const uint8_t* src_u;
+ const uint8_t* src_v;
+
+ // Set by YUVBuferIter_Init(): source layout, conversion constants, and the
+ // current row position (row index plus one pointer per plane).
+ uint32_t src_fourcc;
+ const struct YuvConstants* yuvconstants;
+ int y_index;
+ const uint8_t* src_row_y;
+ const uint8_t* src_row_u;
+ const uint8_t* src_row_v;
+
+ // Row converter chosen at init time for the source layout and CPU features.
+ void (*YUVToARGBRow)(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
+ // Layout-specific positioning: jump to an absolute row / step to the next row.
+ void (*MoveTo)(YUVBuferIter& iter, int y_index);
+ void (*MoveToNextRow)(YUVBuferIter& iter);
+};
+
+// Selects the I422 row-to-ARGB converter for |iter|. Starts from the portable
+// C routine and lets each SIMD variant that is both compiled in and reported
+// by TestCpuFlag() override the previous choice; the non-"Any" variant is used
+// only when iter.src_width meets its alignment requirement.
+// Reads iter.src_width (and, for DSPR2, the plane pointers and strides), so
+// those fields must be set before this is called.
+void YUVBuferIter_InitI422(YUVBuferIter& iter) {
+  iter.YUVToARGBRow = I422ToARGBRow_C;
+#if defined(HAS_I422TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    iter.YUVToARGBRow = I422ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(iter.src_width, 8)) {
+      iter.YUVToARGBRow = I422ToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    iter.YUVToARGBRow = I422ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(iter.src_width, 16)) {
+      iter.YUVToARGBRow = I422ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    iter.YUVToARGBRow = I422ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(iter.src_width, 8)) {
+      iter.YUVToARGBRow = I422ToARGBRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_DSPR2)
+  // The condition previously lacked its closing parenthesis, which made this
+  // section fail to compile whenever HAS_I422TOARGBROW_DSPR2 was defined.
+  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(iter.src_width, 4) &&
+      IS_ALIGNED(iter.src_y, 4) && IS_ALIGNED(iter.src_stride_y, 4) &&
+      IS_ALIGNED(iter.src_u, 2) && IS_ALIGNED(iter.src_stride_u, 2) &&
+      IS_ALIGNED(iter.src_v, 2) && IS_ALIGNED(iter.src_stride_v, 2)) {
+    // Always satisfy IS_ALIGNED(argb_cnv_row, 4) && IS_ALIGNED(argb_cnv_rowstride, 4)
+    iter.YUVToARGBRow = I422ToARGBRow_DSPR2;
+  }
+#endif
+}
+
+// Selects the I444 row-to-ARGB converter for |iter|: the portable C routine by
+// default, overridden by each SIMD variant that is compiled in and reported by
+// TestCpuFlag(). The exact-width variant is chosen when iter.src_width is a
+// multiple of the variant's block size, the "Any" variant otherwise.
+void YUVBuferIter_InitI444(YUVBuferIter& iter) {
+  iter.YUVToARGBRow = I444ToARGBRow_C;
+#if defined(HAS_I444TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    iter.YUVToARGBRow = IS_ALIGNED(iter.src_width, 8)
+                            ? I444ToARGBRow_SSSE3
+                            : I444ToARGBRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_I444TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    iter.YUVToARGBRow = IS_ALIGNED(iter.src_width, 16)
+                            ? I444ToARGBRow_AVX2
+                            : I444ToARGBRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_I444TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    iter.YUVToARGBRow = IS_ALIGNED(iter.src_width, 8)
+                            ? I444ToARGBRow_NEON
+                            : I444ToARGBRow_Any_NEON;
+  }
+#endif
+}
+
+
+// Positions |iter| on source row |y_index| for I444 data: every plane has one
+// row per image row, so all three row pointers use the same row number.
+static void YUVBuferIter_MoveToForI444(YUVBuferIter& iter, int y_index) {
+  const int y_offset = y_index * iter.src_stride_y;
+  const int u_offset = y_index * iter.src_stride_u;
+  const int v_offset = y_index * iter.src_stride_v;
+
+  iter.src_row_y = iter.src_y + y_offset;
+  iter.src_row_u = iter.src_u + u_offset;
+  iter.src_row_v = iter.src_v + v_offset;
+  iter.y_index = y_index;
+}
+
+// Steps |iter| down one source row for I444 data; all three planes advance by
+// their own stride.
+static void YUVBuferIter_MoveToNextRowForI444(YUVBuferIter& iter) {
+  ++iter.y_index;
+  iter.src_row_v += iter.src_stride_v;
+  iter.src_row_u += iter.src_stride_u;
+  iter.src_row_y += iter.src_stride_y;
+}
+
+// Positions |iter| on source row |y_index| for I422 data. The chroma planes
+// are indexed with the same row number as the luma plane.
+static void YUVBuferIter_MoveToForI422(YUVBuferIter& iter, int y_index) {
+  const int y_offset = y_index * iter.src_stride_y;
+  const int u_offset = y_index * iter.src_stride_u;
+  const int v_offset = y_index * iter.src_stride_v;
+
+  iter.src_row_y = iter.src_y + y_offset;
+  iter.src_row_u = iter.src_u + u_offset;
+  iter.src_row_v = iter.src_v + v_offset;
+  iter.y_index = y_index;
+}
+
+// Steps |iter| down one source row for I422 data; luma and both chroma planes
+// advance on every row.
+static void YUVBuferIter_MoveToNextRowForI422(YUVBuferIter& iter) {
+  ++iter.y_index;
+  iter.src_row_v += iter.src_stride_v;
+  iter.src_row_u += iter.src_stride_u;
+  iter.src_row_y += iter.src_stride_y;
+}
+
+// Positions |iter| on source row |y_index| for I420 data. The chroma planes
+// are indexed with half the luma row number (y_index >> 1).
+static void YUVBuferIter_MoveToForI420(YUVBuferIter& iter, int y_index) {
+  // Shifting the luma row right by one gives the chroma row.
+  const int kChromaRowShift = 1;
+  const int chroma_row = y_index >> kChromaRowShift;
+
+  const int y_offset = y_index * iter.src_stride_y;
+  const int u_offset = chroma_row * iter.src_stride_u;
+  const int v_offset = chroma_row * iter.src_stride_v;
+
+  iter.src_row_y = iter.src_y + y_offset;
+  iter.src_row_u = iter.src_u + u_offset;
+  iter.src_row_v = iter.src_v + v_offset;
+  iter.y_index = y_index;
+}
+
+// Steps |iter| down one source row for I420 data. Luma always advances; the
+// chroma pointers advance only when the row being left has an odd index.
+static void YUVBuferIter_MoveToNextRowForI420(YUVBuferIter& iter) {
+  const bool leaving_odd_row = (iter.y_index & 1) != 0;
+
+  iter.src_row_y += iter.src_stride_y;
+  if (leaving_odd_row) {
+    iter.src_row_u += iter.src_stride_u;
+    iter.src_row_v += iter.src_stride_v;
+  }
+  ++iter.y_index;
+}
+
+// Converts the row |iter| currently points at into |argb_row| using the row
+// converter and constants chosen at init time; iter.src_width is passed as the
+// pixel count.
+static __inline void YUVBuferIter_ConvertToARGBRow(YUVBuferIter& iter, uint8_t* argb_row) {
+  const uint8_t* const y_row = iter.src_row_y;
+  const uint8_t* const u_row = iter.src_row_u;
+  const uint8_t* const v_row = iter.src_row_v;
+  iter.YUVToARGBRow(y_row, u_row, v_row, argb_row,
+                    iter.yuvconstants, iter.src_width);
+}
+
+// Finishes setting up |iter| after the caller has filled in its src_* fields:
+// rewinds to row 0, picks the conversion constants for |yuv_color_space|
+// (BT2020, BT709, otherwise the I601 constants) and installs the row converter
+// and row-stepping functions for |src_fourcc|.
+// Anything other than I444 or I422 is expected to be I420 (asserted). I420
+// uses the I422 row converter together with the I420 row-stepping functions.
+void YUVBuferIter_Init(YUVBuferIter& iter, uint32_t src_fourcc, YUVColorSpace yuv_color_space) {
+  iter.src_fourcc = src_fourcc;
+
+  // Start at the first row of every plane.
+  iter.y_index = 0;
+  iter.src_row_y = iter.src_y;
+  iter.src_row_u = iter.src_u;
+  iter.src_row_v = iter.src_v;
+
+  if (yuv_color_space == YUVColorSpace::BT2020) {
+    iter.yuvconstants = &kYuv2020Constants;
+  } else if (yuv_color_space == YUVColorSpace::BT709) {
+    iter.yuvconstants = &kYuvH709Constants;
+  } else {
+    iter.yuvconstants = &kYuvI601Constants;
+  }
+
+  if (src_fourcc == FOURCC_I444) {
+    YUVBuferIter_InitI444(iter);
+    iter.MoveTo = YUVBuferIter_MoveToForI444;
+    iter.MoveToNextRow = YUVBuferIter_MoveToNextRowForI444;
+    return;
+  }
+
+  if (src_fourcc == FOURCC_I422) {
+    YUVBuferIter_InitI422(iter);
+    iter.MoveTo = YUVBuferIter_MoveToForI422;
+    iter.MoveToNextRow = YUVBuferIter_MoveToNextRowForI422;
+    return;
+  }
+
+  assert(src_fourcc == FOURCC_I420); // Should be FOURCC_I420
+  YUVBuferIter_InitI422(iter);
+  iter.MoveTo = YUVBuferIter_MoveToForI420;
+  iter.MoveToNextRow = YUVBuferIter_MoveToNextRowForI420;
+}
+
+// Scales a planar YUV image down to 1/2 of its width while converting it to
+// ARGB (counterpart of ScaleARGBDown2).
+//
+// Source rows are converted to ARGB on demand into a two-row scratch buffer,
+// and the ScaleARGBRowDown2* row functions then read from that buffer.
+// x, dx, y and dy are 16.16 fixed-point values: dx must be exactly 2.0 and dy
+// a multiple of 2.0 (both asserted). src_fourcc and yuv_color_space are passed
+// to YUVBuferIter_Init() to choose the row converter and constants.
+static void ScaleYUVToARGBDown2(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride_y,
+ int src_stride_u,
+ int src_stride_v,
+ int dst_stride_argb,
+ const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ int x, int dx, int y, int dy,
+ enum FilterMode filtering,
+ uint32_t src_fourcc,
+ YUVColorSpace yuv_color_space) {
+ int j;
+
+ // Allocate 2 rows of ARGB for source conversion.
+ const int kRowSize = (src_width * 4 + 15) & ~15;
+ align_buffer_64(argb_cnv_row, kRowSize * 2);
+ // argb_cnv_rowptr is the "current" converted row; argb_cnv_rowstride is the
+ // byte offset from it to the "next" converted row and is what gets passed
+ // to the row function as its source stride. It can be +kRowSize, -kRowSize
+ // or 0 (no separate next row).
+ uint8_t* argb_cnv_rowptr = argb_cnv_row;
+ int argb_cnv_rowstride = kRowSize;
+
+ YUVBuferIter iter;
+ iter.src_width = src_width;
+ iter.src_height = src_height;
+ iter.src_stride_y = src_stride_y;
+ iter.src_stride_u = src_stride_u;
+ iter.src_stride_v = src_stride_v;
+ iter.src_y = src_y;
+ iter.src_u = src_u;
+ iter.src_v = src_v;
+ YUVBuferIter_Init(iter, src_fourcc, yuv_color_space);
+
+ // Portable row function for the requested filter; SIMD variants may
+ // replace it below.
+ void (*ScaleARGBRowDown2)(const uint8_t* src_argb, ptrdiff_t src_stride,
+ uint8_t* dst_argb, int dst_width) =
+ filtering == kFilterNone ? ScaleARGBRowDown2_C :
+ (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C :
+ ScaleARGBRowDown2Box_C);
+ assert(dx == 65536 * 2); // Test scale factor of 2.
+ assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2.
+ // Advance to odd row, even column.
+ int yi = y >> 16;
+ iter.MoveTo(iter, yi);
+ // Byte offset into a converted row (4 bytes per ARGB pixel); the
+ // non-bilinear modes start one pixel earlier.
+ ptrdiff_t x_offset;
+ if (filtering == kFilterBilinear) {
+ x_offset = (x >> 16) * 4;
+ } else {
+ x_offset = ((x >> 16) - 1) * 4;
+ }
+#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_SSE2 :
+ (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_SSE2 :
+ ScaleARGBRowDown2Box_Any_SSE2);
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 :
+ (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 :
+ ScaleARGBRowDown2Box_SSE2);
+ }
+ }
+
+#endif
+#if defined(HAS_SCALEARGBROWDOWN2_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_NEON :
+ (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_NEON :
+ ScaleARGBRowDown2Box_Any_NEON);
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_NEON :
+ (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_NEON :
+ ScaleARGBRowDown2Box_NEON);
+ }
+ }
+#endif
+
+ const int dyi = dy >> 16;
+ int lastyi = yi;
+ // Convert the first source row into the first scratch row.
+ YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr);
+ // Prepare next row if necessary
+ if (filtering != kFilterLinear) {
+ if ((yi + dyi) < (src_height - 1)) {
+ iter.MoveTo(iter, yi + dyi);
+ YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr + argb_cnv_rowstride);
+ } else {
+ // No further row to convert: a zero stride makes "next row" alias the
+ // current row.
+ argb_cnv_rowstride = 0;
+ }
+ }
+
+ // Linear filtering only ever uses one converted row.
+ if (filtering == kFilterLinear) {
+ argb_cnv_rowstride = 0;
+ }
+ const int max_yi = src_height - 1;
+ const int max_yi_minus_dyi = max_yi - dyi;
+ for (j = 0; j < dst_height; ++j) {
+ // Only re-convert when the source row changed since the last output row.
+ if (yi != lastyi) {
+ // Clamp to the last source row, then re-check against lastyi.
+ if (yi > max_yi) {
+ yi = max_yi;
+ }
+ if (yi != lastyi) {
+ if (filtering == kFilterLinear) {
+ iter.MoveTo(iter, yi);
+ YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr);
+ lastyi = yi;
+ } else {
+ // Prepare current row
+ if (yi == iter.y_index) {
+ // The wanted row is the one the iterator is positioned on, i.e. the
+ // row most recently converted as "next": make it the current row by
+ // stepping the pointer and flipping the stride's sign.
+ argb_cnv_rowptr = argb_cnv_rowptr + argb_cnv_rowstride;
+ argb_cnv_rowstride = - argb_cnv_rowstride;
+ } else {
+ // Otherwise start over from the first scratch row.
+ iter.MoveTo(iter, yi);
+ argb_cnv_rowptr = argb_cnv_row;
+ argb_cnv_rowstride = kRowSize;
+ YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr);
+ }
+ // Prepare next row if necessary
+ if (iter.y_index < max_yi) {
+ int next_yi = yi < max_yi_minus_dyi ? yi + dyi : max_yi;
+ iter.MoveTo(iter, next_yi);
+ YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr + argb_cnv_rowstride);
+ } else {
+ argb_cnv_rowstride = 0;
+ }
+ lastyi = yi;
+ }
+ }
+ }
+ // Produce one destination row from the converted row(s).
+ ScaleARGBRowDown2(argb_cnv_rowptr + x_offset, argb_cnv_rowstride, dst_argb, dst_width);
+ dst_argb += dst_stride_argb;
+ yi += dyi;
+ }
+
+ free_aligned_buffer_64(argb_cnv_row);
+}
+
+// Scale YUV to ARGB down by an even integer factor (4x, 6x, 8x, ...).
+//
+// Source rows are converted from YUV to ARGB on demand into a two-row scratch
+// buffer, and each destination row is produced from that buffer by a
+// ScaleARGBRowDownEven* row function.
+//
+// x, dx, y and dy are 16.16 fixed-point start positions and steps; only their
+// integer parts (>> 16) are used by this function.
+static void ScaleYUVToARGBDownEven(int src_width, int src_height,
+                                   int dst_width, int dst_height,
+                                   int src_stride_y,
+                                   int src_stride_u,
+                                   int src_stride_v,
+                                   int dst_stride_argb,
+                                   const uint8_t* src_y,
+                                   const uint8_t* src_u,
+                                   const uint8_t* src_v,
+                                   uint8_t* dst_argb,
+                                   int x, int dx, int y, int dy,
+                                   enum FilterMode filtering,
+                                   uint32_t src_fourcc,
+                                   YUVColorSpace yuv_color_space) {
+  int j;
+  // Allocate 2 rows of ARGB for source conversion. Each row is padded to a
+  // multiple of 16 bytes.
+  const int kRowSize = (src_width * 4 + 15) & ~15;
+  align_buffer_64(argb_cnv_row, kRowSize * 2);
+  uint8_t* argb_cnv_rowptr = argb_cnv_row;
+  int argb_cnv_rowstride = kRowSize;
+
+  // Number of source pixels consumed per destination pixel.
+  int col_step = dx >> 16;
+  void (*ScaleARGBRowDownEven)(const uint8_t* src_argb, ptrdiff_t src_stride,
+                               int src_step, uint8_t* dst_argb, int dst_width) =
+      filtering ? ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C;
+  assert(IS_ALIGNED(src_width, 2));
+  assert(IS_ALIGNED(src_height, 2));
+  int yi = y >> 16;
+  // Byte offset of the first sampled pixel within a converted ARGB row.
+  const ptrdiff_t x_offset = (x >> 16) * 4;
+
+#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2 :
+        ScaleARGBRowDownEven_Any_SSE2;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 :
+          ScaleARGBRowDownEven_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_NEON :
+        ScaleARGBRowDownEven_Any_NEON;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON :
+          ScaleARGBRowDownEven_NEON;
+    }
+  }
+#endif
+
+  YUVBuferIter iter;
+  iter.src_width = src_width;
+  iter.src_height = src_height;
+  iter.src_stride_y = src_stride_y;
+  iter.src_stride_u = src_stride_u;
+  iter.src_stride_v = src_stride_v;
+  iter.src_y = src_y;
+  iter.src_u = src_u;
+  iter.src_v = src_v;
+  YUVBuferIter_Init(iter, src_fourcc, yuv_color_space);
+  // Position the iterator on the first sampled source row, as the bilinear
+  // and simple scalers in this file do, so that the initial conversion below
+  // really produces row yi (it is cached as such via lastyi).
+  iter.MoveTo(iter, yi);
+
+  const int dyi = dy >> 16;
+  int lastyi = yi;
+  YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr);
+  // Prepare next row if necessary
+  if (filtering != kFilterLinear) {
+    if ((yi + dyi) < (src_height - 1)) {
+      iter.MoveTo(iter, yi + dyi);
+      YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr + argb_cnv_rowstride);
+    } else {
+      // No second row available: make both row pointers refer to the same row.
+      argb_cnv_rowstride = 0;
+    }
+  }
+
+  if (filtering == kFilterLinear) {
+    // Linear filtering only reads the current row.
+    argb_cnv_rowstride = 0;
+  }
+  const int max_yi = src_height - 1;
+  const int max_yi_minus_dyi = max_yi - dyi;
+  for (j = 0; j < dst_height; ++j) {
+    if (yi != lastyi) {
+      // Clamp to the last source row before deciding whether to reconvert.
+      if (yi > max_yi) {
+        yi = max_yi;
+      }
+      if (yi != lastyi) {
+        if (filtering == kFilterLinear) {
+          iter.MoveTo(iter, yi);
+          YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr);
+          lastyi = yi;
+        } else {
+          // Prepare current row
+          if (yi == iter.y_index) {
+            // The row converted last time as "next" is now the current row:
+            // swap the roles of the two scratch rows instead of reconverting.
+            argb_cnv_rowptr = argb_cnv_rowptr + argb_cnv_rowstride;
+            argb_cnv_rowstride = - argb_cnv_rowstride;
+          } else {
+            iter.MoveTo(iter, yi);
+            argb_cnv_rowptr = argb_cnv_row;
+            argb_cnv_rowstride = kRowSize;
+            YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr);
+          }
+          // Prepare next row if necessary
+          if (iter.y_index < max_yi) {
+            int next_yi = yi < max_yi_minus_dyi ? yi + dyi : max_yi;
+            iter.MoveTo(iter, next_yi);
+            YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr + argb_cnv_rowstride);
+          } else {
+            argb_cnv_rowstride = 0;
+          }
+          lastyi = yi;
+        }
+      }
+    }
+    ScaleARGBRowDownEven(argb_cnv_rowptr + x_offset, argb_cnv_rowstride, col_step, dst_argb, dst_width);
+    dst_argb += dst_stride_argb;
+    yi += dyi;
+  }
+  free_aligned_buffer_64(argb_cnv_row);
+}
+
+// Scale YUV to ARGB down with bilinear interpolation.
+//
+// Source rows are converted from YUV to ARGB on demand into a two-row scratch
+// buffer (argb_cnv_row). For each destination row the two scratch rows are
+// blended vertically by InterpolateRow into `row`, which is then resampled
+// horizontally by ScaleARGBFilterCols. With kFilterLinear the vertical blend
+// is skipped and the current scratch row is resampled directly.
+//
+// x, dx, y and dy are 16.16 fixed-point source positions and steps.
+static void ScaleYUVToARGBBilinearDown(int src_width, int src_height,
+                                       int dst_width, int dst_height,
+                                       int src_stride_y,
+                                       int src_stride_u,
+                                       int src_stride_v,
+                                       int dst_stride_argb,
+                                       const uint8_t* src_y,
+                                       const uint8_t* src_u,
+                                       const uint8_t* src_v,
+                                       uint8_t* dst_argb,
+                                       int x, int dx, int y, int dy,
+                                       enum FilterMode filtering,
+                                       uint32_t src_fourcc,
+                                       YUVColorSpace yuv_color_space) {
+  int j;
+  void (*InterpolateRow)(uint8_t* dst_argb, const uint8_t* src_argb,
+      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+      InterpolateRow_C;
+  void (*ScaleARGBFilterCols)(uint8_t* dst_argb, const uint8_t* src_argb,
+      int dst_width, int x, int dx) =
+      (src_width >= 32768) ? ScaleARGBFilterCols64_C : ScaleARGBFilterCols_C;
+  // Work out the horizontal span of source pixels that is actually sampled,
+  // so only that part of each row has to be interpolated.
+  int64_t xlast = x + (int64_t)(dst_width - 1) * dx;
+  int64_t xl = (dx >= 0) ? x : xlast;
+  int64_t xr = (dx >= 0) ? xlast : x;
+  int clip_src_width;
+  xl = (xl >> 16) & ~3;  // Left edge aligned.
+  xr = (xr >> 16) + 1;  // Right most pixel used.  Bilinear uses 2 pixels.
+  xr = (xr + 1 + 3) & ~3;  // 1 beyond 4 pixel aligned right most pixel.
+  if (xr > src_width) {
+    xr = src_width;
+  }
+  clip_src_width = (int)(xr - xl) * 4;  // Width aligned to 4.
+  const ptrdiff_t xl_offset = xl * 4;
+  // Make x relative to the clipped left edge.
+  x -= (int)(xl << 16);
+
+  // Allocate 2 row of ARGB for source conversion.
+  const int kRowSize = (src_width * 4 + 15) & ~15;
+  align_buffer_64(argb_cnv_row, kRowSize * 2);
+  uint8_t* argb_cnv_rowptr = argb_cnv_row;
+  int argb_cnv_rowstride = kRowSize;
+
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(clip_src_width, 16)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_AVX2;
+    if (IS_ALIGNED(clip_src_width, 32)) {
+      InterpolateRow = InterpolateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(clip_src_width, 16)) {
+      InterpolateRow = InterpolateRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_DSPR2)
+  // The rows handed to InterpolateRow come from the converted-row scratch
+  // buffer, so that is the pointer whose alignment matters here (this
+  // function has no src_argb).
+  if (TestCpuFlag(kCpuHasDSPR2) &&
+      IS_ALIGNED(argb_cnv_rowptr, 4) && IS_ALIGNED(argb_cnv_rowstride, 4)) {
+    InterpolateRow = InterpolateRow_Any_DSPR2;
+    if (IS_ALIGNED(clip_src_width, 4)) {
+      InterpolateRow = InterpolateRow_DSPR2;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
+  }
+#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
+    }
+  }
+#endif
+
+  int yi = y >> 16;
+
+  YUVBuferIter iter;
+  iter.src_width = src_width;
+  iter.src_height = src_height;
+  iter.src_stride_y = src_stride_y;
+  iter.src_stride_u = src_stride_u;
+  iter.src_stride_v = src_stride_v;
+  iter.src_y = src_y;
+  iter.src_u = src_u;
+  iter.src_v = src_v;
+  YUVBuferIter_Init(iter, src_fourcc, yuv_color_space);
+  iter.MoveTo(iter, yi);
+
+  // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
+  // Allocate a row of ARGB.
+  // NOTE(review): clip_src_width is already a byte count (pixels * 4), so
+  // this requests four times the bytes InterpolateRow writes below.
+  align_buffer_64(row, clip_src_width * 4);
+
+  int lastyi = yi;
+  YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr);
+  // Prepare next row if necessary
+  if (filtering != kFilterLinear) {
+    if ((yi + 1) < src_height) {
+      iter.MoveToNextRow(iter);
+      YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr + argb_cnv_rowstride);
+    } else {
+      // No following row: blend the current row with itself.
+      argb_cnv_rowstride = 0;
+    }
+  }
+
+  const int max_y = (src_height - 1) << 16;
+  const int max_yi = src_height - 1;
+  for (j = 0; j < dst_height; ++j) {
+    yi = y >> 16;
+    if (yi != lastyi) {
+      // Clamp the sample position to the last source row.
+      if (y > max_y) {
+        y = max_y;
+        yi = y >> 16;
+      }
+      if (yi != lastyi) {
+        if (filtering == kFilterLinear) {
+          iter.MoveTo(iter, yi);
+          YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr);
+          lastyi = yi;
+        } else {
+          // Prepare current row
+          if (yi == iter.y_index) {
+            // The previously converted "next" row becomes the current row:
+            // swap the two scratch rows instead of converting again.
+            argb_cnv_rowptr = argb_cnv_rowptr + argb_cnv_rowstride;
+            argb_cnv_rowstride = - argb_cnv_rowstride;
+          } else {
+            iter.MoveTo(iter, yi);
+            argb_cnv_rowptr = argb_cnv_row;
+            argb_cnv_rowstride = kRowSize;
+            YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr);
+          }
+          // Prepare next row if necessary
+          if (iter.y_index < max_yi) {
+            iter.MoveToNextRow(iter);
+            YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr + argb_cnv_rowstride);
+          } else {
+            argb_cnv_rowstride = 0;
+          }
+          lastyi = yi;
+        }
+      }
+    }
+    if (filtering == kFilterLinear) {
+      ScaleARGBFilterCols(dst_argb, argb_cnv_rowptr + xl_offset, dst_width, x, dx);
+    } else {
+      // Top 8 bits of the fractional part of y select the vertical blend.
+      int yf = (y >> 8) & 255;
+      InterpolateRow(row, argb_cnv_rowptr + xl_offset, argb_cnv_rowstride, clip_src_width, yf);
+      ScaleARGBFilterCols(dst_argb, row, dst_width, x, dx);
+    }
+    dst_argb += dst_stride_argb;
+    y += dy;
+  }
+  free_aligned_buffer_64(row);
+  free_aligned_buffer_64(argb_cnv_row);
+}
+
+/* Scale YUV to ARGB up with bilinear interpolation.
+ *
+ * Each needed source row is converted to ARGB (argb_cnv_row), resampled
+ * horizontally to dst_width by ScaleARGBFilterCols into one of two scratch
+ * rows (row), and the scratch rows are then combined by InterpolateRow
+ * directly into dst_argb.
+ * x, dx, y and dy are 16.16 fixed-point source positions and steps.
+ * With kFilterLinear only one scratch row is used (rowstride is 0). */
+static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride_y,
+ int src_stride_u,
+ int src_stride_v,
+ int dst_stride_argb,
+ const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ int x, int dx, int y, int dy,
+ enum FilterMode filtering,
+ uint32_t src_fourcc,
+ YUVColorSpace yuv_color_space) {
+ int j;
+ void (*InterpolateRow)(uint8_t* dst_argb, const uint8_t* src_argb,
+ ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+ InterpolateRow_C;
+ void (*ScaleARGBFilterCols)(uint8_t* dst_argb, const uint8_t* src_argb,
+ int dst_width, int x, int dx) =
+ filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
+ const int max_y = (src_height - 1) << 16; // Last source row as 16.16.
+
+ // Allocate 1 row of ARGB for source conversion.
+ align_buffer_64(argb_cnv_row, src_width * 4);
+
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(dst_width, 4)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(dst_width, 8)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) &&
+ IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+ InterpolateRow = InterpolateRow_DSPR2;
+ }
+#endif
+ if (src_width >= 32768) { // Wide sources use the 64-bit column scalers.
+ ScaleARGBFilterCols = filtering ?
+ ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
+ }
+#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
+ if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
+ if (filtering && TestCpuFlag(kCpuHasNEON)) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEARGBCOLS_SSE2)
+ if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
+ ScaleARGBFilterCols = ScaleARGBCols_SSE2;
+ }
+#endif
+#if defined(HAS_SCALEARGBCOLS_NEON)
+ if (!filtering && TestCpuFlag(kCpuHasNEON)) {
+ ScaleARGBFilterCols = ScaleARGBCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleARGBFilterCols = ScaleARGBCols_NEON;
+ }
+ }
+#endif
+ if (!filtering && src_width * 2 == dst_width && x < 0x8000) { // Unfiltered exact 2x widening.
+ ScaleARGBFilterCols = ScaleARGBColsUp2_C;
+#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+ ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
+ }
+#endif
+ }
+
+ if (y > max_y) { // Clamp the start position to the last source row.
+ y = max_y;
+ }
+
+ int yi = y >> 16; // Integer source row of the current sample position.
+
+ YUVBuferIter iter;
+ iter.src_width = src_width;
+ iter.src_height = src_height;
+ iter.src_stride_y = src_stride_y;
+ iter.src_stride_u = src_stride_u;
+ iter.src_stride_v = src_stride_v;
+ iter.src_y = src_y;
+ iter.src_u = src_u;
+ iter.src_v = src_v;
+ YUVBuferIter_Init(iter, src_fourcc, yuv_color_space);
+ iter.MoveTo(iter, yi);
+
+ // Allocate 2 rows of ARGB, each padded to a multiple of 16 bytes.
+ const int kRowSize = (dst_width * 4 + 15) & ~15;
+ align_buffer_64(row, kRowSize * 2);
+
+ uint8_t* rowptr = row; // Scratch row holding source row yi at dst_width.
+ int rowstride = kRowSize; // Offset from rowptr to the other scratch row (may be negative or 0).
+ int lastyi = yi;
+
+ YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_row);
+ ScaleARGBFilterCols(rowptr, argb_cnv_row, dst_width, x, dx);
+
+ if (filtering == kFilterLinear) {
+ rowstride = 0;
+ }
+ // Prepare next row if necessary
+ if (filtering != kFilterLinear) {
+ if ((yi + 1) < src_height) {
+ iter.MoveToNextRow(iter);
+ YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_row);
+ ScaleARGBFilterCols(rowptr + rowstride, argb_cnv_row, dst_width, x, dx);
+ }else {
+ rowstride = 0; // No following source row: second row aliases the first.
+ }
+ }
+
+ const int max_yi = src_height - 1;
+ for (j = 0; j < dst_height; ++j) {
+ yi = y >> 16;
+ if (yi != lastyi) {
+ if (y > max_y) { // Clamp to the last source row before refreshing the scratch rows.
+ y = max_y;
+ yi = y >> 16;
+ }
+ if (yi != lastyi) {
+ if (filtering == kFilterLinear) {
+ iter.MoveToNextRow(iter);
+ YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_row);
+ ScaleARGBFilterCols(rowptr, argb_cnv_row, dst_width, x, dx);
+ } else {
+ // Prepare next row if necessary
+ if (yi < max_yi) {
+ iter.MoveToNextRow(iter);
+ rowptr += rowstride; // The former "next" scratch row becomes the current one...
+ rowstride = -rowstride; // ...and the former current row is reused for the new next row.
+ // TODO(fbarchard): Convert the clipped region of row.
+ YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_row);
+ ScaleARGBFilterCols(rowptr + rowstride, argb_cnv_row, dst_width, x, dx);
+ } else {
+ rowstride = 0;
+ }
+ }
+ lastyi = yi;
+ }
+ }
+ if (filtering == kFilterLinear) {
+ InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0); // Stride 0 and fraction 0: no vertical blend.
+ } else {
+ int yf = (y >> 8) & 255; // Top 8 bits of the fractional part of y.
+ InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);
+ }
+ dst_argb += dst_stride_argb;
+ y += dy;
+ }
+ free_aligned_buffer_64(row);
+ free_aligned_buffer_64(argb_cnv_row);
+}
+
+// Scale YUV to ARGB to/from any dimensions by point sampling (no
+// interpolation).
+// All positions are 16.16 fixed point: the upper 16 bits of x, dx, y and dy
+// are the integer source coordinate, the lower 16 bits the fraction.
+
+static void ScaleYUVToARGBSimple(int src_width, int src_height,
+                                 int dst_width, int dst_height,
+                                 int src_stride_y,
+                                 int src_stride_u,
+                                 int src_stride_v,
+                                 int dst_stride_argb,
+                                 const uint8_t* src_y,
+                                 const uint8_t* src_u,
+                                 const uint8_t* src_v,
+                                 uint8_t* dst_argb,
+                                 int x, int dx, int y, int dy,
+                                 uint32_t src_fourcc,
+                                 YUVColorSpace yuv_color_space) {
+  // Column sampler. Start from the portable C version; wide sources need the
+  // 64-bit variant.
+  void (*sample_cols)(uint8_t* dst_argb, const uint8_t* src_argb,
+                      int dst_width, int x, int dx) = ScaleARGBCols_C;
+  if (src_width >= 32768) {
+    sample_cols = ScaleARGBCols64_C;
+  }
+
+  // One scratch row of ARGB receives each converted source row.
+  align_buffer_64(argb_cnv_row, src_width * 4);
+
+#if defined(HAS_SCALEARGBCOLS_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
+    sample_cols = ScaleARGBCols_SSE2;
+  }
+#endif
+#if defined(HAS_SCALEARGBCOLS_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    sample_cols = IS_ALIGNED(dst_width, 8) ? ScaleARGBCols_NEON
+                                           : ScaleARGBCols_Any_NEON;
+  }
+#endif
+  // Exact 2x widening has dedicated samplers.
+  if (src_width * 2 == dst_width && x < 0x8000) {
+    sample_cols = ScaleARGBColsUp2_C;
+#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+      sample_cols = ScaleARGBColsUp2_SSE2;
+    }
+#endif
+  }
+
+  // Index of the source row currently held in argb_cnv_row.
+  int converted_yi = y >> 16;
+
+  YUVBuferIter iter;
+  iter.src_y = src_y;
+  iter.src_u = src_u;
+  iter.src_v = src_v;
+  iter.src_stride_y = src_stride_y;
+  iter.src_stride_u = src_stride_u;
+  iter.src_stride_v = src_stride_v;
+  iter.src_width = src_width;
+  iter.src_height = src_height;
+  YUVBuferIter_Init(iter, src_fourcc, yuv_color_space);
+  iter.MoveTo(iter, converted_yi);
+  YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_row);
+
+  for (int out_row = 0; out_row < dst_height; ++out_row) {
+    const int wanted_yi = y >> 16;
+    // Convert a new source row only when the sample position has moved on.
+    if (wanted_yi != converted_yi) {
+      iter.MoveTo(iter, wanted_yi);
+      YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_row);
+      converted_yi = wanted_yi;
+    }
+    sample_cols(dst_argb, argb_cnv_row, dst_width, x, dx);
+    y += dy;
+    dst_argb += dst_stride_argb;
+  }
+  free_aligned_buffer_64(argb_cnv_row);
+}
+
+// Converts dst_height rows of YUV to ARGB without any scaling, writing one
+// converted row per destination row.
+static void YUVToARGBCopy(const uint8_t* src_y, int src_stride_y,
+                          const uint8_t* src_u, int src_stride_u,
+                          const uint8_t* src_v, int src_stride_v,
+                          int src_width, int src_height,
+                          uint8_t* dst_argb, int dst_stride_argb,
+                          int dst_width, int dst_height,
+                          uint32_t src_fourcc,
+                          YUVColorSpace yuv_color_space)
+{
+  YUVBuferIter iter;
+  iter.src_y = src_y;
+  iter.src_u = src_u;
+  iter.src_v = src_v;
+  iter.src_stride_y = src_stride_y;
+  iter.src_stride_u = src_stride_u;
+  iter.src_stride_v = src_stride_v;
+  iter.src_width = src_width;
+  iter.src_height = src_height;
+  YUVBuferIter_Init(iter, src_fourcc, yuv_color_space);
+
+  // Convert the row under the iterator, then advance both source and
+  // destination by one row.
+  for (int rows_left = dst_height; rows_left > 0; --rows_left) {
+    YUVBuferIter_ConvertToARGBRow(iter, dst_argb);
+    iter.MoveToNextRow(iter);
+    dst_argb += dst_stride_argb;
+  }
+}
+
+// Picks the scaling strategy for a YUV -> ARGB conversion and runs it.
+//
+// The filter is first simplified by ScaleFilterReduce and the 16.16 start
+// positions/steps are computed by ScaleSlope. Integer step values select the
+// specialised paths (1/2 downsample, even downsample, straight copy);
+// otherwise the bilinear up/down scalers are used when filtering is still
+// enabled, and the point-sampling scaler when it is not.
+static void ScaleYUVToARGB(const uint8_t* src_y, int src_stride_y,
+                           const uint8_t* src_u, int src_stride_u,
+                           const uint8_t* src_v, int src_stride_v,
+                           int src_width, int src_height,
+                           uint8_t* dst_argb, int dst_stride_argb,
+                           int dst_width, int dst_height,
+                           enum FilterMode filtering,
+                           uint32_t src_fourcc,
+                           YUVColorSpace yuv_color_space)
+{
+  // Initial source x/y coordinate and step values as 16.16 fixed point.
+  int x = 0;
+  int y = 0;
+  int dx = 0;
+  int dy = 0;
+  // ARGB does not support box filter yet, but allow the user to pass it.
+  // Simplify filtering when possible.
+  filtering = ScaleFilterReduce(src_width, src_height,
+                                dst_width, dst_height,
+                                filtering);
+  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
+             &x, &y, &dx, &dy);
+
+  // Special case for integer step values.
+  const bool steps_are_integers = ((dx | dy) & 0xffff) == 0;
+  if (steps_are_integers) {
+    if (dx == 0 || dy == 0) {
+      // 1 pixel wide and/or tall.
+      filtering = kFilterNone;
+    } else {
+      const bool dx_is_odd = (dx & 0x10000) != 0;
+      const bool dy_is_odd = (dy & 0x10000) != 0;
+      if (!dx_is_odd && !dy_is_odd) {
+        // Optimized even scale down. ie 2, 4, 6, 8, 10x.
+        if (dx == 0x20000) {
+          // Optimized 1/2 downsample.
+          ScaleYUVToARGBDown2(src_width, src_height,
+                              dst_width, dst_height,
+                              src_stride_y,
+                              src_stride_u,
+                              src_stride_v,
+                              dst_stride_argb,
+                              src_y,
+                              src_u,
+                              src_v,
+                              dst_argb,
+                              x, dx, y, dy,
+                              filtering,
+                              src_fourcc,
+                              yuv_color_space);
+        } else {
+          ScaleYUVToARGBDownEven(src_width, src_height,
+                                 dst_width, dst_height,
+                                 src_stride_y,
+                                 src_stride_u,
+                                 src_stride_v,
+                                 dst_stride_argb,
+                                 src_y,
+                                 src_u,
+                                 src_v,
+                                 dst_argb,
+                                 x, dx, y, dy,
+                                 filtering,
+                                 src_fourcc,
+                                 yuv_color_space);
+        }
+        return;
+      }
+      if (dx_is_odd && dy_is_odd) {
+        // Optimized odd scale down. ie 3, 5, 7, 9x.
+        filtering = kFilterNone;
+        if (dx == 0x10000 && dy == 0x10000) {
+          // Straight conversion and copy.
+          YUVToARGBCopy(src_y, src_stride_y,
+                        src_u, src_stride_u,
+                        src_v, src_stride_v,
+                        src_width, src_height,
+                        dst_argb, dst_stride_argb,
+                        dst_width, dst_height,
+                        src_fourcc,
+                        yuv_color_space);
+          return;
+        }
+      }
+    }
+  }
+
+  if (!filtering) {
+    // No filtering left: plain point sampling.
+    ScaleYUVToARGBSimple(src_width, src_height,
+                         dst_width, dst_height,
+                         src_stride_y,
+                         src_stride_u,
+                         src_stride_v,
+                         dst_stride_argb,
+                         src_y,
+                         src_u,
+                         src_v,
+                         dst_argb,
+                         x, dx, y, dy,
+                         src_fourcc,
+                         yuv_color_space);
+    return;
+  }
+
+  if (dy < 65536) {
+    // Vertical step below one source row per output row: scaling up.
+    ScaleYUVToARGBBilinearUp(src_width, src_height,
+                             dst_width, dst_height,
+                             src_stride_y,
+                             src_stride_u,
+                             src_stride_v,
+                             dst_stride_argb,
+                             src_y,
+                             src_u,
+                             src_v,
+                             dst_argb,
+                             x, dx, y, dy,
+                             filtering,
+                             src_fourcc,
+                             yuv_color_space);
+  } else {
+    ScaleYUVToARGBBilinearDown(src_width, src_height,
+                               dst_width, dst_height,
+                               src_stride_y,
+                               src_stride_u,
+                               src_stride_v,
+                               dst_stride_argb,
+                               src_y,
+                               src_u,
+                               src_v,
+                               dst_argb,
+                               x, dx, y, dy,
+                               filtering,
+                               src_fourcc,
+                               yuv_color_space);
+  }
+}
+
+// Returns true when src_fourcc is one of the planar formats this scaler can
+// convert: I444, I422 or I420.
+bool IsConvertSupported(uint32_t src_fourcc)
+{
+  return src_fourcc == FOURCC_I444 ||
+         src_fourcc == FOURCC_I422 ||
+         src_fourcc == FOURCC_I420;
+}
+
+// Converts a planar YUV image to ARGB while scaling it from
+// src_width x src_height to dst_width x dst_height.
+//
+// Returns 0 on success. Returns -1 without touching dst_argb when a plane
+// pointer is null, when any source or destination dimension is not positive,
+// or when src_fourcc is not accepted by IsConvertSupported.
+LIBYUV_API
+int YUVToARGBScale(const uint8_t* src_y, int src_stride_y,
+                   const uint8_t* src_u, int src_stride_u,
+                   const uint8_t* src_v, int src_stride_v,
+                   uint32_t src_fourcc,
+                   YUVColorSpace yuv_color_space,
+                   int src_width, int src_height,
+                   uint8_t* dst_argb, int dst_stride_argb,
+                   int dst_width, int dst_height,
+                   enum FilterMode filtering)
+{
+  // Negative source dimensions are rejected as well as zero: the scalers
+  // size their scratch rows from src_width and clamp rows against
+  // src_height - 1, neither of which is meaningful for negative values.
+  if (!src_y || !src_u || !src_v ||
+      src_width <= 0 || src_height <= 0 ||
+      !dst_argb || dst_width <= 0 || dst_height <= 0) {
+    return -1;
+  }
+  if (!IsConvertSupported(src_fourcc)) {
+    return -1;
+  }
+  ScaleYUVToARGB(src_y, src_stride_y,
+                 src_u, src_stride_u,
+                 src_v, src_stride_v,
+                 src_width, src_height,
+                 dst_argb, dst_stride_argb,
+                 dst_width, dst_height,
+                 filtering,
+                 src_fourcc,
+                 yuv_color_space);
+  return 0;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/gfx/ycbcr/scale_yuv_argb.h b/gfx/ycbcr/scale_yuv_argb.h
new file mode 100644
index 0000000000..1c4b2d16a2
--- /dev/null
+++ b/gfx/ycbcr/scale_yuv_argb.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_SCALE_YUV_ARGB_H_ // NOLINT
+#define INCLUDE_LIBYUV_SCALE_YUV_ARGB_H_
+
+#include "libyuv/basic_types.h"
+#include "libyuv/scale.h" // For FilterMode
+
+#include "mozilla/gfx/Types.h" // For YUVColorSpace
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+/* Convert a planar YUV image to ARGB and scale it to dst_width x dst_height
+ * in one pass. Returns 0 on success, or -1 if the arguments are invalid or
+ * src_fourcc is not a supported format. */
+int YUVToARGBScale(const uint8_t* src_y, int src_stride_y,
+ const uint8_t* src_u, int src_stride_u,
+ const uint8_t* src_v, int src_stride_v,
+ uint32_t src_fourcc, // Source pixel format (FOURCC code).
+ mozilla::gfx::YUVColorSpace yuv_color_space,
+ int src_width, int src_height,
+ uint8_t* dst_argb, int dst_stride_argb,
+ int dst_width, int dst_height,
+ enum FilterMode filtering);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_SCALE_YUV_ARGB_H_ NOLINT
diff --git a/gfx/ycbcr/ycbcr_to_rgb565.cpp b/gfx/ycbcr/ycbcr_to_rgb565.cpp
new file mode 100644
index 0000000000..fe28245a9c
--- /dev/null
+++ b/gfx/ycbcr/ycbcr_to_rgb565.cpp
@@ -0,0 +1,672 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include <stdlib.h>
+#include <limits.h>
+#include "nsDebug.h"
+#include "ycbcr_to_rgb565.h"
+#include "nsAlgorithm.h"
+
+
+
+#ifdef HAVE_YCBCR_TO_RGB565
+
+namespace mozilla {
+
+namespace gfx {
+
+/*This contains all of the parameters that are needed to convert a row.
+ Passing them in a struct instead of as individual parameters saves the need
+ to continually push onto the stack the ones that are fixed for every row.*/
+struct yuv2rgb565_row_scale_bilinear_ctx{
+ uint16_t *rgb_row; /*Destination row; one RGB565 pixel is written per x.*/
+ const uint8_t *y_row; /*Source luma row.*/
+ const uint8_t *u_row; /*Source Cb row.*/
+ const uint8_t *v_row; /*Source Cr row.*/
+ int y_yweight; /*Vertical blend weight passed to bislerp() for luma.*/
+ int y_pitch; /*Offset from y_row to the second luma row that is blended in.*/
+ int width; /*Number of destination pixels to produce.*/
+ int source_x0_q16; /*Source x position of the first pixel, 16.16 fixed point.*/
+ int source_dx_q16; /*Source x step per destination pixel, 16.16 fixed point.*/
+ /*Not used for 4:4:4, except with chroma-nearest.*/
+ int source_uv_xoffs_q16; /*Added to the luma x position before the chroma lookup.*/
+ /*Not used for 4:4:4 or chroma-nearest.*/
+ int uv_pitch; /*Offset from u_row/v_row to the second chroma row that is blended in.*/
+ /*Not used for 4:2:2, 4:4:4, or chroma-nearest.*/
+ int uv_yweight; /*Vertical blend weight passed to bislerp() for chroma.*/
+};
+
+
+
+/*This contains all of the parameters that are needed to convert a row.
+ Passing them in a struct instead of as individual parameters saves the need
+ to continually push onto the stack the ones that are fixed for every row.*/
+struct yuv2rgb565_row_scale_nearest_ctx{
+ uint16_t *rgb_row; /*Destination row; one RGB565 pixel is written per x.*/
+ const uint8_t *y_row; /*Source luma row.*/
+ const uint8_t *u_row; /*Source Cb row.*/
+ const uint8_t *v_row; /*Source Cr row.*/
+ int width; /*Number of destination pixels to produce.*/
+ int source_x0_q16; /*Source x position of the first pixel, 16.16 fixed point.*/
+ int source_dx_q16; /*Source x step per destination pixel, 16.16 fixed point.*/
+ /*Not used for 4:4:4.*/
+ int source_uv_xoffs_q16; /*Added to the luma x position before the chroma lookup.*/
+};
+
+
+
+typedef void (*yuv2rgb565_row_scale_bilinear_func)(
+ const yuv2rgb565_row_scale_bilinear_ctx *ctx, int dither); /*Signature of the bilinear row converters.*/
+
+typedef void (*yuv2rgb565_row_scale_nearest_func)(
+ const yuv2rgb565_row_scale_nearest_ctx *ctx, int dither); /*Signature of the nearest-neighbour row converters.*/
+
+
+
+//TODO: fix NEON asm for iOS
+# if defined(MOZILLA_MAY_SUPPORT_NEON) && !defined(__APPLE__)
+
+extern "C" void ScaleYCbCr42xToRGB565_BilinearY_Row_NEON(
+ const yuv2rgb565_row_scale_bilinear_ctx *ctx, int dither); /*Defined elsewhere; has the yuv2rgb565_row_scale_bilinear_func signature.*/
+
+void __attribute((noinline)) yuv42x_to_rgb565_row_neon(uint16_t *dst,
+ const uint8_t *y,
+ const uint8_t *u,
+ const uint8_t *v,
+ int n, /*NOTE(review): presumably the pixel count; confirm against the definition.*/
+ int oddflag); /*NOTE(review): meaning not visible here; confirm against the definition.*/
+
+#endif
+
+
+
+/*Bilinearly interpolates one sample from the 2x2 neighbourhood whose top-left
+   corner is row[source_x]; the second row lies pitch bytes further on.
+  xweight and yweight are 8-bit fractions (0..256) towards the right column
+   and the second row respectively.
+  The formulas deliberately match the asm step for step, including the
+   intermediate shifts that only cost accuracy.*/
+static int bislerp(const uint8_t *row,
+ int pitch,
+ int source_x,
+ int xweight,
+ int yweight) {
+  const int top_left = row[source_x];
+  const int top_right = row[source_x+1];
+  const int bottom_left = row[source_x+pitch];
+  const int bottom_right = row[source_x+pitch+1];
+  /*Blend each column vertically first, rounding to 8 bits...*/
+  const int left = ((top_left<<8)+(bottom_left-top_left)*yweight+128)>>8;
+  const int right = ((top_right<<8)+(bottom_right-top_right)*yweight+128)>>8;
+  /*...then blend the two column results horizontally.*/
+  return ((left<<8)+(right-left)*xweight+128)>>8;
+}
+
+/*Converts one Y'CbCr pixel to a packed RGB565 value (5 bits red, 6 bits
+   green, 5 bits blue).
+  dither (0..3) selects the rounding offset that is folded into the constant
+   term of each channel.
+  The coefficients are kept identical to the asm, even though 32-bit
+   registers would allow far more accurate constants.*/
+static uint16_t yu2rgb565(int y, int u, int v, int dither) {
+  /*Per-channel constant offset of the Y'CbCr conversion combined with the
+     dither-dependent rounding offset; columns are red, green, blue.*/
+  static const int DITHER_BIAS[4][3]={
+    {-14240,    8704,    -17696},
+    {-14240+128,8704+64, -17696+128},
+    {-14240+256,8704+128,-17696+256},
+    {-14240+384,8704+192,-17696+384}
+  };
+  const int *bias = DITHER_BIAS[dither];
+  const int red = clamped((74*y+102*v+bias[0])>>9, 0, 31);
+  const int green = clamped((74*y-25*u-52*v+bias[1])>>8, 0, 63);
+  const int blue = clamped((74*y+129*u+bias[2])>>9, 0, 31);
+  return (uint16_t)(red<<11 | green<<5 | blue);
+}
+
+/*Converts one destination row with bilinear filtering of both luma and
+   chroma, where chroma uses its own pitch and vertical weight.
+  The dither index alternates (dither ^= 3) after every pixel.*/
+static void ScaleYCbCr420ToRGB565_Bilinear_Row_C(
+ const yuv2rgb565_row_scale_bilinear_ctx *ctx, int dither){
+  /*Luma source position, 16.16 fixed point.*/
+  int pos_q16 = ctx->source_x0_q16;
+  for (int i = 0; i < ctx->width; i++, dither ^= 3) {
+    const int y = bislerp(ctx->y_row, ctx->y_pitch, pos_q16>>16,
+     ((pos_q16&0xFFFF)+128)>>8, ctx->y_yweight);
+    /*The offset position is split at bit 17 for chroma: integer index above,
+       17-bit fraction (reduced to 8 bits) below.*/
+    const int uv_pos = pos_q16+ctx->source_uv_xoffs_q16;
+    const int uv_xweight = ((uv_pos&0x1FFFF)+256)>>9;
+    const int uv_x = uv_pos>>17;
+    pos_q16 += ctx->source_dx_q16;
+    const int u = bislerp(ctx->u_row, ctx->uv_pitch, uv_x, uv_xweight,
+     ctx->uv_yweight);
+    const int v = bislerp(ctx->v_row, ctx->uv_pitch, uv_x, uv_xweight,
+     ctx->uv_yweight);
+    ctx->rgb_row[i] = yu2rgb565(y, u, v, dither);
+  }
+}
+
+/*Converts one destination row with bilinear filtering of both luma and
+   chroma; chroma uses uv_pitch but shares the luma vertical weight.
+  The dither index alternates (dither ^= 3) after every pixel.*/
+static void ScaleYCbCr422ToRGB565_Bilinear_Row_C(
+ const yuv2rgb565_row_scale_bilinear_ctx *ctx, int dither){
+  /*Luma source position, 16.16 fixed point.*/
+  int pos_q16 = ctx->source_x0_q16;
+  for (int i = 0; i < ctx->width; i++, dither ^= 3) {
+    const int y = bislerp(ctx->y_row, ctx->y_pitch, pos_q16>>16,
+     ((pos_q16&0xFFFF)+128)>>8, ctx->y_yweight);
+    /*The offset position is split at bit 17 for chroma: integer index above,
+       17-bit fraction (reduced to 8 bits) below.*/
+    const int uv_pos = pos_q16+ctx->source_uv_xoffs_q16;
+    const int uv_xweight = ((uv_pos&0x1FFFF)+256)>>9;
+    const int uv_x = uv_pos>>17;
+    pos_q16 += ctx->source_dx_q16;
+    const int u = bislerp(ctx->u_row, ctx->uv_pitch, uv_x, uv_xweight,
+     ctx->y_yweight);
+    const int v = bislerp(ctx->v_row, ctx->uv_pitch, uv_x, uv_xweight,
+     ctx->y_yweight);
+    ctx->rgb_row[i] = yu2rgb565(y, u, v, dither);
+  }
+}
+
+/*Converts one destination row with bilinear filtering where all three planes
+   are sampled at the same position with the same pitch and weights.
+  The dither index alternates (dither ^= 3) after every pixel.*/
+static void ScaleYCbCr444ToRGB565_Bilinear_Row_C(
+ const yuv2rgb565_row_scale_bilinear_ctx *ctx, int dither){
+  /*Source position shared by all planes, 16.16 fixed point.*/
+  int pos_q16 = ctx->source_x0_q16;
+  for (int i = 0; i < ctx->width; i++, dither ^= 3) {
+    const int xweight = ((pos_q16&0xFFFF)+128)>>8;
+    const int sx = pos_q16>>16;
+    pos_q16 += ctx->source_dx_q16;
+    const int y = bislerp(ctx->y_row, ctx->y_pitch, sx, xweight,
+     ctx->y_yweight);
+    const int u = bislerp(ctx->u_row, ctx->y_pitch, sx, xweight,
+     ctx->y_yweight);
+    const int v = bislerp(ctx->v_row, ctx->y_pitch, sx, xweight,
+     ctx->y_yweight);
+    ctx->rgb_row[i] = yu2rgb565(y, u, v, dither);
+  }
+}
+
+/*Converts one destination row with bilinear filtering of luma only; chroma
+   is read directly from the sample at the offset position shifted by 17.
+  The dither index alternates (dither ^= 3) after every pixel.*/
+static void ScaleYCbCr42xToRGB565_BilinearY_Row_C(
+ const yuv2rgb565_row_scale_bilinear_ctx *ctx, int dither){
+  /*Luma source position, 16.16 fixed point.*/
+  int pos_q16 = ctx->source_x0_q16;
+  for (int i = 0; i < ctx->width; i++, dither ^= 3) {
+    const int y = bislerp(ctx->y_row, ctx->y_pitch, pos_q16>>16,
+     ((pos_q16&0xFFFF)+128)>>8, ctx->y_yweight);
+    const int uv_x = (pos_q16+ctx->source_uv_xoffs_q16)>>17;
+    pos_q16 += ctx->source_dx_q16;
+    const int u = ctx->u_row[uv_x];
+    const int v = ctx->v_row[uv_x];
+    ctx->rgb_row[i] = yu2rgb565(y, u, v, dither);
+  }
+}
+
+/*Converts one destination row with bilinear filtering of luma only; chroma
+   is read directly from the sample at the offset position shifted by 16.
+  The dither index alternates (dither ^= 3) after every pixel.*/
+static void ScaleYCbCr444ToRGB565_BilinearY_Row_C(
+ const yuv2rgb565_row_scale_bilinear_ctx *ctx, int dither){
+  /*Luma source position, 16.16 fixed point.*/
+  int pos_q16 = ctx->source_x0_q16;
+  for (int i = 0; i < ctx->width; i++, dither ^= 3) {
+    const int y = bislerp(ctx->y_row, ctx->y_pitch, pos_q16>>16,
+     ((pos_q16&0xFFFF)+128)>>8, ctx->y_yweight);
+    const int uv_x = (pos_q16+ctx->source_uv_xoffs_q16)>>16;
+    pos_q16 += ctx->source_dx_q16;
+    const int u = ctx->u_row[uv_x];
+    const int v = ctx->v_row[uv_x];
+    ctx->rgb_row[i] = yu2rgb565(y, u, v, dither);
+  }
+}
+
+/*Converts one destination row by point sampling: luma at the integer part of
+   the 16.16 position, chroma at the offset position shifted by 17.
+  The dither index alternates (dither ^= 3) after every pixel.*/
+static void ScaleYCbCr42xToRGB565_Nearest_Row_C(
+ const yuv2rgb565_row_scale_nearest_ctx *ctx, int dither){
+  int pos_q16 = ctx->source_x0_q16;
+  for (int i = 0; i < ctx->width; i++, dither ^= 3) {
+    const int y = ctx->y_row[pos_q16>>16];
+    const int uv_x = (pos_q16+ctx->source_uv_xoffs_q16)>>17;
+    pos_q16 += ctx->source_dx_q16;
+    const int u = ctx->u_row[uv_x];
+    const int v = ctx->v_row[uv_x];
+    ctx->rgb_row[i] = yu2rgb565(y, u, v, dither);
+  }
+}
+
+/*Converts one destination row by point sampling all three planes at the
+   integer part of the same 16.16 source position.
+  The dither index alternates (dither ^= 3) after every pixel.*/
+static void ScaleYCbCr444ToRGB565_Nearest_Row_C(
+ const yuv2rgb565_row_scale_nearest_ctx *ctx, int dither){
+  int pos_q16 = ctx->source_x0_q16;
+  for (int i = 0; i < ctx->width; i++, dither ^= 3) {
+    const int sx = pos_q16>>16;
+    pos_q16 += ctx->source_dx_q16;
+    const int y = ctx->y_row[sx];
+    const int u = ctx->u_row[sx];
+    const int v = ctx->v_row[sx];
+    ctx->rgb_row[i] = yu2rgb565(y, u, v, dither);
+  }
+}
+
+/*Scales a cropped Y'CbCr source rectangle to an RGB565 destination.
+  y_buf, u_buf, v_buf: The source planes; luma rows are y_pitch bytes apart
+   and chroma rows are uv_pitch bytes apart.
+  rgb_buf: The destination; rows are rgb_pitch bytes apart and hold 16-bit
+   pixels.
+  source_x0, source_y0, source_width, source_height: The source rectangle, in
+   luma pixels.
+  width, height: The destination size; nothing is done unless both are
+   positive.
+  yuv_type: Selects the chroma subsampling.
+  filter: FILTER_NONE selects nearest-neighbor scaling; any other value
+   selects one of the bilinear row functions.
+  The starting dither phase is taken from rand(), so the output is not
+   bit-for-bit reproducible between calls.*/
+void ScaleYCbCrToRGB565(const uint8_t *y_buf,
+ const uint8_t *u_buf,
+ const uint8_t *v_buf,
+ uint8_t *rgb_buf,
+ int source_x0,
+ int source_y0,
+ int source_width,
+ int source_height,
+ int width,
+ int height,
+ int y_pitch,
+ int uv_pitch,
+ int rgb_pitch,
+ YUVType yuv_type,
+ ScaleFilter filter) {
+ int source_x0_q16;
+ int source_y0_q16;
+ int source_dx_q16;
+ int source_dy_q16;
+ int source_uv_xoffs_q16;
+ int source_uv_yoffs_q16;
+ int x_shift;
+ int y_shift;
+ int ymin;
+ int ymax;
+ int uvmin;
+ int uvmax;
+ int dither;
+ /*We don't support negative destination rectangles (just flip the source
+ instead), and for empty ones there's nothing to do.*/
+ if (width <= 0 || height <= 0)
+ return;
+ /*These bounds are required to avoid 16.16 fixed-point overflow.*/
+ NS_ASSERTION(source_x0 > (INT_MIN>>16) && source_x0 < (INT_MAX>>16),
+ "ScaleYCbCrToRGB565 source X offset out of bounds.");
+ NS_ASSERTION(source_x0+source_width > (INT_MIN>>16)
+ && source_x0+source_width < (INT_MAX>>16),
+ "ScaleYCbCrToRGB565 source width out of bounds.");
+ NS_ASSERTION(source_y0 > (INT_MIN>>16) && source_y0 < (INT_MAX>>16),
+ "ScaleYCbCrToRGB565 source Y offset out of bounds.");
+ NS_ASSERTION(source_y0+source_height > (INT_MIN>>16)
+ && source_y0+source_height < (INT_MAX>>16),
+ "ScaleYCbCrToRGB565 source height out of bounds.");
+ /*We require the same stride for Y' and Cb and Cr for 4:4:4 content.*/
+ NS_ASSERTION(yuv_type != YV24 || y_pitch == uv_pitch,
+ "ScaleYCbCrToRGB565 luma stride differs from chroma for 4:4:4 content.");
+ /*We assume we can read outside the bounds of the input, because it makes
+ the code much simpler (and in practice is true: both Theora and VP8 return
+ padded reference frames).
+ In practice, we do not even _have_ the actual bounds of the source, as
+ we are passed a crop rectangle from it, and not the dimensions of the full
+ image.
+ This assertion will not guarantee our out-of-bounds reads are safe, but it
+ should at least catch the simple case of passing in an unpadded buffer.*/
+ NS_ASSERTION(abs(y_pitch) >= abs(source_width)+16,
+ "ScaleYCbCrToRGB565 source image unpadded?");
+ /*The NEON code requires the pointers to be aligned to a 16-byte boundary at
+ the start of each row.
+ This should be true for all of our sources.
+ We could try to fix this up if it's not true by adjusting source_x0, but
+ that would require the mis-alignment to be the same for the U and V
+ planes.*/
+ NS_ASSERTION((y_pitch&15) == 0 && (uv_pitch&15) == 0 &&
+ ((y_buf-(uint8_t *)nullptr)&15) == 0 &&
+ ((u_buf-(uint8_t *)nullptr)&15) == 0 &&
+ ((v_buf-(uint8_t *)nullptr)&15) == 0,
+ "ScaleYCbCrToRGB565 source image unaligned");
+ /*We take an area-based approach to pixel coverage to avoid shifting by small
+ amounts (or not so small, when up-scaling or down-scaling by a large
+ factor).
+
+ An illustrative example: scaling 4:2:0 up by 2, using JPEG chroma cositing^.
+
+ + = RGB destination locations
+ * = Y' source locations
+ - = Cb, Cr source locations
+
+ + + + + + + + +
+ * * * *
+ + + + + + + + +
+ - -
+ + + + + + + + +
+ * * * *
+ + + + + + + + +
+
+ + + + + + + + +
+ * * * *
+ + + + + + + + +
+ - -
+ + + + + + + + +
+ * * * *
+ + + + + + + + +
+
+ So, the coordinates of the upper-left + (first destination site) should
+ be (-0.25,-0.25) in the source Y' coordinate system.
+ Similarly, the coordinates should be (-0.375,-0.375) in the source Cb, Cr
+ coordinate system.
+ Note that the origin and scale of these two coordinate systems is not the
+ same!
+
+ ^JPEG cositing is required for Theora; VP8 doesn't specify cositing rules,
+ but nearly all software converters in existence (at least those that are
+ open source, and many that are not) use JPEG cositing instead of MPEG.*/
+ source_dx_q16 = (source_width<<16) / width;
+ source_x0_q16 = (source_x0<<16)+(source_dx_q16>>1)-0x8000;
+ source_dy_q16 = (source_height<<16) / height;
+ source_y0_q16 = (source_y0<<16)+(source_dy_q16>>1)-0x8000;
+ x_shift = (yuv_type != YV24);
+ y_shift = (yuv_type == YV12);
+ /*These two variables hold the difference between the origins of the Y' and
+ the Cb, Cr coordinate systems, using the scale of the Y' coordinate
+ system.*/
+ source_uv_xoffs_q16 = -(x_shift<<15);
+ source_uv_yoffs_q16 = -(y_shift<<15);
+ /*Compute the range of source rows we'll actually use.
+ This doesn't guarantee we won't read outside this range.*/
+ ymin = source_height >= 0 ? source_y0 : source_y0+source_height-1;
+ ymax = source_height >= 0 ? source_y0+source_height-1 : source_y0;
+ uvmin = ymin>>y_shift;
+ uvmax = ((ymax+1+y_shift)>>y_shift)-1;
+ /*Pick a dithering pattern.
+ The "&3" at the end is just in case RAND_MAX is lying.*/
+ dither = (rand()/(RAND_MAX>>2))&3;
+ /*Nearest-neighbor scaling.*/
+ if (filter == FILTER_NONE) {
+ yuv2rgb565_row_scale_nearest_ctx ctx;
+ yuv2rgb565_row_scale_nearest_func scale_row;
+ int y;
+ /*Add rounding offsets once, in advance.*/
+ source_x0_q16 += 0x8000;
+ source_y0_q16 += 0x8000;
+ source_uv_xoffs_q16 += (x_shift<<15);
+ source_uv_yoffs_q16 += (y_shift<<15);
+ /*The row function only deals with horizontal subsampling (the vertical
+ chroma row is chosen below), so every type with x_shift set, i.e. both
+ 4:2:0 and 4:2:2, needs the 42x variant.
+ Using the 4:4:4 variant for 4:2:2 would index the half-width chroma rows
+ with full-resolution luma columns.*/
+ if (yuv_type != YV24)
+ scale_row = ScaleYCbCr42xToRGB565_Nearest_Row_C;
+ else
+ scale_row = ScaleYCbCr444ToRGB565_Nearest_Row_C;
+ ctx.width = width;
+ ctx.source_x0_q16 = source_x0_q16;
+ ctx.source_dx_q16 = source_dx_q16;
+ ctx.source_uv_xoffs_q16 = source_uv_xoffs_q16;
+ for (y=0; y<height; y++) {
+ int source_y;
+ ctx.rgb_row = (uint16_t *)(rgb_buf + y*rgb_pitch);
+ source_y = source_y0_q16>>16;
+ source_y = clamped(source_y, ymin, ymax);
+ ctx.y_row = y_buf + source_y*y_pitch;
+ source_y = (source_y0_q16+source_uv_yoffs_q16)>>(16+y_shift);
+ source_y = clamped(source_y, uvmin, uvmax);
+ source_y0_q16 += source_dy_q16;
+ ctx.u_row = u_buf + source_y*uv_pitch;
+ ctx.v_row = v_buf + source_y*uv_pitch;
+ (*scale_row)(&ctx, dither);
+ /*Alternate the dither phase between rows.*/
+ dither ^= 2;
+ }
+ }
+ /*Bilinear scaling.*/
+ else {
+ yuv2rgb565_row_scale_bilinear_ctx ctx;
+ yuv2rgb565_row_scale_bilinear_func scale_row;
+ int uvxscale_min;
+ int uvxscale_max;
+ int uvyscale_min;
+ int uvyscale_max;
+ int y;
+ /*Check how close the chroma scaling is to unity.
+ If it's close enough, we can get away with nearest-neighbor chroma
+ sub-sampling, and only doing bilinear on luma.
+ If a given axis is subsampled, we use bounds on the luma step of
+ [0.67...2], which is equivalent to scaling chroma by [1...3].
+ If it's not subsampled, we use bounds of [0.5...1.33], which is
+ equivalent to scaling chroma by [0.75...2].
+ The lower bound is chosen as a trade-off between speed and how terrible
+ nearest neighbor looks when upscaling.*/
+# define CHROMA_NEAREST_SUBSAMP_STEP_MIN 0xAAAA
+# define CHROMA_NEAREST_NORMAL_STEP_MIN 0x8000
+# define CHROMA_NEAREST_SUBSAMP_STEP_MAX 0x20000
+# define CHROMA_NEAREST_NORMAL_STEP_MAX 0x15555
+ uvxscale_min = yuv_type != YV24 ?
+ CHROMA_NEAREST_SUBSAMP_STEP_MIN : CHROMA_NEAREST_NORMAL_STEP_MIN;
+ uvxscale_max = yuv_type != YV24 ?
+ CHROMA_NEAREST_SUBSAMP_STEP_MAX : CHROMA_NEAREST_NORMAL_STEP_MAX;
+ uvyscale_min = yuv_type == YV12 ?
+ CHROMA_NEAREST_SUBSAMP_STEP_MIN : CHROMA_NEAREST_NORMAL_STEP_MIN;
+ uvyscale_max = yuv_type == YV12 ?
+ CHROMA_NEAREST_SUBSAMP_STEP_MAX : CHROMA_NEAREST_NORMAL_STEP_MAX;
+ if (uvxscale_min <= abs(source_dx_q16)
+ && abs(source_dx_q16) <= uvxscale_max
+ && uvyscale_min <= abs(source_dy_q16)
+ && abs(source_dy_q16) <= uvyscale_max) {
+ /*Add the rounding offsets now.*/
+ source_uv_xoffs_q16 += 1<<(15+x_shift);
+ source_uv_yoffs_q16 += 1<<(15+y_shift);
+ if (yuv_type != YV24) {
+ scale_row =
+//TODO: fix NEON asm for iOS
+# if defined(MOZILLA_MAY_SUPPORT_NEON) && !defined(__APPLE__)
+ supports_neon() ? ScaleYCbCr42xToRGB565_BilinearY_Row_NEON :
+# endif
+ ScaleYCbCr42xToRGB565_BilinearY_Row_C;
+ }
+ else
+ scale_row = ScaleYCbCr444ToRGB565_BilinearY_Row_C;
+ }
+ else {
+ if (yuv_type == YV12)
+ scale_row = ScaleYCbCr420ToRGB565_Bilinear_Row_C;
+ else if (yuv_type == YV16)
+ scale_row = ScaleYCbCr422ToRGB565_Bilinear_Row_C;
+ else
+ scale_row = ScaleYCbCr444ToRGB565_Bilinear_Row_C;
+ }
+ ctx.width = width;
+ ctx.y_pitch = y_pitch;
+ ctx.source_x0_q16 = source_x0_q16;
+ ctx.source_dx_q16 = source_dx_q16;
+ ctx.source_uv_xoffs_q16 = source_uv_xoffs_q16;
+ ctx.uv_pitch = uv_pitch;
+ for (y=0; y<height; y++) {
+ int source_y;
+ int yweight;
+ int uvweight;
+ ctx.rgb_row = (uint16_t *)(rgb_buf + y*rgb_pitch);
+ source_y = (source_y0_q16+128)>>16;
+ yweight = ((source_y0_q16+128)>>8)&0xFF;
+ /*Rows outside [ymin,ymax] are clamped, with the blend weight zeroed.*/
+ if (source_y < ymin) {
+ source_y = ymin;
+ yweight = 0;
+ }
+ if (source_y > ymax) {
+ source_y = ymax;
+ yweight = 0;
+ }
+ ctx.y_row = y_buf + source_y*y_pitch;
+ source_y = source_y0_q16+source_uv_yoffs_q16+(128<<y_shift);
+ source_y0_q16 += source_dy_q16;
+ uvweight = source_y>>(8+y_shift)&0xFF;
+ source_y >>= 16+y_shift;
+ if (source_y < uvmin) {
+ source_y = uvmin;
+ uvweight = 0;
+ }
+ if (source_y > uvmax) {
+ source_y = uvmax;
+ uvweight = 0;
+ }
+ ctx.u_row = u_buf + source_y*uv_pitch;
+ ctx.v_row = v_buf + source_y*uv_pitch;
+ ctx.y_yweight = yweight;
+ ctx.uv_yweight = uvweight;
+ (*scale_row)(&ctx, dither);
+ dither ^= 2;
+ }
+ }
+}
+
+/*Reports whether ScaleYCbCrToRGB565() would take an accelerated path for the
+   given parameters.
+  An empty or negative destination is reported as fast because the scaler
+   returns immediately for it.
+  Otherwise the answer is true only for filtered scaling of horizontally
+   subsampled content whose step sizes fall inside the "nearest chroma" bounds,
+   and only when NEON is available at run time.
+  The preprocessor guard matches the one ScaleYCbCrToRGB565() uses to select
+   its NEON row function, so builds that compile that function out report
+   false.
+  source_x0 and source_y0 do not affect the result.*/
+bool IsScaleYCbCrToRGB565Fast(int source_x0,
+ int source_y0,
+ int source_width,
+ int source_height,
+ int width,
+ int height,
+ YUVType yuv_type,
+ ScaleFilter filter)
+{
+ // Very fast.
+ if (width <= 0 || height <= 0)
+ return true;
+//TODO: fix NEON asm for iOS
+# if defined(MOZILLA_MAY_SUPPORT_NEON) && !defined(__APPLE__)
+ if (filter != FILTER_NONE) {
+ int source_dx_q16;
+ int source_dy_q16;
+ int uvxscale_min;
+ int uvxscale_max;
+ int uvyscale_min;
+ int uvyscale_max;
+ source_dx_q16 = (source_width<<16) / width;
+ source_dy_q16 = (source_height<<16) / height;
+ uvxscale_min = yuv_type != YV24 ?
+ CHROMA_NEAREST_SUBSAMP_STEP_MIN : CHROMA_NEAREST_NORMAL_STEP_MIN;
+ uvxscale_max = yuv_type != YV24 ?
+ CHROMA_NEAREST_SUBSAMP_STEP_MAX : CHROMA_NEAREST_NORMAL_STEP_MAX;
+ uvyscale_min = yuv_type == YV12 ?
+ CHROMA_NEAREST_SUBSAMP_STEP_MIN : CHROMA_NEAREST_NORMAL_STEP_MIN;
+ uvyscale_max = yuv_type == YV12 ?
+ CHROMA_NEAREST_SUBSAMP_STEP_MAX : CHROMA_NEAREST_NORMAL_STEP_MAX;
+ if (uvxscale_min <= abs(source_dx_q16)
+ && abs(source_dx_q16) <= uvxscale_max
+ && uvyscale_min <= abs(source_dy_q16)
+ && abs(source_dy_q16) <= uvyscale_max) {
+ if (yuv_type != YV24)
+ return supports_neon();
+ }
+ }
+# endif
+ return false;
+}
+
+
+
+/*Converts one row of pic_width pixels to RGB565 in plain C.
+  y, u and v point at the start of the source rows; pic_x is the luma column
+   of the first pixel, and chroma columns are the luma column shifted right by
+   x_shift.
+  A fixed dither value of 2 is passed for every pixel.*/
+void yuv_to_rgb565_row_c(uint16_t *dst,
+ const uint8_t *y,
+ const uint8_t *u,
+ const uint8_t *v,
+ int x_shift,
+ int pic_x,
+ int pic_width)
+{
+  for (int i = 0; i < pic_width; i++) {
+    const int luma_col = pic_x+i;
+    const int chroma_col = luma_col>>x_shift;
+    // Disable dithering for now.
+    dst[i] = yu2rgb565(y[luma_col], u[chroma_col], v[chroma_col], 2);
+  }
+}
+
+/*Converts a pic_width x pic_height window of a Y'CbCr image to RGB565 without
+   scaling.
+  (pic_x, pic_y) is the top-left corner of the window in luma pixels.
+  Luma rows are y_pitch bytes apart, rows of both chroma planes are uv_pitch
+   bytes apart, and output rows are rgb_pitch bytes apart.
+  Output row i of rgb_buf receives source row pic_y+i.*/
+void ConvertYCbCrToRGB565(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* rgb_buf,
+ int pic_x,
+ int pic_y,
+ int pic_width,
+ int pic_height,
+ int y_pitch,
+ int uv_pitch,
+ int rgb_pitch,
+ YUVType yuv_type)
+{
+ int x_shift;
+ int y_shift;
+ /*Chroma columns are halved for everything but YV24; chroma rows are halved
+ only for YV12.*/
+ x_shift = yuv_type != YV24;
+ y_shift = yuv_type == YV12;
+//TODO: fix NEON asm for iOS
+# if defined(MOZILLA_MAY_SUPPORT_NEON) && !defined(__APPLE__)
+ if (yuv_type != YV24 && supports_neon())
+ {
+ for (int i = 0; i < pic_height; i++) {
+ int yoffs;
+ int uvoffs;
+ /*The NEON row function is handed pointers that already include the
+ column offset.*/
+ yoffs = y_pitch * (pic_y+i) + pic_x;
+ uvoffs = uv_pitch * ((pic_y+i)>>y_shift) + (pic_x>>x_shift);
+ /*x_shift is 1 on this path, so the last argument is the parity of
+ pic_x.*/
+ yuv42x_to_rgb565_row_neon((uint16_t*)(rgb_buf + rgb_pitch * i),
+ y_buf + yoffs,
+ u_buf + uvoffs,
+ v_buf + uvoffs,
+ pic_width,
+ pic_x&x_shift);
+ }
+ }
+ else
+# endif
+ {
+ for (int i = 0; i < pic_height; i++) {
+ int yoffs;
+ int uvoffs;
+ /*Only the row offset is applied here; yuv_to_rgb565_row_c() adds pic_x
+ itself.*/
+ yoffs = y_pitch * (pic_y+i);
+ uvoffs = uv_pitch * ((pic_y+i)>>y_shift);
+ yuv_to_rgb565_row_c((uint16_t*)(rgb_buf + rgb_pitch * i),
+ y_buf + yoffs,
+ u_buf + uvoffs,
+ v_buf + uvoffs,
+ x_shift,
+ pic_x,
+ pic_width);
+ }
+ }
+}
+
+/*Reports whether ConvertYCbCrToRGB565() would use its NEON row function.
+  That is the case for horizontally subsampled content when NEON is available
+   at run time; the position and size arguments do not affect the result.
+  The preprocessor guard matches the one in ConvertYCbCrToRGB565(), so builds
+   that compile the NEON path out report false.*/
+bool IsConvertYCbCrToRGB565Fast(int pic_x,
+ int pic_y,
+ int pic_width,
+ int pic_height,
+ YUVType yuv_type)
+{
+//TODO: fix NEON asm for iOS
+# if defined(MOZILLA_MAY_SUPPORT_NEON) && !defined(__APPLE__)
+ return (yuv_type != YV24 && supports_neon());
+# else
+ return false;
+# endif
+}
+
+} // namespace gfx
+
+} // namespace mozilla
+
+#endif // HAVE_YCBCR_TO_RGB565
diff --git a/gfx/ycbcr/ycbcr_to_rgb565.h b/gfx/ycbcr/ycbcr_to_rgb565.h
new file mode 100644
index 0000000000..7793962b5c
--- /dev/null
+++ b/gfx/ycbcr/ycbcr_to_rgb565.h
@@ -0,0 +1,72 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+#ifndef MEDIA_BASE_YCBCR_TO_RGB565_H_
+#define MEDIA_BASE_YCBCR_TO_RGB565_H_
+#include "yuv_convert.h"
+#include "mozilla/arm.h"
+
+// It's currently only worth including this if we have NEON support.
+#if defined(__arm__) && defined(MOZILLA_MAY_SUPPORT_NEON)
+#define HAVE_YCBCR_TO_RGB565 1
+#endif
+
+namespace mozilla {
+
+namespace gfx {
+
+#ifdef HAVE_YCBCR_TO_RGB565
+// Convert a frame of YUV to 16 bit RGB565.
+// (pic_x, pic_y) is the top-left corner, in luma pixels, of the
+// pic_width x pic_height window that is converted.  ystride, uvstride and
+// rgbstride are the byte distances between consecutive rows of the luma
+// plane, the chroma planes and the output respectively.
+void ConvertYCbCrToRGB565(const uint8_t* yplane,
+ const uint8_t* uplane,
+ const uint8_t* vplane,
+ uint8_t* rgbframe,
+ int pic_x,
+ int pic_y,
+ int pic_width,
+ int pic_height,
+ int ystride,
+ int uvstride,
+ int rgbstride,
+ YUVType yuv_type);
+
+// Used to test if we have an accelerated version.
+// Takes the same window and type arguments as ConvertYCbCrToRGB565().
+bool IsConvertYCbCrToRGB565Fast(int pic_x,
+ int pic_y,
+ int pic_width,
+ int pic_height,
+ YUVType yuv_type);
+
+// Scale a frame of YUV to 16 bit RGB565.
+// The source rectangle (source_x0, source_y0, source_width, source_height) is
+// given in luma pixels and is scaled to a width x height destination.
+// FILTER_NONE selects nearest-neighbor scaling; any other filter value
+// selects bilinear scaling.
+void ScaleYCbCrToRGB565(const uint8_t *yplane,
+ const uint8_t *uplane,
+ const uint8_t *vplane,
+ uint8_t *rgbframe,
+ int source_x0,
+ int source_y0,
+ int source_width,
+ int source_height,
+ int width,
+ int height,
+ int ystride,
+ int uvstride,
+ int rgbstride,
+ YUVType yuv_type,
+ ScaleFilter filter);
+
+// Used to test if we have an accelerated version.
+// Takes the same rectangle, size, type and filter arguments as
+// ScaleYCbCrToRGB565().
+bool IsScaleYCbCrToRGB565Fast(int source_x0,
+ int source_y0,
+ int source_width,
+ int source_height,
+ int width,
+ int height,
+ YUVType yuv_type,
+ ScaleFilter filter);
+#endif // HAVE_YCBCR_TO_RGB565
+
+} // namespace gfx
+
+} // namespace mozilla
+
+#endif // MEDIA_BASE_YCBCR_TO_RGB565_H_
diff --git a/gfx/ycbcr/yuv_convert.cpp b/gfx/ycbcr/yuv_convert.cpp
new file mode 100644
index 0000000000..f7d01a3ef8
--- /dev/null
+++ b/gfx/ycbcr/yuv_convert.cpp
@@ -0,0 +1,577 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// This webpage shows layout of YV12 and other YUV formats
+// http://www.fourcc.org/yuv.php
+// The actual conversion is best described here
+// http://en.wikipedia.org/wiki/YUV
+// An article on optimizing YUV conversion using tables instead of multiplies
+// http://lestourtereaux.free.fr/papers/data/yuvrgb.pdf
+//
+// YV12 is a full plane of Y and a half height, half width chroma planes
+// YV16 is a full plane of Y and a full height, half width chroma planes
+// YV24 is a full plane of Y and a full height, full width chroma planes
+// Y8 is a full plane of Y and no chroma planes (i.e., monochrome)
+//
+// ARGB pixel format is output, which on little endian is stored as BGRA.
+// The alpha is set to 255, allowing the application to use RGBA or RGB32.
+
+#include "yuv_convert.h"
+
+#include "mozilla/StaticPrefs_gfx.h"
+#include "libyuv.h"
+#include "scale_yuv_argb.h"
+// Header for low level row functions.
+#include "yuv_row.h"
+#include "mozilla/SSE.h"
+#include "mozilla/IntegerRange.h"
+
+namespace mozilla {
+
+namespace gfx {
+
+// 16.16 fixed point arithmetic
+const int kFractionBits = 16;
+const int kFractionMax = 1 << kFractionBits;
+const int kFractionMask = ((1 << kFractionBits) - 1);
+
+// clang-format off
+
+// Maps a YUVType onto the matching libyuv FourCC code.  Any value other than
+// YV24, YV16, YV12 or Y8 yields FOURCC_ANY.
+libyuv::FourCC FourCCFromYUVType(YUVType aYUVType) {
+  if (aYUVType == YV24) {
+    return libyuv::FOURCC_I444;
+  }
+  if (aYUVType == YV16) {
+    return libyuv::FOURCC_I422;
+  }
+  if (aYUVType == YV12) {
+    return libyuv::FOURCC_I420;
+  }
+  if (aYUVType == Y8) {
+    return libyuv::FOURCC_I400;
+  }
+  return libyuv::FOURCC_ANY;
+}
+
+// Interleaves three planes into 4-byte output pixels: byte 0 comes from
+// src_u, byte 1 from src_y, byte 2 from src_v, and byte 3 is set to 255.
+// Always returns 0.
+int GBRPlanarToARGB(const uint8_t* src_y, int y_pitch,
+                    const uint8_t* src_u, int u_pitch,
+                    const uint8_t* src_v, int v_pitch,
+                    uint8_t* rgb_buf, int rgb_pitch,
+                    int pic_width, int pic_height) {
+  // libyuv has no native conversion function for this
+  // fixme: replace with something less awful
+  for (const auto row : IntegerRange(pic_height)) {
+    const auto y_base = y_pitch * row;
+    const auto u_base = u_pitch * row;
+    const auto v_base = v_pitch * row;
+    const auto out_base = rgb_pitch * row;
+    for (const auto col : IntegerRange(pic_width)) {
+      const auto out = out_base + col * 4;
+      rgb_buf[out + 0] = src_u[u_base + col];
+      rgb_buf[out + 1] = src_y[y_base + col];
+      rgb_buf[out + 2] = src_v[v_base + col];
+      rgb_buf[out + 3] = 255;
+    }
+  }
+  return 0;
+}
+
+// Convert a frame of YUV to 32 bit ARGB.
+// (pic_x, pic_y) is the top-left corner, in luma pixels, of the
+// pic_width x pic_height window that is converted; the output is written
+// starting at rgb_buf.  y_pitch, uv_pitch and rgb_pitch are the byte
+// distances between rows of the luma plane, the chroma planes and the output.
+// Depending on the preference and CPU checks below, the work is done either
+// by ConvertYCbCrToRGB32_deprecated() or by libyuv.
+void ConvertYCbCrToRGB32(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf, int pic_x,
+ int pic_y, int pic_width, int pic_height, int y_pitch,
+ int uv_pitch, int rgb_pitch, YUVType yuv_type,
+ YUVColorSpace yuv_color_space,
+ ColorRange color_range) {
+ // Deprecated function's conversion is accurate.
+ // libyuv conversion is a bit inaccurate to get performance. It dynamically
+ // calculates RGB from YUV to use simd. In it, signed byte is used for
+ // conversion's coefficient, but it requests 129. libyuv cut 129 to 127. And
+ // only 6 bits are used for a decimal part during the dynamic calculation.
+ //
+ // The function is still fast on some old intel chips.
+ // See Bug 1256475.
+ bool use_deprecated = StaticPrefs::gfx_ycbcr_accurate_conversion() ||
+ (supports_mmx() && supports_sse() && !supports_sse3() &&
+ yuv_color_space == YUVColorSpace::BT601 &&
+ color_range == ColorRange::LIMITED);
+ // The deprecated function only support BT601.
+ // See Bug 1210357.
+ if (yuv_color_space != YUVColorSpace::BT601) {
+ use_deprecated = false;
+ }
+ if (use_deprecated) {
+ ConvertYCbCrToRGB32_deprecated(y_buf, u_buf, v_buf, rgb_buf, pic_x, pic_y,
+ pic_width, pic_height, y_pitch, uv_pitch,
+ rgb_pitch, yuv_type);
+ return;
+ }
+
+ decltype(libyuv::I420ToARGBMatrix)* fConvertYUVToARGB = nullptr;
+ const uint8_t* src_y = nullptr;
+ const uint8_t* src_u = nullptr;
+ const uint8_t* src_v = nullptr;
+ const libyuv::YuvConstants* yuv_constant = nullptr;
+
+ // Pick the libyuv coefficient table for the colour space and range.
+ switch (yuv_color_space) {
+ case YUVColorSpace::BT2020:
+ yuv_constant = color_range == ColorRange::LIMITED
+ ? &libyuv::kYuv2020Constants
+ : &libyuv::kYuvV2020Constants;
+ break;
+ case YUVColorSpace::BT709:
+ yuv_constant = color_range == ColorRange::LIMITED
+ ? &libyuv::kYuvH709Constants
+ : &libyuv::kYuvF709Constants;
+ break;
+ case YUVColorSpace::Identity:
+ MOZ_ASSERT(yuv_type == YV24, "Identity (aka RGB) with chroma subsampling is unsupported");
+ if (yuv_type == YV24) {
+ break;
+ }
+ [[fallthrough]]; // Assuming BT601 for unsupported input is better than crashing
+ default:
+ MOZ_FALLTHROUGH_ASSERT("Unsupported YUVColorSpace");
+ case YUVColorSpace::BT601:
+ yuv_constant = color_range == ColorRange::LIMITED
+ ? &libyuv::kYuvI601Constants
+ : &libyuv::kYuvJPEGConstants;
+ break;
+ }
+
+ // Offset the plane pointers to the top-left of the window and pick the
+ // libyuv converter for the subsampling.
+ switch (yuv_type) {
+ case YV24: {
+ src_y = y_buf + y_pitch * pic_y + pic_x;
+ src_u = u_buf + uv_pitch * pic_y + pic_x;
+ src_v = v_buf + uv_pitch * pic_y + pic_x;
+
+ if (yuv_color_space == YUVColorSpace::Identity) {
+ // Special case for RGB image
+ DebugOnly<int> err =
+ GBRPlanarToARGB(src_y, y_pitch, src_u, uv_pitch, src_v, uv_pitch,
+ rgb_buf, rgb_pitch, pic_width, pic_height);
+ MOZ_ASSERT(!err);
+ return;
+ }
+
+ fConvertYUVToARGB = libyuv::I444ToARGBMatrix;
+ break;
+ }
+ case YV16: {
+ src_y = y_buf + y_pitch * pic_y + pic_x;
+ src_u = u_buf + uv_pitch * pic_y + pic_x / 2;
+ src_v = v_buf + uv_pitch * pic_y + pic_x / 2;
+
+ fConvertYUVToARGB = libyuv::I422ToARGBMatrix;
+ break;
+ }
+ case YV12: {
+ src_y = y_buf + y_pitch * pic_y + pic_x;
+ // The chroma row is pic_y / 2 and the chroma column is pic_x / 2. The
+ // row must be halved before multiplying by the pitch: halving the
+ // combined byte offset lands in the middle of a chroma row whenever
+ // pic_y is odd.
+ src_u = u_buf + uv_pitch * (pic_y / 2) + pic_x / 2;
+ src_v = v_buf + uv_pitch * (pic_y / 2) + pic_x / 2;
+
+ fConvertYUVToARGB = libyuv::I420ToARGBMatrix;
+ break;
+ }
+ case Y8: {
+ src_y = y_buf + y_pitch * pic_y + pic_x;
+ MOZ_ASSERT(u_buf == nullptr);
+ MOZ_ASSERT(v_buf == nullptr);
+
+ if (color_range == ColorRange::LIMITED) {
+ DebugOnly<int> err =
+ libyuv::I400ToARGB(src_y, y_pitch, rgb_buf, rgb_pitch, pic_width,
+ pic_height);
+ MOZ_ASSERT(!err);
+ } else {
+ DebugOnly<int> err =
+ libyuv::J400ToARGB(src_y, y_pitch, rgb_buf, rgb_pitch, pic_width,
+ pic_height);
+ MOZ_ASSERT(!err);
+ }
+
+ return;
+ }
+ default:
+ MOZ_ASSERT_UNREACHABLE("Unsupported YUV type");
+ // No converter was selected; bail out instead of calling through the
+ // null function pointer below.
+ return;
+ }
+
+ DebugOnly<int> err =
+ fConvertYUVToARGB(src_y, y_pitch, src_u, uv_pitch, src_v, uv_pitch,
+ rgb_buf, rgb_pitch, yuv_constant, pic_width, pic_height);
+ MOZ_ASSERT(!err);
+}
+
+// Convert a frame of YUV to 32 bit ARGB.
+// Row-by-row converter built on the FastConvertYUVToRGB32Row* helpers.
+// (pic_x, pic_y) is the top-left corner of the pic_width x pic_height window
+// in luma pixels; output row 0 corresponds to source row pic_y.
+void ConvertYCbCrToRGB32_deprecated(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* rgb_buf,
+ int pic_x,
+ int pic_y,
+ int pic_width,
+ int pic_height,
+ int y_pitch,
+ int uv_pitch,
+ int rgb_pitch,
+ YUVType yuv_type) {
+ // Chroma rows are halved only for YV12; chroma columns are halved for
+ // everything except YV24.
+ unsigned int y_shift = yuv_type == YV12 ? 1 : 0;
+ unsigned int x_shift = yuv_type == YV24 ? 0 : 1;
+ // Test for SSE because the optimized code uses movntq, which is not part of MMX.
+ bool has_sse = supports_mmx() && supports_sse();
+ // There is no optimized YV24 SSE routine so we check for this and
+ // fall back to the C code.
+ has_sse &= yuv_type != YV24;
+ // With subsampled chroma and an odd starting column, the first pixel is
+ // converted on its own and the row routines handle the remaining
+ // pic_width - 1 pixels.
+ bool odd_pic_x = yuv_type != YV24 && pic_x % 2 != 0;
+ int x_width = odd_pic_x ? pic_width - 1 : pic_width;
+
+ for (int y = pic_y; y < pic_height + pic_y; ++y) {
+ uint8_t* rgb_row = rgb_buf + (y - pic_y) * rgb_pitch;
+ const uint8_t* y_ptr = y_buf + y * y_pitch + pic_x;
+ const uint8_t* u_ptr = u_buf + (y >> y_shift) * uv_pitch + (pic_x >> x_shift);
+ const uint8_t* v_ptr = v_buf + (y >> y_shift) * uv_pitch + (pic_x >> x_shift);
+
+ if (odd_pic_x) {
+ // Handle the single odd pixel manually and use the
+ // fast routines for the remaining.
+ FastConvertYUVToRGB32Row_C(y_ptr++,
+ u_ptr++,
+ v_ptr++,
+ rgb_row,
+ 1,
+ x_shift);
+ // One output pixel is 4 bytes.
+ rgb_row += 4;
+ }
+
+ if (has_sse) {
+ FastConvertYUVToRGB32Row(y_ptr,
+ u_ptr,
+ v_ptr,
+ rgb_row,
+ x_width);
+ }
+ else {
+ FastConvertYUVToRGB32Row_C(y_ptr,
+ u_ptr,
+ v_ptr,
+ rgb_row,
+ x_width,
+ x_shift);
+ }
+ }
+
+ // MMX used for FastConvertYUVToRGB32Row requires emms instruction.
+ if (has_sse)
+ EMMS();
+}
+
+// Blends two source rows into ybuf: each output byte is
+// (row0 * (256 - fraction) + row1 * fraction) >> 8.
+// C version does 8 at a time to mimic MMX code, so at least 8 bytes are
+// always written and the total written is source_width rounded up to a
+// multiple of 8.
+static void FilterRows_C(uint8_t* ybuf, const uint8_t* y0_ptr, const uint8_t* y1_ptr,
+                         int source_width, int source_y_fraction) {
+  const int weight1 = source_y_fraction;
+  const int weight0 = 256 - weight1;
+  const uint8_t* const stop = ybuf + source_width;
+  do {
+    for (int i = 0; i < 8; ++i) {
+      ybuf[i] = (y0_ptr[i] * weight0 + y1_ptr[i] * weight1) >> 8;
+    }
+    y0_ptr += 8;
+    y1_ptr += 8;
+    ybuf += 8;
+  } while (ybuf < stop);
+}
+
+#ifdef MOZILLA_MAY_SUPPORT_MMX
+void FilterRows_MMX(uint8_t* ybuf, const uint8_t* y0_ptr, const uint8_t* y1_ptr,
+ int source_width, int source_y_fraction);
+#endif
+
+#ifdef MOZILLA_MAY_SUPPORT_SSE2
+void FilterRows_SSE2(uint8_t* ybuf, const uint8_t* y0_ptr, const uint8_t* y1_ptr,
+ int source_width, int source_y_fraction);
+#endif
+
+// Blends two source rows into ybuf using the fastest implementation that is
+// both compiled in and supported by the CPU at run time: SSE2 first, then
+// MMX, then the portable C version.
+static inline void FilterRows(uint8_t* ybuf, const uint8_t* y0_ptr,
+ const uint8_t* y1_ptr, int source_width,
+ int source_y_fraction) {
+#ifdef MOZILLA_MAY_SUPPORT_SSE2
+ if (mozilla::supports_sse2()) {
+ FilterRows_SSE2(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
+ return;
+ }
+#endif
+
+#ifdef MOZILLA_MAY_SUPPORT_MMX
+ if (mozilla::supports_mmx()) {
+ FilterRows_MMX(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
+ return;
+ }
+#endif
+
+ // Portable fallback.
+ FilterRows_C(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
+}
+
+
+// Scale a frame of YUV to 32 bit ARGB.
+// Scales a source_width x source_height frame to width x height.  Depending
+// on the preference and CPU checks below, the work is done either by
+// ScaleYCbCrToRGB32_deprecated() (without rotation) or by
+// libyuv::YUVToARGBScale().
+void ScaleYCbCrToRGB32(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* rgb_buf,
+ int source_width,
+ int source_height,
+ int width,
+ int height,
+ int y_pitch,
+ int uv_pitch,
+ int rgb_pitch,
+ YUVType yuv_type,
+ YUVColorSpace yuv_color_space,
+ ScaleFilter filter) {
+ bool use_deprecated =
+ StaticPrefs::gfx_ycbcr_accurate_conversion() ||
+#if defined(XP_WIN) && defined(_M_X64)
+ // libyuv does not support SIMD scaling on win 64bit. See Bug 1295927.
+ supports_sse3() ||
+#endif
+ (supports_mmx() && supports_sse() && !supports_sse3());
+ // The deprecated function only support BT601.
+ // See Bug 1210357.
+ if (yuv_color_space != YUVColorSpace::BT601) {
+ use_deprecated = false;
+ }
+ if (use_deprecated) {
+ ScaleYCbCrToRGB32_deprecated(y_buf, u_buf, v_buf,
+ rgb_buf,
+ source_width, source_height,
+ width, height,
+ y_pitch, uv_pitch,
+ rgb_pitch,
+ yuv_type,
+ ROTATE_0,
+ filter);
+ return;
+ }
+
+ // NOTE(review): the `filter` argument is not consulted on this path;
+ // libyuv is always asked for bilinear filtering. Confirm this is intended.
+ DebugOnly<int> err =
+ libyuv::YUVToARGBScale(y_buf, y_pitch,
+ u_buf, uv_pitch,
+ v_buf, uv_pitch,
+ FourCCFromYUVType(yuv_type),
+ yuv_color_space,
+ source_width, source_height,
+ rgb_buf, rgb_pitch,
+ width, height,
+ libyuv::kFilterBilinear);
+ MOZ_ASSERT(!err);
+ return;
+}
+
+// Scale a frame of YUV to 32 bit ARGB.
+// Scales a source_width x source_height frame to width x height one output
+// row at a time, optionally mirroring/rotating according to view_rotate.
+// When vertical filtering is requested, two source rows are first blended
+// into an on-stack buffer; each row is then converted (and horizontally
+// scaled) by one of the row functions.  Filtering is disabled for rotated
+// output and for sources wider than the on-stack buffer.
+void ScaleYCbCrToRGB32_deprecated(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* rgb_buf,
+ int source_width,
+ int source_height,
+ int width,
+ int height,
+ int y_pitch,
+ int uv_pitch,
+ int rgb_pitch,
+ YUVType yuv_type,
+ Rotate view_rotate,
+ ScaleFilter filter) {
+ bool has_mmx = supports_mmx();
+
+ // 4096 allows 3 buffers to fit in 12k.
+ // Helps performance on CPU with 16K L1 cache.
+ // Large enough for 3830x2160 and 30" displays which are 2560x1600.
+ const int kFilterBufferSize = 4096;
+ // Disable filtering if the screen is too big (to avoid buffer overflows).
+ // This should never happen to regular users: they don't have monitors
+ // wider than 4096 pixels.
+ // TODO(fbarchard): Allow rotated videos to filter.
+ if (source_width > kFilterBufferSize || view_rotate)
+ filter = FILTER_NONE;
+
+ // Chroma rows are halved only for YV12.
+ unsigned int y_shift = yuv_type == YV12 ? 1 : 0;
+ // Diagram showing origin and direction of source sampling.
+ // ->0 4<-
+ // 7 3
+ //
+ // 6 5
+ // ->1 2<-
+ // Rotations that start at right side of image.
+ if ((view_rotate == ROTATE_180) ||
+ (view_rotate == ROTATE_270) ||
+ (view_rotate == MIRROR_ROTATE_0) ||
+ (view_rotate == MIRROR_ROTATE_90)) {
+ y_buf += source_width - 1;
+ u_buf += source_width / 2 - 1;
+ v_buf += source_width / 2 - 1;
+ source_width = -source_width;
+ }
+ // Rotations that start at bottom of image.
+ if ((view_rotate == ROTATE_90) ||
+ (view_rotate == ROTATE_180) ||
+ (view_rotate == MIRROR_ROTATE_90) ||
+ (view_rotate == MIRROR_ROTATE_180)) {
+ y_buf += (source_height - 1) * y_pitch;
+ u_buf += ((source_height >> y_shift) - 1) * uv_pitch;
+ v_buf += ((source_height >> y_shift) - 1) * uv_pitch;
+ source_height = -source_height;
+ }
+
+ // Handle zero sized destination.
+ if (width == 0 || height == 0)
+ return;
+ // Source steps per destination pixel, in 16.16 fixed point.
+ int source_dx = source_width * kFractionMax / width;
+ int source_dy = source_height * kFractionMax / height;
+ int source_dx_uv = source_dx;
+
+ // For quarter-turn rotations the roles of the two axes are swapped:
+ // stepping along an output row walks down source rows, so the horizontal
+ // step becomes a whole number of pitches and the pitch becomes +/-1.
+ if ((view_rotate == ROTATE_90) ||
+ (view_rotate == ROTATE_270)) {
+ int tmp = height;
+ height = width;
+ width = tmp;
+ tmp = source_height;
+ source_height = source_width;
+ source_width = tmp;
+ int original_dx = source_dx;
+ int original_dy = source_dy;
+ source_dx = ((original_dy >> kFractionBits) * y_pitch) << kFractionBits;
+ source_dx_uv = ((original_dy >> kFractionBits) * uv_pitch) << kFractionBits;
+ source_dy = original_dx;
+ if (view_rotate == ROTATE_90) {
+ y_pitch = -1;
+ uv_pitch = -1;
+ source_height = -source_height;
+ } else {
+ y_pitch = 1;
+ uv_pitch = 1;
+ }
+ }
+
+ // Need padding because FilterRows() will write 1 to 16 extra pixels
+ // after the end for SSE2 version.
+ uint8_t yuvbuf[16 + kFilterBufferSize * 3 + 16];
+ // Round the start of the working area up to a 16-byte boundary, then carve
+ // it into three consecutive kFilterBufferSize-byte row buffers.
+ uint8_t* ybuf =
+ reinterpret_cast<uint8_t*>(reinterpret_cast<uintptr_t>(yuvbuf + 15) & ~15);
+ uint8_t* ubuf = ybuf + kFilterBufferSize;
+ uint8_t* vbuf = ubuf + kFilterBufferSize;
+ // TODO(fbarchard): Fixed point math is off by 1 on negatives.
+ int yscale_fixed = (source_height << kFractionBits) / height;
+
+ // TODO(fbarchard): Split this into separate function for better efficiency.
+ for (int y = 0; y < height; ++y) {
+ uint8_t* dest_pixel = rgb_buf + y * rgb_pitch;
+ int source_y_subpixel = (y * yscale_fixed);
+ if (yscale_fixed >= (kFractionMax * 2)) {
+ source_y_subpixel += kFractionMax / 2; // For 1/2 or less, center filter.
+ }
+ int source_y = source_y_subpixel >> kFractionBits;
+
+ // Pointers to the current source row and the one after it, per plane.
+ const uint8_t* y0_ptr = y_buf + source_y * y_pitch;
+ const uint8_t* y1_ptr = y0_ptr + y_pitch;
+
+ const uint8_t* u0_ptr = u_buf + (source_y >> y_shift) * uv_pitch;
+ const uint8_t* u1_ptr = u0_ptr + uv_pitch;
+ const uint8_t* v0_ptr = v_buf + (source_y >> y_shift) * uv_pitch;
+ const uint8_t* v1_ptr = v0_ptr + uv_pitch;
+
+ // vertical scaler uses 16.8 fixed point
+ int source_y_fraction = (source_y_subpixel & kFractionMask) >> 8;
+ int source_uv_fraction =
+ ((source_y_subpixel >> y_shift) & kFractionMask) >> 8;
+
+ const uint8_t* y_ptr = y0_ptr;
+ const uint8_t* u_ptr = u0_ptr;
+ const uint8_t* v_ptr = v0_ptr;
+ // Apply vertical filtering if necessary.
+ // TODO(fbarchard): Remove memcpy when not necessary.
+ if (filter & mozilla::gfx::FILTER_BILINEAR_V) {
+ // Blend only when there is a fractional row position and a following
+ // source row to blend with; otherwise copy the row unchanged.
+ if (yscale_fixed != kFractionMax &&
+ source_y_fraction && ((source_y + 1) < source_height)) {
+ FilterRows(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
+ } else {
+ memcpy(ybuf, y0_ptr, source_width);
+ }
+ y_ptr = ybuf;
+ // Duplicate the last sample one position past the end of the row.
+ ybuf[source_width] = ybuf[source_width-1];
+ int uv_source_width = (source_width + 1) / 2;
+ if (yscale_fixed != kFractionMax &&
+ source_uv_fraction &&
+ (((source_y >> y_shift) + 1) < (source_height >> y_shift))) {
+ FilterRows(ubuf, u0_ptr, u1_ptr, uv_source_width, source_uv_fraction);
+ FilterRows(vbuf, v0_ptr, v1_ptr, uv_source_width, source_uv_fraction);
+ } else {
+ memcpy(ubuf, u0_ptr, uv_source_width);
+ memcpy(vbuf, v0_ptr, uv_source_width);
+ }
+ u_ptr = ubuf;
+ v_ptr = vbuf;
+ ubuf[uv_source_width] = ubuf[uv_source_width - 1];
+ vbuf[uv_source_width] = vbuf[uv_source_width - 1];
+ }
+ if (source_dx == kFractionMax) { // Not scaled
+ FastConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
+ dest_pixel, width);
+ } else if (filter & FILTER_BILINEAR_H) {
+ LinearScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
+ dest_pixel, width, source_dx);
+ } else {
+// Specialized scalers and rotation.
+#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_MSC_VER) && defined(_M_IX86) && !defined(__clang__)
+ if(mozilla::supports_sse()) {
+ if (width == (source_width * 2)) {
+ DoubleYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr,
+ dest_pixel, width);
+ } else if ((source_dx & kFractionMask) == 0) {
+ // Scaling by integer scale factor. ie half.
+ ConvertYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr,
+ dest_pixel, width,
+ source_dx >> kFractionBits);
+ } else if (source_dx_uv == source_dx) { // Not rotated.
+ ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
+ dest_pixel, width, source_dx);
+ } else {
+ RotateConvertYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr,
+ dest_pixel, width,
+ source_dx >> kFractionBits,
+ source_dx_uv >> kFractionBits);
+ }
+ }
+ else {
+ ScaleYUVToRGB32Row_C(y_ptr, u_ptr, v_ptr,
+ dest_pixel, width, source_dx);
+ }
+#else
+ // source_dx_uv is only consumed by the MSVC/x86 branch above.
+ (void)source_dx_uv;
+ ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
+ dest_pixel, width, source_dx);
+#endif
+ }
+ }
+ // MMX used for FastConvertYUVToRGB32Row and FilterRows requires emms.
+ if (has_mmx)
+ EMMS();
+}
+// Converts an I420 frame with a separate alpha plane to 32 bit ARGB through
+// libyuv::I420AlphaToARGB.  The Y' and alpha planes share ya_pitch and the
+// two chroma planes share uv_pitch.
+void ConvertI420AlphaToARGB32(const uint8_t* y_buf,
+                              const uint8_t* u_buf,
+                              const uint8_t* v_buf,
+                              const uint8_t* a_buf,
+                              uint8_t* argb_buf,
+                              int pic_width,
+                              int pic_height,
+                              int ya_pitch,
+                              int uv_pitch,
+                              int argb_pitch) {
+  // The downstream graphics stack expects an attenuated input, which is why
+  // the attenuation parameter is switched on.
+  const int attenuate = 1;
+  DebugOnly<int> err =
+      libyuv::I420AlphaToARGB(y_buf, ya_pitch, u_buf, uv_pitch, v_buf,
+                              uv_pitch, a_buf, ya_pitch, argb_buf, argb_pitch,
+                              pic_width, pic_height, attenuate);
+  MOZ_ASSERT(!err);
+}
+
+} // namespace gfx
+} // namespace mozilla
diff --git a/gfx/ycbcr/yuv_convert.h b/gfx/ycbcr/yuv_convert.h
new file mode 100644
index 0000000000..3368890819
--- /dev/null
+++ b/gfx/ycbcr/yuv_convert.h
@@ -0,0 +1,123 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// clang-format off
+
+#ifndef MEDIA_BASE_YUV_CONVERT_H_
+#define MEDIA_BASE_YUV_CONVERT_H_
+
+#include "chromium_types.h"
+#include "mozilla/gfx/Types.h"
+
+namespace mozilla {
+
+namespace gfx {
+
+// Type of YUV surface, described by the size of its chroma planes.
+// The values of these enumerators matter as they are used to shift vertical indices.
+enum YUVType {
+ YV12 = 0, // YV12: chroma planes are half width and half height.
+ YV16 = 1, // YV16: chroma planes are half width and full height.
+ YV24 = 2, // YV24: chroma planes are full width and full height.
+ Y8 = 3 // Y8: monochrome, no chroma planes.
+};
+
+// Mirror means flipping the image horizontally, as when looking in a mirror.
+// Rotation is applied after mirroring.
+enum Rotate {
+ ROTATE_0, // Rotation off.
+ ROTATE_90, // Rotate clockwise.
+ ROTATE_180, // Rotate upside down.
+ ROTATE_270, // Rotate counter clockwise.
+ MIRROR_ROTATE_0, // Mirror horizontally.
+ MIRROR_ROTATE_90, // Mirror, then rotate clockwise.
+ MIRROR_ROTATE_180, // Mirror vertically.
+ MIRROR_ROTATE_270 // Transpose.
+};
+
+// Filter affects how scaling looks. The values are bit flags and are tested with '&'.
+enum ScaleFilter {
+ FILTER_NONE = 0, // No filter (point sampled).
+ FILTER_BILINEAR_H = 1, // Bilinear horizontal filter.
+ FILTER_BILINEAR_V = 2, // Bilinear vertical filter.
+ FILTER_BILINEAR = 3 // Bilinear filter in both directions (FILTER_BILINEAR_H | FILTER_BILINEAR_V).
+};
+
+// Convert a frame of YUV to 32 bit ARGB.
+// Pass in YV16/YV12 depending on source format
+void ConvertYCbCrToRGB32(const uint8_t* yplane,
+ const uint8_t* uplane,
+ const uint8_t* vplane,
+ uint8_t* rgbframe,
+ int pic_x,
+ int pic_y,
+ int pic_width,
+ int pic_height,
+ int ystride,
+ int uvstride,
+ int rgbstride,
+ YUVType yuv_type,
+ YUVColorSpace yuv_color_space,
+ ColorRange color_range);
+
+void ConvertYCbCrToRGB32_deprecated(const uint8_t* yplane,
+ const uint8_t* uplane,
+ const uint8_t* vplane,
+ uint8_t* rgbframe,
+ int pic_x,
+ int pic_y,
+ int pic_width,
+ int pic_height,
+ int ystride,
+ int uvstride,
+ int rgbstride,
+ YUVType yuv_type);
+
+// Scale a frame of YUV to 32 bit ARGB.
+// Supports rotation and mirroring.
+void ScaleYCbCrToRGB32(const uint8_t* yplane,
+ const uint8_t* uplane,
+ const uint8_t* vplane,
+ uint8_t* rgbframe,
+ int source_width,
+ int source_height,
+ int width,
+ int height,
+ int ystride,
+ int uvstride,
+ int rgbstride,
+ YUVType yuv_type,
+ YUVColorSpace yuv_color_space,
+ ScaleFilter filter);
+
+void ScaleYCbCrToRGB32_deprecated(const uint8_t* yplane,
+ const uint8_t* uplane,
+ const uint8_t* vplane,
+ uint8_t* rgbframe,
+ int source_width,
+ int source_height,
+ int width,
+ int height,
+ int ystride,
+ int uvstride,
+ int rgbstride,
+ YUVType yuv_type,
+ Rotate view_rotate,
+ ScaleFilter filter);
+
+void ConvertI420AlphaToARGB32(const uint8_t* yplane,
+ const uint8_t* uplane,
+ const uint8_t* vplane,
+ const uint8_t* aplane,
+ uint8_t* argbframe,
+ int pic_width,
+ int pic_height,
+ int yastride,
+ int uvstride,
+ int argbstride);
+
+} // namespace gfx
+} // namespace mozilla
+
+#endif // MEDIA_BASE_YUV_CONVERT_H_
diff --git a/gfx/ycbcr/yuv_convert_arm.cpp b/gfx/ycbcr/yuv_convert_arm.cpp
new file mode 100644
index 0000000000..7d58fa475d
--- /dev/null
+++ b/gfx/ycbcr/yuv_convert_arm.cpp
@@ -0,0 +1,232 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// contributor Siarhei Siamashka <siarhei.siamashka@gmail.com>
+
+#include "yuv_convert.h"
+#include "ycbcr_to_rgb565.h"
+
+
+
+#ifdef HAVE_YCBCR_TO_RGB565
+
+namespace mozilla {
+
+namespace gfx {
+
+# if defined(MOZILLA_MAY_SUPPORT_NEON)
+# if defined(__clang__)
+void __attribute((noinline))
+# else
+void __attribute((noinline,optimize("-fomit-frame-pointer")))
+# endif
+ yuv42x_to_rgb565_row_neon(uint16_t *dst,
+ const uint8_t *y,
+ const uint8_t *u,
+ const uint8_t *v,
+ int n,
+ int oddflag)
+{
+ static __attribute__((aligned(16))) uint16_t acc_r[8] = {
+ 22840, 22840, 22840, 22840, 22840, 22840, 22840, 22840,
+ };
+ static __attribute__((aligned(16))) uint16_t acc_g[8] = {
+ 17312, 17312, 17312, 17312, 17312, 17312, 17312, 17312,
+ };
+ static __attribute__((aligned(16))) uint16_t acc_b[8] = {
+ 28832, 28832, 28832, 28832, 28832, 28832, 28832, 28832,
+ };
+ /*
+ * Converts n pixels read from the y, u and v rows into RGB565 pixels
+ * written to dst. When oddflag is non-zero a single pixel is converted
+ * first; the rest of the row is then processed in macroblocks of 16 pixels,
+ * followed by 8, 4, 2 and 1 pixel macroblocks selected by the low bits of
+ * the remaining count.
+ *
+ * Registers:
+ * q0, q1 : d0, d1, d2, d3 - are used for initial loading of YUV data
+ * q2 : d4, d5 - are used for storing converted RGB data
+ * q3 : d6, d7 - are used for temporary storage
+ *
+ * q4 : d8, d9 - constants added to U/V and Y to saturate the upper boundary
+ * q5 : d10, d11 - constants subtracted from U/V and Y to saturate the lower boundary
+ * q6, q7 - not used (d12-d15 are left out of the clobber list)
+ *
+ * q8, q9 : d16, d17, d18, d19 - are used for expanded Y data
+ * q10 : d20, d21 - accumulator for red
+ * q11 : d22, d23 - accumulator for green
+ * q12 : d24, d25 - accumulator for blue
+ * q13, q14, q15 : d26 - d31 - various constants (#16, #149, #204, #50, #104, #154)
+ */
+ asm volatile (
+".fpu neon\n"
+/* Allow to build on targets not supporting neon, and force the object file
+ * target to avoid bumping the final binary target */
+".arch armv7-a\n"
+".object_arch armv4t\n"
+".macro convert_macroblock size\n"
+/* load up to 16 source pixels */
+ ".if \\size == 16\n"
+ "pld [%[y], #64]\n"
+ "pld [%[u], #64]\n"
+ "pld [%[v], #64]\n"
+ "vld1.8 {d1}, [%[y]]!\n"
+ "vld1.8 {d3}, [%[y]]!\n"
+ "vld1.8 {d0}, [%[u]]!\n"
+ "vld1.8 {d2}, [%[v]]!\n"
+ ".elseif \\size == 8\n"
+ "vld1.8 {d1}, [%[y]]!\n"
+ "vld1.8 {d0[0]}, [%[u]]!\n"
+ "vld1.8 {d0[1]}, [%[u]]!\n"
+ "vld1.8 {d0[2]}, [%[u]]!\n"
+ "vld1.8 {d0[3]}, [%[u]]!\n"
+ "vld1.8 {d2[0]}, [%[v]]!\n"
+ "vld1.8 {d2[1]}, [%[v]]!\n"
+ "vld1.8 {d2[2]}, [%[v]]!\n"
+ "vld1.8 {d2[3]}, [%[v]]!\n"
+ ".elseif \\size == 4\n"
+ "vld1.8 {d1[0]}, [%[y]]!\n"
+ "vld1.8 {d1[1]}, [%[y]]!\n"
+ "vld1.8 {d1[2]}, [%[y]]!\n"
+ "vld1.8 {d1[3]}, [%[y]]!\n"
+ "vld1.8 {d0[0]}, [%[u]]!\n"
+ "vld1.8 {d0[1]}, [%[u]]!\n"
+ "vld1.8 {d2[0]}, [%[v]]!\n"
+ "vld1.8 {d2[1]}, [%[v]]!\n"
+ ".elseif \\size == 2\n"
+ "vld1.8 {d1[0]}, [%[y]]!\n"
+ "vld1.8 {d1[1]}, [%[y]]!\n"
+ "vld1.8 {d0[0]}, [%[u]]!\n"
+ "vld1.8 {d2[0]}, [%[v]]!\n"
+ ".elseif \\size == 1\n"
+ "vld1.8 {d1[0]}, [%[y]]!\n"
+ "vld1.8 {d0[0]}, [%[u]]!\n"
+ "vld1.8 {d2[0]}, [%[v]]!\n"
+ ".else\n"
+ ".error \"unsupported macroblock size\"\n"
+ ".endif\n"
+
+ /* d1 - Y data (first 8 bytes) */
+ /* d3 - Y data (next 8 bytes) */
+ /* d0 - U data, d2 - V data */
+
+ /* split even and odd Y color components */
+ "vuzp.8 d1, d3\n" /* d1 - evenY, d3 - oddY */
+ /* clip upper and lower boundaries */
+ "vqadd.u8 q0, q0, q4\n"
+ "vqadd.u8 q1, q1, q4\n"
+ "vqsub.u8 q0, q0, q5\n"
+ "vqsub.u8 q1, q1, q5\n"
+
+ "vshr.u8 d4, d2, #1\n" /* d4 = V >> 1 */
+
+ "vmull.u8 q8, d1, d27\n" /* q8 = evenY * 149 */
+ "vmull.u8 q9, d3, d27\n" /* q9 = oddY * 149 */
+
+ "vld1.16 {d20, d21}, [%[acc_r], :128]\n" /* q10 - initialize accumulator for red */
+ "vsubw.u8 q10, q10, d4\n" /* red acc -= (V >> 1) */
+ "vmlsl.u8 q10, d2, d28\n" /* red acc -= V * 204 */
+ "vld1.16 {d22, d23}, [%[acc_g], :128]\n" /* q11 - initialize accumulator for green */
+ "vmlsl.u8 q11, d2, d30\n" /* green acc -= V * 104 */
+ "vmlsl.u8 q11, d0, d29\n" /* green acc -= U * 50 */
+ "vld1.16 {d24, d25}, [%[acc_b], :128]\n" /* q12 - initialize accumulator for blue */
+ "vmlsl.u8 q12, d0, d30\n" /* blue acc -= U * 104 */
+ "vmlsl.u8 q12, d0, d31\n" /* blue acc -= U * 154 */
+
+ "vhsub.s16 q3, q8, q10\n" /* calculate even red components */
+ "vhsub.s16 q10, q9, q10\n" /* calculate odd red components */
+ "vqshrun.s16 d0, q3, #6\n" /* right shift, narrow and saturate even red components */
+ "vqshrun.s16 d3, q10, #6\n" /* right shift, narrow and saturate odd red components */
+
+ "vhadd.s16 q3, q8, q11\n" /* calculate even green components */
+ "vhadd.s16 q11, q9, q11\n" /* calculate odd green components */
+ "vqshrun.s16 d1, q3, #6\n" /* right shift, narrow and saturate even green components */
+ "vqshrun.s16 d4, q11, #6\n" /* right shift, narrow and saturate odd green components */
+
+ "vhsub.s16 q3, q8, q12\n" /* calculate even blue components */
+ "vhsub.s16 q12, q9, q12\n" /* calculate odd blue components */
+ "vqshrun.s16 d2, q3, #6\n" /* right shift, narrow and saturate even blue components */
+ "vqshrun.s16 d5, q12, #6\n" /* right shift, narrow and saturate odd blue components */
+
+ "vzip.8 d0, d3\n" /* join even and odd red components */
+ "vzip.8 d1, d4\n" /* join even and odd green components */
+ "vzip.8 d2, d5\n" /* join even and odd blue components */
+
+ "vshll.u8 q3, d0, #8\n\t"
+ "vshll.u8 q8, d1, #8\n\t"
+ "vshll.u8 q9, d2, #8\n\t"
+ "vsri.u16 q3, q8, #5\t\n"
+ "vsri.u16 q3, q9, #11\t\n"
+ /* store pixel data to memory */
+ ".if \\size == 16\n"
+ " vst1.16 {d6, d7}, [%[dst]]!\n"
+ " vshll.u8 q3, d3, #8\n\t"
+ " vshll.u8 q8, d4, #8\n\t"
+ " vshll.u8 q9, d5, #8\n\t"
+ " vsri.u16 q3, q8, #5\t\n"
+ " vsri.u16 q3, q9, #11\t\n"
+ " vst1.16 {d6, d7}, [%[dst]]!\n"
+ ".elseif \\size == 8\n"
+ " vst1.16 {d6, d7}, [%[dst]]!\n"
+ ".elseif \\size == 4\n"
+ " vst1.16 {d6}, [%[dst]]!\n"
+ ".elseif \\size == 2\n"
+ " vst1.16 {d6[0]}, [%[dst]]!\n"
+ " vst1.16 {d6[1]}, [%[dst]]!\n"
+ ".elseif \\size == 1\n"
+ " vst1.16 {d6[0]}, [%[dst]]!\n"
+ ".endif\n"
+ ".endm\n"
+
+ "vmov.u8 d8, #15\n" /* add this to U/V to saturate upper boundary */
+ "vmov.u8 d9, #20\n" /* add this to Y to saturate upper boundary */
+ "vmov.u8 d10, #31\n" /* sub this from U/V to saturate lower boundary */
+ "vmov.u8 d11, #36\n" /* sub this from Y to saturate lower boundary */
+
+ "vmov.u8 d26, #16\n"
+ "vmov.u8 d27, #149\n"
+ "vmov.u8 d28, #204\n"
+ "vmov.u8 d29, #50\n"
+ "vmov.u8 d30, #104\n"
+ "vmov.u8 d31, #154\n"
+
+ "cmp %[oddflag], #0\n"
+ "beq 1f\n"
+ "convert_macroblock 1\n"
+ "sub %[n], %[n], #1\n"
+ "1:\n"
+ "subs %[n], %[n], #16\n"
+ "blt 2f\n"
+ "1:\n"
+ "convert_macroblock 16\n"
+ "subs %[n], %[n], #16\n"
+ "bge 1b\n"
+ "2:\n"
+ "tst %[n], #8\n"
+ "beq 3f\n"
+ "convert_macroblock 8\n"
+ "3:\n"
+ "tst %[n], #4\n"
+ "beq 4f\n"
+ "convert_macroblock 4\n"
+ "4:\n"
+ "tst %[n], #2\n"
+ "beq 5f\n"
+ "convert_macroblock 2\n"
+ "5:\n"
+ "tst %[n], #1\n"
+ "beq 6f\n"
+ "convert_macroblock 1\n"
+ "6:\n"
+ ".purgem convert_macroblock\n"
+ : [y] "+&r" (y), [u] "+&r" (u), [v] "+&r" (v), [dst] "+&r" (dst), [n] "+&r" (n)
+ : [acc_r] "r" (&acc_r[0]), [acc_g] "r" (&acc_g[0]), [acc_b] "r" (&acc_b[0]),
+ [oddflag] "r" (oddflag)
+ : "cc", "memory",
+ "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
+ "d8", "d9", "d10", "d11", /* "d12", "d13", "d14", "d15", */
+ "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
+ "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"
+ );
+}
+# endif // MOZILLA_MAY_SUPPORT_NEON
+
+} // namespace gfx
+
+} // namespace mozilla
+
+#endif // HAVE_YCBCR_TO_RGB565
diff --git a/gfx/ycbcr/yuv_convert_mmx.cpp b/gfx/ycbcr/yuv_convert_mmx.cpp
new file mode 100644
index 0000000000..797b032f79
--- /dev/null
+++ b/gfx/ycbcr/yuv_convert_mmx.cpp
@@ -0,0 +1,45 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <mmintrin.h>
+#include "yuv_row.h"
+
+namespace mozilla {
+namespace gfx {
+
+// Blends two image rows into ybuf using linear interpolation. Every output
+// byte is (y0 * (256 - source_y_fraction) + y1 * source_y_fraction) >> 8,
+// evaluated in 16-bit lanes.
+// The MMX version handles 8 pixels per iteration and always runs at least one
+// iteration, so it writes whole 8-byte groups until ybuf + source_width is
+// reached or passed.
+void FilterRows_MMX(uint8_t* ybuf, const uint8_t* y0_ptr, const uint8_t* y1_ptr,
+                    int source_width, int source_y_fraction) {
+  const __m64 all_zero = _mm_setzero_si64();
+  const __m64 weight_row1 = _mm_set1_pi16(source_y_fraction);
+  const __m64 weight_row0 = _mm_set1_pi16(256 - source_y_fraction);
+
+  const __m64* row0 = reinterpret_cast<const __m64*>(y0_ptr);
+  const __m64* row1 = reinterpret_cast<const __m64*>(y1_ptr);
+  __m64* out = reinterpret_cast<__m64*>(ybuf);
+  __m64* const out_end = reinterpret_cast<__m64*>(ybuf + source_width);
+
+  do {
+    const __m64 px0 = *row0++;
+    const __m64 px1 = *row1++;
+
+    // Widen the low four bytes of each row to 16 bits, weight and sum them.
+    __m64 low = _mm_add_pi16(
+        _mm_mullo_pi16(_mm_unpacklo_pi8(px0, all_zero), weight_row0),
+        _mm_mullo_pi16(_mm_unpacklo_pi8(px1, all_zero), weight_row1));
+    // Same for the high four bytes.
+    __m64 high = _mm_add_pi16(
+        _mm_mullo_pi16(_mm_unpackhi_pi8(px0, all_zero), weight_row0),
+        _mm_mullo_pi16(_mm_unpackhi_pi8(px1, all_zero), weight_row1));
+
+    // Drop the 8 fractional bits and pack back to unsigned bytes.
+    low = _mm_srli_pi16(low, 8);
+    high = _mm_srli_pi16(high, 8);
+    *out++ = _mm_packs_pu16(low, high);
+  } while (out < out_end);
+}
+
+} // namespace gfx
+} // namespace mozilla
diff --git a/gfx/ycbcr/yuv_convert_sse2.cpp b/gfx/ycbcr/yuv_convert_sse2.cpp
new file mode 100644
index 0000000000..b5a84c908d
--- /dev/null
+++ b/gfx/ycbcr/yuv_convert_sse2.cpp
@@ -0,0 +1,47 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <emmintrin.h>
+#include "yuv_row.h"
+
+namespace mozilla {
+namespace gfx {
+
+// Blends two image rows into ybuf using linear interpolation. Every output
+// byte is (y0 * (256 - source_y_fraction) + y1 * source_y_fraction) >> 8,
+// evaluated in 16-bit lanes.
+// The SSE2 version handles 16 pixels per iteration and always runs at least
+// one iteration. Source rows are read with unaligned loads; the destination
+// is written with an aligned 128-bit store.
+void FilterRows_SSE2(uint8_t* ybuf, const uint8_t* y0_ptr, const uint8_t* y1_ptr,
+                     int source_width, int source_y_fraction) {
+  const __m128i all_zero = _mm_setzero_si128();
+  const __m128i weight_row1 = _mm_set1_epi16(source_y_fraction);
+  const __m128i weight_row0 = _mm_set1_epi16(256 - source_y_fraction);
+
+  const __m128i* row0 = reinterpret_cast<const __m128i*>(y0_ptr);
+  const __m128i* row1 = reinterpret_cast<const __m128i*>(y1_ptr);
+  __m128i* out = reinterpret_cast<__m128i*>(ybuf);
+  __m128i* const out_end = reinterpret_cast<__m128i*>(ybuf + source_width);
+
+  do {
+    const __m128i px0 = _mm_loadu_si128(row0++);
+    const __m128i px1 = _mm_loadu_si128(row1++);
+
+    // Widen the low eight bytes of each row to 16 bits, weight and sum them.
+    __m128i low = _mm_add_epi16(
+        _mm_mullo_epi16(_mm_unpacklo_epi8(px0, all_zero), weight_row0),
+        _mm_mullo_epi16(_mm_unpacklo_epi8(px1, all_zero), weight_row1));
+    // Same for the high eight bytes.
+    __m128i high = _mm_add_epi16(
+        _mm_mullo_epi16(_mm_unpackhi_epi8(px0, all_zero), weight_row0),
+        _mm_mullo_epi16(_mm_unpackhi_epi8(px1, all_zero), weight_row1));
+
+    // Drop the 8 fractional bits and pack back to unsigned bytes.
+    low = _mm_srli_epi16(low, 8);
+    high = _mm_srli_epi16(high, 8);
+    _mm_store_si128(out++, _mm_packus_epi16(low, high));
+  } while (out < out_end);
+}
+
+} // namespace gfx
+} // namespace mozilla
diff --git a/gfx/ycbcr/yuv_row.h b/gfx/ycbcr/yuv_row.h
new file mode 100644
index 0000000000..95ad16fb71
--- /dev/null
+++ b/gfx/ycbcr/yuv_row.h
@@ -0,0 +1,154 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// yuv_row internal functions to handle YUV conversion and scaling to RGB.
+// These functions are used from both yuv_convert.cc and yuv_scale.cc.
+
+// TODO(fbarchard): Write function that can handle rotation and scaling.
+
+#ifndef MEDIA_BASE_YUV_ROW_H_
+#define MEDIA_BASE_YUV_ROW_H_
+
+#include "chromium_types.h"
+
+extern "C" {
+// Can only do 1x.
+// This is the second fastest of the scalers.
+void FastConvertYUVToRGB32Row(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* rgb_buf,
+ int width);
+
+void FastConvertYUVToRGB32Row_C(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* rgb_buf,
+ int width,
+ unsigned int x_shift);
+
+void FastConvertYUVToRGB32Row(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* rgb_buf,
+ int width);
+
+// Can do 1x, half size or any scale down by an integer amount.
+// Step can be negative (mirroring, rotate 180).
+// This is the third fastest of the scalers.
+// Only defined on Windows x86-32.
+void ConvertYUVToRGB32Row_SSE(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* rgb_buf,
+ int width,
+ int step);
+
+// Rotate is like Convert, but applies different step to Y versus U and V.
+// This allows rotation by 90 or 270, by stepping by stride.
+// This is the fourth fastest of the scalers.
+// Only defined on Windows x86-32.
+void RotateConvertYUVToRGB32Row_SSE(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* rgb_buf,
+ int width,
+ int ystep,
+ int uvstep);
+
+// Doubler does 4 pixels at a time. Each pixel is replicated.
+// This is the fastest of the scalers.
+// Only defined on Windows x86-32.
+void DoubleYUVToRGB32Row_SSE(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* rgb_buf,
+ int width);
+
+// Handles arbitrary scaling up or down.
+// Mirroring is supported, but not 90 or 270 degree rotation.
+// Chroma is under sampled every 2 pixels for performance.
+void ScaleYUVToRGB32Row(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* rgb_buf,
+ int width,
+ int source_dx);
+
+void ScaleYUVToRGB32Row(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* rgb_buf,
+ int width,
+ int source_dx);
+
+void ScaleYUVToRGB32Row_C(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* rgb_buf,
+ int width,
+ int source_dx);
+
+// Handles arbitrary scaling up or down with bilinear filtering.
+// Mirroring is supported, but not 90 or 270 degree rotation.
+// Chroma is under sampled every 2 pixels for performance.
+// This is the slowest of the scalers.
+void LinearScaleYUVToRGB32Row(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* rgb_buf,
+ int width,
+ int source_dx);
+
+void LinearScaleYUVToRGB32Row(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* rgb_buf,
+ int width,
+ int source_dx);
+
+void LinearScaleYUVToRGB32Row_C(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* rgb_buf,
+ int width,
+ int source_dx);
+
+
+#if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)
+#if defined(VISUALC_HAS_AVX2)
+#define SIMD_ALIGNED(var) __declspec(align(32)) var
+#else
+#define SIMD_ALIGNED(var) __declspec(align(16)) var
+#endif
+#elif defined(__GNUC__) || defined(__clang__)
+// Caveat GCC 4.2 to 4.7 have a known issue using vectors with const.
+#if defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)
+#define SIMD_ALIGNED(var) var __attribute__((aligned(32)))
+#else
+#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
+#endif
+#else
+#define SIMD_ALIGNED(var) var
+#endif
+
+extern SIMD_ALIGNED(const int16_t kCoefficientsRgbY[768][4]);
+
+// x64 uses MMX2 (SSE) so emms is not required.
+// Warning C4799: function has no EMMS instruction.
+// EMMS() is slow and should be called by the calling function once per image.
+#if defined(ARCH_CPU_X86) && !defined(ARCH_CPU_X86_64)
+#if defined(_MSC_VER)
+#define EMMS() __asm emms
+#pragma warning(disable: 4799)
+#else
+#define EMMS() asm("emms")
+#endif
+#else
+#define EMMS() ((void)0)
+#endif
+
+} // extern "C"
+
+#endif // MEDIA_BASE_YUV_ROW_H_
diff --git a/gfx/ycbcr/yuv_row_arm.s b/gfx/ycbcr/yuv_row_arm.s
new file mode 100644
index 0000000000..6a6c81beeb
--- /dev/null
+++ b/gfx/ycbcr/yuv_row_arm.s
@@ -0,0 +1,304 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+ .arch armv7-a
+ .fpu neon
+/* Allow to build on targets not supporting neon, and force the object file
+ * target to avoid bumping the final binary target */
+ .object_arch armv4t
+ .text
+ .align
+
+ .balign 64
+YCbCr42xToRGB565_DITHER03_CONSTS_NEON:
+ .short -14240
+ .short -14240+384
+ .short 8672
+ .short 8672+192
+ .short -17696
+ .short -17696+384
+ .byte 102
+ .byte 25
+ .byte 52
+ .byte 129
+YCbCr42xToRGB565_DITHER12_CONSTS_NEON:
+ .short -14240+128
+ .short -14240+256
+ .short 8672+64
+ .short 8672+128
+ .short -17696+128
+ .short -17696+256
+ .byte 102
+ .byte 25
+ .byte 52
+ .byte 129
+YCbCr42xToRGB565_DITHER21_CONSTS_NEON:
+ .short -14240+256
+ .short -14240+128
+ .short 8672+128
+ .short 8672+64
+ .short -17696+256
+ .short -17696+128
+ .byte 102
+ .byte 25
+ .byte 52
+ .byte 129
+YCbCr42xToRGB565_DITHER30_CONSTS_NEON:
+ .short -14240+384
+ .short -14240
+ .short 8672+192
+ .short 8672
+ .short -17696+384
+ .short -17696
+ .byte 102
+ .byte 25
+ .byte 52
+ .byte 129
+
+@ void ScaleYCbCr42xToRGB565_BilinearY_Row_NEON(
+@ yuv2rgb565_row_scale_bilinear_ctx *ctx, int dither);
+@
+@ ctx = {
+@ uint16_t *rgb_row; /*r0*/
+@ const uint8_t *y_row; /*r1*/
+@ const uint8_t *u_row; /*r2*/
+@ const uint8_t *v_row; /*r3*/
+@ int y_yweight; /*r4*/
+@ int y_pitch; /*r5*/
+@ int width; /*r6*/
+@ int source_x0_q16; /*r7*/
+@ int source_dx_q16; /*r8*/
+@ int source_uv_xoffs_q16; /*r9*/
+@ };
+ .global ScaleYCbCr42xToRGB565_BilinearY_Row_NEON
+ .type ScaleYCbCr42xToRGB565_BilinearY_Row_NEON, %function
+ .balign 64
+ .fnstart
+ScaleYCbCr42xToRGB565_BilinearY_Row_NEON:
+ STMFD r13!,{r4-r9,r14} @ 7 words.
+ ADR r14,YCbCr42xToRGB565_DITHER03_CONSTS_NEON
+ VPUSH {Q4-Q7} @ 16 words.
+ ADD r14,r14,r1, LSL #4 @ Select the dither table to use
+ LDMIA r0, {r0-r9}
+ @ Set up image index registers.
+ ADD r12,r8, r8
+ VMOV.I32 D16,#0 @ Q8 = < 2| 2| 0| 0>*source_dx_q16
+ VDUP.32 D17,r12
+ ADD r12,r12,r12
+ VTRN.32 D16,D17 @ Q8 = < 2| 0| 2| 0>*source_dx_q16
+ VDUP.32 D19,r12 @ Q9 = < 4| 4| ?| ?>*source_dx_q16
+ ADD r12,r12,r12
+ VDUP.32 Q0, r7 @ Q0 = < 1| 1| 1| 1>*source_x0_q16
+ VADD.I32 D17,D17,D19 @ Q8 = < 6| 4| 2| 0>*source_dx_q16
+ CMP r8, #0 @ If source_dx_q16 is negative...
+ VDUP.32 Q9, r12 @ Q9 = < 8| 8| 8| 8>*source_dx_q16
+ ADDLT r7, r7, r8, LSL #4 @ Make r7 point to the end of the block
+ VADD.I32 Q0, Q0, Q8 @ Q0 = < 6| 4| 2| 0>*source_dx_q16+source_x0_q16
+ SUBLT r7, r7, r8 @ (i.e., the lowest address we'll use)
+ VADD.I32 Q1, Q0, Q9 @ Q1 = <14|12|10| 8>*source_dx_q16+source_x0_q16
+ VDUP.I32 Q9, r8 @ Q9 = < 1| 1| 1| 1>*source_dx_q16
+ VADD.I32 Q2, Q0, Q9 @ Q2 = < 7| 5| 3| 1>*source_dx_q16+source_x0_q16
+ VADD.I32 Q3, Q1, Q9 @ Q3 = <15|13|11| 9>*source_dx_q16+source_x0_q16
+ VLD1.64 {D30,D31},[r14,:128] @ Load some constants
+ VMOV.I8 D28,#52
+ VMOV.I8 D29,#129
+ @ The basic idea here is to do aligned loads of a block of data and then
+ @ index into it using VTBL to extract the data from the source X
+ @ coordinate corresponding to each destination pixel.
+ @ This is significantly less code and significantly fewer cycles than doing
+ @ a series of single-lane loads, but it means that the X step between
+ @ pixels must be limited to 2.0 or less, otherwise we couldn't guarantee
+ @ that we could read 8 pixels from a single aligned 32-byte block of data.
+ @ Q0...Q3 contain the 16.16 fixed-point X coordinates of each pixel,
+ @ separated into even pixels and odd pixels to make extracting offsets and
+ @ weights easier.
+ @ We then pull out two bytes from the middle of each coordinate: the top
+ @ byte corresponds to the integer part of the X coordinate, and the bottom
+ @ byte corresponds to the weight to use for bilinear blending.
+ @ These are separated out into different registers with VTRN.
+ @ Then by subtracting the integer X coordinate of the first pixel in the
+ @ data block we loaded, we produce an index register suitable for use by
+ @ VTBL.
+s42xbily_neon_loop:
+ @ Load the Y' data.
+ MOV r12,r7, ASR #16
+ VRSHRN.S32 D16,Q0, #8
+ AND r12,r12,#~15 @ Read 16-byte aligned blocks
+ VDUP.I8 D20,r12
+ ADD r12,r1, r12 @ r12 = y_row+(source_x&~15)
+ VRSHRN.S32 D17,Q1, #8
+ PLD [r12,#64]
+ VLD1.64 {D8, D9, D10,D11},[r12,:128],r5 @ Load Y' top row
+ ADD r14,r7, r8, LSL #3
+ VRSHRN.S32 D18,Q2, #8
+ MOV r14,r14,ASR #16
+ VRSHRN.S32 D19,Q3, #8
+ AND r14,r14,#~15 @ Read 16-byte aligned blocks
+ VLD1.64 {D12,D13,D14,D15},[r12,:128] @ Load Y' bottom row
+ PLD [r12,#64]
+ VDUP.I8 D21,r14
+ ADD r14,r1, r14 @ r14 = y_row+(source_x&~15)
+ VMOV.I8 Q13,#1
+ PLD [r14,#64]
+ VTRN.8 Q8, Q9 @ Q8 = <wFwEwDwCwBwAw9w8w7w6w5w4w3w2w1w0>
+ @ Q9 = <xFxExDxCxBxAx9x8x7x6x5x4x3x2x1x0>
+ VSUB.S8 Q9, Q9, Q10 @ Make offsets relative to the data we loaded.
+ @ First 8 Y' pixels
+ VTBL.8 D20,{D8, D9, D10,D11},D18 @ Index top row at source_x
+ VTBL.8 D24,{D12,D13,D14,D15},D18 @ Index bottom row at source_x
+ VADD.S8 Q13,Q9, Q13 @ Add 1 to source_x
+ VTBL.8 D22,{D8, D9, D10,D11},D26 @ Index top row at source_x+1
+ VTBL.8 D26,{D12,D13,D14,D15},D26 @ Index bottom row at source_x+1
+ @ Next 8 Y' pixels
+ VLD1.64 {D8, D9, D10,D11},[r14,:128],r5 @ Load Y' top row
+ VLD1.64 {D12,D13,D14,D15},[r14,:128] @ Load Y' bottom row
+ PLD [r14,#64]
+ VTBL.8 D21,{D8, D9, D10,D11},D19 @ Index top row at source_x
+ VTBL.8 D25,{D12,D13,D14,D15},D19 @ Index bottom row at source_x
+ VTBL.8 D23,{D8, D9, D10,D11},D27 @ Index top row at source_x+1
+ VTBL.8 D27,{D12,D13,D14,D15},D27 @ Index bottom row at source_x+1
+ @ Blend Y'.
+ VDUP.I16 Q9, r4 @ Load the y weights.
+ VSUBL.U8 Q4, D24,D20 @ Q5:Q4 = c-a
+ VSUBL.U8 Q5, D25,D21
+ VSUBL.U8 Q6, D26,D22 @ Q7:Q6 = d-b
+ VSUBL.U8 Q7, D27,D23
+ VMUL.S16 Q4, Q4, Q9 @ Q5:Q4 = (c-a)*yweight
+ VMUL.S16 Q5, Q5, Q9
+ VMUL.S16 Q6, Q6, Q9 @ Q7:Q6 = (d-b)*yweight
+ VMUL.S16 Q7, Q7, Q9
+ VMOVL.U8 Q12,D16 @ Promote the x weights to 16 bits.
+ VMOVL.U8 Q13,D17 @ Sadly, there's no VMULW.
+ VRSHRN.S16 D8, Q4, #8 @ Q4 = (c-a)*yweight+128>>8
+ VRSHRN.S16 D9, Q5, #8
+ VRSHRN.S16 D12,Q6, #8 @ Q6 = (d-b)*yweight+128>>8
+ VRSHRN.S16 D13,Q7, #8
+ VADD.I8 Q10,Q10,Q4 @ Q10 = a+((c-a)*yweight+128>>8)
+ VADD.I8 Q11,Q11,Q6 @ Q11 = b+((d-b)*yweight+128>>8)
+ VSUBL.U8 Q4, D22,D20 @ Q5:Q4 = b-a
+ VSUBL.U8 Q5, D23,D21
+ VMUL.S16 Q4, Q4, Q12 @ Q5:Q4 = (b-a)*xweight
+ VMUL.S16 Q5, Q5, Q13
+ VRSHRN.S16 D8, Q4, #8 @ Q4 = (b-a)*xweight+128>>8
+ ADD r12,r7, r9
+ VRSHRN.S16 D9, Q5, #8
+ MOV r12,r12,ASR #17
+ VADD.I8 Q8, Q10,Q4 @ Q8 = a+((b-a)*xweight+128>>8)
+ @ Start extracting the chroma x coordinates, and load Cb and Cr.
+ AND r12,r12,#~15 @ Read 16-byte aligned blocks
+ VDUP.I32 Q9, r9 @ Q9 = source_uv_xoffs_q16 x 4
+ ADD r14,r2, r12
+ VADD.I32 Q10,Q0, Q9
+ VLD1.64 {D8, D9, D10,D11},[r14,:128] @ Load Cb
+ PLD [r14,#64]
+ VADD.I32 Q11,Q1, Q9
+ ADD r14,r3, r12
+ VADD.I32 Q12,Q2, Q9
+ VLD1.64 {D12,D13,D14,D15},[r14,:128] @ Load Cr
+ PLD [r14,#64]
+ VADD.I32 Q13,Q3, Q9
+ VRSHRN.S32 D20,Q10,#9 @ Q10 = <xEwExCwCxAwAx8w8x6w6x4w4x2w2x0w0>
+ VRSHRN.S32 D21,Q11,#9
+ VDUP.I8 Q9, r12
+ VRSHRN.S32 D22,Q12,#9 @ Q11 = <xFwFxDwDxBwBx9w9x7w7x5w5x3w3x1w1>
+ VRSHRN.S32 D23,Q13,#9
+ @ We don't actually need the x weights, but we get them for free.
+ @ Free ALU slot
+ VTRN.8 Q10,Q11 @ Q10 = <wFwEwDwCwBwAw9w8w7w6w5w4w3w2w1w0>
+ @ Free ALU slot @ Q11 = <xFxExDxCxBxAx9x8x7x6x5x4x3x2x1x0>
+ VSUB.S8 Q11,Q11,Q9 @ Make offsets relative to the data we loaded.
+ VTBL.8 D18,{D8, D9, D10,D11},D22 @ Index Cb at source_x
+ VMOV.I8 D24,#74
+ VTBL.8 D19,{D8, D9, D10,D11},D23
+ VMOV.I8 D26,#102
+ VTBL.8 D20,{D12,D13,D14,D15},D22 @ Index Cr at source_x
+ VMOV.I8 D27,#25
+ VTBL.8 D21,{D12,D13,D14,D15},D23
+ @ We now have Y' in Q8, Cb in Q9, and Cr in Q10
+ @ We use VDUP to expand constants, because it's a permute instruction, so
+ @ it can dual issue on the A8.
+ SUBS r6, r6, #16 @ width -= 16
+ VMULL.U8 Q4, D16,D24 @ Q5:Q4 = Y'*74
+ VDUP.32 Q6, D30[1] @ Q7:Q6 = bias_G
+ VMULL.U8 Q5, D17,D24
+ VDUP.32 Q7, D30[1]
+ VMLSL.U8 Q6, D18,D27 @ Q7:Q6 = -25*Cb+bias_G
+ VDUP.32 Q11,D30[0] @ Q12:Q11 = bias_R
+ VMLSL.U8 Q7, D19,D27
+ VDUP.32 Q12,D30[0]
+ VMLAL.U8 Q11,D20,D26 @ Q12:Q11 = 102*Cr+bias_R
+ VDUP.32 Q8, D31[0] @ Q13:Q8 = bias_B
+ VMLAL.U8 Q12,D21,D26
+ VDUP.32 Q13,D31[0]
+ VMLAL.U8 Q8, D18,D29 @ Q13:Q8 = 129*Cb+bias_B
+ VMLAL.U8 Q13,D19,D29
+ VMLSL.U8 Q6, D20,D28 @ Q7:Q6 = -25*Cb-52*Cr+bias_G
+ VMLSL.U8 Q7, D21,D28
+ VADD.S16 Q11,Q4, Q11 @ Q12:Q11 = 74*Y'+102*Cr+bias_R
+ VADD.S16 Q12,Q5, Q12
+ VQADD.S16 Q8, Q4, Q8 @ Q13:Q8 = 74*Y'+129*Cb+bias_B
+ VQADD.S16 Q13,Q5, Q13
+ VADD.S16 Q6, Q4, Q6 @ Q7:Q6 = 74*Y'-25*Cb-52*Cr+bias_G
+ VADD.S16 Q7, Q5, Q7
+ @ Push each value to the top of its word and saturate it.
+ VQSHLU.S16 Q11,Q11,#2
+ VQSHLU.S16 Q12,Q12,#2
+ VQSHLU.S16 Q6, Q6, #2
+ VQSHLU.S16 Q7, Q7, #2
+ VQSHLU.S16 Q8, Q8, #2
+ VQSHLU.S16 Q13,Q13,#2
+ @ Merge G and B into R.
+ VSRI.U16 Q11,Q6, #5
+ VSRI.U16 Q12,Q7, #5
+ VSRI.U16 Q11,Q8, #11
+ MOV r14,r8, LSL #4
+ VSRI.U16 Q12,Q13,#11
+ BLT s42xbily_neon_tail
+ VDUP.I32 Q13,r14
+ @ Store the result.
+ VST1.16 {D22,D23,D24,D25},[r0]!
+ BEQ s42xbily_neon_done
+ @ Advance the x coordinates.
+ VADD.I32 Q0, Q0, Q13
+ VADD.I32 Q1, Q1, Q13
+ ADD r7, r14
+ VADD.I32 Q2, Q2, Q13
+ VADD.I32 Q3, Q3, Q13
+ B s42xbily_neon_loop
+s42xbily_neon_tail:
+ @ We have between 1 and 15 pixels left to write.
+ @ -r6 == the number of pixels we need to skip writing.
+ @ Adjust r0 to point to the last one we need to write, because we're going
+ @ to write them in reverse order.
+ ADD r0, r0, r6, LSL #1
+ MOV r14,#-2
+ ADD r0, r0, #30
+ @ Skip past the ones we don't need to write.
+ SUB PC, PC, r6, LSL #2
+ ORR r0, r0, r0
+ VST1.16 {D25[3]},[r0,:16],r14
+ VST1.16 {D25[2]},[r0,:16],r14
+ VST1.16 {D25[1]},[r0,:16],r14
+ VST1.16 {D25[0]},[r0,:16],r14
+ VST1.16 {D24[3]},[r0,:16],r14
+ VST1.16 {D24[2]},[r0,:16],r14
+ VST1.16 {D24[1]},[r0,:16],r14
+ VST1.16 {D24[0]},[r0,:16],r14
+ VST1.16 {D23[3]},[r0,:16],r14
+ VST1.16 {D23[2]},[r0,:16],r14
+ VST1.16 {D23[1]},[r0,:16],r14
+ VST1.16 {D23[0]},[r0,:16],r14
+ VST1.16 {D22[3]},[r0,:16],r14
+ VST1.16 {D22[2]},[r0,:16],r14
+ VST1.16 {D22[1]},[r0,:16],r14
+ VST1.16 {D22[0]},[r0,:16]
+s42xbily_neon_done:
+ VPOP {Q4-Q7} @ 16 words.
+ LDMFD r13!,{r4-r9,PC} @ 7 words.
+ .fnend
+ .size ScaleYCbCr42xToRGB565_BilinearY_Row_NEON, .-ScaleYCbCr42xToRGB565_BilinearY_Row_NEON
+
+#if defined(__ELF__)&&defined(__linux__)
+ .section .note.GNU-stack,"",%progbits
+#endif
diff --git a/gfx/ycbcr/yuv_row_c.cpp b/gfx/ycbcr/yuv_row_c.cpp
new file mode 100644
index 0000000000..b986451ec2
--- /dev/null
+++ b/gfx/ycbcr/yuv_row_c.cpp
@@ -0,0 +1,133 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "yuv_row.h"
+
+#define DCHECK(a)
+
+extern "C" {
+
+// C reference code that mimics the YUV assembly.
+#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
+#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
+ (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
+
+// Converts one Y'CbCr sample into a packed 32-bit pixel stored at rgb_buf.
+//
+// kCoefficientsRgbY is indexed as [y] for the luma terms, [256 + u] for the U
+// terms and [512 + v] for the V terms; each row holds four 16-bit terms for
+// the B, G, R and A channels in that order. Per channel the U and V terms are
+// added first and the Y term second, each time with signed 16-bit saturation
+// (paddsw). The sum is then shifted right by 6 and clamped to 0..255
+// (packuswb).
+//
+// The four bytes are combined as B | G << 8 | R << 16 | A << 24 and written
+// as one native-endian uint32_t. The value is assembled in unsigned
+// arithmetic so that shifting the alpha byte by 24 never moves a set bit into
+// the sign bit of a signed int.
+static inline void YuvPixel(uint8_t y,
+                            uint8_t u,
+                            uint8_t v,
+                            uint8_t* rgb_buf) {
+  const int16_t* const y_terms = kCoefficientsRgbY[y];
+  const int16_t* const u_terms = kCoefficientsRgbY[256 + u];
+  const int16_t* const v_terms = kCoefficientsRgbY[512 + v];
+
+  uint32_t pixel = 0;
+  for (int channel = 0; channel < 4; ++channel) {
+    // Same accumulation order as the assembly: (U + V) first, then + Y.
+    int sum = paddsw(u_terms[channel], v_terms[channel]);
+    sum = paddsw(sum, y_terms[channel]);
+    sum >>= 6;
+    pixel |= static_cast<uint32_t>(packuswb(sum)) << (8 * channel);
+  }
+
+  *reinterpret_cast<uint32_t*>(rgb_buf) = pixel;
+}
+
+// Converts `width` pixels at 1:1 scale, writing 4 bytes per pixel to rgb_buf.
+// The chroma sample for destination pixel x is taken from index
+// x >> x_shift in u_buf / v_buf, so x_shift == 0 reads one chroma sample per
+// pixel and x_shift == 1 shares one chroma sample between two pixels.
+void FastConvertYUVToRGB32Row_C(const uint8_t* y_buf,
+                                const uint8_t* u_buf,
+                                const uint8_t* v_buf,
+                                uint8_t* rgb_buf,
+                                int width,
+                                unsigned int x_shift) {
+  for (int px = 0; px < width; ++px) {
+    const int chroma_index = px >> x_shift;
+    const uint8_t luma = y_buf[px];
+    const uint8_t cb = u_buf[chroma_index];
+    const uint8_t cr = v_buf[chroma_index];
+    YuvPixel(luma, cb, cr, rgb_buf);
+    rgb_buf += 4;  // Advance 1 pixel.
+  }
+}
+
+// Point-sampled horizontal scaler.
+// The source position is tracked in 16.16 fixed point and advanced by
+// source_dx per destination pixel. A shift by 16 isolates the integer luma
+// index; a shift by 17 further subsamples the chrominance channels.
+// (& 0xffff isolates the fixed point fraction, which is only used by
+// LinearScaleYUVToRGB32Row_C.)
+// Chroma is fetched once per pair of destination pixels, at the position of
+// the even pixel, and reused for the odd pixel.
+void ScaleYUVToRGB32Row_C(const uint8_t* y_buf,
+                          const uint8_t* u_buf,
+                          const uint8_t* v_buf,
+                          uint8_t* rgb_buf,
+                          int width,
+                          int source_dx) {
+  int src_x = 0;
+  int cb = 0;
+  int cr = 0;
+  for (int px = 0; px < width; ++px) {
+    if ((px & 1) == 0) {
+      // Even destination pixel: refresh the chroma pair.
+      cb = u_buf[src_x >> 17];
+      cr = v_buf[src_x >> 17];
+    }
+    const int luma = y_buf[src_x >> 16];
+    YuvPixel(luma, cb, cr, rgb_buf);
+    src_x += source_dx;
+    rgb_buf += 4;
+  }
+}
+
+// Horizontal scaler with linear interpolation between neighbouring samples.
+// The source position is 16.16 fixed point, advanced by source_dx per
+// destination pixel; the low 16 bits are the blend fraction. Chroma uses the
+// position shifted right by one (index = x >> 17, fraction = (x >> 1) & 65535)
+// and is interpolated once per pair of destination pixels, at the even pixel.
+// When source_dx is at least 2.0 (0x20000) the start position is offset by
+// half a source pixel (32768).
+// Note that sample index + 1 is always read, for luma and for chroma.
+void LinearScaleYUVToRGB32Row_C(const uint8_t* y_buf,
+                                const uint8_t* u_buf,
+                                const uint8_t* v_buf,
+                                uint8_t* rgb_buf,
+                                int width,
+                                int source_dx) {
+  // Weighted average of two neighbouring samples with a 16-bit fraction.
+  auto blend = [](int frac, int near_sample, int far_sample) {
+    return (frac * far_sample + (frac ^ 65535) * near_sample) >> 16;
+  };
+
+  int src_x = (source_dx >= 0x20000) ? 32768 : 0;
+  int cb = 0;
+  int cr = 0;
+  for (int px = 0; px < width; ++px) {
+    if ((px & 1) == 0) {
+      // Even destination pixel: interpolate a fresh chroma pair.
+      const int uv_index = src_x >> 17;
+      const int uv_frac = (src_x >> 1) & 65535;
+      cb = blend(uv_frac, u_buf[uv_index], u_buf[uv_index + 1]);
+      cr = blend(uv_frac, v_buf[uv_index], v_buf[uv_index + 1]);
+    }
+    const int y_index = src_x >> 16;
+    const int luma = blend(src_x & 65535, y_buf[y_index], y_buf[y_index + 1]);
+    YuvPixel(luma, cb, cr, rgb_buf);
+    src_x += source_dx;
+    rgb_buf += 4;
+  }
+}
+
+} // extern "C"
+
diff --git a/gfx/ycbcr/yuv_row_other.cpp b/gfx/ycbcr/yuv_row_other.cpp
new file mode 100644
index 0000000000..437f90476d
--- /dev/null
+++ b/gfx/ycbcr/yuv_row_other.cpp
@@ -0,0 +1,34 @@
+// Copyright (c) 2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "yuv_row.h"
+
+extern "C" {
+void FastConvertYUVToRGB32Row(const uint8_t* y_buf,  // Unscaled row conversion; this file only forwards to the C routine.
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* rgb_buf,
+ int width) {
+ FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);  // NOTE(review): trailing 1 is presumably the C routine's x_shift argument - confirm against its declaration
+}
+
+void ScaleYUVToRGB32Row(const uint8_t* y_buf,  // Point-sampled scaled row conversion; forwards unchanged to the C routine.
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* rgb_buf,
+ int width,
+ int source_dx) {  // source step per output pixel, passed through as-is
+ ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
+}
+
+void LinearScaleYUVToRGB32Row(const uint8_t* y_buf,  // Linearly interpolated scaled row conversion; forwards unchanged to the C routine.
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* rgb_buf,
+ int width,
+ int source_dx) {  // source step per output pixel, passed through as-is
+ LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
+}
+
+}
diff --git a/gfx/ycbcr/yuv_row_posix.cpp b/gfx/ycbcr/yuv_row_posix.cpp
new file mode 100644
index 0000000000..c5e55abe4c
--- /dev/null
+++ b/gfx/ycbcr/yuv_row_posix.cpp
@@ -0,0 +1,914 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "yuv_row.h"
+#include "mozilla/SSE.h"
+
+#define DCHECK(a)
+
+extern "C" {
+
+#if defined(ARCH_CPU_X86_64)
+
+// We don't need CPUID guards here, since x86-64 implies SSE2.
+
+// AMD64 ABI uses register parameters.
+void FastConvertYUVToRGB32Row(const uint8_t* y_buf, // rdi; unscaled row conversion, two pixels per loop pass
+ const uint8_t* u_buf, // rsi
+ const uint8_t* v_buf, // rdx
+ uint8_t* rgb_buf, // rcx
+ int width) { // r8
+ asm volatile(
+ "jmp 1f\n"  // enter at the loop test so a width below 2 skips the pair loop
+"0:"
+ "movzb (%[u_buf]),%%r10\n"  // r10 = U sample
+ "add $0x1,%[u_buf]\n"
+ "movzb (%[v_buf]),%%r11\n"  // r11 = V sample
+ "add $0x1,%[v_buf]\n"
+ "movq 2048(%[kCoefficientsRgbY],%%r10,8),%%xmm0\n"  // U row: 8-byte rows, U rows start at byte 2048
+ "movzb (%[y_buf]),%%r10\n"  // r10 = first Y sample
+ "movq 4096(%[kCoefficientsRgbY],%%r11,8),%%xmm1\n"  // V row: V rows start at byte 4096
+ "movzb 0x1(%[y_buf]),%%r11\n"  // r11 = second Y sample
+ "paddsw %%xmm1,%%xmm0\n"  // xmm0 = U + V contribution, shared by both pixels
+ "movq (%[kCoefficientsRgbY],%%r10,8),%%xmm2\n"
+ "add $0x2,%[y_buf]\n"
+ "movq (%[kCoefficientsRgbY],%%r11,8),%%xmm3\n"
+ "paddsw %%xmm0,%%xmm2\n"
+ "paddsw %%xmm0,%%xmm3\n"
+ "shufps $0x44,%%xmm3,%%xmm2\n"  // xmm2 = { pixel 0 (low 64 bits), pixel 1 (high 64 bits) }
+ "psraw $0x6,%%xmm2\n"  // drop the 6 fraction bits (table entries are scaled by 64)
+ "packuswb %%xmm2,%%xmm2\n"  // saturate each 16-bit lane to an unsigned byte
+ "movq %%xmm2,0x0(%[rgb_buf])\n"  // store two 4-byte pixels
+ "add $0x8,%[rgb_buf]\n"
+"1:"
+ "sub $0x2,%[width]\n"
+ "jns 0b\n"  // loop while at least two pixels remained
+
+"2:"
+ "add $0x1,%[width]\n"  // width is now -1 (nothing left) or 0 (one pixel left)
+ "js 3f\n"
+
+ "movzb (%[u_buf]),%%r10\n"  // odd trailing pixel: same arithmetic on a single Y sample
+ "movq 2048(%[kCoefficientsRgbY],%%r10,8),%%xmm0\n"
+ "movzb (%[v_buf]),%%r10\n"
+ "movq 4096(%[kCoefficientsRgbY],%%r10,8),%%xmm1\n"
+ "paddsw %%xmm1,%%xmm0\n"
+ "movzb (%[y_buf]),%%r10\n"
+ "movq (%[kCoefficientsRgbY],%%r10,8),%%xmm1\n"
+ "paddsw %%xmm0,%%xmm1\n"
+ "psraw $0x6,%%xmm1\n"
+ "packuswb %%xmm1,%%xmm1\n"
+ "movd %%xmm1,0x0(%[rgb_buf])\n"  // store one 4-byte pixel
+"3:"
+ : [y_buf] "+r"(y_buf),  // the pointers and width are modified by the asm, hence read-write operands
+ [u_buf] "+r"(u_buf),
+ [v_buf] "+r"(v_buf),
+ [rgb_buf] "+r"(rgb_buf),
+ [width] "+r"(width)
+ : [kCoefficientsRgbY] "r" (kCoefficientsRgbY)
+ : "cc", "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
+);
+}
+
+void ScaleYUVToRGB32Row(const uint8_t* y_buf, // rdi; point-sampled scaling, two output pixels per loop pass
+ const uint8_t* u_buf, // rsi
+ const uint8_t* v_buf, // rdx
+ uint8_t* rgb_buf, // rcx
+ int width, // r8
+ int source_dx) { // r9; added to the 16.16 source position after each output pixel
+ asm volatile(
+ "xor %%r11,%%r11\n"  // r11 = x, the 16.16 source position, starts at 0
+ "sub $0x2,%[width]\n"
+ "js 1f\n"  // fewer than two pixels: go straight to the tail
+
+"0:"
+ "mov %%r11,%%r10\n"
+ "sar $0x11,%%r10\n"  // r10 = x >> 17, the chroma index
+ "movzb (%[u_buf],%%r10,1),%%rax\n"
+ "movq 2048(%[kCoefficientsRgbY],%%rax,8),%%xmm0\n"  // U row (U rows start at byte 2048)
+ "movzb (%[v_buf],%%r10,1),%%rax\n"
+ "movq 4096(%[kCoefficientsRgbY],%%rax,8),%%xmm1\n"  // V row (V rows start at byte 4096)
+ "lea (%%r11,%[source_dx]),%%r10\n"  // r10 = x + source_dx, position of the second pixel
+ "sar $0x10,%%r11\n"  // r11 = x >> 16, luma index of the first pixel
+ "movzb (%[y_buf],%%r11,1),%%rax\n"
+ "paddsw %%xmm1,%%xmm0\n"  // xmm0 = U + V contribution, shared by both pixels
+ "movq (%[kCoefficientsRgbY],%%rax,8),%%xmm1\n"
+ "lea (%%r10,%[source_dx]),%%r11\n"  // r11 = position for the next loop pass
+ "sar $0x10,%%r10\n"  // luma index of the second pixel
+ "movzb (%[y_buf],%%r10,1),%%rax\n"
+ "movq (%[kCoefficientsRgbY],%%rax,8),%%xmm2\n"
+ "paddsw %%xmm0,%%xmm1\n"
+ "paddsw %%xmm0,%%xmm2\n"
+ "shufps $0x44,%%xmm2,%%xmm1\n"  // pack both pixels' 64-bit halves into xmm1
+ "psraw $0x6,%%xmm1\n"  // drop the 6 fraction bits
+ "packuswb %%xmm1,%%xmm1\n"  // saturate to unsigned bytes
+ "movq %%xmm1,0x0(%[rgb_buf])\n"  // store two 4-byte pixels
+ "add $0x8,%[rgb_buf]\n"
+ "sub $0x2,%[width]\n"
+ "jns 0b\n"
+
+"1:"
+ "add $0x1,%[width]\n"  // width is now -1 (done) or 0 (one pixel left)
+ "js 2f\n"
+
+ "mov %%r11,%%r10\n"  // odd trailing pixel at the current position
+ "sar $0x11,%%r10\n"
+ "movzb (%[u_buf],%%r10,1),%%rax\n"
+ "movq 2048(%[kCoefficientsRgbY],%%rax,8),%%xmm0\n"
+ "movzb (%[v_buf],%%r10,1),%%rax\n"
+ "movq 4096(%[kCoefficientsRgbY],%%rax,8),%%xmm1\n"
+ "paddsw %%xmm1,%%xmm0\n"
+ "sar $0x10,%%r11\n"
+ "movzb (%[y_buf],%%r11,1),%%rax\n"
+ "movq (%[kCoefficientsRgbY],%%rax,8),%%xmm1\n"
+ "paddsw %%xmm0,%%xmm1\n"
+ "psraw $0x6,%%xmm1\n"
+ "packuswb %%xmm1,%%xmm1\n"
+ "movd %%xmm1,0x0(%[rgb_buf])\n"  // store one 4-byte pixel
+
+"2:"
+ : [rgb_buf] "+r"(rgb_buf),  // only rgb_buf and width are modified; sources are indexed, not advanced
+ [width] "+r"(width)
+ : [y_buf] "r"(y_buf),
+ [u_buf] "r"(u_buf),
+ [v_buf] "r"(v_buf),
+ [kCoefficientsRgbY] "r" (kCoefficientsRgbY),
+ [source_dx] "r"(static_cast<long>(source_dx))  // widened so it can be used in 64-bit lea address arithmetic
+ : "cc", "memory", "r10", "r11", "rax", "xmm0", "xmm1", "xmm2"
+);
+}
+
+void LinearScaleYUVToRGB32Row(const uint8_t* y_buf,  // Linearly interpolated scaling, two output pixels per loop pass.
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* rgb_buf,
+ int width,
+ int source_dx) {  // added to the 16.16 source position after each output pixel
+ asm volatile(
+ "xor %%r11,%%r11\n" // x = 0
+ "sub $0x2,%[width]\n"
+ "js 2f\n"  // fewer than two pixels: go straight to the tail
+ "cmp $0x20000,%[source_dx]\n" // if source_dx >= 2.0
+ "jl 0f\n"
+ "mov $0x8000,%%r11\n" // x = 0.5 for 1/2 or less
+"0:"
+
+"1:"
+ "mov %%r11,%%r10\n"
+ "sar $0x11,%%r10\n"  // r10 = x >> 17, the chroma index
+
+ "movzb (%[u_buf], %%r10, 1), %%r13 \n"  // r13 = left U sample
+ "movzb 1(%[u_buf], %%r10, 1), %%r14 \n"  // r14 = right U sample
+ "mov %%r11, %%rax \n"
+ "and $0x1fffe, %%rax \n"  // chroma fraction taken from the low 17 bits of x
+ "imul %%rax, %%r14 \n"  // right sample * fraction
+ "xor $0x1fffe, %%rax \n"  // complementary weight
+ "imul %%rax, %%r13 \n"  // left sample * complement
+ "add %%r14, %%r13 \n"
+ "shr $17, %%r13 \n"  // r13 = interpolated U, used as the table index
+ "movq 2048(%[kCoefficientsRgbY],%%r13,8), %%xmm0\n"
+
+ "movzb (%[v_buf], %%r10, 1), %%r13 \n"  // same interpolation for V
+ "movzb 1(%[v_buf], %%r10, 1), %%r14 \n"
+ "mov %%r11, %%rax \n"
+ "and $0x1fffe, %%rax \n"
+ "imul %%rax, %%r14 \n"
+ "xor $0x1fffe, %%rax \n"
+ "imul %%rax, %%r13 \n"
+ "add %%r14, %%r13 \n"
+ "shr $17, %%r13 \n"
+ "movq 4096(%[kCoefficientsRgbY],%%r13,8), %%xmm1\n"
+
+ "mov %%r11, %%rax \n"
+ "lea (%%r11,%[source_dx]),%%r10\n"  // r10 = position of the second pixel
+ "sar $0x10,%%r11\n"  // luma index of the first pixel
+ "paddsw %%xmm1,%%xmm0\n"  // xmm0 = U + V contribution, shared by both pixels
+
+ "movzb (%[y_buf], %%r11, 1), %%r13 \n"  // left / right Y samples for the first pixel
+ "movzb 1(%[y_buf], %%r11, 1), %%r14 \n"
+ "and $0xffff, %%rax \n"  // luma fraction: low 16 bits of x
+ "imul %%rax, %%r14 \n"
+ "xor $0xffff, %%rax \n"
+ "imul %%rax, %%r13 \n"
+ "add %%r14, %%r13 \n"
+ "shr $16, %%r13 \n"
+ "movq (%[kCoefficientsRgbY],%%r13,8),%%xmm1\n"
+
+ "mov %%r10, %%rax \n"
+ "lea (%%r10,%[source_dx]),%%r11\n"  // r11 = position for the next loop pass
+ "sar $0x10,%%r10\n"
+
+ "movzb (%[y_buf],%%r10,1), %%r13 \n"  // left / right Y samples for the second pixel
+ "movzb 1(%[y_buf],%%r10,1), %%r14 \n"
+ "and $0xffff, %%rax \n"
+ "imul %%rax, %%r14 \n"
+ "xor $0xffff, %%rax \n"
+ "imul %%rax, %%r13 \n"
+ "add %%r14, %%r13 \n"
+ "shr $16, %%r13 \n"
+ "movq (%[kCoefficientsRgbY],%%r13,8),%%xmm2\n"
+
+ "paddsw %%xmm0,%%xmm1\n"
+ "paddsw %%xmm0,%%xmm2\n"
+ "shufps $0x44,%%xmm2,%%xmm1\n"  // pack both pixels' 64-bit halves into xmm1
+ "psraw $0x6,%%xmm1\n"  // drop the 6 fraction bits
+ "packuswb %%xmm1,%%xmm1\n"  // saturate to unsigned bytes
+ "movq %%xmm1,0x0(%[rgb_buf])\n"  // store two 4-byte pixels
+ "add $0x8,%[rgb_buf]\n"
+ "sub $0x2,%[width]\n"
+ "jns 1b\n"
+
+"2:"
+ "add $0x1,%[width]\n"  // width is now -1 (done) or 0 (one pixel left)
+ "js 3f\n"
+
+ "mov %%r11,%%r10\n"  // odd trailing pixel: nearest samples only, no interpolation in this path
+ "sar $0x11,%%r10\n"
+
+ "movzb (%[u_buf],%%r10,1), %%r13 \n"
+ "movq 2048(%[kCoefficientsRgbY],%%r13,8),%%xmm0\n"
+
+ "movzb (%[v_buf],%%r10,1), %%r13 \n"
+ "movq 4096(%[kCoefficientsRgbY],%%r13,8),%%xmm1\n"
+
+ "paddsw %%xmm1,%%xmm0\n"
+ "sar $0x10,%%r11\n"
+
+ "movzb (%[y_buf],%%r11,1), %%r13 \n"
+ "movq (%[kCoefficientsRgbY],%%r13,8),%%xmm1\n"
+
+ "paddsw %%xmm0,%%xmm1\n"
+ "psraw $0x6,%%xmm1\n"
+ "packuswb %%xmm1,%%xmm1\n"
+ "movd %%xmm1,0x0(%[rgb_buf])\n"  // store one 4-byte pixel
+
+"3:"
+ : [rgb_buf] "+r"(rgb_buf),  // only rgb_buf and width are modified by the asm
+ [width] "+r"(width)
+ : [y_buf] "r"(y_buf),
+ [u_buf] "r"(u_buf),
+ [v_buf] "r"(v_buf),
+ [kCoefficientsRgbY] "r" (kCoefficientsRgbY),
+ [source_dx] "r"(static_cast<long>(source_dx))  // widened so it can be used in 64-bit cmp/lea
+ : "cc", "memory", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2"
+);
+}
+
+#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && !defined(__PIC__)
+
+// The PIC version is slower because fewer registers are available, so
+// the non-PIC version is used on platforms where it is possible.
+void FastConvertYUVToRGB32Row_SSE(const uint8_t* y_buf,  // Declaration only; the body is the file-scope asm that follows and reads its arguments from the stack.
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* rgb_buf,
+ int width);
+ asm(
+ ".text\n"
+ ".global FastConvertYUVToRGB32Row_SSE\n"
+ ".type FastConvertYUVToRGB32Row_SSE, @function\n"
+"FastConvertYUVToRGB32Row_SSE:\n"
+ "pusha\n"  // saves 8 registers (32 bytes), so the first argument sits at 0x24(%esp)
+ "mov 0x24(%esp),%edx\n"  // edx = y_buf
+ "mov 0x28(%esp),%edi\n"  // edi = u_buf
+ "mov 0x2c(%esp),%esi\n"  // esi = v_buf
+ "mov 0x30(%esp),%ebp\n"  // ebp = rgb_buf
+ "mov 0x34(%esp),%ecx\n"  // ecx = width
+ "jmp 1f\n"  // enter at the loop test
+
+"0:"
+ "movzbl (%edi),%eax\n"  // eax = U sample
+ "add $0x1,%edi\n"
+ "movzbl (%esi),%ebx\n"  // ebx = V sample
+ "add $0x1,%esi\n"
+ "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"  // U row (absolute address, hence the non-PIC requirement)
+ "movzbl (%edx),%eax\n"  // first Y sample
+ "paddsw kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"  // mm0 = U + V contribution
+ "movzbl 0x1(%edx),%ebx\n"  // second Y sample
+ "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
+ "add $0x2,%edx\n"
+ "movq kCoefficientsRgbY(,%ebx,8),%mm2\n"
+ "paddsw %mm0,%mm1\n"
+ "paddsw %mm0,%mm2\n"
+ "psraw $0x6,%mm1\n"  // drop the 6 fraction bits
+ "psraw $0x6,%mm2\n"
+ "packuswb %mm2,%mm1\n"  // saturate both pixels to bytes: mm1 = { pixel 0, pixel 1 }
+ "movntq %mm1,0x0(%ebp)\n"  // non-temporal store of two 4-byte pixels
+ "add $0x8,%ebp\n"
+"1:"
+ "sub $0x2,%ecx\n"
+ "jns 0b\n"
+
+ "and $0x1,%ecx\n"  // ecx is -2 or -1 here; bit 0 set means one pixel is left
+ "je 2f\n"
+
+ "movzbl (%edi),%eax\n"  // odd trailing pixel
+ "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
+ "movzbl (%esi),%eax\n"
+ "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
+ "movzbl (%edx),%eax\n"
+ "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
+ "paddsw %mm0,%mm1\n"
+ "psraw $0x6,%mm1\n"
+ "packuswb %mm1,%mm1\n"
+ "movd %mm1,0x0(%ebp)\n"  // store one 4-byte pixel
+"2:"
+ "popa\n"  // NOTE(review): MMX registers are used and no emms is issued here - presumably the caller does it; confirm
+ "ret\n"
+#if !defined(XP_MACOSX)
+ ".previous\n"
+#endif
+);
+
+// Unscaled row conversion entry point: uses the SSE assembly routine when
+// the CPU reports SSE support at run time, otherwise the C routine.
+void FastConvertYUVToRGB32Row(const uint8_t* y_buf,
+                              const uint8_t* u_buf,
+                              const uint8_t* v_buf,
+                              uint8_t* rgb_buf,
+                              int width)
+{
+  const bool have_sse = mozilla::supports_sse();
+  if (!have_sse) {
+    FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
+  } else {
+    FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width);
+  }
+}
+
+
+void ScaleYUVToRGB32Row_SSE(const uint8_t* y_buf,  // Declaration only; the body is the file-scope asm that follows and reads its arguments from the stack.
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* rgb_buf,
+ int width,
+ int source_dx);
+ asm(
+ ".text\n"
+ ".global ScaleYUVToRGB32Row_SSE\n"
+ ".type ScaleYUVToRGB32Row_SSE, @function\n"
+"ScaleYUVToRGB32Row_SSE:\n"
+ "pusha\n"  // saves 8 registers (32 bytes), so the first argument sits at 0x24(%esp)
+ "mov 0x24(%esp),%edx\n"  // edx = y_buf
+ "mov 0x28(%esp),%edi\n"  // edi = u_buf
+ "mov 0x2c(%esp),%esi\n"  // esi = v_buf
+ "mov 0x30(%esp),%ebp\n"  // ebp = rgb_buf
+ "mov 0x34(%esp),%ecx\n"  // ecx = width
+ "xor %ebx,%ebx\n"  // ebx = x, the 16.16 source position
+ "jmp 1f\n"
+
+"0:"
+ "mov %ebx,%eax\n"
+ "sar $0x11,%eax\n"  // x >> 17: chroma index
+ "movzbl (%edi,%eax,1),%eax\n"
+ "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"  // U row
+ "mov %ebx,%eax\n"
+ "sar $0x11,%eax\n"
+ "movzbl (%esi,%eax,1),%eax\n"
+ "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"  // mm0 = U + V contribution
+ "mov %ebx,%eax\n"
+ "add 0x38(%esp),%ebx\n"  // x += source_dx
+ "sar $0x10,%eax\n"  // x >> 16: luma index of the first pixel
+ "movzbl (%edx,%eax,1),%eax\n"
+ "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
+ "mov %ebx,%eax\n"
+ "add 0x38(%esp),%ebx\n"  // x += source_dx
+ "sar $0x10,%eax\n"  // luma index of the second pixel
+ "movzbl (%edx,%eax,1),%eax\n"
+ "movq kCoefficientsRgbY(,%eax,8),%mm2\n"
+ "paddsw %mm0,%mm1\n"
+ "paddsw %mm0,%mm2\n"
+ "psraw $0x6,%mm1\n"  // drop the 6 fraction bits
+ "psraw $0x6,%mm2\n"
+ "packuswb %mm2,%mm1\n"  // saturate both pixels to bytes
+ "movntq %mm1,0x0(%ebp)\n"  // non-temporal store of two 4-byte pixels
+ "add $0x8,%ebp\n"
+"1:"
+ "sub $0x2,%ecx\n"
+ "jns 0b\n"
+
+ "and $0x1,%ecx\n"  // ecx is -2 or -1 here; bit 0 set means one pixel is left
+ "je 2f\n"
+
+ "mov %ebx,%eax\n"  // odd trailing pixel at the current position
+ "sar $0x11,%eax\n"
+ "movzbl (%edi,%eax,1),%eax\n"
+ "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
+ "mov %ebx,%eax\n"
+ "sar $0x11,%eax\n"
+ "movzbl (%esi,%eax,1),%eax\n"
+ "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
+ "mov %ebx,%eax\n"
+ "sar $0x10,%eax\n"
+ "movzbl (%edx,%eax,1),%eax\n"
+ "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
+ "paddsw %mm0,%mm1\n"
+ "psraw $0x6,%mm1\n"
+ "packuswb %mm1,%mm1\n"
+ "movd %mm1,0x0(%ebp)\n"  // store one 4-byte pixel
+
+"2:"
+ "popa\n"  // NOTE(review): MMX registers are used and no emms is issued here - presumably the caller does it; confirm
+ "ret\n"
+#if !defined(XP_MACOSX)
+ ".previous\n"
+#endif
+);
+
+// Point-sampled scaling entry point: uses the SSE assembly routine when
+// the CPU reports SSE support at run time, otherwise the C routine.
+void ScaleYUVToRGB32Row(const uint8_t* y_buf,
+                        const uint8_t* u_buf,
+                        const uint8_t* v_buf,
+                        uint8_t* rgb_buf,
+                        int width,
+                        int source_dx)
+{
+  const bool have_sse = mozilla::supports_sse();
+  if (!have_sse) {
+    ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
+  } else {
+    ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
+  }
+}
+
+void LinearScaleYUVToRGB32Row_SSE(const uint8_t* y_buf,  // Declaration only; the body is the file-scope asm that follows and reads its arguments from the stack.
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* rgb_buf,
+ int width,
+ int source_dx);
+ asm(
+ ".text\n"
+ ".global LinearScaleYUVToRGB32Row_SSE\n"
+ ".type LinearScaleYUVToRGB32Row_SSE, @function\n"
+"LinearScaleYUVToRGB32Row_SSE:\n"
+ "pusha\n"  // saves 8 registers (32 bytes), so the first argument sits at 0x24(%esp)
+ "mov 0x24(%esp),%edx\n"  // edx = y_buf
+ "mov 0x28(%esp),%edi\n"  // edi = u_buf
+ "mov 0x30(%esp),%ebp\n"  // ebp = rgb_buf (v_buf is reloaded from 0x2c(%esp) inside the loop)
+
+ // source_width = width * source_dx, written back over the width stack slot
+ "mov 0x34(%esp), %ecx\n"
+ "imull 0x38(%esp), %ecx\n"
+ "mov %ecx, 0x34(%esp)\n"  // 0x34(%esp) now holds the end position the loop compares x against
+
+ "mov 0x38(%esp), %ecx\n"
+ "xor %ebx,%ebx\n" // x = 0
+ "cmp $0x20000,%ecx\n" // if source_dx >= 2.0
+ "jl 1f\n"
+ "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less
+ "jmp 1f\n"
+
+"0:"
+ "mov %ebx,%eax\n"
+ "sar $0x11,%eax\n"  // x >> 17: chroma index
+
+ "movzbl (%edi,%eax,1),%ecx\n"  // ecx = left U sample
+ "movzbl 1(%edi,%eax,1),%esi\n"  // esi = right U sample
+ "mov %ebx,%eax\n"
+ "andl $0x1fffe, %eax \n"  // chroma fraction taken from the low 17 bits of x
+ "imul %eax, %esi \n"
+ "xorl $0x1fffe, %eax \n"  // complementary weight
+ "imul %eax, %ecx \n"
+ "addl %esi, %ecx \n"
+ "shrl $17, %ecx \n"  // ecx = interpolated U, used as the table index
+ "movq kCoefficientsRgbY+2048(,%ecx,8),%mm0\n"
+
+ "mov 0x2c(%esp),%esi\n"  // esi = v_buf (esi doubles as scratch, so reload every pass)
+ "mov %ebx,%eax\n"
+ "sar $0x11,%eax\n"
+
+ "movzbl (%esi,%eax,1),%ecx\n"  // left V sample
+ "movzbl 1(%esi,%eax,1),%esi\n"  // right V sample (overwrites the pointer)
+ "mov %ebx,%eax\n"
+ "andl $0x1fffe, %eax \n"
+ "imul %eax, %esi \n"
+ "xorl $0x1fffe, %eax \n"
+ "imul %eax, %ecx \n"
+ "addl %esi, %ecx \n"
+ "shrl $17, %ecx \n"
+ "paddsw kCoefficientsRgbY+4096(,%ecx,8),%mm0\n"  // mm0 = U + V contribution
+
+ "mov %ebx,%eax\n"
+ "sar $0x10,%eax\n"  // x >> 16: luma index of the first pixel
+ "movzbl (%edx,%eax,1),%ecx\n"
+ "movzbl 1(%edx,%eax,1),%esi\n"
+ "mov %ebx,%eax\n"
+ "add 0x38(%esp),%ebx\n"  // x += source_dx
+ "andl $0xffff, %eax \n"  // luma fraction: low 16 bits of x
+ "imul %eax, %esi \n"
+ "xorl $0xffff, %eax \n"
+ "imul %eax, %ecx \n"
+ "addl %esi, %ecx \n"
+ "shrl $16, %ecx \n"
+ "movq kCoefficientsRgbY(,%ecx,8),%mm1\n"
+
+ "cmp 0x34(%esp), %ebx\n"  // reached the end position after the first pixel of the pair?
+ "jge 2f\n"  // yes: emit that pixel alone
+
+ "mov %ebx,%eax\n"
+ "sar $0x10,%eax\n"  // luma index of the second pixel
+ "movzbl (%edx,%eax,1),%ecx\n"
+ "movzbl 1(%edx,%eax,1),%esi\n"
+ "mov %ebx,%eax\n"
+ "add 0x38(%esp),%ebx\n"  // x += source_dx
+ "andl $0xffff, %eax \n"
+ "imul %eax, %esi \n"
+ "xorl $0xffff, %eax \n"
+ "imul %eax, %ecx \n"
+ "addl %esi, %ecx \n"
+ "shrl $16, %ecx \n"
+ "movq kCoefficientsRgbY(,%ecx,8),%mm2\n"
+
+ "paddsw %mm0,%mm1\n"
+ "paddsw %mm0,%mm2\n"
+ "psraw $0x6,%mm1\n"  // drop the 6 fraction bits
+ "psraw $0x6,%mm2\n"
+ "packuswb %mm2,%mm1\n"  // saturate both pixels to bytes
+ "movntq %mm1,0x0(%ebp)\n"  // non-temporal store of two 4-byte pixels
+ "add $0x8,%ebp\n"
+
+"1:"
+ "cmp 0x34(%esp), %ebx\n"
+ "jl 0b\n"  // loop while x is below the end position
+ "popa\n"
+ "ret\n"
+
+"2:"
+ "paddsw %mm0, %mm1\n"  // finish the single remaining pixel
+ "psraw $6, %mm1\n"
+ "packuswb %mm1, %mm1\n"
+ "movd %mm1, (%ebp)\n"  // store one 4-byte pixel
+ "popa\n"  // NOTE(review): MMX registers are used and no emms is issued here - presumably the caller does it; confirm
+ "ret\n"
+#if !defined(XP_MACOSX)
+ ".previous\n"
+#endif
+);
+
+// Linearly interpolated scaling entry point: uses the SSE assembly routine
+// when the CPU reports SSE support at run time, otherwise the C routine.
+void LinearScaleYUVToRGB32Row(const uint8_t* y_buf,
+                              const uint8_t* u_buf,
+                              const uint8_t* v_buf,
+                              uint8_t* rgb_buf,
+                              int width,
+                              int source_dx)
+{
+  const bool have_sse = mozilla::supports_sse();
+  if (!have_sse) {
+    LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
+  } else {
+    LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
+  }
+}
+
+#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && defined(__PIC__)
+
+void PICConvertYUVToRGB32Row_SSE(const uint8_t* y_buf,  // Declaration only; the body is the file-scope asm that follows and reads its arguments from the stack.
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* rgb_buf,
+ int width,
+ const int16_t *kCoefficientsRgbY);  // table base is passed in, so the asm needs no absolute address
+
+ asm(
+ ".text\n"
+#if defined(XP_MACOSX)
+"_PICConvertYUVToRGB32Row_SSE:\n"
+#else
+"PICConvertYUVToRGB32Row_SSE:\n"
+#endif
+ "pusha\n"  // saves 8 registers (32 bytes), so the first argument sits at 0x24(%esp)
+ "mov 0x24(%esp),%edx\n"  // edx = y_buf
+ "mov 0x28(%esp),%edi\n"  // edi = u_buf
+ "mov 0x2c(%esp),%esi\n"  // esi = v_buf
+ "mov 0x30(%esp),%ebp\n"  // ebp = rgb_buf
+ "mov 0x38(%esp),%ecx\n"  // ecx = table base; width stays in its stack slot at 0x34(%esp)
+
+ "jmp 1f\n"  // enter at the loop test
+
+"0:"
+ "movzbl (%edi),%eax\n"  // eax = U sample
+ "add $0x1,%edi\n"
+ "movzbl (%esi),%ebx\n"  // ebx = V sample
+ "add $0x1,%esi\n"
+ "movq 2048(%ecx,%eax,8),%mm0\n"  // U row (U rows start at byte 2048)
+ "movzbl (%edx),%eax\n"  // first Y sample
+ "paddsw 4096(%ecx,%ebx,8),%mm0\n"  // mm0 = U + V contribution
+ "movzbl 0x1(%edx),%ebx\n"  // second Y sample
+ "movq 0(%ecx,%eax,8),%mm1\n"
+ "add $0x2,%edx\n"
+ "movq 0(%ecx,%ebx,8),%mm2\n"
+ "paddsw %mm0,%mm1\n"
+ "paddsw %mm0,%mm2\n"
+ "psraw $0x6,%mm1\n"  // drop the 6 fraction bits
+ "psraw $0x6,%mm2\n"
+ "packuswb %mm2,%mm1\n"  // saturate both pixels to bytes
+ "movntq %mm1,0x0(%ebp)\n"  // non-temporal store of two 4-byte pixels
+ "add $0x8,%ebp\n"
+"1:"
+ "subl $0x2,0x34(%esp)\n"  // the width counter is decremented in memory
+ "jns 0b\n"
+
+ "andl $0x1,0x34(%esp)\n"  // counter is -2 or -1 here; bit 0 set means one pixel is left
+ "je 2f\n"
+
+ "movzbl (%edi),%eax\n"  // odd trailing pixel
+ "movq 2048(%ecx,%eax,8),%mm0\n"
+ "movzbl (%esi),%eax\n"
+ "paddsw 4096(%ecx,%eax,8),%mm0\n"
+ "movzbl (%edx),%eax\n"
+ "movq 0(%ecx,%eax,8),%mm1\n"
+ "paddsw %mm0,%mm1\n"
+ "psraw $0x6,%mm1\n"
+ "packuswb %mm1,%mm1\n"
+ "movd %mm1,0x0(%ebp)\n"  // store one 4-byte pixel
+"2:"
+ "popa\n"  // NOTE(review): MMX registers are used and no emms is issued here - presumably the caller does it; confirm
+ "ret\n"
+#if !defined(XP_MACOSX)
+ ".previous\n"
+#endif
+);
+
+// Unscaled row conversion entry point for PIC builds: the SSE routine gets
+// the coefficient table address as an extra argument; without SSE support
+// the C routine is used.
+void FastConvertYUVToRGB32Row(const uint8_t* y_buf,
+                              const uint8_t* u_buf,
+                              const uint8_t* v_buf,
+                              uint8_t* rgb_buf,
+                              int width)
+{
+  const bool have_sse = mozilla::supports_sse();
+  if (!have_sse) {
+    FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
+  } else {
+    PICConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
+                                &kCoefficientsRgbY[0][0]);
+  }
+}
+
+void PICScaleYUVToRGB32Row_SSE(const uint8_t* y_buf,  // Declaration only; the body is the file-scope asm that follows and reads its arguments from the stack.
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* rgb_buf,
+ int width,
+ int source_dx,
+ const int16_t *kCoefficientsRgbY);  // table base is passed in, so the asm needs no absolute address
+
+ asm(
+ ".text\n"
+#if defined(XP_MACOSX)
+"_PICScaleYUVToRGB32Row_SSE:\n"
+#else
+"PICScaleYUVToRGB32Row_SSE:\n"
+#endif
+ "pusha\n"  // saves 8 registers (32 bytes), so the first argument sits at 0x24(%esp)
+ "mov 0x24(%esp),%edx\n"  // edx = y_buf
+ "mov 0x28(%esp),%edi\n"  // edi = u_buf
+ "mov 0x2c(%esp),%esi\n"  // esi = v_buf
+ "mov 0x30(%esp),%ebp\n"  // ebp = rgb_buf
+ "mov 0x3c(%esp),%ecx\n"  // ecx = table base; width stays in its stack slot at 0x34(%esp)
+ "xor %ebx,%ebx\n"  // ebx = x, the 16.16 source position
+ "jmp 1f\n"
+
+"0:"
+ "mov %ebx,%eax\n"
+ "sar $0x11,%eax\n"  // x >> 17: chroma index
+ "movzbl (%edi,%eax,1),%eax\n"
+ "movq 2048(%ecx,%eax,8),%mm0\n"  // U row
+ "mov %ebx,%eax\n"
+ "sar $0x11,%eax\n"
+ "movzbl (%esi,%eax,1),%eax\n"
+ "paddsw 4096(%ecx,%eax,8),%mm0\n"  // mm0 = U + V contribution
+ "mov %ebx,%eax\n"
+ "add 0x38(%esp),%ebx\n"  // x += source_dx
+ "sar $0x10,%eax\n"  // x >> 16: luma index of the first pixel
+ "movzbl (%edx,%eax,1),%eax\n"
+ "movq 0(%ecx,%eax,8),%mm1\n"
+ "mov %ebx,%eax\n"
+ "add 0x38(%esp),%ebx\n"  // x += source_dx
+ "sar $0x10,%eax\n"  // luma index of the second pixel
+ "movzbl (%edx,%eax,1),%eax\n"
+ "movq 0(%ecx,%eax,8),%mm2\n"
+ "paddsw %mm0,%mm1\n"
+ "paddsw %mm0,%mm2\n"
+ "psraw $0x6,%mm1\n"  // drop the 6 fraction bits
+ "psraw $0x6,%mm2\n"
+ "packuswb %mm2,%mm1\n"  // saturate both pixels to bytes
+ "movntq %mm1,0x0(%ebp)\n"  // non-temporal store of two 4-byte pixels
+ "add $0x8,%ebp\n"
+"1:"
+ "subl $0x2,0x34(%esp)\n"  // the width counter is decremented in memory
+ "jns 0b\n"
+
+ "andl $0x1,0x34(%esp)\n"  // counter is -2 or -1 here; bit 0 set means one pixel is left
+ "je 2f\n"
+
+ "mov %ebx,%eax\n"  // odd trailing pixel at the current position
+ "sar $0x11,%eax\n"
+ "movzbl (%edi,%eax,1),%eax\n"
+ "movq 2048(%ecx,%eax,8),%mm0\n"
+ "mov %ebx,%eax\n"
+ "sar $0x11,%eax\n"
+ "movzbl (%esi,%eax,1),%eax\n"
+ "paddsw 4096(%ecx,%eax,8),%mm0\n"
+ "mov %ebx,%eax\n"
+ "sar $0x10,%eax\n"
+ "movzbl (%edx,%eax,1),%eax\n"
+ "movq 0(%ecx,%eax,8),%mm1\n"
+ "paddsw %mm0,%mm1\n"
+ "psraw $0x6,%mm1\n"
+ "packuswb %mm1,%mm1\n"
+ "movd %mm1,0x0(%ebp)\n"  // store one 4-byte pixel
+
+"2:"
+ "popa\n"  // NOTE(review): MMX registers are used and no emms is issued here - presumably the caller does it; confirm
+ "ret\n"
+#if !defined(XP_MACOSX)
+ ".previous\n"
+#endif
+);
+
+// Point-sampled scaling entry point for PIC builds: the SSE routine gets
+// the coefficient table address as an extra argument; without SSE support
+// the C routine is used.
+void ScaleYUVToRGB32Row(const uint8_t* y_buf,
+                        const uint8_t* u_buf,
+                        const uint8_t* v_buf,
+                        uint8_t* rgb_buf,
+                        int width,
+                        int source_dx)
+{
+  const bool have_sse = mozilla::supports_sse();
+  if (!have_sse) {
+    ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
+  } else {
+    PICScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
+                              &kCoefficientsRgbY[0][0]);
+  }
+}
+
+void PICLinearScaleYUVToRGB32Row_SSE(const uint8_t* y_buf,  // Declaration only; the body is the file-scope asm that follows and reads its arguments from the stack.
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* rgb_buf,
+ int width,
+ int source_dx,
+ const int16_t *kCoefficientsRgbY);  // table base is passed in, so the asm needs no absolute address
+
+ asm(
+ ".text\n"
+#if defined(XP_MACOSX)
+"_PICLinearScaleYUVToRGB32Row_SSE:\n"
+#else
+"PICLinearScaleYUVToRGB32Row_SSE:\n"
+#endif
+ "pusha\n"  // saves 8 registers (32 bytes), so the first argument sits at 0x24(%esp)
+ "mov 0x24(%esp),%edx\n"  // edx = y_buf
+ "mov 0x30(%esp),%ebp\n"  // ebp = rgb_buf
+ "mov 0x34(%esp),%ecx\n"  // ecx = width (loaded again just below, so this load is redundant)
+ "mov 0x3c(%esp),%edi\n"  // edi = table base
+ "xor %ebx,%ebx\n"  // ebx = x (cleared again below)
+
+ // source_width = width * source_dx, written back over the width stack slot
+ "mov 0x34(%esp), %ecx\n"
+ "imull 0x38(%esp), %ecx\n"
+ "mov %ecx, 0x34(%esp)\n"  // 0x34(%esp) now holds the end position the loop compares x against
+
+ "mov 0x38(%esp), %ecx\n"
+ "xor %ebx,%ebx\n" // x = 0
+ "cmp $0x20000,%ecx\n" // if source_dx >= 2.0
+ "jl 1f\n"
+ "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less
+ "jmp 1f\n"
+
+"0:"
+ "mov 0x28(%esp),%esi\n"  // esi = u_buf (esi doubles as scratch, so reload every pass)
+ "mov %ebx,%eax\n"
+ "sar $0x11,%eax\n"  // x >> 17: chroma index
+
+ "movzbl (%esi,%eax,1),%ecx\n"  // left U sample
+ "movzbl 1(%esi,%eax,1),%esi\n"  // right U sample (overwrites the pointer)
+ "mov %ebx,%eax\n"
+ "andl $0x1fffe, %eax \n"  // chroma fraction taken from the low 17 bits of x
+ "imul %eax, %esi \n"
+ "xorl $0x1fffe, %eax \n"  // complementary weight
+ "imul %eax, %ecx \n"
+ "addl %esi, %ecx \n"
+ "shrl $17, %ecx \n"  // ecx = interpolated U, used as the table index
+ "movq 2048(%edi,%ecx,8),%mm0\n"
+
+ "mov 0x2c(%esp),%esi\n"  // esi = v_buf
+ "mov %ebx,%eax\n"
+ "sar $0x11,%eax\n"
+
+ "movzbl (%esi,%eax,1),%ecx\n"  // left V sample
+ "movzbl 1(%esi,%eax,1),%esi\n"  // right V sample
+ "mov %ebx,%eax\n"
+ "andl $0x1fffe, %eax \n"
+ "imul %eax, %esi \n"
+ "xorl $0x1fffe, %eax \n"
+ "imul %eax, %ecx \n"
+ "addl %esi, %ecx \n"
+ "shrl $17, %ecx \n"
+ "paddsw 4096(%edi,%ecx,8),%mm0\n"  // mm0 = U + V contribution
+
+ "mov %ebx,%eax\n"
+ "sar $0x10,%eax\n"  // x >> 16: luma index of the first pixel
+ "movzbl (%edx,%eax,1),%ecx\n"
+ "movzbl 1(%edx,%eax,1),%esi\n"
+ "mov %ebx,%eax\n"
+ "add 0x38(%esp),%ebx\n"  // x += source_dx
+ "andl $0xffff, %eax \n"  // luma fraction: low 16 bits of x
+ "imul %eax, %esi \n"
+ "xorl $0xffff, %eax \n"
+ "imul %eax, %ecx \n"
+ "addl %esi, %ecx \n"
+ "shrl $16, %ecx \n"
+ "movq (%edi,%ecx,8),%mm1\n"
+
+ "cmp 0x34(%esp), %ebx\n"  // reached the end position after the first pixel of the pair?
+ "jge 2f\n"  // yes: emit that pixel alone
+
+ "mov %ebx,%eax\n"
+ "sar $0x10,%eax\n"  // luma index of the second pixel
+ "movzbl (%edx,%eax,1),%ecx\n"
+ "movzbl 1(%edx,%eax,1),%esi\n"
+ "mov %ebx,%eax\n"
+ "add 0x38(%esp),%ebx\n"  // x += source_dx
+ "andl $0xffff, %eax \n"
+ "imul %eax, %esi \n"
+ "xorl $0xffff, %eax \n"
+ "imul %eax, %ecx \n"
+ "addl %esi, %ecx \n"
+ "shrl $16, %ecx \n"
+ "movq (%edi,%ecx,8),%mm2\n"
+
+ "paddsw %mm0,%mm1\n"
+ "paddsw %mm0,%mm2\n"
+ "psraw $0x6,%mm1\n"  // drop the 6 fraction bits
+ "psraw $0x6,%mm2\n"
+ "packuswb %mm2,%mm1\n"  // saturate both pixels to bytes
+ "movntq %mm1,0x0(%ebp)\n"  // non-temporal store of two 4-byte pixels
+ "add $0x8,%ebp\n"
+
+"1:"
+ "cmp %ebx, 0x34(%esp)\n"  // operands swapped relative to the mid-loop check above
+ "jg 0b\n"  // loop while the end position is greater than x
+ "popa\n"
+ "ret\n"
+
+"2:"
+ "paddsw %mm0, %mm1\n"  // finish the single remaining pixel
+ "psraw $6, %mm1\n"
+ "packuswb %mm1, %mm1\n"
+ "movd %mm1, (%ebp)\n"  // store one 4-byte pixel
+ "popa\n"  // NOTE(review): MMX registers are used and no emms is issued here - presumably the caller does it; confirm
+ "ret\n"
+#if !defined(XP_MACOSX)
+ ".previous\n"
+#endif
+);
+
+
+// Linearly interpolated scaling entry point for PIC builds: the SSE routine
+// gets the coefficient table address as an extra argument; without SSE
+// support the C routine is used.
+void LinearScaleYUVToRGB32Row(const uint8_t* y_buf,
+                              const uint8_t* u_buf,
+                              const uint8_t* v_buf,
+                              uint8_t* rgb_buf,
+                              int width,
+                              int source_dx)
+{
+  const bool have_sse = mozilla::supports_sse();
+  if (!have_sse) {
+    LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
+  } else {
+    PICLinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
+                                    source_dx, &kCoefficientsRgbY[0][0]);
+  }
+}
+#else
+void FastConvertYUVToRGB32Row(const uint8_t* y_buf,  // Fallback when none of the assembly branches above is compiled: forward to the C routine.
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* rgb_buf,
+ int width) {
+ FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);  // NOTE(review): trailing 1 is presumably the C routine's x_shift argument - confirm against its declaration
+}
+
+void ScaleYUVToRGB32Row(const uint8_t* y_buf,  // Fallback when none of the assembly branches above is compiled: forward to the C routine.
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* rgb_buf,
+ int width,
+ int source_dx) {  // source step per output pixel, passed through as-is
+ ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
+}
+
+void LinearScaleYUVToRGB32Row(const uint8_t* y_buf,  // Fallback when none of the assembly branches above is compiled: forward to the C routine.
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* rgb_buf,
+ int width,
+ int source_dx) {  // source step per output pixel, passed through as-is
+ LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
+}
+#endif
+
+}
diff --git a/gfx/ycbcr/yuv_row_table.cpp b/gfx/ycbcr/yuv_row_table.cpp
new file mode 100644
index 0000000000..a916ffde57
--- /dev/null
+++ b/gfx/ycbcr/yuv_row_table.cpp
@@ -0,0 +1,233 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "yuv_row.h"
+
+extern "C" {
+
+#define RGBY(i) { /* luma row: the same scaled (i - 16) term in the first three lanes */ \
+ static_cast<int16_t>(1.164 * 64 * (i - 16) + 0.5), \
+ static_cast<int16_t>(1.164 * 64 * (i - 16) + 0.5), \
+ static_cast<int16_t>(1.164 * 64 * (i - 16) + 0.5), \
+ 0 /* fourth lane gets nothing from luma */ \
+}
+
+#define RGBU(i) { /* U chroma row; values are scaled by 64 (6 fraction bits) */ \
+ static_cast<int16_t>(2.018 * 64 * (i - 128) + 0.5), /* presumably the blue lane - confirm lane order */ \
+ static_cast<int16_t>(-0.391 * 64 * (i - 128) + 0.5), /* presumably the green lane */ \
+ 0, /* U adds nothing to the third lane */ \
+ static_cast<int16_t>(256 * 64 - 1) /* constant fourth lane: saturates to 255 after >> 6 and pack (presumably alpha) */ \
+}
+
+#define RGBV(i) { /* V chroma row; values are scaled by 64 (6 fraction bits) */ \
+ 0, /* V adds nothing to the first lane */ \
+ static_cast<int16_t>(-0.813 * 64 * (i - 128) + 0.5), /* presumably the green lane */ \
+ static_cast<int16_t>(1.596 * 64 * (i - 128) + 0.5), /* presumably the red lane */ \
+ 0 \
+}
+
+SIMD_ALIGNED(const int16_t kCoefficientsRgbY[256 * 3][4]) = {  // 768 rows of four int16_t (8 bytes each); luma rows 0..255 come first
+ RGBY(0x00), RGBY(0x01), RGBY(0x02), RGBY(0x03),
+ RGBY(0x04), RGBY(0x05), RGBY(0x06), RGBY(0x07),
+ RGBY(0x08), RGBY(0x09), RGBY(0x0A), RGBY(0x0B),
+ RGBY(0x0C), RGBY(0x0D), RGBY(0x0E), RGBY(0x0F),
+ RGBY(0x10), RGBY(0x11), RGBY(0x12), RGBY(0x13),
+ RGBY(0x14), RGBY(0x15), RGBY(0x16), RGBY(0x17),
+ RGBY(0x18), RGBY(0x19), RGBY(0x1A), RGBY(0x1B),
+ RGBY(0x1C), RGBY(0x1D), RGBY(0x1E), RGBY(0x1F),
+ RGBY(0x20), RGBY(0x21), RGBY(0x22), RGBY(0x23),
+ RGBY(0x24), RGBY(0x25), RGBY(0x26), RGBY(0x27),
+ RGBY(0x28), RGBY(0x29), RGBY(0x2A), RGBY(0x2B),
+ RGBY(0x2C), RGBY(0x2D), RGBY(0x2E), RGBY(0x2F),
+ RGBY(0x30), RGBY(0x31), RGBY(0x32), RGBY(0x33),
+ RGBY(0x34), RGBY(0x35), RGBY(0x36), RGBY(0x37),
+ RGBY(0x38), RGBY(0x39), RGBY(0x3A), RGBY(0x3B),
+ RGBY(0x3C), RGBY(0x3D), RGBY(0x3E), RGBY(0x3F),
+ RGBY(0x40), RGBY(0x41), RGBY(0x42), RGBY(0x43),
+ RGBY(0x44), RGBY(0x45), RGBY(0x46), RGBY(0x47),
+ RGBY(0x48), RGBY(0x49), RGBY(0x4A), RGBY(0x4B),
+ RGBY(0x4C), RGBY(0x4D), RGBY(0x4E), RGBY(0x4F),
+ RGBY(0x50), RGBY(0x51), RGBY(0x52), RGBY(0x53),
+ RGBY(0x54), RGBY(0x55), RGBY(0x56), RGBY(0x57),
+ RGBY(0x58), RGBY(0x59), RGBY(0x5A), RGBY(0x5B),
+ RGBY(0x5C), RGBY(0x5D), RGBY(0x5E), RGBY(0x5F),
+ RGBY(0x60), RGBY(0x61), RGBY(0x62), RGBY(0x63),
+ RGBY(0x64), RGBY(0x65), RGBY(0x66), RGBY(0x67),
+ RGBY(0x68), RGBY(0x69), RGBY(0x6A), RGBY(0x6B),
+ RGBY(0x6C), RGBY(0x6D), RGBY(0x6E), RGBY(0x6F),
+ RGBY(0x70), RGBY(0x71), RGBY(0x72), RGBY(0x73),
+ RGBY(0x74), RGBY(0x75), RGBY(0x76), RGBY(0x77),
+ RGBY(0x78), RGBY(0x79), RGBY(0x7A), RGBY(0x7B),
+ RGBY(0x7C), RGBY(0x7D), RGBY(0x7E), RGBY(0x7F),
+ RGBY(0x80), RGBY(0x81), RGBY(0x82), RGBY(0x83),
+ RGBY(0x84), RGBY(0x85), RGBY(0x86), RGBY(0x87),
+ RGBY(0x88), RGBY(0x89), RGBY(0x8A), RGBY(0x8B),
+ RGBY(0x8C), RGBY(0x8D), RGBY(0x8E), RGBY(0x8F),
+ RGBY(0x90), RGBY(0x91), RGBY(0x92), RGBY(0x93),
+ RGBY(0x94), RGBY(0x95), RGBY(0x96), RGBY(0x97),
+ RGBY(0x98), RGBY(0x99), RGBY(0x9A), RGBY(0x9B),
+ RGBY(0x9C), RGBY(0x9D), RGBY(0x9E), RGBY(0x9F),
+ RGBY(0xA0), RGBY(0xA1), RGBY(0xA2), RGBY(0xA3),
+ RGBY(0xA4), RGBY(0xA5), RGBY(0xA6), RGBY(0xA7),
+ RGBY(0xA8), RGBY(0xA9), RGBY(0xAA), RGBY(0xAB),
+ RGBY(0xAC), RGBY(0xAD), RGBY(0xAE), RGBY(0xAF),
+ RGBY(0xB0), RGBY(0xB1), RGBY(0xB2), RGBY(0xB3),
+ RGBY(0xB4), RGBY(0xB5), RGBY(0xB6), RGBY(0xB7),
+ RGBY(0xB8), RGBY(0xB9), RGBY(0xBA), RGBY(0xBB),
+ RGBY(0xBC), RGBY(0xBD), RGBY(0xBE), RGBY(0xBF),
+ RGBY(0xC0), RGBY(0xC1), RGBY(0xC2), RGBY(0xC3),
+ RGBY(0xC4), RGBY(0xC5), RGBY(0xC6), RGBY(0xC7),
+ RGBY(0xC8), RGBY(0xC9), RGBY(0xCA), RGBY(0xCB),
+ RGBY(0xCC), RGBY(0xCD), RGBY(0xCE), RGBY(0xCF),
+ RGBY(0xD0), RGBY(0xD1), RGBY(0xD2), RGBY(0xD3),
+ RGBY(0xD4), RGBY(0xD5), RGBY(0xD6), RGBY(0xD7),
+ RGBY(0xD8), RGBY(0xD9), RGBY(0xDA), RGBY(0xDB),
+ RGBY(0xDC), RGBY(0xDD), RGBY(0xDE), RGBY(0xDF),
+ RGBY(0xE0), RGBY(0xE1), RGBY(0xE2), RGBY(0xE3),
+ RGBY(0xE4), RGBY(0xE5), RGBY(0xE6), RGBY(0xE7),
+ RGBY(0xE8), RGBY(0xE9), RGBY(0xEA), RGBY(0xEB),
+ RGBY(0xEC), RGBY(0xED), RGBY(0xEE), RGBY(0xEF),
+ RGBY(0xF0), RGBY(0xF1), RGBY(0xF2), RGBY(0xF3),
+ RGBY(0xF4), RGBY(0xF5), RGBY(0xF6), RGBY(0xF7),
+ RGBY(0xF8), RGBY(0xF9), RGBY(0xFA), RGBY(0xFB),
+ RGBY(0xFC), RGBY(0xFD), RGBY(0xFE), RGBY(0xFF),
+
+ // Chroma U table: rows 256..511, i.e. byte offset 2048 from the start.
+ RGBU(0x00), RGBU(0x01), RGBU(0x02), RGBU(0x03),
+ RGBU(0x04), RGBU(0x05), RGBU(0x06), RGBU(0x07),
+ RGBU(0x08), RGBU(0x09), RGBU(0x0A), RGBU(0x0B),
+ RGBU(0x0C), RGBU(0x0D), RGBU(0x0E), RGBU(0x0F),
+ RGBU(0x10), RGBU(0x11), RGBU(0x12), RGBU(0x13),
+ RGBU(0x14), RGBU(0x15), RGBU(0x16), RGBU(0x17),
+ RGBU(0x18), RGBU(0x19), RGBU(0x1A), RGBU(0x1B),
+ RGBU(0x1C), RGBU(0x1D), RGBU(0x1E), RGBU(0x1F),
+ RGBU(0x20), RGBU(0x21), RGBU(0x22), RGBU(0x23),
+ RGBU(0x24), RGBU(0x25), RGBU(0x26), RGBU(0x27),
+ RGBU(0x28), RGBU(0x29), RGBU(0x2A), RGBU(0x2B),
+ RGBU(0x2C), RGBU(0x2D), RGBU(0x2E), RGBU(0x2F),
+ RGBU(0x30), RGBU(0x31), RGBU(0x32), RGBU(0x33),
+ RGBU(0x34), RGBU(0x35), RGBU(0x36), RGBU(0x37),
+ RGBU(0x38), RGBU(0x39), RGBU(0x3A), RGBU(0x3B),
+ RGBU(0x3C), RGBU(0x3D), RGBU(0x3E), RGBU(0x3F),
+ RGBU(0x40), RGBU(0x41), RGBU(0x42), RGBU(0x43),
+ RGBU(0x44), RGBU(0x45), RGBU(0x46), RGBU(0x47),
+ RGBU(0x48), RGBU(0x49), RGBU(0x4A), RGBU(0x4B),
+ RGBU(0x4C), RGBU(0x4D), RGBU(0x4E), RGBU(0x4F),
+ RGBU(0x50), RGBU(0x51), RGBU(0x52), RGBU(0x53),
+ RGBU(0x54), RGBU(0x55), RGBU(0x56), RGBU(0x57),
+ RGBU(0x58), RGBU(0x59), RGBU(0x5A), RGBU(0x5B),
+ RGBU(0x5C), RGBU(0x5D), RGBU(0x5E), RGBU(0x5F),
+ RGBU(0x60), RGBU(0x61), RGBU(0x62), RGBU(0x63),
+ RGBU(0x64), RGBU(0x65), RGBU(0x66), RGBU(0x67),
+ RGBU(0x68), RGBU(0x69), RGBU(0x6A), RGBU(0x6B),
+ RGBU(0x6C), RGBU(0x6D), RGBU(0x6E), RGBU(0x6F),
+ RGBU(0x70), RGBU(0x71), RGBU(0x72), RGBU(0x73),
+ RGBU(0x74), RGBU(0x75), RGBU(0x76), RGBU(0x77),
+ RGBU(0x78), RGBU(0x79), RGBU(0x7A), RGBU(0x7B),
+ RGBU(0x7C), RGBU(0x7D), RGBU(0x7E), RGBU(0x7F),
+ RGBU(0x80), RGBU(0x81), RGBU(0x82), RGBU(0x83),
+ RGBU(0x84), RGBU(0x85), RGBU(0x86), RGBU(0x87),
+ RGBU(0x88), RGBU(0x89), RGBU(0x8A), RGBU(0x8B),
+ RGBU(0x8C), RGBU(0x8D), RGBU(0x8E), RGBU(0x8F),
+ RGBU(0x90), RGBU(0x91), RGBU(0x92), RGBU(0x93),
+ RGBU(0x94), RGBU(0x95), RGBU(0x96), RGBU(0x97),
+ RGBU(0x98), RGBU(0x99), RGBU(0x9A), RGBU(0x9B),
+ RGBU(0x9C), RGBU(0x9D), RGBU(0x9E), RGBU(0x9F),
+ RGBU(0xA0), RGBU(0xA1), RGBU(0xA2), RGBU(0xA3),
+ RGBU(0xA4), RGBU(0xA5), RGBU(0xA6), RGBU(0xA7),
+ RGBU(0xA8), RGBU(0xA9), RGBU(0xAA), RGBU(0xAB),
+ RGBU(0xAC), RGBU(0xAD), RGBU(0xAE), RGBU(0xAF),
+ RGBU(0xB0), RGBU(0xB1), RGBU(0xB2), RGBU(0xB3),
+ RGBU(0xB4), RGBU(0xB5), RGBU(0xB6), RGBU(0xB7),
+ RGBU(0xB8), RGBU(0xB9), RGBU(0xBA), RGBU(0xBB),
+ RGBU(0xBC), RGBU(0xBD), RGBU(0xBE), RGBU(0xBF),
+ RGBU(0xC0), RGBU(0xC1), RGBU(0xC2), RGBU(0xC3),
+ RGBU(0xC4), RGBU(0xC5), RGBU(0xC6), RGBU(0xC7),
+ RGBU(0xC8), RGBU(0xC9), RGBU(0xCA), RGBU(0xCB),
+ RGBU(0xCC), RGBU(0xCD), RGBU(0xCE), RGBU(0xCF),
+ RGBU(0xD0), RGBU(0xD1), RGBU(0xD2), RGBU(0xD3),
+ RGBU(0xD4), RGBU(0xD5), RGBU(0xD6), RGBU(0xD7),
+ RGBU(0xD8), RGBU(0xD9), RGBU(0xDA), RGBU(0xDB),
+ RGBU(0xDC), RGBU(0xDD), RGBU(0xDE), RGBU(0xDF),
+ RGBU(0xE0), RGBU(0xE1), RGBU(0xE2), RGBU(0xE3),
+ RGBU(0xE4), RGBU(0xE5), RGBU(0xE6), RGBU(0xE7),
+ RGBU(0xE8), RGBU(0xE9), RGBU(0xEA), RGBU(0xEB),
+ RGBU(0xEC), RGBU(0xED), RGBU(0xEE), RGBU(0xEF),
+ RGBU(0xF0), RGBU(0xF1), RGBU(0xF2), RGBU(0xF3),
+ RGBU(0xF4), RGBU(0xF5), RGBU(0xF6), RGBU(0xF7),
+ RGBU(0xF8), RGBU(0xF9), RGBU(0xFA), RGBU(0xFB),
+ RGBU(0xFC), RGBU(0xFD), RGBU(0xFE), RGBU(0xFF),
+
+ // Chroma V table: rows 512..767, i.e. byte offset 4096 from the start.
+ RGBV(0x00), RGBV(0x01), RGBV(0x02), RGBV(0x03),
+ RGBV(0x04), RGBV(0x05), RGBV(0x06), RGBV(0x07),
+ RGBV(0x08), RGBV(0x09), RGBV(0x0A), RGBV(0x0B),
+ RGBV(0x0C), RGBV(0x0D), RGBV(0x0E), RGBV(0x0F),
+ RGBV(0x10), RGBV(0x11), RGBV(0x12), RGBV(0x13),
+ RGBV(0x14), RGBV(0x15), RGBV(0x16), RGBV(0x17),
+ RGBV(0x18), RGBV(0x19), RGBV(0x1A), RGBV(0x1B),
+ RGBV(0x1C), RGBV(0x1D), RGBV(0x1E), RGBV(0x1F),
+ RGBV(0x20), RGBV(0x21), RGBV(0x22), RGBV(0x23),
+ RGBV(0x24), RGBV(0x25), RGBV(0x26), RGBV(0x27),
+ RGBV(0x28), RGBV(0x29), RGBV(0x2A), RGBV(0x2B),
+ RGBV(0x2C), RGBV(0x2D), RGBV(0x2E), RGBV(0x2F),
+ RGBV(0x30), RGBV(0x31), RGBV(0x32), RGBV(0x33),
+ RGBV(0x34), RGBV(0x35), RGBV(0x36), RGBV(0x37),
+ RGBV(0x38), RGBV(0x39), RGBV(0x3A), RGBV(0x3B),
+ RGBV(0x3C), RGBV(0x3D), RGBV(0x3E), RGBV(0x3F),
+ RGBV(0x40), RGBV(0x41), RGBV(0x42), RGBV(0x43),
+ RGBV(0x44), RGBV(0x45), RGBV(0x46), RGBV(0x47),
+ RGBV(0x48), RGBV(0x49), RGBV(0x4A), RGBV(0x4B),
+ RGBV(0x4C), RGBV(0x4D), RGBV(0x4E), RGBV(0x4F),
+ RGBV(0x50), RGBV(0x51), RGBV(0x52), RGBV(0x53),
+ RGBV(0x54), RGBV(0x55), RGBV(0x56), RGBV(0x57),
+ RGBV(0x58), RGBV(0x59), RGBV(0x5A), RGBV(0x5B),
+ RGBV(0x5C), RGBV(0x5D), RGBV(0x5E), RGBV(0x5F),
+ RGBV(0x60), RGBV(0x61), RGBV(0x62), RGBV(0x63),
+ RGBV(0x64), RGBV(0x65), RGBV(0x66), RGBV(0x67),
+ RGBV(0x68), RGBV(0x69), RGBV(0x6A), RGBV(0x6B),
+ RGBV(0x6C), RGBV(0x6D), RGBV(0x6E), RGBV(0x6F),
+ RGBV(0x70), RGBV(0x71), RGBV(0x72), RGBV(0x73),
+ RGBV(0x74), RGBV(0x75), RGBV(0x76), RGBV(0x77),
+ RGBV(0x78), RGBV(0x79), RGBV(0x7A), RGBV(0x7B),
+ RGBV(0x7C), RGBV(0x7D), RGBV(0x7E), RGBV(0x7F),
+ RGBV(0x80), RGBV(0x81), RGBV(0x82), RGBV(0x83),
+ RGBV(0x84), RGBV(0x85), RGBV(0x86), RGBV(0x87),
+ RGBV(0x88), RGBV(0x89), RGBV(0x8A), RGBV(0x8B),
+ RGBV(0x8C), RGBV(0x8D), RGBV(0x8E), RGBV(0x8F),
+ RGBV(0x90), RGBV(0x91), RGBV(0x92), RGBV(0x93),
+ RGBV(0x94), RGBV(0x95), RGBV(0x96), RGBV(0x97),
+ RGBV(0x98), RGBV(0x99), RGBV(0x9A), RGBV(0x9B),
+ RGBV(0x9C), RGBV(0x9D), RGBV(0x9E), RGBV(0x9F),
+ RGBV(0xA0), RGBV(0xA1), RGBV(0xA2), RGBV(0xA3),
+ RGBV(0xA4), RGBV(0xA5), RGBV(0xA6), RGBV(0xA7),
+ RGBV(0xA8), RGBV(0xA9), RGBV(0xAA), RGBV(0xAB),
+ RGBV(0xAC), RGBV(0xAD), RGBV(0xAE), RGBV(0xAF),
+ RGBV(0xB0), RGBV(0xB1), RGBV(0xB2), RGBV(0xB3),
+ RGBV(0xB4), RGBV(0xB5), RGBV(0xB6), RGBV(0xB7),
+ RGBV(0xB8), RGBV(0xB9), RGBV(0xBA), RGBV(0xBB),
+ RGBV(0xBC), RGBV(0xBD), RGBV(0xBE), RGBV(0xBF),
+ RGBV(0xC0), RGBV(0xC1), RGBV(0xC2), RGBV(0xC3),
+ RGBV(0xC4), RGBV(0xC5), RGBV(0xC6), RGBV(0xC7),
+ RGBV(0xC8), RGBV(0xC9), RGBV(0xCA), RGBV(0xCB),
+ RGBV(0xCC), RGBV(0xCD), RGBV(0xCE), RGBV(0xCF),
+ RGBV(0xD0), RGBV(0xD1), RGBV(0xD2), RGBV(0xD3),
+ RGBV(0xD4), RGBV(0xD5), RGBV(0xD6), RGBV(0xD7),
+ RGBV(0xD8), RGBV(0xD9), RGBV(0xDA), RGBV(0xDB),
+ RGBV(0xDC), RGBV(0xDD), RGBV(0xDE), RGBV(0xDF),
+ RGBV(0xE0), RGBV(0xE1), RGBV(0xE2), RGBV(0xE3),
+ RGBV(0xE4), RGBV(0xE5), RGBV(0xE6), RGBV(0xE7),
+ RGBV(0xE8), RGBV(0xE9), RGBV(0xEA), RGBV(0xEB),
+ RGBV(0xEC), RGBV(0xED), RGBV(0xEE), RGBV(0xEF),
+ RGBV(0xF0), RGBV(0xF1), RGBV(0xF2), RGBV(0xF3),
+ RGBV(0xF4), RGBV(0xF5), RGBV(0xF6), RGBV(0xF7),
+ RGBV(0xF8), RGBV(0xF9), RGBV(0xFA), RGBV(0xFB),
+ RGBV(0xFC), RGBV(0xFD), RGBV(0xFE), RGBV(0xFF),
+};
+
+#undef RGBY
+#undef RGBU
+#undef RGBV
+
+} // extern "C"
diff --git a/gfx/ycbcr/yuv_row_win.cpp b/gfx/ycbcr/yuv_row_win.cpp
new file mode 100644
index 0000000000..c496b2d935
--- /dev/null
+++ b/gfx/ycbcr/yuv_row_win.cpp
@@ -0,0 +1,506 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "yuv_row.h"
+#include "mozilla/SSE.h"
+
+// Addresses of the U and V lookup tables, expressed as byte offsets from
+// kCoefficientsRgbY (2048 bytes = 256 entries of 8 bytes each).
+// NOTE: the expansions are not parenthesized; in this file they are only
+// used inside address expressions of the inline assembly below.
+#define kCoefficientsRgbU kCoefficientsRgbY + 2048
+#define kCoefficientsRgbV kCoefficientsRgbY + 4096
+
+extern "C" {
+
+#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
+#if defined(__clang__)
+// clang-cl has a bug where it doesn't mangle names in inline asm,
+// so the mangling (leading underscore) is done in the preprocessor instead.
+// A dummy extern declaration is still needed so the parser knows the name.
+extern void* _kCoefficientsRgbY;
+#define kCoefficientsRgbY _kCoefficientsRgbY
+#endif
+
+// Converts one row of YUV to 32-bit RGB at 1:1 scale using MMX registers and
+// the kCoefficientsRgb{Y,U,V} lookup tables.
+//   y_buf:        luma samples, one read per output pixel
+//   u_buf, v_buf: chroma samples, one of each read per pair of output pixels
+//   rgb_buf:      destination, 4 bytes written per pixel
+//   width:        number of output pixels
+// The function is naked, so arguments are read directly from the stack:
+// pushad saves 32 bytes of registers and the return address occupies 4 more,
+// hence the [esp + 32 + N] offsets. No emms is executed in this function.
+__declspec(naked)
+void FastConvertYUVToRGB32Row_SSE(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* rgb_buf,
+ int width) {
+ __asm {
+ pushad
+ mov edx, [esp + 32 + 4] // Y
+ mov edi, [esp + 32 + 8] // U
+ mov esi, [esp + 32 + 12] // V
+ mov ebp, [esp + 32 + 16] // rgb
+ mov ecx, [esp + 32 + 20] // width
+ jmp convertend
+
+ // Main loop: two pixels per iteration, sharing one U and one V sample.
+ convertloop :
+ movzx eax, byte ptr [edi]
+ add edi, 1
+ movzx ebx, byte ptr [esi]
+ add esi, 1
+ movq mm0, [kCoefficientsRgbU + 8 * eax]
+ movzx eax, byte ptr [edx]
+ paddsw mm0, [kCoefficientsRgbV + 8 * ebx]
+ movzx ebx, byte ptr [edx + 1]
+ movq mm1, [kCoefficientsRgbY + 8 * eax]
+ add edx, 2
+ movq mm2, [kCoefficientsRgbY + 8 * ebx]
+ paddsw mm1, mm0
+ paddsw mm2, mm0
+ // Shift the 16-bit sums right by 6 and pack them to unsigned bytes.
+ psraw mm1, 6
+ psraw mm2, 6
+ packuswb mm1, mm2
+ // Non-temporal 8-byte store of both pixels.
+ movntq [ebp], mm1
+ add ebp, 8
+ convertend :
+ sub ecx, 2
+ jns convertloop
+
+ and ecx, 1 // odd number of pixels?
+ jz convertdone
+
+ // Tail: convert the single remaining pixel and store 4 bytes.
+ movzx eax, byte ptr [edi]
+ movq mm0, [kCoefficientsRgbU + 8 * eax]
+ movzx eax, byte ptr [esi]
+ paddsw mm0, [kCoefficientsRgbV + 8 * eax]
+ movzx eax, byte ptr [edx]
+ movq mm1, [kCoefficientsRgbY + 8 * eax]
+ paddsw mm1, mm0
+ psraw mm1, 6
+ packuswb mm1, mm1
+ movd [ebp], mm1
+ convertdone :
+
+ popad
+ ret
+ }
+}
+
+// Converts one row of YUV to 32-bit RGB, advancing the source pointers by
+// `step` bytes instead of 1.
+//   y_buf:        luma source; advanced by `step` after every pixel
+//   u_buf, v_buf: chroma sources; advanced by `step` once per pixel pair
+//   rgb_buf:      destination, 4 bytes written per pixel
+//   width:        number of output pixels
+//   step:         byte increment applied to the Y, U and V pointers
+// Naked function: arguments are read from [esp + 32 + N] after pushad.
+// No emms is executed in this function.
+__declspec(naked)
+void ConvertYUVToRGB32Row_SSE(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* rgb_buf,
+ int width,
+ int step) {
+ __asm {
+ pushad
+ mov edx, [esp + 32 + 4] // Y
+ mov edi, [esp + 32 + 8] // U
+ mov esi, [esp + 32 + 12] // V
+ mov ebp, [esp + 32 + 16] // rgb
+ mov ecx, [esp + 32 + 20] // width
+ mov ebx, [esp + 32 + 24] // step
+ jmp wend
+
+ // Main loop: two pixels per iteration, sharing one U and one V sample.
+ wloop :
+ movzx eax, byte ptr [edi]
+ add edi, ebx
+ movq mm0, [kCoefficientsRgbU + 8 * eax]
+ movzx eax, byte ptr [esi]
+ add esi, ebx
+ paddsw mm0, [kCoefficientsRgbV + 8 * eax]
+ movzx eax, byte ptr [edx]
+ add edx, ebx
+ movq mm1, [kCoefficientsRgbY + 8 * eax]
+ movzx eax, byte ptr [edx]
+ add edx, ebx
+ movq mm2, [kCoefficientsRgbY + 8 * eax]
+ paddsw mm1, mm0
+ paddsw mm2, mm0
+ psraw mm1, 6
+ psraw mm2, 6
+ packuswb mm1, mm2
+ // Non-temporal 8-byte store of both pixels.
+ movntq [ebp], mm1
+ add ebp, 8
+ wend :
+ sub ecx, 2
+ jns wloop
+
+ and ecx, 1 // odd number of pixels?
+ jz wdone
+
+ // Tail: convert the single remaining pixel and store 4 bytes.
+ movzx eax, byte ptr [edi]
+ movq mm0, [kCoefficientsRgbU + 8 * eax]
+ movzx eax, byte ptr [esi]
+ paddsw mm0, [kCoefficientsRgbV + 8 * eax]
+ movzx eax, byte ptr [edx]
+ movq mm1, [kCoefficientsRgbY + 8 * eax]
+ paddsw mm1, mm0
+ psraw mm1, 6
+ packuswb mm1, mm1
+ movd [ebp], mm1
+ wdone :
+
+ popad
+ ret
+ }
+}
+
+// Converts one row of YUV to 32-bit RGB with separate byte strides for the
+// luma and chroma sources.
+//   y_buf:        luma source; advanced by `ystep` after every pixel
+//   u_buf, v_buf: chroma sources; advanced by `uvstep` once per pixel pair
+//   rgb_buf:      destination, 4 bytes written per pixel
+//   width:        number of output pixels
+// Naked function: arguments are read from [esp + 32 + N] after pushad.
+// ebx holds whichever stride is currently needed, so both strides are
+// reloaded from the stack on every iteration. No emms is executed here.
+__declspec(naked)
+void RotateConvertYUVToRGB32Row_SSE(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* rgb_buf,
+ int width,
+ int ystep,
+ int uvstep) {
+ __asm {
+ pushad
+ mov edx, [esp + 32 + 4] // Y
+ mov edi, [esp + 32 + 8] // U
+ mov esi, [esp + 32 + 12] // V
+ mov ebp, [esp + 32 + 16] // rgb
+ mov ecx, [esp + 32 + 20] // width
+ jmp wend
+
+ // Main loop: two pixels per iteration, sharing one U and one V sample.
+ wloop :
+ movzx eax, byte ptr [edi]
+ mov ebx, [esp + 32 + 28] // uvstep
+ add edi, ebx
+ movq mm0, [kCoefficientsRgbU + 8 * eax]
+ movzx eax, byte ptr [esi]
+ add esi, ebx
+ paddsw mm0, [kCoefficientsRgbV + 8 * eax]
+ movzx eax, byte ptr [edx]
+ mov ebx, [esp + 32 + 24] // ystep
+ add edx, ebx
+ movq mm1, [kCoefficientsRgbY + 8 * eax]
+ movzx eax, byte ptr [edx]
+ add edx, ebx
+ movq mm2, [kCoefficientsRgbY + 8 * eax]
+ paddsw mm1, mm0
+ paddsw mm2, mm0
+ psraw mm1, 6
+ psraw mm2, 6
+ packuswb mm1, mm2
+ // Non-temporal 8-byte store of both pixels.
+ movntq [ebp], mm1
+ add ebp, 8
+ wend :
+ sub ecx, 2
+ jns wloop
+
+ and ecx, 1 // odd number of pixels?
+ jz wdone
+
+ // Tail: convert the single remaining pixel and store 4 bytes.
+ movzx eax, byte ptr [edi]
+ movq mm0, [kCoefficientsRgbU + 8 * eax]
+ movzx eax, byte ptr [esi]
+ paddsw mm0, [kCoefficientsRgbV + 8 * eax]
+ movzx eax, byte ptr [edx]
+ movq mm1, [kCoefficientsRgbY + 8 * eax]
+ paddsw mm1, mm0
+ psraw mm1, 6
+ packuswb mm1, mm1
+ movd [ebp], mm1
+ wdone :
+
+ popad
+ ret
+ }
+}
+
+// Converts one row of YUV to 32-bit RGB while doubling it horizontally: each
+// source luma sample produces two identical output pixels.
+//   y_buf:        luma source, one sample read per two output pixels
+//   u_buf, v_buf: chroma sources, one sample read per four output pixels
+//   rgb_buf:      destination, 4 bytes written per pixel
+//   width:        number of OUTPUT pixels (the main loop consumes 4 per pass)
+// Naked function: arguments are read from [esp + 32 + N] after pushad.
+// No emms is executed in this function.
+__declspec(naked)
+void DoubleYUVToRGB32Row_SSE(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* rgb_buf,
+ int width) {
+ __asm {
+ pushad
+ mov edx, [esp + 32 + 4] // Y
+ mov edi, [esp + 32 + 8] // U
+ mov esi, [esp + 32 + 12] // V
+ mov ebp, [esp + 32 + 16] // rgb
+ mov ecx, [esp + 32 + 20] // width
+ jmp wend
+
+ // Main loop: two source pixels in, four output pixels (16 bytes) out.
+ wloop :
+ movzx eax, byte ptr [edi]
+ add edi, 1
+ movzx ebx, byte ptr [esi]
+ add esi, 1
+ movq mm0, [kCoefficientsRgbU + 8 * eax]
+ movzx eax, byte ptr [edx]
+ paddsw mm0, [kCoefficientsRgbV + 8 * ebx]
+ movq mm1, [kCoefficientsRgbY + 8 * eax]
+ paddsw mm1, mm0
+ psraw mm1, 6
+ packuswb mm1, mm1
+ // Duplicate the low dword so the same pixel fills both halves of mm1.
+ punpckldq mm1, mm1
+ movntq [ebp], mm1
+
+ // Second source pixel: the chroma sum in mm0 is reused and overwritten.
+ movzx ebx, byte ptr [edx + 1]
+ add edx, 2
+ paddsw mm0, [kCoefficientsRgbY + 8 * ebx]
+ psraw mm0, 6
+ packuswb mm0, mm0
+ punpckldq mm0, mm0
+ movntq [ebp+8], mm0
+ add ebp, 16
+ wend :
+ sub ecx, 4
+ jns wloop
+
+ // Restore the remainder (0..3 output pixels still to write).
+ add ecx, 4
+ jz wdone
+
+ // Tail: convert one more source pixel; the loop below writes that same
+ // 4-byte pixel once for every remaining output pixel.
+ movzx eax, byte ptr [edi]
+ movq mm0, [kCoefficientsRgbU + 8 * eax]
+ movzx eax, byte ptr [esi]
+ paddsw mm0, [kCoefficientsRgbV + 8 * eax]
+ movzx eax, byte ptr [edx]
+ movq mm1, [kCoefficientsRgbY + 8 * eax]
+ paddsw mm1, mm0
+ psraw mm1, 6
+ packuswb mm1, mm1
+ jmp wend1
+
+ wloop1 :
+ movd [ebp], mm1
+ add ebp, 4
+ wend1 :
+ sub ecx, 1
+ jns wloop1
+ wdone :
+ popad
+ ret
+ }
+}
+
+// This version does general purpose scaling by any amount, up or down.
+// The only thing it cannot do is rotation by 90 or 270.
+// For performance the chroma is under-sampled, reducing cost of a 3x
+// 1080p scale from 8.4 ms to 5.4 ms.
+//
+//   y_buf, u_buf, v_buf: source planes
+//   rgb_buf:             destination, 4 bytes written per pixel
+//   width:               number of output pixels
+//   source_dx:           amount added to the source position x per output
+//                        pixel; x >> 16 indexes Y and x >> 17 indexes U/V
+// Sampling is nearest-neighbour (no interpolation). In the main loop U and V
+// are fetched once per pair of output pixels, at the position of the first.
+// Naked function: arguments are read from [esp + 32 + N] after pushad.
+// No emms is executed in this function.
+__declspec(naked)
+void ScaleYUVToRGB32Row_SSE(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* rgb_buf,
+ int width,
+ int source_dx) {
+ __asm {
+ pushad
+ mov edx, [esp + 32 + 4] // Y
+ mov edi, [esp + 32 + 8] // U
+ mov esi, [esp + 32 + 12] // V
+ mov ebp, [esp + 32 + 16] // rgb
+ mov ecx, [esp + 32 + 20] // width
+ xor ebx, ebx // x
+ jmp scaleend
+
+ // Main loop: two output pixels per iteration.
+ scaleloop :
+ mov eax, ebx
+ sar eax, 17
+ movzx eax, byte ptr [edi + eax]
+ movq mm0, [kCoefficientsRgbU + 8 * eax]
+ mov eax, ebx
+ sar eax, 17
+ movzx eax, byte ptr [esi + eax]
+ paddsw mm0, [kCoefficientsRgbV + 8 * eax]
+ mov eax, ebx
+ add ebx, [esp + 32 + 24] // x += source_dx
+ sar eax, 16
+ movzx eax, byte ptr [edx + eax]
+ movq mm1, [kCoefficientsRgbY + 8 * eax]
+ mov eax, ebx
+ add ebx, [esp + 32 + 24] // x += source_dx
+ sar eax, 16
+ movzx eax, byte ptr [edx + eax]
+ movq mm2, [kCoefficientsRgbY + 8 * eax]
+ paddsw mm1, mm0
+ paddsw mm2, mm0
+ psraw mm1, 6
+ psraw mm2, 6
+ packuswb mm1, mm2
+ // Non-temporal 8-byte store of both pixels.
+ movntq [ebp], mm1
+ add ebp, 8
+ scaleend :
+ sub ecx, 2
+ jns scaleloop
+
+ and ecx, 1 // odd number of pixels?
+ jz scaledone
+
+ // Tail: convert the single remaining pixel and store 4 bytes.
+ mov eax, ebx
+ sar eax, 17
+ movzx eax, byte ptr [edi + eax]
+ movq mm0, [kCoefficientsRgbU + 8 * eax]
+ mov eax, ebx
+ sar eax, 17
+ movzx eax, byte ptr [esi + eax]
+ paddsw mm0, [kCoefficientsRgbV + 8 * eax]
+ mov eax, ebx
+ sar eax, 16
+ movzx eax, byte ptr [edx + eax]
+ movq mm1, [kCoefficientsRgbY + 8 * eax]
+ paddsw mm1, mm0
+ psraw mm1, 6
+ packuswb mm1, mm1
+ movd [ebp], mm1
+
+ scaledone :
+ popad
+ ret
+ }
+}
+
+// Scales one row of YUV to 32-bit RGB, linearly interpolating between each
+// source sample and its right-hand neighbour before the table lookup.
+//   y_buf, u_buf, v_buf: source planes (sample index+1 is also read)
+//   rgb_buf:             destination, 4 bytes written per pixel
+//   width:               number of output pixels
+//   source_dx:           amount added to the source position x per output
+//                        pixel; x >> 16 indexes Y and x >> 17 indexes U/V
+// Naked function: arguments are read from [esp + 32 + N] after pushad.
+// The stack slot of `width` is overwritten with width * source_dx and used
+// as the loop bound. esi is used as scratch, so the V pointer is reloaded
+// from the stack on every iteration. No emms is executed in this function.
+__declspec(naked)
+void LinearScaleYUVToRGB32Row_SSE(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* rgb_buf,
+ int width,
+ int source_dx) {
+ __asm {
+ pushad
+ mov edx, [esp + 32 + 4] // Y
+ mov edi, [esp + 32 + 8] // U
+ // [esp + 32 + 12] // V
+ mov ebp, [esp + 32 + 16] // rgb
+ mov ecx, [esp + 32 + 20] // width
+ imul ecx, [esp + 32 + 24] // source_dx
+ mov [esp + 32 + 20], ecx // source_width = width * source_dx
+ mov ecx, [esp + 32 + 24] // source_dx
+ xor ebx, ebx // x = 0
+ cmp ecx, 0x20000
+ jl lscaleend
+ mov ebx, 0x8000 // x = 0.5 for 1/2 or less
+ jmp lscaleend
+lscaleloop:
+ mov eax, ebx
+ sar eax, 0x11
+
+ // U: blend u[i] and u[i + 1]; the weights are (x & 0x1fffe) and its
+ // complement within 0x1fffe, and the sum is shifted right by 17.
+ movzx ecx, byte ptr [edi + eax]
+ movzx esi, byte ptr [edi + eax + 1]
+ mov eax, ebx
+ and eax, 0x1fffe
+ imul esi, eax
+ xor eax, 0x1fffe
+ imul ecx, eax
+ add ecx, esi
+ shr ecx, 17
+ movq mm0, [kCoefficientsRgbU + 8 * ecx]
+
+ // V: same blend as U, using the V pointer reloaded from the stack.
+ mov esi, [esp + 32 + 12]
+ mov eax, ebx
+ sar eax, 0x11
+
+ movzx ecx, byte ptr [esi + eax]
+ movzx esi, byte ptr [esi + eax + 1]
+ mov eax, ebx
+ and eax, 0x1fffe
+ imul esi, eax
+ xor eax, 0x1fffe
+ imul ecx, eax
+ add ecx, esi
+ shr ecx, 17
+ paddsw mm0, [kCoefficientsRgbV + 8 * ecx]
+
+ // First Y: blend y[i] and y[i + 1] with weights (x & 0xffff) and its
+ // complement within 0xffff, shifted right by 16; x advances by source_dx.
+ mov eax, ebx
+ sar eax, 0x10
+ movzx ecx, byte ptr [edx + eax]
+ movzx esi, byte ptr [1 + edx + eax]
+ mov eax, ebx
+ add ebx, [esp + 32 + 24]
+ and eax, 0xffff
+ imul esi, eax
+ xor eax, 0xffff
+ imul ecx, eax
+ add ecx, esi
+ shr ecx, 16
+ movq mm1, [kCoefficientsRgbY + 8 * ecx]
+
+ // If x has reached source_width, only the first pixel is emitted.
+ cmp ebx, [esp + 32 + 20]
+ jge lscalelastpixel
+
+ // Second Y: same blend at the advanced position.
+ mov eax, ebx
+ sar eax, 0x10
+ movzx ecx, byte ptr [edx + eax]
+ movzx esi, byte ptr [edx + eax + 1]
+ mov eax, ebx
+ add ebx, [esp + 32 + 24]
+ and eax, 0xffff
+ imul esi, eax
+ xor eax, 0xffff
+ imul ecx, eax
+ add ecx, esi
+ shr ecx, 16
+ movq mm2, [kCoefficientsRgbY + 8 * ecx]
+
+ paddsw mm1, mm0
+ paddsw mm2, mm0
+ psraw mm1, 0x6
+ psraw mm2, 0x6
+ packuswb mm1, mm2
+ // Non-temporal 8-byte store of both pixels.
+ movntq [ebp], mm1
+ add ebp, 0x8
+
+lscaleend:
+ cmp ebx, [esp + 32 + 20]
+ jl lscaleloop
+ popad
+ ret
+
+ // Single trailing pixel: finish the conversion of mm1 and store 4 bytes.
+lscalelastpixel:
+ paddsw mm1, mm0
+ psraw mm1, 6
+ packuswb mm1, mm1
+ movd [ebp], mm1
+ popad
+ ret
+ };
+}
+#endif // if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
+
+// Public entry point for 1:1 YUV -> RGB32 row conversion.
+// When the SSE path is compiled in (32-bit x86 with MOZILLA_MAY_SUPPORT_SSE)
+// and mozilla::supports_sse() returns true, the SSE routine is used;
+// in every other case the portable C routine handles the row.
+void FastConvertYUVToRGB32Row(const uint8_t* y_buf,
+                              const uint8_t* u_buf,
+                              const uint8_t* v_buf,
+                              uint8_t* rgb_buf,
+                              int width) {
+#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
+  if (!mozilla::supports_sse()) {
+    FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
+  } else {
+    FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width);
+  }
+#else
+  FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
+#endif
+}
+
+// Public entry point for nearest-sample scaled YUV -> RGB32 row conversion.
+// When the SSE path is compiled in (32-bit x86 with MOZILLA_MAY_SUPPORT_SSE)
+// and mozilla::supports_sse() returns true, the SSE routine is used;
+// in every other case the portable C routine handles the row.
+void ScaleYUVToRGB32Row(const uint8_t* y_buf,
+                        const uint8_t* u_buf,
+                        const uint8_t* v_buf,
+                        uint8_t* rgb_buf,
+                        int width,
+                        int source_dx) {
+#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
+  if (!mozilla::supports_sse()) {
+    ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
+  } else {
+    ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
+  }
+#else
+  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
+#endif
+}
+
+// Public entry point for linearly interpolated, scaled YUV -> RGB32 row
+// conversion. When the SSE path is compiled in (32-bit x86 with
+// MOZILLA_MAY_SUPPORT_SSE) and mozilla::supports_sse() returns true, the SSE
+// routine is used; in every other case the portable C routine handles the row.
+void LinearScaleYUVToRGB32Row(const uint8_t* y_buf,
+                              const uint8_t* u_buf,
+                              const uint8_t* v_buf,
+                              uint8_t* rgb_buf,
+                              int width,
+                              int source_dx) {
+#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
+  if (!mozilla::supports_sse()) {
+    LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
+  } else {
+    LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
+                                 source_dx);
+  }
+#else
+  LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
+#endif
+}
+
+} // extern "C"
diff --git a/gfx/ycbcr/yuv_row_win64.cpp b/gfx/ycbcr/yuv_row_win64.cpp
new file mode 100644
index 0000000000..17b542449b
--- /dev/null
+++ b/gfx/ycbcr/yuv_row_win64.cpp
@@ -0,0 +1,205 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "yuv_row.h"
+
+extern "C" {
+
+// x64 compiler doesn't support MMX and inline assembler. Use SSE2 intrinsics.
+
+// Byte pointers to the U and V lookup tables, located 2048 and 4096 bytes
+// after the start of kCoefficientsRgbY. The cast to a byte pointer lets
+// callers add `8 * index` to address one 8-byte table entry.
+#define kCoefficientsRgbU (reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 2048)
+#define kCoefficientsRgbV (reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 4096)
+
+#include <emmintrin.h>
+
+// Converts one row of YUV to 32-bit RGB at 1:1 scale with SSE2 intrinsics.
+// One U and one V sample are read per pair of output pixels, one Y sample per
+// pixel; 4 bytes are written to rgb_buf per pixel. Each sample selects an
+// 8-byte entry of the kCoefficientsRgb{Y,U,V} tables; the entries are added
+// with signed saturation, shifted right by 6 and packed to unsigned bytes.
+static void FastConvertYUVToRGB32Row_SSE2(const uint8_t* y_buf,
+                                          const uint8_t* u_buf,
+                                          const uint8_t* v_buf,
+                                          uint8_t* rgb_buf,
+                                          int width) {
+  // Byte-addressed view of the luma table so that `8 * sample` is in bytes.
+  const uint8_t* const luma_table =
+      reinterpret_cast<const uint8_t*>(kCoefficientsRgbY);
+
+  // Main loop: two pixels per pass, sharing one chroma term.
+  for (; width >= 2; width -= 2) {
+    const __m128i u_term = _mm_loadl_epi64(
+        reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * *u_buf++));
+    const __m128i v_term = _mm_loadl_epi64(
+        reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * *v_buf++));
+    const __m128i chroma = _mm_adds_epi16(u_term, v_term);
+
+    const __m128i first_px = _mm_adds_epi16(
+        _mm_loadl_epi64(
+            reinterpret_cast<const __m128i*>(luma_table + 8 * *y_buf++)),
+        chroma);
+    const __m128i second_px = _mm_adds_epi16(
+        _mm_loadl_epi64(
+            reinterpret_cast<const __m128i*>(luma_table + 8 * *y_buf++)),
+        chroma);
+
+    // 0x44 places the low 64 bits of each pixel side by side in one register.
+    const __m128 pair = _mm_shuffle_ps(_mm_castsi128_ps(first_px),
+                                       _mm_castsi128_ps(second_px), 0x44);
+    const __m128i scaled = _mm_srai_epi16(_mm_castps_si128(pair), 6);
+    const __m128i packed = _mm_packus_epi16(scaled, scaled);
+
+    _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), packed);
+    rgb_buf += 8;
+  }
+
+  // Tail: one leftover pixel, written as a single 32-bit value.
+  if (width) {
+    const __m128i chroma = _mm_adds_epi16(
+        _mm_loadl_epi64(
+            reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * *u_buf)),
+        _mm_loadl_epi64(
+            reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * *v_buf)));
+    const __m128i sum = _mm_adds_epi16(
+        _mm_loadl_epi64(
+            reinterpret_cast<const __m128i*>(luma_table + 8 * *y_buf)),
+        chroma);
+    const __m128i scaled = _mm_srai_epi16(sum, 6);
+    const __m128i packed = _mm_packus_epi16(scaled, scaled);
+    *reinterpret_cast<uint32_t*>(rgb_buf) = _mm_cvtsi128_si32(packed);
+  }
+}
+
+// Scales one row of YUV to 32-bit RGB with SSE2 intrinsics, picking the
+// nearest source sample (no interpolation). The source position advances by
+// source_dx per output pixel; position >> 16 indexes Y and position >> 17
+// indexes U and V. In the main loop chroma is fetched once per pair of output
+// pixels, at the position of the first pixel of the pair.
+static void ScaleYUVToRGB32Row_SSE2(const uint8_t* y_buf,
+                                    const uint8_t* u_buf,
+                                    const uint8_t* v_buf,
+                                    uint8_t* rgb_buf,
+                                    int width,
+                                    int source_dx) {
+  // Byte-addressed view of the luma table so that `8 * sample` is in bytes.
+  const uint8_t* const luma_table =
+      reinterpret_cast<const uint8_t*>(kCoefficientsRgbY);
+  int pos = 0;
+
+  // Main loop: two output pixels per pass.
+  for (; width >= 2; width -= 2) {
+    const uint8_t u_sample = u_buf[pos >> 17];
+    const uint8_t v_sample = v_buf[pos >> 17];
+    const uint8_t y_first = y_buf[pos >> 16];
+    pos += source_dx;
+    const uint8_t y_second = y_buf[pos >> 16];
+    pos += source_dx;
+
+    const __m128i chroma = _mm_adds_epi16(
+        _mm_loadl_epi64(
+            reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * u_sample)),
+        _mm_loadl_epi64(
+            reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * v_sample)));
+    const __m128i first_px = _mm_adds_epi16(
+        _mm_loadl_epi64(
+            reinterpret_cast<const __m128i*>(luma_table + 8 * y_first)),
+        chroma);
+    const __m128i second_px = _mm_adds_epi16(
+        _mm_loadl_epi64(
+            reinterpret_cast<const __m128i*>(luma_table + 8 * y_second)),
+        chroma);
+
+    // 0x44 places the low 64 bits of each pixel side by side in one register.
+    const __m128 pair = _mm_shuffle_ps(_mm_castsi128_ps(first_px),
+                                       _mm_castsi128_ps(second_px), 0x44);
+    const __m128i scaled = _mm_srai_epi16(_mm_castps_si128(pair), 6);
+    const __m128i packed = _mm_packus_epi16(scaled, scaled);
+
+    _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), packed);
+    rgb_buf += 8;
+  }
+
+  // Tail: one leftover pixel, written as a single 32-bit value.
+  if (width) {
+    const uint8_t u_sample = u_buf[pos >> 17];
+    const uint8_t v_sample = v_buf[pos >> 17];
+    const uint8_t y_sample = y_buf[pos >> 16];
+
+    const __m128i chroma = _mm_adds_epi16(
+        _mm_loadl_epi64(
+            reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * u_sample)),
+        _mm_loadl_epi64(
+            reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * v_sample)));
+    const __m128i sum = _mm_adds_epi16(
+        _mm_loadl_epi64(
+            reinterpret_cast<const __m128i*>(luma_table + 8 * y_sample)),
+        chroma);
+    const __m128i scaled = _mm_srai_epi16(sum, 6);
+    const __m128i packed = _mm_packus_epi16(scaled, scaled);
+    *reinterpret_cast<uint32_t*>(rgb_buf) = _mm_cvtsi128_si32(packed);
+  }
+}
+
+// Scales one row of YUV to 32-bit RGB with SSE2 intrinsics. In the main loop
+// every sample is linearly blended with its right-hand neighbour before the
+// table lookup: chroma uses the weight (pos & 0x1fffe) with a shift of 17,
+// luma uses (pos & 0xffff) with a shift of 16. The source position starts at
+// 32768 when source_dx >= 0x20000 and at 0 otherwise, and advances by
+// source_dx per output pixel. A leftover odd pixel is NOT interpolated; it
+// uses the samples at the current position directly.
+static void LinearScaleYUVToRGB32Row_SSE2(const uint8_t* y_buf,
+                                          const uint8_t* u_buf,
+                                          const uint8_t* v_buf,
+                                          uint8_t* rgb_buf,
+                                          int width,
+                                          int source_dx) {
+  // Byte-addressed view of the luma table so that `8 * sample` is in bytes.
+  const uint8_t* const luma_table =
+      reinterpret_cast<const uint8_t*>(kCoefficientsRgbY);
+  int pos = (source_dx >= 0x20000) ? 32768 : 0;
+
+  // Main loop: two output pixels per pass, sharing one blended chroma term.
+  for (; width >= 2; width -= 2) {
+    // Blended U, V and first Y at the current position.
+    const uint8_t u_lo = u_buf[pos >> 17];
+    const uint8_t u_hi = u_buf[(pos >> 17) + 1];
+    const uint8_t v_lo = v_buf[pos >> 17];
+    const uint8_t v_hi = v_buf[(pos >> 17) + 1];
+    const uint8_t ya_lo = y_buf[pos >> 16];
+    const uint8_t ya_hi = y_buf[(pos >> 16) + 1];
+    const uint32_t chroma_frac = (pos & 0x1fffe);
+    const uint32_t ya_frac = (pos & 0xffff);
+    const uint32_t u_mix =
+        (chroma_frac * u_hi + (chroma_frac ^ 0x1fffe) * u_lo) >> 17;
+    const uint32_t v_mix =
+        (chroma_frac * v_hi + (chroma_frac ^ 0x1fffe) * v_lo) >> 17;
+    const uint32_t ya_mix = (ya_frac * ya_hi + (ya_frac ^ 0xffff) * ya_lo) >> 16;
+    pos += source_dx;
+
+    const __m128i chroma = _mm_adds_epi16(
+        _mm_loadl_epi64(
+            reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * u_mix)),
+        _mm_loadl_epi64(
+            reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * v_mix)));
+    const __m128i first_px = _mm_adds_epi16(
+        _mm_loadl_epi64(
+            reinterpret_cast<const __m128i*>(luma_table + 8 * ya_mix)),
+        chroma);
+
+    // Blended second Y at the advanced position.
+    const uint8_t yb_lo = y_buf[pos >> 16];
+    const uint8_t yb_hi = y_buf[(pos >> 16) + 1];
+    const uint32_t yb_frac = (pos & 0xffff);
+    const uint32_t yb_mix = (yb_frac * yb_hi + (yb_frac ^ 0xffff) * yb_lo) >> 16;
+    pos += source_dx;
+
+    const __m128i second_px = _mm_adds_epi16(
+        _mm_loadl_epi64(
+            reinterpret_cast<const __m128i*>(luma_table + 8 * yb_mix)),
+        chroma);
+
+    // 0x44 places the low 64 bits of each pixel side by side in one register.
+    const __m128 pair = _mm_shuffle_ps(_mm_castsi128_ps(first_px),
+                                       _mm_castsi128_ps(second_px), 0x44);
+    const __m128i scaled = _mm_srai_epi16(_mm_castps_si128(pair), 6);
+    const __m128i packed = _mm_packus_epi16(scaled, scaled);
+
+    _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), packed);
+    rgb_buf += 8;
+  }
+
+  // Tail: one leftover pixel, taken without interpolation and written as a
+  // single 32-bit value.
+  if (width) {
+    const uint32_t u_sample = u_buf[pos >> 17];
+    const uint32_t v_sample = v_buf[pos >> 17];
+    const uint32_t y_sample = y_buf[pos >> 16];
+
+    const __m128i chroma = _mm_adds_epi16(
+        _mm_loadl_epi64(
+            reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * u_sample)),
+        _mm_loadl_epi64(
+            reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * v_sample)));
+    const __m128i sum = _mm_adds_epi16(
+        _mm_loadl_epi64(
+            reinterpret_cast<const __m128i*>(luma_table + 8 * y_sample)),
+        chroma);
+    const __m128i scaled = _mm_srai_epi16(sum, 6);
+    const __m128i packed = _mm_packus_epi16(scaled, scaled);
+    *reinterpret_cast<uint32_t*>(rgb_buf) = _mm_cvtsi128_si32(packed);
+  }
+}
+
+// Public entry point for 1:1 YUV -> RGB32 row conversion in this file.
+// Forwards unconditionally to the SSE2 implementation; there is no runtime
+// CPU check and no C fallback here.
+void FastConvertYUVToRGB32Row(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* rgb_buf,
+ int width) {
+ FastConvertYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width);
+}
+
+// Public entry point for nearest-sample scaled YUV -> RGB32 row conversion
+// in this file. Forwards unconditionally to the SSE2 implementation; there
+// is no runtime CPU check and no C fallback here.
+void ScaleYUVToRGB32Row(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* rgb_buf,
+ int width,
+ int source_dx) {
+ ScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
+}
+
+// Public entry point for linearly interpolated, scaled YUV -> RGB32 row
+// conversion in this file. Forwards unconditionally to the SSE2
+// implementation; there is no runtime CPU check and no C fallback here.
+void LinearScaleYUVToRGB32Row(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* rgb_buf,
+ int width,
+ int source_dx) {
+ LinearScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width,
+ source_dx);
+}
+
+} // extern "C"